Source code for pastas.stats.dutch

"""The following methods are descriptive statistics commonly used to describe
groundwater time series in the Netherlands.

.. codeauthor:: R. Calje, T. van Steijn and R. Collenteur
"""

# Type Hinting
from typing import Optional, Union

from numpy import nan
from packaging.version import parse as parse_version
from pandas import Series, Timedelta, concat, date_range
from pandas import __version__ as pd_version

from pastas.timeseries_utils import get_sample
from pastas.typing import Function, TimestampType

pandas_version = parse_version(pd_version)

year_offset = "YE" if pandas_version >= parse_version("2.2.0") else "A"


[docs]def q_ghg( series: Series, tmin: Optional[TimestampType] = None, tmax: Optional[TimestampType] = None, q: float = 0.94, by_year: bool = True, ) -> Series: """Gemiddeld Hoogste Grondwaterstand (GHG) also called MHGL (Mean High GW Level). Parameters ---------- series: pandas.Series Series to calculate the GHG for. tmin, tmax: pandas.Timestamp, optional q : float, optional quantile fraction of exceedance (default 0.94) by_year: bool, optional Take average over quantiles per year (default True) Notes ----- Approximated by taking quantiles of the time series values per year and calculating the mean of the quantiles. The series is first resampled to daily values. """ return _q_gxg(series, q, tmin=tmin, tmax=tmax, by_year=by_year)
[docs]def q_glg( series: Series, tmin: Optional[TimestampType] = None, tmax: Optional[TimestampType] = None, q: float = 0.06, by_year: bool = True, ) -> Series: """Gemiddeld Laagste Grondwaterstand (GLG) also called MLGL (Mean Low Groundwater Level). Parameters ---------- series: pandas.Series Series to calculate the GLG for. tmin, tmax: pandas.Timestamp, optional q : float, optional quantile, fraction of exceedance (default 0.06) by_year: bool, optional Take average over quantiles per year (default True) Notes ----- Approximated by taking quantiles of the time series values per year and calculating the mean of the quantiles. The series is first resampled to daily values. """ return _q_gxg(series, q, tmin=tmin, tmax=tmax, by_year=by_year)
[docs]def q_gvg( series: Series, tmin: Optional[TimestampType] = None, tmax: Optional[TimestampType] = None, by_year: bool = True, ) -> Series: """Gemiddeld Voorjaarsgrondwaterstand (GVG) also called MSGL (Mean Spring GW Level). Parameters ---------- series: pandas.Series Series to calculate the GVG for. tmin, tmax: pandas.Timestamp, optional by_year: bool, optional Take average over quantiles per year (default True) Notes ----- Approximated by taking the median of the values in the period between 14 March and 15 April (after resampling to daily values). This function does not care about series length! """ if tmin is not None: series = series.loc[tmin:] if tmax is not None: series = series.loc[:tmax] series = series.resample("d").median() inspring = _in_spring(series) if any(inspring): if by_year: return series.loc[inspring].resample(year_offset).median().mean() else: return series.loc[inspring].median() else: return nan
[docs]def ghg( series: Series, tmin: Optional[TimestampType] = None, tmax: Optional[TimestampType] = None, fill_method: str = "nearest", limit: int = 0, output: str = "mean", min_n_meas: int = 16, min_n_years: int = 8, year_offset: str = year_offset + "-MAR", ) -> Union[Series, float]: """Calculate the 'Gemiddelde Hoogste Grondwaterstand' (Average High Groundwater Level) Parameters ---------- series: pandas.Series with a DatetimeIndex The pandas Series of which the statistic is determined. tmin: pandas.Timestamp, optional The lowest index to take into account. tmax: pandas.Timestamp, optional The highest index to take into account. fill_method : str see .. :mod: pastas.stats.dutch._gxg limit : int or None, optional Maximum number of days to fill using fill method, use None to fill nothing. output : str, optional output type * 'mean' (default) : for mean of yearly values. * 'yearly': for series of yearly values. * 'g3': for series with selected data for calculating statistic. * 'semimonthly': for series with all data points (14th, 28th of each month). min_n_meas: int, optional Minimum number of measurements per year (at maximum 24). min_n_years: int, optional Minimum number of years. year_offset: resampling offset. Use 'YE' for calendar years (jan 1 to dec 31) and 'YE-MAR' for hydrological years (apr 1 to mar 31). Returns ------- pd.Series or scalar Series of yearly values or mean of yearly values. Notes ----- Classic method resampling the series to every 14th and 28th of the month. Taking the mean of the mean of three highest values per year. """ # mean_high = lambda s: s.nlargest(3).mean() def highs(s, min_n_meas): if len(s) < min_n_meas: return Series(nan) else: if len(s) > 20: return s.nlargest(3) elif len(s) > 12: return s.nlargest(2) else: return s.nlargest(1) def mean_high(s, min_n_meas): return highs(s, min_n_meas).mean() if output in ["mean", "yearly"]: f_agg = mean_high elif output == "g3": f_agg = highs elif output == "semimonthly": f_agg = None else: raise ValueError(f"Unrecognized option for output: {output}") return _gxg( series, f_agg, tmin=tmin, tmax=tmax, fill_method=fill_method, limit=limit, output=output, min_n_meas=min_n_meas, min_n_years=min_n_years, year_offset=year_offset, )
[docs]def glg( series: Series, tmin: Optional[TimestampType] = None, tmax: Optional[TimestampType] = None, fill_method: str = "nearest", limit: int = 0, output: str = "mean", min_n_meas: int = 16, min_n_years: int = 8, year_offset: str = year_offset + "-MAR", ) -> Union[Series, float]: """Calculate the 'Gemiddelde Laagste Grondwaterstand' (Average Low GW Level). Parameters ---------- series: pandas.Series with a DatetimeIndex The pandas Series of which the statistic is determined. tmin: pandas.Timestamp, optional The lowest index to take into account. tmax: pandas.Timestamp, optional The highest index to take into account. fill_method : str, optional see .. :mod: pastas.stats.dutch._gxg limit : int or None, optional Maximum number of days to fill using fill method, use None to fill nothing. output : str, optional output type * 'mean' (default) : for mean of yearly values. * 'yearly': for series of yearly values. * 'g3': for series with selected data for calculating statistic. * 'semimonthly': for series with all data points (14th, 28th of each month). min_n_meas: int, optional Minimum number of measurements per year (at maximum 24). min_n_years: int, optional Minimum number of years. year_offset: resampling offset. Use 'YE' for calendar years (jan 1 to dec 31) and 'YE-MAR' for hydrological years (apr 1 to mar 31). Returns ------- pd.Series or scalar Series of yearly values or mean of yearly values. Notes ----- Classic method resampling the series to every 14th and 28th of the month. Taking the mean of the mean of three lowest values per year. """ # mean_low = lambda s: s.nsmallest(3).mean() def lows(s, min_n_meas): if len(s) < min_n_meas: return Series(nan) else: if len(s) > 20: return s.nsmallest(3) elif len(s) > 12: return s.nsmallest(2) else: return s.nsmallest(1) def mean_low(s, min_n_meas): return lows(s, min_n_meas).mean() if output in ["mean", "yearly"]: f_agg = mean_low elif output == "g3": f_agg = lows elif output == "semimonthly": f_agg = None else: raise ValueError(f"Unrecognized option for output: {output}") return _gxg( series, f_agg, tmin=tmin, tmax=tmax, fill_method=fill_method, limit=limit, output=output, min_n_meas=min_n_meas, min_n_years=min_n_years, year_offset=year_offset, )
[docs]def gvg( series: Series, tmin: Optional[TimestampType] = None, tmax: Optional[TimestampType] = None, fill_method: str = "linear", limit: int = 8, output: str = "mean", min_n_meas: int = 2, min_n_years: int = 8, year_offset: str = year_offset, ) -> Union[Series, float]: """Calculate the 'Gemiddelde Voorjaars Grondwaterstand' (Average Spring GW Level). Parameters ---------- series: pandas.Series with a DatetimeIndex The pandas Series of which the statistic is determined. tmin: pandas.Timestamp, optional The lowest index to take into account. tmax: pandas.Timestamp, optional The highest index to take into account. fill_method : str, optional see .. :mod: pastas.stats.dutch._gxg limit : int or None, optional Maximum number of days to fill using fill method, use None to fill nothing. output : str, optional output type * 'mean' (default) : for mean of yearly values. * 'yearly': for series of yearly values. * 'g3': for series with selected data for calculating statistic. * 'semimonthly': for series with all data points (14th, 28th of each month). min_n_meas: int, optional Minimum number of measurements per year (at maximum 3). min_n_years: int, optional Minimum number of years. year_offset: resampling offset. Use "YE" for calendar years (jan 1 to dec 31) and "YE-MAR" for hydrological years (apr 1 to mar 31). Returns ------- pandas.Series or scalar Series of yearly values or mean of yearly values. Notes ----- Classic method resampling the series to every 14th and 28th of the month. Taking the mean of the values on March 14, March 28 and April 14. """ def _mean_spring(s, min_n_meas): return _get_spring(s, min_n_meas).mean() if output in ["mean", "yearly"]: f_agg = _mean_spring elif output == "g3": f_agg = _get_spring elif output == "semimonthly": f_agg = None else: raise ValueError(f"Unrecognized option for output: {output}") return _gxg( series, f_agg, tmin=tmin, tmax=tmax, fill_method=fill_method, limit=limit, output=output, min_n_meas=min_n_meas, min_n_years=min_n_years, year_offset=year_offset, )
[docs]def gg( series: Series, tmin: Optional[TimestampType] = None, tmax: Optional[TimestampType] = None, fill_method: str = "nearest", limit: int = 0, output: str = "mean", min_n_meas: int = 16, min_n_years: int = 8, year_offset: str = year_offset + "-MAR", ) -> Union[Series, float]: """Calculate the 'Gemiddelde Grondwaterstand' (Average Groundwater Level). Parameters ---------- series: pandas.Series with a DatetimeIndex The pandas Series of which the statistic is determined. tmin: pandas.Timestamp, optional The lowest index to take into account. tmax: pandas.Timestamp, optional The highest index to take into account. fill_method : str, optional see .. :mod: pastas.stats.dutch._gxg limit : int or None, optional Maximum number of days to fill using fill method, use None to fill nothing. output : str, optional output type * 'mean' (default) : for mean of yearly values. * 'yearly': for series of yearly values. * 'g3': for series with selected data for calculating statistic. * 'semimonthly': for series with all data points (14th, 28th of each month). min_n_meas: int, optional Minimum number of measurements per year (at maximum 24). min_n_years: int, optional Minimum number of years. year_offset: resampling offset. Use "YE" for calendar years (jan 1 to dec 31) and 'YE-MAR' for hydrological years (apr 1 to mar 31). Returns ------- pd.Series or scalar series of yearly values or mean of yearly values. Notes ----- Classic method resampling the series to every 14th and 28th of the month. """ # mean_low = lambda s: s.nsmallest(3).mean() def mean_all(s, min_n_meas): if len(s) < min_n_meas: return nan else: return s.mean() return _gxg( series, mean_all, tmin=tmin, tmax=tmax, fill_method=fill_method, limit=limit, output=output, min_n_meas=min_n_meas, min_n_years=min_n_years, year_offset=year_offset, )
# Helper functions def _get_spring(series: Series, min_n_meas: int) -> float: """Internal method to get values of time series values in spring. Part of year aggregator function for gvg method. Parameters ---------- series : pandas.Series series with datetime index. Returns ------- float values of series in spring, or NaN if no values in spring. """ inspring = _in_spring(series) if inspring.sum() < min_n_meas: return Series(nan) else: return series.loc[inspring] def _in_spring(series: Series) -> Series: """Internal method to test if time series index is between 14 March and 15 April. Parameters ---------- series : pd.Series series with datetime index. Returns ------- pd.Series Boolean series with datetimeindex. """ def isinspring(x): return ((x.month == 3) and (x.day >= 14)) or ((x.month == 4) and (x.day < 15)) return Series([isinspring(x) for x in series.index], index=series.index) def _gxg( series: Series, year_agg: Function, tmin: Optional[TimestampType], tmax: Optional[TimestampType], fill_method: str, limit: Union[int, None], output: str, min_n_meas: int, min_n_years: int, year_offset: str, ) -> Union[Series, float]: """Internal method for classic GXG statistics. Resampling the series to every 14th and 28th of the month. Taking the mean of aggregated values per year. Parameters ---------- series: pandas.Series with a DatetimeIndex The pandas Series of which the statistic is determined. year_agg : function series -> scalar Aggregator function to one value per year. tmin: pandas.Timestamp, optional The lowest index to take into account. tmax: pandas.Timestamp, optional The highest index to take into account. fill_method : str see notes below. limit : int or None, optional Maximum number of days to fill using fill method, use None to fill nothing. output : str output type * 'mean' (default) : for mean of yearly values * 'yearly': for series of yearly values * 'g3': for series with selected data for calculating statistic * 'semimonthly': for series with all data points (14th, 28th of each month) min_n_meas: int, optional Minimum number of measurements per year. min_n_years: int Minimum number of years. year_offset: string resampling offset. Use "YE" for calendar years (jan 1 to dec 31) and 'YE-MAR' for hydrological years (apr 1 to mar 31) Returns ------- pandas.Series or scalar Series of yearly values or mean of yearly values. Raises ------ ValueError When output argument is unknown. Notes ----- fill method for interpolation to 14th and 28th of the month see: * http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.ffill.html * http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.bfill.html * https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.reindex.html * http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.interpolate.html * Use None to omit filling and drop NaNs """ # handle tmin and tmax if tmin is not None: series = series.loc[tmin:] if tmax is not None: series = series.loc[:tmax] if series.empty: if output.startswith("year"): return Series() elif output == "mean": return nan else: ValueError("{output:} is not a valid output option".format(output=output)) # resample the series to values at the 14th and 28th of every month # first generate a daily series by averaging multiple measurements during the day series = series.resample("d").mean() select14or28 = True if fill_method is None: series = series.dropna() elif fill_method == "ffill": series = series.ffill(limit=limit) elif fill_method == "bfill": series = series.bfill(limit=limit) elif fill_method == "nearest": if limit == 0: # limit=0 is a trick to only use each measurement once # only keep days with measurements series = series.dropna() # generate an index at the 14th and 28th of every month buf = Timedelta(8, "d") ref_index = date_range(series.index.min() - buf, series.index.max() + buf) mask = [(x.day == 14) or (x.day == 28) for x in ref_index] ref_index = ref_index[mask] # only keep the days that are closest to series.index ref_index = get_sample(ref_index, series.index) # and set the index of series to this index # (and remove rows in series that are not in ref_index) series = series.reindex(ref_index, method=fill_method) select14or28 = False else: # with a large limit (larger than 6) it is possible that one measurement # is used more than once series = series.dropna().reindex( series.index, method=fill_method, limit=limit ) else: series = series.interpolate( method=fill_method, limit=limit, limit_direction="both" ) # and select the 14th and 28th of each month (if needed still) if select14or28: mask = [(x.day == 14) or (x.day == 28) for x in series.index] series = series.loc[mask] # remove NaNs that may have formed in the process above series.dropna(inplace=True) # resample the series to yearly values if output == "semimonthly": return series elif output in ["yearly", "mean"]: yearly = series.resample(year_offset).apply(year_agg, min_n_meas=min_n_meas) elif output == "g3": yearly = series.resample(year_offset) collect = {} for yr, group in yearly: s = year_agg(group, min_n_meas=min_n_meas) if isinstance(s, Series): s = s.sort_index() collect[yr] = s yearly = concat(collect) # return statements if output.startswith("year"): return yearly elif output == "g3": return yearly elif output == "mean": if yearly.notna().sum() < min_n_years: return nan else: return yearly.mean() else: msg = "{} is not a valid output option".format(output) raise (ValueError(msg)) def _q_gxg( series: Series, q: float, tmin: Optional[TimestampType] = None, tmax: Optional[TimestampType] = None, by_year: bool = True, ) -> Series: """Dutch groundwater statistics GHG and GLG approximated by taking quantiles of the time series values per year and taking the mean of the quantiles. The series is first resampled to daily values. Parameters ---------- series: pandas.Series Series to calculate the GXG for. q: float quantile fraction of exceedance. tmin: pandas.Timestamp, optional tmax: pandas.Timestamp, optional by_year: bool, optional Take average over quantiles per year (default True). """ if tmin is not None: series = series.loc[tmin:] if tmax is not None: series = series.loc[:tmax] series = series.resample("d").median() if by_year: return series.resample(year_offset).apply(lambda s: s.quantile(q)).mean() else: return series.quantile(q)