Source code for pastas.stats.metrics

"""The following methods may be used to describe the fit between the model
simulation and the observations.

Examples
========
These methods may be used as follows:

>>> ps.stats.rmse(sim, obs)

or

>>> ml.stats.rmse()
"""

from logging import getLogger

from numpy import abs, average, log, nan, sqrt
from pastas.stats.core import _get_weights, mean, std, var

__all__ = ["rmse", "sse", "mae", "nse", "evp", "rsq", "bic", "aic",
           "pearsonr", "kge_2012"]
logger = getLogger(__name__)


# Absolute Error Metrics

[docs]def mae(obs=None, sim=None, res=None, missing="drop", weighted=False, max_gap=30): """Compute the (weighted) Mean Absolute Error (MAE). Parameters ---------- sim: pandas.Series Series with the simulated values. obs: pandas.Series Series with the observed values. res: pandas.Series Series with the residual values. If time series for the residuals are provided, the sim and obs arguments are ignored. missing: str, optional string with the rule to deal with missing values. Only "drop" is supported now. weighted: bool, optional Weight the values by the normalized time step to account for irregular time series. Default is True. max_gap: int, optional maximum allowed gap period in days to use for the computation of the weights. All time steps larger than max_gap are replace with the max_gap value. Default value is 30 days. Notes ----- The Mean Absolute Error (MAE) between two time series x and y is computed as follows: .. math:: \\text{MAE} = \\sum_{i=1}^{N} w_i |x_i - y_i| where :math:`N` is the number of observations in the observed time series. """ if res is None: res = sim - obs if missing == "drop": res = res.dropna() # Return nan if the time indices of the sim and obs don't match if res.index.size == 0: logger.warning("Time indices of the sim and obs don't match.") return nan w = _get_weights(res, weighted=weighted, max_gap=max_gap) return (w * abs(res.to_numpy())).sum()
[docs]def rmse(obs=None, sim=None, res=None, missing="drop", weighted=False, max_gap=30): """Compute the (weighted) Root Mean Squared Error (RMSE). Parameters ---------- sim: pandas.Series Series with the simulated values. obs: pandas.Series Series with the observed values. res: pandas.Series Series with the residual values. If time series for the residuals are provided, the sim and obs arguments are ignored. missing: str, optional string with the rule to deal with missing values. Only "drop" is supported now. weighted: bool, optional Weight the values by the normalized time step to account for irregular time series. Default is False. max_gap: int, optional maximum allowed gap period in days to use for the computation of the weights. All time steps larger than max_gap are replace with the max_gap value. Default value is 30 days. Notes ----- Computes the Root Mean Squared Error (RMSE) as follows: .. math:: \\text{RMSE} = \\sqrt{\\sum_{i=1}^{N} w_i n_i^2} where :math:`N` is the number of residuals :math:`n`. """ if res is None: res = sim - obs if missing == "drop": res = res.dropna() # Return nan if the time indices of the sim and obs don't match if res.index.size == 0: logger.warning("Time indices of the sim and obs don't match.") return nan w = _get_weights(res, weighted=weighted, max_gap=max_gap) return sqrt((w * res.to_numpy() ** 2).sum())
[docs]def sse(obs=None, sim=None, res=None, missing="drop"): """Compute the Sum of the Squared Errors (SSE). Parameters ---------- sim: pandas.Series Series with the simulated values. obs: pandas.Series Series with the observed values. res: pandas.Series Series with the residual values. If time series for the residuals are provided, the sim and obs arguments are ignored. missing: str, optional string with the rule to deal with missing values. Only "drop" is supported now. Notes ----- The Sum of the Squared Errors (SSE) is calculated as follows: .. math:: \\text{SSE} = \\sum(r^2) Where :math:`r` are the residuals. """ if res is None: res = (sim - obs) if missing == "drop": res = res.dropna() # Return nan if the time indices of the sim and obs don't match if res.index.size == 0: logger.warning("Time indices of the sim and obs don't match.") return nan return (res.to_numpy() ** 2).sum()
# Percentage Error Metrics
[docs]def pearsonr(obs, sim, missing="drop", weighted=False, max_gap=30): """Compute the (weighted) Pearson correlation (r). Parameters ---------- sim: pandas.Series Series with the simulated values. obs: pandas.Series Series with the observed values. missing: str, optional string with the rule to deal with missing values in the observed series. Only "drop" is supported now. weighted: bool, optional Weight the values by the normalized time step to account for irregular time series. Default is False. max_gap: int, optional maximum allowed gap period in days to use for the computation of the weights. All time steps larger than max_gap are replace with the max_gap value. Default value is 30 days. Notes ----- The Pearson correlation (r) is computed as follows: .. math:: r = \\frac{\\sum_{i=1}^{N}w_i (x_i - \\bar{x})(y_i - \\bar{y})} {\\sqrt{\\sum_{i=1}^{N} w_i(x_i-\\bar{x})^2 \\sum_{i=1}^{N} w_i(y_i-\\bar{y})^2}} Where :math:`x` is is observed time series, :math:`y` the simulated time series, and :math:`N` the number of observations in the observed time series. """ if missing == "drop": obs = obs.dropna() w = _get_weights(obs, weighted=weighted, max_gap=max_gap) sim = sim.reindex(obs.index).dropna().to_numpy() # Return nan if the time indices of the sim and obs don't match if sim.size == 0: logger.warning("Time indices of the sim and obs don't match.") return nan sim = sim - average(sim, weights=w) obs = obs.to_numpy() - average(obs.to_numpy(), weights=w) r = (w * sim * obs).sum() / \ sqrt((w * sim ** 2).sum() * (w * obs ** 2).sum()) return r
[docs]def evp(obs, sim=None, res=None, missing="drop", weighted=False, max_gap=30): """Compute the (weighted) Explained Variance Percentage (EVP). Parameters ---------- obs: pandas.Series Series with the observed values. sim: pandas.Series Series with the simulated values. res: pandas.Series Series with the residual values. If time series for the residuals are provided, the sim and obs arguments are ignored. missing: str, optional string with the rule to deal with missing values. Only "drop" is supported now. weighted: bool, optional If weighted is True, the variances are computed using the time step between observations as weights. Default is False. max_gap: int, optional maximum allowed gap period in days to use for the computation of the weights. All time steps larger than max_gap are replace with the max_gap value. Default value is 30 days. Notes ----- Commonly used goodness-of-fit metric groundwater level models as computed in [asmuth_2012]_. .. math:: \\text{EVP} = \\frac{\\sigma_h^2 - \\sigma_r^2}{\\sigma_h^2} * 100 where :math:`\\sigma_h^2` is the variance of the observations and :math:`\\sigma_r^2` is the variance of the residuals. The returned value is bounded between 0% and 100%. References ---------- .. [asmuth_2012] von Asmuth, J., K. Maas, M. Knotters, M. Bierkens, M. Bakker, T.N. Olsthoorn, D.G. Cirkel, I. Leunk, F. Schaars, and D.C. von Asmuth. 2012. Software for hydrogeologic time series analysis, interfacing data with physical insight. Environmental Modelling & Software 38: 178–130. """ if res is None: res = sim - obs if missing == "drop": res = res.dropna() # Return nan if the time indices of the sim and obs don't match if res.index.size == 0: logger.warning("Time indices of the sim and obs don't match.") return nan if obs.var() == 0.0: return 100. else: return max(0.0, (1 - var(res, weighted=weighted, max_gap=max_gap) / var(obs, weighted=weighted, max_gap=max_gap))) * 100
[docs]def nse(obs, sim=None, res=None, missing="drop", weighted=False, max_gap=30): """Compute the (weighted) Nash-Sutcliffe Efficiency (NSE). Parameters ---------- obs: pandas.Series Series with the observed values. sim: pandas.Series Series with the simulated values. res: pandas.Series Series with the residual values. If time series for the residuals are provided, the sim and obs arguments are ignored. missing: str, optional string with the rule to deal with missing values. Only "drop" is supported now. weighted: bool, optional If weighted is True, the variances are computed using the time step between observations as weights. Default is False. max_gap: int, optional maximum allowed gap period in days to use for the computation of the weights. All time steps larger than max_gap are replace with the max_gap value. Default value is 30 days. Notes ----- NSE computed according to [nash_1970]_ .. math:: \\text{NSE} = 1 - \\frac{\\sum(h_s-h_o)^2}{\\sum(h_o-\\mu_{h,o})} References ---------- .. [nash_1970] Nash, J. E., & Sutcliffe, J. V. (1970). River flow forecasting through conceptual models part I-A discussion of principles. Journal of hydrology, 10(3), 282-230. """ if res is None: res = sim - obs if missing == "drop": res = res.dropna() # Return nan if the time indices of the sim and obs don't match if res.index.size == 0: logger.warning("Time indices of the sim and obs don't match.") return nan w = _get_weights(res, weighted=weighted, max_gap=max_gap) mu = average(obs.to_numpy(), weights=w) return 1 - (w * res.to_numpy() ** 2).sum() / \ (w * (obs.to_numpy() - mu) ** 2).sum()
[docs]def rsq(obs, sim=None, res=None, missing="drop", weighted=False, max_gap=30, nparam=None): """Compute R-squared, possibly adjusted for the number of free parameters. Parameters ---------- obs: pandas.Series Series with the observed values. sim: pandas.Series Series with the simulated values. res: pandas.Series Series with the residual values. If time series for the residuals are provided, the sim and obs arguments are ignored. missing: str, optional string with the rule to deal with missing values. Only "drop" is supported now. weighted: bool, optional If weighted is True, the variances are computed using the time step between observations as weights. Default is False. max_gap: int, optional maximum allowed gap period in days to use for the computation of the weights. All time steps larger than max_gap are replace with the max_gap value. Default value is 30 days. nparam: int, optional number of calibrated parameters. Notes ----- .. math:: \\rho_{adj} = 1- \\frac{n-1}{n-n_{param}}*\\frac{rss}{tss} Where n is the number of observations, :math:`n_{param}` the number of free parameters, rss the sum of the squared residuals, and tss the total sum of squared residuals. When nparam is provided, the :math:`\\rho` is adjusted for the number of calibration parameters. """ if res is None: res = sim - obs if missing == "drop": res = res.dropna() # Return nan if the time indices of the sim and obs don't match if res.index.size == 0: logger.warning("Time indices of the sim and obs don't match.") return nan w = _get_weights(res, weighted=weighted, max_gap=max_gap) mu = average(obs.to_numpy(), weights=w) rss = (w * res.to_numpy() ** 2.0).sum() tss = (w * (obs.to_numpy() - mu) ** 2.0).sum() if nparam: return 1.0 - (obs.size - 1.0) / (obs.size - nparam) * rss / tss else: return 1.0 - rss / tss
[docs]def bic(obs=None, sim=None, res=None, missing="drop", nparam=1): """Compute the Bayesian Information Criterium (BIC). Parameters ---------- obs: pandas.Series Series with the observed values. sim: pandas.Series Series with the simulated values. res: pandas.Series Series with the residual values. If time series for the residuals are provided, the sim and obs arguments are ignored. nparam: int, optional number of calibrated parameters. missing: str, optional string with the rule to deal with missing values. Only "drop" is supported now. Notes ----- The Bayesian Information Criterium (BIC) [akaike_1979]_ is computed as follows: .. math:: \\text{BIC} = -2 log(L) + n_{param} * log(N) where :math:`n_{param}` is the number of calibration parameters. References ---------- .. [akaike_1979] Akaike, H. (1979). A Bayesian extension of the minimum AIC procedure of autoregressive model fitting. Biometrika, 66(2), 237-242. """ if res is None: res = sim - obs if missing == "drop": res = res.dropna() # Return nan if the time indices of the sim and obs don't match if res.index.size == 0: logger.warning("Time indices of the sim and obs don't match.") return nan return (res.index.size * log((res.to_numpy() ** 2.0).sum() / res.index.size) + nparam * log(res.index.size))
[docs]def aic(obs=None, sim=None, res=None, missing="drop", nparam=1): """Compute the Akaike Information Criterium (AIC). Parameters ---------- obs: pandas.Series Series with the observed values. sim: pandas.Series Series with the simulated values. res: pandas.Series Series with the residual values. If time series for the residuals are provided, the sim and obs arguments are ignored. nparam: int, optional number of calibrated parameters. missing: str, optional string with the rule to deal with missing values. Only "drop" is supported now. Notes ----- The Akaike Information Criterium (AIC) [akaike_1974]_ is computed as follows: .. math:: \\text{AIC} = -2 log(L) + 2 nparam where :math:`n_{param}` is the number of calibration parameters and L is the likelihood function for the model. References ---------- .. [akaike_1974] Akaike, H. (1974). A new look at the statistical model identification. IEEE transactions on automatic control, 19(6), 716-723. """ if res is None: res = sim - obs if missing == "drop": res = res.dropna() # Return nan if the time indices of the sim and obs don't match if res.index.size == 0: logger.warning("Time indices of the sim and obs don't match.") return nan return (res.index.size * log((res.to_numpy() ** 2.0).sum() / res.index.size) + 2.0 * nparam)
# Forecast Error Metrics
[docs]def kge_2012(obs, sim, missing="drop", weighted=False, max_gap=30): """Compute the (weighted) Kling-Gupta Efficiency (KGE). Parameters ---------- sim: pandas.Series Series with the simulated values. obs: pandas.Series Series with the observed values. missing: str, optional string with the rule to deal with missing values. Only "drop" is supported now. weighted: bool, optional Weight the values by the normalized time step to account for irregular time series. Default is False. max_gap: int, optional maximum allowed gap period in days to use for the computation of the weights. All time steps larger than max_gap are replace with the max_gap value. Default value is 30 days. Notes ----- The (weighted) Kling-Gupta Efficiency [kling_2012]_ is computed as follows: .. math:: \\text{KGE} = 1 - \\sqrt{(r-1)^2 + (\\beta-1)^2 - (\\gamma-1)^2} where :math:`\\beta = \\bar{x} / \\bar{y}` and :math:`\\gamma = \\frac{\\bar{\\sigma}_x / \\bar{x}}{\\bar{\\sigma}_y / \\bar{y}}`. If weighted equals True, the weighted mean, variance and pearson correlation are used. References ---------- .. [kling_2012] Kling, H., Fuchs, M., and Paulin, M. (2012). Runoff conditions in the upper Danube basin under an ensemble of climate change scenarios. Journal of Hydrology, 424-425:264 - 277. """ if missing == "drop": obs = obs.dropna() sim = sim.reindex(obs.index).dropna() # Return nan if the time indices of the sim and obs don't match if sim.index.size == 0: logger.warning("Time indices of the sim and obs don't match.") return nan r = pearsonr(obs=obs, sim=sim, weighted=weighted, max_gap=max_gap) mu_sim = mean(sim, weighted=weighted, max_gap=max_gap) mu_obs = mean(obs, weighted=weighted, max_gap=max_gap) beta = mu_sim / mu_obs gamma = (std(sim, weighted=weighted, max_gap=max_gap) / mu_sim) / \ (std(obs, weighted=weighted, max_gap=max_gap) / mu_obs) kge = 1 - sqrt((r - 1) ** 2 + (beta - 1) ** 2 + (gamma - 1) ** 2) return kge