Source code for pastas.stats.metrics

"""The following methods may be used to describe the fit between the model simulation
and the observations.

Examples
========
These methods may be used as follows:

>>> ps.stats.rmse(sim, obs)

or directly from a Pastas model:

>>> ml.stats.rmse()
"""

from logging import getLogger

# Type Hinting
from typing import Optional

from numpy import abs, average, log, nan, sqrt
from pandas import Series

from pastas.stats.core import _get_weights, mean, std, var

__all__ = [
    "rmse",
    "sse",
    "mae",
    "nse",
    "evp",
    "rsq",
    "bic",
    "aic",
    "pearsonr",
    "kge_2012",
]
logger = getLogger(__name__)


# Absolute Error Metrics


[docs]def mae(
    obs: Optional[Series] = None,
    sim: Optional[Series] = None,
    res: Optional[Series] = None,
    missing: str = "drop",
    weighted: bool = False,
    max_gap: int = 30,
) -> float:
    """Compute the (weighted) Mean Absolute Error (MAE).

    Parameters
    ----------
    sim: pandas.Series
        Series with the simulated values.
    obs: pandas.Series
        The Series with the observed values.
    res: pandas.Series
        The Series with the residual values. If time series for the residuals are
        provided, the sim and obs arguments are ignored.
    missing: str, optional
        string with the rule to deal with missing values. Only "drop" is supported now.
    weighted: bool, optional
        Weight the values by the normalized time step to account for irregular time
        series. Default is True.
    max_gap: int, optional
        maximum allowed gap period in days to use for the computation of the weights.
        All time steps larger than max_gap are replace with the max_gap value.
        Default value is 30 days.

    Notes
    -----
    The Mean Absolute Error (MAE) between two time series x and y is computed as
    follows:

    .. math:: \\text{MAE} = \\sum_{i=1}^{N} w_i |x_i - y_i|

    where :math:`N` is the number of observations in the observed time series.
    """
    if res is None:
        res = sim - obs

    if missing == "drop":
        res = res.dropna()

    # Return nan if the time indices of the sim and obs don't match
    if res.index.size == 0:
        logger.warning("Time indices of the sim and obs don't match.")
        return nan

    w = _get_weights(res, weighted=weighted, max_gap=max_gap)
    return (w * abs(res.to_numpy())).sum()


[docs]def rmse(
    obs: Optional[Series] = None,
    sim: Optional[Series] = None,
    res: Optional[Series] = None,
    missing: str = "drop",
    weighted: bool = False,
    max_gap: int = 30,
) -> float:
    """Compute the (weighted) Root Mean Squared Error (RMSE).

    Parameters
    ----------
    sim: pandas.Series
        Series with the simulated values.
    obs: pandas.Series
        The Series with the observed values.
    res: pandas.Series
        The Series with the residual values. If time series for the residuals are
        provided, the sim and obs arguments are ignored.
    missing: str, optional
        string with the rule to deal with missing values. Only "drop" is supported now.
    weighted: bool, optional
        Weight the values by the normalized time step to account for irregular time
        series. Default is False.
    max_gap: int, optional
        maximum allowed gap period in days to use for the computation of the weights.
        All time steps larger than max_gap are replace with the max_gap value.
        Default value is 30 days.

    Notes
    -----
    Computes the Root Mean Squared Error (RMSE) as follows:

    .. math:: \\text{RMSE} = \\sqrt{\\sum_{i=1}^{N} w_i n_i^2}

    where :math:`N` is the number of residuals :math:`n`.
    """
    if res is None:
        res = sim - obs

    if missing == "drop":
        res = res.dropna()

    # Return nan if the time indices of the sim and obs don't match
    if res.index.size == 0:
        logger.warning("Time indices of the sim and obs don't match.")
        return nan

    w = _get_weights(res, weighted=weighted, max_gap=max_gap)
    return sqrt((w * res.to_numpy() ** 2).sum())


[docs]def sse(
    obs: Optional[Series] = None,
    sim: Optional[Series] = None,
    res: Optional[Series] = None,
    missing: str = "drop",
) -> float:
    """Compute the Sum of the Squared Errors (SSE).

    Parameters
    ----------
    sim: pandas.Series
        Series with the simulated values.
    obs: pandas.Series
        The Series with the observed values.
    res: pandas.Series
        The Series with the residual values. If time series for the residuals are
        provided, the sim and obs arguments are ignored.
    missing: str, optional
        string with the rule to deal with missing values. Only "drop" is supported now.

    Notes
    -----
    The Sum of the Squared Errors (SSE) is calculated as follows:

    .. math:: \\text{SSE} = \\sum(r^2)

    where :math:`r` are the residuals.
    """
    if res is None:
        res = sim - obs

    if missing == "drop":
        res = res.dropna()

    # Return nan if the time indices of the sim and obs don't match
    if res.index.size == 0:
        logger.warning("Time indices of the sim and obs don't match.")
        return nan

    return (res.to_numpy() ** 2).sum()


# Percentage Error Metrics


[docs]def pearsonr(
    obs: Series,
    sim: Series,
    missing: str = "drop",
    weighted: bool = False,
    max_gap: int = 30,
) -> float:
    """Compute the (weighted) Pearson correlation (r).

    Parameters
    ----------
    sim: pandas.Series
        The Series with the simulated values.
    obs: pandas.Series
        The Series with the observed values.
    missing: str, optional
        string with the rule to deal with missing values in the observed series. Only
        "drop" is supported now.
    weighted: bool, optional
        Weight the values by the normalized time step to account for irregular time
        series. Default is False.
    max_gap: int, optional
        maximum allowed gap period in days to use for the computation of the weights.
        All time steps larger than max_gap are replace with the max_gap value.
        Default value is 30 days.

    Notes
    -----
    The Pearson correlation (r) is computed as follows:

    .. math:: r = \\frac{\\sum_{i=1}^{N}w_i (x_i - \\bar{x})(y_i - \\bar{y})}
        {\\sqrt{\\sum_{i=1}^{N} w_i(x_i-\\bar{x})^2 \\sum_{i=1}^{N}
        w_i(y_i-\\bar{y})^2}}

    Where :math:`x` is observed time series, :math:`y` the simulated time series,
    and :math:`N` the number of observations in the observed time series.
    """
    if missing == "drop":
        obs = obs.dropna()

    w = _get_weights(obs, weighted=weighted, max_gap=max_gap)
    sim = sim.reindex(obs.index).dropna().to_numpy()

    # Return nan if the time indices of the sim and obs don't match
    if sim.size == 0:
        logger.warning("Time indices of the sim and obs don't match.")
        return nan

    sim = sim - average(sim, weights=w)
    obs = obs.to_numpy() - average(obs.to_numpy(), weights=w)

    r = (w * sim * obs).sum() / sqrt((w * sim**2).sum() * (w * obs**2).sum())

    return r


[docs]def evp(
    obs: Series,
    sim: Optional[Series] = None,
    res: Optional[Series] = None,
    missing: str = "drop",
    weighted: bool = False,
    max_gap: int = 30,
) -> float:
    """Compute the (weighted) Explained Variance Percentage (EVP).

    Parameters
    ----------
    obs: pandas.Series
        Series with the observed values.
    sim: pandas.Series
        The Series with the simulated values.
    res: pandas.Series
        The Series with the residual values. If time series for the residuals are
        provided, the sim and obs arguments are ignored.
    missing: str, optional
        string with the rule to deal with missing values. Only "drop" is supported now.
    weighted: bool, optional
        If weighted is True, the variances are computed using the time step between
        observations as weights. Default is False.
    max_gap: int, optional
        maximum allowed gap period in days to use for the computation of the weights.
        All time steps larger than max_gap are replace with the max_gap value.
        Default value is 30 days.

    Notes
    -----
    Commonly used goodness-of-fit metric groundwater level models as computed in
    :cite:t:`von_asmuth_groundwater_2012`.

    .. math:: \\text{EVP} = \\frac{\\sigma_h^2 - \\sigma_r^2}{\\sigma_h^2}
        * 100

    where :math:`\\sigma_h^2` is the variance of the observations and
    :math:`\\sigma_r^2` is the variance of the residuals. The returned value is
    bounded between 0% and 100%.
    """
    if res is None:
        res = sim - obs

    if missing == "drop":
        res = res.dropna()

    # Return nan if the time indices of the sim and obs don't match
    if res.index.size == 0:
        logger.warning("Time indices of the sim and obs don't match.")
        return nan

    if obs.var() == 0.0:
        return 100.0
    else:
        return (
            max(
                0.0,
                (
                    1
                    - var(res, weighted=weighted, max_gap=max_gap)
                    / var(obs, weighted=weighted, max_gap=max_gap)
                ),
            )
            * 100
        )


[docs]def nse(
    obs: Series,
    sim: Optional[Series] = None,
    res: Optional[Series] = None,
    missing: str = "drop",
    weighted: bool = False,
    max_gap: int = 30,
) -> float:
    """Compute the (weighted) Nash-Sutcliffe Efficiency (NSE).

    Parameters
    ----------
    obs: pandas.Series
        Series with the observed values.
    sim: pandas.Series
        The Series with the simulated values.
    res: pandas.Series
        The Series with the residual values. If time series for the residuals are
        provided, the sim and obs arguments are ignored.
    missing: str, optional
        string with the rule to deal with missing values. Only "drop" is supported now.
    weighted: bool, optional
        If weighted is True, the variances are computed using the time step between
        observations as weights. Default is False.
    max_gap: int, optional
        maximum allowed gap period in days to use for the computation of the weights.
        All time steps larger than max_gap are replace with the max_gap value.
        Default value is 30 days.

    Notes
    -----
    NSE computed according to :cite:t:`nash_river_1970`

    .. math:: \\text{NSE} = 1 - \\frac{\\sum(h_s-h_o)^2}{\\sum(h_o-\\mu_{h,o})}
    """
    if res is None:
        res = sim - obs

    if missing == "drop":
        res = res.dropna()

    # Return nan if the time indices of the sim and obs don't match
    if res.index.size == 0:
        logger.warning("Time indices of the sim and obs don't match.")
        return nan

    w = _get_weights(res, weighted=weighted, max_gap=max_gap)
    mu = average(obs.to_numpy(), weights=w)

    return 1 - (w * res.to_numpy() ** 2).sum() / (w * (obs.to_numpy() - mu) ** 2).sum()


[docs]def rsq(
    obs: Series,
    sim: Optional[Series] = None,
    res: Optional[Series] = None,
    missing: str = "drop",
    weighted: bool = False,
    max_gap: int = 30,
    nparam: Optional[int] = None,
) -> float:
    """Compute R-squared, possibly adjusted for the number of free parameters.

    Parameters
    ----------
    obs: pandas.Series
        Series with the observed values.
    sim: pandas.Series
        The Series with the simulated values.
    res: pandas.Series
        The Series with the residual values. If time series for the residuals are
        provided, the sim and obs arguments are ignored.
    missing: str, optional
        string with the rule to deal with missing values. Only "drop" is supported now.
    weighted: bool, optional
        If weighted is True, the variances are computed using the time step between
        observations as weights. Default is False.
    max_gap: int, optional
        maximum allowed gap period in days to use for the computation of the weights.
        All time steps larger than max_gap are replace with the max_gap value.
        Default value is 30 days.
    nparam: int, optional
        number of calibrated parameters.

    Notes
    -----
    .. math:: \\rho_{adj} = 1-  \\frac{n-1}{n-n_{param}}*\\frac{rss}{tss}

    Where n is the number of observations, :math:`n_{param}` the number of free
    parameters, rss the sum of the squared residuals, and tss the total sum of
    squared residuals.

    When nparam is provided, the :math:`\\rho` is adjusted for the number of
    calibration parameters.
    """
    if res is None:
        res = sim - obs

    if missing == "drop":
        res = res.dropna()

    # Return nan if the time indices of the sim and obs don't match
    if res.index.size == 0:
        logger.warning("Time indices of the sim and obs don't match.")
        return nan

    w = _get_weights(res, weighted=weighted, max_gap=max_gap)
    mu = average(obs.to_numpy(), weights=w)
    rss = (w * res.to_numpy() ** 2.0).sum()
    tss = (w * (obs.to_numpy() - mu) ** 2.0).sum()

    if nparam:
        return 1.0 - (obs.size - 1.0) / (obs.size - nparam) * rss / tss
    else:
        return 1.0 - rss / tss


[docs]def bic(
    obs: Optional[Series] = None,
    sim: Optional[Series] = None,
    res: Optional[Series] = None,
    missing: str = "drop",
    nparam: int = 1,
) -> float:
    """Compute the Bayesian Information Criterium (BIC).

    Parameters
    ----------
    obs: pandas.Series
        Series with the observed values.
    sim: pandas.Series
        The Series with the simulated values.
    res: pandas.Series
        The Series with the residual values. If time series for the residuals are
        provided, the sim and obs arguments are ignored.
    nparam: int, optional
        number of calibrated parameters.
    missing: str, optional
        string with the rule to deal with missing values. Only "drop" is supported now.

    Notes
    -----
    The Bayesian Information Criterium (BIC) :cite:p:`akaike_bayesian_1979` is
    computed as follows:

    .. math:: \\text{BIC} = -2 log(L) + n_{param} * log(N)

    where :math:`n_{param}` is the number of calibration parameters.
    """
    if res is None:
        res = sim - obs

    if missing == "drop":
        res = res.dropna()

    # Return nan if the time indices of the sim and obs don't match
    if res.index.size == 0:
        logger.warning("Time indices of the sim and obs don't match.")
        return nan

    return res.index.size * log(
        (res.to_numpy() ** 2.0).sum() / res.index.size
    ) + nparam * log(res.index.size)


[docs]def aic(
    obs: Optional[Series] = None,
    sim: Optional[Series] = None,
    res: Optional[Series] = None,
    missing: str = "drop",
    nparam: int = 1,
) -> float:
    """Compute the Akaike Information Criterium (AIC).

    Parameters
    ----------
    obs: pandas.Series
        Series with the observed values.
    sim: pandas.Series
        The Series with the simulated values.
    res: pandas.Series
        The Series with the residual values. If time series for the residuals are
        provided, the sim and obs arguments are ignored.
    nparam: int, optional
        number of calibrated parameters.
    missing: str, optional
        string with the rule to deal with missing values. Only "drop" is supported now.

    Notes
    -----
    The Akaike Information Criterium (AIC) :cite:p:`akaike_new_1974` is computed as
    follows:

    .. math:: \\text{AIC} = -2 log(L) + 2 nparam

    where :math:`n_{param}` is the number of calibration parameters and L is the
    likelihood function for the model.
    """
    if res is None:
        res = sim - obs

    if missing == "drop":
        res = res.dropna()

    # Return nan if the time indices of the sim and obs don't match
    if res.index.size == 0:
        logger.warning("Time indices of the sim and obs don't match.")
        return nan

    return (
        res.index.size * log((res.to_numpy() ** 2.0).sum() / res.index.size)
        + 2.0 * nparam
    )


# Forecast Error Metrics
[docs]def kge_2012(
    obs: Series,
    sim: Series,
    missing: str = "drop",
    weighted: bool = False,
    max_gap: int = 30,
) -> float:
    """Compute the (weighted) Kling-Gupta Efficiency (KGE).

    Parameters
    ----------
    sim: pandas.Series
        Series with the simulated values.
    obs: pandas.Series
        Series with the observed values.
    missing: str, optional
        string with the rule to deal with missing values. Only "drop" is
        supported now.
    weighted: bool, optional
        Weight the values by the normalized time step to account for
        irregular time series. Default is False.
    max_gap: int, optional
        maximum allowed gap period in days to use for the computation of the
        weights. All time steps larger than max_gap are replace with the
        max_gap value. Default value is 30 days.

    Notes
    -----
    The (weighted) Kling-Gupta Efficiency :cite:t:`kling_runoff_2012` is
    computed as follows:

    .. math:: \\text{KGE} = 1 - \\sqrt{(r-1)^2 + (\\beta-1)^2 - (\\gamma-1)^2}

    where :math:`\\beta = \\bar{x} / \\bar{y}` and :math:`\\gamma =
    \\frac{\\bar{\\sigma}_x / \\bar{x}}{\\bar{\\sigma}_y / \\bar{y}}`. If
    weighted equals True, the weighted mean, variance and pearson
    correlation are used.
    """
    if missing == "drop":
        obs = obs.dropna()

    sim = sim.reindex(obs.index).dropna()

    # Return nan if the time indices of the sim and obs don't match
    if sim.index.size == 0:
        logger.warning("Time indices of the sim and obs don't match.")
        return nan

    r = pearsonr(obs=obs, sim=sim, weighted=weighted, max_gap=max_gap)

    mu_sim = mean(sim, weighted=weighted, max_gap=max_gap)
    mu_obs = mean(obs, weighted=weighted, max_gap=max_gap)

    beta = mu_sim / mu_obs
    gamma = (std(sim, weighted=weighted, max_gap=max_gap) / mu_sim) / (
        std(obs, weighted=weighted, max_gap=max_gap) / mu_obs
    )

    kge = 1 - sqrt((r - 1) ** 2 + (beta - 1) ** 2 + (gamma - 1) ** 2)
    return kge