from logging import getLogger
# Type Hinting
from typing import Optional, Union
import pandas as pd
from pandas import Series, Timedelta
from pandas.tseries.frequencies import to_offset
from .rcparams import rcParams
from .timeseries_utils import _get_dt, _get_time_offset, _infer_fixed_freq, resample
from .utils import validate_name
# Module-level logger, named after this module; used for all messages below.
logger = getLogger(__name__)
class TimeSeries:
    """Class that deals with all user-provided time series.

    The series passed in is validated and a copy is stored untouched as
    ``series_original``. A processed copy (NaNs filled, frequency changed, and
    trimmed or extended to ``tmin``/``tmax``) is exposed through the read-only
    ``series`` property and is recomputed whenever the settings change.

    Parameters
    ----------
    series: pandas.Series
        pandas.Series with pandas.DatetimeIndex. A pandas.DataFrame with a
        single column is converted to a pandas.Series.
    name: str, optional
        String with the name of the time series, if None is provided, pastas will
        try to derive the name from the series.
    settings: str or dict, optional
        The settings of the stress. This can be a string referring to a predefined
        settings dictionary (defined in ps.rcParams["timeseries"]), or a dictionary
        with the settings to apply. For more information refer to the Time series
        settings section below.
    metadata: dict, optional
        Dictionary with metadata of the time series. It is merged into the
        default metadata (keys "x", "y", "z" and "projection").

    Raises
    ------
    UserWarning
        If a settings dictionary with ``fill_nan="drop"`` is provided.
    KeyError
        If `settings` is a string that is not a predefined settings name.
    ValueError
        If the series does not pass validation.

    Time series settings
    --------------------
    fill_nan : {"drop", "mean", "interpolate"} or float
        Method for filling NaNs.

        * `drop`: drop NaNs from time series
        * `mean`: fill NaNs with mean value of time series
        * `interpolate`: fill NaNs by interpolating between finite values
        * `float`: fill NaN with provided value, e.g. 0.0
    fill_before : {"mean", "bfill"} or float
        Method for extending time series into past.

        * `mean`: extend time series into past with mean value of time series
        * `bfill`: extend time series into past by back-filling first value
        * `float`: extend time series into past with provided value, e.g. 0.0
    fill_after : {"mean", "ffill"} or float
        Method for extending time series into future.

        * `mean`: extend time series into future with mean value of time series
        * `ffill`: extend time series into future by forward-filling last value
        * `float`: extend time series into future with provided value, e.g. 0.0
    sample_up : {"bfill", "backfill", "ffill", "pad", "mean", "interpolate", \
"divide"} or float
        Method for up-sampling time series (increasing frequency, e.g. going from
        weekly to daily values).

        * `bfill` or `backfill`: fill up-sampled time steps by back-filling
          current values
        * `ffill` or `pad`: fill up-sampled time steps by forward-filling current
          values
        * `mean`: fill up-sampled time steps with mean of timeseries
        * `interpolate`: fill up-sampled time steps by interpolating between
          current values
        * `divide`: fill up-sampled steps with current value divided by length of
          current time steps (i.e. spread value over new time steps).
        * `float`: fill up-sampled time steps with provided value, e.g. 0.0
    sample_down : {"mean", "drop", "sum", "min", "max"}
        Method for down-sampling time series (decreasing frequency, e.g. going
        from daily to weekly values).

        * `mean`: resample time series by taking the mean
        * `drop`: resample the time series by taking the mean, dropping any
          NaN-values
        * `sum`: resample time series by summing values
        * `max`: resample time series with maximum value
        * `min`: resample time series with minimum value

    Examples
    --------
    To obtain the predefined TimeSeries settings, you can run the following line
    of code:

    >>> ps.rcParams["timeseries"]

    See Also
    --------
    pastas.timeseries.TimeSeries.update_series
        For the individual options for the different settings.
    """

    _predefined_settings = rcParams["timeseries"]

    def __init__(
        self,
        series: Series,
        name: Optional[str] = None,
        settings: Optional[Union[str, dict]] = None,
        metadata: Optional[dict] = None,
    ) -> None:
        # Make sure we have a Pandas Series and not a 1D-DataFrame
        if isinstance(series, pd.DataFrame) and len(series.columns) == 1:
            series = series.iloc[:, 0]
            logger.info(
                "1D-DataFrame was provided, automatically transformed to "
                "pandas.Series."
            )

        # Make sure we have a workable Pandas Series, depends on type of time series
        if settings == "oseries":
            validate_oseries(series)
        else:
            if settings is not None and not isinstance(settings, str):
                # .get: a settings dict is not required to contain "fill_nan"
                if settings.get("fill_nan") == "drop":
                    raise UserWarning(
                        "The fill_nan setting 'drop' for a stress is not allowed "
                        "because the stress time series need to be equidistant. "
                        "Please change this."
                    )
            validate_stress(series)

        # Store a copy of the original series
        self._series_original = series.copy()
        self._series = None
        self.freq_original = _infer_fixed_freq(self._series_original.index)

        self.settings = {
            "freq": self.freq_original,
            "sample_up": None,
            "sample_down": None,
            "fill_nan": "interpolate",
            "fill_before": None,
            "fill_after": None,
            "tmin": series.first_valid_index(),
            "tmax": series.last_valid_index(),
            "time_offset": pd.Timedelta(0),
        }
        self.metadata = {"x": 0.0, "y": 0.0, "z": 0.0, "projection": None}

        # Use user provided name or set from series
        if name is None:
            name = series.name
        self.name = validate_name(name)
        self._series_original.name = self.name

        if metadata is not None:
            self.metadata.update(metadata)

        # Update the settings with user-provided values, if any.
        if settings:
            if isinstance(settings, str):
                if settings not in self._predefined_settings.keys():
                    msg = (
                        "Settings shortcut code '%s' is not in the predefined "
                        "settings options. Please choose from %s."
                    )
                    options = list(self._predefined_settings.keys())
                    raise KeyError(msg % (settings, options))
                settings = self._predefined_settings[settings]
            self._update_settings(**settings)

        self.update_series(force_update=True, **self.settings)

    def __repr__(self) -> str:
        """Return a simple string representation of the time series."""
        return (
            f"{self.__class__.__name__}"
            f"(name={self.name}, "
            f"freq={self.settings['freq']}, "
            f"freq_original={self.freq_original}, "
            f"tmin={self.settings['tmin']}, "
            f"tmax={self.settings['tmax']})"
        )

    @property
    def series_original(self) -> Series:
        """The stored copy of the series as provided by the user."""
        return self._series_original

    @series_original.setter
    def series_original(self, series: Series) -> None:
        """Replace the original series and recompute the processed series.

        The new series is validated as a stress, `freq_original` is inferred
        again, and `tmin`/`tmax` are reset to the extent of the new index.
        """
        validate_stress(series)
        self._series_original = series.copy()
        # Same inference as in __init__, so freq_original is handled the same
        # way by _change_frequency regardless of how the series was set.
        self.freq_original = _infer_fixed_freq(self._series_original.index)
        self.settings["tmin"] = series.index.min()  # reset tmin
        self.settings["tmax"] = series.index.max()  # reset tmax
        self.update_series(force_update=True, **self.settings)

    @property
    def series(self) -> Series:
        """The processed series, computed from `series_original`."""
        return self._series

    @series.setter
    def series(self, value):
        raise AttributeError(
            "You cannot set series by yourself, as it is calculated from "
            "series_original. Please set series_original to update the series."
        )

    def update_series(self, force_update: bool = False, **kwargs) -> None:
        """Method to update the series with new options.

        Parameters
        ----------
        force_update: bool, optional
            argument that is used to force an update, even when no changes are
            found. Internally used by the __init__ method. Default is False.
        freq: str, optional
            String representing the desired frequency of the time series. Must be
            one of the following: (D, h, m, s, ms, us, ns) or a multiple of that
            e.g. "7D".
        sample_up: str or float, optional
            String with the method to use when the frequency is increased (e.g.,
            Weekly to daily). Possible values are: "backfill", "bfill", "pad",
            "ffill", "mean", "interpolate", "divide" or a float value to fill the
            gaps.
        sample_down: str, optional
            String with the method to use when the frequency decreases (e.g., from
            daily to weekly values). Possible values are: "mean", "drop", "sum",
            "min", "max".
        fill_nan: str or float, optional
            Method to use when there are nan-values in the time series. Possible
            values are: "mean", "drop", "interpolate" (default) or a float value.
        fill_before: str or float, optional
            Method used to extend a time series before any measurements are
            available. Possible values are: "mean", "bfill" or a float value.
        fill_after: str or float, optional
            Method used to extend a time series after any measurements are
            available. Possible values are: "mean", "ffill" or a float value.
        tmin: str or pandas.Timestamp, optional
            String that can be converted to, or a Pandas Timestamp with the minimum
            time of the series.
        tmax: str or pandas.Timestamp, optional
            String that can be converted to, or a Pandas Timestamp with the maximum
            time of the series.

        Notes
        -----
        The method will validate if any of the settings is changed to determine if
        the series need to be updated.
        """
        if not (self._update_settings(**kwargs) or force_update):
            return

        tmin = self.settings["tmin"]
        freq = self.settings["freq"]
        if tmin is not None and freq is not None:
            self.settings["time_offset"] = _get_time_offset(tmin, freq)

        # Always start again from the untouched original series
        series = self._series_original.copy(deep=True)

        # Only fill_nans if necessary
        if series.hasnans:
            series = self._fill_nan(series)

        # Update the series with the new settings
        series = self._change_frequency(series)
        series = self._fill_before(series)
        series = self._fill_after(series)
        series.name = self._series_original.name

        self._series = series

    def _update_settings(self, **kwargs) -> bool:
        """Internal method that checks if an update is actually necessary.

        Values that are None are ignored; `tmin` and `tmax` are converted to
        pandas.Timestamp before they are compared and stored.

        Returns
        -------
        update: bool
            True if settings are changed and series need to be updated.
        """
        update = False
        for key, value in kwargs.items():
            if value is None:
                continue  # None means "leave this setting as it is"
            if key in ("tmin", "tmax"):
                value = pd.Timestamp(value)
            if value != self.settings[key]:
                self.settings[key] = value
                update = True
        return update

    def _change_frequency(self, series: Series) -> Series:
        """Method to change the frequency of the time series."""
        freq = self.settings["freq"]

        # 1. If no freq string is present or is provided (e.g. Oseries)
        # 2. If new frequency is the same
        if not freq or freq == self.freq_original:
            return series

        # 3. If new frequency is required (only up or down sampling allowed)
        dt_new = _get_dt(freq)
        dt_org = _get_dt(self.freq_original)

        if dt_new < dt_org:
            # Smaller time step than the original: more values are needed
            series = self._sample_up(series)
        elif dt_new > dt_org:
            # Larger time step than the original: values are aggregated
            series = self._sample_down(series)

        # Drop nan-values at the beginning and end of the time series
        return series.loc[series.first_valid_index() : series.last_valid_index()]

    def _sample_up(self, series: Series) -> Series:
        """Resample the time series when the frequency increases (e.g. from weekly
        to daily values).

        An unsupported `sample_up` setting (including None) is logged as a
        warning and the series is returned unchanged.
        """
        method = self.settings["sample_up"]
        freq = self.settings["freq"]
        success = True

        if method in ["backfill", "bfill", "pad", "ffill"]:
            series = series.asfreq(freq, method=method)
        elif method == "mean":
            series = series.asfreq(freq).fillna(series.mean())
        elif method == "interpolate":
            series = series.asfreq(freq).interpolate(method="time")
        elif method == "divide":
            # Spread each value over the number of new steps it covers
            dt = series.index.to_series().diff() / Timedelta(to_offset(freq))
            series = series / dt
            series = series.asfreq(freq, method="bfill")
        elif isinstance(method, float):
            series = series.asfreq(freq).fillna(method)
        else:
            success = False

        if success:
            logger.info("Time Series '%s' were sampled up using %s.", self.name, method)
        else:
            logger.warning(
                "Time Series '%s': User-defined option for sample_up %s is not "
                "supported",
                self.name,
                method,
            )

        return series

    def _sample_down(self, series: Series) -> Series:
        """Resample the time series when the frequency decreases (e.g. from daily
        to weekly values).

        Notes
        -----
        make sure the labels are still at the end of each period, and data at the
        right-side of the bucket is included (see
        http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.resample.html)
        """
        method = self.settings["sample_down"]
        freq = self.settings["freq"]
        time_offset = self.settings["time_offset"]

        # when a multiple freq is used (like '7D') make sure the first record
        # has a rounded index
        # TODO: check if we can replace this with origin with pandas 1.1.0
        start_time = series.index[0].ceil(freq) + time_offset
        series = series.loc[start_time:]

        # TODO: replace by adding offset to resample method with pandas 1.1.0
        # Shift time series back by offset so resample can take it into account
        if time_offset > pd.Timedelta(0):
            series = series.shift(-1, freq=time_offset)

        success = True
        if method == "mean":
            series = resample(series, freq).mean()
        elif method == "drop":
            series = resample(series, freq).mean().dropna()
        elif method == "sum":
            series = resample(series, freq).sum()
        elif method == "min":
            series = resample(series, freq).min()
        elif method == "max":
            series = resample(series, freq).max()
        else:
            success = False

        # TODO: replace by adding offset to resample method with pandas 1.1.0
        if time_offset > pd.Timedelta(0):
            # The offset is removed by the resample-method, so we add it again
            series = series.shift(1, freq=time_offset)

        if success:
            logger.info(
                "Time Series '%s' was sampled down to freq %s with method %s.",
                self.name,
                freq,
                method,
            )
        else:
            logger.warning(
                "Time Series '%s': User-defined option for sample down %s is not "
                "supported",
                self.name,
                method,
            )

        return series

    def _fill_nan(self, series: Series) -> Series:
        """Fill up the nan-values when present.

        An unsupported `fill_nan` setting is logged as a warning and the series
        is returned unchanged.
        """
        method = self.settings["fill_nan"]
        n = series.isnull().values.sum()
        success = True

        if method == "drop":
            series = series.dropna()
        elif method == "mean":
            series = series.fillna(series.mean())
        elif method == "interpolate":
            series = series.interpolate(method="time")
        elif isinstance(method, float):
            series = series.fillna(method)
        else:
            success = False

        if success:
            logger.info(
                "Time Series '%s': %s nan-value(s) was/were found and filled with: %s.",
                self.name,
                n,
                method,
            )
        else:
            logger.warning(
                "Time Series '%s': User-defined option for fill_nan %s is not supported.",
                self.name,
                method,
            )

        return series

    def _fill_before(self, series: Series) -> Series:
        """Method to add a period in front of the available time series.

        If `tmin` lies within the series, the series is trimmed to start at
        `tmin`; if it lies before the series, the series is extended back to
        `tmin` using the `fill_before` setting.

        Raises
        ------
        ValueError
            If `tmin` is later than the last value of the series, or if the series
            has to be extended while `fill_before` is None.
        """
        freq = self.settings["freq"]
        method = self.settings["fill_before"]
        tmin = self.settings["tmin"]

        if tmin is None:
            return series

        tmin = pd.Timestamp(tmin)
        if tmin > series.index.max():
            msg = (
                "The tmin is later than the last value of the time series. Pastas "
                "does not support this. Please extend time series manually."
            )
            logger.error(msg)
            raise ValueError(msg)

        if tmin >= series.index.min():
            return series.loc[tmin:]

        # The last generated timestamp is left out before merging the indices
        index_extend = pd.date_range(start=tmin, end=series.index.min(), freq=freq)
        series = series.reindex(series.index.union(index_extend[:-1]))

        if method == "mean":
            mean_value = series.mean()
            series = series.fillna(mean_value)
            logger.info(
                "Time Series '%s' was extended in the past to %s with the mean "
                "value (%.2g) of the time series.",
                self.name,
                series.index.min(),
                mean_value,
            )
        elif method == "bfill":
            first_value = series.at[series.first_valid_index()]
            # fillna(method=...) is deprecated in recent pandas versions
            series = series.bfill()
            logger.info(
                "Time Series '%s' was extended in the past to %s with the first "
                "value (%.2g) of the time series.",
                self.name,
                series.index.min(),
                first_value,
            )
        elif isinstance(method, float):
            series = series.fillna(method)
            logger.info(
                "Time Series '%s' was extended in the past to %s by adding %s "
                "values.",
                self.name,
                series.index.min(),
                method,
            )
        elif method is None:
            msg = (
                "Time Series '%s': cannot be extended into past to %s as "
                "'fill_before' method is 'None'. Provide settings to stress model, "
                "e.g. `ps.StressModel(stress, settings='prec')`."
            )
            logger.error(msg, self.name, series.index.min())
            raise ValueError(msg % (self.name, series.index.min()))
        else:
            logger.warning(
                "Time Series '%s': User-defined option for fill_before '%s' is not "
                "supported.",
                self.name,
                method,
            )

        return series

    def _fill_after(self, series: Series) -> Series:
        """Method to add a period after the available time series.

        If `tmax` lies within the series, the series is trimmed to end at `tmax`;
        if it lies after the series, the series is extended up to `tmax` using
        the `fill_after` setting.

        Raises
        ------
        ValueError
            If `tmax` is before the first value of the series, or if the series
            has to be extended while `fill_after` is None.
        """
        freq = self.settings["freq"]
        method = self.settings["fill_after"]
        tmax = self.settings["tmax"]

        if tmax is None:
            return series

        tmax = pd.Timestamp(tmax)
        # Strictly before: a tmax equal to the first timestamp is still valid
        if tmax < series.index.min():
            msg = (
                "The tmax is before the first value of the time series. Pastas does "
                "not support this. Please extend time series manually."
            )
            logger.error(msg)
            raise ValueError(msg)

        if tmax <= series.index.max():
            return series.loc[:tmax]

        index_extend = pd.date_range(start=series.index.max(), end=tmax, freq=freq)
        series = series.reindex(series.index.union(index_extend))

        if method == "mean":
            mean_value = series.mean()
            series = series.fillna(mean_value)
            logger.info(
                "Time Series '%s' was extended in the future to %s with the mean "
                "value (%.2g) of the time series.",
                self.name,
                series.index.max(),
                mean_value,
            )
        elif method == "ffill":
            last_value = series.at[series.last_valid_index()]
            # fillna(method=...) is deprecated in recent pandas versions
            series = series.ffill()
            logger.info(
                "Time Series '%s' was extended in the future to %s with the last "
                "value (%.2g) of the time series.",
                self.name,
                series.index.max(),
                last_value,
            )
        elif isinstance(method, float):
            series = series.fillna(method)
            logger.info(
                "Time Series '%s' was extended in the future to %s by adding %s "
                "values.",
                self.name,
                series.index.max(),
                method,
            )
        elif method is None:
            msg = (
                "Time Series '%s': cannot be extended into future to %s as "
                "'fill_after' method is 'None'. Provide settings to stress model, "
                "e.g. `ps.StressModel(stress, settings='prec')`."
            )
            logger.error(msg, self.name, series.index.max())
            raise ValueError(msg % (self.name, series.index.max()))
        else:
            logger.warning(
                "Time Series '%s': User-defined option for fill_after '%s' is not "
                "supported",
                self.name,
                method,
            )

        return series

    def to_dict(self, series: Optional[Union[bool, str]] = True) -> dict:
        """Method to export the Time Series to a json format.

        Parameters
        ----------
        series: bool or str, optional
            True or "original" to export the original time series, "modified" to
            export the processed time series. Any other value (e.g. False) exports
            no series, only the name, settings and metadata.

        Returns
        -------
        data: dict
            dictionary with the necessary information to recreate the TimeSeries
            object completely.
        """
        data = {}

        if series is True or series == "original":
            data["series"] = self.series_original
        elif series == "modified":
            # Export the processed pandas.Series, not the TimeSeries object
            data["series"] = self.series

        data["name"] = self.name
        data["settings"] = self.settings
        data["metadata"] = self.metadata

        return data
def validate_stress(series: Series):
    """Validate a user-provided stress time series.

    Thin wrapper around the internal series validation with the check for
    equidistant time steps switched on.

    Parameters
    ----------
    series: pandas.Series
        Pandas.Series object containing the stress time series.

    Returns
    -------
    bool:
        True if the series is valid. If not, an error is raised.

    Notes
    -----
    The Series are validated for the following cases:

        0. Make sure the series is a Pandas.Series
        1. Make sure the values are floats
        2. Make sure the index is a DatetimeIndex
        3. Make sure the indices are datetime64
        4. Make sure the index has no NaT-values
        5. Make sure the index is monotonically increasing
        6. Make sure there are no duplicate indices
        7. Warn if the time series has nan-values
        8. Make sure the time series has equidistant time steps

    If any of these checks are not passed the method will throw an error that
    needs to be fixed by the user.

    Examples
    --------
    >>> ps.validate_stress(series)
    """
    # Stresses must have a regular time step, so the frequency check is enabled.
    is_valid = _validate_series(series, equidistant=True)
    return is_valid
def validate_oseries(series: Series):
    """Validate a user-provided observation (oseries) time series.

    Thin wrapper around the internal series validation with the check for
    equidistant time steps switched off.

    Parameters
    ----------
    series: pandas.Series
        Pandas.Series object containing the observed time series.

    Returns
    -------
    bool:
        True if the series is valid. If not, an error is raised.

    Notes
    -----
    The Series are validated for the following cases:

        0. Make sure the series is a Pandas.Series
        1. Make sure the values are floats
        2. Make sure the index is a DatetimeIndex
        3. Make sure the indices are datetime64
        4. Make sure the index has no NaT-values
        5. Make sure the index is monotonically increasing
        6. Make sure there are no duplicate indices
        7. Warn if the time series has nan-values

    If any of these checks are not passed the method will throw an error that
    needs to be fixed by the user.

    Examples
    --------
    >>> ps.validate_oseries(series)
    """
    # Observations may be irregular, so the frequency check is skipped.
    is_valid = _validate_series(series, equidistant=False)
    return is_valid
def _validate_series(series: Series, equidistant: bool = True):
    """Internal method to validate user-provided input time series.

    Parameters
    ----------
    series: pandas.Series
        Pandas.Series object containing the series time series. A DataFrame with
        a single column is accepted and treated as a Series.
    equidistant: bool, optional
        Whether the time series should have equidistant time step or not.

    Returns
    -------
    bool:
        True if the series is valid. If not, an error is raised.

    Raises
    ------
    ValueError
        If any of the checks fails. The message is also logged as an error.

    Notes
    -----
    NaN-values in the series only produce a warning. All other checks that are
    not passed will throw an error that needs to be fixed by the user.
    """
    # Because we are friendly and allow 1D DataFrames
    if isinstance(series, pd.DataFrame):
        if len(series.columns) == 1:
            series = series.iloc[:, 0]
        elif len(series.columns) > 1:
            # helpful specific message for multi-column DataFrames
            msg = "DataFrame with multiple columns. Please select one."
            logger.error(msg)
            raise ValueError(msg)

    # 0. Make sure it is a Series and not something else (e.g., DataFrame)
    if not isinstance(series, pd.Series):
        msg = "Expected a Pandas Series, got %s"
        logger.error(msg, type(series))
        raise ValueError(msg % type(series))

    name = series.name  # Only Series have a name, DataFrames do not

    # 1. Make sure the values are floats
    if not pd.api.types.is_float_dtype(series):
        msg = "Values of time series %s are not dtype=float."
        logger.error(msg, name)
        raise ValueError(msg % name)

    # 2. Make sure the index is a DatetimeIndex
    if not isinstance(series.index, pd.DatetimeIndex):
        msg = "Index of series %s is not a pandas.DatetimeIndex."
        logger.error(msg, name)
        raise ValueError(msg % name)

    # 3. Make sure the indices are datetime64
    if not pd.api.types.is_datetime64_dtype(series.index):
        msg = "Indices of series %s are not datetime64."
        logger.error(msg, name)
        raise ValueError(msg % name)

    # 4. Make sure there are no NaT in index
    if series.index.hasnans:
        msg = (
            "The index of series %s contains NaNs. "
            "Try to remove these with `series.loc[series.index.dropna()]`."
        )
        logger.error(msg, name)
        raise ValueError(msg % name)

    # 5. Make sure the index is monotonically increasing
    if not series.index.is_monotonic_increasing:
        msg = (
            "The time-indices of series %s are not monotonically increasing. Try "
            "to use `series.sort_index()` to fix it."
        )
        logger.error(msg, name)
        raise ValueError(msg % name)

    # 6. Make sure there are no duplicate indices
    if not series.index.is_unique:
        msg = (
            "duplicate time-indexes were found in the time series %s. Make sure "
            "there are no duplicate indices. For example by "
            "`grouped = series.groupby(level=0); series = grouped.mean()` "
            "or `series = series.loc[~series.index.duplicated(keep='first/last')]`"
        )
        logger.error(msg, name)
        raise ValueError(msg % name)

    # 7. Warn (do not fail) when the time series has nan-values
    if series.hasnans:
        msg = (
            "The Time Series '%s' has nan-values. Pastas will use the fill_nan "
            "settings to fill up the nan-values."
        )
        logger.warning(msg, name)

    # 8. Make sure the time series has equidistant time steps
    if equidistant:
        try:
            freq = pd.infer_freq(series.index)
        except ValueError:
            # pandas refuses to infer a frequency from fewer than three
            # timestamps; report that with the regular, logged message.
            freq = None
        if not freq:
            msg = (
                "The frequency of the index of time series %s could not be "
                "inferred. Please provide a time series with a regular time step."
            )
            logger.error(msg, name)
            raise ValueError(msg % name)

    # If all checks are passed, return True
    return True