Source code for pastas.timeseries

from logging import getLogger

# Type Hinting
from typing import Optional, Union

import pandas as pd
from pandas import Series
from pandas.tseries.frequencies import to_offset

from .rcparams import rcParams
from .timeseries_utils import _get_dt, _get_time_offset, _infer_fixed_freq, resample
from .utils import validate_name

logger = getLogger(__name__)


class TimeSeries:
    """Class that deals with all user-provided time series.

    Parameters
    ----------
    series: pandas.Series
        pandas.Series with pandas.DatetimeIndex.
    name: str, optional
        String with the name of the time series, if None is provided, pastas will
        try to derive the name from the series.
    settings: str or dict, optional
        String with the name of one of the predefined settings (obtained through
        ps.TimeSeries._predefined_settings.) or a dictionary with the settings to
        be applied. This does not have to include all the settings arguments.
    metadata: dict, optional
        Dictionary with metadata of the time series.

    Returns
    -------
    series: pastas.TimeSeries
        Returns a pastas.TimeSeries object.

    Examples
    --------
    To obtain the predefined TimeSeries settings, you can run the following line of
    code:

    >>> ps.rcParams["timeseries"]

    See Also
    --------
    pastas.timeseries.TimeSeries.update_series
        For the individual options for the different settings.
    """

    _predefined_settings = rcParams["timeseries"]

    def __init__(
        self,
        series: Series,
        name: Optional[str] = None,
        settings: Optional[Union[str, dict]] = None,
        metadata: Optional[dict] = None,
    ) -> None:
        """Validate the input series, store a copy and compute the working series.

        Raises
        ------
        UserWarning
            If a settings dictionary for a stress requests ``fill_nan="drop"``.
        KeyError
            If ``settings`` is a string that is not a predefined settings name.
        """
        # Make sure we have a Pandas Series and not a 1D-DataFrame
        if isinstance(series, pd.DataFrame) and len(series.columns) == 1:
            series = series.iloc[:, 0]
            logger.info(
                "1D-DataFrame was provided, automatically transformed to "
                "pandas.Series."
            )

        # Make sure we have a workable Pandas Series, depends on type of time series
        if settings == "oseries":
            validate_oseries(series)
        else:
            if settings is not None and not isinstance(settings, str):
                # The settings dict may be partial, so "fill_nan" can be absent.
                if settings.get("fill_nan") == "drop":
                    # Raised (not warned) on purpose: callers rely on the exception.
                    raise UserWarning(
                        "The fill_nan setting 'drop' for a stress is not allowed "
                        "because the stress time series need to be equidistant. "
                        "Please change this."
                    )
            validate_stress(series)

        # Store a copy of the original series; the working series is derived later
        self._series_original = series.copy()
        self._series = None

        self.freq_original = _infer_fixed_freq(self._series_original.index)
        self.settings = {
            "freq": self.freq_original,
            "sample_up": None,
            "sample_down": None,
            "fill_nan": "interpolate",
            "fill_before": None,
            "fill_after": None,
            "tmin": series.index.min(),
            "tmax": series.index.max(),
            "time_offset": pd.Timedelta(0),
        }
        self.metadata = {"x": 0.0, "y": 0.0, "z": 0.0, "projection": None}

        # Use user provided name or set from series
        if name is None:
            name = series.name
        self.name = validate_name(name)
        self._series_original.name = self.name

        if metadata is not None:
            self.metadata.update(metadata)

        # Update the settings with user-provided values, if any.
        if settings:
            if isinstance(settings, str):
                if settings in self._predefined_settings.keys():
                    settings = self._predefined_settings[settings]
                else:
                    error = (
                        f"Settings shortcut code '{settings}' is not in the "
                        f"predefined settings options. Please choose from"
                        f" {self._predefined_settings.keys()}"
                    )
                    raise KeyError(error)
            self._update_settings(**settings)

        self.update_series(force_update=True, **self.settings)

    def __repr__(self) -> str:
        """Return a simple string representation of the time series."""
        return (
            f"{self.__class__.__name__}"
            f"(name={self.name}, "
            f"freq={self.settings['freq']}, "
            f"freq_original={self.freq_original}, "
            f"tmin={self.settings['tmin']}, "
            f"tmax={self.settings['tmax']})"
        )

    @property
    def series_original(self) -> Series:
        """The stored copy of the series as provided by the user."""
        return self._series_original

    @series_original.setter
    def series_original(self, series: Series) -> None:
        """Replace the original series and recompute the working series.

        The new series is validated as a stress, ``freq_original`` is inferred
        again and ``tmin``/``tmax`` are reset to the extent of the new series.
        """
        validate_stress(series)
        self._series_original = series.copy()
        # Same inference helper as used in __init__, so both paths agree.
        self.freq_original = _infer_fixed_freq(self._series_original.index)
        self.settings["tmin"] = series.index.min()  # reset tmin
        self.settings["tmax"] = series.index.max()  # reset tmax
        self.update_series(force_update=True, **self.settings)

    @property
    def series(self) -> Series:
        """The working series, derived from series_original and the settings."""
        return self._series

    @series.setter
    def series(self, value: Series) -> None:
        """Refuse direct assignment; the series is derived from series_original."""
        raise AttributeError(
            "You cannot set series by yourself, as it is calculated from "
            "series_original. Please set series_original to update the series."
        )

    def update_series(self, force_update: bool = False, **kwargs) -> None:
        """Method to update the series with new options.

        Parameters
        ----------
        force_update: bool, optional
            argument that is used to force an update, even when no changes are
            found. Internally used by the __init__ method. Default is False.
        freq: str, optional
            String representing the desired frequency of the time series. Must be
            one of the following: (D, h, m, s, ms, us, ns) or a multiple of that
            e.g. "7D".
        sample_up: str or float, optional
            String with the method to use when the frequency is increased (e.g.,
            Weekly to daily). Possible values are: "backfill", "bfill", "pad",
            "ffill", "mean", "interpolate", "divide" or a float value to fill the
            gaps.
        sample_down: str, optional
            String with the method to use when the frequency decreases (e.g., from
            daily to weekly values). Possible values are: "mean", "drop", "sum",
            "min", "max".
        fill_nan: str or float, optional
            Method to use when there are nan-values in the time series. Possible
            values are: "mean", "drop", "interpolate" (default) or a float value.
        fill_before: str or float, optional
            Method used to extend a time series before any measurements are
            available. Possible values are: "mean", "bfill" or a float value.
        fill_after: str or float, optional
            Method used to extend a time series after any measurements are
            available. Possible values are: "mean", "ffill" or a float value.
        tmin: str or pandas.Timestamp, optional
            String that can be converted to, or a Pandas Timestamp with the minimum
            time of the series.
        tmax: str or pandas.Timestamp, optional
            String that can be converted to, or a Pandas Timestamp with the maximum
            time of the series.

        Notes
        -----
        The method will validate if any of the settings is changed to determine if
        the series need to be updated.
        """
        # Settings are always merged first; recompute only on change or when forced
        if self._update_settings(**kwargs) or force_update:
            tmin = self.settings["tmin"]
            freq = self.settings["freq"]

            if tmin is not None and freq is not None:
                self.settings["time_offset"] = _get_time_offset(tmin, freq)

            # Get the original series to start with
            series = self._series_original.copy(deep=True)

            # Only fill_nans if necessary
            if series.hasnans:
                series = self._fill_nan(series)

            # Update the series with the new settings
            series = self._change_frequency(series)
            series = self._fill_before(series)
            series = self._fill_after(series)
            series.name = self._series_original.name

            self._series = series

    def _update_settings(self, **kwargs) -> bool:
        """Internal method that check if an update is actually necessary.

        Values that are None are ignored; tmin and tmax are converted to
        pandas.Timestamp before they are compared and stored.

        Returns
        -------
        update: bool
            True if settings are changed and series need to be updated.
        """
        update = False
        for key, value in kwargs.items():
            if value is None:
                continue
            if key in ("tmin", "tmax"):
                value = pd.Timestamp(value)
            if value != self.settings[key]:
                self.settings[key] = value
                update = True
        return update

    def _change_frequency(self, series: Series) -> Series:
        """Method to change the frequency of the time series."""
        freq = self.settings["freq"]

        # 1. If no freq string is present or is provided (e.g. Oseries)
        # 2. If new frequency is the same
        if not freq or freq == self.freq_original:
            return series

        # 3. If new frequency is required (only up or down sampling allowed)
        dt_new = _get_dt(freq)
        dt_org = _get_dt(self.freq_original)

        if dt_new < dt_org:
            # Smaller time step than the original: sample up
            series = self._sample_up(series)
        elif dt_new > dt_org:
            # Larger time step than the original: sample down
            series = self._sample_down(series)

        # Drop nan-values at the beginning and end of the time series
        series = series.loc[series.first_valid_index() : series.last_valid_index()]

        return series

    def _sample_up(self, series: Series) -> Series:
        """Resample the time series when the frequency increases (e.g. from weekly
        to daily values)."""
        method = self.settings["sample_up"]
        freq = self.settings["freq"]
        success = True

        if method in ("backfill", "bfill", "pad", "ffill"):
            series = series.asfreq(freq, method=method)
        elif method == "mean":
            series = series.asfreq(freq).fillna(series.mean())
        elif method == "interpolate":
            series = series.asfreq(freq).interpolate(method="time")
        elif method == "divide":
            # Spread each value evenly over the new time steps it covers.
            # pd.Timedelta(offset) replaces the deprecated ``Tick.delta``.
            dt = series.index.to_series().diff() / pd.Timedelta(to_offset(freq))
            series = series / dt
            series = series.asfreq(freq, method="bfill")
        elif isinstance(method, float):
            series = series.asfreq(freq).fillna(method)
        else:
            # Includes method=None, i.e. no sample_up option was configured
            success = False

        if success:
            logger.info("Time Series %s were sampled up using %s.", self.name, method)
        else:
            logger.warning(
                "Time Series %s: User-defined option for sample_up %s is not "
                "supported",
                self.name,
                method,
            )

        return series

    def _sample_down(self, series: Series) -> Series:
        """Resample the time series when the frequency decreases (e.g. from daily
        to weekly values).

        Notes
        -----
        make sure the labels are still at the end of each period, and data at the
        right-side of the bucket is included (see
        http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.resample.html)
        """
        method = self.settings["sample_down"]
        freq = self.settings["freq"]
        time_offset = self.settings["time_offset"]
        has_offset = time_offset > pd.Timedelta(0)

        # when a multiple freq is used (like '7D') make sure the first record
        # has a rounded index
        # TODO: check if we can replace this with origin with pandas 1.1.0
        start_time = series.index[0].ceil(freq) + time_offset
        series = series.loc[start_time:]

        # TODO: replace by adding offset to resample method with pandas 1.1.0
        # Shift time series back by offset so resample can take it into account
        if has_offset:
            series = series.shift(-1, freq=time_offset)

        success = True
        if method == "mean":
            series = resample(series, freq).mean()
        elif method == "drop":
            series = resample(series, freq).mean().dropna()
        elif method == "sum":
            series = resample(series, freq).sum()
        elif method == "min":
            series = resample(series, freq).min()
        elif method == "max":
            series = resample(series, freq).max()
        else:
            success = False

        # TODO: replace by adding offset to resample method with pandas 1.1.0
        if has_offset:
            # The offset is removed by the resample-method, so we add it again
            series = series.shift(1, freq=time_offset)

        if success:
            logger.info(
                "Time Series %s was sampled down to freq %s with method " "%s.",
                self.name,
                freq,
                method,
            )
        else:
            logger.warning(
                "Time Series %s: User-defined option for sample down %s is not "
                "supported",
                self.name,
                method,
            )

        return series

    def _fill_nan(self, series: Series) -> Series:
        """Fill up the nan-values when present."""
        method = self.settings["fill_nan"]
        n = series.isnull().values.sum()  # number of nan-values, for the log only
        success = True

        if method == "drop":
            series = series.dropna()
        elif method == "mean":
            series = series.fillna(series.mean())
        elif method == "interpolate":
            series = series.interpolate(method="time")
        elif isinstance(method, float):
            series = series.fillna(method)
        else:
            success = False

        if success:
            logger.info(
                "Time Series %s: %s nan-value(s) was/were found and filled with: %s.",
                self.name,
                n,
                method,
            )
        else:
            logger.warning(
                "Time Series %s: User-defined option for fill_nan %s is not supported.",
                self.name,
                method,
            )

        return series

    def _fill_before(self, series: Series) -> Series:
        """Method to add a period in front of the available time series."""
        freq = self.settings["freq"]
        method = self.settings["fill_before"]
        tmin = self.settings["tmin"]

        if tmin is None:
            return series

        tmin = pd.Timestamp(tmin)

        if tmin > series.index.max():
            logger.error(
                "The tmin is later than the last value of the time series. Pastas "
                "does not support this. Please extend time series manually."
            )
            return series

        if tmin >= series.index.min():
            # tmin lies within the data: only trim
            return series.loc[tmin:]

        # tmin lies before the data: extend the index, then fill the new values.
        # The last entry is dropped to avoid duplicating the first data point.
        index_extend = pd.date_range(start=tmin, end=series.index.min(), freq=freq)
        series = series.reindex(series.index.union(index_extend[:-1]))

        if method == "mean":
            mean_value = series.mean()
            series = series.fillna(mean_value)
            logger.info(
                "Time Series %s was extended in the past to %s with the mean "
                "value (%.2g) of the time series.",
                self.name,
                series.index.min(),
                mean_value,
            )
        elif method == "bfill":
            first_value = series.loc[series.first_valid_index()]
            # Series.bfill() replaces the deprecated fillna(method="bfill")
            series = series.bfill()
            logger.info(
                "Time Series %s was extended in the past to %s with the first "
                "value (%.2g) of the time series.",
                self.name,
                series.index.min(),
                first_value,
            )
        elif isinstance(method, float):
            series = series.fillna(method)
            logger.info(
                "Time Series %s was extended in the past to %s by adding %s "
                "values.",
                self.name,
                series.index.min(),
                method,
            )
        else:
            logger.info(
                "Time Series %s: User-defined option for fill_before '%s' is not "
                "supported.",
                self.name,
                method,
            )

        return series

    def _fill_after(self, series: Series) -> Series:
        """Method to add a period after the available time series."""
        freq = self.settings["freq"]
        method = self.settings["fill_after"]
        tmax = self.settings["tmax"]

        if tmax is None:
            return series

        tmax = pd.Timestamp(tmax)

        # Strict comparison, mirroring _fill_before: a tmax equal to the first
        # index is not "before" the series and is handled by the trim below.
        if tmax < series.index.min():
            logger.error(
                "The tmax is before the first value of the time series. Pastas does "
                "not support this. Please extend time series manually."
            )
            return series

        if tmax <= series.index.max():
            # tmax lies within the data: only trim
            return series.loc[:tmax]

        # tmax lies after the data: extend the index, then fill the new values
        index_extend = pd.date_range(start=series.index.max(), end=tmax, freq=freq)
        series = series.reindex(series.index.union(index_extend))

        if method == "mean":
            mean_value = series.mean()
            series = series.fillna(mean_value)
            logger.info(
                "Time Series %s was extended in the future to %s with the mean "
                "value (%.2g) of the time series.",
                self.name,
                series.index.max(),
                mean_value,
            )
        elif method == "ffill":
            last_value = series.loc[series.last_valid_index()]
            # Series.ffill() replaces the deprecated fillna(method="ffill")
            series = series.ffill()
            logger.info(
                "Time Series %s was extended in the future to %s with the last "
                "value (%.2g) of the time series.",
                self.name,
                series.index.max(),
                last_value,
            )
        elif isinstance(method, float):
            series = series.fillna(method)
            logger.info(
                "Time Series %s was extended in the future to %s by adding %s "
                "values.",
                self.name,
                series.index.max(),
                method,
            )
        else:
            logger.info(
                "Time Series %s: User-defined option for fill_after '%s' is not "
                "supported",
                self.name,
                method,
            )

        return series

    def to_dict(self, series: Union[bool, str] = True) -> dict:
        """Method to export the Time Series to a json format.

        Parameters
        ----------
        series: bool or str, optional
            True or "original" to export the original time series, "modified" to
            export the series after the settings are applied. Any other value
            (e.g. False) leaves the series out of the export.

        Returns
        -------
        data: dict
            dictionary with the necessary information to recreate the TimeSeries
            object completely.
        """
        data = {}

        if series is True or series == "original":
            data["series"] = self.series_original
        elif series == "modified":
            # Export the derived pandas Series, not the TimeSeries object itself
            data["series"] = self.series

        data["name"] = self.name
        data["settings"] = self.settings
        data["metadata"] = self.metadata

        return data
def validate_stress(series: Series):
    """Method to validate user-provided stress input time series.

    Parameters
    ----------
    series: pandas.Series
        Pandas.Series object containing the series time series.

    Raises
    ------
    ValueError
        If any of the checks 0-5 or 7 listed below fails.

    Notes
    -----
    The Series are validated for the following cases:

        0. Make sure the series is a Pandas.Series
        1. Make sure the values are floats
        2. Make sure the index is a DatetimeIndex
        3. Make sure the indices are datetime64
        4. Make sure the index is monotonically increasing
        5. Make sure there are no duplicate indices
        6. Warn (without raising) if the time series has nan-values
        7. Make sure the time series has equidistant time steps

    If any of these checks (except 6) are not passed the method will throw an error
    that needs to be fixed by the user.

    Examples
    --------
    >>> ps.validate_stress(series)
    """
    _validate_series(series, equidistant=True)


def validate_oseries(series: Series):
    """Method to validate user-provided oseries input time series.

    Parameters
    ----------
    series: pandas.Series
        Pandas.Series object containing the series time series.

    Raises
    ------
    ValueError
        If any of the checks 0-5 listed below fails.

    Notes
    -----
    The Series are validated for the following cases:

        0. Make sure the series is a Pandas.Series
        1. Make sure the values are floats
        2. Make sure the index is a DatetimeIndex
        3. Make sure the indices are datetime64
        4. Make sure the index is monotonically increasing
        5. Make sure there are no duplicate indices
        6. Warn (without raising) if the time series has nan-values

    If any of these checks (except 6) are not passed the method will throw an error
    that needs to be fixed by the user.

    Examples
    --------
    >>> ps.validate_oseries(series)
    """
    _validate_series(series, equidistant=False)


def _validate_series(series: Series, equidistant: bool = True):
    """Internal method to validate user-provided input time series.

    Parameters
    ----------
    series: pandas.Series
        Pandas.Series object containing the series time series.
    equidistant: bool, optional
        Whether the time series should have equidistant time step or not.

    Raises
    ------
    ValueError
        If the input is not a float-valued pandas.Series with a unique,
        monotonically increasing datetime64 DatetimeIndex, or (when
        ``equidistant`` is True) if no frequency can be inferred from the index.

    Notes
    -----
    If any of these checks are not passed the method will throw an error that needs
    to be fixed by the user. Nan-values only trigger a warning.
    """
    # Because we are friendly and allow 1D DataFrames
    if isinstance(series, pd.DataFrame) and len(series.columns) == 1:
        series = series.iloc[:, 0]

    # 0. Make sure it is a Series and not something else (e.g., DataFrame)
    if not isinstance(series, pd.Series):
        msg = f"Expected a Pandas Series, got {type(series)}"
        logger.error(msg)
        raise ValueError(msg)

    name = series.name  # Only Series have a name, DataFrames do not

    # 1. Make sure the values are floats
    if not pd.api.types.is_float_dtype(series):
        msg = f"Values of time series {name} are not dtype=float."
        logger.error(msg)
        raise ValueError(msg)

    # 2. Make sure the index is a DatetimeIndex
    if not isinstance(series.index, pd.DatetimeIndex):
        msg = f"Index of series {name} is not a pandas.DatetimeIndex."
        logger.error(msg)
        raise ValueError(msg)

    # 3. Make sure the indices are datetime64
    if not pd.api.types.is_datetime64_dtype(series.index):
        msg = f"Indices of series {name} are not datetime64."
        logger.error(msg)
        raise ValueError(msg)

    # 4. Make sure the index is monotonically increasing
    if not series.index.is_monotonic_increasing:
        msg = (
            f"The time-indices of series {name} are not monotonically increasing. Try "
            f"to use `series.sort_index()` to fix it."
        )
        logger.error(msg)
        raise ValueError(msg)

    # 5. Make sure there are no duplicate indices
    if not series.index.is_unique:
        msg = (
            f"duplicate time-indexes were found in the time series {name}. Make sure "
            f"there are no duplicate indices. For example by "
            f"`grouped = series.groupby(level=0); series = grouped.mean()`"
        )
        logger.error(msg)
        raise ValueError(msg)

    # 6. Nan-values are allowed, but the user is told how they will be handled
    if series.hasnans:
        msg = (
            "The time series %s has nan-values. Pastas will use the fill_nan "
            "settings to fill up the nan-values."
        )
        logger.warning(msg, name)

    # 7. Make sure the time series has equidistant time steps
    if equidistant:
        try:
            freq = pd.infer_freq(series.index)
        except ValueError:
            # pandas raises its own ValueError for fewer than three timestamps;
            # treat that as "not inferable" so the informative error below is used
            freq = None
        if not freq:
            msg = (
                f"The frequency of the index of time series {name} could not be "
                f"inferred. Please provide a time series with a regular time step."
            )
            logger.error(msg)
            raise ValueError(msg)