# Source code for pastas.dataset
"""This module contains functions to load datasets from the pastas-data repository on
GitHub. The datasets are used for testing and examples in the documentation. The
load_dataset function can be used to load a single csv file or multiple csv files from
a subfolder in the pastas-data repository.
"""
from functools import lru_cache
from typing import Dict, List, Union
from pandas import DataFrame, read_csv
# Base URL of the GitHub REST "contents" API for the pastas-data repository.
# Requests to GITHUB_URL (optionally followed by a subfolder name) return a JSON
# listing of the files/directories at that path. Note the trailing slash.
GITHUB_URL = "https://api.github.com/repos/pastas/pastas-data/contents/"
@lru_cache
def load_dataset(name: str) -> Union[DataFrame, Dict[str, DataFrame]]:
    """Load csv-files from a subfolder in the pastas dataset repository on GitHub.

    Parameters
    ----------
    name : str
        The name of the subfolder, i.e., collenteur_2023. For a list of available
        datasets, see the pastas-data repository on GitHub
        (www.github.com/pastas/pastas-data).

    Returns
    -------
    Union[pd.DataFrame, Dict[str, pd.DataFrame]]
        The loaded dataset(s). If one csv file is found, returns a pandas DataFrame.
        If multiple csv files are found, returns a dictionary with file names as keys
        and dataframes as values.

    Raises
    ------
    Exception
        If the request status code is not 200 (OK), an exception is raised. This is
        likely due to an invalid folder name. Check the pastas-data repository on
        GitHub for available datasets. An exception is also raised if the folder
        contains no csv files.

    Examples
    --------
    >>> ps.load_dataset("collenteur_2021")
    Returns the dataset from the "collenteur_2021" subfolder as a pandas DataFrame.

    >>> ps.load_dataset("collenteur_2023")
    Returns a dictionary with datasets from the "collenteur_2023" subfolder. The keys
    are the file names and the values are pandas DataFrames.
    """
    # requests is an optional dependency; import lazily and raise a helpful error.
    try:
        import requests
    except ImportError:
        raise ImportError(
            "The requests package is required to load datasets from the pastas-data "
            "repository. Install requests using 'pip install requests'."
        )

    # List the folder contents via the GitHub API. GITHUB_URL already ends with a
    # slash, so only the folder name is appended (avoids a double slash in the URL).
    r = requests.get(f"{GITHUB_URL}{name}/")

    # A non-OK status is most likely a typo in the folder name.
    if r.status_code != 200:
        raise Exception(f"Error: {r.status_code}. Reason: {r.reason}. ")

    rjson = r.json()

    # An optional settings.json in the folder maps each csv file name to the
    # read_csv keyword arguments needed to parse it. Fall back to no kwargs when
    # the file (or an entry for a specific csv) is absent.
    settings_entries = [f for f in rjson if f["name"] == "settings.json"]
    if settings_entries:
        read_csv_kwargs = requests.get(settings_entries[0]["download_url"]).json()
    else:
        read_csv_kwargs = {}

    # Download every csv file in the folder, keyed by the file name without suffix.
    data = {}
    for file in rjson:
        fname = file["name"]
        if fname.endswith(".csv"):
            df = read_csv(file["download_url"], **read_csv_kwargs.get(fname, {}))
            data[fname.split(".")[0]] = df

    # A single csv is returned as a bare DataFrame; multiple csvs as a dictionary.
    if len(data) == 1:
        return next(iter(data.values()))
    elif len(data) > 1:
        return data
    else:
        raise Exception(
            f"No csv files found in the folder {name}. Check the pastas-data repository "
            "on GitHub for available datasets."
        )
@lru_cache
def list_datasets(silent: bool = True) -> List[str]:
    """Return a list of available datasets in the pastas-data repository on GitHub.

    Parameters
    ----------
    silent : bool, optional
        If False, also print the list of available datasets and a usage hint.
        Default is True (no printing).

    Returns
    -------
    list[str]
        A list of available datasets in the pastas-data repository on GitHub.

    Raises
    ------
    Exception
        If the request status code is not 200 (OK), an exception is raised.

    Examples
    --------
    >>> ps.list_datasets()
    Returns a list of available datasets in the pastas-data repository on GitHub.
    """
    # requests is an optional dependency; import lazily and raise a helpful error.
    try:
        import requests
    except ImportError:
        raise ImportError(
            "The requests package is required to load datasets from the pastas-data "
            "repository. Install requests using 'pip install requests'."
        )

    # List the repository root via the GitHub API.
    r = requests.get(GITHUB_URL)

    # A non-OK status usually indicates a connectivity or rate-limit problem.
    if r.status_code != 200:
        raise Exception(f"Error: {r.status_code}. Reason: {r.reason}. ")

    # Each top-level directory in the repository is one dataset.
    data = [file["name"] for file in r.json() if file["type"] == "dir"]

    # Optionally print the available datasets with a usage hint.
    if not silent:
        print("Available datasets in the pastas-data repository on GitHub:")
        for folder in data:
            print(f" - {folder}")
        print(
            "Use ps.load_dataset('folder_name') to load a dataset from the repository."
        )
    return data