Skip to content

Commit

Permalink
Merge pull request #303 from CSHS-CWRA/add-platform-caching
Browse files Browse the repository at this point in the history
Add platformdirs for consistent testing data caching
  • Loading branch information
Zeitsperre authored Sep 5, 2023
2 parents 5be0d41 + dd306b0 commit 78e8927
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 41 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ on:
pull_request:

env:
RAVENPY_TESTDATA_BRANCH: master
RAVEN_TESTING_DATA_BRANCH: master

jobs:
black:
Expand Down
6 changes: 6 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@
History
=======

0.12.3 (2023-08-25)
-------------------

* `RavenPy` now uses `platformdirs` to write `raven_testing_data` to the user's cache directory. The cache path is now determined dynamically according to the user's operating system. Developers can now safely delete the `.raven_testing_data` folder in their home directory without affecting the functionality of `RavenPy`.
* Updated `raven-hydro` to v0.2.4 to address CMake build issues.

0.12.2 (2023-07-04)
-------------------
This release is primarily a bugfix to address issues arising from dependencies.
Expand Down
2 changes: 1 addition & 1 deletion environment-rtd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ channels:
- defaults
dependencies:
- python >=3.9,<3.10 # fixed to reduce solver time
- raven-hydro ==0.2.3
- raven-hydro ==0.2.4
- autodoc-pydantic
- click
# - clisops # mocked
Expand Down
3 changes: 2 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ channels:
- defaults
dependencies:
- python >=3.8,<3.12
- raven-hydro ==0.2.3
- raven-hydro ==0.2.4
- affine
- cftime
- cf_xarray
Expand All @@ -25,6 +25,7 @@ dependencies:
- owslib <0.29.0 # see: https://github.com/geopython/OWSLib/issues/871
- pandas
- pint >=0.20
- platformdirs
- pre-commit
- pydantic >=1.10.8,<2.0
- pymbolic
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ classifiers = [
dynamic = ["description", "version"]
dependencies = [
"cftime",
# cf-xarray is differently named on conda-forge
"cf-xarray[all]",
"click",
"climpred>=2.2",
Expand All @@ -50,7 +51,7 @@ dependencies = [
"pint>=0.20",
"pydantic>=1.10.8,<2.0",
"pymbolic",
"raven-hydro==0.2.3",
"raven-hydro==0.2.4",
"requests",
"scipy",
"spotpy",
Expand All @@ -73,6 +74,7 @@ dev = [
"hvplot",
"isort",
"mypy",
"platformdirs",
"pre-commit",
"pytest",
"pytest-cov",
Expand Down
44 changes: 24 additions & 20 deletions ravenpy/utilities/testdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@
from urllib.request import urlretrieve

import requests
from platformdirs import user_cache_dir
from xarray import Dataset
from xarray import open_dataset as _open_dataset

_default_cache_dir = Path.home() / ".raven_testing_data"
_default_cache_dir = user_cache_dir("raven_testing_data")

LOGGER = logging.getLogger("RAVEN")

Expand All @@ -36,9 +37,9 @@ def file_md5_checksum(fname):

def get_local_testdata(
patterns: Union[str, Sequence[str]],
temp_folder: Union[str, os.PathLike],
temp_folder: Union[str, Path],
branch: str = "master",
_local_cache: Union[str, os.PathLike] = _default_cache_dir,
_local_cache: Union[str, Path] = _default_cache_dir,
) -> Union[Path, List[Path]]:
"""Copy specific testdata from a default cache to a temporary folder.
Expand All @@ -48,11 +49,11 @@ def get_local_testdata(
----------
patterns : str or Sequence of str
Glob patterns, which must include the folder.
temp_folder : str or os.PathLike
temp_folder : str or Path
Target folder to copy files and filetree to.
branch : str, optional
For GitHub-hosted files, the branch to download from.
_local_cache : str or os.PathLike
branch : str
For GitHub-hosted files, the branch to download from. Default: "master".
_local_cache : str or Path
Local cache of testing data.
Returns
Expand Down Expand Up @@ -170,24 +171,24 @@ def _get(

# idea copied from xclim that borrowed it from xarray that was borrowed from Seaborn
def get_file(
name: Union[str, os.PathLike, Sequence[Union[str, os.PathLike]]],
name: Union[str, Path, Sequence[Union[str, Path]]],
github_url: str = "https://github.com/Ouranosinc/raven-testdata",
branch: str = "master",
cache_dir: Path = _default_cache_dir,
cache_dir: Union[str, Path] = _default_cache_dir,
) -> Union[Path, List[Path]]:
"""
Return a file from an online GitHub-like repository.
If a local copy is found then always use that to avoid network traffic.
Parameters
----------
name : str or os.PathLike or Sequence of str or os.PathLike
name : str or Path or Sequence of str or Path
Name of the file or list/tuple of names of files containing the dataset(s) including suffixes.
github_url : str
URL to GitHub repository where the data is stored.
branch : str, optional
For GitHub-hosted files, the branch to download from.
cache_dir : Path
branch : str
For GitHub-hosted files, the branch to download from. Default: "master".
cache_dir : str or Path
The directory in which to search for and write cached data.
Returns
Expand All @@ -197,6 +198,8 @@ def get_file(
if isinstance(name, (str, Path)):
name = [name]

cache_dir = Path(cache_dir)

files = list()
for n in name:
fullname = Path(n)
Expand Down Expand Up @@ -234,8 +237,8 @@ def query_folder(
Regex pattern to identify a file.
github_url : str
URL to GitHub repository where the data is stored.
branch : str, optional
For GitHub-hosted files, the branch to download from.
branch : str
For GitHub-hosted files, the branch to download from. Default: "master".
Returns
-------
Expand Down Expand Up @@ -274,10 +277,10 @@ def open_dataset(
github_url: str = "https://github.com/Ouranosinc/raven-testdata",
branch: str = "master",
cache: bool = True,
cache_dir: Path = _default_cache_dir,
cache_dir: Union[str, Path] = _default_cache_dir,
**kwds,
) -> Dataset:
"""Open a dataset from the online GitHub-like repository.
r"""Open a dataset from the online GitHub-like repository.
If a local copy is found then always use that to avoid network traffic.
Expand All @@ -293,11 +296,11 @@ def open_dataset(
URL to GitHub repository where the data is stored.
branch : str, optional
For GitHub-hosted files, the branch to download from.
cache_dir : Path
The directory in which to search for and write cached data.
cache : bool
If True, then cache data locally for use on subsequent calls.
**kwds
cache_dir : str or Path
The directory in which to search for and write cached data.
\*\*kwds
For NetCDF files, keywords passed to xarray.open_dataset.
Returns
Expand All @@ -309,6 +312,7 @@ def open_dataset(
xarray.open_dataset
"""
name = Path(name)
cache_dir = Path(cache_dir)
if suffix is None:
suffix = ".nc"
fullname = name.with_suffix(suffix)
Expand Down
37 changes: 20 additions & 17 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,15 @@

from .common import _convert_2d, _convert_3d

TESTDATA_BRANCH = os.getenv("RAVENPY_TESTDATA_BRANCH", "master")
RAVEN_TESTING_DATA_BRANCH = os.getenv("RAVEN_TESTING_DATA_BRANCH", "master")
SKIP_TEST_DATA = os.getenv("RAVENPY_SKIP_TEST_DATA")
DEFAULT_CACHE = Path(_default_cache_dir)


def populate_testing_data(
temp_folder: Optional[Path] = None,
branch: str = TESTDATA_BRANCH,
_local_cache: Path = _default_cache_dir,
branch: str = RAVEN_TESTING_DATA_BRANCH,
_local_cache: Path = DEFAULT_CACHE,
) -> None:
if _local_cache.joinpath(".data_written").exists():
# This flag prevents multiple calls from re-attempting to download testing data in the same pytest run
Expand Down Expand Up @@ -128,7 +129,9 @@ def threadsafe_data_dir(tmp_path_factory) -> Path:
@pytest.fixture(scope="session")
def get_file(threadsafe_data_dir):
def _get_session_scoped_file(file: Union[str, Path]):
return _get_file(file, cache_dir=threadsafe_data_dir, branch=TESTDATA_BRANCH)
return _get_file(
file, cache_dir=threadsafe_data_dir, branch=RAVEN_TESTING_DATA_BRANCH
)

return _get_session_scoped_file

Expand All @@ -139,8 +142,8 @@ def _get_session_scoped_local_testdata(file: Union[str, Path]):
return _get_local_testdata(
file,
temp_folder=threadsafe_data_dir,
branch=TESTDATA_BRANCH,
_local_cache=_default_cache_dir,
branch=RAVEN_TESTING_DATA_BRANCH,
_local_cache=DEFAULT_CACHE,
)

return _get_session_scoped_local_testdata
Expand All @@ -150,22 +153,22 @@ def _get_session_scoped_local_testdata(file: Union[str, Path]):
def gather_session_data(threadsafe_data_dir, worker_id):
"""Gather testing data on pytest run.
When running pytest with multiple workers, one worker will copy data remotely to _default_cache_dir while
When running pytest with multiple workers, one worker will download the remote data to DEFAULT_CACHE while
the other workers wait on a lockfile. Once the lock is released, all workers will copy the data to their local
threadsafe_data_dir."""
if worker_id == "master":
if not SKIP_TEST_DATA:
populate_testing_data(branch=TESTDATA_BRANCH)
populate_testing_data(branch=RAVEN_TESTING_DATA_BRANCH)
else:
if not SKIP_TEST_DATA:
_default_cache_dir.mkdir(exist_ok=True)
test_data_being_written = FileLock(_default_cache_dir.joinpath(".lock"))
DEFAULT_CACHE.mkdir(exist_ok=True)
test_data_being_written = FileLock(DEFAULT_CACHE.joinpath(".lock"))
with test_data_being_written as fl:
# This flag prevents multiple calls from re-attempting to download testing data in the same pytest run
populate_testing_data(branch=TESTDATA_BRANCH)
_default_cache_dir.joinpath(".data_written").touch()
populate_testing_data(branch=RAVEN_TESTING_DATA_BRANCH)
DEFAULT_CACHE.joinpath(".data_written").touch()
fl.acquire()
shutil.copytree(_default_cache_dir, threadsafe_data_dir)
shutil.copytree(DEFAULT_CACHE, threadsafe_data_dir)


@pytest.fixture(scope="session", autouse=True)
Expand All @@ -176,7 +179,7 @@ def cleanup(request):
"""

def remove_data_written_flag():
flag = _default_cache_dir.joinpath(".data_written")
flag = DEFAULT_CACHE.joinpath(".data_written")
if flag.exists():
flag.unlink()

Expand All @@ -189,7 +192,7 @@ def q_sim_1(threadsafe_data_dir):
return _get_file(
"hydro_simulations/raven-gr4j-cemaneige-sim_hmets-0_Hydrographs.nc",
cache_dir=threadsafe_data_dir,
branch=TESTDATA_BRANCH,
branch=RAVEN_TESTING_DATA_BRANCH,
)


Expand Down Expand Up @@ -242,7 +245,7 @@ def salmon(threadsafe_data_dir):
return _get_file(
"raven-gr4j-cemaneige/Salmon-River-Near-Prince-George_meteo_daily.nc",
cache_dir=threadsafe_data_dir,
branch=TESTDATA_BRANCH,
branch=RAVEN_TESTING_DATA_BRANCH,
)


Expand Down Expand Up @@ -639,4 +642,4 @@ class TestConfig(Config):

#
if __name__ == "__main__":
populate_testing_data(branch=TESTDATA_BRANCH)
populate_testing_data(branch=RAVEN_TESTING_DATA_BRANCH)

0 comments on commit 78e8927

Please sign in to comment.