From 36faf3ea447c18c942aafdba1080cb40136907f0 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sun, 22 Nov 2020 17:49:36 -0500 Subject: [PATCH 01/34] wip: new recipe syntax --- pangeo_forge/recipe/recipe.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 pangeo_forge/recipe/recipe.py diff --git a/pangeo_forge/recipe/recipe.py b/pangeo_forge/recipe/recipe.py new file mode 100644 index 00000000..266b0506 --- /dev/null +++ b/pangeo_forge/recipe/recipe.py @@ -0,0 +1,28 @@ +""" +A Pangeo Forge Recipe +""" + +import xarray as xr +import fsspec + + +class Recipe: + datasets = [] + keys = [] + + def filenames_for_chunk(**chunk_keys): + raise NotImplementedError + + def open_file(self, fname): + # todo: caching + return fsspec.open(fname) + + def open_chunk(self, filenames): + files_to_open = [self.open_file(f) for f in filenames] + ds_chunk = xr.open_mfdataset(filenames, **self.open_chunk_kwargs) + return ds_chunk + + def store_chunk(self, store_target, **chunk_keys): + filenames = self.filenames_for_chunk(**chunk_keys) + ds_chunk = self.open_chunk(filenames) + ds_chunk.to_zarr(store_target, mode='a') From 7ee78f22c5108fc10009b9bcf8f7bb668b041a4d Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sat, 28 Nov 2020 21:32:01 -0500 Subject: [PATCH 02/34] messy wip --- pangeo_forge/__init__.py | 10 +- pangeo_forge/pipelines/__init__.py | 5 - pangeo_forge/recipe/__init__.py | 0 pangeo_forge/recipe/pipeline.py | 0 pangeo_forge/recipe/recipe.py | 154 +++++++++++++++++++++++++++-- pangeo_forge/recipe/target.py | 22 +++++ pangeo_forge/utils.py | 18 ---- requirements.txt | 1 - tests/__init__.py | 0 tests/fixtures.py | 88 +++++++++++++++++ tests/test_recipe.py | 48 +++++++++ tests/test_utils.py | 17 +++- 12 files changed, 322 insertions(+), 41 deletions(-) create mode 100644 pangeo_forge/recipe/__init__.py create mode 100644 pangeo_forge/recipe/pipeline.py create mode 100644 pangeo_forge/recipe/target.py create mode 100644 tests/__init__.py 
create mode 100644 tests/fixtures.py create mode 100644 tests/test_recipe.py diff --git a/pangeo_forge/__init__.py b/pangeo_forge/__init__.py index a9528c6e..722da05d 100644 --- a/pangeo_forge/__init__.py +++ b/pangeo_forge/__init__.py @@ -1,6 +1,6 @@ from pkg_resources import DistributionNotFound, get_distribution -from pangeo_forge.pipelines import AbstractPipeline +#from pangeo_forge.pipelines import AbstractPipeline try: __version__ = get_distribution(__name__).version @@ -10,7 +10,7 @@ del get_distribution, DistributionNotFound - -__all__ = [ - "AbstractPipeline", -] +# +# __all__ = [ +# "AbstractPipeline", +# ] diff --git a/pangeo_forge/pipelines/__init__.py b/pangeo_forge/pipelines/__init__.py index 122c0873..e69de29b 100644 --- a/pangeo_forge/pipelines/__init__.py +++ b/pangeo_forge/pipelines/__init__.py @@ -1,5 +0,0 @@ -from .base import AbstractPipeline - -__all__ = [ - "AbstractPipeline", -] diff --git a/pangeo_forge/recipe/__init__.py b/pangeo_forge/recipe/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pangeo_forge/recipe/pipeline.py b/pangeo_forge/recipe/pipeline.py new file mode 100644 index 00000000..e69de29b diff --git a/pangeo_forge/recipe/recipe.py b/pangeo_forge/recipe/recipe.py index 266b0506..3cc86180 100644 --- a/pangeo_forge/recipe/recipe.py +++ b/pangeo_forge/recipe/recipe.py @@ -4,25 +4,161 @@ import xarray as xr import fsspec +from ..utils import chunked_iterable +from .target import Target +### How to manually execute a recipe: ### +# +# r = PangeoForgeTarget() +# r = MyRecipe(**opts) # 1 +# r.set_target(tmp_dir) # 2 +# # manual execution of recipe +# r.prepare() # 3 +# for input_key in r.iter_inputs(): +# r.cache_input(input_key) # 4 +# for chunk_key in r.iter_chunks(): +# r.store_chunk(chunk_key) # 5 +# r.finalize() # 6 +# +### -class Recipe: - datasets = [] - keys = [] - def filenames_for_chunk(**chunk_keys): - raise NotImplementedError +# 1) Initialize the Recipe object +# 2) Point the Recipe at its Target +# 
3) Initialize the recipe. +# Check if the target exists; if not, create it. +# + + +#@dataclass +class DatasetRecipe(): + target: Target + + +class FSSpecInputOpenerMixin: def open_file(self, fname): # todo: caching return fsspec.open(fname) - def open_chunk(self, filenames): + +class XarrayChunkOpenerMixin: + + def open_chunk(self, filenames, load=True): files_to_open = [self.open_file(f) for f in filenames] ds_chunk = xr.open_mfdataset(filenames, **self.open_chunk_kwargs) + if load: + ds_chunk.load() return ds_chunk - def store_chunk(self, store_target, **chunk_keys): - filenames = self.filenames_for_chunk(**chunk_keys) + +class ZarrWriterMixin: + + + def store_chunk(self, chunk_key): + filenames = self.filenames_for_chunk(chunk_key) ds_chunk = self.open_chunk(filenames) - ds_chunk.to_zarr(store_target, mode='a') + target_mapper = self.target.get_mapper() + write_region = self.get_write_region(chunk_key) + ds_chunk.to_zarr(target_mapper, region=write_region, **target_kwargs) + + + def open_target(self): + target_mapper = self.target.get_mapper() + return xr.open_zarr(target_mapper) + + + + + +class FileSequenceRecipe(DatasetRecipe): + + + def __init__(self, file_urls, sequence_dim, files_per_chunk=1, nitems_per_file=1, chunksize_within_file=None): + self.file_urls = file_urls + + if chunksize_within_file: + assert file_per_chunk is None + assert nitems_per_file >= chunksize_within_file + + self.files_per_chunk = files_per_chunk + self.nitems_per_file = nitems_per_file + self.sequence_dim = sequence_dim + + # mapping between chunks and file names + self._chunks_files = {k: v for k, v in + enumerate(chunked_iterable(file_urls, files_per_chunk))} + + + def filenames_for_chunk(self, chunk_key): + return self._chunks_files[chunk_key] + + + def nitems_for_chunk(self, chunk_key): + return self.nitems_per_file * len(self.filenames_for_chunk(chunk_key)) + + + def region_for_chunk(self, chunk_key): + # return a dict suitable to pass to xr.to_zarr(region=...) 
+ # specifies where in the overall array to put this chunk's data + stride = self.nitems_per_file * self.files_per_chunk + start = chunk_key * stride + return { + self.sequence_dim: + slice(start, start + self.nitems_for_chunk(chunk_key)) + } + + + def sequence_dim(self): + # tells the total size of dataset along the sequence dimension + return { + self.sequence_dim: + sum([self.nitems_for_chunk(k) for k in self.iter_chunks()]) + } + + + def sequence_chunks(self): + # chunking + return {self.sequence_dim: self.files_per_chunk * self.nitems_per_file} + + + def iter_chunks(self): + for k in self._chunks_files: + yield k + + def prepare(self): + + target_store = self.get_store_target() + + try: + ds = self.open_target(target_store) + + except IOError: + first_chunk_key = next(self.iter_chunks()) + ds = self.open_chunk(first_chunk_key).chunk() + ds.to_zarr(path, compute=False, consolidated=False) + + encoding = {v: ds[v].encoding for v in ds} + + # now resize the sequence dim at the zarr level + sequence_axes = {v: ds[v].get_axis_num(self.sequence_dim) + for v in ds + if self.sequence_dim in ds[v].dims} + N = self.sequence_dim() + + zgroup = zarr.open_group(target_store) + + for v, axis in sequence_axes.items(): + arr = zgroup[v] + shape = list(arr.shape) + shape[axis] = N + arr.resize(shape) + + +class StandardSequentialRecipe( + FileSequenceRecipe, + FSSpecInputOpenerMixin, + XarrayChunkOpenerMixin, + ZarrWriterMixin, + ): + pass diff --git a/pangeo_forge/recipe/target.py b/pangeo_forge/recipe/target.py new file mode 100644 index 00000000..b7274d94 --- /dev/null +++ b/pangeo_forge/recipe/target.py @@ -0,0 +1,22 @@ +from dataclasses import dataclass +import fsspec + + +@dataclass +class Target: + """Representation of a storage target for Pangeo Forge. + Attributes + ---------- + url : FileSystemSpec.AbtractFileSystem + The filesystem we are writing to. Should be instantiated outside this + class. + path : str + The path where the target data will be saved. 
+ """ + + fs: fsspec.AbstractFileSystem + path: str + + def get_mapper(self): + # don't want to use this because we want to use a fancier Zarr FSStore + return self.fs.get_mapper(self.path) diff --git a/pangeo_forge/utils.py b/pangeo_forge/utils.py index f4b76625..c49b3014 100644 --- a/pangeo_forge/utils.py +++ b/pangeo_forge/utils.py @@ -1,8 +1,4 @@ import itertools -from typing import Any, List, Tuple - -from prefect import task - # https://alexwlchan.net/2018/12/iterating-in-fixed-size-chunks/ def chunked_iterable(iterable, size): @@ -12,17 +8,3 @@ def chunked_iterable(iterable, size): if not chunk: break yield chunk - - -@task -def chunk(sources: List[Any], size: int) -> List[Tuple[Any, ...]]: - """ - Prefect task to chunk a list of sources into batches. - - Examples - -------- - >>> import pangeo_forge.utils - >>> pangeo_forge.utils.chunk.run([1, 2, 3, 4, 5], size=2) - [(1, 2), (3, 4), (5,)] - """ - return list(chunked_iterable(sources, size)) diff --git a/requirements.txt b/requirements.txt index c6a2743f..dbf84c5f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,6 @@ setuptools click dask distributed -prefect xarray zarr fsspec[http] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/fixtures.py b/tests/fixtures.py new file mode 100644 index 00000000..0e54d899 --- /dev/null +++ b/tests/fixtures.py @@ -0,0 +1,88 @@ +import subprocess +import time + +import fsspec +import numpy as np +import pandas as pd +import pytest +import xarray as xr + + + +# where to run the http server +_PORT = "8080" +_ADDRESS = "127.0.0.1" + + +@pytest.fixture(scope="session") +def daily_xarray_dataset(): + """Return a synthetic random xarray dataset.""" + np.random.seed(1) + nt, ny, nx = 10, 18, 36 + time = pd.date_range(start="2010-01-01", periods=nt, freq="D") + lon = (np.arange(nx) + 0.5) * 360 / nx + lon_attrs = {"units": "degrees_east", "long_name": "longitude"} + lat = (np.arange(ny) + 0.5) * 180 / 
ny + lat_attrs = {"units": "degrees_north", "long_name": "latitude"} + foo = np.random.rand(nt, ny, nx) + foo_attrs = {"long_name": "Fantastic Foo"} + bar = np.random.rand(nt, ny, nx) + bar_attrs = {"long_name": "Beautiful Bar"} + dims = ("time", "lat", "lon") + ds = xr.Dataset( + {"foo": (dims, foo, foo_attrs), "bar": (dims, bar, bar_attrs)}, + coords={ + "time": ("time", time), + "lat": ("lat", lat, lat_attrs), + "lon": ("lon", lon, lon_attrs), + }, + attrs={"conventions": "CF 1.6"}, + ) + return ds + + +@pytest.fixture(scope="session", params=["D", "2D"]) +def netcdf_local_paths(daily_xarray_dataset, tmpdir_factory, request): + """Return a list of paths pointing to netcdf files.""" + tmp_path = tmpdir_factory.mktemp("netcdf_data") + gb = daily_xarray_dataset.resample(time=request.param) + _, datasets = zip(*gb) + fnames = [f"{n:03d}.nc" for n in range(len(datasets))] + paths = [tmp_path.join(fname) for fname in fnames] + print(len(paths)) + xr.save_mfdataset(datasets, [str(path) for path in paths]) + return paths + + +@pytest.fixture(scope="session") +def netcdf_http_server(netcdf_local_paths): + first_path = netcdf_local_paths[0] + # assume that all files are in the same directory + basedir = first_path.dirpath() + print(basedir) + fnames = [path.basename for path in netcdf_local_paths] + + # this feels very hacky + command_list = ["python", "-m", "http.server", _PORT, "--bind", _ADDRESS] + p = subprocess.Popen(command_list, cwd=basedir) + url = f"http://{_ADDRESS}:{_PORT}" + time.sleep(0.1) # let the server start up + yield url, fnames + p.kill() + + +# tests that our fixtures work + + +def test_fixture_local_files(daily_xarray_dataset, netcdf_local_paths): + paths = [str(path) for path in netcdf_local_paths] + ds = xr.open_mfdataset(paths, combine="nested", concat_dim="time") + assert ds.identical(daily_xarray_dataset) + + +def test_fixture_http_files(daily_xarray_dataset, netcdf_http_server): + url, paths = netcdf_http_server + urls = ["/".join([url, 
str(path)]) for path in paths] + open_files = [fsspec.open(url).open() for url in urls] + ds = xr.open_mfdataset(open_files, combine="nested", concat_dim="time") + assert ds.identical(daily_xarray_dataset) diff --git a/tests/test_recipe.py b/tests/test_recipe.py new file mode 100644 index 00000000..7ca003a6 --- /dev/null +++ b/tests/test_recipe.py @@ -0,0 +1,48 @@ +import pytest + +from pangeo_forge.recipe import recipe + +from .fixtures import daily_xarray_dataset, netcdf_local_paths + +dummy_fnames = ["a.nc", "b.nc", "c.nc"] +@pytest.mark.parametrize( + "file_urls, files_per_chunk, expected_keys, expected_filenames", + [ + (dummy_fnames, 1, [0, 1, 2], [("a.nc",), ("b.nc",), ("c.nc",)]), + (dummy_fnames, 2, [0, 1], [("a.nc", "b.nc",), ("c.nc",)]) + ] +) +def test_file_sequence_recipe(file_urls, files_per_chunk, expected_keys, expected_filenames): + + r = recipe.FileSequenceRecipe( + file_urls=file_urls, + sequence_dim="time", + files_per_chunk=files_per_chunk + ) + + chunk_keys = list(r.iter_chunks()) + assert chunk_keys == expected_keys + + for k, expected in zip(r.iter_chunks(), expected_filenames): + fnames = r.filenames_for_chunk(k) + assert fnames == expected + + +def test_full_recipe(daily_xarray_dataset, netcdf_local_paths, tmp_dir): + + r = StandardSequentialRecipe( + file_urls=netcdf_local_paths, + sequence_dim='time', + ) + + r.set_target(tmp_dir) + # manual execution of recipe + r.prepare() + for input_key in r.iter_inputs(): + r.cache_input(input_key) + for chunk_key in r.iter_chunks(): + r.store_chunk(chunk_key) + r.finalize() + + ds_target = xr.open_dataset(tmp_dir) + assert ds_target.identical(daily_xarray_dataset) diff --git a/tests/test_utils.py b/tests/test_utils.py index 1b4b8d74..03e6be7f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,17 @@ +import pytest + import pangeo_forge.utils -def test_chunk(): - result = pangeo_forge.utils.chunk.run([1, 2, 3], 2) - assert result == [(1, 2), (3,)] +@pytest.mark.parametrize( + 
"iterable, size, expected", + [ + ([1, 2, 3], 1, [(1,), (2,), (3,)]), + ([1, 2, 3], 2, [(1, 2), (3,)]), + ([1, 2, 3], 3, [(1, 2, 3,)]), + ([1, 2, 3], 4, [(1, 2, 3,)]) + ] +) +def test_chunked_iterable(iterable, size, expected): + actual = list(pangeo_forge.utils.chunked_iterable(iterable, size)) + assert actual == expected From f980e8f8cff0f01b72ad4db1cd0ae36214b02c22 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sat, 28 Nov 2020 22:08:05 -0500 Subject: [PATCH 03/34] made target fixture --- pangeo_forge/recipe/recipe.py | 34 +++++++++++++++++++--------------- tests/fixtures.py | 19 ++++++++++++++++++- tests/test_recipe.py | 3 ++- 3 files changed, 39 insertions(+), 17 deletions(-) diff --git a/pangeo_forge/recipe/recipe.py b/pangeo_forge/recipe/recipe.py index 3cc86180..38c67d95 100644 --- a/pangeo_forge/recipe/recipe.py +++ b/pangeo_forge/recipe/recipe.py @@ -2,11 +2,16 @@ A Pangeo Forge Recipe """ +from dataclasses import dataclass + import xarray as xr import fsspec from ..utils import chunked_iterable from .target import Target +from typing import Optional, Iterable + + ### How to manually execute a recipe: ### # # r = PangeoForgeTarget() @@ -27,10 +32,15 @@ # 2) Point the Recipe at its Target # 3) Initialize the recipe. # Check if the target exists; if not, create it. -# +# 4) cache the inputs to proximate storage (OPTIONAL) +# Some recipes won't need this (e.g. cloud to cloud) +# If so, iter_inputs is just an empty iterator +# 5) Load each chunk from the inputs and store it in the target +# Might be coming from the cache or might be read directly. 
+# 6) -#@dataclass +@dataclass class DatasetRecipe(): target: Target @@ -71,23 +81,17 @@ def open_target(self): +@dataclass class FileSequenceRecipe(DatasetRecipe): + file_urls: Iterable[str] + sequence_dim: str + files_per_chunk: int = 1 + nitems_per_file: int = 1 - def __init__(self, file_urls, sequence_dim, files_per_chunk=1, nitems_per_file=1, chunksize_within_file=None): - self.file_urls = file_urls - - if chunksize_within_file: - assert file_per_chunk is None - assert nitems_per_file >= chunksize_within_file - - self.files_per_chunk = files_per_chunk - self.nitems_per_file = nitems_per_file - self.sequence_dim = sequence_dim - - # mapping between chunks and file names + def __post_init__(self): self._chunks_files = {k: v for k, v in - enumerate(chunked_iterable(file_urls, files_per_chunk))} + enumerate(chunked_iterable(self.file_urls, self.files_per_chunk))} def filenames_for_chunk(self, chunk_key): diff --git a/tests/fixtures.py b/tests/fixtures.py index 0e54d899..7f0f1d92 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -7,7 +7,7 @@ import pytest import xarray as xr - +from pangeo_forge.recipe.target import Target # where to run the http server _PORT = "8080" @@ -71,6 +71,14 @@ def netcdf_http_server(netcdf_local_paths): p.kill() +@pytest.fixture() +def tmp_target(tmpdir_factory): + import fsspec + fs = fsspec.get_filesystem_class("file")() + path = str(tmpdir_factory.mktemp("target")) + return Target(fs, path) + + # tests that our fixtures work @@ -86,3 +94,12 @@ def test_fixture_http_files(daily_xarray_dataset, netcdf_http_server): open_files = [fsspec.open(url).open() for url in urls] ds = xr.open_mfdataset(open_files, combine="nested", concat_dim="time") assert ds.identical(daily_xarray_dataset) + + +def test_target(tmp_target): + mapper = tmp_target.get_mapper() + mybytes = b'bar' + mapper['foo'] = b'bar' + with open(tmp_target.path + '/foo') as f: + res = f.read() + assert res == 'bar' diff --git a/tests/test_recipe.py b/tests/test_recipe.py 
index 7ca003a6..b797d73f 100644 --- a/tests/test_recipe.py +++ b/tests/test_recipe.py @@ -17,7 +17,8 @@ def test_file_sequence_recipe(file_urls, files_per_chunk, expected_keys, expecte r = recipe.FileSequenceRecipe( file_urls=file_urls, sequence_dim="time", - files_per_chunk=files_per_chunk + files_per_chunk=files_per_chunk, + target= ) chunk_keys = list(r.iter_chunks()) From 0f311415f0802121ec2a85c66c015bcfcc7df99f Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sat, 28 Nov 2020 22:09:25 -0500 Subject: [PATCH 04/34] made target fixture --- tests/test_recipe.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_recipe.py b/tests/test_recipe.py index b797d73f..11a0f41d 100644 --- a/tests/test_recipe.py +++ b/tests/test_recipe.py @@ -2,7 +2,7 @@ from pangeo_forge.recipe import recipe -from .fixtures import daily_xarray_dataset, netcdf_local_paths +from .fixtures import daily_xarray_dataset, netcdf_local_paths, tmp_target dummy_fnames = ["a.nc", "b.nc", "c.nc"] @pytest.mark.parametrize( @@ -12,13 +12,13 @@ (dummy_fnames, 2, [0, 1], [("a.nc", "b.nc",), ("c.nc",)]) ] ) -def test_file_sequence_recipe(file_urls, files_per_chunk, expected_keys, expected_filenames): +def test_file_sequence_recipe(file_urls, files_per_chunk, expected_keys, expected_filenames, tmp_target): r = recipe.FileSequenceRecipe( file_urls=file_urls, sequence_dim="time", files_per_chunk=files_per_chunk, - target= + target=tmp_target ) chunk_keys = list(r.iter_chunks()) @@ -29,7 +29,7 @@ def test_file_sequence_recipe(file_urls, files_per_chunk, expected_keys, expecte assert fnames == expected -def test_full_recipe(daily_xarray_dataset, netcdf_local_paths, tmp_dir): +def test_full_recipe(daily_xarray_dataset, netcdf_local_paths, tmp_target): r = StandardSequentialRecipe( file_urls=netcdf_local_paths, From 15e240d2d19967534510ae9a3448965954f86b8f Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sun, 29 Nov 2020 22:24:59 -0500 Subject: [PATCH 05/34] spaghetti at 
this point --- pangeo_forge/recipe/recipe.py | 120 ++++++++++++++++++++++++++------- pangeo_forge/recipe/storage.py | 59 ++++++++++++++++ pangeo_forge/recipe/target.py | 22 ------ 3 files changed, 154 insertions(+), 47 deletions(-) create mode 100644 pangeo_forge/recipe/storage.py delete mode 100644 pangeo_forge/recipe/target.py diff --git a/pangeo_forge/recipe/recipe.py b/pangeo_forge/recipe/recipe.py index 38c67d95..142fe849 100644 --- a/pangeo_forge/recipe/recipe.py +++ b/pangeo_forge/recipe/recipe.py @@ -9,14 +9,13 @@ from ..utils import chunked_iterable from .target import Target -from typing import Optional, Iterable +from typing import Optional, Iterable, Callable ### How to manually execute a recipe: ### # -# r = PangeoForgeTarget() -# r = MyRecipe(**opts) # 1 -# r.set_target(tmp_dir) # 2 +# t = PangeoForgeTarget() +# r = MyRecipe(target=t, **opts) # 1 # # manual execution of recipe # r.prepare() # 3 # for input_key in r.iter_inputs(): @@ -44,39 +43,110 @@ class DatasetRecipe(): target: Target + def prepare(self): + pass + + def iter_inputs(self): + return [] + + def cache_input(self, input_key): + raise NotImplementedError + + def iter_chunks(self): + raise NotImplementedError + + def store_chunk(self, chunk_key): + raise NotImplementedError + + def finalize(self): + raise NotImplementedError + + +@dataclass +class FSSpecFileOpenerMixin: + input_open_kwargs: dict = {} + + @contextmanager + def input_opener(self, fname): + with fsspec.open(fname, **input_open_kwargs) as f: + yield f + -class FSSpecInputOpenerMixin: +@dataclass +class InputCachingMixin(FSSpecFileOpenerMixin): + input_cache: InputCache + require_cache: bool -> False + + # returns a function that takes one input, the input_key + # this allows us to parallelize these operations + @property + def cache_input(self): - def open_file(self, fname): - # todo: caching - return fsspec.open(fname) + def cache_func(fname: str) -> None: + with super().input_opener(fname, mode="rb") as source: + with 
self.input_cache.open(fname, mode="wb") as target: + target.write(source.read()) + return cache_func -class XarrayChunkOpenerMixin: + @contextmanager + def input_opener(self, fname): + if self.input_cache.exists(fname): + with self.input_chache.open(fname, mode='rb') as f: + yield f + elif self.require_cache: + raise IOError("Input can only be opened from cache. Call .cache_input first.") + else: + # This will bypass the cache. May be slow. + with super().input_opener(fname, mode="rb") as f: + yield f - def open_chunk(self, filenames, load=True): - files_to_open = [self.open_file(f) for f in filenames] - ds_chunk = xr.open_mfdataset(filenames, **self.open_chunk_kwargs) - if load: - ds_chunk.load() - return ds_chunk + + +@dataclass +class XarrayInputOpener: + xarray_open_kwargs: Any + + def open_input(self, fname): + with self.input_opener(fname) as f: + ds = xr.open_dataset(f, **self.xarray_open_kwargs) + return ds + + +@dataclass +class XarrayConcatChunkOpener(XarrayInputOpener): + xarray_concat_kwargs: Any + + def open_chunk(self, chunk_key): + inputs = self.inputs_for_chunk(chunk_key) + dsets = [self.open_input(i) for i in inputs] + combined = xr.concat(dsets, **xarray_concat_kwargs) + # TODO: maybe do some chunking here? 
+ return combined class ZarrWriterMixin: + @property + def store_chunk(self) -> Callable: + + def _store_chunk(chunk_key): + ds_chunk = self.open_chunk(chunk_key) + target_mapper = self.target.get_mapper() + write_region = self.get_write_region(chunk_key) + ds_chunk.to_zarr(target_mapper, region=write_region) + + return _store_chunk - def store_chunk(self, chunk_key): - filenames = self.filenames_for_chunk(chunk_key) - ds_chunk = self.open_chunk(filenames) - target_mapper = self.target.get_mapper() - write_region = self.get_write_region(chunk_key) - ds_chunk.to_zarr(target_mapper, region=write_region, **target_kwargs) + @property + def open_target(self) -> Callable: - def open_target(self): - target_mapper = self.target.get_mapper() - return xr.open_zarr(target_mapper) + def _open_target(): + target_mapper = self.target.get_mapper() + return xr.open_zarr(target_mapper) + return _open_target @@ -94,7 +164,7 @@ def __post_init__(self): enumerate(chunked_iterable(self.file_urls, self.files_per_chunk))} - def filenames_for_chunk(self, chunk_key): + def inputs_for_chunk(self, chunk_key): return self._chunks_files[chunk_key] diff --git a/pangeo_forge/recipe/storage.py b/pangeo_forge/recipe/storage.py new file mode 100644 index 00000000..474597a9 --- /dev/null +++ b/pangeo_forge/recipe/storage.py @@ -0,0 +1,59 @@ +from dataclasses import dataclass +from contextlib import closing, contextmanager +import fsspec + + +@dataclass +class Target: + """Representation of a storage target for Pangeo Forge. + Attributes + ---------- + fs : FileSystemSpec.AbtractFileSystem + The filesystem we are writing to. Should be instantiated outside this + class. + path : str + The path where the target data will be saved. 
+ """ + + fs: fsspec.AbstractFileSystem + path: str + + def get_mapper(self): + # don't want to use this because we want to use a fancier Zarr FSStore + return self.fs.get_mapper(self.path) + + +def _hash_path(path: str) -> str: + return str(hash(path)) + + +@dataclass +class InputCache: + """Representation of an intermediate storage location where remote files + Can be cached locally. + + Attributes + ---------- + fs : FileSystemSpec.AbtractFileSystem + The filesystem we are writing to. Should be instantiated outside this + class. + prefix : str + A path prepended to all paths. + """ + + fs: fsspec.AbstractFileSystem + prefix: str = "" + + def _full_path(self, path): + return os.path.join(self.prefix, _hash_path(path)) + + def exists(self, path): + return self.fs.exists(self._full_path(path)) + + def rm(self, path): + self.fs.rm(self._full_path(path)) + + @contextmanager + def open(self, path): + with self.fs.open(self._full_path(path)) as f: + yield f diff --git a/pangeo_forge/recipe/target.py b/pangeo_forge/recipe/target.py deleted file mode 100644 index b7274d94..00000000 --- a/pangeo_forge/recipe/target.py +++ /dev/null @@ -1,22 +0,0 @@ -from dataclasses import dataclass -import fsspec - - -@dataclass -class Target: - """Representation of a storage target for Pangeo Forge. - Attributes - ---------- - url : FileSystemSpec.AbtractFileSystem - The filesystem we are writing to. Should be instantiated outside this - class. - path : str - The path where the target data will be saved. 
- """ - - fs: fsspec.AbstractFileSystem - path: str - - def get_mapper(self): - # don't want to use this because we want to use a fancier Zarr FSStore - return self.fs.get_mapper(self.path) From 18a895a65e1037da863e4ecbaf7b7c5ea6370b04 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 1 Dec 2020 15:34:05 -0500 Subject: [PATCH 06/34] working storage classes --- pangeo_forge/recipe/storage.py | 9 +++++++-- tests/fixtures.py | 20 +++++++++++++++++++- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/pangeo_forge/recipe/storage.py b/pangeo_forge/recipe/storage.py index 474597a9..04e949a0 100644 --- a/pangeo_forge/recipe/storage.py +++ b/pangeo_forge/recipe/storage.py @@ -1,6 +1,7 @@ from dataclasses import dataclass from contextlib import closing, contextmanager import fsspec +import os @dataclass @@ -54,6 +55,10 @@ def rm(self, path): self.fs.rm(self._full_path(path)) @contextmanager - def open(self, path): - with self.fs.open(self._full_path(path)) as f: + def open(self, path, **kwargs): + with self.fs.open(self._full_path(path), **kwargs) as f: yield f + + def __post_init__(self): + if not self.fs.isdir(self.prefix): + self.fs.mkdir(self.prefix) diff --git a/tests/fixtures.py b/tests/fixtures.py index 7f0f1d92..28b2ead5 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -7,7 +7,7 @@ import pytest import xarray as xr -from pangeo_forge.recipe.target import Target +from pangeo_forge.recipe.storage import Target, InputCache # where to run the http server _PORT = "8080" @@ -79,6 +79,13 @@ def tmp_target(tmpdir_factory): return Target(fs, path) +@pytest.fixture() +def tmp_cache(tmpdir_factory): + path = str(tmpdir_factory.mktemp("cache")) + fs = fsspec.get_filesystem_class("file")() + cache = InputCache(fs, prefix='cache') + return cache + # tests that our fixtures work @@ -103,3 +110,14 @@ def test_target(tmp_target): with open(tmp_target.path + '/foo') as f: res = f.read() assert res == 'bar' + + +def test_cache(tmp_cache): + assert not 
tmp_cache.exists('foo') + with tmp_cache.open('foo', mode='w') as f: + f.write('bar') + assert tmp_cache.exists('foo') + with tmp_cache.open('foo', mode='r') as f: + assert f.read() == 'bar' + tmp_cache.rm('foo') + assert not tmp_cache.exists('foo') From 301206b4bc01565f7f2b1dc5e02d7477c4cc2840 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 1 Dec 2020 16:39:13 -0500 Subject: [PATCH 07/34] recipe working pretty well --- pangeo_forge/recipe/recipe.py | 255 +++++++++++++++++++++++++--------- 1 file changed, 186 insertions(+), 69 deletions(-) diff --git a/pangeo_forge/recipe/recipe.py b/pangeo_forge/recipe/recipe.py index 142fe849..25702df1 100644 --- a/pangeo_forge/recipe/recipe.py +++ b/pangeo_forge/recipe/recipe.py @@ -2,15 +2,21 @@ A Pangeo Forge Recipe """ -from dataclasses import dataclass +import logging +from dataclasses import dataclass, field +from contextlib import contextmanager +from typing import Optional, Iterable, Callable, Any +import numpy as np import xarray as xr import fsspec -from ..utils import chunked_iterable -from .target import Target +import zarr -from typing import Optional, Iterable, Callable +from ..utils import chunked_iterable +from .storage import Target, InputCache +#logger = logging.getLogger(__name__) +logger = logging.getLogger("recipe") ### How to manually execute a recipe: ### # @@ -42,48 +48,76 @@ @dataclass class DatasetRecipe(): target: Target + chunk_preprocess_funcs: Iterable[Callable] + @property def prepare(self): - pass + def _prepare(): + pass + return _prepare def iter_inputs(self): return [] - def cache_input(self, input_key): - raise NotImplementedError + # need to figure out what's going on with these methods and inheritance + # @property + # def cache_input(self): + # def _cache_input(input_key): + # raise NotImplementedError + # return _cache_input + + # this only gets run when iterating, not preparing! 
+ def preprocess_chunk(self, ds): + for f in self.chunk_preprocess_funcs: + ds = f(ds) + return ds def iter_chunks(self): raise NotImplementedError - def store_chunk(self, chunk_key): - raise NotImplementedError + # @property + # def store_chunk(self): + # def _store_chunk(chunk_key): + # raise NotImplementedError + # return _store_chunk - def finalize(self): - raise NotImplementedError + # @property + # def finalize(self): + # + # def _finalize(): + # pass + # return _finalize +# Notes about dataclasses: +# - https://www.python.org/dev/peps/pep-0557/#inheritance +# - https://stackoverflow.com/questions/51575931/class-inheritance-in-python-3-7-dataclasses +# This means that, for now, I can't get default arguments to work. @dataclass class FSSpecFileOpenerMixin: - input_open_kwargs: dict = {} + #input_open_kwargs: dict #= field(default_factory=dict) @contextmanager - def input_opener(self, fname): - with fsspec.open(fname, **input_open_kwargs) as f: + def input_opener(self, fname, **kwargs): + logger.info(f"Opening input '{fname}'") + with fsspec.open(fname, **kwargs) as f: yield f @dataclass class InputCachingMixin(FSSpecFileOpenerMixin): + require_cache: bool #= False input_cache: InputCache - require_cache: bool -> False # returns a function that takes one input, the input_key # this allows us to parallelize these operations @property def cache_input(self): + opener = super().input_opener def cache_func(fname: str) -> None: - with super().input_opener(fname, mode="rb") as source: + logger.info(f"Caching input '{fname}'") + with opener(fname, mode="rb") as source: with self.input_cache.open(fname, mode="wb") as target: target.write(source.read()) @@ -92,11 +126,14 @@ def cache_func(fname: str) -> None: @contextmanager def input_opener(self, fname): if self.input_cache.exists(fname): - with self.input_chache.open(fname, mode='rb') as f: + logger.info(f"Input '{fname}' found in cache") + with self.input_cache.open(fname, mode='rb') as f: yield f elif 
self.require_cache: + # this creates an error on prepare because nothing is cached raise IOError("Input can only be opened from cache. Call .cache_input first.") else: + logger.info(f"Input '{fname}' not found in cache. Opening directly.") # This will bypass the cache. May be slow. with super().input_opener(fname, mode="rb") as f: yield f @@ -105,26 +142,39 @@ def input_opener(self, fname): @dataclass class XarrayInputOpener: - xarray_open_kwargs: Any + xarray_open_kwargs: dict def open_input(self, fname): - with self.input_opener(fname) as f: - ds = xr.open_dataset(f, **self.xarray_open_kwargs) + with self.input_opener(fname) as f: + logger.info(f"Opening input with Xarray '{fname}'") + ds = xr.open_dataset(f, **self.xarray_open_kwargs).load() + # do we always want to remove encoding? I think so. + ds = _fix_scalar_attr_encoding(ds) + logger.debug(f"{ds}") return ds @dataclass class XarrayConcatChunkOpener(XarrayInputOpener): - xarray_concat_kwargs: Any + xarray_concat_kwargs: dict def open_chunk(self, chunk_key): + logger.info(f"Concatenating inputs for chunk '{chunk_key}'") inputs = self.inputs_for_chunk(chunk_key) dsets = [self.open_input(i) for i in inputs] - combined = xr.concat(dsets, **xarray_concat_kwargs) + # CONCAT DELETES ENCODING!!! + ds = xr.concat(dsets, self.sequence_dim, **self.xarray_concat_kwargs) + logger.debug(f"{ds}") + + # do we really want to just delete all encoding? + #for v in ds.variables: + # ds[v].encoding = {} + # TODO: maybe do some chunking here? 
- return combined + return ds +@dataclass class ZarrWriterMixin: @property @@ -132,50 +182,99 @@ def store_chunk(self) -> Callable: def _store_chunk(chunk_key): ds_chunk = self.open_chunk(chunk_key) + ds_chunk = self.preprocess_chunk(ds_chunk) target_mapper = self.target.get_mapper() - write_region = self.get_write_region(chunk_key) + write_region = self.region_for_chunk(chunk_key) + logger.info(f"Storing chunk '{chunk_key}' to Zarr region {write_region}") ds_chunk.to_zarr(target_mapper, region=write_region) return _store_chunk - @property - def open_target(self) -> Callable: + def open_target(self): + target_mapper = self.target.get_mapper() + return xr.open_zarr(target_mapper) - def _open_target(): - target_mapper = self.target.get_mapper() - return xr.open_zarr(target_mapper) - return _open_target + def initialize_target(self, ds, **expand_dims): + logger.info(f"Creating a new dataset in target") + target_mapper = self.target.get_mapper() + ds.to_zarr(target_mapper, mode='w', compute=False) + + + def expand_target_dim(self, dim, dimsize): + target_mapper = self.target.get_mapper() + zgroup = zarr.open_group(target_mapper) + + ds = self.open_target() + sequence_axes = {v: ds[v].get_axis_num(dim) + for v in ds.variables + if dim in ds[v].dims} + + for v, axis in sequence_axes.items(): + arr = zgroup[v] + shape = list(arr.shape) + shape[axis] = dimsize + arr.resize(shape) @dataclass -class FileSequenceRecipe(DatasetRecipe): - file_urls: Iterable[str] +class ZarrConsolidatorMixin(): + consolidate_zarr: bool #= True + + @property + def finalize(self): + + def _finalize(): + if self.consolidate_zarr: + logger.info(f"Consolidating Zarr metadata") + target_mapper = self.target.get_mapper() + zarr.consolidate_metadata(target_mapper) + + return _finalize + + +@dataclass +class SequenceRecipe(DatasetRecipe): + input_urls: Iterable[str] sequence_dim: str - files_per_chunk: int = 1 - nitems_per_file: int = 1 + inputs_per_chunk: int = 1 + nitems_per_input: int = 1 def 
__post_init__(self): - self._chunks_files = {k: v for k, v in - enumerate(chunked_iterable(self.file_urls, self.files_per_chunk))} + self._chunks_inputs = {k: v for k, v in + enumerate(chunked_iterable(self.input_urls, self.inputs_per_chunk))} + + def drop_vars(ds): + # writing a region means that all the variables MUST have sequence_dim + to_drop = [v for v in ds.variables + if self.sequence_dim not in ds[v].dims] + return ds.drop(to_drop) + + self.chunk_preprocess_funcs.append(drop_vars) def inputs_for_chunk(self, chunk_key): - return self._chunks_files[chunk_key] + return self._chunks_inputs[chunk_key] + + + def iter_inputs(self): + for chunk_key in self.iter_chunks(): + for input in self.inputs_for_chunk(chunk_key): + yield input def nitems_for_chunk(self, chunk_key): - return self.nitems_per_file * len(self.filenames_for_chunk(chunk_key)) + return self.nitems_per_input * len(self.inputs_for_chunk(chunk_key)) def region_for_chunk(self, chunk_key): # return a dict suitable to pass to xr.to_zarr(region=...) 
# specifies where in the overall array to put this chunk's data - stride = self.nitems_per_file * self.files_per_chunk + stride = self.nitems_per_input * self.inputs_per_chunk start = chunk_key * stride return { self.sequence_dim: @@ -183,56 +282,74 @@ def region_for_chunk(self, chunk_key): } - def sequence_dim(self): + def sequence_len(self): # tells the total size of dataset along the sequence dimension - return { - self.sequence_dim: - sum([self.nitems_for_chunk(k) for k in self.iter_chunks()]) - } + return sum([self.nitems_for_chunk(k) for k in self.iter_chunks()]) def sequence_chunks(self): # chunking - return {self.sequence_dim: self.files_per_chunk * self.nitems_per_file} + return {self.sequence_dim: self.inputs_per_chunk * self.nitems_per_input} def iter_chunks(self): - for k in self._chunks_files: + for k in self._chunks_inputs: yield k + @property def prepare(self): - target_store = self.get_store_target() - - try: - ds = self.open_target(target_store) + def _prepare(): - except IOError: - first_chunk_key = next(self.iter_chunks()) - ds = self.open_chunk(first_chunk_key).chunk() - ds.to_zarr(path, compute=False, consolidated=False) + target_store = self.target.get_mapper() - encoding = {v: ds[v].encoding for v in ds} + try: + ds = self.open_target() + logger.info(f"Found an existing dataset in target") + logger.debug(f"{ds}") + except (IOError, zarr.errors.GroupNotFoundError): + first_chunk_key = next(self.iter_chunks()) + ds = self.open_chunk(first_chunk_key).chunk() - # now resize the sequence dim at the zarr level - sequence_axes = {v: ds[v].get_axis_num(self.sequence_dim) - for v in ds - if self.sequence_dim in ds[v].dims} - N = self.sequence_dim() + # make sure the concat dim has a valid fill_value to avoid + # overruns when writing chunk + #ds[self.sequence_dim].encoding = {'_FillValue': 0} + # actually not necessary if we use decode_times=False + self.initialize_target(ds) - zgroup = zarr.open_group(target_store) + 
self.expand_target_dim(self.sequence_dim, self.sequence_len()) - for v, axis in sequence_axes.items(): - arr = zgroup[v] - shape = list(arr.shape) - shape[axis] = N - arr.resize(shape) + return _prepare +@dataclass class StandardSequentialRecipe( - FileSequenceRecipe, - FSSpecInputOpenerMixin, - XarrayChunkOpenerMixin, + SequenceRecipe, + InputCachingMixin, + XarrayConcatChunkOpener, ZarrWriterMixin, + ZarrConsolidatorMixin ): pass + + +# helper utilities + +# only needed because of +# https://github.com/pydata/xarray/issues/4631 +def _fix_scalar_attr_encoding(ds): + + def _fixed_attrs(d): + fixed = {} + for k, v in d.items(): + if isinstance(v, np.ndarray) and len(v) == 1: + fixed[k] = v[0] + return fixed + + ds = ds.copy() + ds.attrs.update(_fixed_attrs(ds.attrs)) + ds.encoding.update(_fixed_attrs(ds.encoding)) + for v in ds.variables: + ds[v].attrs.update(_fixed_attrs(ds[v].attrs)) + ds[v].encoding.update(_fixed_attrs(ds[v].encoding)) + return ds From a11fdc35923f1f7815cfe40ee9b1342db765447c Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 1 Dec 2020 21:21:32 -0500 Subject: [PATCH 08/34] recipe tests pass --- pangeo_forge/recipe/recipe.py | 6 ++--- tests/fixtures.py | 2 ++ tests/test_recipe.py | 44 +++++++++++++++++++++++------------ 3 files changed, 34 insertions(+), 18 deletions(-) diff --git a/pangeo_forge/recipe/recipe.py b/pangeo_forge/recipe/recipe.py index 25702df1..2c828500 100644 --- a/pangeo_forge/recipe/recipe.py +++ b/pangeo_forge/recipe/recipe.py @@ -175,7 +175,7 @@ def open_chunk(self, chunk_key): @dataclass -class ZarrWriterMixin: +class ZarrXarrayWriterMixin: @property def store_chunk(self) -> Callable: @@ -313,7 +313,7 @@ def _prepare(): # make sure the concat dim has a valid fill_value to avoid # overruns when writing chunk - #ds[self.sequence_dim].encoding = {'_FillValue': 0} + ds[self.sequence_dim].encoding = {'_FillValue': -1} # actually not necessary if we use decode_times=False self.initialize_target(ds) @@ -327,7 +327,7 @@ class 
StandardSequentialRecipe( SequenceRecipe, InputCachingMixin, XarrayConcatChunkOpener, - ZarrWriterMixin, + ZarrXarrayWriterMixin, ZarrConsolidatorMixin ): pass diff --git a/tests/fixtures.py b/tests/fixtures.py index 28b2ead5..4ac47d9f 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -45,6 +45,8 @@ def daily_xarray_dataset(): def netcdf_local_paths(daily_xarray_dataset, tmpdir_factory, request): """Return a list of paths pointing to netcdf files.""" tmp_path = tmpdir_factory.mktemp("netcdf_data") + items_per_file = {"D": 1, "2D": 2} + daily_xarray_dataset.attrs['items_per_file'] = items_per_file[request.param] gb = daily_xarray_dataset.resample(time=request.param) _, datasets = zip(*gb) fnames = [f"{n:03d}.nc" for n in range(len(datasets))] diff --git a/tests/test_recipe.py b/tests/test_recipe.py index 11a0f41d..7b266473 100644 --- a/tests/test_recipe.py +++ b/tests/test_recipe.py @@ -1,8 +1,10 @@ import pytest +import xarray as xr +import zarr from pangeo_forge.recipe import recipe -from .fixtures import daily_xarray_dataset, netcdf_local_paths, tmp_target +from .fixtures import daily_xarray_dataset, netcdf_local_paths, tmp_target, tmp_cache dummy_fnames = ["a.nc", "b.nc", "c.nc"] @pytest.mark.parametrize( @@ -12,32 +14,43 @@ (dummy_fnames, 2, [0, 1], [("a.nc", "b.nc",), ("c.nc",)]) ] ) -def test_file_sequence_recipe(file_urls, files_per_chunk, expected_keys, expected_filenames, tmp_target): +def test_sequence_recipe(file_urls, files_per_chunk, expected_keys, expected_filenames, tmp_target): - r = recipe.FileSequenceRecipe( - file_urls=file_urls, + r = recipe.SequenceRecipe( + input_urls=file_urls, sequence_dim="time", - files_per_chunk=files_per_chunk, - target=tmp_target + inputs_per_chunk=files_per_chunk, + target=tmp_target, + chunk_preprocess_funcs=[], ) + assert r.sequence_len() == len(file_urls) + chunk_keys = list(r.iter_chunks()) assert chunk_keys == expected_keys for k, expected in zip(r.iter_chunks(), expected_filenames): - fnames = 
r.filenames_for_chunk(k) + fnames = r.inputs_for_chunk(k) assert fnames == expected -def test_full_recipe(daily_xarray_dataset, netcdf_local_paths, tmp_target): +def test_full_recipe(daily_xarray_dataset, netcdf_local_paths, tmp_target, tmp_cache): - r = StandardSequentialRecipe( - file_urls=netcdf_local_paths, - sequence_dim='time', + r = recipe.StandardSequentialRecipe( + consolidate_zarr=True, + xarray_open_kwargs={}, + xarray_concat_kwargs={}, + require_cache=False, + input_cache=tmp_cache, + target=tmp_target, + chunk_preprocess_funcs=[], + input_urls=netcdf_local_paths, + sequence_dim="time", + inputs_per_chunk=1, + nitems_per_input=daily_xarray_dataset.attrs['items_per_file'] ) - r.set_target(tmp_dir) - # manual execution of recipe + # this is the cannonical way to manually execute a recipe r.prepare() for input_key in r.iter_inputs(): r.cache_input(input_key) @@ -45,5 +58,6 @@ def test_full_recipe(daily_xarray_dataset, netcdf_local_paths, tmp_target): r.store_chunk(chunk_key) r.finalize() - ds_target = xr.open_dataset(tmp_dir) - assert ds_target.identical(daily_xarray_dataset) + ds_target = xr.open_zarr(tmp_target.get_mapper(), consolidated=True).load() + ds_expected = daily_xarray_dataset.compute() + assert ds_target.identical(ds_expected) From b1cc65b06b3356e41749300cdc35a833fcf0b640 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 1 Dec 2020 21:24:07 -0500 Subject: [PATCH 09/34] prune old stuff --- pangeo_forge/pipelines/README.md | 8 - pangeo_forge/pipelines/__init__.py | 0 pangeo_forge/pipelines/base.py | 169 --------------------- pangeo_forge/pipelines/http_xarray_zarr.py | 35 ----- pangeo_forge/pipelines/job.yaml | 28 ---- pangeo_forge/pipelines/worker_pod.yaml | 41 ----- pangeo_forge/tasks/README.md | 8 - pangeo_forge/tasks/__init__.py | 0 pangeo_forge/tasks/http.py | 36 ----- pangeo_forge/tasks/xarray.py | 73 --------- pangeo_forge/tasks/zarr.py | 23 --- 11 files changed, 421 deletions(-) delete mode 100644 pangeo_forge/pipelines/README.md 
delete mode 100644 pangeo_forge/pipelines/__init__.py delete mode 100644 pangeo_forge/pipelines/base.py delete mode 100644 pangeo_forge/pipelines/http_xarray_zarr.py delete mode 100644 pangeo_forge/pipelines/job.yaml delete mode 100644 pangeo_forge/pipelines/worker_pod.yaml delete mode 100644 pangeo_forge/tasks/README.md delete mode 100644 pangeo_forge/tasks/__init__.py delete mode 100644 pangeo_forge/tasks/http.py delete mode 100644 pangeo_forge/tasks/xarray.py delete mode 100644 pangeo_forge/tasks/zarr.py diff --git a/pangeo_forge/pipelines/README.md b/pangeo_forge/pipelines/README.md deleted file mode 100644 index a1738027..00000000 --- a/pangeo_forge/pipelines/README.md +++ /dev/null @@ -1,8 +0,0 @@ -# `pangeo_forge.pipelines` - -A collection of Pangeo-forge _Pipelines_. - -## Details - -- An abstract Pipeline class is defined in `base.py`. -- A general file naming convention might be `{source}_{container}_{target}.py`. For example, a pipeline that downloads netCDF files from a remote http server and publishes a Zarr store would be `http_xarray_zarr.py`. The main class in each module would follow a similar convention, e.g. `HttpXarrayZarr`. diff --git a/pangeo_forge/pipelines/__init__.py b/pangeo_forge/pipelines/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pangeo_forge/pipelines/base.py b/pangeo_forge/pipelines/base.py deleted file mode 100644 index ebf5d3ea..00000000 --- a/pangeo_forge/pipelines/base.py +++ /dev/null @@ -1,169 +0,0 @@ -""" -Base class for most pipelines. - -Design ------- - -To the extent possible, we want recipe maintainers to focus on the code -needed to express the data transformations. In particular, we don't -want them to worry about things like the execution environment and how -their code is loaded into it. - -The Flow -======== - -We use prefect_ to express ETL pipelines. It provides us a few things: - -1. A Highlevel API for writing data transformations. -2. 
A robust scheduling and orchestration system for executing pipelines. - -Every pangeo-forge recipe *must* have a ``prefect.Flow`` instance at the -top-level of their pipeline module. - - -.. code-block:: python - - class Pipeline(pangeo_forge.AbstractPipeline): - @property - def flow(self) -> Flow: - with Flow( - self.name, - environment=self.environment, - storage=self.storage - ) as flow: - # Your pipeline goes here. - ... - - return flow - - pipeline = Pipeline() - flow = pipeline.flow - -The ``storage`` keyword controls how your source code is loaded into -pangeo-forge, and the ``environment`` keyword controls where it's run. -""" -import textwrap -from abc import ABC, abstractmethod -from pathlib import Path -from typing import List - -from prefect import Flow -from prefect.environments import DaskKubernetesEnvironment, Environment -from prefect.environments.storage import Storage -from prefect.environments.storage.github import GitHub - -HERE = Path(__file__).parent.absolute() - - -def finalize_flow(cls): - # Nothing to see here, move along. - # OK, fine... this is a small hack to hide Prefect's orchestration stuff - # (environments, storage) from the recipe writer. Prefect currently - # requires that the module stored on GitHub have - # 1. A `Flow` instance at the top-level of the module. - # 2. A properly set `storage` and `environment`. - # Which would require this at the bottom of every python module - # >>> pipeline = MyPipeline() - # >>> flow = pipeline.flow - # >>> flow.storage = pipeline.storage - # >>> flow.environment = pipeline.environment - # Which is ugly. To avoid that, we just override how `.` works on - # AbstractPipeline. If we're getting `.flow`, this little decorator - # takes over and attaches the storage and environment. 
- orig_getattribute = cls.__getattribute__ - - def new_getattribute(self, name): - result = orig_getattribute(self, name) - if name == "flow": - result.storage = self.storage - result.environment = self.environment - return result - - cls.__getattribute__ = new_getattribute - return cls - - -@finalize_flow -class AbstractPipeline(ABC): - name = "AbstractPipeline" - path = "recipe/pipeline.py" - - @property - @abstractmethod - def repo(self): - """The GitHub repository containing the pipeline definition.""" - - @property - @abstractmethod - def sources(self) -> List[str]: - """A list of source URLs containing the original data.""" - pass - - @property - @abstractmethod - def targets(self) -> List[str]: - """A list of target URLs where the transformed data is written.""" - pass - - @abstractmethod - def flow(self) -> Flow: - """The """ - pass - - @property - def environment(self) -> Environment: - """ - The pipeline runtime environment. - - Returns - ------- - prefect.environments.Environment - An instance of a Prefect Environment. By default - a :class:`prefect.environments.DaskKubernetesEnvironment` - is used. - """ - scheduler_spec_file = str(HERE / "job.yaml") - worker_spec_file = str(HERE / "worker_pod.yaml") - - environment = DaskKubernetesEnvironment( - min_workers=1, - max_workers=30, - scheduler_spec_file=scheduler_spec_file, - worker_spec_file=worker_spec_file, - metadata=dict(image="pangeoforge/default-image"), - ) - return environment - - @property - def storage(self) -> Storage: - """ - The pipeline storage. - - Returns - ------- - prefect.environments.storage.Storage - By default a :class:`prefect.environments.storage.github.GitHub` - environment is used with ``self.repo`` as the repository - and ``self.path`` as the path. 
- """ - return GitHub(self.repo, path=self.path) - - def _generate_run(self, source): - name = type(self).__name__ - with open(source, encoding="utf-8") as f: - source = f.read() - - return textwrap.dedent( - """\ - # file: run.py - # generated by pangeo-forge. - - {source} - - # ---------------------------------------------------------------- - pipe = {name}() - flow = pipe.flow - flow.storage = pipe.storage - flow.environment = pipe.environment - """ - ).format(source=source, name=name) diff --git a/pangeo_forge/pipelines/http_xarray_zarr.py b/pangeo_forge/pipelines/http_xarray_zarr.py deleted file mode 100644 index 4b72703a..00000000 --- a/pangeo_forge/pipelines/http_xarray_zarr.py +++ /dev/null @@ -1,35 +0,0 @@ -from prefect import Flow - -from ..tasks.http import download -from ..tasks.xarray import combine_and_write -from ..tasks.zarr import consolidate_metadata -from ..utils import chunked_iterable - - -class HttpXarrayZarrMixin: - @property - def flow(self): - - if len(self.targets) == 1: - target = self.targets[0] - else: - raise ValueError("Zarr target requires self.targets be a length one list") - - with Flow(self.name) as _flow: - - cached_sources = [download(k, self.cache_location) for k in self.sources] - - write_tasks = [] - for source_group in chunked_iterable(cached_sources, self.files_per_chunk): - write_task = combine_and_write( - source_group, target, self.append_dim, self.concat_dim, - ) - write_tasks.append(write_task) - cm = consolidate_metadata(target, write_tasks) - - # create dependencies in imperative mode - for n in range(1, len(write_tasks)): - write_tasks[n].set_upstream(write_tasks[n - 1], flow=_flow) - cm.set_upstream(write_tasks[-1], flow=_flow) - - return _flow diff --git a/pangeo_forge/pipelines/job.yaml b/pangeo_forge/pipelines/job.yaml deleted file mode 100644 index 5fa08ca8..00000000 --- a/pangeo_forge/pipelines/job.yaml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: prefect-dask-job - 
labels: - app: prefect-dask-job -spec: - template: - metadata: - labels: - app: prefect-dask-job - spec: - serviceAccountName: pangeo-forge - serviceAccount: pangeo-forge - containers: - - name: flow - imagePullPolicy: Always - command: ["/bin/sh", "-c"] - args: - [ - 'python -c "import prefect; prefect.environments.execution.load_and_run_flow()"', - ] - resources: - requests: - cpu: "100m" - limits: - cpu: "100m" - restartPolicy: Never diff --git a/pangeo_forge/pipelines/worker_pod.yaml b/pangeo_forge/pipelines/worker_pod.yaml deleted file mode 100644 index d7a16440..00000000 --- a/pangeo_forge/pipelines/worker_pod.yaml +++ /dev/null @@ -1,41 +0,0 @@ -kind: Pod -metadata: - labels: - app: prefect-dask-worker -spec: - restartPolicy: Never - serviceAccountName: pangeo-forge - serviceAccount: pangeo-forge - containers: - - image: pangeoforge/noaa-oisst-avhrr - imagePullPolicy: Always - args: [dask-worker, --no-bokeh, --death-timeout, '60'] - name: dask-worker - env: - - name: PREFECT__CLOUD__GRAPHQL - value: PREFECT__CLOUD__GRAPHQL - - name: PREFECT__CLOUD__AUTH_TOKEN - value: PREFECT__CLOUD__AUTH_TOKEN - - name: PREFECT__CONTEXT__FLOW_RUN_ID - value: PREFECT__CONTEXT__FLOW_RUN_ID - - name: PREFECT__CLOUD__USE_LOCAL_SECRETS - value: "false" - - name: PREFECT__ENGINE__FLOW_RUNNER__DEFAULT_CLASS - value: "prefect.engine.cloud.CloudFlowRunner" - - name: PREFECT__ENGINE__TASK_RUNNER__DEFAULT_CLASS - value: "prefect.engine.cloud.CloudTaskRunner" - - name: PREFECT__ENGINE__EXECUTOR__DEFAULT_CLASS - value: "prefect.engine.executors.DaskExecutor" - - name: PREFECT__LOGGING__LOG_TO_CLOUD - value: "true" - - name: PREFECT__LOGGING__LEVEL - value: "DEBUG" - - name: PREFECT__DEBUG - value: "true" - - name: PREFECT__LOGGING__EXTRA_LOGGERS - value: PREFECT__LOGGING__EXTRA_LOGGERS - resources: - requests: - cpu: "500m" - limits: - cpu: "500m" diff --git a/pangeo_forge/tasks/README.md b/pangeo_forge/tasks/README.md deleted file mode 100644 index 8a77c5e1..00000000 --- 
a/pangeo_forge/tasks/README.md +++ /dev/null @@ -1,8 +0,0 @@ -# `pangeo_forge.tasks` - -A collection of Prefect Tasks for Pangeo-Forge. - -## Details - -- This is simply a collection of Prefect Tasks. Much like https://docs.prefect.io/api/latest/#task-library or https://github.com/PrefectHQ/prefect/tree/master/src/prefect/tasks. -- This should be considered low-level public API. diff --git a/pangeo_forge/tasks/__init__.py b/pangeo_forge/tasks/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pangeo_forge/tasks/http.py b/pangeo_forge/tasks/http.py deleted file mode 100644 index 32078d0c..00000000 --- a/pangeo_forge/tasks/http.py +++ /dev/null @@ -1,36 +0,0 @@ -import os - -import fsspec -from prefect import task - - -@task -def download(source_url, cache_location): - """ - Download a remote file to a cache. - - Parameters - ---------- - source_url : str - Path or url to the source file. - cache_location : str - Path or url to the target location for the source file. - - Returns - ------- - target_url : str - Path or url in the form of `{cache_location}/hash({source_url})`. - """ - target_url = os.path.join(cache_location, str(hash(source_url))) - - # there is probably a better way to do caching! - try: - fsspec.open(target_url).open() - return target_url - except FileNotFoundError: - pass - - with fsspec.open(source_url, mode="rb") as source: - with fsspec.open(target_url, mode="wb") as target: - target.write(source.read()) - return target_url diff --git a/pangeo_forge/tasks/xarray.py b/pangeo_forge/tasks/xarray.py deleted file mode 100644 index 20f65f2c..00000000 --- a/pangeo_forge/tasks/xarray.py +++ /dev/null @@ -1,73 +0,0 @@ -from typing import List - -import fsspec -import xarray as xr -from prefect import task - - -@task -def combine_and_write( - sources: List[str], target: str, append_dim: str, concat_dim: str -) -> List[str]: - """ - Write a batch of intermediate files to a combined zarr store. 
- - Parameters - ---------- - sources : List[str] - A list of URLs pointing to the intermediate files. - target : str - The URL for the target combined store. - append_dim : str - Name of the dimension of which datasets should be appended during write. - concat_dim : str - The dimension to concatenate along. - - Returns - ------- - target : str - The URL of the written combined Zarr store (same as target). - - Examples - -------- - >>> import pangeo_forge.tasks.xarray - >>> import fsspec - >>> import xarray as xr - >>> from prefect import Flow - - >>> # Load sample data into `sources`. - >>> ds = xr.tutorial.open_dataset('rasm').load() - >>> fs = fsspec.get_filesystem_class("memory")() - >>> dsets = ds.isel(time=slice(18)), ds.isel(time=slice(18, None)) - >>> for i, dset in enumerate(dsets): - ... as_bytes = dset.to_netcdf() - ... with fs.open(f"cache/{i}.nc", "wb") as f: - ... f.write(as_bytes) - - >>> sources = [f"memory://{dset}" for dset in fs.ls("cache")] - >>> with Flow("my-flow") as flow: - ... result = pangeo_forge.tasks.xarray.combine_and_write( - ... sources, "memory://target.zarr", concat_dim="time" - ... ) - >>> result - - - We can run that outside of a flow context with ``.run()`` - >>> pangeo_forge.tasks.xarray.combine_and_write.run( - ... sources, "memory://target.zarr", concat_dim="time" - ... ) - 'memory://target.zarr' - """ - double_open_files = [fsspec.open(url).open() for url in sources] - ds = xr.open_mfdataset(double_open_files, combine="nested", concat_dim=concat_dim) - # by definition, this should be a contiguous chunk - ds = ds.chunk({append_dim: len(sources)}) - mapper = fsspec.get_mapper(target) - - if not len(mapper): - # The first write, . 
- kwargs = dict(mode="w") - else: - kwargs = dict(mode="a", append_dim=append_dim) - ds.to_zarr(mapper, **kwargs) - return target diff --git a/pangeo_forge/tasks/zarr.py b/pangeo_forge/tasks/zarr.py deleted file mode 100644 index 9fd60149..00000000 --- a/pangeo_forge/tasks/zarr.py +++ /dev/null @@ -1,23 +0,0 @@ -from typing import List, Optional - -import fsspec -import zarr -from prefect import task - - -@task -def consolidate_metadata(target, writes: Optional[List[str]] = None) -> None: - """ - Consolidate the metadata the Zarr group at `target`. - - Parameters - ---------- - target : str - The URL for the (combined) Zarr group. - writes : list of strings, optional - The URLs the combined stores were written to. This is only a - parameter to introduce a dependency in the pipeline execution graph. - The actual value isn't used. - """ - mapper = fsspec.get_mapper(target) - zarr.consolidate_metadata(mapper) From 35e0c9f59d6e7aa64aaff5e4830c1705255d9fc8 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 1 Dec 2020 21:34:11 -0500 Subject: [PATCH 10/34] big cleanup --- pangeo_forge/__init__.py | 5 - pangeo_forge/{recipe => }/recipe.py | 4 +- pangeo_forge/recipe/__init__.py | 0 pangeo_forge/recipe/pipeline.py | 0 pangeo_forge/{recipe => }/storage.py | 0 tests/fixtures.py | 2 +- tests/tasks/__init__.py | 0 tests/tasks/test_xarray.py | 36 ------ tests/test_pipeline.py | 161 --------------------------- tests/test_recipe.py | 2 +- 10 files changed, 4 insertions(+), 206 deletions(-) rename pangeo_forge/{recipe => }/recipe.py (99%) delete mode 100644 pangeo_forge/recipe/__init__.py delete mode 100644 pangeo_forge/recipe/pipeline.py rename pangeo_forge/{recipe => }/storage.py (100%) delete mode 100644 tests/tasks/__init__.py delete mode 100644 tests/tasks/test_xarray.py delete mode 100644 tests/test_pipeline.py diff --git a/pangeo_forge/__init__.py b/pangeo_forge/__init__.py index 722da05d..30bf1a56 100644 --- a/pangeo_forge/__init__.py +++ b/pangeo_forge/__init__.py @@ 
-9,8 +9,3 @@ pass del get_distribution, DistributionNotFound - -# -# __all__ = [ -# "AbstractPipeline", -# ] diff --git a/pangeo_forge/recipe/recipe.py b/pangeo_forge/recipe.py similarity index 99% rename from pangeo_forge/recipe/recipe.py rename to pangeo_forge/recipe.py index 2c828500..88a13b1b 100644 --- a/pangeo_forge/recipe/recipe.py +++ b/pangeo_forge/recipe.py @@ -12,7 +12,7 @@ import fsspec import zarr -from ..utils import chunked_iterable +from .utils import chunked_iterable from .storage import Target, InputCache #logger = logging.getLogger(__name__) @@ -252,7 +252,7 @@ def drop_vars(ds): # writing a region means that all the variables MUST have sequence_dim to_drop = [v for v in ds.variables if self.sequence_dim not in ds[v].dims] - return ds.drop(to_drop) + return ds.drop_vars(to_drop) self.chunk_preprocess_funcs.append(drop_vars) diff --git a/pangeo_forge/recipe/__init__.py b/pangeo_forge/recipe/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pangeo_forge/recipe/pipeline.py b/pangeo_forge/recipe/pipeline.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pangeo_forge/recipe/storage.py b/pangeo_forge/storage.py similarity index 100% rename from pangeo_forge/recipe/storage.py rename to pangeo_forge/storage.py diff --git a/tests/fixtures.py b/tests/fixtures.py index 4ac47d9f..1ee9427e 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -7,7 +7,7 @@ import pytest import xarray as xr -from pangeo_forge.recipe.storage import Target, InputCache +from pangeo_forge.storage import Target, InputCache # where to run the http server _PORT = "8080" diff --git a/tests/tasks/__init__.py b/tests/tasks/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/tasks/test_xarray.py b/tests/tasks/test_xarray.py deleted file mode 100644 index ae22014a..00000000 --- a/tests/tasks/test_xarray.py +++ /dev/null @@ -1,36 +0,0 @@ -import fsspec -import xarray as xr -from prefect import Flow, Task - -import 
pangeo_forge.tasks.xarray - - -def test_combine_and_write(): - ds = xr.tutorial.open_dataset("rasm").load() - dsets = ds.isel(time=slice(18)), ds.isel(time=slice(18, None)) - fs = fsspec.get_filesystem_class("memory")() - - for i, dset in enumerate(dsets): - as_bytes = dset.to_netcdf() - - with fs.open(f"cache/{i}.nc", "wb") as f: - f.write(as_bytes) - - sources = [f"memory://{dset}" for dset in fs.ls("cache")] - - # In a flow context - - target = "memory://target.zarr" - with Flow("test") as flow: - result = pangeo_forge.tasks.xarray.combine_and_write( - sources, target, concat_dim="time", append_dim="time" - ) - assert isinstance(result, Task) - flow.validate() - - result = pangeo_forge.tasks.xarray.combine_and_write.run( - sources, target, concat_dim="time", append_dim="time" - ) - assert result == target - result = xr.open_zarr(fs.get_mapper("target.zarr")) - xr.testing.assert_equal(ds, result) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py deleted file mode 100644 index abfa20d7..00000000 --- a/tests/test_pipeline.py +++ /dev/null @@ -1,161 +0,0 @@ -"""Integration testing of pipelines.""" - -import subprocess -import time - -import fsspec -import numpy as np -import pandas as pd -import pytest -import xarray as xr - -# classes tested here -from pangeo_forge.pipelines.base import AbstractPipeline -from pangeo_forge.pipelines.http_xarray_zarr import HttpXarrayZarrMixin - -# where to run the http server -_PORT = "8080" -_ADDRESS = "127.0.0.1" - - -@pytest.fixture(scope="session") -def daily_xarray_dataset(): - """Return a synthetic random xarray dataset.""" - np.random.seed(1) - nt, ny, nx = 10, 18, 36 - time = pd.date_range(start="2010-01-01", periods=nt, freq="D") - lon = (np.arange(nx) + 0.5) * 360 / nx - lon_attrs = {"units": "degrees_east", "long_name": "longitude"} - lat = (np.arange(ny) + 0.5) * 180 / ny - lat_attrs = {"units": "degrees_north", "long_name": "latitude"} - foo = np.random.rand(nt, ny, nx) - foo_attrs = {"long_name": "Fantastic 
Foo"} - bar = np.random.rand(nt, ny, nx) - bar_attrs = {"long_name": "Beautiful Bar"} - dims = ("time", "lat", "lon") - ds = xr.Dataset( - {"foo": (dims, foo, foo_attrs), "bar": (dims, bar, bar_attrs)}, - coords={ - "time": ("time", time), - "lat": ("lat", lat, lat_attrs), - "lon": ("lon", lon, lon_attrs), - }, - attrs={"conventions": "CF 1.6"}, - ) - return ds - - -@pytest.fixture(scope="session", params=["D", "2D"]) -def netcdf_local_paths(daily_xarray_dataset, tmpdir_factory, request): - """Return a list of paths pointing to netcdf files.""" - tmp_path = tmpdir_factory.mktemp("netcdf_data") - gb = daily_xarray_dataset.resample(time=request.param) - _, datasets = zip(*gb) - fnames = [f"{n:03d}.nc" for n in range(len(datasets))] - paths = [tmp_path.join(fname) for fname in fnames] - print(len(paths)) - xr.save_mfdataset(datasets, [str(path) for path in paths]) - return paths - - -@pytest.fixture(scope="session") -def netcdf_http_server(netcdf_local_paths): - first_path = netcdf_local_paths[0] - # assume that all files are in the same directory - basedir = first_path.dirpath() - print(basedir) - fnames = [path.basename for path in netcdf_local_paths] - - # this feels very hacky - command_list = ["python", "-m", "http.server", _PORT, "--bind", _ADDRESS] - p = subprocess.Popen(command_list, cwd=basedir) - url = f"http://{_ADDRESS}:{_PORT}" - time.sleep(0.1) # let the server start up - yield url, fnames - p.kill() - - -# tests that our fixtures work - - -def test_fixture_local_files(daily_xarray_dataset, netcdf_local_paths): - paths = [str(path) for path in netcdf_local_paths] - ds = xr.open_mfdataset(paths, combine="nested", concat_dim="time") - assert ds.identical(daily_xarray_dataset) - - -def test_fixture_http_files(daily_xarray_dataset, netcdf_http_server): - url, paths = netcdf_http_server - urls = ["/".join([url, str(path)]) for path in paths] - open_files = [fsspec.open(url).open() for url in urls] - ds = xr.open_mfdataset(open_files, combine="nested", 
concat_dim="time") - assert ds.identical(daily_xarray_dataset) - - -# a pipeline to load that data - - -class MyPipeline(HttpXarrayZarrMixin, AbstractPipeline): - repo = "pangeo-forge/pangeo-forge" - - def __init__( - self, - name, - cache_path, - target_path, - concat_dim, - append_dim, - files_per_chunk, - url_base, - nfiles, - ): - self.name = name - self.cache_location = f"{cache_path}/{name}-cache/" - self.target_location = f"{target_path}/{name}.zarr" - self.append_dim = append_dim - self.concat_dim = concat_dim - self.files_per_chunk = files_per_chunk - - # needed to build up sources - self._url_base = url_base - self._nfiles = nfiles - - @property - def sources(self): - keys = range(self._nfiles) - source_url_pattern = self._url_base + "/{n:03d}.nc" - source_urls = [source_url_pattern.format(n=key) for key in keys] - return source_urls - - @property - def targets(self): - return [self.target_location] - - -# a basic pipeline test -def test_pipeline(daily_xarray_dataset, netcdf_http_server, tmpdir): - name = "TEST_DATASET" - cache_dir = tmpdir.mkdir("cache") - target_dir = tmpdir.mkdir("target") - concat_dim = "time" - append_dim = "time" - files_per_chunk = 5 - - url_base, paths = netcdf_http_server - nfiles = len(paths) - - pipeline = MyPipeline( - name, cache_dir, target_dir, concat_dim, append_dim, files_per_chunk, url_base, nfiles - ) - pipeline.flow.run() - - ds_test = xr.open_zarr(pipeline.targets[0]) - assert ds_test.identical(daily_xarray_dataset) - - -def test_storage_environment_set(): - pipeline = MyPipeline( - "name", "cache", "target", "concat", "append", "files_per_chunk", "url_base", "nfiles" - ) - assert pipeline.storage - assert pipeline.environment diff --git a/tests/test_recipe.py b/tests/test_recipe.py index 7b266473..b91ea2c5 100644 --- a/tests/test_recipe.py +++ b/tests/test_recipe.py @@ -2,7 +2,7 @@ import xarray as xr import zarr -from pangeo_forge.recipe import recipe +from pangeo_forge import recipe from .fixtures import 
daily_xarray_dataset, netcdf_local_paths, tmp_target, tmp_cache From 885634450a2d04c8e6a5ae5ea498ed995d530f24 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 1 Dec 2020 21:36:34 -0500 Subject: [PATCH 11/34] lint and fix tests --- .github/workflows/main.yaml | 2 +- pangeo_forge/__init__.py | 2 +- pangeo_forge/recipe.py | 82 ++++++++++++++----------------------- pangeo_forge/storage.py | 5 ++- pangeo_forge/utils.py | 1 + 5 files changed, 37 insertions(+), 55 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index f77811c3..ef75c257 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -29,7 +29,7 @@ jobs: path: ~/conda_pkgs_dir key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ hashFiles('ci/py${{ matrix.python-version }}.yml') }} - name: setup miniconda - uses: goanpeca/setup-miniconda@v1 + uses: conda-incubator/setup-miniconda@v2 with: activate-environment: pangeo-forge environment-file: ci/py${{ matrix.python-version }}.yml diff --git a/pangeo_forge/__init__.py b/pangeo_forge/__init__.py index 30bf1a56..7e3d3772 100644 --- a/pangeo_forge/__init__.py +++ b/pangeo_forge/__init__.py @@ -1,6 +1,6 @@ from pkg_resources import DistributionNotFound, get_distribution -#from pangeo_forge.pipelines import AbstractPipeline +# from pangeo_forge.pipelines import AbstractPipeline try: __version__ = get_distribution(__name__).version diff --git a/pangeo_forge/recipe.py b/pangeo_forge/recipe.py index 88a13b1b..d265be8e 100644 --- a/pangeo_forge/recipe.py +++ b/pangeo_forge/recipe.py @@ -3,19 +3,19 @@ """ import logging -from dataclasses import dataclass, field from contextlib import contextmanager -from typing import Optional, Iterable, Callable, Any +from dataclasses import dataclass, field +from typing import Any, Callable, Iterable, Optional +import fsspec import numpy as np import xarray as xr -import fsspec import zarr +from .storage import InputCache, Target from .utils import chunked_iterable -from 
.storage import Target, InputCache -#logger = logging.getLogger(__name__) +# logger = logging.getLogger(__name__) logger = logging.getLogger("recipe") ### How to manually execute a recipe: ### @@ -46,7 +46,7 @@ @dataclass -class DatasetRecipe(): +class DatasetRecipe: target: Target chunk_preprocess_funcs: Iterable[Callable] @@ -54,6 +54,7 @@ class DatasetRecipe(): def prepare(self): def _prepare(): pass + return _prepare def iter_inputs(self): @@ -69,7 +70,7 @@ def iter_inputs(self): # this only gets run when iterating, not preparing! def preprocess_chunk(self, ds): for f in self.chunk_preprocess_funcs: - ds = f(ds) + ds = f(ds) return ds def iter_chunks(self): @@ -88,14 +89,16 @@ def iter_chunks(self): # pass # return _finalize + # Notes about dataclasses: # - https://www.python.org/dev/peps/pep-0557/#inheritance # - https://stackoverflow.com/questions/51575931/class-inheritance-in-python-3-7-dataclasses # This means that, for now, I can't get default arguments to work. + @dataclass class FSSpecFileOpenerMixin: - #input_open_kwargs: dict #= field(default_factory=dict) + # input_open_kwargs: dict #= field(default_factory=dict) @contextmanager def input_opener(self, fname, **kwargs): @@ -106,7 +109,7 @@ def input_opener(self, fname, **kwargs): @dataclass class InputCachingMixin(FSSpecFileOpenerMixin): - require_cache: bool #= False + require_cache: bool # = False input_cache: InputCache # returns a function that takes one input, the input_key @@ -115,6 +118,7 @@ class InputCachingMixin(FSSpecFileOpenerMixin): def cache_input(self): opener = super().input_opener + def cache_func(fname: str) -> None: logger.info(f"Caching input '{fname}'") with opener(fname, mode="rb") as source: @@ -127,7 +131,7 @@ def cache_func(fname: str) -> None: def input_opener(self, fname): if self.input_cache.exists(fname): logger.info(f"Input '{fname}' found in cache") - with self.input_cache.open(fname, mode='rb') as f: + with self.input_cache.open(fname, mode="rb") as f: yield f elif 
self.require_cache: # this creates an error on prepare because nothing is cached @@ -139,7 +143,6 @@ def input_opener(self, fname): yield f - @dataclass class XarrayInputOpener: xarray_open_kwargs: dict @@ -167,7 +170,7 @@ def open_chunk(self, chunk_key): logger.debug(f"{ds}") # do we really want to just delete all encoding? - #for v in ds.variables: + # for v in ds.variables: # ds[v].encoding = {} # TODO: maybe do some chunking here? @@ -176,10 +179,8 @@ def open_chunk(self, chunk_key): @dataclass class ZarrXarrayWriterMixin: - @property def store_chunk(self) -> Callable: - def _store_chunk(chunk_key): ds_chunk = self.open_chunk(chunk_key) ds_chunk = self.preprocess_chunk(ds_chunk) @@ -190,26 +191,21 @@ def _store_chunk(chunk_key): return _store_chunk - def open_target(self): target_mapper = self.target.get_mapper() return xr.open_zarr(target_mapper) - def initialize_target(self, ds, **expand_dims): logger.info(f"Creating a new dataset in target") target_mapper = self.target.get_mapper() - ds.to_zarr(target_mapper, mode='w', compute=False) - + ds.to_zarr(target_mapper, mode="w", compute=False) def expand_target_dim(self, dim, dimsize): target_mapper = self.target.get_mapper() zgroup = zarr.open_group(target_mapper) ds = self.open_target() - sequence_axes = {v: ds[v].get_axis_num(dim) - for v in ds.variables - if dim in ds[v].dims} + sequence_axes = {v: ds[v].get_axis_num(dim) for v in ds.variables if dim in ds[v].dims} for v, axis in sequence_axes.items(): arr = zgroup[v] @@ -218,15 +214,12 @@ def expand_target_dim(self, dim, dimsize): arr.resize(shape) - - @dataclass -class ZarrConsolidatorMixin(): - consolidate_zarr: bool #= True +class ZarrConsolidatorMixin: + consolidate_zarr: bool # = True @property def finalize(self): - def _finalize(): if self.consolidate_zarr: logger.info(f"Consolidating Zarr metadata") @@ -243,62 +236,50 @@ class SequenceRecipe(DatasetRecipe): inputs_per_chunk: int = 1 nitems_per_input: int = 1 - def __post_init__(self): - 
self._chunks_inputs = {k: v for k, v in - enumerate(chunked_iterable(self.input_urls, self.inputs_per_chunk))} + self._chunks_inputs = { + k: v for k, v in enumerate(chunked_iterable(self.input_urls, self.inputs_per_chunk)) + } def drop_vars(ds): # writing a region means that all the variables MUST have sequence_dim - to_drop = [v for v in ds.variables - if self.sequence_dim not in ds[v].dims] + to_drop = [v for v in ds.variables if self.sequence_dim not in ds[v].dims] return ds.drop_vars(to_drop) self.chunk_preprocess_funcs.append(drop_vars) - def inputs_for_chunk(self, chunk_key): return self._chunks_inputs[chunk_key] - def iter_inputs(self): for chunk_key in self.iter_chunks(): for input in self.inputs_for_chunk(chunk_key): yield input - def nitems_for_chunk(self, chunk_key): return self.nitems_per_input * len(self.inputs_for_chunk(chunk_key)) - def region_for_chunk(self, chunk_key): # return a dict suitable to pass to xr.to_zarr(region=...) # specifies where in the overall array to put this chunk's data stride = self.nitems_per_input * self.inputs_per_chunk start = chunk_key * stride - return { - self.sequence_dim: - slice(start, start + self.nitems_for_chunk(chunk_key)) - } - + return {self.sequence_dim: slice(start, start + self.nitems_for_chunk(chunk_key))} def sequence_len(self): # tells the total size of dataset along the sequence dimension return sum([self.nitems_for_chunk(k) for k in self.iter_chunks()]) - def sequence_chunks(self): # chunking return {self.sequence_dim: self.inputs_per_chunk * self.nitems_per_input} - def iter_chunks(self): for k in self._chunks_inputs: yield k @property def prepare(self): - def _prepare(): target_store = self.target.get_mapper() @@ -313,7 +294,7 @@ def _prepare(): # make sure the concat dim has a valid fill_value to avoid # overruns when writing chunk - ds[self.sequence_dim].encoding = {'_FillValue': -1} + ds[self.sequence_dim].encoding = {"_FillValue": -1} # actually not necessary if we use decode_times=False 
self.initialize_target(ds) @@ -324,12 +305,12 @@ def _prepare(): @dataclass class StandardSequentialRecipe( - SequenceRecipe, - InputCachingMixin, - XarrayConcatChunkOpener, - ZarrXarrayWriterMixin, - ZarrConsolidatorMixin - ): + SequenceRecipe, + InputCachingMixin, + XarrayConcatChunkOpener, + ZarrXarrayWriterMixin, + ZarrConsolidatorMixin, +): pass @@ -338,7 +319,6 @@ class StandardSequentialRecipe( # only needed because of # https://github.com/pydata/xarray/issues/4631 def _fix_scalar_attr_encoding(ds): - def _fixed_attrs(d): fixed = {} for k, v in d.items(): diff --git a/pangeo_forge/storage.py b/pangeo_forge/storage.py index 04e949a0..b3777097 100644 --- a/pangeo_forge/storage.py +++ b/pangeo_forge/storage.py @@ -1,7 +1,8 @@ -from dataclasses import dataclass +import os from contextlib import closing, contextmanager +from dataclasses import dataclass + import fsspec -import os @dataclass diff --git a/pangeo_forge/utils.py b/pangeo_forge/utils.py index c49b3014..c865e521 100644 --- a/pangeo_forge/utils.py +++ b/pangeo_forge/utils.py @@ -1,5 +1,6 @@ import itertools + # https://alexwlchan.net/2018/12/iterating-in-fixed-size-chunks/ def chunked_iterable(iterable, size): it = iter(iterable) From b3a42ed950716775a391d4030396a3dc2a327786 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 1 Dec 2020 21:45:22 -0500 Subject: [PATCH 12/34] update requirements --- ci/py3.7.yml | 3 ++- ci/py3.8.yml | 3 ++- requirements.txt | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/ci/py3.7.yml b/ci/py3.7.yml index 8d5f0961..c310ff4c 100644 --- a/ci/py3.7.yml +++ b/ci/py3.7.yml @@ -25,4 +25,5 @@ dependencies: - scipy - setuptools - toolz - - zarr + - xarray>=0.16.2 + - zarr>=2.6.0 diff --git a/ci/py3.8.yml b/ci/py3.8.yml index ada7aae7..739ce1ef 100644 --- a/ci/py3.8.yml +++ b/ci/py3.8.yml @@ -25,4 +25,5 @@ dependencies: - scipy - setuptools - toolz - - zarr + - xarray>=0.16.2 + - zarr>=2.6.0 diff --git a/requirements.txt b/requirements.txt index 
dbf84c5f..11a36cbe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,6 @@ setuptools click dask distributed -xarray -zarr +xarray >= 0.16.2 +zarr >= 2.6.0 fsspec[http] From ec95d9d7bb23606ecc9f6f5bb90568286fca096d Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 1 Dec 2020 21:51:07 -0500 Subject: [PATCH 13/34] more linting --- tests/fixtures.py | 32 +++++++++++++++++--------------- tests/test_recipe.py | 21 +++++++++++++++++---- tests/test_utils.py | 26 +++++++++++++++++++++++--- 3 files changed, 57 insertions(+), 22 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index 1ee9427e..5ac6738d 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -7,7 +7,7 @@ import pytest import xarray as xr -from pangeo_forge.storage import Target, InputCache +from pangeo_forge.storage import InputCache, Target # where to run the http server _PORT = "8080" @@ -46,7 +46,7 @@ def netcdf_local_paths(daily_xarray_dataset, tmpdir_factory, request): """Return a list of paths pointing to netcdf files.""" tmp_path = tmpdir_factory.mktemp("netcdf_data") items_per_file = {"D": 1, "2D": 2} - daily_xarray_dataset.attrs['items_per_file'] = items_per_file[request.param] + daily_xarray_dataset.attrs["items_per_file"] = items_per_file[request.param] gb = daily_xarray_dataset.resample(time=request.param) _, datasets = zip(*gb) fnames = [f"{n:03d}.nc" for n in range(len(datasets))] @@ -76,6 +76,7 @@ def netcdf_http_server(netcdf_local_paths): @pytest.fixture() def tmp_target(tmpdir_factory): import fsspec + fs = fsspec.get_filesystem_class("file")() path = str(tmpdir_factory.mktemp("target")) return Target(fs, path) @@ -85,9 +86,10 @@ def tmp_target(tmpdir_factory): def tmp_cache(tmpdir_factory): path = str(tmpdir_factory.mktemp("cache")) fs = fsspec.get_filesystem_class("file")() - cache = InputCache(fs, prefix='cache') + cache = InputCache(fs, prefix="cache") return cache + # tests that our fixtures work @@ -107,19 +109,19 @@ def 
test_fixture_http_files(daily_xarray_dataset, netcdf_http_server): def test_target(tmp_target): mapper = tmp_target.get_mapper() - mybytes = b'bar' - mapper['foo'] = b'bar' - with open(tmp_target.path + '/foo') as f: + mybytes = b"bar" + mapper["foo"] = b"bar" + with open(tmp_target.path + "/foo") as f: res = f.read() - assert res == 'bar' + assert res == "bar" def test_cache(tmp_cache): - assert not tmp_cache.exists('foo') - with tmp_cache.open('foo', mode='w') as f: - f.write('bar') - assert tmp_cache.exists('foo') - with tmp_cache.open('foo', mode='r') as f: - assert f.read() == 'bar' - tmp_cache.rm('foo') - assert not tmp_cache.exists('foo') + assert not tmp_cache.exists("foo") + with tmp_cache.open("foo", mode="w") as f: + f.write("bar") + assert tmp_cache.exists("foo") + with tmp_cache.open("foo", mode="r") as f: + assert f.read() == "bar" + tmp_cache.rm("foo") + assert not tmp_cache.exists("foo") diff --git a/tests/test_recipe.py b/tests/test_recipe.py index b91ea2c5..d55b4710 100644 --- a/tests/test_recipe.py +++ b/tests/test_recipe.py @@ -4,15 +4,28 @@ from pangeo_forge import recipe -from .fixtures import daily_xarray_dataset, netcdf_local_paths, tmp_target, tmp_cache +from .fixtures import daily_xarray_dataset, netcdf_local_paths, tmp_cache, tmp_target dummy_fnames = ["a.nc", "b.nc", "c.nc"] + + @pytest.mark.parametrize( "file_urls, files_per_chunk, expected_keys, expected_filenames", [ (dummy_fnames, 1, [0, 1, 2], [("a.nc",), ("b.nc",), ("c.nc",)]), - (dummy_fnames, 2, [0, 1], [("a.nc", "b.nc",), ("c.nc",)]) - ] + ( + dummy_fnames, + 2, + [0, 1], + [ + ( + "a.nc", + "b.nc", + ), + ("c.nc",), + ], + ), + ], ) def test_sequence_recipe(file_urls, files_per_chunk, expected_keys, expected_filenames, tmp_target): @@ -47,7 +60,7 @@ def test_full_recipe(daily_xarray_dataset, netcdf_local_paths, tmp_target, tmp_c input_urls=netcdf_local_paths, sequence_dim="time", inputs_per_chunk=1, - nitems_per_input=daily_xarray_dataset.attrs['items_per_file'] + 
nitems_per_input=daily_xarray_dataset.attrs["items_per_file"], ) # this is the cannonical way to manually execute a recipe diff --git a/tests/test_utils.py b/tests/test_utils.py index 03e6be7f..0d9b8ee8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,9 +8,29 @@ [ ([1, 2, 3], 1, [(1,), (2,), (3,)]), ([1, 2, 3], 2, [(1, 2), (3,)]), - ([1, 2, 3], 3, [(1, 2, 3,)]), - ([1, 2, 3], 4, [(1, 2, 3,)]) - ] + ( + [1, 2, 3], + 3, + [ + ( + 1, + 2, + 3, + ) + ], + ), + ( + [1, 2, 3], + 4, + [ + ( + 1, + 2, + 3, + ) + ], + ), + ], ) def test_chunked_iterable(iterable, size, expected): actual = list(pangeo_forge.utils.chunked_iterable(iterable, size)) From 79ab04fcdcbda4041cdda9d39043a679014d0dfd Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 2 Dec 2020 14:00:24 -0500 Subject: [PATCH 14/34] added executors --- pangeo_forge/executors/__init__.py | 0 pangeo_forge/executors/prefect.py | 38 ++++++++++++++++++++++++++++++ pangeo_forge/executors/python.py | 29 +++++++++++++++++++++++ 3 files changed, 67 insertions(+) create mode 100644 pangeo_forge/executors/__init__.py create mode 100644 pangeo_forge/executors/prefect.py create mode 100644 pangeo_forge/executors/python.py diff --git a/pangeo_forge/executors/__init__.py b/pangeo_forge/executors/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pangeo_forge/executors/prefect.py b/pangeo_forge/executors/prefect.py new file mode 100644 index 00000000..cea2c367 --- /dev/null +++ b/pangeo_forge/executors/prefect.py @@ -0,0 +1,38 @@ +from typing import Any, Iterable, Tuple + +import prefect + +from ..recipe import DatasetRecipe + +class PrefectExecutor(): + + def prepare_plan(self, r: DatasetRecipe) -> prefect.Flow: + + # wrap our functions as prefect tasks + @prefect.task + def prepare() -> None: + r.prepare() + + @prefect.task + def cache_input(input: Any) -> None: + r.cache_input(input) + + @prefect.task + def store_chunk(input: Any) -> None: + r.store_chunk(input) + + @prefect.task + def 
finalize() -> None: + r.finalize() + + with prefect.Flow("Pangeo-Forge") as flow: + prepare() + cache_input.map(list(r.iter_inputs())) + store_chunk.map(list(r.iter_chunks())) + finalize() + + return flow + + + def execute_plan(self, plan: prefect.Flow, **kwargs): + return plan.run(**kwargs) diff --git a/pangeo_forge/executors/python.py b/pangeo_forge/executors/python.py new file mode 100644 index 00000000..4be76073 --- /dev/null +++ b/pangeo_forge/executors/python.py @@ -0,0 +1,29 @@ +from functools import partial + +#from ..types import Pipeline, Stage, Task +from ..recipe import DatasetRecipe + +from typing import Callable, Iterable + +Task = Callable[[], None] + +class PythonExecutor: + + def prepare_plan(self, r: DatasetRecipe) -> Task: + tasks = [] + tasks.append(r.prepare) + for input in r.iter_inputs(): + tasks.append(partial(r.cache_input, input)) + for chunk in r.iter_chunks(): + tasks.append(partial(r.store_chunk, chunk)) + tasks.append(r.finalize) + + return partial(_execute_all, tasks) + + def execute_plan(self, plan: Task, **kwargs): + plan() + + +def _execute_all(tasks: Iterable[Task]) -> None: + for task in tasks: + task() From e6b32a98282e823a73c7f4f8b718063a158a1615 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Thu, 3 Dec 2020 20:41:35 -0500 Subject: [PATCH 15/34] linting and stuff --- pangeo_forge/executors/prefect.py | 5 ++- pangeo_forge/executors/python.py | 7 ++--- pangeo_forge/recipe.py | 50 ++++++++++++------------------ pangeo_forge/storage.py | 2 +- pangeo_forge/utils.py | 21 +++++++++++++ tests/{fixtures.py => conftest.py} | 39 +---------------------- tests/test_recipe.py | 3 -- 7 files changed, 47 insertions(+), 80 deletions(-) rename tests/{fixtures.py => conftest.py} (69%) diff --git a/pangeo_forge/executors/prefect.py b/pangeo_forge/executors/prefect.py index cea2c367..afe13b8a 100644 --- a/pangeo_forge/executors/prefect.py +++ b/pangeo_forge/executors/prefect.py @@ -1,11 +1,11 @@ -from typing import Any, Iterable, Tuple +from 
typing import Any import prefect from ..recipe import DatasetRecipe -class PrefectExecutor(): +class PrefectExecutor: def prepare_plan(self, r: DatasetRecipe) -> prefect.Flow: # wrap our functions as prefect tasks @@ -33,6 +33,5 @@ def finalize() -> None: return flow - def execute_plan(self, plan: prefect.Flow, **kwargs): return plan.run(**kwargs) diff --git a/pangeo_forge/executors/python.py b/pangeo_forge/executors/python.py index 4be76073..5e1d1c64 100644 --- a/pangeo_forge/executors/python.py +++ b/pangeo_forge/executors/python.py @@ -1,14 +1,13 @@ from functools import partial +from typing import Callable, Iterable -#from ..types import Pipeline, Stage, Task +# from ..types import Pipeline, Stage, Task from ..recipe import DatasetRecipe -from typing import Callable, Iterable - Task = Callable[[], None] -class PythonExecutor: +class PythonExecutor: def prepare_plan(self, r: DatasetRecipe) -> Task: tasks = [] tasks.append(r.prepare) diff --git a/pangeo_forge/recipe.py b/pangeo_forge/recipe.py index d265be8e..e651e834 100644 --- a/pangeo_forge/recipe.py +++ b/pangeo_forge/recipe.py @@ -4,21 +4,20 @@ import logging from contextlib import contextmanager -from dataclasses import dataclass, field -from typing import Any, Callable, Iterable, Optional +from dataclasses import dataclass +from typing import Callable, Iterable import fsspec -import numpy as np import xarray as xr import zarr from .storage import InputCache, Target -from .utils import chunked_iterable +from .utils import chunked_iterable, fix_scalar_attr_encoding # logger = logging.getLogger(__name__) logger = logging.getLogger("recipe") -### How to manually execute a recipe: ### +# How to manually execute a recipe: ### # # t = PangeoForgeTarget() # r = MyRecipe(target=t, **opts) # 1 @@ -29,8 +28,6 @@ # for chunk_key in r.iter_chunks(): # r.store_chunk(chunk_key) # 5 # r.finalize() # 6 -# -### # 1) Initialize the Recipe object @@ -152,7 +149,7 @@ def open_input(self, fname): logger.info(f"Opening input 
with Xarray '{fname}'") ds = xr.open_dataset(f, **self.xarray_open_kwargs).load() # do we always want to remove encoding? I think so. - ds = _fix_scalar_attr_encoding(ds) + ds = fix_scalar_attr_encoding(ds) logger.debug(f"{ds}") return ds @@ -196,7 +193,7 @@ def open_target(self): return xr.open_zarr(target_mapper) def initialize_target(self, ds, **expand_dims): - logger.info(f"Creating a new dataset in target") + logger.info("Creating a new dataset in target") target_mapper = self.target.get_mapper() ds.to_zarr(target_mapper, mode="w", compute=False) @@ -222,7 +219,7 @@ class ZarrConsolidatorMixin: def finalize(self): def _finalize(): if self.consolidate_zarr: - logger.info(f"Consolidating Zarr metadata") + logger.info("Consolidating Zarr metadata") target_mapper = self.target.get_mapper() zarr.consolidate_metadata(target_mapper) @@ -231,10 +228,21 @@ def _finalize(): @dataclass class SequenceRecipe(DatasetRecipe): + """There are many inputs (a.k.a. files, granules), arranged in a sequence + along the dimension `sequence_dim`. Each file may contain multiple variables. 
+ """ + input_urls: Iterable[str] + """The inputs used to generate the dataset.""" + sequence_dim: str + """The dimension name along which the inputs will be concatenated.""" + inputs_per_chunk: int = 1 + """The number of inputs to use in each chunk.""" + nitems_per_input: int = 1 + """The length of each input along the `sequence_dim` dimension.""" def __post_init__(self): self._chunks_inputs = { @@ -282,11 +290,9 @@ def iter_chunks(self): def prepare(self): def _prepare(): - target_store = self.target.get_mapper() - try: ds = self.open_target() - logger.info(f"Found an existing dataset in target") + logger.info("Found an existing dataset in target") logger.debug(f"{ds}") except (IOError, zarr.errors.GroupNotFoundError): first_chunk_key = next(self.iter_chunks()) @@ -315,21 +321,3 @@ class StandardSequentialRecipe( # helper utilities - -# only needed because of -# https://github.com/pydata/xarray/issues/4631 -def _fix_scalar_attr_encoding(ds): - def _fixed_attrs(d): - fixed = {} - for k, v in d.items(): - if isinstance(v, np.ndarray) and len(v) == 1: - fixed[k] = v[0] - return fixed - - ds = ds.copy() - ds.attrs.update(_fixed_attrs(ds.attrs)) - ds.encoding.update(_fixed_attrs(ds.encoding)) - for v in ds.variables: - ds[v].attrs.update(_fixed_attrs(ds[v].attrs)) - ds[v].encoding.update(_fixed_attrs(ds[v].encoding)) - return ds diff --git a/pangeo_forge/storage.py b/pangeo_forge/storage.py index b3777097..742bb9f2 100644 --- a/pangeo_forge/storage.py +++ b/pangeo_forge/storage.py @@ -1,5 +1,5 @@ import os -from contextlib import closing, contextmanager +from contextlib import contextmanager from dataclasses import dataclass import fsspec diff --git a/pangeo_forge/utils.py b/pangeo_forge/utils.py index c865e521..1ac84c7b 100644 --- a/pangeo_forge/utils.py +++ b/pangeo_forge/utils.py @@ -1,5 +1,7 @@ import itertools +import numpy as np + # https://alexwlchan.net/2018/12/iterating-in-fixed-size-chunks/ def chunked_iterable(iterable, size): @@ -9,3 +11,22 @@ def 
chunked_iterable(iterable, size): if not chunk: break yield chunk + + +# only needed because of +# https://github.com/pydata/xarray/issues/4631 +def fix_scalar_attr_encoding(ds): + def _fixed_attrs(d): + fixed = {} + for k, v in d.items(): + if isinstance(v, np.ndarray) and len(v) == 1: + fixed[k] = v[0] + return fixed + + ds = ds.copy() + ds.attrs.update(_fixed_attrs(ds.attrs)) + ds.encoding.update(_fixed_attrs(ds.encoding)) + for v in ds.variables: + ds[v].attrs.update(_fixed_attrs(ds[v].attrs)) + ds[v].encoding.update(_fixed_attrs(ds[v].encoding)) + return ds diff --git a/tests/fixtures.py b/tests/conftest.py similarity index 69% rename from tests/fixtures.py rename to tests/conftest.py index 5ac6738d..c2584809 100644 --- a/tests/fixtures.py +++ b/tests/conftest.py @@ -86,42 +86,5 @@ def tmp_target(tmpdir_factory): def tmp_cache(tmpdir_factory): path = str(tmpdir_factory.mktemp("cache")) fs = fsspec.get_filesystem_class("file")() - cache = InputCache(fs, prefix="cache") + cache = InputCache(fs, prefix=path) return cache - - -# tests that our fixtures work - - -def test_fixture_local_files(daily_xarray_dataset, netcdf_local_paths): - paths = [str(path) for path in netcdf_local_paths] - ds = xr.open_mfdataset(paths, combine="nested", concat_dim="time") - assert ds.identical(daily_xarray_dataset) - - -def test_fixture_http_files(daily_xarray_dataset, netcdf_http_server): - url, paths = netcdf_http_server - urls = ["/".join([url, str(path)]) for path in paths] - open_files = [fsspec.open(url).open() for url in urls] - ds = xr.open_mfdataset(open_files, combine="nested", concat_dim="time") - assert ds.identical(daily_xarray_dataset) - - -def test_target(tmp_target): - mapper = tmp_target.get_mapper() - mybytes = b"bar" - mapper["foo"] = b"bar" - with open(tmp_target.path + "/foo") as f: - res = f.read() - assert res == "bar" - - -def test_cache(tmp_cache): - assert not tmp_cache.exists("foo") - with tmp_cache.open("foo", mode="w") as f: - f.write("bar") - assert 
tmp_cache.exists("foo") - with tmp_cache.open("foo", mode="r") as f: - assert f.read() == "bar" - tmp_cache.rm("foo") - assert not tmp_cache.exists("foo") diff --git a/tests/test_recipe.py b/tests/test_recipe.py index d55b4710..28e58bc3 100644 --- a/tests/test_recipe.py +++ b/tests/test_recipe.py @@ -1,11 +1,8 @@ import pytest import xarray as xr -import zarr from pangeo_forge import recipe -from .fixtures import daily_xarray_dataset, netcdf_local_paths, tmp_cache, tmp_target - dummy_fnames = ["a.nc", "b.nc", "c.nc"] From b993dabd5f3db6fbc86826cb18a6372f120e4a20 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Thu, 3 Dec 2020 21:19:47 -0500 Subject: [PATCH 16/34] testing executors --- pangeo_forge/executors/__init__.py | 8 ++++++ pangeo_forge/recipe.py | 3 --- setup.cfg | 3 +-- tests/conftest.py | 19 ++++++++++++++ tests/test_executors.py | 14 +++++++++++ tests/test_fixtures.py | 40 ++++++++++++++++++++++++++++++ tests/test_recipe.py | 2 ++ 7 files changed, 84 insertions(+), 5 deletions(-) create mode 100644 tests/test_executors.py create mode 100644 tests/test_fixtures.py diff --git a/pangeo_forge/executors/__init__.py b/pangeo_forge/executors/__init__.py index e69de29b..e0aa6809 100644 --- a/pangeo_forge/executors/__init__.py +++ b/pangeo_forge/executors/__init__.py @@ -0,0 +1,8 @@ +""" +Executors know how to run recipes. 
+""" + +from .prefect import PrefectExecutor +from .python import PythonExecutor + +__all__ = [PythonExecutor, PrefectExecutor] diff --git a/pangeo_forge/recipe.py b/pangeo_forge/recipe.py index e651e834..d0a6a6ab 100644 --- a/pangeo_forge/recipe.py +++ b/pangeo_forge/recipe.py @@ -318,6 +318,3 @@ class StandardSequentialRecipe( ZarrConsolidatorMixin, ): pass - - -# helper utilities diff --git a/setup.cfg b/setup.cfg index 7468400a..9c7719f5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -11,5 +11,4 @@ combine_as_imports=True line_length=100 [tool:pytest] -log_cli = True -log_level = INFO +log_cli = False diff --git a/tests/conftest.py b/tests/conftest.py index c2584809..6389e837 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,6 +7,7 @@ import pytest import xarray as xr +from pangeo_forge import recipe from pangeo_forge.storage import InputCache, Target # where to run the http server @@ -88,3 +89,21 @@ def tmp_cache(tmpdir_factory): fs = fsspec.get_filesystem_class("file")() cache = InputCache(fs, prefix=path) return cache + + +@pytest.fixture +def sequential_recipe(daily_xarray_dataset, netcdf_local_paths, tmp_target, tmp_cache): + r = recipe.StandardSequentialRecipe( + consolidate_zarr=True, + xarray_open_kwargs={}, + xarray_concat_kwargs={}, + require_cache=False, + input_cache=tmp_cache, + target=tmp_target, + chunk_preprocess_funcs=[], + input_urls=netcdf_local_paths, + sequence_dim="time", + inputs_per_chunk=1, + nitems_per_input=daily_xarray_dataset.attrs["items_per_file"], + ) + return r, daily_xarray_dataset, tmp_target diff --git a/tests/test_executors.py b/tests/test_executors.py new file mode 100644 index 00000000..eb90e2a0 --- /dev/null +++ b/tests/test_executors.py @@ -0,0 +1,14 @@ +import pytest +import xarray as xr + +from pangeo_forge.executors import PrefectExecutor, PythonExecutor + + +@pytest.mark.parametrize("Executor", [PythonExecutor, PrefectExecutor]) +def test_recipe_w_executor(Executor, sequential_recipe): + rec, ds_expected, 
target = sequential_recipe + ex = Executor() + plan = ex.prepare_plan(rec) + ex.execute_plan(plan) + ds_actual = xr.open_zarr(target.get_mapper()).load() + assert ds_actual.identical(ds_expected) diff --git a/tests/test_fixtures.py b/tests/test_fixtures.py new file mode 100644 index 00000000..c867f17b --- /dev/null +++ b/tests/test_fixtures.py @@ -0,0 +1,40 @@ +import fsspec +import xarray as xr + +from pangeo_forge.utils import fix_scalar_attr_encoding + + +def test_fixture_local_files(daily_xarray_dataset, netcdf_local_paths): + paths = [str(path) for path in netcdf_local_paths] + ds = xr.open_mfdataset(paths, combine="nested", concat_dim="time") + assert ds.identical(daily_xarray_dataset) + + +def test_fixture_http_files(daily_xarray_dataset, netcdf_http_server): + url, paths = netcdf_http_server + urls = ["/".join([url, str(path)]) for path in paths] + open_files = [fsspec.open(url).open() for url in urls] + ds = xr.open_mfdataset(open_files, combine="nested", concat_dim="time").load() + ds = fix_scalar_attr_encoding(ds) + print(ds) + print(daily_xarray_dataset) + assert ds.identical(daily_xarray_dataset) + + +def test_target(tmp_target): + mapper = tmp_target.get_mapper() + mapper["foo"] = b"bar" + with open(tmp_target.path + "/foo") as f: + res = f.read() + assert res == "bar" + + +def test_cache(tmp_cache): + assert not tmp_cache.exists("foo") + with tmp_cache.open("foo", mode="w") as f: + f.write("bar") + assert tmp_cache.exists("foo") + with tmp_cache.open("foo", mode="r") as f: + assert f.read() == "bar" + tmp_cache.rm("foo") + assert not tmp_cache.exists("foo") diff --git a/tests/test_recipe.py b/tests/test_recipe.py index 28e58bc3..11675e17 100644 --- a/tests/test_recipe.py +++ b/tests/test_recipe.py @@ -46,6 +46,8 @@ def test_sequence_recipe(file_urls, files_per_chunk, expected_keys, expected_fil def test_full_recipe(daily_xarray_dataset, netcdf_local_paths, tmp_target, tmp_cache): + # the same recipe is created as a fixture in conftest.py + # I left 
it here explicitly because it makes the test easier to read. r = recipe.StandardSequentialRecipe( consolidate_zarr=True, xarray_open_kwargs={}, From 0e150cb6a5c1291799c7588f69ca209cb1b7b6e9 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 21 Dec 2020 11:45:45 -0500 Subject: [PATCH 17/34] major simplification of recipe class --- pangeo_forge/executors/prefect.py | 15 +- pangeo_forge/executors/python.py | 6 +- pangeo_forge/recipe.py | 302 ++++++++++++------------------ tests/conftest.py | 13 +- tests/test_executors.py | 4 +- tests/test_recipe.py | 14 +- 6 files changed, 140 insertions(+), 214 deletions(-) diff --git a/pangeo_forge/executors/prefect.py b/pangeo_forge/executors/prefect.py index afe13b8a..ce91017e 100644 --- a/pangeo_forge/executors/prefect.py +++ b/pangeo_forge/executors/prefect.py @@ -2,11 +2,9 @@ import prefect -from ..recipe import DatasetRecipe - class PrefectExecutor: - def prepare_plan(self, r: DatasetRecipe) -> prefect.Flow: + def prepare_plan(self, r) -> prefect.Flow: # wrap our functions as prefect tasks @prefect.task @@ -26,10 +24,13 @@ def finalize() -> None: r.finalize() with prefect.Flow("Pangeo-Forge") as flow: - prepare() - cache_input.map(list(r.iter_inputs())) - store_chunk.map(list(r.iter_chunks())) - finalize() + prep_task = prepare() + cache_task = cache_input.map(list(r.iter_inputs())) + cache_task.set_dependencies(upstream_tasks=[prep_task]) + store_task = store_chunk.map(list(r.iter_chunks())) + store_task.set_dependencies(upstream_tasks=[cache_task]) + finalize_task = finalize() + finalize_task.set_dependencies(upstream_tasks=[store_task]) return flow diff --git a/pangeo_forge/executors/python.py b/pangeo_forge/executors/python.py index 5e1d1c64..0661d153 100644 --- a/pangeo_forge/executors/python.py +++ b/pangeo_forge/executors/python.py @@ -1,14 +1,10 @@ from functools import partial from typing import Callable, Iterable -# from ..types import Pipeline, Stage, Task -from ..recipe import DatasetRecipe - Task = 
Callable[[], None] - class PythonExecutor: - def prepare_plan(self, r: DatasetRecipe) -> Task: + def prepare_plan(self, r): tasks = [] tasks.append(r.prepare) for input in r.iter_inputs(): diff --git a/pangeo_forge/recipe.py b/pangeo_forge/recipe.py index d0a6a6ab..af389317 100644 --- a/pangeo_forge/recipe.py +++ b/pangeo_forge/recipe.py @@ -4,8 +4,8 @@ import logging from contextlib import contextmanager -from dataclasses import dataclass -from typing import Callable, Iterable +from dataclasses import dataclass, field +from typing import Callable, Iterable, Optional import fsspec import xarray as xr @@ -14,8 +14,7 @@ from .storage import InputCache, Target from .utils import chunked_iterable, fix_scalar_attr_encoding -# logger = logging.getLogger(__name__) -logger = logging.getLogger("recipe") +logger = logging.getLogger(__name__) # How to manually execute a recipe: ### # @@ -41,123 +40,159 @@ # Might be coming from the cache or might be read directly. # 6) +@contextmanager +def input_opener(fname, **kwargs): + logger.info(f"Opening input '{fname}'") + with fsspec.open(fname, **kwargs) as f: + yield f + +# Notes about dataclasses: +# - https://www.python.org/dev/peps/pep-0557/#inheritance +# - https://stackoverflow.com/questions/51575931/class-inheritance-in-python-3-7-dataclasses +# This means that, for now, I can't get default arguments to work. @dataclass -class DatasetRecipe: - target: Target - chunk_preprocess_funcs: Iterable[Callable] +class NetCDFtoZarrSequentialRecipe: + """There are many inputs (a.k.a. files, granules), arranged in a sequence + along the dimension `sequence_dim`. Each file may contain multiple variables. 
+ """ + input_urls: Iterable[str] + """The inputs used to generate the dataset.""" - @property - def prepare(self): - def _prepare(): - pass + sequence_dim: str + """The dimension name along which the inputs will be concatenated.""" - return _prepare + inputs_per_chunk: int = 1 + """The number of inputs to use in each chunk.""" - def iter_inputs(self): - return [] - - # need to figure out what's going on with these methods and inheritance - # @property - # def cache_input(self): - # def _cache_input(input_key): - # raise NotImplementedError - # return _cache_input - - # this only gets run when iterating, not preparing! - def preprocess_chunk(self, ds): - for f in self.chunk_preprocess_funcs: - ds = f(ds) - return ds + nitems_per_input: int = 1 + """The length of each input along the `sequence_dim` dimension.""" - def iter_chunks(self): - raise NotImplementedError + target: Optional[Target] = None + """A location in which to put the dataset.""" - # @property - # def store_chunk(self): - # def _store_chunk(chunk_key): - # raise NotImplementedError - # return _store_chunk + input_cache: Optional[InputCache] = None + """The length of each input along the `sequence_dim` dimension.""" - # @property - # def finalize(self): - # - # def _finalize(): - # pass - # return _finalize + consolidate_zarr: bool = True + """Whether to consolidate the resulting Zarr dataset.""" + xarray_open_kwargs: dict = field(default_factory=dict) + """Extra options for opening the inputs with Xarray.""" -# Notes about dataclasses: -# - https://www.python.org/dev/peps/pep-0557/#inheritance -# - https://stackoverflow.com/questions/51575931/class-inheritance-in-python-3-7-dataclasses -# This means that, for now, I can't get default arguments to work. 
+ xarray_concat_kwargs: dict = field(default_factory=dict) + """Extra options to pass to Xarray when concatenating the inputs to form a chunk.""" + def __post_init__(self): + self._chunks_inputs = { + k: v for k, v in enumerate(chunked_iterable(self.input_urls, self.inputs_per_chunk)) + } -@dataclass -class FSSpecFileOpenerMixin: - # input_open_kwargs: dict #= field(default_factory=dict) + @property + def prepare(self) -> Callable: + """Prepare target for storing dataset.""" - @contextmanager - def input_opener(self, fname, **kwargs): - logger.info(f"Opening input '{fname}'") - with fsspec.open(fname, **kwargs) as f: - yield f + def _prepare(): + try: + ds = self.open_target() + logger.info("Found an existing dataset in target") + logger.debug(f"{ds}") + except (IOError, zarr.errors.GroupNotFoundError): + first_chunk_key = next(self.iter_chunks()) + for input_url in self.inputs_for_chunk(first_chunk_key): + self.cache_input(input_url) + ds = self.open_chunk(first_chunk_key).chunk() -@dataclass -class InputCachingMixin(FSSpecFileOpenerMixin): - require_cache: bool # = False - input_cache: InputCache + # make sure the concat dim has a valid fill_value to avoid + # overruns when writing chunk + ds[self.sequence_dim].encoding = {"_FillValue": -1} + # actually not necessary if we use decode_times=False + self.initialize_target(ds) + + self.expand_target_dim(self.sequence_dim, self.sequence_len()) + + return _prepare - # returns a function that takes one input, the input_key - # this allows us to parallelize these operations @property - def cache_input(self): + def cache_input(self) -> Callable: + """Cache the input. - opener = super().input_opener + Properties + ---------- + url : URL pointing to the input file. Must be openable by fsspec. 
+ """ def cache_func(fname: str) -> None: logger.info(f"Caching input '{fname}'") - with opener(fname, mode="rb") as source: + with input_opener(fname, mode="rb") as source: with self.input_cache.open(fname, mode="wb") as target: target.write(source.read()) return cache_func + @property + def store_chunk(self) -> Callable: + """Store a chunk in the target. + + Parameters + ---------- + chunk_key : str + The identifier for the chunk + """ + def _store_chunk(chunk_key): + ds_chunk = self.open_chunk(chunk_key) + + def drop_vars(ds): + # writing a region means that all the variables MUST have sequence_dim + to_drop = [v for v in ds.variables if self.sequence_dim not in ds[v].dims] + return ds.drop_vars(to_drop) + + ds_chunk = drop_vars(ds_chunk) + target_mapper = self.target.get_mapper() + write_region = self.region_for_chunk(chunk_key) + logger.info(f"Storing chunk '{chunk_key}' to Zarr region {write_region}") + ds_chunk.to_zarr(target_mapper, region=write_region) + + return _store_chunk + + @property + def finalize(self) -> Callable: + """Finalize writing of dataset.""" + + def _finalize(): + if self.consolidate_zarr: + logger.info("Consolidating Zarr metadata") + target_mapper = self.target.get_mapper() + zarr.consolidate_metadata(target_mapper) + + return _finalize + @contextmanager - def input_opener(self, fname): - if self.input_cache.exists(fname): + def input_opener(self, fname: str): + if self.input_cache is None: + logger.info(f"No cache. Opening input `{fname}` directly.") + # This will bypass the cache. May be slow. + with input_opener(fname, mode="rb") as f: + yield f + elif self.input_cache.exists(fname): logger.info(f"Input '{fname}' found in cache") with self.input_cache.open(fname, mode="rb") as f: yield f - elif self.require_cache: - # this creates an error on prepare because nothing is cached - raise IOError("Input can only be opened from cache. Call .cache_input first.") else: - logger.info(f"Input '{fname}' not found in cache. 
Opening directly.") - # This will bypass the cache. May be slow. - with super().input_opener(fname, mode="rb") as f: - yield f - + raise ValueError(f"Input '{fname}' has not been cached yet. " + "Call .cache_input() first.") -@dataclass -class XarrayInputOpener: - xarray_open_kwargs: dict - - def open_input(self, fname): + def open_input(self, fname: str): with self.input_opener(fname) as f: logger.info(f"Opening input with Xarray '{fname}'") - ds = xr.open_dataset(f, **self.xarray_open_kwargs).load() - # do we always want to remove encoding? I think so. + ds = xr.open_dataset(f, **self.xarray_open_kwargs) + # explicitly load into memory + ds = ds.load() ds = fix_scalar_attr_encoding(ds) logger.debug(f"{ds}") return ds - -@dataclass -class XarrayConcatChunkOpener(XarrayInputOpener): - xarray_concat_kwargs: dict - def open_chunk(self, chunk_key): logger.info(f"Concatenating inputs for chunk '{chunk_key}'") inputs = self.inputs_for_chunk(chunk_key) @@ -166,28 +201,9 @@ def open_chunk(self, chunk_key): ds = xr.concat(dsets, self.sequence_dim, **self.xarray_concat_kwargs) logger.debug(f"{ds}") - # do we really want to just delete all encoding? - # for v in ds.variables: - # ds[v].encoding = {} - # TODO: maybe do some chunking here? 
return ds - -@dataclass -class ZarrXarrayWriterMixin: - @property - def store_chunk(self) -> Callable: - def _store_chunk(chunk_key): - ds_chunk = self.open_chunk(chunk_key) - ds_chunk = self.preprocess_chunk(ds_chunk) - target_mapper = self.target.get_mapper() - write_region = self.region_for_chunk(chunk_key) - logger.info(f"Storing chunk '{chunk_key}' to Zarr region {write_region}") - ds_chunk.to_zarr(target_mapper, region=write_region) - - return _store_chunk - def open_target(self): target_mapper = self.target.get_mapper() return xr.open_zarr(target_mapper) @@ -210,52 +226,6 @@ def expand_target_dim(self, dim, dimsize): shape[axis] = dimsize arr.resize(shape) - -@dataclass -class ZarrConsolidatorMixin: - consolidate_zarr: bool # = True - - @property - def finalize(self): - def _finalize(): - if self.consolidate_zarr: - logger.info("Consolidating Zarr metadata") - target_mapper = self.target.get_mapper() - zarr.consolidate_metadata(target_mapper) - - return _finalize - - -@dataclass -class SequenceRecipe(DatasetRecipe): - """There are many inputs (a.k.a. files, granules), arranged in a sequence - along the dimension `sequence_dim`. Each file may contain multiple variables. 
- """ - - input_urls: Iterable[str] - """The inputs used to generate the dataset.""" - - sequence_dim: str - """The dimension name along which the inputs will be concatenated.""" - - inputs_per_chunk: int = 1 - """The number of inputs to use in each chunk.""" - - nitems_per_input: int = 1 - """The length of each input along the `sequence_dim` dimension.""" - - def __post_init__(self): - self._chunks_inputs = { - k: v for k, v in enumerate(chunked_iterable(self.input_urls, self.inputs_per_chunk)) - } - - def drop_vars(ds): - # writing a region means that all the variables MUST have sequence_dim - to_drop = [v for v in ds.variables if self.sequence_dim not in ds[v].dims] - return ds.drop_vars(to_drop) - - self.chunk_preprocess_funcs.append(drop_vars) - def inputs_for_chunk(self, chunk_key): return self._chunks_inputs[chunk_key] @@ -272,7 +242,8 @@ def region_for_chunk(self, chunk_key): # specifies where in the overall array to put this chunk's data stride = self.nitems_per_input * self.inputs_per_chunk start = chunk_key * stride - return {self.sequence_dim: slice(start, start + self.nitems_for_chunk(chunk_key))} + region_slice = slice(start, start + self.nitems_for_chunk(chunk_key)) + return {self.sequence_dim: region_slice} def sequence_len(self): # tells the total size of dataset along the sequence dimension @@ -285,36 +256,3 @@ def sequence_chunks(self): def iter_chunks(self): for k in self._chunks_inputs: yield k - - @property - def prepare(self): - def _prepare(): - - try: - ds = self.open_target() - logger.info("Found an existing dataset in target") - logger.debug(f"{ds}") - except (IOError, zarr.errors.GroupNotFoundError): - first_chunk_key = next(self.iter_chunks()) - ds = self.open_chunk(first_chunk_key).chunk() - - # make sure the concat dim has a valid fill_value to avoid - # overruns when writing chunk - ds[self.sequence_dim].encoding = {"_FillValue": -1} - # actually not necessary if we use decode_times=False - self.initialize_target(ds) - - 
self.expand_target_dim(self.sequence_dim, self.sequence_len()) - - return _prepare - - -@dataclass -class StandardSequentialRecipe( - SequenceRecipe, - InputCachingMixin, - XarrayConcatChunkOpener, - ZarrXarrayWriterMixin, - ZarrConsolidatorMixin, -): - pass diff --git a/tests/conftest.py b/tests/conftest.py index 6389e837..6386481a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -92,18 +92,13 @@ def tmp_cache(tmpdir_factory): @pytest.fixture -def sequential_recipe(daily_xarray_dataset, netcdf_local_paths, tmp_target, tmp_cache): - r = recipe.StandardSequentialRecipe( - consolidate_zarr=True, - xarray_open_kwargs={}, - xarray_concat_kwargs={}, - require_cache=False, - input_cache=tmp_cache, - target=tmp_target, - chunk_preprocess_funcs=[], +def netCDFtoZarr_sequential_recipe(daily_xarray_dataset, netcdf_local_paths, tmp_target, tmp_cache): + r = recipe.NetCDFtoZarrSequentialRecipe( input_urls=netcdf_local_paths, sequence_dim="time", inputs_per_chunk=1, nitems_per_input=daily_xarray_dataset.attrs["items_per_file"], + target=tmp_target, + input_cache=tmp_cache, ) return r, daily_xarray_dataset, tmp_target diff --git a/tests/test_executors.py b/tests/test_executors.py index eb90e2a0..551abaa5 100644 --- a/tests/test_executors.py +++ b/tests/test_executors.py @@ -5,8 +5,8 @@ @pytest.mark.parametrize("Executor", [PythonExecutor, PrefectExecutor]) -def test_recipe_w_executor(Executor, sequential_recipe): - rec, ds_expected, target = sequential_recipe +def test_recipe_w_executor(Executor, netCDFtoZarr_sequential_recipe): + rec, ds_expected, target = netCDFtoZarr_sequential_recipe ex = Executor() plan = ex.prepare_plan(rec) ex.execute_plan(plan) diff --git a/tests/test_recipe.py b/tests/test_recipe.py index 11675e17..c146565e 100644 --- a/tests/test_recipe.py +++ b/tests/test_recipe.py @@ -6,6 +6,7 @@ dummy_fnames = ["a.nc", "b.nc", "c.nc"] +@pytest.mark.skip(reason="Removed this class for now") @pytest.mark.parametrize( "file_urls, files_per_chunk, 
expected_keys, expected_filenames", [ @@ -44,22 +45,17 @@ def test_sequence_recipe(file_urls, files_per_chunk, expected_keys, expected_fil assert fnames == expected -def test_full_recipe(daily_xarray_dataset, netcdf_local_paths, tmp_target, tmp_cache): +def test_NetCDFtoZarrSequentialRecipe(daily_xarray_dataset, netcdf_local_paths, tmp_target, tmp_cache): # the same recipe is created as a fixture in conftest.py # I left it here explicitly because it makes the test easier to read. - r = recipe.StandardSequentialRecipe( - consolidate_zarr=True, - xarray_open_kwargs={}, - xarray_concat_kwargs={}, - require_cache=False, - input_cache=tmp_cache, - target=tmp_target, - chunk_preprocess_funcs=[], + r = recipe.NetCDFtoZarrSequentialRecipe( input_urls=netcdf_local_paths, sequence_dim="time", inputs_per_chunk=1, nitems_per_input=daily_xarray_dataset.attrs["items_per_file"], + target=tmp_target, + input_cache=tmp_cache, ) # this is the cannonical way to manually execute a recipe From 1ec63eb1513b46b6cf649711151e919d820409fc Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 21 Dec 2020 11:49:35 -0500 Subject: [PATCH 18/34] fix precommit again --- pangeo_forge/executors/python.py | 1 + pangeo_forge/recipe.py | 10 ++++++++-- tests/test_recipe.py | 17 ++++------------- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/pangeo_forge/executors/python.py b/pangeo_forge/executors/python.py index 0661d153..8e6625e8 100644 --- a/pangeo_forge/executors/python.py +++ b/pangeo_forge/executors/python.py @@ -3,6 +3,7 @@ Task = Callable[[], None] + class PythonExecutor: def prepare_plan(self, r): tasks = [] diff --git a/pangeo_forge/recipe.py b/pangeo_forge/recipe.py index af389317..2af0ebfe 100644 --- a/pangeo_forge/recipe.py +++ b/pangeo_forge/recipe.py @@ -40,22 +40,26 @@ # Might be coming from the cache or might be read directly. 
# 6) + @contextmanager def input_opener(fname, **kwargs): logger.info(f"Opening input '{fname}'") with fsspec.open(fname, **kwargs) as f: yield f + # Notes about dataclasses: # - https://www.python.org/dev/peps/pep-0557/#inheritance # - https://stackoverflow.com/questions/51575931/class-inheritance-in-python-3-7-dataclasses # This means that, for now, I can't get default arguments to work. + @dataclass class NetCDFtoZarrSequentialRecipe: """There are many inputs (a.k.a. files, granules), arranged in a sequence along the dimension `sequence_dim`. Each file may contain multiple variables. """ + input_urls: Iterable[str] """The inputs used to generate the dataset.""" @@ -140,6 +144,7 @@ def store_chunk(self) -> Callable: chunk_key : str The identifier for the chunk """ + def _store_chunk(chunk_key): ds_chunk = self.open_chunk(chunk_key) @@ -180,8 +185,9 @@ def input_opener(self, fname: str): with self.input_cache.open(fname, mode="rb") as f: yield f else: - raise ValueError(f"Input '{fname}' has not been cached yet. " - "Call .cache_input() first.") + raise ValueError( + f"Input '{fname}' has not been cached yet. " "Call .cache_input() first." 
+ ) def open_input(self, fname: str): with self.input_opener(fname) as f: diff --git a/tests/test_recipe.py b/tests/test_recipe.py index c146565e..9b4c5461 100644 --- a/tests/test_recipe.py +++ b/tests/test_recipe.py @@ -11,18 +11,7 @@ "file_urls, files_per_chunk, expected_keys, expected_filenames", [ (dummy_fnames, 1, [0, 1, 2], [("a.nc",), ("b.nc",), ("c.nc",)]), - ( - dummy_fnames, - 2, - [0, 1], - [ - ( - "a.nc", - "b.nc", - ), - ("c.nc",), - ], - ), + (dummy_fnames, 2, [0, 1], [("a.nc", "b.nc",), ("c.nc",)]), ], ) def test_sequence_recipe(file_urls, files_per_chunk, expected_keys, expected_filenames, tmp_target): @@ -45,7 +34,9 @@ def test_sequence_recipe(file_urls, files_per_chunk, expected_keys, expected_fil assert fnames == expected -def test_NetCDFtoZarrSequentialRecipe(daily_xarray_dataset, netcdf_local_paths, tmp_target, tmp_cache): +def test_NetCDFtoZarrSequentialRecipe( + daily_xarray_dataset, netcdf_local_paths, tmp_target, tmp_cache +): # the same recipe is created as a fixture in conftest.py # I left it here explicitly because it makes the test easier to read. 
From a4bf88a820f1f2444d6aa0bb898b665a0caa4666 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 21 Dec 2020 20:50:05 -0500 Subject: [PATCH 19/34] finally --- tests/test_recipe.py | 2 +- tests/test_utils.py | 24 ++---------------------- 2 files changed, 3 insertions(+), 23 deletions(-) diff --git a/tests/test_recipe.py b/tests/test_recipe.py index 9b4c5461..2a6a412d 100644 --- a/tests/test_recipe.py +++ b/tests/test_recipe.py @@ -11,7 +11,7 @@ "file_urls, files_per_chunk, expected_keys, expected_filenames", [ (dummy_fnames, 1, [0, 1, 2], [("a.nc",), ("b.nc",), ("c.nc",)]), - (dummy_fnames, 2, [0, 1], [("a.nc", "b.nc",), ("c.nc",)]), + (dummy_fnames, 2, [0, 1], [("a.nc", "b.nc",), ("c.nc",),],), # noqa: E231 ], ) def test_sequence_recipe(file_urls, files_per_chunk, expected_keys, expected_filenames, tmp_target): diff --git a/tests/test_utils.py b/tests/test_utils.py index 0d9b8ee8..d486b7af 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,28 +8,8 @@ [ ([1, 2, 3], 1, [(1,), (2,), (3,)]), ([1, 2, 3], 2, [(1, 2), (3,)]), - ( - [1, 2, 3], - 3, - [ - ( - 1, - 2, - 3, - ) - ], - ), - ( - [1, 2, 3], - 4, - [ - ( - 1, - 2, - 3, - ) - ], - ), + ([1, 2, 3], 3, [(1, 2, 3,)],), + ([1, 2, 3], 4, [(1, 2, 3,)],), ], ) def test_chunked_iterable(iterable, size, expected): From dbf6b132bbd066ff4afacd59943d391fe34aded6 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 18 Jan 2021 12:39:19 -0500 Subject: [PATCH 20/34] cleanup --- pangeo_forge/executors/__init__.py | 8 ------- pangeo_forge/executors/prefect.py | 38 ------------------------------ pangeo_forge/executors/python.py | 25 -------------------- pangeo_forge/recipe.py | 9 +++++++ setup.cfg | 2 +- tests/test_executors.py | 14 +++++++---- 6 files changed, 20 insertions(+), 76 deletions(-) delete mode 100644 pangeo_forge/executors/__init__.py delete mode 100644 pangeo_forge/executors/prefect.py delete mode 100644 pangeo_forge/executors/python.py diff --git a/pangeo_forge/executors/__init__.py 
b/pangeo_forge/executors/__init__.py deleted file mode 100644 index e0aa6809..00000000 --- a/pangeo_forge/executors/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -""" -Executors know how to run recipes. -""" - -from .prefect import PrefectExecutor -from .python import PythonExecutor - -__all__ = [PythonExecutor, PrefectExecutor] diff --git a/pangeo_forge/executors/prefect.py b/pangeo_forge/executors/prefect.py deleted file mode 100644 index ce91017e..00000000 --- a/pangeo_forge/executors/prefect.py +++ /dev/null @@ -1,38 +0,0 @@ -from typing import Any - -import prefect - - -class PrefectExecutor: - def prepare_plan(self, r) -> prefect.Flow: - - # wrap our functions as prefect tasks - @prefect.task - def prepare() -> None: - r.prepare() - - @prefect.task - def cache_input(input: Any) -> None: - r.cache_input(input) - - @prefect.task - def store_chunk(input: Any) -> None: - r.store_chunk(input) - - @prefect.task - def finalize() -> None: - r.finalize() - - with prefect.Flow("Pangeo-Forge") as flow: - prep_task = prepare() - cache_task = cache_input.map(list(r.iter_inputs())) - cache_task.set_dependencies(upstream_tasks=[prep_task]) - store_task = store_chunk.map(list(r.iter_chunks())) - store_task.set_dependencies(upstream_tasks=[cache_task]) - finalize_task = finalize() - finalize_task.set_dependencies(upstream_tasks=[store_task]) - - return flow - - def execute_plan(self, plan: prefect.Flow, **kwargs): - return plan.run(**kwargs) diff --git a/pangeo_forge/executors/python.py b/pangeo_forge/executors/python.py deleted file mode 100644 index 8e6625e8..00000000 --- a/pangeo_forge/executors/python.py +++ /dev/null @@ -1,25 +0,0 @@ -from functools import partial -from typing import Callable, Iterable - -Task = Callable[[], None] - - -class PythonExecutor: - def prepare_plan(self, r): - tasks = [] - tasks.append(r.prepare) - for input in r.iter_inputs(): - tasks.append(partial(r.cache_input, input)) - for chunk in r.iter_chunks(): - tasks.append(partial(r.store_chunk, 
chunk)) - tasks.append(r.finalize) - - return partial(_execute_all, tasks) - - def execute_plan(self, plan: Task, **kwargs): - plan() - - -def _execute_all(tasks: Iterable[Task]) -> None: - for task in tasks: - task() diff --git a/pangeo_forge/recipe.py b/pangeo_forge/recipe.py index 2af0ebfe..5c9b322a 100644 --- a/pangeo_forge/recipe.py +++ b/pangeo_forge/recipe.py @@ -10,6 +10,7 @@ import fsspec import xarray as xr import zarr +from rechunker.types import MultiStagePipeline, ParallelPipelines, Stage from .storage import InputCache, Target from .utils import chunked_iterable, fix_scalar_attr_encoding @@ -92,6 +93,14 @@ def __post_init__(self): k: v for k, v in enumerate(chunked_iterable(self.input_urls, self.inputs_per_chunk)) } + def to_pipelines(self) -> ParallelPipelines: + pipeline = [] # type: MultiStagePipeline + pipeline.append(Stage(self.prepare)) + pipeline.append(Stage(self.cache_input, list(self.iter_inputs()))) + pipeline.append(Stage(self.store_chunk, list(self.iter_chunks()))) + pipeline.append(Stage(self.finalize)) + return [pipeline] # type: ParallelPipelines + @property def prepare(self) -> Callable: """Prepare target for storing dataset.""" diff --git a/setup.cfg b/setup.cfg index 9c7719f5..d2063367 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,7 +3,7 @@ max-line-length = 100 [isort] known_first_party=pangeo_forge -known_third_party=click,fsspec,numpy,pandas,pkg_resources,prefect,pytest,setuptools,sphinx_pangeo_theme,xarray,zarr +known_third_party=click,fsspec,numpy,pandas,pkg_resources,prefect,pytest,rechunker,setuptools,sphinx_pangeo_theme,xarray,zarr multi_line_output=3 include_trailing_comma=True force_grid_wrap=0 diff --git a/tests/test_executors.py b/tests/test_executors.py index 551abaa5..b21ad56a 100644 --- a/tests/test_executors.py +++ b/tests/test_executors.py @@ -1,14 +1,20 @@ import pytest import xarray as xr +from rechunker.executors import ( + DaskPipelineExecutor, + PrefectPipelineExecutor, + PythonPipelineExecutor, +) -from 
pangeo_forge.executors import PrefectExecutor, PythonExecutor - -@pytest.mark.parametrize("Executor", [PythonExecutor, PrefectExecutor]) +@pytest.mark.parametrize( + "Executor", [PythonPipelineExecutor, DaskPipelineExecutor, PrefectPipelineExecutor] +) def test_recipe_w_executor(Executor, netCDFtoZarr_sequential_recipe): rec, ds_expected, target = netCDFtoZarr_sequential_recipe + pipeline = rec.to_pipelines() ex = Executor() - plan = ex.prepare_plan(rec) + plan = ex.pipelines_to_plan(pipeline) ex.execute_plan(plan) ds_actual = xr.open_zarr(target.get_mapper()).load() assert ds_actual.identical(ds_expected) From c2927be41f69f63ec4b52992d66fe2dc02f0f600 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 18 Jan 2021 12:41:20 -0500 Subject: [PATCH 21/34] add rechunker to CI --- ci/py3.7.yml | 2 ++ ci/py3.8.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/ci/py3.7.yml b/ci/py3.7.yml index c310ff4c..807f4192 100644 --- a/ci/py3.7.yml +++ b/ci/py3.7.yml @@ -27,3 +27,5 @@ dependencies: - toolz - xarray>=0.16.2 - zarr>=2.6.0 + - pip: + - git+https://github.com/rabernat/rechunker.git@refactor-executors diff --git a/ci/py3.8.yml b/ci/py3.8.yml index 739ce1ef..bbdd3a6e 100644 --- a/ci/py3.8.yml +++ b/ci/py3.8.yml @@ -27,3 +27,5 @@ dependencies: - toolz - xarray>=0.16.2 - zarr>=2.6.0 + - pip: + - git+https://github.com/rabernat/rechunker.git@refactor-executors From 9a1d11f41a7be9b508496ed6d70c185acdfc431d Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 18 Jan 2021 12:42:46 -0500 Subject: [PATCH 22/34] add rechunker to requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 11a36cbe..83520897 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ setuptools click dask distributed +rechunker xarray >= 0.16.2 zarr >= 2.6.0 fsspec[http] From 1879acb2a2a6ecf928c6fba9264f736f5c833e1b Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 18 Jan 2021 16:55:03 -0500 
Subject: [PATCH 23/34] create ABC for Recipe --- docs/{design.md => concepts.md} | 13 ++++++- docs/index.md | 5 +-- pangeo_forge/recipe.py | 61 +++++++++++++++++++++++++++------ 3 files changed, 65 insertions(+), 14 deletions(-) rename docs/{design.md => concepts.md} (93%) diff --git a/docs/design.md b/docs/concepts.md similarity index 93% rename from docs/design.md rename to docs/concepts.md index fa81f088..65da4a82 100644 --- a/docs/design.md +++ b/docs/concepts.md @@ -1,8 +1,19 @@ -# Design +# Concepts pangeo-forge is modeled after [conda-forge], a community-led collection of recipes for building conda packages. +## Recipes + +The most important concept in Pangeo Forge is a ``Recipe``. +A recipe defines how to transform data in one format / location into another format / location. + + +## Storage + + + + ## Components The high-level components of pangeo-forge are: diff --git a/docs/index.md b/docs/index.md index 14b7ad31..6b629e03 100644 --- a/docs/index.md +++ b/docs/index.md @@ -4,6 +4,7 @@ :maxdepth: 2 :caption: Contents -contribute -design +concepts +contribte + ``` diff --git a/pangeo_forge/recipe.py b/pangeo_forge/recipe.py index 5c9b322a..adee9124 100644 --- a/pangeo_forge/recipe.py +++ b/pangeo_forge/recipe.py @@ -3,9 +3,10 @@ """ import logging +from abc import ABC, abstractmethod from contextlib import contextmanager from dataclasses import dataclass, field -from typing import Callable, Iterable, Optional +from typing import Callable, Hashable, Iterable, NoReturn, Optional import fsspec import xarray as xr @@ -49,14 +50,60 @@ def input_opener(fname, **kwargs): yield f +class BaseRecipe(ABC): + @property + @abstractmethod + def prepare(self) -> Callable[[], NoReturn]: + pass + + @abstractmethod + def iter_inputs(self) -> Iterable[Hashable]: + pass + + @property + @abstractmethod + def cache_input(self) -> Callable[[Hashable], NoReturn]: + pass + + @abstractmethod + def iter_chunks(self) -> Iterable[Hashable]: + pass + + @property + @abstractmethod + def 
store_chunk(self) -> Callable[[Hashable], NoReturn]: + pass + + @property + @abstractmethod + def finalize(self) -> Callable[[], NoReturn]: + pass + + def to_pipelines(self) -> ParallelPipelines: + """Translate recipe to pipelines + + Returns + ------- + pipeline : ParallelPipelines + """ + + pipeline = [] # type: MultiStagePipeline + pipeline.append(Stage(self.prepare)) + pipeline.append(Stage(self.cache_input, list(self.iter_inputs()))) + pipeline.append(Stage(self.store_chunk, list(self.iter_chunks()))) + pipeline.append(Stage(self.finalize)) + pipelines = [] # type: ParallelPipelines + pipelines.append(pipeline) + return pipelines + + # Notes about dataclasses: # - https://www.python.org/dev/peps/pep-0557/#inheritance # - https://stackoverflow.com/questions/51575931/class-inheritance-in-python-3-7-dataclasses -# This means that, for now, I can't get default arguments to work. @dataclass -class NetCDFtoZarrSequentialRecipe: +class NetCDFtoZarrSequentialRecipe(BaseRecipe): """There are many inputs (a.k.a. files, granules), arranged in a sequence along the dimension `sequence_dim`. Each file may contain multiple variables. 
""" @@ -93,14 +140,6 @@ def __post_init__(self): k: v for k, v in enumerate(chunked_iterable(self.input_urls, self.inputs_per_chunk)) } - def to_pipelines(self) -> ParallelPipelines: - pipeline = [] # type: MultiStagePipeline - pipeline.append(Stage(self.prepare)) - pipeline.append(Stage(self.cache_input, list(self.iter_inputs()))) - pipeline.append(Stage(self.store_chunk, list(self.iter_chunks()))) - pipeline.append(Stage(self.finalize)) - return [pipeline] # type: ParallelPipelines - @property def prepare(self) -> Callable: """Prepare target for storing dataset.""" From 0fa41ee1390616f6d2889712b1646b70b62f6084 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 18 Jan 2021 22:49:56 -0500 Subject: [PATCH 24/34] start working on docs --- docs/bakeries.md | 1 + docs/concepts.md | 75 ------------------------------------------ docs/conf.py | 4 +++ docs/execution.md | 1 + docs/index.md | 38 +++++++++++++++++++-- docs/recipes.md | 15 +++++++++ pangeo_forge/recipe.py | 51 +++++++++++++++------------- 7 files changed, 85 insertions(+), 100 deletions(-) create mode 100644 docs/bakeries.md delete mode 100644 docs/concepts.md create mode 100644 docs/execution.md create mode 100644 docs/recipes.md diff --git a/docs/bakeries.md b/docs/bakeries.md new file mode 100644 index 00000000..efdfd4d7 --- /dev/null +++ b/docs/bakeries.md @@ -0,0 +1 @@ +# Bakeries diff --git a/docs/concepts.md b/docs/concepts.md deleted file mode 100644 index 65da4a82..00000000 --- a/docs/concepts.md +++ /dev/null @@ -1,75 +0,0 @@ -# Concepts - -pangeo-forge is modeled after [conda-forge], a community-led collection of recipes -for building conda packages. - -## Recipes - -The most important concept in Pangeo Forge is a ``Recipe``. -A recipe defines how to transform data in one format / location into another format / location. - - -## Storage - - - - -## Components - -The high-level components of pangeo-forge are: - -* https://github.com/pangeo-forge/staged-recipes: The birthplace for new recipes. 
- Anyone interested in adding a dataset to pangeo-forge can submit a new recipe - through a pull request, using the examples there as a starting point. -* Recipes: Metadata describing a dataset and code for transforming it from - raw input to analysis-ready dataset. -* pangeo-forge: A Python library containing useful pipeline helpers and the - [pangeo-forge command-line-interface][cli] for validating pipelines. -* https://github.com/pangeo-forge/docker-images: A repository for building docker - images for pangeo-forge. - -## Pipeline Structure - -pangeo-forge uses [Prefect] for - -1. Pipeline definitions: the code that expresses the transformation from raw to analysis-ready data. -2. Orchestration: everything involved with taking a pipeline definition and actually running it. - -This imposes a few constraints which we build upon. - -All recipe definitions must be named `recipe/pipeline.py` and must contain an instances -of a `prefect.Flow` at the top-level of the module. - -Additionally, pangeo-forge imposes additional strucutre. All pipelines modules should include a `Pipeline` -class that inherits from `pangeo_forge.AbstractPipeline`. So most pipeline modules will have - - -``` -class Pipeline(pangeo_forge.AbstractPipeline): - @property - def flow(self): - with prefect.Flow(self.name) as flow: - ... - return flow - - -pipeline = Pipeline() -flow = pipeline.flow -``` - -## Lifecycle of a recipe - -A maintainer contributes a recipe to [staged-recipes] through a pull request. We -use GitHub Actions to perform some initial validation. The actions are at https://github.com/pangeo-forge/staged-recipes/blob/master/.github/workflows/main.yaml and the logs are available at https://github.com/pangeo-forge/staged-recipes/actions. - -When a PR is merged into `staged-recipes` the [Create Repository](https://github.com/pangeo-forge/staged-recipes/blob/master/.github/workflows/create-repository.yaml) action is run, which - -1. 
Creates a new git repository with the contents of `staged-recipes/examples/` - inserted, along with a few other files provided by pangeo-forge, including the `register_pipeline` action definition. -2. Pushes that new repository to `https://pangeo-forge/`, for example https://github.com/pangeo-forge/example-pipeline/. -3. Executes the [Register Pipeline](https://github.com/pangeo-forge/staged-recipes/blob/master/.github/workflows/scripts/register_pipeline.yaml) action, which registers the new pipeline with Prefect and executes it. - -[conda-forge]: https://conda-forge.github.io -[cli]: https://github.com/pangeo-forge/pangeo-forge/blob/master/pangeo_forge/cli.py -[Prefect]: https://docs.prefect.io -[staged-recipes]: https://github.com/pangeo-forge/staged-recipes/ diff --git a/docs/conf.py b/docs/conf.py index c46ec4db..2dfe4699 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -11,6 +11,9 @@ extensions = [ "myst_parser", + "sphinx.ext.autodoc", + # "numpydoc", + "sphinx_autodoc_typehints", ] templates_path = ["_templates"] @@ -21,5 +24,6 @@ html_theme = "pangeo" html_static_path = ["_static"] +html_sidebars = {"index": [], "**": ["localtoc.html"]} myst_heading_anchors = 2 diff --git a/docs/execution.md b/docs/execution.md new file mode 100644 index 00000000..7bb5b258 --- /dev/null +++ b/docs/execution.md @@ -0,0 +1 @@ +# Recipe Execution diff --git a/docs/index.md b/docs/index.md index 6b629e03..2718b2bd 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,10 +1,42 @@ -# pangeo-forge +# Pangeo Forge + +Pangeo Forge is an open source tool for data Extraction, Transformation, and Loading (ETL). +The goal of Pangeo Forge is to make it easy to extract data from traditional data +repositories and deposit it in cloud object storage in analysis-ready, cloud-optimized (ARCO) format. + +Pangeo Forge is inspired by [Conda Forge](https://conda-forge.org/), a +community-led collection of recipes for building conda packages. 
+We hope that Pangeo Forge can play the same role for datasets. + +## Recipes + +The most important concept in Pangeo Forge is a ``recipe``. +A recipe defines how to transform data in one format / location into another format / location. +The primary way people contribute to Pangeo Forge is by writing / maintaining recipes. +Recipes developed by the community are stored in GitHub repositories. +For information about how to write a recipe, see {doc}`recipes`. + +## Recipe Execution + +There are several different ways to execute recipes. +See {doc}`execution` for details. + +## Bakeries + +Bakeries are cloud-based environments for executing recipes. +Each Bakery is coupled to one or more cloud storage buckets where the ARCO data is stored. +Bakeries use [Prefect](https://prefect.io/) to orchestrate the various steps +of the recipe. +For more information, see {doc}`bakeries`. + ```{toctree} :maxdepth: 2 :caption: Contents -concepts -contribte +recipes +execution +bakeries +contribute ``` diff --git a/docs/recipes.md b/docs/recipes.md new file mode 100644 index 00000000..f053c335 --- /dev/null +++ b/docs/recipes.md @@ -0,0 +1,15 @@ +# Recipes + +## The Base Recipe Class + +```{eval-rst} +.. autoclass:: pangeo_forge.recipe.BaseRecipe + :members: +``` + +## Specific Recipe Classes + +```{eval-rst} +.. autoclass:: pangeo_forge.recipe.NetCDFtoZarrSequentialRecipe + :show-inheritance: +``` diff --git a/pangeo_forge/recipe.py b/pangeo_forge/recipe.py index adee9124..e42fbb7d 100644 --- a/pangeo_forge/recipe.py +++ b/pangeo_forge/recipe.py @@ -51,40 +51,53 @@ def input_opener(fname, **kwargs): class BaseRecipe(ABC): + """Base recipe class from which all other Recipes inherit. + """ + @property @abstractmethod def prepare(self) -> Callable[[], NoReturn]: + """Prepare the recipe for execution by initializing the target. + Attribute that returns a callable function. 
+ """ pass @abstractmethod def iter_inputs(self) -> Iterable[Hashable]: + """Iterate over all inputs.""" pass @property @abstractmethod def cache_input(self) -> Callable[[Hashable], NoReturn]: + """Copy an input from its source location to the cache. + Attribute that returns a callable function. + """ pass @abstractmethod def iter_chunks(self) -> Iterable[Hashable]: + """Iterate over all target chunks.""" pass @property @abstractmethod def store_chunk(self) -> Callable[[Hashable], NoReturn]: + """Store a chunk of data in the target. + Attribute that returns a callable function. + """ pass @property @abstractmethod def finalize(self) -> Callable[[], NoReturn]: + """Final step to finish the recipe after data has been written. + Attribute that returns a callable function. + """ pass def to_pipelines(self) -> ParallelPipelines: - """Translate recipe to pipelines - - Returns - ------- - pipeline : ParallelPipelines + """Translate recipe to pipeline for execution. """ pipeline = [] # type: MultiStagePipeline @@ -106,34 +119,28 @@ def to_pipelines(self) -> ParallelPipelines: class NetCDFtoZarrSequentialRecipe(BaseRecipe): """There are many inputs (a.k.a. files, granules), arranged in a sequence along the dimension `sequence_dim`. Each file may contain multiple variables. + + :param input_urls: The inputs used to generate the dataset. + :param sequence_dim: The dimension name along which the inputs will be concatenated. + :param inputs_per_chunk: The number of inputs to use in each chunk. + :param nitems_per_input: The length of each input along the `sequence_dim` dimension. + :param target: A location in which to put the dataset. Can also be assigned at run time. + :param input_cache: A location in which to cache temporary data. + :param consolidate_zarr: Whether to consolidate the resulting Zarr dataset. + :param xarray_open_kwargs: Extra options for opening the inputs with Xarray. 
+ :param xarray_concat_kwargs: Extra options to pass to Xarray when concatenating + the inputs to form a chunk. """ input_urls: Iterable[str] - """The inputs used to generate the dataset.""" - sequence_dim: str - """The dimension name along which the inputs will be concatenated.""" - inputs_per_chunk: int = 1 - """The number of inputs to use in each chunk.""" - nitems_per_input: int = 1 - """The length of each input along the `sequence_dim` dimension.""" - target: Optional[Target] = None - """A location in which to put the dataset.""" - input_cache: Optional[InputCache] = None - """The length of each input along the `sequence_dim` dimension.""" - consolidate_zarr: bool = True - """Whether to consolidate the resulting Zarr dataset.""" - xarray_open_kwargs: dict = field(default_factory=dict) - """Extra options for opening the inputs with Xarray.""" - xarray_concat_kwargs: dict = field(default_factory=dict) - """Extra options to pass to Xarray when concatenating the inputs to form a chunk.""" def __post_init__(self): self._chunks_inputs = { From 4ee634a4806b569310d43f87d04f40a1a4222d27 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 19 Jan 2021 10:39:46 -0500 Subject: [PATCH 25/34] writing more docs --- docs/_static/pangeo-forge-logo-blue.png | Bin 0 -> 17602 bytes docs/api.md | 26 +++++++++++++ docs/conf.py | 13 ++++--- docs/index.md | 1 + docs/recipes.md | 48 ++++++++++++++++++++++-- pangeo_forge/recipe.py | 20 +--------- pangeo_forge/storage.py | 33 +++++++--------- setup.cfg | 2 +- 8 files changed, 96 insertions(+), 47 deletions(-) create mode 100644 docs/_static/pangeo-forge-logo-blue.png create mode 100644 docs/api.md diff --git a/docs/_static/pangeo-forge-logo-blue.png b/docs/_static/pangeo-forge-logo-blue.png new file mode 100644 index 0000000000000000000000000000000000000000..467758a8305bbc71aac9b174d10be530bc8ed8c1 GIT binary patch literal 17602 zcmdSB^;?wR7dA==DAGtt2@EAQLpLhj0!sG~j&wHz(kbhz}_12W{#HJ4$jt@ 
z2jVZ$&|aY_%DnyHnYFj*;cM`B_VIYw4>UWJLsat%jrZyMSC$|_*eCY&d@TK!q5%i* zlymvr6W&c%y5*|9n)zNk^8IwXuxRA?eO?*1x!9AZSW2&a5+v-9T4POcI9v1+Ld8u5 z`Gk(cY2`}t-Gynt-kNgdP2*RL|NnlqeR(E|P8`oYXz0lqOB+@94RRq|MeKNkX)5yc zfT2tz7!srVGw^0!5GSkJ6_wD`%P-q!Mit|P4A{oRLnFmT6cMk>L59=}FF%@(mW$ti z^Xp{cdaGfUFl_Y!ueV2jRVI>eE-r?a@x|x->k&0=&RDD3O4lqthbIiNNGEJ&H@}5F z%$9V^$y>gx*?82SXVo@)Y=$njV|HaW;dtIT%*hZFmnI8)$RNsd;TM9gQlNGWX^UEw z9c)KA*3KrI72xxMFBmzkgG6YBS&Aq_u}zt$9&l_@h~gv=Ct@%85-MHoHX)*7hkbKPR4Tm-=4B`)jk}1DgAHGI;FemHDL;w z>7%UAz!)Mj*#$)aB6*NVQAT6!or0{`GX_a^C;YC*v+yXSBC}pUvNpB-ZHHfJ-WXam zseYB947;8*J;1;&9C?lFPV6RNa?BjYbdnC|?(-)ZpfRl)9-W7XM-H^r3O&Y?MH%?U z@>Hg4KwgB!q0!@fx6TWbpnM2$f)9<&oBVm>+fV#j%^q*RBmvyhIX~M6U8glQ0%M`D)FXI--^Y#(+yOhoTKjZ91o*H1h+e)&XXD6?+06>AW~vJ2T{7W~ zc7s?^IC+o_go>{X(1xc!kL&H}c;ZXGwJu`pOoyyCv4*J{rQ9TPYMLd)DRDgD8q&GG z%=Ub7#p^aXY}HzvZX%9OYJi{fxFY{7oz%1Db16N{t)#YqYHJtrdwt@g3EE686R6!` zcg)^lPrju5T6J#UWFT-x9o#ygl~p^tueCT6cnn)h3?C>k;kO+&=zaOGoqHC@bedCPuHLwdGaO!Nd*vdUyfN5credV~K5fdLo zZRziySGfcYZE)dPlenjVZ=;ac3@6q%|oG^_^I{g{#7iZ{hl2bR*I{|p zp6nE4Tpf>ns%4eB`Ct@iD_VJ{vcd-pSKyR%Hf=E@UIaW{6r6S|piT^6AQtxpYqpwh zh3SoIi7GZ;N)XgPAyNAMW^yXV;d9X#{%96d5>B>^T12M7ku}UeWi}*7yGqgYLz=<` zrB7Y)Bq4+36N?kWJd#MUrQFV|ho4zWijN_0i;u~jSTCKGQtDhz%^%bRvxE-$Ri@(F zkCf?E?oQvbGL&eUG0uKa;ZJ!`OBC$Bjd^445Y-Cp=xq9{6*RQw_7fG?JL9`_^qGxT zHutG$M46*X>-^^8h~bG(*wJn}A1mSE#hJZV&9kpB@VapVc9Q0q>mgFe$KWM%!OfJo z&5{gNy{!SBUSSmXif^l}&S7ffJ=%l+)@+nOoT=x_Zow)Xw5sn8^Mr8`5@l>FiTF52{VMi0^9 z8_BAd*H!58xy#wQL5Gs23K@`&u5wXsJs^^_BplFekfHf`v!`-!EVd!E>YC{2p7ZQ4 zeYY4J*W(K}0jcMrWOtbc&P2Kj79)q=;^;U39GFZ6a^`StICx$w%}*Mhvf%YAOta8i7nKvasrQ_fD9(@e57KXYc824zmJFPAs9(C#7I?uoW@iYdQ

Kyh&O|n-nl)4? zccr6kiImZ-6fgLpy1PVW8)t(-NH(q`#_t(^;)ISx^9vXavNn$8Y6+_ zDfQS_T^z4u#tVJGt70u29mTEkQLF<8CW-(jHf2gY)jJox29Hq&AIw@s^ODIr;9T_W z?Al!Ef2uWXUO(r4P_P*tF*rU(q&5+U(raX{8MRHH#k9&q4X!c~nEQOPTkMSN9i`(I zpAH*JadAA7FDczqXtJ~$bYLr`4Et+9=i*5{Ub#CE*~>+@u8{H{IDUh($&GdMqc2`6 z^HNay#08(z*n&wG4IrpdB8YH^@Tzf!me~^}Qr4>=y7uE03!!93>3am_J=k{H~E}u>I>HYf6ADJ*6 zjKjq^cSQB7P3tS6e7^c)VgQAB&sWZ|A2f_$}@pXx>w25a$5#@W8N~M{eSFciN zVRgN}X|u`KdX3ZvJI;wKnlxK6raQk?eSHJU6 zI8XkgXW0Lwqd0qBVc)JUhmo_yw%a)lDUPUHmq30k(=cN!slHI8{{rfe@A(?4tmx4G zhRLmg-%Mqv8~qtmINhIOuAOCD5G_h9Taq-!`og{b*o-crPTeb3L#K^I*~3Gk-mD3K zI!n>+ynjL)F1CHm+4gp~ef`K=e(?(M&5qHR!fLqmp&&;91O-Cn4k0Qhr7U9-=hKd! z82_n04)%$@J%Ewm++NliP)quFUQROUWzLGRtoT7|Vo`&V8|Vy|rAOa$b)+$;?l5eY zWf$|-ODc_j)%<|pj_Kq(MYeXtazCha2_Co~zNN8COO*!z>T~pYyJd?&bzud^?HJL@ z*xAjB<6Co`qDeZ$%z%is{d@V=uvcf&g!{CHr!H1HDKy8Ej!xfM(Yz&u%b6n-)BE+j zZ=;0Wac4LnDA%r_P%V>l_8l9#u-|xUwd_yo<2H%>eZ$eKP*X~wYhbg(xEfN}= z?^i^HB25FL6^x5qTU?Lmh*d8_7l~Vv$X5ERk8{MWRR@%K9Dy*HT$)Pl>HtE>_))3Q zfnyX~8RdR@=54R;tgMt+S8Ijim|F?)33}i9bobn)^xQfpmb9j)WjRkU;zOiwziiFB zkO+_istd{>Iq&o8r&QDK*i6fYK&;U-&{CQbhdVmcSY)EG)0NIP^J;uI9w*^fe>03D z=HIBt*ts79Xq+X{8v50lt{wCkd=$~$)k~M36+iy*izxnV-0$7NsQ0}~Cc%2GVdY+1 zg3?1PZaT|Al1}A@nyFXXL0`fmHAwMtFJ<;mhExh`Y5n(Xw+*gSzVv! zj=j4kkTT>Cuwt$Rp$$Co++O;7$gEH_+tK1DSzu>K{sq#|J&)=OXnT)Kwwr`8DK9EEXjT`U5@8A$+}t8k)<(do@MZ&^27qK2rSTy%{5xYKWqPMY)ToIc&P? 
z*nI8ay`cy8{58hRxbLo%MUL9qk5Y-VswAO9X=3Np74I}nv;Dd!T4};Wl2ljtD}O7V zt!VSLoVccZ=&Q2*HE?QftrHqScSq4zSpD2F!8q3sx+Gm|_7p4y^@~s_u6A9a{)`Fr(Q+>#Kj%U%o!nW))-ShR(8+R7Mkj= z`>nYt-D`}U$}+AA0fCmP&41Ka?;CWI&N`k@9^}Uj5mrW>vaMLUR6`>K3|I2~yrUIq z3I$(1h#i@+st$bVejJ{sVqs*YDga`T?G(B&r)VSt>AGus%x%q%c2%5rq+S1mW2_}V z-|L&B7217-ourH}#m{y}lBxf|#PL~$z+-=xqrJ~{7L6hI-`~#gH{{_U+Rtv}%(G8L zvs|!>gb0o(nLdUpuvLF(H2DOn8g>yEx`R3T2}f(l^uU3YQ;P=dHg4t|TqbkJ8e7(ZRCC=?8J$Z9A~|*R*6;7gsw8P*Ss2%NfK}r^ z|F6`)9$6Y)JSiNebLaZaf7Je{k&!ukl?Letb*ZaQnGfKiW1>^m+TeAhJED}A>G1$& z{>j;(rL%a--UJvNrU5Fy*lyxurNWx1L$^riX}SQsteCRId5}7!AMi+&w*teE@>?pJ zOZuF3i$u4&^%4c1ylfL@EzUFt0Ec37c(AIbSIXb}r_^9sU-;_za!ifa7tbyiEX^?u z*8xciR}on*&XA(ie@$o$A*OSRmrpire_AOUAL#BD>-MZeS{R$ufndA~edXuG$9wi| z@;-0szQBR%gTIQd8`ac^xwwSEt{&n0xFNLlp>EOhLX$sKhMV1pj({kMzaBClNH0C- z{HEcFDIYszV7MUkC);+j{z}uidKa^kg$lw?eo4z>y02c3e>n%^?5IYsOT-R#Gf{d&79rc& z8P#{(A-kCY+BEI~vtQQBHu`0${rW*uo)B;Wa@TqI=^SC}^DAren;mcG&qjkmN!oYh z4mk8T$e9?)g*$Jak!yl>c8j%c#U53<`OQk^WV@u4Lo~kM(n);j|MG|qvOeS0YwO>Z ziO8o3vaeOVnYz(;epC{Q;-i)M#PE=62CW2FFmr@NZ2Pip3J@26`Ok;J(H7;h9q2aY zJ>N}$FdmGKR7&|+3>SP4yY5{%b~Ux}zAU!2nKD1pOcU-|U&gLb6MMIPH&At9ODOVY zZD?3D4g!kePh#p6u|9YMKExM|Ih(!2s*=4o;A*ntN$~(5p>`3UHP#9ZJ*mlPp5wS7hGN zBN7(B?cGmOkowRc?dFl@x8K-X4L!c2b9QjJ~qJ(yF4@8>D**7e3$%erNFM^JmlEH8udt5HfL{8SE=5fnM}eNXyL%KhARZ z?|z(9*X<5o7GG>kt$MR3bl~j+Ha8-1qcjfa!}N4`X}j#(S5Ga;Jc7J{BoA*iWwA_m zp({&-;etjgICiFAcu0klzI%8dv&^N#p&G>TZv%ijfjF3Bi{JN1Z75xgWebF+tN#J-9*2M$=`drV5+Tr|=L~SD%n5 zdKd9#(%+w4yAKHu{g}K8iIylqJWce^L^0|8IJcfG^^o8i$*+hR&1IP0UKYkQ@+5M* zn1tsqN}8D5gJho&mH)Eg!`2hN&a5?v5@}Uu7*adV3rz)Tb-gXwbBs)k z68ut*#~*gI;uCO3gA2J(Q&&ztjZj6&C?{^b&1&kq%PDGTqj4oq5LNY0-g7yIj_vCF zSNf2@$u?dY^?uBOB8N3P!{%3!dWu_6DVv?A)LUB5hC*Js*)PM;Mb`e(n9qCeY}f81 zdK*d2KRUFukK=j_b6ZEBtWpQ5y&L-`#*$$4lcU>Iasv{{Z1I3nmgFFpulCHXbGUly zg0gUL6=zQ7=AOo+4J>-^W5D%QyY|1yV*`OXLJG4+9 zfP9^^m6pJDaXG!A#E<@Xogm@6-Y_<%SwWN$u;AgEok!C@ml%Wj<<~5ExA(lxA$2BJ zv-`8c=C2+(JJvsPPNN;?X(@IMmsH+-98pxb`4^7pA#cI4b^b3%9KuhD{8h*W+VM25 
zcM*ugFMjM*4?3K&YtB?h?@UENy;JW^T$F*u*52%dK`ux=muH1sqOVyop+<*;j9za6 z@rL)PZ=idYK!tO67(8T?^08rUI9=!vdVzoM7rvgGoJ`w@(t2{O=%IJ+Z@cvlJp3d@ zdK&PG8ai6Uu8dy!^(Z0TQ}!=3jC*s-PYGMU;2eG;6eT{E1BO zaXu-9r`}UK_qe<5)HD0=cd%-NM+xHTM818n>k&BoUU5^}W-8rkEFGdK zSz~)dY4(%HtMK>CPjR^!Y0jJHCe9*r31WnSO+b?1@|o477SVSmee?c3fC*oA4dbS@ zX77Fh87iI1@rUSpf{6Q1HNV6oRu_S)=l1TC9igatTTDVM!;9KgxpBCJO$R*SW@p!F z5Q}oh?+*u+PXIN{Ml!_m<90R-B%%yRfGZ+}F?Rogws&RN?*0SSx+MwfWtk8Az~QGe zuH1&F!#KC{;;IKfmBsFBRt?^8#sp-B2j8>zrF@hK6?m@b(H`;lnVkLZ-E!)WxkLKA zdX$Zf7>ns3#;ZhXGXV@K-o9p+9i8Xp`xnto928JhU#0z=5zxv{5t zm=Dqis1OvFAu_1LYt;)YZnW)$Owvw1tuu*q?O}CCG*PT&JTt{P^K8qf?G^+p|5vV` z7=pL_Sx%N7kHS1H93m?rRHfRuhpdWiemiWruf6Z}mnkCiXJEc*=nBzxbr5ogm&S!4Fq|3@c4X$@aB-ScIOK2f1#4{LRgprhIA=6;v7xF@ zptp%^AVTzgQ+M;dQUv+JQPjk8GkRrr-2c6SgrzJ3?z36jN=Y$`J);BjQuVRZ9sY>lTgrA2u{KZPQ1eW^*=_ayrbx+s|^t0V3Rr6YF& zHqd-Ibor01UzCi6QN)h($bUr#eUkwO6x$xS#H_~<)#9UQu)_B~fcQOfaMUgk-{YbcwfCAqBDP)KZj-cNS6(HI`0#S_gD zSa{rrv={$fwc7=U#&&JbUq|W?_e+wpe+ftZxMlV?Wwl?=$r~riEdjH?ueP%Zw=+@x zH1jF4FIUMkcnFs?9%H31Vt-Bs*+&sUKhm20uCNRA=+-8ZSl4seuC%+P<8(v#{nH&k zBb3Fg>!yc=@w(SeLdNw}NwmJKi$#cLDrY+j#K&(V^2mio&g@hDOg=n1Tjlr)CVY=$ z4dx;yJ~a1f7_Z)DE)Wwc5_P&VH1)EoNl*crF^`dfQkaK|b!Jy-kQb_Hauz+upP&T9 z{bYi4OgB6FqJm|hrCO;;6!(!7Lg;>2rRMFST*!Jh!f>Un(9<|}+~HO4y88D|3l=Jc z!c6@U;b%Q|_Jw}q1Fp8lAu->5gGU%vrV+G=*UO2CK!r=wQx1I^Jr^gmcx)v2os67n z;YQ*3Q+%3?_zfAESn~OX#0z|m?U{B+wx977rj*2;!u3Vu1|1noNGavjtM}+xIiJ^$ zR#so+I^y<1)zQ|ka;=h-)+f(v%i$58>wzCKTR;RnMtBQ2J@-TEU%ml#;WUCj zn37LqG+|pwXGNY}FO*nvqO}YYfH?~BYs+{OQ^$<8yGfMW=#V&wr+q%1 z4z=+|TZkL_vL)~#?UyS_qGcu4P2t!G`?hrdRpOS**oQ31d)|bs@QqwgEzI&=<9xgI zm$sZKpHHz@;DIkqza91~4gu+1r3m@e6Gu!(UqIUR_x#>8erjndqfcpR*v2^Yly76p zYb(6cV~ys=BdTz(AV++8v(pQpN+9t2783A?(T-;L_4E3#uV!Z=PQ1+P>J#8N1s+o| z(NBxDu_==r5kke{_uW`@uOR8?5Pm}@;IV^jbYJpD?5#{C>Egm*NmBKK!w9XX=exOp zWdjk$G>Z#EhZ?fZ*q++-$8k|YvtN4zOv!)|3Wiw0s%*8|E8zrcK z+D-r}XKG^2|*BwOr=L)F=QTCAd@R`+t5%m!&s-w}CEUTzec zmVk*Xu@h!A>S@w4G?f7`I1s*3Pgm+sgQWjbmC>{?O06$95UJfE6LmjSW9`8u@rk=b 
z55LLCI>UXf(!)a+=7l|W*h9K41S3(?&|o%7E{Jn~-Z zqI{gkN&y|K2oqoNN5ssB0_Q=7@hlUZ^Pp8sLM6^d+7$6C({*ninXjWgTH{1I<)&rV z9@MiCPmkf_r>^GB{I3Ux89!`=VvFX{j*J~8iapN^GK2;7A zmS(BHiGJui3VRYK6tDi)buliEn)vjm@RDTKw-GA96xBQtXgBJT@ehe=UF=i9sc2r~ zDyPwJOZon>)M;K*a<&1#5l8n)>z0o*V^|3|-ZoRgJbZH;QT5?v#{2Mg(mu%6L}COU zRTgk8a5Zi#7Wvy-Gg1Z(5UPN3@j<}h$KE@Y5Q+Z=EH^I+O$YsXz__NJq4*jlwp!-9 zWhW>k)<0S{79|JeuUwWHA1K`x{(92ie$?kU==bKgO^EDRQO8!o^Y4kegQsSiG^l6i z``rQTth`4Xior^ed2F_m81GS%;+d2aO`nsmx*ABQkyeACd|72(*=9?4ITnISyPu>5vryfps!x5X-q3zh0}38c^7)#gWprcRC2dSlx2 z|D9}(K`epC&Ag<;|>G$#@w#2rnv z8ZUM1Hdf%;q;@JzIgWui3S|{uc=*!_bb!p)dn*5fC(W0o&(7J+Mv|W4NuJAe-QQY) zM2LAr3HY5I|9ZbPQM749y*jeV!<_+;l#vR^zi~TFSD+x4fmr2|BFGuFPy$17w#;%h z6K{N`!bw??*C_!(Yhy}EdCOSD5zP_lO2eCf;%;D&v1lsLkg!t2(LE{HAABk3`GPx( zvk-{>bw~a`OnNUI?21>DV`%P%MGI8fr%7oM%L<(5eiN4)GBqk85ifr9gD#}6UySf1 zhpK4=ztuwWot-6y2Y8kh}n<-NKI7 z9pocS`wH0kmJmueG{>Luo;CXSo6lp^vyIB*q5CwxFwDVi{OPh+P97W6Ac84Ub~d2F z*5DZ6k-+wzo4GglzJLLf(tAz@TRk&?R1r4qON?gDWk=)d#fM|8D}VOh=EtB3bwvsK z4q&mMFyQ^n|GokXyKoHrfyV5p_8FeUDTwjD4wiXV<~EIfQLZaJ-fX8kty*`j zy@Euj2THWFvL*xphuA?dprwG zUIrPcKJpQh=p3*(8lb}fI`(lS22(Vq^;D^AVyjaPY&{>Iu(ERw{-pIh^RYR)UV=T_ zWE|s;;ELhMT#kga9}|e}VqF;{CdG3>5T|WCK+EWh)&k9r*m)J#~ z+|e~+da`pwL~Kw2(P4k4k~sl_%E$=3u>qlZTxzi9Vd{T&T+g=ae+jbl$!u=fi$iRV zIL0V60C8a9cgib#aB=B#xgp7P;!U;R#&RT$_;7u>^-yjhWtYH^H*OcaB>g6H>`EYk zeMDG*AHrI!#dI^Vlv4PC@+OuhYqAGK*^+C#tfg_HLEnXn`biJ_1 zg<|vm)jC87XE2BMMA8YZ~zE_WUhAEV>3mKs@EUDEQWtl zQ#uVj(^FTevEJ?Gao0)ukY6AJvIJB=HjQnF)ry8EcGFK=XCsU{s}5GG4Q?c&oomhw zturE67035wkJ=7@foG?~zNSig^xE<7sQOG=6sbEU*x%E9g@xpCaPg`93yd87*-O#<<6WQ za?!ZlZ0%WtsE7~4tKokyDstLGdJZZW6IlKYc+HUgf<@JUfJkgY&Eky{8hga@%c+Bd z&&+BZM^T)`O#hsC_|+zZDF^r>yi$K4P(V!NVjqciN5)zoEpzR zSebLtW=mp;XZ>}R!A-y=C5B62^$72Hlzyv@iS5&l#7TaVn0!?L+wH8#@h14x|8Fv7 zV6qMZbRG={YwwMG}~WRbc`VYfQ3H_x?S{c<-0FJ`dz9~ zpLEpSn#jt+3xP^X_@n5e;$TeQhqR3*Px-RaVty}$9@c-8@zGqt5u0K@yi3H>39I51 zhg+^lodD}VgTTW?T~SU(7+_gzMi4OJ)k|>RKxx?|ZKCh=`M@;A750(FN))0^@3%tX 
z$`+?Pu>&_#HRHNfGlx<@;5s%lOUF)pLmcpz+sSV_GmHpYQ+gb=-EDJXFOEfBa>e2P zHv*1kycC3zW?+Zgv+dwW>GVM*-XC5%d^~p>UULL20{Wyl4AnW{w21#O($&jHo z<2A6q3e$aP%UBhvsI_+F*=pgXibW5V#M1kC4Cs1FxpG7GL1#>le#^q{#$W-$2!zVz zx8fIx7ys5Jvr&G7Zh}qzwWm*Sb&ZIW`z5mR6uA=o9G-7b#wAxK5F67XV*s6LWS*i! z#xzqRP2YtoEQ9+X!BNt}vJ73!4DVz^q>dZDl?1t6I#|rt(9UdVhaNs@$GGG9IuT43 z|E?IA10TCFqerH9RKnQSbV3B_H0HyqDgGbQtIDfr{e4oXYE11OtjNuTn(I~{Ha{)5 z`ygm;XJpSiRbKEwe1pf4KU^hwMy3BQ*Npj$el|4_Nyye%P;@*gUDj;Pg)=1tn%|%z z=@uK{O-)wt|Ci`Hk~@llnbMWh_Ld0GHftN-H4GE%b4w`2n>yhp_!Rc7&w?J zJGZ0h4uITKiJQF&V#TwVDeMf<*wULhUZB%{bd#{VolmswJX9>ZnM)a6ka14KCEeZ+ zMUPV0o0(k~R_R|cXEA{UYFNkox?RrszCXC}E0u_W%j#kK_5q^?dhJRMjPB2YEddf9y$Y2ifQ6E{un$ z^_Z<;L4{Y>s=eA2^~<=1rkb;Djhcg`#axOhw<96ojo`P@V(=HpY>0T}p=C5%URcDGR|Ru4hGvX ze2Yt2abmZT{q$MCs``U|rX_Tk?>kN1L!VRAFNRW|?xS^Tn+ioRSK_}B8^NGqbOL{R z1||we3sRtQ_S{>2yeRwhCYb$@dN^%t{p6z7%4T)4IlJs;fJHQ;VA1G?V5Oh4h&h4u zLx7R_gXkM*UiwFkk%IN&cpKvW3t0Z$nf;yf8L5Aj#Y*wh>=Mk zb|dR z@09`v6kl*DLa^0ZT{#nZ#3U=De%%;6cDGMvb93dy0GhJwddvDM!3NJX+M%0H&&_e1 zA*aoTjLRpnUHOFOh;@a|RJxC3cd0b&wj-=(HQXBAC7?85y~XAeS6LgJ1322e_s#t; z2q&6k7_YhE)?XtmA{Bi}h8vLee|`SDUxFvFD;cgxnzDeiduk@~A``ypt-t_ulp*Qu zAD0lAyH&ze_dWH>Ft#JOb5q!jUv>DQAPJCoPRPks{3Qf=7W-YKjDo0l-E*U-1&=cD z3>6O!d0(twF~- z+)wT*r!HLu2b2-lpcIA5fxgR8YxS>bVJB3k4muQW{K)@W+?autU_CS!o{Vsj#tQ)+ zcRLs&$V1nXvpgKtu{g7cWB+^sH|y<#dVy-WeGmLY@3EvS|_9_7IB5&+M>1Y)SdZ9Z|G2{dHz*pX+5q-AR{& zAohbk^ZBox1~pIvtf}gk28jcC(>?n|k(7PDlR+vg+ToKWMf0HAcb7OMV#5#dY7aSf zCG!JY4TNTAdeT30bd^4&hqtPEsK|&yoiCkCp?te^OS2zQTnNjv*n$m#OLUX1oqqV^ zXVU5m&y`Ap20~)5Y#_d(;Yvl`bG07L4=$>@--9AZ6MZsU0<;2$Y42qWq?0Be8?v=3 zlx#L{Ir~R*h2qh+VgY^NiGVlrqL4!r)H5y3C8ZN^X>)~d4f<$hJ9`lVga?J|?>UdA zD@*^W4-hEU>K8zkvN>2Doz(IP^wLvdrl?Dp+1yZ!efcV)p`{FNK`~xIL#ALZq3OhU zBDx467KvoI`ui&@r&9ODYL=m zGflIW+~Fb8#1Fom7VtQqP+KVmW|{Vj>PfkkZ(VNK+g-U{Zc{o&EZqK_Zf#)NdLbhN z!tssiDeLxy`wkpBsG#e?^CRx}dik5UAKvEGWutTOkKwNv_YZ%X7X$_3 zY(Y+Ma%r|5-mnhPLka%SL<0CdPv0@I_jm?42WGXK|CA&x#bXF|oc5PpFJa$EPi85K}o@1w)%Lz70UqUj>SA_!Q- 
zwt7BQ(fQ&&^Iw}S1MVQ?vRX|fMH^Pq7~N|I)Z#%YUkT9)5DY75Q~ZH;U8|OcxAber z$($YE2EOIU{$mFBWCC{u%<&IQ_>Kz{FsisX& z)Ak^1YMgN^GdM6V{{S)}p{uWd<7@73DCerbqwl##&&m|;GXtzqw)K0Kb%2K~{G#Ov zlpCA-NuPIgBl>`}t_yuHX0EBC<)1K4S#Jq|8=(s8T@Jkb$B5R440%l~)idq`{bN$6 zCiLz)MD;_FdoXzw9D9>{=SR*7*Js%EGQtpL&^pl@_>uBfz05_*4Kp`DNvM?S52q9@ zav|Vv5CVaBUPCxQ@3fFv(&ncedPB5L24smhzhASM>(RTul&InS-hOkFG@s4qffs)? zUF}N~ewCv$P1jRx0tGt0IR$_8g#$f(gVaT?EipsIV0P_Mz$FCUxpA-OYJF}USgmn; z_vO2YncQ{Z<@Np_fBTfE!3(kccM4Ih;ru{;Z4W^qoU})sg*W!2?gm1 z;(gMZccL^Km)`3;NjUPQ-v%!x3a5QGymbTmjNi%f%X|f=?glnn?`kk@>8rtP^ zwDo$v_Ennhrv@@Eud^0obpy>G*RVs6aG+KG)zOm-JT~3v|8qgj&&_KDrtmPj-L=*m zkVmSZdEzw$aPj#zmvI(k9c7=rdAj$GCdV8QbUOsxbaxZoGqVhKc`y;Su7EFZp8l5S zZCq6>yT|S%V@C6mk7{HJ~jJ=n;Tybm806Zt$^uKm%5`5^HR)03qMEwTV2^{;#l}{cnFB)|_DSY?)FK zU%l{@x!Ym6p%GGQv#Ft1B!gqfRN-{%@VdXs6$#Qnegv>xNV%ISgt@jpG()0^2*Ey` zNB_oA>@mLt{M`gA)i5`Gv>W}ahA^S;X7h)ZLs{=fBEmQRbyY?~9n3uCffmUx-q5sr zt`Je!B5z`Xe=>W+uA{$~r<+oRSnfU)g_9kuPyuB0Z3gC^7G@(b(BNDYTlv+e5tzlu zJEcwL4kx4SJ3`b|1V$9DB?1{J3pe@=HyW%5*U1 zZSkvwQb3edK2E=Z>7h&?ywE7V5rY>qXk}J6esP&%6?+%6={Uc6 z`B%@;BBuBBOtC_B0%GybTp)ZMj!M{Ts1!#i|JDo8J%wrs=FzOpU{ba z)x+H#^~);MbmM~RF=A6!bJl$WE194OU#l908{I}Ayp-!dNV8&QV*WQrOT^&-!r2B@ zFCNYq`Dl8-vn#v8ZAfRSci!9}F4n@UvWIZB-TGXWi+(r5|Hm{dDlpYIJUJ5&U&N)R zc7u$Jg@hBScIm5Yq;LD$dee22A^ILvH!0TKUN}}1n~YGh`tX5x?N+yarOjsf0pEyk zXx>LW-Rs89Z+rYu*zlA)I-5}C_?z{@FT9n*Y^i*wmdc2s4witlV4|wZoJeoRw93>0hdfIKV;iuf3 zHf~5M_OzE$$z4-eqvm(C`iV2?3;`GZxxBXTB8I!h8{ylfhH^1DU}cn%;aSKEq=!P5NJXbA-=9gwOA{g>SZF zBhDoIVv*kWJ^K&cE_B4VU=7mlfx~>W)=hEV&XY1i>dnht#tYBk>UES;95LD@Iap#} zw1pT7k9&6ikJz%;rXJq~WP0B8yLhLq1>N{Q@I*aLm#<@yc8JDWs4m|h7rJ!Q1vz?_ zVsa0*HEaQY>pO;fp4`9;#n69 zOvFH-+xE364oP_GM8Ans>+{)8oI)?%x#E)Odvz4+jVG?nv4cZQPh{CU$qyQ!U4WS9 zO^@7tl4zI^^qAs;ib2c}ddEVy?<$hcb+um_s_)B!z5#mwLbi$V8|7nSx!UJpmVMs- zhxY4P@{%xGfyek{&ty{C?>+4sCH-D!6nxYC&fQYumG5{`sjKh=M6tI|J+DA?oRPc<`9iE{W9V@A{Wj)!n z{r67__p3&hao%%=!3?*>@?wTxrv2Pxd;v}2MxeJV==H{xlrIsv@?v#14{K)ZF|@aH 
zOGZG=7ac-!pZ(-eo>@BLaUAabnt#rZ9Ub_o8`tVX7P4k%O64peRAJ;2<-Abw=EzWWf52;5YFD=qArKnxzBovpFU3{v4uuTGl#eVo>hY)#erZf9t zhVzm&;r1VFIV~(0lrWZj>UQ&|dcV9hbg7t5m9Y3AoSb zr=sZbOKJvV%K1)CiwDEHPBGJ`FqiQDU*EU&MuOdzWr1smg$rYz3v5g7r-6Ucigb6! zIHYRRQF(ks=)j=5b5*EP&eENSV6&iUh9Dwfe*l7TBT^WxH%H%Kd`+6*EoA^{25zBo z+RE`c-`U|w6L9$?P2}-)Rc<$!V9TDu3={7nNx4{Kki(SzT*Cg+udmZrtkf^k6-aHK z3}jF{?&+&T{>SDhEP|18kh31k+3Abt456y*7Ql!&hHp^uIzfvYEqkUTeRM&hr`P1O zb7I1kcf=`Eg~Gn(21o7#rMHVlyQK#}a1Qx>wtJ&FHvD)9H~NVGe%;3~UL3d(_Jzzj z^=B=rsq6x_;HMi06;a!>XeDR)IeOX?-^Hy%LgVJUpXeT7vYs z;C(t_2vq=X-ZHku4PYO>-E;e63>Lq?NG3S4JcL~xJ19!~4GIN5o--r_LAA2xO{aYh zr~H>(!mFAVT=2S$haob!(SUpQOZn)~eOogU*t1{)xo=j`FCQRyAara-(jZRCHYvBA z%}t&VcoSN1e{+3kQZ%ve<`DOQiDZsNM%K;`|0hg5pt^JlUP9?aym&>%GG@SaO4I4g z&V2tc1h9M(?fJeH5L%KoKnHasE(r|?DD*mYy^(a_hAYxP{zBWzq^yLMr}^UsY?*M< zlA%JbY`7$W`kJ3?Y_BMXL?02maK_#1w#3z3T@w~Z<7gXg4gF&q!&96{97!Z_7ng~r z#Ffo1EmAy)q#l_2;8`ORIO!Dg7s>zpoI#jyd44KQm)(@f8vH;2${ar zURZcDdq)r5kI8{jwPT9E<8`nSd|-`U_sdZzSl0qysI4)ul?$Wop^v{Y-`0kk@~5Cb zoSn0zgpJu)vj(NFy9i3wn5HSidOz&^=pc3#On?LI(zfw2v5Ug{zLB=4VB)mczqY2l z7AvZQN3Ugr=q@GfMWYQcYj26R_y2Hq@Ro4whE3e?J-grvMN35FV71bwy<;(!LfakK zST($;&LjO_1nkX&Tq1YHh5fv6Px*lAqoY)|alg7?vH8=G#>X0=y71ZvZ?gT2D!PcX z^g(?@(PCz6!&o_<320p|mDXE5tK#3{AFGPe^>rFiq+y^j>7f?u@rH!hK_xRLlFPX| zp48urSYJ@xp66-(`U3We-6nMRdE$f1WwLDgCz;bC_}uP9 z$fi%otd>X4*%1!p)wzF0WOO^+Ocx0Y>tu8bDF7p@0Vt+r>q;xVd0N|jlD2mpy3Pjn z4rca8yA}P_GEoWh>28XHZ{Hm-doB+OC9Y9QzP;a9$IcG_Uv!OebQX6(?7{;Qib|BA z++^H=A8C$6oc&jSqj3bw^o5c;UTlUjyX_}R=|-&^26WCxbf%96Lg>HO-(0?BWlSIh zl7UvM>;0DS+SCLC7jk2Ss5LDeOF!Y1?{cm%UIAOeotL1pgAWyS7(3^r2oAUHZ2SoX zF05NJGAALpq{m&i@%vLE8`5+Sdd`~+01dszQ6e-HjW5FN50?yfw(`AQ=J<0nKTkjN zk`Br9kC*5J5|{X{lAb?9}oaCw;X@l;Duy*I?^yO|!I%+G~K%tn;$=Tdv`-6k=1$F3!l zT>p4iWBRB80fbuIJ!Z7E`*mKDewi*!QN;~Be11I8r~ksjYyhfjpk#Dj41|+Xm=YT& zh8aR)2*3~LC4Rk2oGZy-l;_}jq)rvm>{vfjQ4H1raK8CRn zG?2|%>Oa@;UlAD`i(CEY`EW_;gX^5%upyXW@6qAV-`F;~OE>^N{D(G14;=CE-k>+NBx~7(qNeS^W^bx%4LlWePF`CnD6dpUL zdbyVp>e)l{BEwpTF6>2gikqoprF?C!>qkK7t4c}Bw*kvy-Z5#f2 
zXa2a&RpOQ7f9R&|%yh|3_poeMbtarPl6|}__kSj4C+nA;0BnrCPqPlNQ!7>b9(K=T zS#-VSp?f!NEvKZO=1*DRvt)|%%A3>PeY=n|$$I6^sXJ56KmVwC@9i$T*56LG^4jLF zlJ>jx&OW=hWSzh6>LBAw*NT_!J9FXB);oJI%`UqC?&L(CexMnhS7dxP?L2M1&+Fve zC3pND-#$C<=*_;`VE4&YdMB@`9R8nk@7o^#PYP$Bb=vH@(=QqLYfdv~N!Iz3qM0Vg zf=-{CvdhqPcF-m>(`HS{SecN0YNd83+f_DuR?dCm+Qv5j$-CCLDONA8{Fr(5=e$_W zwX!Qyle5g0TrrJ2z`jtMPpC9w@U1oVl{{E-d ko%yHCfhU5{80hHQGjLSB4s(@A>;hTk>FVdQ&MBb@0MQy3l>h($ literal 0 HcmV?d00001 diff --git a/docs/api.md b/docs/api.md new file mode 100644 index 00000000..f363d279 --- /dev/null +++ b/docs/api.md @@ -0,0 +1,26 @@ +# API Reference + + +## Storage + +```{eval-rst} +.. autoclass:: pangeo_forge.storage.Target + :members: +``` + +```{eval-rst} +.. autoclass:: pangeo_forge.storage.InputCache + :members: +``` + +## Recipes + +```{eval-rst} +.. autoclass:: pangeo_forge.recipe.BaseRecipe + :members: +``` + +```{eval-rst} +.. autoclass:: pangeo_forge.recipe.NetCDFtoZarrSequentialRecipe + :show-inheritance: +``` diff --git a/docs/conf.py b/docs/conf.py index 2dfe4699..a0e1904c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,8 +1,9 @@ -import sphinx_pangeo_theme # noqa +# import sphinx_pangeo_theme # noqa +import sphinx_book_theme # noqa # -- Project information ----------------------------------------------------- -project = "pangeo-forge" +project = "Pangeo Forge" copyright = "2020, Pangeo Community" author = "Pangeo Community" @@ -22,8 +23,10 @@ # -- Options for HTML output ------------------------------------------------- -html_theme = "pangeo" -html_static_path = ["_static"] -html_sidebars = {"index": [], "**": ["localtoc.html"]} +html_theme = "sphinx_book_theme" +html_logo = "_static/pangeo-forge-logo-blue.png" +# html_theme = "pangeo" +# html_static_path = ["_static"] +# html_sidebars = {"index": [], "**": ["localtoc.html"]} myst_heading_anchors = 2 diff --git a/docs/index.md b/docs/index.md index 2718b2bd..f35e3bfb 100644 --- a/docs/index.md +++ b/docs/index.md @@ -38,5 +38,6 @@ recipes execution bakeries 
contribute +api ``` diff --git a/docs/recipes.md b/docs/recipes.md index f053c335..a45789fc 100644 --- a/docs/recipes.md +++ b/docs/recipes.md @@ -1,15 +1,57 @@ # Recipes +A recipe defines how to transform data in one format / location into another format / location. +The primary way people contribute to Pangeo Forge is by writing / maintaining recipes. + +```{warning} +The Recipe API is still in flux and may change. Make sure the version of the documentation +you are reading matches your installed version of pangeo_forge. +``` + +## Storage + +Recipes need a place to store data. +The location where the final dataset produced by the recipe is stored is called the +``Target``. Pangeo Forge has a special class for this: {class}`pangeo_forge.storage.Target`. + +Creating a Target requires two arguments: +- The ``fs`` argument is an [fsspec](https://filesystem-spec.readthedocs.io/en/latest/) + filesystem. Fsspec supports many different types of storage via its + [built in](https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations) + and [third party](https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations) + implementations. +- The `path` argument specifies the specific path where the data should be stored. + +For example, creating a storage target for AWS S3 might look like this: +```{code-block} python +import s3fs +fs = s3fs.S3FileSystem(key="MY_AWS_KEY", secret="MY_AWS_SECRET") +target_path = "pangeo-forge-bucket/my-dataset-v1.zarr" +target = Target(fs=fs, path=target_path) +``` + +Temporary data can be stored in an {class}`pangeo_forge.storage.InputCache` object. +``InputCache`` is similar to ``Target``, but instead of specifying a ``path``, +you specify a ``prefix``. + + ## The Base Recipe Class -```{eval-rst} -.. autoclass:: pangeo_forge.recipe.BaseRecipe - :members: +A recipe is initialized from a recipe class. 
+```{code-block} python +recipe = Recipe(option1='foo', option2='bar') ``` +All recipes follow the same basic steps. + + + + + ## Specific Recipe Classes ```{eval-rst} .. autoclass:: pangeo_forge.recipe.NetCDFtoZarrSequentialRecipe :show-inheritance: + :noindex: ``` diff --git a/pangeo_forge/recipe.py b/pangeo_forge/recipe.py index e42fbb7d..9d07c79f 100644 --- a/pangeo_forge/recipe.py +++ b/pangeo_forge/recipe.py @@ -119,6 +119,7 @@ def to_pipelines(self) -> ParallelPipelines: class NetCDFtoZarrSequentialRecipe(BaseRecipe): """There are many inputs (a.k.a. files, granules), arranged in a sequence along the dimension `sequence_dim`. Each file may contain multiple variables. + This class uses Xarray to read and write data. :param input_urls: The inputs used to generate the dataset. :param sequence_dim: The dimension name along which the inputs will be concatenated. @@ -149,8 +150,6 @@ def __post_init__(self): @property def prepare(self) -> Callable: - """Prepare target for storing dataset.""" - def _prepare(): try: @@ -175,13 +174,6 @@ def _prepare(): @property def cache_input(self) -> Callable: - """Cache the input. - - Properties - ---------- - url : URL pointing to the input file. Must be openable by fsspec. - """ - def cache_func(fname: str) -> None: logger.info(f"Caching input '{fname}'") with input_opener(fname, mode="rb") as source: @@ -192,14 +184,6 @@ def cache_func(fname: str) -> None: @property def store_chunk(self) -> Callable: - """Store a chunk in the target. 
- - Parameters - ---------- - chunk_key : str - The identifier for the chunk - """ - def _store_chunk(chunk_key): ds_chunk = self.open_chunk(chunk_key) @@ -218,8 +202,6 @@ def drop_vars(ds): @property def finalize(self) -> Callable: - """Finalize writing of dataset.""" - def _finalize(): if self.consolidate_zarr: logger.info("Consolidating Zarr metadata") diff --git a/pangeo_forge/storage.py b/pangeo_forge/storage.py index 742bb9f2..acc5ad33 100644 --- a/pangeo_forge/storage.py +++ b/pangeo_forge/storage.py @@ -1,6 +1,7 @@ import os from contextlib import contextmanager from dataclasses import dataclass +from typing import BinaryIO, NoReturn import fsspec @@ -8,20 +9,16 @@ @dataclass class Target: """Representation of a storage target for Pangeo Forge. - Attributes - ---------- - fs : FileSystemSpec.AbtractFileSystem - The filesystem we are writing to. Should be instantiated outside this - class. - path : str - The path where the target data will be saved. + + :param fs: The filesystem object we are writing to. + :param path: The path where the target data will be saved. """ fs: fsspec.AbstractFileSystem path: str - def get_mapper(self): - # don't want to use this because we want to use a fancier Zarr FSStore + def get_mapper(self) -> fsspec.mapping.FSMap: + """Get a mutable mapping object suitable for storing Zarr data.""" return self.fs.get_mapper(self.path) @@ -34,13 +31,8 @@ class InputCache: """Representation of an intermediate storage location where remote files Can be cached locally. - Attributes - ---------- - fs : FileSystemSpec.AbtractFileSystem - The filesystem we are writing to. Should be instantiated outside this - class. - prefix : str - A path prepended to all paths. + :param fs: The filesystem we are writing to. + :param prefix: A path prepended to all paths. 
""" fs: fsspec.AbstractFileSystem @@ -49,14 +41,17 @@ class InputCache: def _full_path(self, path): return os.path.join(self.prefix, _hash_path(path)) - def exists(self, path): + def exists(self, path) -> bool: + """Check that the file is in the cache.""" return self.fs.exists(self._full_path(path)) - def rm(self, path): + def rm(self, path) -> NoReturn: + """Remove file from the cache.""" self.fs.rm(self._full_path(path)) @contextmanager - def open(self, path, **kwargs): + def open(self, path, **kwargs) -> BinaryIO: + """Open file with a context manager.""" with self.fs.open(self._full_path(path), **kwargs) as f: yield f diff --git a/setup.cfg b/setup.cfg index d2063367..6a8506d2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,7 +3,7 @@ max-line-length = 100 [isort] known_first_party=pangeo_forge -known_third_party=click,fsspec,numpy,pandas,pkg_resources,prefect,pytest,rechunker,setuptools,sphinx_pangeo_theme,xarray,zarr +known_third_party=click,fsspec,numpy,pandas,pkg_resources,prefect,pytest,rechunker,setuptools,sphinx_book_theme,xarray,zarr multi_line_output=3 include_trailing_comma=True force_grid_wrap=0 From fa20daf64b5ccbccd32371d0dd6755a58a1f7b55 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Thu, 21 Jan 2021 00:42:18 -0500 Subject: [PATCH 26/34] add tutorial to docs --- docs/conf.py | 4 +- docs/index.md | 1 + docs/recipe_tutorial.ipynb | 215 +++++++++++++++++++++++++++++++++++++ pangeo_forge/recipe.py | 11 +- 4 files changed, 226 insertions(+), 5 deletions(-) create mode 100644 docs/recipe_tutorial.ipynb diff --git a/docs/conf.py b/docs/conf.py index a0e1904c..35c8d26d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -11,14 +11,14 @@ # -- General configuration --------------------------------------------------- extensions = [ - "myst_parser", + "myst_nb", "sphinx.ext.autodoc", # "numpydoc", "sphinx_autodoc_typehints", ] templates_path = ["_templates"] -exclude_patterns = [] +exclude_patterns = ["_build", "**.ipynb_checkpoints"] master_doc = "index" # -- 
Options for HTML output ------------------------------------------------- diff --git a/docs/index.md b/docs/index.md index f35e3bfb..319199e4 100644 --- a/docs/index.md +++ b/docs/index.md @@ -35,6 +35,7 @@ For more information, see {doc}`bakeries`. :caption: Contents recipes +recipe_tutorial execution bakeries contribute diff --git a/docs/recipe_tutorial.ipynb b/docs/recipe_tutorial.ipynb new file mode 100644 index 00000000..16cc5820 --- /dev/null +++ b/docs/recipe_tutorial.ipynb @@ -0,0 +1,215 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Recipe Tutorial\n", + "\n", + "This tutorial describes how to create a recipe from scratch.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Get to know your source data\n", + "\n", + "If you are developing a new recipe, you are probably starting from an existing\n", + "dataset. The first step is to just get to know the dataset. For this tutorial,\n", + "our example will be the _NOAA Optimum Interpolation Sea Surface Temperature\n", + "(OISST) v2.1_. The authoritative website describing the data is\n", + ".\n", + "This website contains links to the actual data files on the\n", + "[data access](https://www.ncdc.noaa.gov/oisst/data-access) page. We will use the\n", + "_AVHRR-Only_ version of the data and follow the corresponding link to the\n", + "[Gridded netCDF Data](https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/).\n", + "Browsing through the directories, we can see that there is one file per day. 
The\n", + "very first day of the dataset is stored at the following URL:\n", + "\n", + "```text\n", + "https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810901.nc\n", + "```\n", + "\n", + "From this example, we can work out the pattern of the file naming conventions.\n", + "But first, let's just download one of the files and open it up.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! curl -O https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810901.nc " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import xarray as xr\n", + "\n", + "ds = xr.open_dataset(\"oisst-avhrr-v02r01.19810901.nc\")\n", + "ds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see there are four data variables, all with dimension\n", + "`(time, zlev, lat, lon)`. There is a _dimension coordinate_ for each dimension,\n", + "and no _non-dimension coordinates_. 
Each file in the sequence presumably has the\n", + "same `zlev`, `lat`, and `lon`, but we expect `time` to be different in each one.\n", + "\n", + "Let's also check the total size of the dataset in the file.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"File size is {ds.nbytes/1e6} MB\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The file size is important because it will help us define the _chunk size_\n", + "Pangeo Forge will use to build up the target dataset.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Pick a Recipe class\n", + "\n", + "For our first recipe, we will want to use a pre-defined Recipe class from Pangeo\n", + "Forge.\n", + "\n", + "By examining the {doc}`recipes` documentation page, we see that our scenario is\n", + "a good case for the {class}`pangeo_forge.recipe.NetCDFtoZarrSequentialRecipe`\n", + "class. Let's examine its documentation string in our notebook.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pangeo_forge.recipe import NetCDFtoZarrSequentialRecipe\n", + "NetCDFtoZarrSequentialRecipe?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Define Recipe parameters\n", + "\n", + "Our chosen class has only two required parameters: `input_urls` and\n", + "`sequence_dim`.\n", + "\n", + "`input_urls` is a list of URLs pointing to the data. To populate this, we need\n", + "to explicitly create this list based on what we know about the file naming\n", + "conventions. 
Let's look again at the first URL\n", + "\n", + "```text\n", + "https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810901.nc\n", + "```\n", + "\n", + "From this we deduce the following format string.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input_url_pattern = (\n", + " \"https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation\"\n", + " \"/v2.1/access/avhrr/{yyyymm}/oisst-avhrr-v02r01.{yyyymmdd}.nc\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we need a sequence of datetimes. Pandas is the easiest way to get this. At\n", + "the time of writing, the latest available data is from 2021-01-05.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "dates = pd.date_range(\"1981-09-01\", \"2021-01-05\", freq=\"D\")\n", + "input_urls = [\n", + " input_url_pattern.format(\n", + " yyyymm=day.strftime(\"%Y%m\"), yyyymmdd=day.strftime(\"%Y%m%d\")\n", + " )\n", + " for day in dates\n", + "]\n", + "print(f\"Found {len(input_urls)} files!\")\n", + "input_urls[-1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's a lot of files!\n", + "\n", + "The other remaining parameter is `sequence_dim`. It's just `\"time\"`. 
We can now\n", + "instantiate our recipe.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "recipe = NetCDFtoZarrSequentialRecipe(\n", + " input_urls=input_urls, sequence_dim=\"time\"\n", + ")\n", + "recipe" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/pangeo_forge/recipe.py b/pangeo_forge/recipe.py index 9d07c79f..b962ce1e 100644 --- a/pangeo_forge/recipe.py +++ b/pangeo_forge/recipe.py @@ -117,8 +117,13 @@ def to_pipelines(self) -> ParallelPipelines: @dataclass class NetCDFtoZarrSequentialRecipe(BaseRecipe): - """There are many inputs (a.k.a. files, granules), arranged in a sequence - along the dimension `sequence_dim`. Each file may contain multiple variables. + """This class represents a dataset composed of many individual NetCDF files. + The files are arranged in a sequence along a single dimension, called the + `sequence_dim`. Each file may contain multiple variables. + + The dataset is assembled by concatenating all of these files along `sequence_dim`. + The target is written in Zarr format. + This class uses Xarray to read and write data. :param input_urls: The inputs used to generate the dataset. @@ -133,7 +138,7 @@ class NetCDFtoZarrSequentialRecipe(BaseRecipe): the inputs to form a chunk. 
""" - input_urls: Iterable[str] + input_urls: Iterable[str] = field(repr=False) sequence_dim: str inputs_per_chunk: int = 1 nitems_per_input: int = 1 From 8d66eb948e7456333c94d8e70bb76231bfae814a Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Thu, 21 Jan 2021 11:47:06 -0500 Subject: [PATCH 27/34] refactored storage targets --- pangeo_forge/recipe.py | 31 +++++++++-------- pangeo_forge/storage.py | 75 ++++++++++++++++++++++++++++++----------- tests/conftest.py | 6 ++-- tests/test_fixtures.py | 2 +- 4 files changed, 76 insertions(+), 38 deletions(-) diff --git a/pangeo_forge/recipe.py b/pangeo_forge/recipe.py index b962ce1e..a2ed5aa3 100644 --- a/pangeo_forge/recipe.py +++ b/pangeo_forge/recipe.py @@ -13,7 +13,7 @@ import zarr from rechunker.types import MultiStagePipeline, ParallelPipelines, Stage -from .storage import InputCache, Target +from .storage import AbstractTarget from .utils import chunked_iterable, fix_scalar_attr_encoding logger = logging.getLogger(__name__) @@ -132,6 +132,9 @@ class NetCDFtoZarrSequentialRecipe(BaseRecipe): :param nitems_per_input: The length of each input along the `sequence_dim` dimension. :param target: A location in which to put the dataset. Can also be assigned at run time. :param input_cache: A location in which to cache temporary data. + :param require_cache: Whether to allow opening inputs directly which have not + yet been cached. This could lead to very slow behavior if the inputs + live on a slow network. :param consolidate_zarr: Whether to consolidate the resulting Zarr dataset. :param xarray_open_kwargs: Extra options for opening the inputs with Xarray. 
:param xarray_concat_kwargs: Extra options to pass to Xarray when concatenating @@ -142,8 +145,9 @@ class NetCDFtoZarrSequentialRecipe(BaseRecipe): sequence_dim: str inputs_per_chunk: int = 1 nitems_per_input: int = 1 - target: Optional[Target] = None - input_cache: Optional[InputCache] = None + target: Optional[AbstractTarget] = None + input_cache: Optional[AbstractTarget] = None + require_cache: bool = True consolidate_zarr: bool = True xarray_open_kwargs: dict = field(default_factory=dict) xarray_concat_kwargs: dict = field(default_factory=dict) @@ -217,19 +221,18 @@ def _finalize(): @contextmanager def input_opener(self, fname: str): - if self.input_cache is None: - logger.info(f"No cache. Opening input `{fname}` directly.") - # This will bypass the cache. May be slow. - with input_opener(fname, mode="rb") as f: - yield f - elif self.input_cache.exists(fname): - logger.info(f"Input '{fname}' found in cache") + try: with self.input_cache.open(fname, mode="rb") as f: + logger.info(f"Opening '{fname}' from cache") yield f - else: - raise ValueError( - f"Input '{fname}' has not been cached yet. " "Call .cache_input() first." - ) + except IOError: # TODO figure out the excpetion to catch + if self.require_cache: + raise + else: + logger.info(f"No cache found. Opening input `{fname}` directly.") + # This will bypass the cache. May be slow. + with input_opener(fname, mode="rb") as f: + yield f def open_input(self, fname: str): with self.input_opener(fname) as f: diff --git a/pangeo_forge/storage.py b/pangeo_forge/storage.py index acc5ad33..87fce491 100644 --- a/pangeo_forge/storage.py +++ b/pangeo_forge/storage.py @@ -1,4 +1,7 @@ import os +import re +import unicodedata +from abc import ABC, abstractmethod from contextlib import contextmanager from dataclasses import dataclass from typing import BinaryIO, NoReturn @@ -6,20 +9,25 @@ import fsspec -@dataclass -class Target: - """Representation of a storage target for Pangeo Forge. 
+class AbstractTarget(ABC): + @abstractmethod + def get_mapper(self): + pass - :param fs: The filesystem object we are writing to. - :param path: The path where the target data will be saved. - """ + @abstractmethod + def exists(self, path) -> bool: + """Check that the file exists.""" + pass - fs: fsspec.AbstractFileSystem - path: str + @abstractmethod + def rm(self, path) -> NoReturn: + """Remove file.""" + pass - def get_mapper(self) -> fsspec.mapping.FSMap: - """Get a mutable mapping object suitable for storing Zarr data.""" - return self.fs.get_mapper(self.path) + @contextmanager + def open(self, path, **kwargs) -> BinaryIO: + """Open file with a context manager.""" + pass def _hash_path(path: str) -> str: @@ -27,19 +35,22 @@ def _hash_path(path: str) -> str: @dataclass -class InputCache: - """Representation of an intermediate storage location where remote files - Can be cached locally. +class FSSpecTarget(AbstractTarget): + """Representation of a storage target for Pangeo Forge. - :param fs: The filesystem we are writing to. - :param prefix: A path prepended to all paths. + :param fs: The filesystem object we are writing to. + :param root_path: The path under which the target data will be stored. 
""" fs: fsspec.AbstractFileSystem - prefix: str = "" + root_path: str = "" + + def get_mapper(self) -> fsspec.mapping.FSMap: + """Get a mutable mapping object suitable for storing Zarr data.""" + return self.fs.get_mapper(self.root_path) def _full_path(self, path): - return os.path.join(self.prefix, _hash_path(path)) + return os.path.join(self.root_path, path) def exists(self, path) -> bool: """Check that the file is in the cache.""" @@ -56,5 +67,29 @@ def open(self, path, **kwargs) -> BinaryIO: yield f def __post_init__(self): - if not self.fs.isdir(self.prefix): - self.fs.mkdir(self.prefix) + if not self.fs.isdir(self.root_path): + self.fs.mkdir(self.root_path) + + +class FlatFSSpecTarget(FSSpecTarget): + """A target that sanitizes all the path names so that everthing is stored + in a single directory. + + Designed to be used as a cache for inputs. + """ + + def _full_path(self, path): + slug = _slugify(path) + prefix = prefix = hex(hash(path))[2:10] + new_path = "-".join([prefix, slug]) + return os.path.join(self.root_path, new_path) + + +def _slugify(value): + # Adopted from + # https://github.com/django/django/blob/master/django/utils/text.py + # https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename + value = str(value) + value = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii") + value = re.sub(r"[^\w\s-]+", "_", value.lower()) + return re.sub(r"[-\s]+", "-", value).strip("-_") diff --git a/tests/conftest.py b/tests/conftest.py index 6386481a..1525eb57 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,7 +8,7 @@ import xarray as xr from pangeo_forge import recipe -from pangeo_forge.storage import InputCache, Target +from pangeo_forge.storage import FlatFSSpecTarget, FSSpecTarget # where to run the http server _PORT = "8080" @@ -80,14 +80,14 @@ def tmp_target(tmpdir_factory): fs = fsspec.get_filesystem_class("file")() path = str(tmpdir_factory.mktemp("target")) - return Target(fs, path) + return 
FSSpecTarget(fs, path) @pytest.fixture() def tmp_cache(tmpdir_factory): path = str(tmpdir_factory.mktemp("cache")) fs = fsspec.get_filesystem_class("file")() - cache = InputCache(fs, prefix=path) + cache = FlatFSSpecTarget(fs, path) return cache diff --git a/tests/test_fixtures.py b/tests/test_fixtures.py index c867f17b..4e5e34fa 100644 --- a/tests/test_fixtures.py +++ b/tests/test_fixtures.py @@ -24,7 +24,7 @@ def test_fixture_http_files(daily_xarray_dataset, netcdf_http_server): def test_target(tmp_target): mapper = tmp_target.get_mapper() mapper["foo"] = b"bar" - with open(tmp_target.path + "/foo") as f: + with open(tmp_target.root_path + "/foo") as f: res = f.read() assert res == "bar" From 86f6f929d79a659acbee4993b9afe9641660108e Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Thu, 21 Jan 2021 12:10:08 -0500 Subject: [PATCH 28/34] better target testing --- .pre-commit-config.yaml | 6 +----- pangeo_forge/recipe.py | 4 ++-- pangeo_forge/storage.py | 26 ++++++++++++++++++++++++++ tests/conftest.py | 7 ++++++- tests/test_fixtures.py | 20 ++++++++++++++++++++ 5 files changed, 55 insertions(+), 8 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 067b22f0..65a3907e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,16 +26,12 @@ repos: rev: v2.2.0 hooks: - id: seed-isort-config + - repo: https://github.com/pre-commit/mirrors-isort rev: v5.2.0 hooks: - id: isort -- repo: https://github.com/deathbeds/prenotebook - rev: f5bdb72a400f1a56fe88109936c83aa12cc349fa - hooks: - - id: prenotebook - - repo: https://github.com/myint/rstcheck rev: master hooks: diff --git a/pangeo_forge/recipe.py b/pangeo_forge/recipe.py index a2ed5aa3..c22c12fa 100644 --- a/pangeo_forge/recipe.py +++ b/pangeo_forge/recipe.py @@ -165,7 +165,7 @@ def _prepare(): ds = self.open_target() logger.info("Found an existing dataset in target") logger.debug(f"{ds}") - except (IOError, zarr.errors.GroupNotFoundError): + except (FileNotFoundError, 
IOError, zarr.errors.GroupNotFoundError): first_chunk_key = next(self.iter_chunks()) for input_url in self.inputs_for_chunk(first_chunk_key): self.cache_input(input_url) @@ -225,7 +225,7 @@ def input_opener(self, fname: str): with self.input_cache.open(fname, mode="rb") as f: logger.info(f"Opening '{fname}' from cache") yield f - except IOError: # TODO figure out the excpetion to catch + except (IOError, FileNotFoundError): if self.require_cache: raise else: diff --git a/pangeo_forge/storage.py b/pangeo_forge/storage.py index 87fce491..44fc8e15 100644 --- a/pangeo_forge/storage.py +++ b/pangeo_forge/storage.py @@ -93,3 +93,29 @@ def _slugify(value): value = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii") value = re.sub(r"[^\w\s-]+", "_", value.lower()) return re.sub(r"[-\s]+", "-", value).strip("-_") + + +class UninitializedTarget(AbstractTarget): + def get_mapper(self): + raise UninitializedTargetError + + def exists(self, path) -> bool: + raise UninitializedTargetError + + def rm(self, path) -> NoReturn: + raise UninitializedTargetError + + def open(self, path, **kwargs) -> BinaryIO: + raise UninitializedTargetError + + +class TargetError(Exception): + """Base class for exceptions in this module.""" + + pass + + +class UninitializedTargetError(TargetError): + """Operation on an uninitialized Target.""" + + pass diff --git a/tests/conftest.py b/tests/conftest.py index 1525eb57..37b4eaa4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,7 +8,7 @@ import xarray as xr from pangeo_forge import recipe -from pangeo_forge.storage import FlatFSSpecTarget, FSSpecTarget +from pangeo_forge.storage import FlatFSSpecTarget, FSSpecTarget, UninitializedTarget # where to run the http server _PORT = "8080" @@ -91,6 +91,11 @@ def tmp_cache(tmpdir_factory): return cache +@pytest.fixture() +def uninitialized_target(): + return UninitializedTarget() + + @pytest.fixture def netCDFtoZarr_sequential_recipe(daily_xarray_dataset, 
netcdf_local_paths, tmp_target, tmp_cache): r = recipe.NetCDFtoZarrSequentialRecipe( diff --git a/tests/test_fixtures.py b/tests/test_fixtures.py index 4e5e34fa..cfc64fd1 100644 --- a/tests/test_fixtures.py +++ b/tests/test_fixtures.py @@ -1,6 +1,8 @@ import fsspec +import pytest import xarray as xr +from pangeo_forge.storage import UninitializedTargetError from pangeo_forge.utils import fix_scalar_attr_encoding @@ -27,6 +29,24 @@ def test_target(tmp_target): with open(tmp_target.root_path + "/foo") as f: res = f.read() assert res == "bar" + with pytest.raises(FileNotFoundError): + tmp_target.rm("baz") + with pytest.raises(FileNotFoundError): + with tmp_target.open("baz"): + pass + + +def test_uninitialized_target(uninitialized_target): + target = uninitialized_target + with pytest.raises(UninitializedTargetError): + target.get_mapper() + with pytest.raises(UninitializedTargetError): + target.exists("foo") + with pytest.raises(UninitializedTargetError): + target.rm("foo") + with pytest.raises(UninitializedTargetError): + with target.open("foo"): + pass def test_cache(tmp_cache): From 049e692cde62ddfb4cde1dcf5261f615806f76f3 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Thu, 21 Jan 2021 14:36:15 -0500 Subject: [PATCH 29/34] change cannonical recipe execution order --- pangeo_forge/recipe.py | 43 ++++++++++++++++++++++++----------------- pangeo_forge/storage.py | 6 ++++++ tests/conftest.py | 4 ++-- tests/test_recipe.py | 21 ++++++++++++++++++-- 4 files changed, 52 insertions(+), 22 deletions(-) diff --git a/pangeo_forge/recipe.py b/pangeo_forge/recipe.py index c22c12fa..ebe82dda 100644 --- a/pangeo_forge/recipe.py +++ b/pangeo_forge/recipe.py @@ -13,7 +13,7 @@ import zarr from rechunker.types import MultiStagePipeline, ParallelPipelines, Stage -from .storage import AbstractTarget +from .storage import AbstractTarget, UninitializedTarget from .utils import chunked_iterable, fix_scalar_attr_encoding logger = logging.getLogger(__name__) @@ -23,12 +23,12 @@ # t = 
PangeoForgeTarget() # r = MyRecipe(target=t, **opts) # 1 # # manual execution of recipe -# r.prepare() # 3 # for input_key in r.iter_inputs(): # r.cache_input(input_key) # 4 +# r.prepare_target() # 3 # for chunk_key in r.iter_chunks(): # r.store_chunk(chunk_key) # 5 -# r.finalize() # 6 +# r.finalize_target() # 6 # 1) Initialize the Recipe object @@ -56,7 +56,7 @@ class BaseRecipe(ABC): @property @abstractmethod - def prepare(self) -> Callable[[], NoReturn]: + def prepare_target(self) -> Callable[[], NoReturn]: """Prepare the recipe for execution by initializing the target. Attribute that returns a callable function. """ @@ -90,7 +90,7 @@ def store_chunk(self) -> Callable[[Hashable], NoReturn]: @property @abstractmethod - def finalize(self) -> Callable[[], NoReturn]: + def finalize_target(self) -> Callable[[], NoReturn]: """Final step to finish the recipe after data has been written. Attribute that returns a callable function. """ @@ -101,10 +101,10 @@ def to_pipelines(self) -> ParallelPipelines: """ pipeline = [] # type: MultiStagePipeline - pipeline.append(Stage(self.prepare)) pipeline.append(Stage(self.cache_input, list(self.iter_inputs()))) + pipeline.append(Stage(self.prepare_target)) pipeline.append(Stage(self.store_chunk, list(self.iter_chunks()))) - pipeline.append(Stage(self.finalize)) + pipeline.append(Stage(self.finalize_target)) pipelines = [] # type: ParallelPipelines pipelines.append(pipeline) return pipelines @@ -145,8 +145,8 @@ class NetCDFtoZarrSequentialRecipe(BaseRecipe): sequence_dim: str inputs_per_chunk: int = 1 nitems_per_input: int = 1 - target: Optional[AbstractTarget] = None - input_cache: Optional[AbstractTarget] = None + target: Optional[AbstractTarget] = field(default_factory=UninitializedTarget) + input_cache: Optional[AbstractTarget] = field(default_factory=UninitializedTarget) require_cache: bool = True consolidate_zarr: bool = True xarray_open_kwargs: dict = field(default_factory=dict) @@ -158,8 +158,8 @@ def __post_init__(self): } 
@property - def prepare(self) -> Callable: - def _prepare(): + def prepare_target(self) -> Callable: + def _prepare_target(): try: ds = self.open_target() @@ -167,8 +167,8 @@ def _prepare(): logger.debug(f"{ds}") except (FileNotFoundError, IOError, zarr.errors.GroupNotFoundError): first_chunk_key = next(self.iter_chunks()) - for input_url in self.inputs_for_chunk(first_chunk_key): - self.cache_input(input_url) + # for input_url in self.inputs_for_chunk(first_chunk_key): + # self.cache_input(input_url) ds = self.open_chunk(first_chunk_key).chunk() # make sure the concat dim has a valid fill_value to avoid @@ -179,7 +179,7 @@ def _prepare(): self.expand_target_dim(self.sequence_dim, self.sequence_len()) - return _prepare + return _prepare_target @property def cache_input(self) -> Callable: @@ -210,14 +210,14 @@ def drop_vars(ds): return _store_chunk @property - def finalize(self) -> Callable: - def _finalize(): + def finalize_target(self) -> Callable: + def _finalize_target(): if self.consolidate_zarr: logger.info("Consolidating Zarr metadata") target_mapper = self.target.get_mapper() zarr.consolidate_metadata(target_mapper) - return _finalize + return _finalize_target @contextmanager def input_opener(self, fname: str): @@ -227,7 +227,11 @@ def input_opener(self, fname: str): yield f except (IOError, FileNotFoundError): if self.require_cache: - raise + raise FileNotFoundError( + f"You are trying to open input {fname}, but the file is " + "not cached yet. First call `cache_input` or set " + "`require_cache=False`." + ) else: logger.info(f"No cache found. Opening input `{fname}` directly.") # This will bypass the cache. May be slow. @@ -249,7 +253,10 @@ def open_chunk(self, chunk_key): inputs = self.inputs_for_chunk(chunk_key) dsets = [self.open_input(i) for i in inputs] # CONCAT DELETES ENCODING!!! + # OR NO IT DOESN'T! Not in the latest version of xarray? 
ds = xr.concat(dsets, self.sequence_dim, **self.xarray_concat_kwargs) + for var in ds.variables: + ds[var].encoding = {} logger.debug(f"{ds}") # TODO: maybe do some chunking here? diff --git a/pangeo_forge/storage.py b/pangeo_forge/storage.py index 44fc8e15..43b11a2d 100644 --- a/pangeo_forge/storage.py +++ b/pangeo_forge/storage.py @@ -85,6 +85,12 @@ def _full_path(self, path): return os.path.join(self.root_path, new_path) +class CacheFSSpecTarget(FlatFSSpecTarget): + """Alias for FlatFSSpecTarget""" + + pass + + def _slugify(value): # Adopted from # https://github.com/django/django/blob/master/django/utils/text.py diff --git a/tests/conftest.py b/tests/conftest.py index 37b4eaa4..af505ade 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,7 +8,7 @@ import xarray as xr from pangeo_forge import recipe -from pangeo_forge.storage import FlatFSSpecTarget, FSSpecTarget, UninitializedTarget +from pangeo_forge.storage import CacheFSSpecTarget, FSSpecTarget, UninitializedTarget # where to run the http server _PORT = "8080" @@ -87,7 +87,7 @@ def tmp_target(tmpdir_factory): def tmp_cache(tmpdir_factory): path = str(tmpdir_factory.mktemp("cache")) fs = fsspec.get_filesystem_class("file")() - cache = FlatFSSpecTarget(fs, path) + cache = CacheFSSpecTarget(fs, path) return cache diff --git a/tests/test_recipe.py b/tests/test_recipe.py index 2a6a412d..05fc2ae5 100644 --- a/tests/test_recipe.py +++ b/tests/test_recipe.py @@ -2,6 +2,7 @@ import xarray as xr from pangeo_forge import recipe +from pangeo_forge.storage import UninitializedTargetError dummy_fnames = ["a.nc", "b.nc", "c.nc"] @@ -50,13 +51,29 @@ def test_NetCDFtoZarrSequentialRecipe( ) # this is the cannonical way to manually execute a recipe - r.prepare() for input_key in r.iter_inputs(): r.cache_input(input_key) + r.prepare_target() for chunk_key in r.iter_chunks(): r.store_chunk(chunk_key) - r.finalize() + r.finalize_target() ds_target = xr.open_zarr(tmp_target.get_mapper(), consolidated=True).load() 
ds_expected = daily_xarray_dataset.compute() assert ds_target.identical(ds_expected) + + +def test_NetCDFtoZarrSequentialRecipeNoTarget( + daily_xarray_dataset, netcdf_local_paths, tmp_target, tmp_cache +): + + r = recipe.NetCDFtoZarrSequentialRecipe( + input_urls=netcdf_local_paths, + sequence_dim="time", + inputs_per_chunk=1, + nitems_per_input=daily_xarray_dataset.attrs["items_per_file"], + ) + + # this is the cannonical way to manually execute a recipe + with pytest.raises(UninitializedTargetError): + r.cache_input(next(r.iter_inputs())) From 49519ef2a5fcb7b1dfe08b7ed61ed66a56917ff3 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Thu, 21 Jan 2021 15:38:12 -0500 Subject: [PATCH 30/34] big update --- docs/api.md | 4 +- docs/conf.py | 12 +- docs/recipe_tutorial.ipynb | 3458 +++++++++++++++++++++++++++++++++++- docs/recipes.md | 14 +- pangeo_forge/recipe.py | 12 +- 5 files changed, 3467 insertions(+), 33 deletions(-) diff --git a/docs/api.md b/docs/api.md index f363d279..667283b2 100644 --- a/docs/api.md +++ b/docs/api.md @@ -4,12 +4,12 @@ ## Storage ```{eval-rst} -.. autoclass:: pangeo_forge.storage.Target +.. autoclass:: pangeo_forge.storage.FSSpecTarget :members: ``` ```{eval-rst} -.. autoclass:: pangeo_forge.storage.InputCache +.. 
autoclass:: pangeo_forge.storage.CacheFSSpecTarget :members: ``` diff --git a/docs/conf.py b/docs/conf.py index 35c8d26d..8ff69e2c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,18 +15,22 @@ "sphinx.ext.autodoc", # "numpydoc", "sphinx_autodoc_typehints", + "sphinx_copybutton", ] templates_path = ["_templates"] exclude_patterns = ["_build", "**.ipynb_checkpoints"] master_doc = "index" +# we always have to manually run the notebooks because they are slow / expensive +jupyter_execute_notebooks = "off" + # -- Options for HTML output ------------------------------------------------- html_theme = "sphinx_book_theme" html_logo = "_static/pangeo-forge-logo-blue.png" -# html_theme = "pangeo" -# html_static_path = ["_static"] -# html_sidebars = {"index": [], "**": ["localtoc.html"]} - +html_static_path = ["_static"] myst_heading_anchors = 2 +html_css_files = [ + "custom.css", +] diff --git a/docs/recipe_tutorial.ipynb b/docs/recipe_tutorial.ipynb index 16cc5820..3e7b2cfe 100644 --- a/docs/recipe_tutorial.ipynb +++ b/docs/recipe_tutorial.ipynb @@ -46,9 +46,474 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:  (lat: 720, lon: 1440, time: 1, zlev: 1)\n",
+       "Coordinates:\n",
+       "  * lat      (lat) float32 -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88\n",
+       "  * lon      (lon) float32 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9\n",
+       "  * time     (time) datetime64[ns] 1981-09-01T12:00:00\n",
+       "  * zlev     (zlev) float32 0.0\n",
+       "Data variables:\n",
+       "    anom     (time, zlev, lat, lon) float32 ...\n",
+       "    err      (time, zlev, lat, lon) float32 ...\n",
+       "    ice      (time, zlev, lat, lon) float32 ...\n",
+       "    sst      (time, zlev, lat, lon) float32 ...\n",
+       "Attributes:\n",
+       "    title:                      NOAA/NCEI 1/4 Degree Daily Optimum Interpolat...\n",
+       "    source:                     ICOADS, NCEP_GTS, GSFC_ICE, NCEP_ICE, Pathfin...\n",
+       "    id:                         oisst-avhrr-v02r01.19810901.nc\n",
+       "    naming_authority:           gov.noaa.ncei\n",
+       "    summary:                    NOAAs 1/4-degree Daily Optimum Interpolation ...\n",
+       "    cdm_data_type:              Grid\n",
+       "    history:                    Final file created using preliminary as first...\n",
+       "    date_modified:              2020-05-08T19:05:13Z\n",
+       "    date_created:               2020-05-08T19:05:13Z\n",
+       "    product_version:            Version v02r01\n",
+       "    processing_level:           NOAA Level 4\n",
+       "    institution:                NOAA/National Centers for Environmental Infor...\n",
+       "    creator_url:                https://www.ncei.noaa.gov/\n",
+       "    creator_email:              oisst-help@noaa.gov\n",
+       "    keywords:                   Earth Science > Oceans > Ocean Temperature > ...\n",
+       "    keywords_vocabulary:        Global Change Master Directory (GCMD) Earth S...\n",
+       "    platform:                   Ships, buoys, Argo floats, MetOp-A, MetOp-B\n",
+       "    platform_vocabulary:        Global Change Master Directory (GCMD) Platfor...\n",
+       "    instrument:                 Earth Remote Sensing Instruments > Passive Re...\n",
+       "    instrument_vocabulary:      Global Change Master Directory (GCMD) Instrum...\n",
+       "    standard_name_vocabulary:   CF Standard Name Table (v40, 25 January 2017)\n",
+       "    geospatial_lat_min:         -90.0\n",
+       "    geospatial_lat_max:         90.0\n",
+       "    geospatial_lon_min:         0.0\n",
+       "    geospatial_lon_max:         360.0\n",
+       "    geospatial_lat_units:       degrees_north\n",
+       "    geospatial_lat_resolution:  0.25\n",
+       "    geospatial_lon_units:       degrees_east\n",
+       "    geospatial_lon_resolution:  0.25\n",
+       "    time_coverage_start:        1981-09-01T00:00:00Z\n",
+       "    time_coverage_end:          1981-09-01T23:59:59Z\n",
+       "    metadata_link:              https://doi.org/10.25921/RE9P-PT57\n",
+       "    ncei_template_version:      NCEI_NetCDF_Grid_Template_v2.0\n",
+       "    comment:                    Data was converted from NetCDF-3 to NetCDF-4 ...\n",
+       "    sensor:                     Thermometer, AVHRR\n",
+       "    Conventions:                CF-1.6, ACDD-1.3\n",
+       "    references:                 Reynolds, et al.(2007) Daily High-Resolution-...
" + ], + "text/plain": [ + "\n", + "Dimensions: (lat: 720, lon: 1440, time: 1, zlev: 1)\n", + "Coordinates:\n", + " * lat (lat) float32 -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88\n", + " * lon (lon) float32 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9\n", + " * time (time) datetime64[ns] 1981-09-01T12:00:00\n", + " * zlev (zlev) float32 0.0\n", + "Data variables:\n", + " anom (time, zlev, lat, lon) float32 ...\n", + " err (time, zlev, lat, lon) float32 ...\n", + " ice (time, zlev, lat, lon) float32 ...\n", + " sst (time, zlev, lat, lon) float32 ...\n", + "Attributes:\n", + " title: NOAA/NCEI 1/4 Degree Daily Optimum Interpolat...\n", + " source: ICOADS, NCEP_GTS, GSFC_ICE, NCEP_ICE, Pathfin...\n", + " id: oisst-avhrr-v02r01.19810901.nc\n", + " naming_authority: gov.noaa.ncei\n", + " summary: NOAAs 1/4-degree Daily Optimum Interpolation ...\n", + " cdm_data_type: Grid\n", + " history: Final file created using preliminary as first...\n", + " date_modified: 2020-05-08T19:05:13Z\n", + " date_created: 2020-05-08T19:05:13Z\n", + " product_version: Version v02r01\n", + " processing_level: NOAA Level 4\n", + " institution: NOAA/National Centers for Environmental Infor...\n", + " creator_url: https://www.ncei.noaa.gov/\n", + " creator_email: oisst-help@noaa.gov\n", + " keywords: Earth Science > Oceans > Ocean Temperature > ...\n", + " keywords_vocabulary: Global Change Master Directory (GCMD) Earth S...\n", + " platform: Ships, buoys, Argo floats, MetOp-A, MetOp-B\n", + " platform_vocabulary: Global Change Master Directory (GCMD) Platfor...\n", + " instrument: Earth Remote Sensing Instruments > Passive Re...\n", + " instrument_vocabulary: Global Change Master Directory (GCMD) Instrum...\n", + " standard_name_vocabulary: CF Standard Name Table (v40, 25 January 2017)\n", + " geospatial_lat_min: -90.0\n", + " geospatial_lat_max: 90.0\n", + " geospatial_lon_min: 0.0\n", + " geospatial_lon_max: 360.0\n", + " geospatial_lat_units: degrees_north\n", + " 
geospatial_lat_resolution: 0.25\n", + " geospatial_lon_units: degrees_east\n", + " geospatial_lon_resolution: 0.25\n", + " time_coverage_start: 1981-09-01T00:00:00Z\n", + " time_coverage_end: 1981-09-01T23:59:59Z\n", + " metadata_link: https://doi.org/10.25921/RE9P-PT57\n", + " ncei_template_version: NCEI_NetCDF_Grid_Template_v2.0\n", + " comment: Data was converted from NetCDF-3 to NetCDF-4 ...\n", + " sensor: Thermometer, AVHRR\n", + " Conventions: CF-1.6, ACDD-1.3\n", + " references: Reynolds, et al.(2007) Daily High-Resolution-..." + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import xarray as xr\n", "\n", @@ -70,9 +535,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File size is 16.597452 MB\n" + ] + } + ], "source": [ "print(f\"File size is {ds.nbytes/1e6} MB\")" ] @@ -101,9 +574,60 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[0;31mInit signature:\u001b[0m\n", + "\u001b[0mNetCDFtoZarrSequentialRecipe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0minput_urls\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mIterable\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0msequence_dim\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0minputs_per_chunk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mnitems_per_input\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mUnion\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mpangeo_forge\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstorage\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mAbstractTarget\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mNoneType\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m<\u001b[0m\u001b[0mfactory\u001b[0m\u001b[0;34m>\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0minput_cache\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mUnion\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mpangeo_forge\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstorage\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mAbstractTarget\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mNoneType\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m<\u001b[0m\u001b[0mfactory\u001b[0m\u001b[0;34m>\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mrequire_cache\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mconsolidate_zarr\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mxarray_open_kwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mdict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m<\u001b[0m\u001b[0mfactory\u001b[0m\u001b[0;34m>\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mxarray_concat_kwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mdict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m<\u001b[0m\u001b[0mfactory\u001b[0m\u001b[0;34m>\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdelete_input_encoding\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m 
\u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mDocstring:\u001b[0m \n", + "This class represents a dataset composed of many individual NetCDF files.\n", + "The files are arraged in a sequence along a single dimension, called the\n", + "`sequence_dim`. Each file may contain multiple variables.\n", + "\n", + "The dataset is assembled by concatenating all of these files along `sequence_dim`.\n", + "The target is written in Zarr format.\n", + "\n", + "This class uses Xarray to read and write data.\n", + "\n", + ":param input_urls: The inputs used to generate the dataset.\n", + ":param sequence_dim: The dimension name along which the inputs will be concatenated.\n", + ":param inputs_per_chunk: The number of inputs to use in each chunk.\n", + ":param nitems_per_input: The length of each input along the `sequence_dim` dimension.\n", + ":param target: A location in which to put the dataset. Can also be assigned at run time.\n", + ":param input_cache: A location in which to cache temporary data.\n", + ":param require_cache: Whether to allow opening inputs directly which have not\n", + " yet been cached. 
This could lead to very slow behavior if the inputs\n", + " live on a slow network.\n", + ":param consolidate_zarr: Whether to consolidate the resulting Zarr dataset.\n", + ":param xarray_open_kwargs: Extra options for opening the inputs with Xarray.\n", + ":param xarray_concat_kwargs: Extra options to pass to Xarray when concatenating\n", + " the inputs to form a chunk.\n", + ":param delete_input_encoding: Whether to remove Xarray encoding from variables\n", + " in the input dataset\n", + "\u001b[0;31mFile:\u001b[0m ~/pangeo-forge/pangeo-forge/pangeo_forge/recipe.py\n", + "\u001b[0;31mType:\u001b[0m ABCMeta\n", + "\u001b[0;31mSubclasses:\u001b[0m \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "from pangeo_forge.recipe import NetCDFtoZarrSequentialRecipe\n", "NetCDFtoZarrSequentialRecipe?" @@ -131,7 +655,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -151,9 +675,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 14372 files!\n" + ] + }, + { + "data": { + "text/plain": [ + "'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/202101/oisst-avhrr-v02r01.20210105.nc'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import pandas as pd\n", "\n", @@ -174,21 +716,2909 @@ "source": [ "That's a lot of files!\n", "\n", - "The other remaining parameter is `sequence_dim`. It's just `\"time\"`. We can now\n", - "instantiate our recipe.\n" + "The other remaining parameter is `sequence_dim`. It's just `\"time\"`. This is enough to initialize the recipe." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "NetCDFtoZarrSequentialRecipe(sequence_dim='time', inputs_per_chunk=1, nitems_per_input=1, target=, input_cache=, require_cache=True, consolidate_zarr=True, xarray_open_kwargs={}, xarray_concat_kwargs={}, delete_input_encoding=True)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "recipe = NetCDFtoZarrSequentialRecipe(\n", - " input_urls=input_urls, sequence_dim=\"time\"\n", + " input_urls=input_urls,\n", + " sequence_dim=\"time\"\n", ")\n", "recipe" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, now let's think about the Zarr chunks that this recipe will produce.\n", + "Each target chunk corresponds to one input. So each variable chunk will only be a few MB.\n", + "That is too small. Let's increase `inputs_per_chunk` to 20.\n", + "This means that we will need to be able to hold 20 files like the one we examined above in memory at once.\n", + "That's `16MB * 20 = 320MB`. Not a problem!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "NetCDFtoZarrSequentialRecipe(sequence_dim='time', inputs_per_chunk=20, nitems_per_input=1, target=, input_cache=, require_cache=True, consolidate_zarr=True, xarray_open_kwargs={}, xarray_concat_kwargs={}, delete_input_encoding=True)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "recipe = NetCDFtoZarrSequentialRecipe(\n", + " input_urls=input_urls,\n", + " sequence_dim=\"time\",\n", + " inputs_per_chunk=20\n", + ")\n", + "recipe" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Play with the recipe\n", + "\n", + "Now we will just explore our recipe a bit to check whether things make sense.\n", + "\n", + "We can see how many inputs the recipe has like this:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "14372" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_inputs = list(recipe.iter_inputs())\n", + "len(all_inputs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And how many chunks:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "719" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_chunks = list(recipe.iter_chunks())\n", + "len(all_chunks)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And we can see their dependencies as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "first chunk ID: 0\n" + ] + }, + { + "data": { + "text/plain": [ + 
"('https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810901.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810902.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810903.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810904.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810905.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810906.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810907.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810908.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810909.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810910.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810911.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810912.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810913.nc',\n", + " 
'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810914.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810915.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810916.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810917.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810918.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810919.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810920.nc')" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(f'first chunk ID: {all_chunks[0]}')\n", + "recipe.inputs_for_chunk(all_chunks[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "last chunk ID: 718\n" + ] + }, + { + "data": { + "text/plain": [ + "('https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/202012/oisst-avhrr-v02r01.20201225.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/202012/oisst-avhrr-v02r01.20201226.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/202012/oisst-avhrr-v02r01.20201227.nc',\n", + " 
'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/202012/oisst-avhrr-v02r01.20201228.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/202012/oisst-avhrr-v02r01.20201229.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/202012/oisst-avhrr-v02r01.20201230.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/202012/oisst-avhrr-v02r01.20201231.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/202101/oisst-avhrr-v02r01.20210101.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/202101/oisst-avhrr-v02r01.20210102.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/202101/oisst-avhrr-v02r01.20210103.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/202101/oisst-avhrr-v02r01.20210104.nc',\n", + " 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/202101/oisst-avhrr-v02r01.20210105.nc')" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(f'last chunk ID: {all_chunks[-1]}')\n", + "recipe.inputs_for_chunk(all_chunks[-1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Those are all the files that will go into the first and last chunk. \n", + "\n", + "We can now try to load the first chunk. This will raise an exception because we have not initialized any targets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "try:\n", + " recipe.open_chunk(all_chunks[0])\n", + "except Exception as e:\n", + " print(type(e))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Create storage targets\n", + "\n", + "In order to run our recipe, we need to define two places to store data:\n", + "- The Input Cache, where we will temporarily store the files we have downloaded\n", + "- The Target, where the final Zarr dataset will live\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "NetCDFtoZarrSequentialRecipe(sequence_dim='time', inputs_per_chunk=20, nitems_per_input=1, target=FSSpecTarget(fs=, root_path='/var/folders/n8/63q49ms55wxcj_gfbtykwp5r0000gn/T/tmppoaidedy'), input_cache=CacheFSSpecTarget(fs=, root_path='/var/folders/n8/63q49ms55wxcj_gfbtykwp5r0000gn/T/tmp61q8ifva'), require_cache=True, consolidate_zarr=True, xarray_open_kwargs={}, xarray_concat_kwargs={}, delete_input_encoding=True)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import tempfile\n", + "from fsspec.implementations.local import LocalFileSystem\n", + "from pangeo_forge.storage import FSSpecTarget, CacheFSSpecTarget\n", + "\n", + "fs_local = LocalFileSystem()\n", + "\n", + "cache_dir = tempfile.TemporaryDirectory()\n", + "cache_target = CacheFSSpecTarget(fs_local, cache_dir.name)\n", + "\n", + "target_dir = tempfile.TemporaryDirectory()\n", + "target = FSSpecTarget(fs_local, target_dir.name)\n", + "\n", + "recipe.input_cache = cache_target\n", + "recipe.target = target\n", + "recipe" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we try to load the chunk." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "You are trying to open input https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198109/oisst-avhrr-v02r01.19810901.nc, but the file is not cached yet. First call `cache_input` or set `require_cache=False`.\n" + ] + } + ], + "source": [ + "try:\n", + " recipe.open_chunk(all_chunks[0])\n", + "except Exception as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It still didn't work! That's because we have not cached the inputs yet." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "for input_file in recipe.inputs_for_chunk(all_chunks[0]):\n", + " recipe.cache_input(input_file)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 6: Examine some chunks\n", + "\n", + "Now we can finally open the first chunk!" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:  (lat: 720, lon: 1440, time: 20, zlev: 1)\n",
+       "Coordinates:\n",
+       "  * lat      (lat) float32 -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88\n",
+       "  * lon      (lon) float32 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9\n",
+       "  * time     (time) datetime64[ns] 1981-09-01T12:00:00 ... 1981-09-20T12:00:00\n",
+       "  * zlev     (zlev) float32 0.0\n",
+       "Data variables:\n",
+       "    anom     (time, zlev, lat, lon) float32 nan nan nan nan ... 0.41 0.41 0.41\n",
+       "    err      (time, zlev, lat, lon) float32 nan nan nan nan ... 0.3 0.3 0.3 0.3\n",
+       "    ice      (time, zlev, lat, lon) float32 nan nan nan nan ... 0.9 0.9 0.9 0.9\n",
+       "    sst      (time, zlev, lat, lon) float32 nan nan nan ... -1.39 -1.39 -1.39\n",
+       "Attributes:\n",
+       "    title:                      NOAA/NCEI 1/4 Degree Daily Optimum Interpolat...\n",
+       "    source:                     ICOADS, NCEP_GTS, GSFC_ICE, NCEP_ICE, Pathfin...\n",
+       "    id:                         oisst-avhrr-v02r01.19810901.nc\n",
+       "    naming_authority:           gov.noaa.ncei\n",
+       "    summary:                    NOAAs 1/4-degree Daily Optimum Interpolation ...\n",
+       "    cdm_data_type:              Grid\n",
+       "    history:                    Final file created using preliminary as first...\n",
+       "    date_modified:              2020-05-08T19:05:13Z\n",
+       "    date_created:               2020-05-08T19:05:13Z\n",
+       "    product_version:            Version v02r01\n",
+       "    processing_level:           NOAA Level 4\n",
+       "    institution:                NOAA/National Centers for Environmental Infor...\n",
+       "    creator_url:                https://www.ncei.noaa.gov/\n",
+       "    creator_email:              oisst-help@noaa.gov\n",
+       "    keywords:                   Earth Science > Oceans > Ocean Temperature > ...\n",
+       "    keywords_vocabulary:        Global Change Master Directory (GCMD) Earth S...\n",
+       "    platform:                   Ships, buoys, Argo floats, MetOp-A, MetOp-B\n",
+       "    platform_vocabulary:        Global Change Master Directory (GCMD) Platfor...\n",
+       "    instrument:                 Earth Remote Sensing Instruments > Passive Re...\n",
+       "    instrument_vocabulary:      Global Change Master Directory (GCMD) Instrum...\n",
+       "    standard_name_vocabulary:   CF Standard Name Table (v40, 25 January 2017)\n",
+       "    geospatial_lat_min:         -90.0\n",
+       "    geospatial_lat_max:         90.0\n",
+       "    geospatial_lon_min:         0.0\n",
+       "    geospatial_lon_max:         360.0\n",
+       "    geospatial_lat_units:       degrees_north\n",
+       "    geospatial_lat_resolution:  0.25\n",
+       "    geospatial_lon_units:       degrees_east\n",
+       "    geospatial_lon_resolution:  0.25\n",
+       "    time_coverage_start:        1981-09-01T00:00:00Z\n",
+       "    time_coverage_end:          1981-09-01T23:59:59Z\n",
+       "    metadata_link:              https://doi.org/10.25921/RE9P-PT57\n",
+       "    ncei_template_version:      NCEI_NetCDF_Grid_Template_v2.0\n",
+       "    comment:                    Data was converted from NetCDF-3 to NetCDF-4 ...\n",
+       "    sensor:                     Thermometer, AVHRR\n",
+       "    Conventions:                CF-1.6, ACDD-1.3\n",
+       "    references:                 Reynolds, et al.(2007) Daily High-Resolution-...
" + ], + "text/plain": [ + "\n", + "Dimensions: (lat: 720, lon: 1440, time: 20, zlev: 1)\n", + "Coordinates:\n", + " * lat (lat) float32 -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88\n", + " * lon (lon) float32 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9\n", + " * time (time) datetime64[ns] 1981-09-01T12:00:00 ... 1981-09-20T12:00:00\n", + " * zlev (zlev) float32 0.0\n", + "Data variables:\n", + " anom (time, zlev, lat, lon) float32 nan nan nan nan ... 0.41 0.41 0.41\n", + " err (time, zlev, lat, lon) float32 nan nan nan nan ... 0.3 0.3 0.3 0.3\n", + " ice (time, zlev, lat, lon) float32 nan nan nan nan ... 0.9 0.9 0.9 0.9\n", + " sst (time, zlev, lat, lon) float32 nan nan nan ... -1.39 -1.39 -1.39\n", + "Attributes:\n", + " title: NOAA/NCEI 1/4 Degree Daily Optimum Interpolat...\n", + " source: ICOADS, NCEP_GTS, GSFC_ICE, NCEP_ICE, Pathfin...\n", + " id: oisst-avhrr-v02r01.19810901.nc\n", + " naming_authority: gov.noaa.ncei\n", + " summary: NOAAs 1/4-degree Daily Optimum Interpolation ...\n", + " cdm_data_type: Grid\n", + " history: Final file created using preliminary as first...\n", + " date_modified: 2020-05-08T19:05:13Z\n", + " date_created: 2020-05-08T19:05:13Z\n", + " product_version: Version v02r01\n", + " processing_level: NOAA Level 4\n", + " institution: NOAA/National Centers for Environmental Infor...\n", + " creator_url: https://www.ncei.noaa.gov/\n", + " creator_email: oisst-help@noaa.gov\n", + " keywords: Earth Science > Oceans > Ocean Temperature > ...\n", + " keywords_vocabulary: Global Change Master Directory (GCMD) Earth S...\n", + " platform: Ships, buoys, Argo floats, MetOp-A, MetOp-B\n", + " platform_vocabulary: Global Change Master Directory (GCMD) Platfor...\n", + " instrument: Earth Remote Sensing Instruments > Passive Re...\n", + " instrument_vocabulary: Global Change Master Directory (GCMD) Instrum...\n", + " standard_name_vocabulary: CF Standard Name Table (v40, 25 January 2017)\n", + " geospatial_lat_min: -90.0\n", + " 
geospatial_lat_max: 90.0\n", + " geospatial_lon_min: 0.0\n", + " geospatial_lon_max: 360.0\n", + " geospatial_lat_units: degrees_north\n", + " geospatial_lat_resolution: 0.25\n", + " geospatial_lon_units: degrees_east\n", + " geospatial_lon_resolution: 0.25\n", + " time_coverage_start: 1981-09-01T00:00:00Z\n", + " time_coverage_end: 1981-09-01T23:59:59Z\n", + " metadata_link: https://doi.org/10.25921/RE9P-PT57\n", + " ncei_template_version: NCEI_NetCDF_Grid_Template_v2.0\n", + " comment: Data was converted from NetCDF-3 to NetCDF-4 ...\n", + " sensor: Thermometer, AVHRR\n", + " Conventions: CF-1.6, ACDD-1.3\n", + " references: Reynolds, et al.(2007) Daily High-Resolution-..." + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_chunk = recipe.open_chunk(all_chunks[0])\n", + "ds_chunk" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total chunk size: 331.784804 MB\n" + ] + } + ], + "source": [ + "print(f'Total chunk size: {ds_chunk.nbytes / 1e6} MB')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "👀 Inspect the Xarray HTML repr carefully by clicking on the buttons to expand the different sections.\n", + "- ✅ Is the shape of the variable what we expect?\n", + "- ✅ Is `time` going in the right order?\n", + "- ✅ Do the variable attributes make sense?\n", + "\n", + "\n", + "Now let's visualize some data to make sure things look right." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "ds_chunk.sst[0].plot()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "ds_chunk.ice[-1].plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The data look good!\n", + "Now let's try a random chunk from the middle." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "chunk_number = 500\n", + "for input_file in recipe.inputs_for_chunk(chunk_number):\n", + " recipe.cache_input(input_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:  (lat: 720, lon: 1440, time: 20, zlev: 1)\n",
+       "Coordinates:\n",
+       "  * lat      (lat) float32 -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88\n",
+       "  * lon      (lon) float32 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9\n",
+       "  * time     (time) datetime64[ns] 2009-01-17T12:00:00 ... 2009-02-05T12:00:00\n",
+       "  * zlev     (zlev) float32 0.0\n",
+       "Data variables:\n",
+       "    anom     (time, zlev, lat, lon) float32 nan nan nan nan ... 0.08 0.08 0.08\n",
+       "    err      (time, zlev, lat, lon) float32 nan nan nan nan ... 0.3 0.3 0.3 0.3\n",
+       "    ice      (time, zlev, lat, lon) float32 nan nan nan nan ... 1.0 1.0 1.0 1.0\n",
+       "    sst      (time, zlev, lat, lon) float32 nan nan nan ... -1.72 -1.72 -1.72\n",
+       "Attributes:\n",
+       "    title:                      NOAA/NCEI 1/4 Degree Daily Optimum Interpolat...\n",
+       "    Description:                Reynolds, et al.(2007) Daily High-resolution ...\n",
+       "    source:                     ICOADS, NCEP_GTS, GSFC_ICE, NCEP_ICE, Pathfin...\n",
+       "    id:                         oisst-avhrr-v02r01.20090117.nc\n",
+       "    naming_authority:           gov.noaa.ncei\n",
+       "    summary:                    NOAAs 1/4-degree Daily Optimum Interpolation ...\n",
+       "    cdm_data_type:              Grid\n",
+       "    history:                    Final file created using preliminary as first...\n",
+       "    date_modified:              2020-05-08T19:05:13Z\n",
+       "    date_created:               2020-05-08T19:05:13Z\n",
+       "    product_version:            Version v02r01\n",
+       "    processing_level:           NOAA Level 4\n",
+       "    institution:                NOAA/National Centers for Environmental Infor...\n",
+       "    creator_url:                https://www.ncei.noaa.gov/\n",
+       "    creator_email:              oisst-help@noaa.gov\n",
+       "    keywords:                   Earth Science > Oceans > Ocean Temperature > ...\n",
+       "    keywords_vocabulary:        Global Change Master Directory (GCMD) Earth S...\n",
+       "    platform:                   Ships, buoys, Argo floats, MetOp-A, MetOp-B\n",
+       "    platform_vocabulary:        Global Change Master Directory (GCMD) Platfor...\n",
+       "    instrument:                 Earth Remote Sensing Instruments > Passive Re...\n",
+       "    instrument_vocabulary:      Global Change Master Directory (GCMD) Instrum...\n",
+       "    standard_name_vocabulary:   CF Standard Name Table (v40, 25 January 2017)\n",
+       "    geospatial_lat_min:         -90.0\n",
+       "    geospatial_lat_max:         90.0\n",
+       "    geospatial_lon_min:         0.0\n",
+       "    geospatial_lon_max:         360.0\n",
+       "    geospatial_lat_units:       degrees_north\n",
+       "    geospatial_lat_resolution:  0.25\n",
+       "    geospatial_lon_units:       degrees_east\n",
+       "    geospatial_lon_resolution:  0.25\n",
+       "    time_coverage_start:        2009-01-17T00:00:00Z\n",
+       "    time_coverage_end:          2009-01-17T23:59:59Z\n",
+       "    metadata_link:              https://doi.org/10.25921/RE9P-PT57\n",
+       "    ncei_template_version:      NCEI_NetCDF_Grid_Template_v2.0\n",
+       "    comment:                    Data was converted from NetCDF-3 to NetCDF-4 ...\n",
+       "    sensor:                     Thermometer, AVHRR\n",
+       "    Conventions:                CF-1.6, ACDD-1.3\n",
+       "    references:                 Reynolds, et al.(2007) Daily High-Resolution-...
" + ], + "text/plain": [ + "\n", + "Dimensions: (lat: 720, lon: 1440, time: 20, zlev: 1)\n", + "Coordinates:\n", + " * lat (lat) float32 -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88\n", + " * lon (lon) float32 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9\n", + " * time (time) datetime64[ns] 2009-01-17T12:00:00 ... 2009-02-05T12:00:00\n", + " * zlev (zlev) float32 0.0\n", + "Data variables:\n", + " anom (time, zlev, lat, lon) float32 nan nan nan nan ... 0.08 0.08 0.08\n", + " err (time, zlev, lat, lon) float32 nan nan nan nan ... 0.3 0.3 0.3 0.3\n", + " ice (time, zlev, lat, lon) float32 nan nan nan nan ... 1.0 1.0 1.0 1.0\n", + " sst (time, zlev, lat, lon) float32 nan nan nan ... -1.72 -1.72 -1.72\n", + "Attributes:\n", + " title: NOAA/NCEI 1/4 Degree Daily Optimum Interpolat...\n", + " Description: Reynolds, et al.(2007) Daily High-resolution ...\n", + " source: ICOADS, NCEP_GTS, GSFC_ICE, NCEP_ICE, Pathfin...\n", + " id: oisst-avhrr-v02r01.20090117.nc\n", + " naming_authority: gov.noaa.ncei\n", + " summary: NOAAs 1/4-degree Daily Optimum Interpolation ...\n", + " cdm_data_type: Grid\n", + " history: Final file created using preliminary as first...\n", + " date_modified: 2020-05-08T19:05:13Z\n", + " date_created: 2020-05-08T19:05:13Z\n", + " product_version: Version v02r01\n", + " processing_level: NOAA Level 4\n", + " institution: NOAA/National Centers for Environmental Infor...\n", + " creator_url: https://www.ncei.noaa.gov/\n", + " creator_email: oisst-help@noaa.gov\n", + " keywords: Earth Science > Oceans > Ocean Temperature > ...\n", + " keywords_vocabulary: Global Change Master Directory (GCMD) Earth S...\n", + " platform: Ships, buoys, Argo floats, MetOp-A, MetOp-B\n", + " platform_vocabulary: Global Change Master Directory (GCMD) Platfor...\n", + " instrument: Earth Remote Sensing Instruments > Passive Re...\n", + " instrument_vocabulary: Global Change Master Directory (GCMD) Instrum...\n", + " standard_name_vocabulary: CF Standard Name Table 
(v40, 25 January 2017)\n", + " geospatial_lat_min: -90.0\n", + " geospatial_lat_max: 90.0\n", + " geospatial_lon_min: 0.0\n", + " geospatial_lon_max: 360.0\n", + " geospatial_lat_units: degrees_north\n", + " geospatial_lat_resolution: 0.25\n", + " geospatial_lon_units: degrees_east\n", + " geospatial_lon_resolution: 0.25\n", + " time_coverage_start: 2009-01-17T00:00:00Z\n", + " time_coverage_end: 2009-01-17T23:59:59Z\n", + " metadata_link: https://doi.org/10.25921/RE9P-PT57\n", + " ncei_template_version: NCEI_NetCDF_Grid_Template_v2.0\n", + " comment: Data was converted from NetCDF-3 to NetCDF-4 ...\n", + " sensor: Thermometer, AVHRR\n", + " Conventions: CF-1.6, ACDD-1.3\n", + " references: Reynolds, et al.(2007) Daily High-Resolution-..." + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_chunk = recipe.open_chunk(chunk_number)\n", + "ds_chunk" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 7: Try writing data\n", + "\n", + "Now that we can see our chunks opening correctly, we are ready to try writing data to our target.\n", + "\n", + "First we need to prepare the target." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "recipe.prepare_target()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We should now see a Zarr group at the target location.\n", + "Only the coordinates have been written, not the data variables." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/\n", + " ├── anom (14372, 1, 720, 1440) float32\n", + " ├── err (14372, 1, 720, 1440) float32\n", + " ├── ice (14372, 1, 720, 1440) float32\n", + " ├── lat (720,) float32\n", + " ├── lon (1440,) float32\n", + " ├── sst (14372, 1, 720, 1440) float32\n", + " ├── time (14372,) int64\n", + " └── zlev (1,) float32\n" + ] + } + ], + "source": [ + "import zarr\n", + "zgroup = zarr.open(target_dir.name)\n", + "print(zgroup.tree())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's examine one of the data variables." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Name/sst
Typezarr.core.Array
Data typefloat32
Shape(14372, 1, 720, 1440)
Chunk shape(20, 1, 720, 1440)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typezarr.storage.DirectoryStore
No. bytes59603558400 (55.5G)
No. bytes stored611
Storage ratio97550832.1
Chunks initialized0/719
" + ], + "text/plain": [ + "Name : /sst\n", + "Type : zarr.core.Array\n", + "Data type : float32\n", + "Shape : (14372, 1, 720, 1440)\n", + "Chunk shape : (20, 1, 720, 1440)\n", + "Order : C\n", + "Read-only : False\n", + "Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)\n", + "Store type : zarr.storage.DirectoryStore\n", + "No. bytes : 59603558400 (55.5G)\n", + "No. bytes stored : 611\n", + "Storage ratio : 97550832.1\n", + "Chunks initialized : 0/719" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "zgroup['sst'].info" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's write the first chunk." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Name/sst
Typezarr.core.Array
Data typefloat32
Shape(14372, 1, 720, 1440)
Chunk shape(20, 1, 720, 1440)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typezarr.storage.DirectoryStore
No. bytes59603558400 (55.5G)
No. bytes stored39402334 (37.6M)
Storage ratio1512.7
Chunks initialized1/719
" + ], + "text/plain": [ + "Name : /sst\n", + "Type : zarr.core.Array\n", + "Data type : float32\n", + "Shape : (14372, 1, 720, 1440)\n", + "Chunk shape : (20, 1, 720, 1440)\n", + "Order : C\n", + "Read-only : False\n", + "Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)\n", + "Store type : zarr.storage.DirectoryStore\n", + "No. bytes : 59603558400 (55.5G)\n", + "No. bytes stored : 39402334 (37.6M)\n", + "Storage ratio : 1512.7\n", + "Chunks initialized : 1/719" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "recipe.store_chunk(all_chunks[0])\n", + "zgroup['sst'].info" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that one of the chunks has been written! 🎉" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also open the dataset with xarray" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:  (lat: 720, lon: 1440, time: 14372, zlev: 1)\n",
+       "Coordinates:\n",
+       "  * lat      (lat) float32 -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88\n",
+       "  * lon      (lon) float32 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9\n",
+       "  * time     (time) datetime64[ns] 1981-09-01T12:00:00 ... NaT\n",
+       "  * zlev     (zlev) float32 0.0\n",
+       "Data variables:\n",
+       "    anom     (time, zlev, lat, lon) float32 dask.array<chunksize=(20, 1, 720, 1440), meta=np.ndarray>\n",
+       "    err      (time, zlev, lat, lon) float32 dask.array<chunksize=(20, 1, 720, 1440), meta=np.ndarray>\n",
+       "    ice      (time, zlev, lat, lon) float32 dask.array<chunksize=(20, 1, 720, 1440), meta=np.ndarray>\n",
+       "    sst      (time, zlev, lat, lon) float32 dask.array<chunksize=(20, 1, 720, 1440), meta=np.ndarray>\n",
+       "Attributes:\n",
+       "    Conventions:                CF-1.6, ACDD-1.3\n",
+       "    cdm_data_type:              Grid\n",
+       "    comment:                    Data was converted from NetCDF-3 to NetCDF-4 ...\n",
+       "    creator_email:              oisst-help@noaa.gov\n",
+       "    creator_url:                https://www.ncei.noaa.gov/\n",
+       "    date_created:               2020-05-08T19:05:13Z\n",
+       "    date_modified:              2020-05-08T19:05:13Z\n",
+       "    geospatial_lat_max:         90.0\n",
+       "    geospatial_lat_min:         -90.0\n",
+       "    geospatial_lat_resolution:  0.25\n",
+       "    geospatial_lat_units:       degrees_north\n",
+       "    geospatial_lon_max:         360.0\n",
+       "    geospatial_lon_min:         0.0\n",
+       "    geospatial_lon_resolution:  0.25\n",
+       "    geospatial_lon_units:       degrees_east\n",
+       "    history:                    Final file created using preliminary as first...\n",
+       "    id:                         oisst-avhrr-v02r01.19810901.nc\n",
+       "    institution:                NOAA/National Centers for Environmental Infor...\n",
+       "    instrument:                 Earth Remote Sensing Instruments > Passive Re...\n",
+       "    instrument_vocabulary:      Global Change Master Directory (GCMD) Instrum...\n",
+       "    keywords:                   Earth Science > Oceans > Ocean Temperature > ...\n",
+       "    keywords_vocabulary:        Global Change Master Directory (GCMD) Earth S...\n",
+       "    metadata_link:              https://doi.org/10.25921/RE9P-PT57\n",
+       "    naming_authority:           gov.noaa.ncei\n",
+       "    ncei_template_version:      NCEI_NetCDF_Grid_Template_v2.0\n",
+       "    platform:                   Ships, buoys, Argo floats, MetOp-A, MetOp-B\n",
+       "    platform_vocabulary:        Global Change Master Directory (GCMD) Platfor...\n",
+       "    processing_level:           NOAA Level 4\n",
+       "    product_version:            Version v02r01\n",
+       "    references:                 Reynolds, et al.(2007) Daily High-Resolution-...\n",
+       "    sensor:                     Thermometer, AVHRR\n",
+       "    source:                     ICOADS, NCEP_GTS, GSFC_ICE, NCEP_ICE, Pathfin...\n",
+       "    standard_name_vocabulary:   CF Standard Name Table (v40, 25 January 2017)\n",
+       "    summary:                    NOAAs 1/4-degree Daily Optimum Interpolation ...\n",
+       "    time_coverage_end:          1981-09-01T23:59:59Z\n",
+       "    time_coverage_start:        1981-09-01T00:00:00Z\n",
+       "    title:                      NOAA/NCEI 1/4 Degree Daily Optimum Interpolat...
" + ], + "text/plain": [ + "\n", + "Dimensions: (lat: 720, lon: 1440, time: 14372, zlev: 1)\n", + "Coordinates:\n", + " * lat (lat) float32 -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88\n", + " * lon (lon) float32 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9\n", + " * time (time) datetime64[ns] 1981-09-01T12:00:00 ... NaT\n", + " * zlev (zlev) float32 0.0\n", + "Data variables:\n", + " anom (time, zlev, lat, lon) float32 dask.array\n", + " err (time, zlev, lat, lon) float32 dask.array\n", + " ice (time, zlev, lat, lon) float32 dask.array\n", + " sst (time, zlev, lat, lon) float32 dask.array\n", + "Attributes:\n", + " Conventions: CF-1.6, ACDD-1.3\n", + " cdm_data_type: Grid\n", + " comment: Data was converted from NetCDF-3 to NetCDF-4 ...\n", + " creator_email: oisst-help@noaa.gov\n", + " creator_url: https://www.ncei.noaa.gov/\n", + " date_created: 2020-05-08T19:05:13Z\n", + " date_modified: 2020-05-08T19:05:13Z\n", + " geospatial_lat_max: 90.0\n", + " geospatial_lat_min: -90.0\n", + " geospatial_lat_resolution: 0.25\n", + " geospatial_lat_units: degrees_north\n", + " geospatial_lon_max: 360.0\n", + " geospatial_lon_min: 0.0\n", + " geospatial_lon_resolution: 0.25\n", + " geospatial_lon_units: degrees_east\n", + " history: Final file created using preliminary as first...\n", + " id: oisst-avhrr-v02r01.19810901.nc\n", + " institution: NOAA/National Centers for Environmental Infor...\n", + " instrument: Earth Remote Sensing Instruments > Passive Re...\n", + " instrument_vocabulary: Global Change Master Directory (GCMD) Instrum...\n", + " keywords: Earth Science > Oceans > Ocean Temperature > ...\n", + " keywords_vocabulary: Global Change Master Directory (GCMD) Earth S...\n", + " metadata_link: https://doi.org/10.25921/RE9P-PT57\n", + " naming_authority: gov.noaa.ncei\n", + " ncei_template_version: NCEI_NetCDF_Grid_Template_v2.0\n", + " platform: Ships, buoys, Argo floats, MetOp-A, MetOp-B\n", + " platform_vocabulary: Global Change Master Directory (GCMD) 
Platfor...\n", + " processing_level: NOAA Level 4\n", + " product_version: Version v02r01\n", + " references: Reynolds, et al.(2007) Daily High-Resolution-...\n", + " sensor: Thermometer, AVHRR\n", + " source: ICOADS, NCEP_GTS, GSFC_ICE, NCEP_ICE, Pathfin...\n", + " standard_name_vocabulary: CF Standard Name Table (v40, 25 January 2017)\n", + " summary: NOAAs 1/4-degree Daily Optimum Interpolation ...\n", + " time_coverage_end: 1981-09-01T23:59:59Z\n", + " time_coverage_start: 1981-09-01T00:00:00Z\n", + " title: NOAA/NCEI 1/4 Degree Daily Optimum Interpolat..." + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = xr.open_zarr(target_dir.name)\n", + "ds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There should be data at the beginning..." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "ds.sst[0].plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But not the end..." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "ds.sst[-1].plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Postscript: Execute the full recipe\n", + "\n", + "We are now confident that our recipe works as we expect.\n", + "At this point we could either:\n", + "- Execute it all ourselves (see {doc}`execution`)\n", + "- Create a new recipe feedstock on Pangeo Forge\n", + "\n", + "If we wanted to execute it ourselves, one way would be to simply run the following code\n", + "```python\n", + "for input_name in recipe.iter_inputs():\n", + " recipe.cache_input(input_name)\n", + "recipe.prepare_target()\n", + "for chunk in recipe.iter_chunks():\n", + " recipe.store_chunk(chunk)\n", + "recipe.finalize_target()\n", + "```\n", + "\n", + "We aren't going to do this in this notebook because it would take too long.\n", + "\n", + "But hopefully now you have a better understanding of how Pangeo Forge recipes work." + ] } ], "metadata": { diff --git a/docs/recipes.md b/docs/recipes.md index a45789fc..4328f13b 100644 --- a/docs/recipes.md +++ b/docs/recipes.md @@ -12,7 +12,7 @@ you are reading matches your installed version of pangeo_forge. Recipes need a place to store data. The location where the final dataset produced by the recipe is stored is called the -``Target``. Pangeo forge has a special class for this: {class}`pangeo_forge.storage.Target` +``Target``. Pangeo forge has a special class for this: {class}`pangeo_forge.storage.FSSpecTarget` Creating a Target requires two arguments: - The ``fs`` argument is an [fsspec](https://filesystem-spec.readthedocs.io/en/latest/) @@ -20,20 +20,17 @@ Creating a Target requires two arguments: [built in](https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations) and [third party](https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations) implementations. 
-- The `path` argument specifies the specific path where the data should be stored. +- The `root_path` argument specifies the specific path where the data should be stored. For example, creating a storage target for AWS S3 might look like this: ```{code-block} python import s3fs fs = s3fs.S3FileSystem(key="MY_AWS_KEY", secret="MY_AWS_SECRET") target_path = "pangeo-forge-bucket/my-dataset-v1.zarr" -target = Target(fs=fs, path=target_path) +target = FSSpecTarget(fs=fs, root_path=target_path) ``` -Temporary data can be stored in an {class}`pangeo_forge.storage.InputCache` object. -``InputCache`` is similar to ``Target``, but instead of specifying a ``path``, -you specify ``prefix``. - +Temporary data is recommended to use a {class}`pangeo_forge.storage.CacheFSSpecTarget` object. ## The Base Recipe Class @@ -45,9 +42,6 @@ recipe = Recipe(option1='foo', option2=) All recipes follow the same basic steps. - - - ## Specific Recipe Classes ```{eval-rst} diff --git a/pangeo_forge/recipe.py b/pangeo_forge/recipe.py index ebe82dda..55cb88cf 100644 --- a/pangeo_forge/recipe.py +++ b/pangeo_forge/recipe.py @@ -138,7 +138,9 @@ class NetCDFtoZarrSequentialRecipe(BaseRecipe): :param consolidate_zarr: Whether to consolidate the resulting Zarr dataset. :param xarray_open_kwargs: Extra options for opening the inputs with Xarray. :param xarray_concat_kwargs: Extra options to pass to Xarray when concatenating - the inputs to form a chunk. + the inputs to form a chunk. 
+ :param delete_input_encoding: Whether to remove Xarray encoding from variables + in the input dataset """ input_urls: Iterable[str] = field(repr=False) @@ -151,6 +153,7 @@ class NetCDFtoZarrSequentialRecipe(BaseRecipe): consolidate_zarr: bool = True xarray_open_kwargs: dict = field(default_factory=dict) xarray_concat_kwargs: dict = field(default_factory=dict) + delete_input_encoding: bool = True def __post_init__(self): self._chunks_inputs = { @@ -245,6 +248,11 @@ def open_input(self, fname: str): # explicitly load into memory ds = ds.load() ds = fix_scalar_attr_encoding(ds) + + if self.delete_input_encoding: + for var in ds.variables: + ds[var].encoding = {} + logger.debug(f"{ds}") return ds @@ -255,8 +263,6 @@ def open_chunk(self, chunk_key): # CONCAT DELETES ENCODING!!! # OR NO IT DOESN'T! Not in the latest version of xarray? ds = xr.concat(dsets, self.sequence_dim, **self.xarray_concat_kwargs) - for var in ds.variables: - ds[var].encoding = {} logger.debug(f"{ds}") # TODO: maybe do some chunking here? 
From 63e2297366584702853368fbe8918a2e3c7890ee Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Thu, 21 Jan 2021 23:46:39 -0500 Subject: [PATCH 31/34] last commit of the night --- docs/_static/custom.css | 27 +++++++ docs/api.md | 24 ++++++ docs/execution.md | 81 +++++++++++++++++++ docs/index.md | 5 +- docs/recipes.md | 41 +++++++--- docs/tutorials/index.md | 9 +++ .../netcdf_zarr_sequential.ipynb} | 17 +++- pangeo_forge/executors.py | 3 + tests/test_executors.py | 3 +- 9 files changed, 194 insertions(+), 16 deletions(-) create mode 100644 docs/_static/custom.css create mode 100644 docs/tutorials/index.md rename docs/{recipe_tutorial.ipynb => tutorials/netcdf_zarr_sequential.ipynb} (99%) create mode 100644 pangeo_forge/executors.py diff --git a/docs/_static/custom.css b/docs/_static/custom.css new file mode 100644 index 00000000..9b2ac1b6 --- /dev/null +++ b/docs/_static/custom.css @@ -0,0 +1,27 @@ +/* Put your custom CSS here */ +@import url('http://fonts.cdnfonts.com/css/panton-black-caps'); + +h1 { + font-family: "Panton Black Caps", sans-serif; + color: #003B71 !important; +} + +h2 { + font-family: "Panton Light Caps", sans-serif; + color: #003B71 !important; +} + +a { + color: #5eb130 !important; +} + + + +/* Fixing up some pygments and code-styling CSS for accessibility */ +code { font-size: 100%; color: #e50051; } +pre { font-family: monospace; } + +/* .highlight { font-size: 125%; } */ +.highlight .c1 { color: #e50051; } +.highlight .si { color: #e50051; } +.highlight .nn { color: #e50051; } diff --git a/docs/api.md b/docs/api.md index 667283b2..1ba199ae 100644 --- a/docs/api.md +++ b/docs/api.md @@ -8,9 +8,16 @@ :members: ``` +```{eval-rst} +.. autoclass:: pangeo_forge.storage.FlatFSSpecTarget + :members: + :show-inheritance: +``` + ```{eval-rst} .. autoclass:: pangeo_forge.storage.CacheFSSpecTarget :members: + :show-inheritance: ``` ## Recipes @@ -24,3 +31,20 @@ .. 
autoclass:: pangeo_forge.recipe.NetCDFtoZarrSequentialRecipe :show-inheritance: ``` + +## Executors + +```{eval-rst} +.. autoclass:: pangeo_forge.executors.PythonPipelineExecutor + :members: +``` + +```{eval-rst} +.. autoclass:: pangeo_forge.executors.DaskPipelineExecutor + :members: +``` + +```{eval-rst} +.. autoclass:: pangeo_forge.executors.PrefectPipelineExecutor + :members: +``` diff --git a/docs/execution.md b/docs/execution.md index 7bb5b258..0ec71286 100644 --- a/docs/execution.md +++ b/docs/execution.md @@ -1 +1,82 @@ # Recipe Execution + +There are many different types of Pangeo Forge recipes. +However, **all recipes are executed the same way**! +This is a key part of the Pangeo Forge design. + +Once you have created a recipe object (see {doc}`recipes`) you have two +options for executing it. In the subsequent code, we will assume that a +recipe has already been initialized in the variable `recipe`. + +## Manual Execution + +A recipe can be executed manually, step by step, in serial, from a notebook +or interactive interpreter. The ability to manually step through a recipe +is very important for developing and debugging complex recipes. +There are four stages of recipe execution. + +### Stage 1: Cache Inputs + +Recipes may define files that have to be cached locally before the subsequent +steps can proceed. The common use case here is for files that have to be +extracted from a slow FTP server. Here is how to cache the inputs. + +```{code-block} python +for input_name in recipe.iter_inputs(): + recipe.cache_input(input_name) +``` + +If the recipe doesn't do input caching, nothing will happen here. + +### Stage 2: Prepare Target + +Once the inputs have been cached, we can get the target ready. +Preparing the target for writing is done as follows: + +```{code-block} python +recipe.prepare_target() +``` + +For example, for Zarr targets, this sets up the Zarr group with the necessary +arrays and metadata. 
+ +### Stage 3: Store Chunks + +This is the step where the bulk of the work happens. + +```{code-block} python +for chunk in recipe.iter_chunks(): + recipe.store_chunk(chunk) +``` + +### Stage 4: Finalize Target + +If there is any cleanup or consolidation to be done, it happens here. + +```{code-block} python +recipe.finalize_target() +``` + +For example, consolidating Zarr metadata happens in the finalize step. + +## Execution by Executors + +Very large recipes cannot feasibly be executed this way. +To support distributed parallel execution, Pangeo Forge borrows the +[Executors framework from Rechunker](https://rechunker.readthedocs.io/en/latest/executors.html). + +There are currently three executors implemented. +- {class}`pangeo_forge.executors.PythonPipelineExecutor`: a reference executor + using simple python +- {class}`pangeo_forge.executors.DaskPipelineExecutor`: distributed executor using Dask +- {class}`pangeo_forge.executors.PrefectPipelineExecutor`: distributed executor using Prefect + +To use an executor, the recipe must first be transformed into a `Pipeline` object. +The full process looks like this: + +```{code-block} python +pipeline = recipe.to_pipelines() +executor = PrefectPipelineExecutor() +plan = executor.pipelines_to_plan(pipeline) +executor.execute_plan(plan) # actually runs the recipe +``` diff --git a/docs/index.md b/docs/index.md index 319199e4..7e475018 100644 --- a/docs/index.md +++ b/docs/index.md @@ -14,7 +14,8 @@ The most important concept in Pangeo Forge is a ``recipe``. A recipe defines how to transform data in one format / location into another format / location. The primary way people contribute to Pangeo Forge is by writing / maintaining recipes. Recipes developed by the community are stored in GitHub repositories. -For information about how to write a recipe, see {doc}`recipes`. +For information about how recipes work see {doc}`recipes`. 
+The {doc}`tutorials/index` provide deep dives into how to develop and debug Pangeo Forge recipes. ## Recipe Execution @@ -35,7 +36,7 @@ For more information, see {doc}`bakeries`. :caption: Contents recipes -recipe_tutorial +tutorials/index execution bakeries contribute diff --git a/docs/recipes.md b/docs/recipes.md index 4328f13b..527df8b8 100644 --- a/docs/recipes.md +++ b/docs/recipes.md @@ -32,20 +32,43 @@ target = FSSpecTarget(fs=fs, root_path=target_path) Temporary data is recommended to use a {class}`pangeo_forge.storage.CacheFSSpecTarget` object. -## The Base Recipe Class +## The Recipe Object + +You define a recipe by instantiating a class that inherits from {class}`pangeo_forge.recipe.BaseRecipe`. +The `pangeo_forge` package includes several pre-defined Recipe classes which +cover common scenarios. You can also define your own Recipe class. + +For the common scenario of assembling many NetCDF files into a single Zarr +group, we use {class}`pangeo_forge.recipe.NetCDFtoZarrSequentialRecipe`. +Initializing a recipe looks something like this. -A recipe is initialized from a recipe class. ```{code-block} python -recipe = Recipe(option1='foo', option2=) +from pangeo_forge.recipe import NetCDFtoZarrSequentialRecipe +input_urls = [...] # build a list of inputs +recipe = NetCDFtoZarrSequentialRecipe( + input_urls=input_urls, + sequence_dim="time" +) ``` -All recipes follow the same basic steps. +There are many other options we can pass, all covered in the [API documentation](api). +For a deeper dive on how to pick these options and what they mean, check out the +tutorial: {doc}`tutorials/netcdf_zarr_sequential`. +Your recipe will also need storage. +If you have already defined a `Target` object (as in the [Storage section](#storage)), +then you can either assign it when you initialize the recipe or later, e.g. -## Specific Recipe Classes +```{code-block} python +recipe.target = FSSpecTarget(fs=fs, root_path=target_path) +``` -```{eval-rst} -.. 
autoclass:: pangeo_forge.recipe.NetCDFtoZarrSequentialRecipe - :show-inheritance: - :noindex: +This particular class of recipe also requires a cache, a place to store temporary +files. We can create one as follows. + +```{code-block} python +recipe.input_cache = CacheFSSpecTarget(fs=fs, root_path=cache_path) ``` + +Once your recipe is defined and has its targets assigned, you're ready to +move on to {doc}`execution`. diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md new file mode 100644 index 00000000..ef34442c --- /dev/null +++ b/docs/tutorials/index.md @@ -0,0 +1,9 @@ +# Recipe Tutorials + +These tutorials are deep dives into how to develop and debug Pangeo Forge recipes. + +```{toctree} +:maxdepth: 1 + +netcdf_zarr_sequential +``` diff --git a/docs/recipe_tutorial.ipynb b/docs/tutorials/netcdf_zarr_sequential.ipynb similarity index 99% rename from docs/recipe_tutorial.ipynb rename to docs/tutorials/netcdf_zarr_sequential.ipynb index 3e7b2cfe..4d501d87 100644 --- a/docs/recipe_tutorial.ipynb +++ b/docs/tutorials/netcdf_zarr_sequential.ipynb @@ -4,9 +4,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Recipe Tutorial\n", + "# NetCDF Zarr Sequential Recipe\n", "\n", - "This tutorial describes how to create a recipe from scratch.\n" + "This tutorial describes how to create a recipe from scratch.\n", + "The source data is a sequence of NetCDF files accessed via HTTP.\n", + "The target is a Zarr store." ] }, { @@ -567,7 +569,7 @@ "For our first recipe, we will want to use a pre-defined Recipe class from Pangeo\n", "Forge.\n", "\n", - "By examining the {doc}`recipes` documentation page, we see that our scenario is\n", + "By examining the {doc}`../recipes` documentation page, we see that our scenario is\n", "a good case for the {class}`pangeo_forge.recipe.NetCDFtoZarrSequentialRecipe`\n", "class. 
Let's examine its documentation string in our notebook.\n" ] @@ -3602,7 +3604,7 @@ "\n", "We are now confident that our recipe works as we expect.\n", "At this point we could either:\n", - "- Execute it all ourselves (see {doc}`execution`)\n", + "- Execute it all ourselves (see {doc}`../execution`)\n", "- Create a new recipe feedstock on Pangeo Forge\n", "\n", "If we wanted to execute it ourselves, one way would be to simply run the following code\n", @@ -3619,6 +3621,13 @@ "\n", "But hopefully now you have a better understanding of how Pangeo Forge recipes work." ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/pangeo_forge/executors.py b/pangeo_forge/executors.py new file mode 100644 index 00000000..bca10b1b --- /dev/null +++ b/pangeo_forge/executors.py @@ -0,0 +1,3 @@ +from rechunker.executors import DaskPipelineExecutor # noqa: F401 +from rechunker.executors import PrefectPipelineExecutor # noqa: F401 +from rechunker.executors import PythonPipelineExecutor # noqa: F401 diff --git a/tests/test_executors.py b/tests/test_executors.py index b21ad56a..6294809a 100644 --- a/tests/test_executors.py +++ b/tests/test_executors.py @@ -1,6 +1,7 @@ import pytest import xarray as xr -from rechunker.executors import ( + +from pangeo_forge.executors import ( DaskPipelineExecutor, PrefectPipelineExecutor, PythonPipelineExecutor, From e0c97b009555d9d8fcdeeef5264acee78b993256 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Thu, 21 Jan 2021 23:48:27 -0500 Subject: [PATCH 32/34] update doc requirements --- docs/requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 8a7fbe2f..88595554 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,5 @@ sphinx -sphinx-pangeo-theme +sphinx_book_theme myst-parser +myst-nb +sphinx-copybutton From 382663d4f6b963fd02553f00c94956f87aa35be8 Mon Sep 17 
00:00:00 2001 From: Ryan Abernathey Date: Thu, 21 Jan 2021 23:57:40 -0500 Subject: [PATCH 33/34] use rechunker from github --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 83520897..b8770e9b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ setuptools click dask distributed -rechunker +-e git://github.com/rabernat/rechunker.git@refactor-executors#egg=rechunker xarray >= 0.16.2 zarr >= 2.6.0 fsspec[http] From 57304e86ff31e5e4c56cc8dd5f27f79a8e9b47f4 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Fri, 22 Jan 2021 07:34:20 -0500 Subject: [PATCH 34/34] fix requirements --- docs/requirements.txt | 1 + requirements.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 88595554..51875dbb 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,5 @@ sphinx +pydata-sphinx-theme>=0.4.2 sphinx_book_theme myst-parser myst-nb diff --git a/requirements.txt b/requirements.txt index b8770e9b..83520897 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ setuptools click dask distributed --e git://github.com/rabernat/rechunker.git@refactor-executors#egg=rechunker +rechunker xarray >= 0.16.2 zarr >= 2.6.0 fsspec[http]