From 2ce0b2d77e7480afed6fb47809f5431d4e1cdc8f Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 23 May 2022 09:34:41 -0700 Subject: [PATCH] [Data] Add partitioning classes to Data API reference (#24203) --- doc/source/data/package-ref.rst | 19 +++++++++++++- python/ray/data/datasource/__init__.py | 2 ++ python/ray/data/datasource/partitioning.py | 30 +++++++++++++++++----- 3 files changed, 43 insertions(+), 8 deletions(-) diff --git a/doc/source/data/package-ref.rst b/doc/source/data/package-ref.rst index ddf0487e088e..6c8e83125d43 100644 --- a/doc/source/data/package-ref.rst +++ b/doc/source/data/package-ref.rst @@ -123,6 +123,23 @@ Custom Datasource API .. autoclass:: ray.data.ReadTask :members: +Datasource Partitioning API +--------------------------- + +.. autoclass:: ray.data.datasource.PartitionStyle + :members: + +.. autoclass:: ray.data.datasource.PathPartitionScheme + :members: + +.. autoclass:: ray.data.datasource.PathPartitionEncoder + :members: + +.. autoclass:: ray.data.datasource.PathPartitionParser + :members: + +.. autoclass:: ray.data.datasource.PathPartitionFilter + Built-in Datasources -------------------- @@ -146,7 +163,7 @@ Built-in Datasources .. autoclass:: ray.data.datasource.RangeDatasource :members: - + .. 
autoclass:: ray.data.datasource.SimpleTensorFlowDatasource :members: diff --git a/python/ray/data/datasource/__init__.py b/python/ray/data/datasource/__init__.py index e51390b1b953..aab29c028d10 100644 --- a/python/ray/data/datasource/__init__.py +++ b/python/ray/data/datasource/__init__.py @@ -31,6 +31,7 @@ PathPartitionEncoder, PathPartitionFilter, PathPartitionParser, + PathPartitionScheme, ) from ray.data.datasource.tensorflow_datasource import SimpleTensorFlowDatasource from ray.data.datasource.torch_datasource import SimpleTorchDatasource @@ -57,6 +58,7 @@ "PathPartitionEncoder", "PathPartitionFilter", "PathPartitionParser", + "PathPartitionScheme", "RandomIntRowDatasource", "RangeDatasource", "ReadTask", diff --git a/python/ray/data/datasource/partitioning.py b/python/ray/data/datasource/partitioning.py index 238f37522e44..26d0635079ce 100644 --- a/python/ray/data/datasource/partitioning.py +++ b/python/ray/data/datasource/partitioning.py @@ -22,10 +22,12 @@ class PartitionStyle(str, Enum): Examples: >>> # Serialize to JSON text. - >>> json.dumps(PartitionStyle.HIVE) # "hive" + >>> json.dumps(PartitionStyle.HIVE) # doctest: +SKIP + '"hive"' >>> # Deserialize from JSON text. - >>> PartitionStyle(json.loads('"hive"')) # PartitionStyle.HIVE + >>> PartitionStyle(json.loads('"hive"')) # doctest: +SKIP + <PartitionStyle.HIVE: 'hive'> """ HIVE = "hive" @@ -151,6 +153,7 @@ def of( filesystem: Optional["pyarrow.fs.FileSystem"] = None, ) -> "PathPartitionEncoder": """Creates a new partition path encoder. + Args: style: The partition style - may be either HIVE or DIRECTORY. base_dir: "/"-delimited base directory that all partition paths will be @@ -426,13 +429,26 @@ def of( partition or `False` to skip it. Partition keys and values are always strings read from the filesystem path. For example, this removes all unpartitioned files: - ``lambda d: True if d else False`` + + .. 
code:: python + + lambda d: True if d else False + This raises an assertion error for any unpartitioned file found: - ``def do_assert(val, msg): - assert val, msg - lambda d: do_assert(d, "Expected all files to be partitioned!")`` + + .. code:: python + + def do_assert(val, msg): + assert val, msg + + lambda d: do_assert(d, "Expected all files to be partitioned!") + And this only reads files from January, 2022 partitions: - ``lambda d: d["month"] == "January" and d["year"] == "2022"`` + + .. code:: python + + lambda d: d["month"] == "January" and d["year"] == "2022" + style: The partition style - may be either HIVE or DIRECTORY. base_dir: "/"-delimited base directory to start searching for partitions (exclusive). File paths outside of this directory will be considered