[Datasets] Allow specifying batch_size when reading Parquet files (#31165)
This PR allows users to specify `batch_size` when reading Parquet files.
c21 authored Dec 21, 2022
1 parent 9b51b01 commit c8443c0
Showing 3 changed files with 14 additions and 3 deletions.
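
In practice, the new argument is forwarded from `ray.data.read_parquet` down to pyarrow. A minimal usage sketch, mirroring the test added in this PR (the path below is illustrative):

```python
import ray

# batch_size is forwarded to pyarrow when reading each Parquet piece;
# the path and sizes here are illustrative.
ray.data.range_tensor(1000, shape=(1000,)).write_parquet("/tmp/data.parquet")
ds = ray.data.read_parquet("/tmp/data.parquet", batch_size=10)
assert ds.count() == 1000
```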
python/ray/data/datasource/parquet_datasource.py (6 changes: 5 additions, 1 deletion)
```diff
@@ -374,13 +374,14 @@ def _read_pieces(

     logger.debug(f"Reading {len(pieces)} parquet pieces")
     use_threads = reader_args.pop("use_threads", False)
+    batch_size = reader_args.pop("batch_size", PARQUET_READER_ROW_BATCH_SIZE)
     for piece in pieces:
         part = _get_partition_keys(piece.partition_expression)
         batches = piece.to_batches(
             use_threads=use_threads,
             columns=columns,
             schema=schema,
-            batch_size=PARQUET_READER_ROW_BATCH_SIZE,
+            batch_size=batch_size,
             **reader_args,
         )
         for batch in batches:
@@ -461,6 +462,9 @@ def _sample_piece(
     batch_size = max(
         min(piece.metadata.num_rows, PARQUET_ENCODING_RATIO_ESTIMATE_NUM_ROWS), 1
     )
+    # Use the batch_size calculated above, and ignore the one specified by user if set.
+    # This is to avoid sampling too few or too many rows.
+    reader_args.pop("batch_size", None)
     batches = piece.to_batches(
         columns=columns,
         schema=schema,
```
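
For context, `piece` here is a pyarrow dataset fragment, and `batch_size` bounds the row count of each `RecordBatch` that `to_batches` yields. A standalone pyarrow sketch of that behavior, assuming an on-disk Parquet dataset (paths are illustrative):

```python
import pyarrow as pa
import pyarrow.dataset as pds

# Write a small Parquet dataset, then scan its fragments in bounded batches,
# mirroring what _read_pieces does with the user-supplied batch_size.
pds.write_dataset(
    pa.table({"x": list(range(1000))}), "/tmp/example_ds", format="parquet"
)
dataset = pds.dataset("/tmp/example_ds", format="parquet")
for fragment in dataset.get_fragments():
    for batch in fragment.to_batches(batch_size=100):
        assert batch.num_rows <= 100  # each batch holds at most batch_size rows
```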
python/ray/data/read_api.py (4 changes: 2 additions, 2 deletions)
```diff
@@ -459,7 +459,7 @@ def read_parquet(
         Dataset(num_blocks=..., num_rows=150, schema={sepal.length: double, ...})

     For further arguments you can pass to pyarrow as a keyword argument, see
-    https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html
+    https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Scanner.html#pyarrow.dataset.Scanner.from_fragment

     Args:
         paths: A single file path or directory, or a list of file paths. Multiple
@@ -479,7 +479,7 @@ def read_parquet(
         meta_provider: File metadata provider. Custom metadata providers may
            be able to resolve file metadata more quickly and/or accurately.
        arrow_parquet_args: Other parquet read options to pass to pyarrow, see
-           https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html
+           https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Scanner.html#pyarrow.dataset.Scanner.from_fragment

     Returns:
         Dataset holding Arrow records read from the specified paths.
```
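
Since `arrow_parquet_args` are passed through to pyarrow's `Scanner.from_fragment`, other scanner options should travel the same route as the new `batch_size`. A sketch under that assumption (path, column name, and filter are illustrative):

```python
import pyarrow.dataset as pds
import ray

# Scanner.from_fragment keywords such as a row filter can be supplied
# alongside batch_size; the path and column here are illustrative.
ds = ray.data.read_parquet(
    "/tmp/data.parquet",
    batch_size=256,
    filter=pds.field("x") > 10,
)
```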
python/ray/data/tests/test_dataset_parquet.py (7 changes: 7 additions, 0 deletions)
```diff
@@ -906,6 +906,13 @@ def test_parquet_read_empty_file(ray_start_regular_shared, tmp_path):
     pd.testing.assert_frame_equal(ds.to_pandas(), table.to_pandas())


+def test_parquet_reader_batch_size(ray_start_regular_shared, tmp_path):
+    path = os.path.join(tmp_path, "data.parquet")
+    ray.data.range_tensor(1000, shape=(1000,)).write_parquet(path)
+    ds = ray.data.read_parquet(path, batch_size=10)
+    assert ds.count() == 1000
+
+
 if __name__ == "__main__":
     import sys

```
