Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Data] Rename ParquetBaseDatasource to ParquetBulkDatasource #45728

Merged
merged 1 commit into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions python/ray/data/datasource/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
from ray.data.datasource.mongo_datasource import MongoDatasource
from ray.data.datasource.numpy_datasink import _NumpyDatasink
from ray.data.datasource.numpy_datasource import NumpyDatasource
from ray.data.datasource.parquet_base_datasource import ParquetBaseDatasource
from ray.data.datasource.parquet_bulk_datasource import ParquetBulkDatasource
from ray.data.datasource.parquet_datasink import _ParquetDatasink
from ray.data.datasource.parquet_datasource import ParquetDatasource
from ray.data.datasource.partitioning import (
Expand Down Expand Up @@ -95,7 +95,7 @@
"LanceDatasource",
"_NumpyDatasink",
"NumpyDatasource",
"ParquetBaseDatasource",
"ParquetBulkDatasource",
"_ParquetDatasink",
"ParquetDatasource",
"ParquetMetadataProvider",
Expand Down
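python/ray/data/datasource/parquet_base_datasource.py → python/ray/data/datasource/parquet_bulk_datasource.py
Original file line number Diff line number Diff line change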
Expand Up @@ -12,7 +12,7 @@


@PublicAPI
class ParquetBaseDatasource(FileBasedDatasource):
class ParquetBulkDatasource(FileBasedDatasource):
"""Minimal Parquet datasource, for reading and writing Parquet files."""

_FILE_EXTENSIONS = ["parquet"]
Expand Down
4 changes: 2 additions & 2 deletions python/ray/data/datasource/parquet_datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def _check_for_legacy_tensor_type(schema):
class ParquetDatasource(Datasource):
"""Parquet datasource, for reading and writing Parquet files.

The primary difference from ParquetBaseDatasource is that this uses
The primary difference from ParquetBulkDatasource is that this uses
PyArrow's `ParquetDataset` abstraction for dataset reads, and thus offers
automatic Arrow dataset schema inference and row count collection at the
cost of some potential performance and/or compatibility penalties.
Expand Down Expand Up @@ -527,8 +527,8 @@ def _estimate_files_encoding_ratio(self) -> float:

def get_name(self):
"""Return a human-readable name for this datasource.

This will be used as the names of the read tasks.
Note: overrides the base `Datasource` method.
"""
return "Parquet"

Expand Down
6 changes: 3 additions & 3 deletions python/ray/data/read_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
LanceDatasource,
MongoDatasource,
NumpyDatasource,
ParquetBaseDatasource,
ParquetBulkDatasource,
ParquetDatasource,
ParquetMetadataProvider,
PathPartitionFilter,
Expand Down Expand Up @@ -967,7 +967,7 @@ def read_parquet_bulk(
partition_filter: Optional[PathPartitionFilter] = None,
shuffle: Union[Literal["files"], None] = None,
include_paths: bool = False,
file_extensions: Optional[List[str]] = ParquetBaseDatasource._FILE_EXTENSIONS,
file_extensions: Optional[List[str]] = ParquetBulkDatasource._FILE_EXTENSIONS,
concurrency: Optional[int] = None,
override_num_blocks: Optional[int] = None,
**arrow_parquet_args,
Expand Down Expand Up @@ -1062,7 +1062,7 @@ def read_parquet_bulk(
if columns is not None:
read_table_args["columns"] = columns

datasource = ParquetBaseDatasource(
datasource = ParquetBulkDatasource(
paths,
read_table_args=read_table_args,
filesystem=filesystem,
Expand Down
4 changes: 2 additions & 2 deletions python/ray/data/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
DefaultFileMetadataProvider,
DefaultParquetMetadataProvider,
)
from ray.data.datasource.parquet_base_datasource import ParquetBaseDatasource
from ray.data.datasource.parquet_bulk_datasource import ParquetBulkDatasource
from ray.data.datasource.parquet_datasource import (
NUM_CPUS_FOR_META_FETCH_TASK,
PARALLELIZE_META_FETCH_THRESHOLD,
Expand Down Expand Up @@ -1092,7 +1092,7 @@ def test_parquet_datasource_names(ray_start_regular_shared, tmp_path):
path = os.path.join(tmp_path, "data.parquet")
df.to_parquet(path)

assert ParquetBaseDatasource(path).get_name() == "ParquetBulk"
assert ParquetBulkDatasource(path).get_name() == "ParquetBulk"
assert ParquetDatasource(path).get_name() == "Parquet"


Expand Down
Loading