[Datasets] Re-enable Parquet sampling and add progress bar #28021

Merged · 2 commits · Aug 22, 2022
python/ray/data/context.py (2 changes: 1 addition & 1 deletion)
@@ -66,7 +66,7 @@
 DEFAULT_USE_POLARS = False

 # Whether to estimate in-memory decoding data size for data source.
-DEFAULT_DECODING_SIZE_ESTIMATION_ENABLED = False
+DEFAULT_DECODING_SIZE_ESTIMATION_ENABLED = True

 # Whether to automatically cast NumPy ndarray columns in Pandas DataFrames to tensor
 # extension columns.
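For context, flipping this default back to True makes Datasets estimate the in-memory (decoded) size of Parquet input via sampling rather than relying on on-disk file sizes. A minimal sketch of how a user could opt back out, assuming the flag is surfaced on DatasetContext as decoding_size_estimation; the read path is hypothetical:

```python
import ray
from ray.data.context import DatasetContext

# Sketch only: disable in-memory decoding size estimation again, assuming the
# default above is exposed as `decoding_size_estimation` on the current context.
ctx = DatasetContext.get_current()
ctx.decoding_size_estimation = False  # fall back to on-disk Parquet file sizes

# Hypothetical path, for illustration only.
ds = ray.data.read_parquet("/tmp/data.parquet")
```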
python/ray/data/datasource/parquet_datasource.py (14 changes: 3 additions & 11 deletions)
@@ -1,11 +1,9 @@
 import itertools
 import logging
-import time
 from typing import TYPE_CHECKING, Callable, Iterator, List, Optional, Union

 import numpy as np

 import ray
 from ray.data._internal.output_buffer import BlockOutputBuffer
 from ray.data._internal.progress_bar import ProgressBar
 from ray.data._internal.remote_fn import cached_remote_fn
@@ -292,7 +290,6 @@ def _estimate_files_encoding_ratio(self) -> float:
         # Launch tasks to sample multiple files remotely in parallel.
         # Evenly distributed to sample N rows in i-th row group in i-th file.
         # TODO(ekl/cheng) take into account column pruning.
-        start_time = time.perf_counter()
         num_files = len(self._pq_ds.pieces)
         num_samples = int(num_files * PARQUET_ENCODING_RATIO_ESTIMATE_SAMPLING_RATIO)
         min_num_samples = min(
@@ -321,15 +318,10 @@ def _estimate_files_encoding_ratio(self) -> float:
                     _SerializedPiece(sample), idx
                 )
             )
-        sample_ratios = ray.get(futures)
+        sample_bar = ProgressBar("Parquet Files Sample", len(futures))
+        sample_ratios = sample_bar.fetch_until_complete(futures)
+        sample_bar.close()
         ratio = np.mean(sample_ratios)

-        sampling_duration = time.perf_counter() - start_time
-        if sampling_duration > 5:
-            logger.info(
-                "Parquet input size estimation took "
-                f"{round(sampling_duration, 2)} seconds."
-            )
         logger.debug(f"Estimated Parquet encoding ratio from sampling is {ratio}.")
         return max(ratio, PARQUET_ENCODING_RATIO_ESTIMATE_LOWER_BOUND)

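The behavioral change in _estimate_files_encoding_ratio is that the sampling futures are now drained through a ProgressBar rather than a bare ray.get(futures), so users see a "Parquet Files Sample" bar while the per-file encoding ratios are computed. A self-contained sketch of that pattern; the remote task below is a hypothetical stand-in for the real per-file sampling task:

```python
import ray
from ray.data._internal.progress_bar import ProgressBar

ray.init(ignore_reinit_error=True)

@ray.remote
def sample_file(idx: int) -> float:
    # Hypothetical stand-in for the per-file encoding-ratio sampling task.
    return 1.0 + 0.1 * idx

futures = [sample_file.remote(i) for i in range(8)]

# Same pattern as the diff: a named progress bar that blocks until all
# sampling futures complete, updating as each task finishes.
sample_bar = ProgressBar("Parquet Files Sample", len(futures))
sample_ratios = sample_bar.fetch_until_complete(futures)
sample_bar.close()

print(sum(sample_ratios) / len(sample_ratios))
```

Note that ProgressBar lives under ray.data._internal, so its module path and signature may change between Ray releases.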