From fa5c167717ab9a5c2d6fa0b468e38cea3922d3e8 Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Tue, 18 Jan 2022 06:11:20 -0800 Subject: [PATCH] Revert "[Dataset] [DataFrame 2/n] Add pandas block format implementation (partial) (#20988)" (#21661) This reverts commit 4a55d10bb1b70971f50a3872421f2c1eebd84e64. --- doc/examples/datasets_train/datasets_train.py | 11 +- python/ray/data/block.py | 7 +- python/ray/data/context.py | 18 +- python/ray/data/dataset.py | 85 ++++---- python/ray/data/impl/arrow_block.py | 2 +- .../ray/data/impl/delegating_block_builder.py | 4 - python/ray/data/impl/pandas_block.py | 188 ------------------ python/ray/data/impl/simple_block.py | 2 +- python/ray/data/read_api.py | 36 +--- python/ray/data/tests/test_dataset.py | 126 ++++-------- 10 files changed, 96 insertions(+), 383 deletions(-) delete mode 100644 python/ray/data/impl/pandas_block.py diff --git a/doc/examples/datasets_train/datasets_train.py b/doc/examples/datasets_train/datasets_train.py index bfe00f03cd6f..bc4625432208 100644 --- a/doc/examples/datasets_train/datasets_train.py +++ b/doc/examples/datasets_train/datasets_train.py @@ -273,7 +273,6 @@ def inference(dataset, model_cls: type, batch_size: int, result_path: str, model_cls, compute="actors", batch_size=batch_size, - batch_format="pandas", num_gpus=num_gpus, num_cpus=0) \ .write_parquet(result_path) @@ -579,8 +578,8 @@ def train_func(config): read_dataset(data_path)) num_columns = len(train_dataset.schema().names) - # remove label column. - num_features = num_columns - 1 + # remove label column and internal Arrow column. + num_features = num_columns - 2 NUM_EPOCHS = 2 BATCH_SIZE = 512 @@ -682,9 +681,9 @@ def __init__(self, load_model_func): self.model = load_model_func().to(self.device) def __call__(self, batch) -> "pd.DataFrame": - tensor = torch.FloatTensor(batch.values).to(self.device) - return pd.DataFrame( - self.model(tensor).cpu().detach().numpy(), columns=["value"]) + tensor = torch.FloatTensor(batch.to_pandas().values).to( + self.device) + return pd.DataFrame(self.model(tensor).cpu().detach().numpy()) inference_dataset = preprocessor.preprocess_inference_data( read_dataset(inference_path)) diff --git a/python/ray/data/block.py b/python/ray/data/block.py index f5b40622ca7d..6e7f58ae2e99 100644 --- a/python/ray/data/block.py +++ b/python/ray/data/block.py @@ -25,7 +25,7 @@ # # Block data can be accessed in a uniform way via ``BlockAccessors`` such as # ``SimpleBlockAccessor`` and ``ArrowBlockAccessor``. -Block = Union[List[T], "pyarrow.Table", "pandas.DataFrame", bytes] +Block = Union[List[T], "pyarrow.Table", bytes] # A list of block references pending computation by a single task. For example, # this may be the output of a task reading a file. 
@@ -196,16 +196,11 @@ def for_block(block: Block) -> "BlockAccessor[T]": """Create a block accessor for the given block.""" _check_pyarrow_version() import pyarrow - import pandas if isinstance(block, pyarrow.Table): from ray.data.impl.arrow_block import \ ArrowBlockAccessor return ArrowBlockAccessor(block) - elif isinstance(block, pandas.DataFrame): - from ray.data.impl.pandas_block import \ - PandasBlockAccessor - return PandasBlockAccessor(block) elif isinstance(block, bytes): from ray.data.impl.arrow_block import \ ArrowBlockAccessor diff --git a/python/ray/data/context.py b/python/ray/data/context.py index 3ac4cb068ece..7f59038115cd 100644 --- a/python/ray/data/context.py +++ b/python/ray/data/context.py @@ -14,10 +14,6 @@ # Whether block splitting is on by default DEFAULT_BLOCK_SPLITTING_ENABLED = False -# Whether pandas block format is enabled. -# TODO (kfstorm): Remove this once stable. -DEFAULT_ENABLE_PANDAS_BLOCK = True - @DeveloperAPI class DatasetContext: @@ -27,18 +23,12 @@ class DatasetContext: from the driver and remote workers via DatasetContext.get_current(). """ - def __init__( - self, - block_owner: ray.actor.ActorHandle, - block_splitting_enabled: bool, - target_max_block_size: int, - enable_pandas_block: bool, - ): + def __init__(self, block_owner: ray.actor.ActorHandle, + block_splitting_enabled: bool, target_max_block_size: int): """Private constructor (use get_current() instead).""" self.block_owner = block_owner self.block_splitting_enabled = block_splitting_enabled self.target_max_block_size = target_max_block_size - self.enable_pandas_block = enable_pandas_block @staticmethod def get_current() -> "DatasetContext": @@ -55,9 +45,7 @@ def get_current() -> "DatasetContext": _default_context = DatasetContext( block_owner=None, block_splitting_enabled=DEFAULT_BLOCK_SPLITTING_ENABLED, - target_max_block_size=DEFAULT_TARGET_MAX_BLOCK_SIZE, - enable_pandas_block=DEFAULT_ENABLE_PANDAS_BLOCK, - ) + target_max_block_size=DEFAULT_TARGET_MAX_BLOCK_SIZE) if _default_context.block_owner is None: owner = _DesignatedBlockOwner.options( diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py index 02da0214b7dd..fc2b804c1f82 100644 --- a/python/ray/data/dataset.py +++ b/python/ray/data/dataset.py @@ -44,7 +44,6 @@ from ray.data.impl.sort import sort_impl from ray.data.impl.block_list import BlockList from ray.data.impl.lazy_block_list import LazyBlockList -from ray.data.impl.table_block import TableRow from ray.data.impl.delegating_block_builder import DelegatingBlockBuilder # An output type of iter_batches() determined by the batch_format parameter. @@ -231,9 +230,11 @@ def transform(block: Block) -> Iterable[Block]: "or 'pyarrow', got: {}".format(batch_format)) applied = fn(view) - if not (isinstance(applied, list) - or isinstance(applied, pa.Table) - or isinstance(applied, pd.core.frame.DataFrame)): + if isinstance(applied, list) or isinstance(applied, pa.Table): + applied = applied + elif isinstance(applied, pd.core.frame.DataFrame): + applied = pa.Table.from_pandas(applied) + else: raise ValueError("The map batches UDF returned the value " f"{applied}, which is not allowed. " "The return type must be either list, " @@ -402,15 +403,12 @@ def repartition(self, num_blocks: int, *, # Handle empty blocks. 
if len(new_blocks) < num_blocks: from ray.data.impl.arrow_block import ArrowBlockBuilder - from ray.data.impl.pandas_block import PandasBlockBuilder from ray.data.impl.simple_block import SimpleBlockBuilder num_empties = num_blocks - len(new_blocks) dataset_format = self._dataset_format() if dataset_format == "arrow": builder = ArrowBlockBuilder() - elif dataset_format == "pandas": - builder = PandasBlockBuilder() else: builder = SimpleBlockBuilder() empty_block = builder.build() @@ -941,7 +939,7 @@ def _check_and_normalize_agg_on(self, # Dataset is empty/cleared, let downstream ops handle this. return on - if dataset_format == "arrow" or dataset_format == "pandas": + if dataset_format == "arrow": # This should be cached from the ._dataset_format() check, so we # don't fetch and we assert that the schema is not None. schema = self.schema(fetch_if_missing=False) @@ -974,35 +972,32 @@ def _check_and_normalize_agg_on(self, and isinstance(on[0], str)): raise ValueError( "Can't aggregate on a column when using a simple Dataset; " - "use a callable `on` argument or use an Arrow or Pandas" - " Dataset instead of a simple Dataset.") + "use a callable `on` argument or use an Arrow Dataset " + "instead of a simple Dataset.") return on def _dataset_format(self) -> str: """Determine the format of the dataset. Possible values are: "arrow", - "pandas", "simple". + "simple". This may block; if the schema is unknown, this will synchronously fetch the schema for the first block. """ - # We need schema to properly validate, so synchronously - # fetch it if necessary. - schema = self.schema(fetch_if_missing=True) - if schema is None: - raise ValueError( - "Dataset is empty or cleared, can't determine the format of " - "the dataset.") - try: import pyarrow as pa + except ModuleNotFoundError: + return "simple" + else: + # We need schema to properly validate, so synchronously + # fetch it if necessary. + schema = self.schema(fetch_if_missing=True) + if schema is None: + raise ValueError( + "Dataset is empty or cleared, can't determine the format" + " of the dataset") if isinstance(schema, pa.Schema): return "arrow" - except ModuleNotFoundError: - pass - from ray.data.impl.pandas_block import PandasBlockSchema - if isinstance(schema, PandasBlockSchema): - return "pandas" - return "simple" + return "simple" def _aggregate_on(self, agg_cls: type, on: Optional["AggregateOnTs"], *args, **kwargs): @@ -1031,18 +1026,6 @@ def _build_multicolumn_aggs(self, on = [on] return [agg_cls(on_, *args, **kwargs) for on_ in on] - def _aggregate_result(self, result: Union[Tuple, TableRow]) -> U: - if len(result) == 1: - if isinstance(result, tuple): - return result[0] - else: - # NOTE (kfstorm): We cannot call `result[0]` directly on - # `PandasRow` because indexing a column with position is not - # supported by pandas. - return list(result.values())[0] - else: - return result - def sum(self, on: Optional["AggregateOnTs"] = None) -> U: """Compute sum over entire dataset. @@ -1093,8 +1076,10 @@ def sum(self, on: Optional["AggregateOnTs"] = None) -> U: ret = self._aggregate_on(Sum, on) if ret is None: return 0 + elif len(ret) == 1: + return ret[0] else: - return self._aggregate_result(ret) + return ret def min(self, on: Optional["AggregateOnTs"] = None) -> U: """Compute minimum over entire dataset. 
@@ -1146,8 +1131,10 @@ def min(self, on: Optional["AggregateOnTs"] = None) -> U: ret = self._aggregate_on(Min, on) if ret is None: raise ValueError("Cannot compute min on an empty dataset") + elif len(ret) == 1: + return ret[0] else: - return self._aggregate_result(ret) + return ret def max(self, on: Optional["AggregateOnTs"] = None) -> U: """Compute maximum over entire dataset. @@ -1199,8 +1186,10 @@ def max(self, on: Optional["AggregateOnTs"] = None) -> U: ret = self._aggregate_on(Max, on) if ret is None: raise ValueError("Cannot compute max on an empty dataset") + elif len(ret) == 1: + return ret[0] else: - return self._aggregate_result(ret) + return ret def mean(self, on: Optional["AggregateOnTs"] = None) -> U: """Compute mean over entire dataset. @@ -1252,8 +1241,10 @@ def mean(self, on: Optional["AggregateOnTs"] = None) -> U: ret = self._aggregate_on(Mean, on) if ret is None: raise ValueError("Cannot compute mean on an empty dataset") + elif len(ret) == 1: + return ret[0] else: - return self._aggregate_result(ret) + return ret def std(self, on: Optional["AggregateOnTs"] = None, ddof: int = 1) -> U: """Compute standard deviation over entire dataset. @@ -1315,8 +1306,10 @@ def std(self, on: Optional["AggregateOnTs"] = None, ddof: int = 1) -> U: ret = self._aggregate_on(Std, on, ddof=ddof) if ret is None: raise ValueError("Cannot compute std on an empty dataset") + elif len(ret) == 1: + return ret[0] else: - return self._aggregate_result(ret) + return ret def sort(self, key: Union[None, str, List[str], Callable[[T], Any]] = None, @@ -2271,10 +2264,10 @@ def to_spark(self, def to_pandas(self, limit: int = 100000) -> "pandas.DataFrame": """Convert this dataset into a single Pandas DataFrame. - This is only supported for datasets convertible to Arrow or Pandas - records. An error is raised if the number of records exceeds the - provided limit. Note that you can use ``.limit()`` on the dataset - beforehand to truncate the dataset manually. + This is only supported for datasets convertible to Arrow records. An + error is raised if the number of records exceeds the provided limit. + Note that you can use ``.limit()`` on the dataset beforehand to + truncate the dataset manually. 
Time complexity: O(dataset size) diff --git a/python/ray/data/impl/arrow_block.py b/python/ray/data/impl/arrow_block.py index 3a5c16108a7e..86500e2ec73e 100644 --- a/python/ray/data/impl/arrow_block.py +++ b/python/ray/data/impl/arrow_block.py @@ -79,7 +79,7 @@ def slice(self, start: int, end: int, copy: bool) -> "pyarrow.Table": view = _copy_table(view) return view - def random_shuffle(self, random_seed: Optional[int]) -> "pyarrow.Table": + def random_shuffle(self, random_seed: Optional[int]) -> List[T]: random = np.random.RandomState(random_seed) return self._table.take(random.permutation(self.num_rows())) diff --git a/python/ray/data/impl/delegating_block_builder.py b/python/ray/data/impl/delegating_block_builder.py index c173f3042712..ec5b3973ee46 100644 --- a/python/ray/data/impl/delegating_block_builder.py +++ b/python/ray/data/impl/delegating_block_builder.py @@ -4,7 +4,6 @@ from ray.data.impl.block_builder import BlockBuilder from ray.data.impl.simple_block import SimpleBlockBuilder from ray.data.impl.arrow_block import ArrowRow, ArrowBlockBuilder -from ray.data.impl.pandas_block import PandasRow, PandasBlockBuilder class DelegatingBlockBuilder(BlockBuilder[T]): @@ -14,7 +13,6 @@ def __init__(self): def add(self, item: Any) -> None: if self._builder is None: - # TODO (kfstorm): Maybe we can use Pandas block format for dict. if isinstance(item, dict) or isinstance(item, ArrowRow): import pyarrow try: @@ -24,8 +22,6 @@ def add(self, item: Any) -> None: self._builder = ArrowBlockBuilder() except (TypeError, pyarrow.lib.ArrowInvalid): self._builder = SimpleBlockBuilder() - elif isinstance(item, PandasRow): - self._builder = PandasBlockBuilder() else: self._builder = SimpleBlockBuilder() self._builder.add(item) diff --git a/python/ray/data/impl/pandas_block.py b/python/ray/data/impl/pandas_block.py deleted file mode 100644 index ffbbcfbaa83a..000000000000 --- a/python/ray/data/impl/pandas_block.py +++ /dev/null @@ -1,188 +0,0 @@ -from typing import Dict, List, Tuple, Any, TypeVar, Optional, TYPE_CHECKING - -import collections -import numpy as np - -try: - import pandas -except ImportError: - pandas = None - -from ray.data.block import BlockAccessor, BlockMetadata -from ray.data.impl.table_block import TableBlockAccessor, TableRow, \ - TableBlockBuilder, SortKeyT, GroupKeyT -from ray.data.impl.arrow_block import ArrowBlockAccessor -from ray.data.aggregate import AggregateFn - -if TYPE_CHECKING: - import pyarrow - import pandas - -T = TypeVar("T") - - -class PandasRow(TableRow): - def as_pydict(self) -> dict: - return {k: v[0] for k, v in self._row.to_dict("list").items()} - - def __getitem__(self, key: str) -> Any: - assert isinstance(key, str) - col = self._row[key] - if len(col) == 0: - return None - item = col.iloc[0] - try: - # Try to interpret this as a numpy-type value. - # See https://stackoverflow.com/questions/9452775/converting-numpy-dtypes-to-native-python-types. # noqa: E501 - return item.item() - except AttributeError: - # Fallback to the original form. 
- return item - - def __len__(self): - return self._row.shape[1] - - -class PandasBlockBuilder(TableBlockBuilder[T]): - def __init__(self): - if pandas is None: - raise ImportError("Run `pip install pandas` for Pandas support.") - super().__init__(pandas.DataFrame) - - def _table_from_pydict( - self, columns: Dict[str, List[Any]]) -> "pandas.DataFrame": - return pandas.DataFrame(columns) - - def _concat_tables(self, - tables: List["pandas.DataFrame"]) -> "pandas.DataFrame": - return pandas.concat(tables, ignore_index=True) - - @staticmethod - def _empty_table() -> "pandas.DataFrame": - return pandas.DataFrame() - - -# This is to be compatible with pyarrow.lib.schema -# TODO (kfstorm): We need a format-independent way to represent schema. -PandasBlockSchema = collections.namedtuple("PandasBlockSchema", - ["names", "types"]) - - -class PandasBlockAccessor(TableBlockAccessor): - def __init__(self, table: "pandas.DataFrame"): - if pandas is None: - raise ImportError("Run `pip install pandas` for Pandas support.") - super().__init__(table) - - def _create_table_row(self, row: "pandas.DataFrame") -> PandasRow: - return PandasRow(row) - - def slice(self, start: int, end: int, copy: bool) -> "pandas.DataFrame": - view = self._table[start:end] - if copy: - view = view.copy(deep=True) - return view - - def random_shuffle(self, random_seed: Optional[int]) -> "pandas.DataFrame": - return self._table.sample(frac=1, random_state=random_seed) - - def schema(self) -> PandasBlockSchema: - dtypes = self._table.dtypes - schema = PandasBlockSchema( - names=dtypes.index.tolist(), types=dtypes.values.tolist()) - # Column names with non-str types of a pandas DataFrame is not - # supported by Ray Dataset. - if any(not isinstance(name, str) for name in schema.names): - raise ValueError( - "A Pandas DataFrame with column names of non-str types" - " is not supported by Ray Dataset. Column names of this" - f" DataFrame: {schema.names!r}.") - return schema - - def to_pandas(self) -> "pandas.DataFrame": - return self._table - - def to_numpy(self, column: str = None) -> np.ndarray: - if not column: - raise ValueError( - "`column` must be specified when calling .to_numpy() " - "on Pandas blocks.") - if column not in self._table.columns: - raise ValueError( - "Cannot find column {}, available columns: {}".format( - column, self._table.columns.tolist())) - return self._table[column].to_numpy() - - def to_arrow(self) -> "pyarrow.Table": - import pyarrow - return pyarrow.table(self._table) - - def num_rows(self) -> int: - return self._table.shape[0] - - def size_bytes(self) -> int: - return self._table.memory_usage(index=True, deep=True).sum() - - def _zip(self, acc: BlockAccessor) -> "pandas.DataFrame": - r = self.to_pandas().copy(deep=False) - s = acc.to_pandas() - for col_name in s.columns: - col = s[col_name] - # Ensure the column names are unique after zip. 
- if col_name in r.column_names: - i = 1 - new_name = col_name - while new_name in r.column_names: - new_name = "{}_{}".format(col_name, i) - i += 1 - col_name = new_name - r[col_name] = col - return r - - @staticmethod - def builder() -> PandasBlockBuilder[T]: - return PandasBlockBuilder() - - @staticmethod - def _empty_table() -> "pandas.DataFrame": - return PandasBlockBuilder._empty_table() - - def _sample(self, n_samples: int, key: SortKeyT) -> "pandas.DataFrame": - return self._table[[k[0] for k in key]].sample( - n_samples, ignore_index=True) - - def sort_and_partition(self, boundaries: List[T], key: SortKeyT, - descending: bool) -> List["pandas.DataFrame"]: - # TODO (kfstorm): A workaround to pass tests. Not efficient. - delegated_result = BlockAccessor.for_block( - self.to_arrow()).sort_and_partition(boundaries, key, descending) - return [ - BlockAccessor.for_block(_).to_pandas() for _ in delegated_result - ] - - def combine(self, key: GroupKeyT, - aggs: Tuple[AggregateFn]) -> "pandas.DataFrame": - # TODO (kfstorm): A workaround to pass tests. Not efficient. - return BlockAccessor.for_block(self.to_arrow()).combine( - key, aggs).to_pandas() - - @staticmethod - def merge_sorted_blocks( - blocks: List["pandas.DataFrame"], key: SortKeyT, - _descending: bool) -> Tuple["pandas.DataFrame", BlockMetadata]: - # TODO (kfstorm): A workaround to pass tests. Not efficient. - block, metadata = ArrowBlockAccessor.merge_sorted_blocks( - [BlockAccessor.for_block(block).to_arrow() for block in blocks], - key, _descending) - return BlockAccessor.for_block(block).to_pandas(), metadata - - @staticmethod - def aggregate_combined_blocks( - blocks: List["pandas.DataFrame"], key: GroupKeyT, - aggs: Tuple[AggregateFn] - ) -> Tuple["pandas.DataFrame", BlockMetadata]: - # TODO (kfstorm): A workaround to pass tests. Not efficient. 
- block, metadata = ArrowBlockAccessor.aggregate_combined_blocks( - [BlockAccessor.for_block(block).to_arrow() for block in blocks], - key, aggs) - return BlockAccessor.for_block(block).to_pandas(), metadata diff --git a/python/ray/data/impl/simple_block.py b/python/ray/data/impl/simple_block.py index 7b90bfd78d14..7006eb7cf01d 100644 --- a/python/ray/data/impl/simple_block.py +++ b/python/ray/data/impl/simple_block.py @@ -71,7 +71,7 @@ def random_shuffle(self, random_seed: Optional[int]) -> List[T]: def to_pandas(self) -> "pandas.DataFrame": import pandas - return pandas.DataFrame({"value": self._items}) + return pandas.DataFrame(self._items) def to_numpy(self, column: str = None) -> np.ndarray: if column: diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py index 9ac814dccfaf..87f96c30486f 100644 --- a/python/ray/data/read_api.py +++ b/python/ray/data/read_api.py @@ -534,22 +534,8 @@ def from_dask(df: "dask.DataFrame") -> Dataset[ArrowRow]: partitions = df.to_delayed() persisted_partitions = dask.persist(*partitions, scheduler=ray_dask_get) - - import pandas - - def to_ref(df): - if isinstance(df, pandas.DataFrame): - return ray.put(df) - elif isinstance(df, ray.ObjectRef): - return df - else: - raise ValueError( - "Expected a Ray object ref or a Pandas DataFrame, " - f"got {type(df)}") - - return from_pandas_refs([ - to_ref(next(iter(part.dask.values()))) for part in persisted_partitions - ]) + return from_pandas_refs( + [next(iter(part.dask.values())) for part in persisted_partitions]) @PublicAPI(stability="beta") @@ -614,21 +600,6 @@ def from_pandas_refs(dfs: Union[ObjectRef["pandas.DataFrame"], List[ObjectRef[ """ if isinstance(dfs, ray.ObjectRef): dfs = [dfs] - elif isinstance(dfs, list): - for df in dfs: - if not isinstance(df, ray.ObjectRef): - raise ValueError("Expected list of Ray object refs, " - f"got list containing {type(df)}") - else: - raise ValueError("Expected Ray object ref or list of Ray object refs, " - f"got {type(df)}") - - context = DatasetContext.get_current() - if context.enable_pandas_block: - get_metadata = cached_remote_fn(_get_metadata) - metadata = [get_metadata.remote(df) for df in dfs] - return Dataset( - BlockList(dfs, ray.get(metadata)), 0, DatasetStats.TODO()) df_to_block = cached_remote_fn(_df_to_block, num_returns=2) @@ -736,8 +707,7 @@ def _ndarray_to_block(ndarray: np.ndarray) -> Block[np.ndarray]: input_files=None, exec_stats=stats.build())) -def _get_metadata( - table: Union["pyarrow.Table", "pandas.DataFrame"]) -> BlockMetadata: +def _get_metadata(table: "pyarrow.Table") -> BlockMetadata: stats = BlockExecStats.builder() return BlockAccessor.for_block(table).get_metadata( input_files=None, exec_stats=stats.build()) diff --git a/python/ray/data/tests/test_dataset.py b/python/ray/data/tests/test_dataset.py index b003e7eca2e6..f3416a7e9566 100644 --- a/python/ray/data/tests/test_dataset.py +++ b/python/ray/data/tests/test_dataset.py @@ -329,7 +329,7 @@ def test_batch_tensors(ray_start_regular_shared): with pytest.raises(pa.lib.ArrowInvalid): next(ds.iter_batches(batch_format="pyarrow")) df = next(ds.iter_batches(batch_format="pandas")) - assert df.to_dict().keys() == {"value"} + assert df.to_dict().keys() == {0, 1} def test_arrow_block_slice_copy(): @@ -1156,56 +1156,34 @@ def test_repartition_shuffle_arrow(ray_start_regular_shared): assert large._block_num_rows() == [500] * 20 -@pytest.mark.parametrize("enable_pandas_block", [False, True]) -def test_from_pandas(ray_start_regular_shared, enable_pandas_block): - ctx = 
ray.data.context.DatasetContext.get_current() - old_enable_pandas_block = ctx.enable_pandas_block - ctx.enable_pandas_block = enable_pandas_block - try: - df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) - df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) - ds = ray.data.from_pandas([df1, df2]) - assert ds._dataset_format( - ) == "pandas" if enable_pandas_block else "arrow" - values = [(r["one"], r["two"]) for r in ds.take(6)] - rows = [(r.one, r.two) for _, r in pd.concat([df1, df2]).iterrows()] - assert values == rows - - # test from single pandas dataframe - ds = ray.data.from_pandas(df1) - assert ds._dataset_format( - ) == "pandas" if enable_pandas_block else "arrow" - values = [(r["one"], r["two"]) for r in ds.take(3)] - rows = [(r.one, r.two) for _, r in df1.iterrows()] - assert values == rows - finally: - ctx.enable_pandas_block = old_enable_pandas_block +def test_from_pandas(ray_start_regular_shared): + df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) + df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) + ds = ray.data.from_pandas([df1, df2]) + values = [(r["one"], r["two"]) for r in ds.take(6)] + rows = [(r.one, r.two) for _, r in pd.concat([df1, df2]).iterrows()] + assert values == rows + # test from single pandas dataframe + ds = ray.data.from_pandas(df1) + values = [(r["one"], r["two"]) for r in ds.take(3)] + rows = [(r.one, r.two) for _, r in df1.iterrows()] + assert values == rows -@pytest.mark.parametrize("enable_pandas_block", [False, True]) -def test_from_pandas_refs(ray_start_regular_shared, enable_pandas_block): - ctx = ray.data.context.DatasetContext.get_current() - old_enable_pandas_block = ctx.enable_pandas_block - ctx.enable_pandas_block = enable_pandas_block - try: - df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) - df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) - ds = ray.data.from_pandas_refs([ray.put(df1), ray.put(df2)]) - assert ds._dataset_format( - ) == "pandas" if enable_pandas_block else "arrow" - values = [(r["one"], r["two"]) for r in ds.take(6)] - rows = [(r.one, r.two) for _, r in pd.concat([df1, df2]).iterrows()] - assert values == rows - - # test from single pandas dataframe ref - ds = ray.data.from_pandas_refs(ray.put(df1)) - assert ds._dataset_format( - ) == "pandas" if enable_pandas_block else "arrow" - values = [(r["one"], r["two"]) for r in ds.take(3)] - rows = [(r.one, r.two) for _, r in df1.iterrows()] - assert values == rows - finally: - ctx.enable_pandas_block = old_enable_pandas_block + +def test_from_pandas_refs(ray_start_regular_shared): + df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) + df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) + ds = ray.data.from_pandas_refs([ray.put(df1), ray.put(df2)]) + values = [(r["one"], r["two"]) for r in ds.take(6)] + rows = [(r.one, r.two) for _, r in pd.concat([df1, df2]).iterrows()] + assert values == rows + + # test from single pandas dataframe ref + ds = ray.data.from_pandas_refs(ray.put(df1)) + values = [(r["one"], r["two"]) for r in ds.take(3)] + rows = [(r.one, r.two) for _, r in df1.iterrows()] + assert values == rows def test_from_numpy(ray_start_regular_shared): @@ -1316,7 +1294,7 @@ def test_to_arrow_refs(ray_start_regular_shared): assert df.equals(dfds) # Conversion. 
- df = pd.DataFrame({"value": list(range(n))}) + df = pd.DataFrame({0: list(range(n))}) ds = ray.data.range(n) dfds = pd.concat( [t.to_pandas() for t in ray.get(ds.to_arrow_refs())], @@ -1699,8 +1677,8 @@ def test_parquet_write_with_udf(ray_start_regular_shared, tmp_path): df = pd.concat([df1, df2]) ds = ray.data.from_pandas([df1, df2]) - def _block_udf(block): - df = BlockAccessor.for_block(block).to_pandas().copy() + def _block_udf(block: pa.Table): + df = block.to_pandas() df["one"] += 1 return pa.Table.from_pandas(df) @@ -1887,7 +1865,7 @@ def test_iter_batches_basic(ray_start_regular_shared): # blocks format. for batch, df in zip(ds.iter_batches(batch_format="native"), dfs): - assert BlockAccessor.for_block(batch).to_pandas().equals(df) + assert batch.to_pandas().equals(df) # Batch size. batch_size = 2 @@ -2049,10 +2027,8 @@ def test_map_batch(ray_start_regular_shared, tmp_path): table = pa.Table.from_pandas(df) pq.write_table(table, os.path.join(tmp_path, "test1.parquet")) ds = ray.data.read_parquet(str(tmp_path)) - ds2 = ds.map_batches( - lambda df: df + 1, batch_size=1, batch_format="pandas") - assert ds2._dataset_format() == "pandas" - ds_list = ds2.take() + ds_list = ds.map_batches( + lambda df: df + 1, batch_size=1, batch_format="pandas").take() values = [s["one"] for s in ds_list] assert values == [2, 3, 4] values = [s["two"] for s in ds_list] @@ -2060,9 +2036,8 @@ def test_map_batch(ray_start_regular_shared, tmp_path): # Test Pyarrow ds = ray.data.read_parquet(str(tmp_path)) - ds2 = ds.map_batches(lambda pa: pa, batch_size=1, batch_format="pyarrow") - assert ds2._dataset_format() == "arrow" - ds_list = ds2.take() + ds_list = ds.map_batches( + lambda pa: pa, batch_size=1, batch_format="pyarrow").take() values = [s["one"] for s in ds_list] assert values == [1, 2, 3] values = [s["two"] for s in ds_list] @@ -2071,31 +2046,27 @@ def test_map_batch(ray_start_regular_shared, tmp_path): # Test batch size = 300 ds = ray.data.range(size) - ds2 = ds.map_batches( - lambda df: df + 1, batch_size=17, batch_format="pandas") - assert ds2._dataset_format() == "pandas" - ds_list = ds2.take(limit=size) + ds_list = ds.map_batches( + lambda df: df + 1, batch_size=17, + batch_format="pandas").take(limit=size) for i in range(size): - # The pandas column is "value", and it originally has rows from 0~299. + # The pandas column is "0", and it originally has rows from 0~299. # After the map batch, it should have 1~300. 
row = ds_list[i] - assert row["value"] == i + 1 + assert row["0"] == i + 1 assert ds.count() == 300 # Test the lambda returns different types than the batch_format # pandas => list block ds = ray.data.read_parquet(str(tmp_path)) - ds2 = ds.map_batches(lambda df: [1], batch_size=1) - assert ds2._dataset_format() == "simple" - ds_list = ds2.take() + ds_list = ds.map_batches(lambda df: [1], batch_size=1).take() assert ds_list == [1, 1, 1] assert ds.count() == 3 # pyarrow => list block ds = ray.data.read_parquet(str(tmp_path)) - ds2 = ds.map_batches(lambda df: [1], batch_size=1, batch_format="pyarrow") - assert ds2._dataset_format() == "simple" - ds_list = ds2.take() + ds_list = ds.map_batches( + lambda df: [1], batch_size=1, batch_format="pyarrow").take() assert ds_list == [1, 1, 1] assert ds.count() == 3 @@ -3676,17 +3647,6 @@ def test_sort_simple(ray_start_regular_shared): assert ds.count() == 0 -def test_column_name_type_check(ray_start_regular_shared): - df = pd.DataFrame({"1": np.random.rand(10), "a": np.random.rand(10)}) - ds = ray.data.from_pandas(df) - expected_str = ("Dataset(num_blocks=1, num_rows=10, " - "schema={1: float64, a: float64})") - assert str(ds) == expected_str, str(ds) - df = pd.DataFrame({1: np.random.rand(10), "a": np.random.rand(10)}) - with pytest.raises(ValueError): - ray.data.from_pandas(df) - - @pytest.mark.parametrize("pipelined", [False, True]) def test_random_shuffle(shutdown_only, pipelined): def range(n, parallelism=200):