From b51aa7445c2fd3fd85e2d0cca495aaec16fd292b Mon Sep 17 00:00:00 2001
From: Balaji Veeramani
Date: Tue, 26 Jul 2022 00:41:53 -0700
Subject: [PATCH 01/30] Add `__repr__` for checkpoints

---
 python/ray/air/checkpoint.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/ray/air/checkpoint.py b/python/ray/air/checkpoint.py
index 00f3a5716f91..f849aa52eba1 100644
--- a/python/ray/air/checkpoint.py
+++ b/python/ray/air/checkpoint.py
@@ -209,6 +209,10 @@ def __init__(
         self._uri: Optional[str] = uri
         self._obj_ref: Optional[ray.ObjectRef] = obj_ref
 
+    def __repr__(self):
+        parameter, argument = self.get_internal_representation()
+        return f"{self.__class__.__name__}({parameter}={argument})"
+
     @classmethod
     def from_bytes(cls, data: bytes) -> "Checkpoint":
         """Create a checkpoint from the given byte string.

From ca2e9a748d5d9bcb62c07f167036e8b7b1f9e991 Mon Sep 17 00:00:00 2001
From: Balaji Veeramani
Date: Tue, 26 Jul 2022 01:36:48 -0700
Subject: [PATCH 02/30] Add config `__repr__`s

---
 python/ray/air/config.py | 86 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 84 insertions(+), 2 deletions(-)

diff --git a/python/ray/air/config.py b/python/ray/air/config.py
index d7970e9e2657..a1dcedc7fb40 100644
--- a/python/ray/air/config.py
+++ b/python/ray/air/config.py
@@ -1,5 +1,15 @@
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Callable, Dict, List, Mapping, Optional, Union, Tuple
+from dataclasses import _MISSING_TYPE, dataclass, fields
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    List,
+    Mapping,
+    Optional,
+    Union,
+    Tuple,
+)
 
 from ray.air.constants import WILDCARD_KEY
 from ray.util.annotations import PublicAPI
@@ -27,6 +37,54 @@
 MIN = "min"
 
 
+def _repr_dataclass(obj, *, default_values: Optional[Dict[str, Any]] = None) -> str:
+    """A utility function to elegantly represent dataclasses.
+
+    In contrast to the default dataclass `__repr__`, which shows all parameters, this
+    function only shows parameters with non-default values.
+
+    Args:
+        obj: The dataclass to represent.
+        default_values: An optional dictionary that maps field names to default values.
+            Use this parameter to specify default values that are generated dynamically
+            (e.g., in `__post_init__` or by a `default_factory`). If a default value
+            isn't specified in `default_values`, then the default value is inferred from
+            the `dataclass`.
+
+    Returns:
+        A representation of the dataclass.
+
+    Examples:
+        >>> @dataclass
+        >>> class A:
+        >>>     x: List = None
+        >>>     def __post_init__(self):
+        >>>         self.x = []
+        >>> a = A()
+        >>> repr(a)
+        A()
+    """
+    if default_values is None:
+        default_values = {}
+
+    non_default_values = {}  # Maps field name to value.
+
+    for field in fields(obj):
+        value = getattr(obj, field.name)
+        default_value = default_values.get(field.name, field.default)
+        is_required = isinstance(field.default, _MISSING_TYPE)
+        if is_required or value != default_value:
+            non_default_values[field.name] = value
+
+    string = f"{obj.__class__.__name__}("
+    string += ", ".join(
+        f"{name}={repr(value)}" for name, value in non_default_values.items()
+    )
+    string += ")"
+
+    return string
+
+
 @dataclass
 @PublicAPI(stability="alpha")
 class ScalingConfig:
@@ -86,6 +144,9 @@ def __post_init__(self):
                 "`resources_per_worker."
             )
 
+    def __repr__(self):
+        return _repr_dataclass(self)
+
     def __eq__(self, o: "ScalingConfig") -> bool:
         if not isinstance(o, type(self)):
             return False
@@ -262,6 +323,9 @@ class DatasetConfig:
     # True by default.
randomize_block_order: Optional[bool] = None + def __repr__(self): + return _repr_dataclass(self) + def fill_defaults(self) -> "DatasetConfig": """Return a copy of this config with all default values filled in.""" return DatasetConfig( @@ -394,6 +458,9 @@ def __post_init__(self): "fail_fast must be one of {bool, 'raise'}. " f"Got {self.fail_fast}." ) + def __repr__(self): + return _repr_dataclass(self) + @dataclass @PublicAPI(stability="alpha") @@ -458,6 +525,9 @@ def __post_init__(self): f"checkpoint_frequency must be >=0, got {self.checkpoint_frequency}" ) + def __repr__(self): + return _repr_dataclass(self) + @property def _tune_legacy_checkpoint_score_attr(self) -> Optional[str]: """Same as ``checkpoint_score_attr`` in ``tune.run``. @@ -540,3 +610,15 @@ def __post_init__(self): if not self.checkpoint_config: self.checkpoint_config = CheckpointConfig() + + def __repr__(self): + from ray.tune.syncer import SyncConfig + + return _repr_dataclass( + self, + default_values={ + "failure_config": FailureConfig(), + "sync_config": SyncConfig(), + "checkpoint_config": CheckpointConfig(), + }, + ) From 50b1b047c38819377b9d57f4116dec0e7ce71d50 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 02:10:38 -0700 Subject: [PATCH 03/30] Add preprocessor `__repr__`s --- python/ray/data/preprocessors/chain.py | 2 +- python/ray/data/preprocessors/concatenator.py | 19 ++++++++----- python/ray/data/preprocessors/encoder.py | 27 ++++++++++--------- python/ray/data/preprocessors/imputer.py | 14 +++++----- python/ray/data/preprocessors/normalizer.py | 5 +++- python/ray/data/preprocessors/scaler.py | 26 ++++++++---------- python/ray/data/preprocessors/transformer.py | 10 +++---- python/ray/data/preprocessors/vectorizer.py | 26 +++++++++--------- 8 files changed, 65 insertions(+), 64 deletions(-) diff --git a/python/ray/data/preprocessors/chain.py b/python/ray/data/preprocessors/chain.py index d11852815356..97ed79c90e63 100644 --- a/python/ray/data/preprocessors/chain.py +++ b/python/ray/data/preprocessors/chain.py @@ -71,4 +71,4 @@ def _transform_batch(self, df: "DataBatchType") -> "DataBatchType": return df def __repr__(self): - return f"Chain(preprocessors={self.preprocessors})" + return f"Chain({', '.join(repr(preprocessor) for preprocessor in self.preprocessors)})" diff --git a/python/ray/data/preprocessors/concatenator.py b/python/ray/data/preprocessors/concatenator.py index f0f54d6e1bb7..7ed74a76e6d0 100644 --- a/python/ray/data/preprocessors/concatenator.py +++ b/python/ray/data/preprocessors/concatenator.py @@ -1,4 +1,5 @@ from typing import List, Optional +from xml.etree.ElementInclude import include import numpy as np import pandas as pd @@ -98,9 +99,15 @@ def _transform_pandas(self, df: pd.DataFrame): return df def __repr__(self): - return ( - f"Concatenator(output_column_name={self.output_column_name}, " - f"include={self.included_columns}, " - f"exclude={self.excluded_columns}, " - f"dtype={self.dtype})" - ) + non_default_arguments = [] + if self.output_column_name != "concat_out": + non_default_arguments.append(f"output_column_name='{self.output_column_name}'") + if self.included_columns is not None: + non_default_arguments.append(f"include={self.included_columns}") + if self.excluded_columns != []: + non_default_arguments.append(f"exclude={self.excluded_columns}") + if self.dtype is not None: + non_default_arguments.append(f"dtype={self.dtype}") + if self.raise_if_missing: + non_default_arguments.append(f"raise_if_missing=True") + return f"Concatenator({', 
'.join(non_default_arguments)})" diff --git a/python/ray/data/preprocessors/encoder.py b/python/ray/data/preprocessors/encoder.py index 05955cdcaeb7..11e20ca2dd3b 100644 --- a/python/ray/data/preprocessors/encoder.py +++ b/python/ray/data/preprocessors/encoder.py @@ -101,11 +101,10 @@ def list_as_category(element): return df def __repr__(self): - stats = getattr(self, "stats_", None) - return ( - f"OrdinalEncoder(columns={self.columns}, stats={stats}, " - f"encode_lists={self.encode_lists})" - ) + if self.encode_lists: + return f"OrdinalEncoder(columns={self.columns})" + else: + return f"OrdinalEncoder(columns={self.columns}, encode_lists=False)" class OneHotEncoder(Preprocessor): @@ -186,8 +185,10 @@ def _transform_pandas(self, df: pd.DataFrame): return df def __repr__(self): - stats = getattr(self, "stats_", None) - return f"OneHotEncoder(columns={self.columns}, stats={stats})" + if self.limit is not None: + return f"OneHotEncoder(columns={self.columns}, limit={self.limit})" + else: + return f"OneHotEncoder(columns={self.columns})" class MultiHotEncoder(Preprocessor): @@ -270,8 +271,10 @@ def encode_list(element: list, *, name: str): return df def __repr__(self): - stats = getattr(self, "stats_", None) - return f"MultiHotEncoder(columns={self.columns}, stats={stats})" + if self.limit is not None: + return f"MultiHotEncoder(columns={self.columns}, limit={self.limit})" + else: + return f"MultiHotEncoder(columns={self.columns})" class LabelEncoder(Preprocessor): @@ -306,8 +309,7 @@ def column_label_encoder(s: pd.Series): return df def __repr__(self): - stats = getattr(self, "stats_", None) - return f"LabelEncoder(label_column={self.label_column}, stats={stats})" + return f"LabelEncoder(label_column='{self.label_column}')" class Categorizer(Preprocessor): @@ -361,8 +363,7 @@ def _transform_pandas(self, df: pd.DataFrame): return df def __repr__(self): - stats = getattr(self, "stats_", None) - return f"" + return f"Categorizer(columns={self.columns})>" def _get_unique_value_indices( diff --git a/python/ray/data/preprocessors/imputer.py b/python/ray/data/preprocessors/imputer.py index e3f8d1e43e9a..0165a8b5771e 100644 --- a/python/ray/data/preprocessors/imputer.py +++ b/python/ray/data/preprocessors/imputer.py @@ -73,14 +73,12 @@ def _transform_pandas(self, df: pd.DataFrame): return df def __repr__(self): - stats = getattr(self, "stats_", None) - return ( - f"SimpleImputer(" - f"columns={self.columns}, " - f"strategy={self.strategy}, " - f"fill_value={self.fill_value}, " - f"stats={stats})" - ) + non_default_arguments = [f"columns={self.columns}"] + if self.strategy != "mean": + non_default_arguments.append(f"strategy='{self.strategy}'") + if self.fill_value is not None: + non_default_arguments.append(f"fill_value={repr(self.fill_value)}") + return f"SimpleImputer({', '.join(non_default_arguments)})" def _get_most_frequent_values( diff --git a/python/ray/data/preprocessors/normalizer.py b/python/ray/data/preprocessors/normalizer.py index ac765a8b132e..cea35b9a9518 100644 --- a/python/ray/data/preprocessors/normalizer.py +++ b/python/ray/data/preprocessors/normalizer.py @@ -45,4 +45,7 @@ def _transform_pandas(self, df: pd.DataFrame): return df def __repr__(self): - return f"Normalizer(columns={self.columns}, norm={self.norm})>" + if self.norm != "l2": + return f"Normalizer(columns={self.columns}, norm='{self.norm}')" + else: + return f"Normalizer(columns={self.columns})" diff --git a/python/ray/data/preprocessors/scaler.py b/python/ray/data/preprocessors/scaler.py index 
b5b527147f07..af27e8e4c214 100644 --- a/python/ray/data/preprocessors/scaler.py +++ b/python/ray/data/preprocessors/scaler.py @@ -6,6 +6,7 @@ from ray.data import Dataset from ray.data.aggregate import Mean, Std, Min, Max, AbsMax from ray.data.preprocessor import Preprocessor +from torch import quantile class StandardScaler(Preprocessor): @@ -47,10 +48,10 @@ def column_standard_scaler(s: pd.Series): return df def __repr__(self): - stats = getattr(self, "stats_", None) - return ( - f"StandardScaler(columns={self.columns}, ddof={self.ddof}, stats={stats})" - ) + if self.ddof != 0: + return f"StandardScaler(columns={self.columns}, ddof={self.ddof})" + else: + return f"StandardScaler(columns={self.columns})" class MinMaxScaler(Preprocessor): @@ -94,8 +95,7 @@ def column_min_max_scaler(s: pd.Series): return df def __repr__(self): - stats = getattr(self, "stats_", None) - return f"MixMaxScaler(columns={self.columns}, stats={stats})" + return f"MixMaxScaler(columns={self.columns})" class MaxAbsScaler(Preprocessor): @@ -136,8 +136,7 @@ def column_abs_max_scaler(s: pd.Series): return df def __repr__(self): - stats = getattr(self, "stats_", None) - return f"MaxAbsScaler(columns={self.columns}, stats={stats})" + return f"MaxAbsScaler(columns={self.columns})" class RobustScaler(Preprocessor): @@ -214,10 +213,7 @@ def column_robust_scaler(s: pd.Series): return df def __repr__(self): - stats = getattr(self, "stats_", None) - return ( - f"RobustScaler(" - f"columns={self.columns}, " - f"quantile_range={self.quantile_range}, " - f"stats={stats})>" - ) + if self.quantile_range != (0.25, 0.75): + return f"RobustScaler(columns={self.columns}, quantile_range={self.quantile_range})" + else: + return f"RobustScaler(columns={self.columns})" diff --git a/python/ray/data/preprocessors/transformer.py b/python/ray/data/preprocessors/transformer.py index 20373a9e6d6e..7cc5ba4d7191 100644 --- a/python/ray/data/preprocessors/transformer.py +++ b/python/ray/data/preprocessors/transformer.py @@ -69,9 +69,7 @@ def column_power_transformer(s: pd.Series): return df def __repr__(self): - return ( - f"PowerTransformer(" - f"columns={self.columns}, " - f"method={self.method}, " - f"power={self.power})" - ) + if self.method != "yeo-johnson": + return f"PowerTransformer(columns={self.columns}, power={self.power}, method={self.method})" + else: + return f"PowerTransformer(columns={self.columns}, power={self.power})" diff --git a/python/ray/data/preprocessors/vectorizer.py b/python/ray/data/preprocessors/vectorizer.py index 0a22f72942c0..7d6c3a1829fd 100644 --- a/python/ray/data/preprocessors/vectorizer.py +++ b/python/ray/data/preprocessors/vectorizer.py @@ -53,13 +53,11 @@ def hash_count(tokens: List[str]) -> Counter: return df def __repr__(self): - fn_name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn) - return ( - f"HashingVectorizer(" - f"Columns={self.columns}, " - f"num_features={self.num_features}, " - f"tokenization_fn={fn_name})" - ) + if self.tokenization_fn != simple_split_tokenizer: + fn_name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn) + return f"HasingVectorizer(columns={self.columns}, num_features={self.num_features}, tokenization_fn={fn_name})" + else: + return f"HasingVectorizer(columns={self.columns}, num_features={self.num_features})" class CountVectorizer(Preprocessor): @@ -132,10 +130,10 @@ def _transform_pandas(self, df: pd.DataFrame): return df def __repr__(self): - fn_name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn) - return ( - 
f"CountVectorizer(" - f"columns={self.columns}, " - f"tokenization_fn={fn_name}, " - f"max_features={self.max_features})" - ) + non_default_arguments = [f"columns={self.columns}"] + if self.tokenization_fn != simple_split_tokenizer: + fn_name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn) + non_default_arguments.append(f"tokenization_fn={fn_name}") + if self.max_features is not None: + non_default_arguments.append(f"max_features={self.max_features}") + return f"CountVectorizer({', '.join(non_default_arguments)})" From bb360889b2c60da6f54bbcff73da320d1008d0c6 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 02:43:17 -0700 Subject: [PATCH 04/30] Add trainer `__repr__` --- python/ray/data/dataset.py | 2 +- python/ray/train/base_trainer.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py index fcae3991bacc..bf085a2cd33d 100644 --- a/python/ray/data/dataset.py +++ b/python/ray/data/dataset.py @@ -3586,7 +3586,7 @@ def __repr__(self) -> str: schema_str = ", ".join(schema_str) schema_str = "{" + schema_str + "}" count = self._meta_count() - return "Dataset(num_blocks={}, num_rows={}, schema={})".format( + return "".format( self._plan.initial_num_blocks(), count, schema_str ) diff --git a/python/ray/train/base_trainer.py b/python/ray/train/base_trainer.py index 8a10e5f7fb0d..e0907cd28476 100644 --- a/python/ray/train/base_trainer.py +++ b/python/ray/train/base_trainer.py @@ -170,6 +170,20 @@ def __init__( "#example-datasets-in-tune for more info." ) + def __repr__(self): + non_default_arguments = [] + if self.scaling_config != ScalingConfig(): + non_default_arguments.append(f"scaling_config={self.scaling_config}") + if self.run_config != RunConfig(): + non_default_arguments.append(f"run_config={self.run_config}") + if self.datasets != {}: + non_default_arguments.append(f"datasets={self.datasets}") + if self.preprocessor is not None: + non_default_arguments.append(f"preprocessor={self.preprocessor}") + if self.resume_from_checkpoint is not None: + non_default_arguments.append(f"resume_from_checkpoint={self.resume_from_checkpoint}") + return f"<{self.__class__.__name__} {' '.join(non_default_arguments)}>" + def __new__(cls, *args, **kwargs): """Store the init args as attributes so this can be merged with Tune hparams.""" trainer = super(BaseTrainer, cls).__new__(cls) From 40468f6789058e97f16e1aab508199b922cf3da0 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 03:03:35 -0700 Subject: [PATCH 05/30] Update `Preprocessor`s --- python/ray/data/preprocessors/concatenator.py | 20 +++++++------------ python/ray/data/preprocessors/encoder.py | 17 +++++----------- python/ray/data/preprocessors/imputer.py | 6 +++++- python/ray/data/preprocessors/normalizer.py | 5 +---- python/ray/data/preprocessors/scaler.py | 5 +---- python/ray/data/preprocessors/transformer.py | 5 +---- python/ray/data/preprocessors/vectorizer.py | 7 ++----- python/ray/train/base_trainer.py | 4 +++- 8 files changed, 25 insertions(+), 44 deletions(-) diff --git a/python/ray/data/preprocessors/concatenator.py b/python/ray/data/preprocessors/concatenator.py index 7ed74a76e6d0..521c7c5bc639 100644 --- a/python/ray/data/preprocessors/concatenator.py +++ b/python/ray/data/preprocessors/concatenator.py @@ -1,5 +1,4 @@ from typing import List, Optional -from xml.etree.ElementInclude import include import numpy as np import pandas as pd @@ -99,15 +98,10 @@ def _transform_pandas(self, df: 
pd.DataFrame): return df def __repr__(self): - non_default_arguments = [] - if self.output_column_name != "concat_out": - non_default_arguments.append(f"output_column_name='{self.output_column_name}'") - if self.included_columns is not None: - non_default_arguments.append(f"include={self.included_columns}") - if self.excluded_columns != []: - non_default_arguments.append(f"exclude={self.excluded_columns}") - if self.dtype is not None: - non_default_arguments.append(f"dtype={self.dtype}") - if self.raise_if_missing: - non_default_arguments.append(f"raise_if_missing=True") - return f"Concatenator({', '.join(non_default_arguments)})" + return ( + f"Concatenator(output_column_name='{self.output_column_name}', " + f"include={self.included_columns}, " + f"exclude={self.excluded_columns}, " + f"dtype={self.dtype}, " + f"raise_if_missing={self.raise_if_missing})" + ) diff --git a/python/ray/data/preprocessors/encoder.py b/python/ray/data/preprocessors/encoder.py index 11e20ca2dd3b..1624c6a3b6f6 100644 --- a/python/ray/data/preprocessors/encoder.py +++ b/python/ray/data/preprocessors/encoder.py @@ -101,10 +101,9 @@ def list_as_category(element): return df def __repr__(self): - if self.encode_lists: - return f"OrdinalEncoder(columns={self.columns})" - else: - return f"OrdinalEncoder(columns={self.columns}, encode_lists=False)" + return ( + f"OrdinalEncoder(columns={self.columns}, encode_lists={self.encode_lists})" + ) class OneHotEncoder(Preprocessor): @@ -185,10 +184,7 @@ def _transform_pandas(self, df: pd.DataFrame): return df def __repr__(self): - if self.limit is not None: - return f"OneHotEncoder(columns={self.columns}, limit={self.limit})" - else: - return f"OneHotEncoder(columns={self.columns})" + return f"OneHotEncoder(columns={self.columns}, limit={self.limit})" class MultiHotEncoder(Preprocessor): @@ -271,10 +267,7 @@ def encode_list(element: list, *, name: str): return df def __repr__(self): - if self.limit is not None: - return f"MultiHotEncoder(columns={self.columns}, limit={self.limit})" - else: - return f"MultiHotEncoder(columns={self.columns})" + return f"MultiHotEncoder(columns={self.columns}, limit={self.limit})" class LabelEncoder(Preprocessor): diff --git a/python/ray/data/preprocessors/imputer.py b/python/ray/data/preprocessors/imputer.py index 0165a8b5771e..f22df86318dc 100644 --- a/python/ray/data/preprocessors/imputer.py +++ b/python/ray/data/preprocessors/imputer.py @@ -78,7 +78,11 @@ def __repr__(self): non_default_arguments.append(f"strategy='{self.strategy}'") if self.fill_value is not None: non_default_arguments.append(f"fill_value={repr(self.fill_value)}") - return f"SimpleImputer({', '.join(non_default_arguments)})" + return ( + f"SimpleImputer(columns={self.columns}, " + f"strategy={self.strategy}, " + f"fill_value={self.fill_value})" + ) def _get_most_frequent_values( diff --git a/python/ray/data/preprocessors/normalizer.py b/python/ray/data/preprocessors/normalizer.py index cea35b9a9518..4287e3be4df5 100644 --- a/python/ray/data/preprocessors/normalizer.py +++ b/python/ray/data/preprocessors/normalizer.py @@ -45,7 +45,4 @@ def _transform_pandas(self, df: pd.DataFrame): return df def __repr__(self): - if self.norm != "l2": - return f"Normalizer(columns={self.columns}, norm='{self.norm}')" - else: - return f"Normalizer(columns={self.columns})" + return f"Normalizer(columns={self.columns}, norm='{self.norm}')" diff --git a/python/ray/data/preprocessors/scaler.py b/python/ray/data/preprocessors/scaler.py index af27e8e4c214..62e9c038e471 100644 --- 
a/python/ray/data/preprocessors/scaler.py +++ b/python/ray/data/preprocessors/scaler.py @@ -213,7 +213,4 @@ def column_robust_scaler(s: pd.Series): return df def __repr__(self): - if self.quantile_range != (0.25, 0.75): - return f"RobustScaler(columns={self.columns}, quantile_range={self.quantile_range})" - else: - return f"RobustScaler(columns={self.columns})" + return f"RobustScaler(columns={self.columns}, quantile_range={self.quantile_range})" diff --git a/python/ray/data/preprocessors/transformer.py b/python/ray/data/preprocessors/transformer.py index 7cc5ba4d7191..619bf81b5248 100644 --- a/python/ray/data/preprocessors/transformer.py +++ b/python/ray/data/preprocessors/transformer.py @@ -69,7 +69,4 @@ def column_power_transformer(s: pd.Series): return df def __repr__(self): - if self.method != "yeo-johnson": - return f"PowerTransformer(columns={self.columns}, power={self.power}, method={self.method})" - else: - return f"PowerTransformer(columns={self.columns}, power={self.power})" + return f"PowerTransformer(columns={self.columns}, power={self.power}, method={self.method})" diff --git a/python/ray/data/preprocessors/vectorizer.py b/python/ray/data/preprocessors/vectorizer.py index 7d6c3a1829fd..1e31a33314e5 100644 --- a/python/ray/data/preprocessors/vectorizer.py +++ b/python/ray/data/preprocessors/vectorizer.py @@ -53,11 +53,8 @@ def hash_count(tokens: List[str]) -> Counter: return df def __repr__(self): - if self.tokenization_fn != simple_split_tokenizer: - fn_name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn) - return f"HasingVectorizer(columns={self.columns}, num_features={self.num_features}, tokenization_fn={fn_name})" - else: - return f"HasingVectorizer(columns={self.columns}, num_features={self.num_features})" + fn_name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn) + return f"HasingVectorizer(columns={self.columns}, num_features={self.num_features}, tokenization_fn={fn_name})" class CountVectorizer(Preprocessor): diff --git a/python/ray/train/base_trainer.py b/python/ray/train/base_trainer.py index e0907cd28476..1ee80dde4221 100644 --- a/python/ray/train/base_trainer.py +++ b/python/ray/train/base_trainer.py @@ -181,7 +181,9 @@ def __repr__(self): if self.preprocessor is not None: non_default_arguments.append(f"preprocessor={self.preprocessor}") if self.resume_from_checkpoint is not None: - non_default_arguments.append(f"resume_from_checkpoint={self.resume_from_checkpoint}") + non_default_arguments.append( + f"resume_from_checkpoint={self.resume_from_checkpoint}" + ) return f"<{self.__class__.__name__} {' '.join(non_default_arguments)}>" def __new__(cls, *args, **kwargs): From ad08a5125ed4dffc6277c4458e35ead2d9c31d21 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 03:05:55 -0700 Subject: [PATCH 06/30] Fix stuff --- python/ray/data/preprocessors/imputer.py | 5 ----- python/ray/data/preprocessors/scaler.py | 5 +---- python/ray/data/preprocessors/vectorizer.py | 14 +++++++------- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/python/ray/data/preprocessors/imputer.py b/python/ray/data/preprocessors/imputer.py index f22df86318dc..9aa9257017db 100644 --- a/python/ray/data/preprocessors/imputer.py +++ b/python/ray/data/preprocessors/imputer.py @@ -73,11 +73,6 @@ def _transform_pandas(self, df: pd.DataFrame): return df def __repr__(self): - non_default_arguments = [f"columns={self.columns}"] - if self.strategy != "mean": - non_default_arguments.append(f"strategy='{self.strategy}'") - if self.fill_value is 
not None: - non_default_arguments.append(f"fill_value={repr(self.fill_value)}") return ( f"SimpleImputer(columns={self.columns}, " f"strategy={self.strategy}, " diff --git a/python/ray/data/preprocessors/scaler.py b/python/ray/data/preprocessors/scaler.py index 62e9c038e471..603495692fce 100644 --- a/python/ray/data/preprocessors/scaler.py +++ b/python/ray/data/preprocessors/scaler.py @@ -48,10 +48,7 @@ def column_standard_scaler(s: pd.Series): return df def __repr__(self): - if self.ddof != 0: - return f"StandardScaler(columns={self.columns}, ddof={self.ddof})" - else: - return f"StandardScaler(columns={self.columns})" + return f"StandardScaler(columns={self.columns}, ddof={self.ddof})" class MinMaxScaler(Preprocessor): diff --git a/python/ray/data/preprocessors/vectorizer.py b/python/ray/data/preprocessors/vectorizer.py index 1e31a33314e5..91800dc17239 100644 --- a/python/ray/data/preprocessors/vectorizer.py +++ b/python/ray/data/preprocessors/vectorizer.py @@ -127,10 +127,10 @@ def _transform_pandas(self, df: pd.DataFrame): return df def __repr__(self): - non_default_arguments = [f"columns={self.columns}"] - if self.tokenization_fn != simple_split_tokenizer: - fn_name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn) - non_default_arguments.append(f"tokenization_fn={fn_name}") - if self.max_features is not None: - non_default_arguments.append(f"max_features={self.max_features}") - return f"CountVectorizer({', '.join(non_default_arguments)})" + fn_name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn) + return ( + f"CountVectorizer(" + f"columns={self.columns}, " + f"tokenization_fn={fn_name}, " + f"max_features={self.max_features})" + ) From 448a67c890651a511daa2539cb96d2e04bd9e292 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 03:13:37 -0700 Subject: [PATCH 07/30] Add predictor `__repr__`s --- python/ray/train/huggingface/huggingface_predictor.py | 3 +++ python/ray/train/lightgbm/lightgbm_predictor.py | 5 +++++ python/ray/train/rl/rl_predictor.py | 3 +++ python/ray/train/sklearn/sklearn_predictor.py | 3 +++ python/ray/train/tensorflow/tensorflow_predictor.py | 9 +++++++++ python/ray/train/torch/torch_predictor.py | 3 +++ python/ray/train/xgboost/xgboost_predictor.py | 5 +++++ 7 files changed, 31 insertions(+) diff --git a/python/ray/train/huggingface/huggingface_predictor.py b/python/ray/train/huggingface/huggingface_predictor.py index b4ee23adb66c..fdd545871ebc 100644 --- a/python/ray/train/huggingface/huggingface_predictor.py +++ b/python/ray/train/huggingface/huggingface_predictor.py @@ -36,6 +36,9 @@ def __init__( self.pipeline = pipeline super().__init__(preprocessor) + def __repr__(self): + return f"HuggingFacePredictor(pipeline={self.pipeline}, preprocessor={self._preprocessor})" + @classmethod def from_checkpoint( cls, diff --git a/python/ray/train/lightgbm/lightgbm_predictor.py b/python/ray/train/lightgbm/lightgbm_predictor.py index ff4e2311c854..5c9958183a09 100644 --- a/python/ray/train/lightgbm/lightgbm_predictor.py +++ b/python/ray/train/lightgbm/lightgbm_predictor.py @@ -30,6 +30,11 @@ def __init__( self.model = model super().__init__(preprocessor) + def __repr__(self): + return ( + f"LightGBMPredictor(model={self.model}, preprocessor={self._preprocessor})" + ) + @classmethod def from_checkpoint(cls, checkpoint: Checkpoint) -> "LightGBMPredictor": """Instantiate the predictor from a Checkpoint. 
diff --git a/python/ray/train/rl/rl_predictor.py b/python/ray/train/rl/rl_predictor.py
index 726aa945c97b..e71aaab3f1da 100644
--- a/python/ray/train/rl/rl_predictor.py
+++ b/python/ray/train/rl/rl_predictor.py
@@ -33,6 +33,9 @@ def __init__(
         self.policy = policy
         super().__init__(preprocessor)
 
+    def __repr__(self):
+        return f"RLPredictor(policy={self.policy}, preprocessor={self._preprocessor})"
+
     @classmethod
     def from_checkpoint(
         cls,
diff --git a/python/ray/train/sklearn/sklearn_predictor.py b/python/ray/train/sklearn/sklearn_predictor.py
index 62b94810ae84..b8bb91a90dd9 100644
--- a/python/ray/train/sklearn/sklearn_predictor.py
+++ b/python/ray/train/sklearn/sklearn_predictor.py
@@ -35,6 +35,9 @@ def __init__(
         self.estimator = estimator
         super().__init__(preprocessor)
 
+    def __repr__(self):
+        return f"SklearnPredictor(estimator={self.estimator}, preprocessor={self._preprocessor})"
+
     @classmethod
     def from_checkpoint(cls, checkpoint: Checkpoint) -> "SklearnPredictor":
         """Instantiate the predictor from a Checkpoint.
diff --git a/python/ray/train/tensorflow/tensorflow_predictor.py b/python/ray/train/tensorflow/tensorflow_predictor.py
index eca7af5f12d3..15c7a64f6c8c 100644
--- a/python/ray/train/tensorflow/tensorflow_predictor.py
+++ b/python/ray/train/tensorflow/tensorflow_predictor.py
@@ -75,6 +75,15 @@ def __init__(
         self._model.set_weights(model_weights)
         super().__init__(preprocessor)
 
+    def __repr__(self):
+        return (
+            "TensorflowPredictor("
+            f"model_definition={self.model_definition}, "
+            f"preprocessor={self._preprocessor}, "
+            f"model_weights={self.model_weights}, "
+            f"use_gpu={self.use_gpu})"
+        )
+
     @classmethod
     def from_checkpoint(
         cls,
diff --git a/python/ray/train/torch/torch_predictor.py b/python/ray/train/torch/torch_predictor.py
index aead252df10c..44ae3a202647 100644
--- a/python/ray/train/torch/torch_predictor.py
+++ b/python/ray/train/torch/torch_predictor.py
@@ -61,6 +61,9 @@ def __init__(
 
         super().__init__(preprocessor)
 
+    def __repr__(self):
+        return f"TorchPredictor(model={self.model}, preprocessor={self._preprocessor}, use_gpu={self.use_gpu})"
+
     @classmethod
     def from_checkpoint(
         cls,
diff --git a/python/ray/train/xgboost/xgboost_predictor.py b/python/ray/train/xgboost/xgboost_predictor.py
index 4613f1e33535..1ea9cad1aba6 100644
--- a/python/ray/train/xgboost/xgboost_predictor.py
+++ b/python/ray/train/xgboost/xgboost_predictor.py
@@ -29,6 +29,11 @@ def __init__(
         self.model = model
         super().__init__(preprocessor)
 
+    def __repr__(self):
+        return (
+            f"XGBoostPredictor(model={self.model}, preprocessor={self._preprocessor})"
+        )
+
     @classmethod
     def from_checkpoint(cls, checkpoint: Checkpoint) -> "XGBoostPredictor":
         """Instantiate the predictor from a Checkpoint.
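
For illustration, a minimal sketch of the `__repr__` pattern the predictor patches
above follow; the `ExamplePredictor` class is a hypothetical stand-in, not part of
Ray:

    class ExamplePredictor:
        """Toy class mimicking the predictors patched in this commit."""

        def __init__(self, model, preprocessor=None):
            self.model = model
            self._preprocessor = preprocessor

        def __repr__(self):
            # Class name followed by the constructor arguments, as in the diffs above.
            return (
                f"ExamplePredictor(model={self.model}, "
                f"preprocessor={self._preprocessor})"
            )

    print(ExamplePredictor(model="model.pkl"))
    # ExamplePredictor(model=model.pkl, preprocessor=None)
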
From 829cf734cb991ec6ff9fdece75c50e5e586d0d5d Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 03:19:25 -0700 Subject: [PATCH 08/30] Appease lint --- python/ray/data/preprocessors/chain.py | 3 ++- python/ray/data/preprocessors/scaler.py | 6 ++++-- python/ray/data/preprocessors/transformer.py | 5 ++++- python/ray/data/preprocessors/vectorizer.py | 6 +++++- python/ray/train/huggingface/huggingface_predictor.py | 5 ++++- python/ray/train/sklearn/sklearn_predictor.py | 5 ++++- python/ray/train/torch/torch_predictor.py | 6 +++++- 7 files changed, 28 insertions(+), 8 deletions(-) diff --git a/python/ray/data/preprocessors/chain.py b/python/ray/data/preprocessors/chain.py index 97ed79c90e63..53b715354d96 100644 --- a/python/ray/data/preprocessors/chain.py +++ b/python/ray/data/preprocessors/chain.py @@ -71,4 +71,5 @@ def _transform_batch(self, df: "DataBatchType") -> "DataBatchType": return df def __repr__(self): - return f"Chain({', '.join(repr(preprocessor) for preprocessor in self.preprocessors)})" + arguments = ", ".join(repr(preprocessor) for preprocessor in self.preprocessors) + return f"Chain({arguments})" diff --git a/python/ray/data/preprocessors/scaler.py b/python/ray/data/preprocessors/scaler.py index 603495692fce..c4efdd51925a 100644 --- a/python/ray/data/preprocessors/scaler.py +++ b/python/ray/data/preprocessors/scaler.py @@ -6,7 +6,6 @@ from ray.data import Dataset from ray.data.aggregate import Mean, Std, Min, Max, AbsMax from ray.data.preprocessor import Preprocessor -from torch import quantile class StandardScaler(Preprocessor): @@ -210,4 +209,7 @@ def column_robust_scaler(s: pd.Series): return df def __repr__(self): - return f"RobustScaler(columns={self.columns}, quantile_range={self.quantile_range})" + return ( + f"RobustScaler(columns={self.columns}, " + f"quantile_range={self.quantile_range})" + ) diff --git a/python/ray/data/preprocessors/transformer.py b/python/ray/data/preprocessors/transformer.py index 619bf81b5248..e35e97e5ca05 100644 --- a/python/ray/data/preprocessors/transformer.py +++ b/python/ray/data/preprocessors/transformer.py @@ -69,4 +69,7 @@ def column_power_transformer(s: pd.Series): return df def __repr__(self): - return f"PowerTransformer(columns={self.columns}, power={self.power}, method={self.method})" + return ( + f"PowerTransformer(columns={self.columns}, " + f"power={self.power}, method={self.method})" + ) diff --git a/python/ray/data/preprocessors/vectorizer.py b/python/ray/data/preprocessors/vectorizer.py index 91800dc17239..2502b45bc8f3 100644 --- a/python/ray/data/preprocessors/vectorizer.py +++ b/python/ray/data/preprocessors/vectorizer.py @@ -54,7 +54,11 @@ def hash_count(tokens: List[str]) -> Counter: def __repr__(self): fn_name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn) - return f"HasingVectorizer(columns={self.columns}, num_features={self.num_features}, tokenization_fn={fn_name})" + return ( + f"HasingVectorizer(columns={self.columns}, " + f"num_features={self.num_features}, " + f"tokenization_fn={fn_name})" + ) class CountVectorizer(Preprocessor): diff --git a/python/ray/train/huggingface/huggingface_predictor.py b/python/ray/train/huggingface/huggingface_predictor.py index fdd545871ebc..dc8c2fac7346 100644 --- a/python/ray/train/huggingface/huggingface_predictor.py +++ b/python/ray/train/huggingface/huggingface_predictor.py @@ -37,7 +37,10 @@ def __init__( super().__init__(preprocessor) def __repr__(self): - return f"HuggingFacePredictor(pipeline={self.pipeline}, 
preprocessor={self._preprocessor})" + return ( + f"HuggingFacePredictor(pipeline={self.pipeline}, " + f"preprocessor={self._preprocessor})" + ) @classmethod def from_checkpoint( diff --git a/python/ray/train/sklearn/sklearn_predictor.py b/python/ray/train/sklearn/sklearn_predictor.py index b8bb91a90dd9..d6796e7e77eb 100644 --- a/python/ray/train/sklearn/sklearn_predictor.py +++ b/python/ray/train/sklearn/sklearn_predictor.py @@ -36,7 +36,10 @@ def __init__( super().__init__(preprocessor) def __repr__(self): - return f"SklearnPredictor(estimator={self.estimator}, preprocessor={self._preprocessor})" + return ( + f"SklearnPredictor(estimator={self.estimator}, " + f"preprocessor={self._preprocessor})" + ) @classmethod def from_checkpoint(cls, checkpoint: Checkpoint) -> "SklearnPredictor": diff --git a/python/ray/train/torch/torch_predictor.py b/python/ray/train/torch/torch_predictor.py index 44ae3a202647..012c6296bb8f 100644 --- a/python/ray/train/torch/torch_predictor.py +++ b/python/ray/train/torch/torch_predictor.py @@ -62,7 +62,11 @@ def __init__( super().__init__(preprocessor) def __repr__(self): - return f"TorchPredictor(model={self.model}, preprocessor={self._preprocessor}, use_gpu={self.use_gpu})" + return ( + f"TorchPredictor(model={self.model}, " + f"preprocessor={self._preprocessor}, " + f"use_gpu={self.use_gpu})" + ) @classmethod def from_checkpoint( From 6e881c3d1f88ea0c26c4ad2bc1b26b4f31645f9d Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 03:23:19 -0700 Subject: [PATCH 09/30] Update tensorflow_predictor.py --- python/ray/train/tensorflow/tensorflow_predictor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/ray/train/tensorflow/tensorflow_predictor.py b/python/ray/train/tensorflow/tensorflow_predictor.py index 15c7a64f6c8c..df4d47319628 100644 --- a/python/ray/train/tensorflow/tensorflow_predictor.py +++ b/python/ray/train/tensorflow/tensorflow_predictor.py @@ -77,8 +77,7 @@ def __init__( def __repr__(self): return ( - "TensorflowPredictor(" - f"model_definition={self.model_definition}, " + f"TensorflowPredictor(model_definition={self.model_definition}, " f"preprocessor={self._preprocessor}, " f"model_weights={self.model_weights}, " f"use_gpu={self.use_gpu})" From 32737dace840b78d79cb9c0427d1e73ff12ba608 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 10:23:01 -0700 Subject: [PATCH 10/30] Fix docstring example --- python/ray/air/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/air/config.py b/python/ray/air/config.py index a1dcedc7fb40..ac1f33cff6d5 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -61,8 +61,8 @@ def _repr_dataclass(obj, *, default_values: Optional[Dict[str, Any]] = None) -> >>> def __post_init__(self): >>> self.x = [] >>> a = A() - >>> repr(a) - A() + >>> _repr_dataclass(a, default_values={"x": []}) + 'A()' """ if default_values is None: default_values = {} From a7abc5a4b998be5026d552e108fad0bab033ef37 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 12:14:00 -0700 Subject: [PATCH 11/30] Update preprocessor reprs --- python/ray/data/preprocessors/batch_mapper.py | 2 +- python/ray/data/preprocessors/chain.py | 2 +- python/ray/data/preprocessors/concatenator.py | 11 ++++++----- python/ray/data/preprocessors/encoder.py | 18 +++++++++++++----- python/ray/data/preprocessors/hasher.py | 3 ++- python/ray/data/preprocessors/imputer.py | 5 ++--- python/ray/data/preprocessors/normalizer.py | 4 +++- 
python/ray/data/preprocessors/scaler.py | 4 ++-- python/ray/data/preprocessors/tokenizer.py | 5 ++++- python/ray/data/preprocessors/transformer.py | 4 ++-- python/ray/data/preprocessors/vectorizer.py | 11 ++++------- 11 files changed, 40 insertions(+), 29 deletions(-) diff --git a/python/ray/data/preprocessors/batch_mapper.py b/python/ray/data/preprocessors/batch_mapper.py index eaca7726ed53..f42ce81c7364 100644 --- a/python/ray/data/preprocessors/batch_mapper.py +++ b/python/ray/data/preprocessors/batch_mapper.py @@ -27,4 +27,4 @@ def _transform_pandas(self, df: "pandas.DataFrame") -> "pandas.DataFrame": def __repr__(self): fn_name = getattr(self.fn, "__name__", self.fn) - return f"BatchMapper(fn={fn_name})" + return f"{self.__class__.__name__}(fn={fn_name})" diff --git a/python/ray/data/preprocessors/chain.py b/python/ray/data/preprocessors/chain.py index 53b715354d96..1a812718dce6 100644 --- a/python/ray/data/preprocessors/chain.py +++ b/python/ray/data/preprocessors/chain.py @@ -72,4 +72,4 @@ def _transform_batch(self, df: "DataBatchType") -> "DataBatchType": def __repr__(self): arguments = ", ".join(repr(preprocessor) for preprocessor in self.preprocessors) - return f"Chain({arguments})" + return f"{self.__class__.__name__}({arguments})" diff --git a/python/ray/data/preprocessors/concatenator.py b/python/ray/data/preprocessors/concatenator.py index 521c7c5bc639..9a64c828967b 100644 --- a/python/ray/data/preprocessors/concatenator.py +++ b/python/ray/data/preprocessors/concatenator.py @@ -99,9 +99,10 @@ def _transform_pandas(self, df: pd.DataFrame): def __repr__(self): return ( - f"Concatenator(output_column_name='{self.output_column_name}', " - f"include={self.included_columns}, " - f"exclude={self.excluded_columns}, " - f"dtype={self.dtype}, " - f"raise_if_missing={self.raise_if_missing})" + f"{self.__class__.__name__}(" + f"output_column_name={self.output_column_name!r}, " + f"include={self.included_columns!r}, " + f"exclude={self.excluded_columns!r}, " + f"dtype={self.dtype!r}, " + f"raise_if_missing={self.raise_if_missing!r})" ) diff --git a/python/ray/data/preprocessors/encoder.py b/python/ray/data/preprocessors/encoder.py index 25c387d183f7..012ce5c2dd6c 100644 --- a/python/ray/data/preprocessors/encoder.py +++ b/python/ray/data/preprocessors/encoder.py @@ -102,7 +102,8 @@ def list_as_category(element): def __repr__(self): return ( - f"OrdinalEncoder(columns={self.columns}, encode_lists={self.encode_lists})" + f"{self.__class__.__name__}(columns={self.columns!r}, " + f"encode_lists={self.encode_lists!r})" ) @@ -184,7 +185,9 @@ def _transform_pandas(self, df: pd.DataFrame): return df def __repr__(self): - return f"OneHotEncoder(columns={self.columns}, limit={self.limit})" + return ( + f"{self.__class__.__name__}(columns={self.columns!r}, limit={self.limit!r})" + ) class MultiHotEncoder(Preprocessor): @@ -267,7 +270,9 @@ def encode_list(element: list, *, name: str): return df def __repr__(self): - return f"MultiHotEncoder(columns={self.columns}, limit={self.limit})" + return ( + f"{self.__class__.__name__}(columns={self.columns!r}, limit={self.limit!r})" + ) class LabelEncoder(Preprocessor): @@ -302,7 +307,7 @@ def column_label_encoder(s: pd.Series): return df def __repr__(self): - return f"LabelEncoder(label_column='{self.label_column}')" + return f"{self.__class__.__name__}(label_column={self.label_column!r})" class Categorizer(Preprocessor): @@ -355,7 +360,10 @@ def _transform_pandas(self, df: pd.DataFrame): return df def __repr__(self): - return 
f"Categorizer(columns={self.columns}, dtypes={self.dtypes})" + return ( + f"{self.__class__.__name__}(columns={self.columns!r}, " + f"dtypes={self.dtypes!r})" + ) def _get_unique_value_indices( diff --git a/python/ray/data/preprocessors/hasher.py b/python/ray/data/preprocessors/hasher.py index 1687c68c2d18..effe06967dfe 100644 --- a/python/ray/data/preprocessors/hasher.py +++ b/python/ray/data/preprocessors/hasher.py @@ -56,5 +56,6 @@ def row_feature_hasher(row): def __repr__(self): return ( - f"FeatureHasher(columns={self.columns}, num_features={self.num_features})" + f"{self.__class__.__name__}(columns={self.columns!r}, " + f"num_features={self.num_features!r})" ) diff --git a/python/ray/data/preprocessors/imputer.py b/python/ray/data/preprocessors/imputer.py index 9aa9257017db..6a09680ebaeb 100644 --- a/python/ray/data/preprocessors/imputer.py +++ b/python/ray/data/preprocessors/imputer.py @@ -74,9 +74,8 @@ def _transform_pandas(self, df: pd.DataFrame): def __repr__(self): return ( - f"SimpleImputer(columns={self.columns}, " - f"strategy={self.strategy}, " - f"fill_value={self.fill_value})" + f"{self.__class__.__name__}(columns={self.columns!r}, " + f"strategy={self.strategy!r}, fill_value={self.fill_value!r})" ) diff --git a/python/ray/data/preprocessors/normalizer.py b/python/ray/data/preprocessors/normalizer.py index 4287e3be4df5..7e3e2dbffbeb 100644 --- a/python/ray/data/preprocessors/normalizer.py +++ b/python/ray/data/preprocessors/normalizer.py @@ -45,4 +45,6 @@ def _transform_pandas(self, df: pd.DataFrame): return df def __repr__(self): - return f"Normalizer(columns={self.columns}, norm='{self.norm}')" + return ( + f"{self.__class__.__name__}(columns={self.columns!r}, norm={self.norm!r})" + ) diff --git a/python/ray/data/preprocessors/scaler.py b/python/ray/data/preprocessors/scaler.py index c4efdd51925a..37747d26f1f1 100644 --- a/python/ray/data/preprocessors/scaler.py +++ b/python/ray/data/preprocessors/scaler.py @@ -210,6 +210,6 @@ def column_robust_scaler(s: pd.Series): def __repr__(self): return ( - f"RobustScaler(columns={self.columns}, " - f"quantile_range={self.quantile_range})" + f"{self.__class__.__name__}(columns={self.columns!r}, " + f"quantile_range={self.quantile_range!r})" ) diff --git a/python/ray/data/preprocessors/tokenizer.py b/python/ray/data/preprocessors/tokenizer.py index 12f7d8ac889e..68985b0cb61d 100644 --- a/python/ray/data/preprocessors/tokenizer.py +++ b/python/ray/data/preprocessors/tokenizer.py @@ -37,4 +37,7 @@ def column_tokenizer(s: pd.Series): def __repr__(self): name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn) - return f"Tokenizer(columns={self.columns}, tokenization_fn={name})" + return ( + f"{self.__class__.__name__}(columns={self.columns!r}, " + f"tokenization_fn={name})" + ) diff --git a/python/ray/data/preprocessors/transformer.py b/python/ray/data/preprocessors/transformer.py index e35e97e5ca05..1814967d3e1d 100644 --- a/python/ray/data/preprocessors/transformer.py +++ b/python/ray/data/preprocessors/transformer.py @@ -70,6 +70,6 @@ def column_power_transformer(s: pd.Series): def __repr__(self): return ( - f"PowerTransformer(columns={self.columns}, " - f"power={self.power}, method={self.method})" + f"{self.__class__.__name__}(columns={self.columns!r}, " + f"power={self.power!r}, method={self.method!r})" ) diff --git a/python/ray/data/preprocessors/vectorizer.py b/python/ray/data/preprocessors/vectorizer.py index 2502b45bc8f3..d051a9881717 100644 --- a/python/ray/data/preprocessors/vectorizer.py +++ 
b/python/ray/data/preprocessors/vectorizer.py @@ -55,9 +55,8 @@ def hash_count(tokens: List[str]) -> Counter: def __repr__(self): fn_name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn) return ( - f"HasingVectorizer(columns={self.columns}, " - f"num_features={self.num_features}, " - f"tokenization_fn={fn_name})" + f"{self.__class__.__name__}(columns={self.columns!r}, " + f"num_features={self.num_features!r}, tokenization_fn={fn_name})" ) @@ -133,8 +132,6 @@ def _transform_pandas(self, df: pd.DataFrame): def __repr__(self): fn_name = getattr(self.tokenization_fn, "__name__", self.tokenization_fn) return ( - f"CountVectorizer(" - f"columns={self.columns}, " - f"tokenization_fn={fn_name}, " - f"max_features={self.max_features})" + f"{self.__class__.__name__}(columns={self.columns!r}, " + f"tokenization_fn={fn_name}, max_features={self.max_features!r})" ) From 1632231ca762338f7b6b5d2f34760cd7dccff895 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 12:16:30 -0700 Subject: [PATCH 12/30] More preprocessor fix --- python/ray/data/preprocessors/scaler.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/ray/data/preprocessors/scaler.py b/python/ray/data/preprocessors/scaler.py index 37747d26f1f1..2015dedc2935 100644 --- a/python/ray/data/preprocessors/scaler.py +++ b/python/ray/data/preprocessors/scaler.py @@ -47,7 +47,9 @@ def column_standard_scaler(s: pd.Series): return df def __repr__(self): - return f"StandardScaler(columns={self.columns}, ddof={self.ddof})" + return ( + f"{self.__class__.__name__}(columns={self.columns!r}, ddof={self.ddof!r})" + ) class MinMaxScaler(Preprocessor): @@ -91,7 +93,7 @@ def column_min_max_scaler(s: pd.Series): return df def __repr__(self): - return f"MixMaxScaler(columns={self.columns})" + return f"{self.__class__.__name__}(columns={self.columns!r})" class MaxAbsScaler(Preprocessor): @@ -132,7 +134,7 @@ def column_abs_max_scaler(s: pd.Series): return df def __repr__(self): - return f"MaxAbsScaler(columns={self.columns})" + return f"{self.__class__.__name__}(columns={self.columns!r})" class RobustScaler(Preprocessor): From c509445f469620fd0d051739375ef08ccaae1a89 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 12:21:58 -0700 Subject: [PATCH 13/30] Update base_trainer.py --- python/ray/train/base_trainer.py | 33 ++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/python/ray/train/base_trainer.py b/python/ray/train/base_trainer.py index 1ee80dde4221..60b730af047f 100644 --- a/python/ray/train/base_trainer.py +++ b/python/ray/train/base_trainer.py @@ -1,7 +1,7 @@ import abc import inspect import logging -from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Type, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Type, Union import ray from ray.air._internal.config import ensure_only_allowed_dataclass_keys_updated @@ -171,20 +171,25 @@ def __init__( ) def __repr__(self): + # A dictionary that maps parameters to their default values. 
+ default_values: Dict[str, Any] = { + "scaling_config": ScalingConfig(), + "run_config": RunConfig(), + "datasets": {}, + "preprocessor": None, + "resume_from_checkpoint": None, + } + non_default_arguments = [] - if self.scaling_config != ScalingConfig(): - non_default_arguments.append(f"scaling_config={self.scaling_config}") - if self.run_config != RunConfig(): - non_default_arguments.append(f"run_config={self.run_config}") - if self.datasets != {}: - non_default_arguments.append(f"datasets={self.datasets}") - if self.preprocessor is not None: - non_default_arguments.append(f"preprocessor={self.preprocessor}") - if self.resume_from_checkpoint is not None: - non_default_arguments.append( - f"resume_from_checkpoint={self.resume_from_checkpoint}" - ) - return f"<{self.__class__.__name__} {' '.join(non_default_arguments)}>" + for parameter, default_value in default_values.items(): + value = getattr(self, parameter) + if value != default_value: + non_default_arguments.append(f"{parameter}={value!r}") + + if non_default_arguments: + return f"<{self.__class__.__name__} {' '.join(non_default_arguments)}>" + + return f"<{self.__class__.__name__}>" def __new__(cls, *args, **kwargs): """Store the init args as attributes so this can be merged with Tune hparams.""" From 9c62b920a03b6c51040686d386c782270725393e Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 12:22:40 -0700 Subject: [PATCH 14/30] Update config.py --- python/ray/air/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/air/config.py b/python/ray/air/config.py index ac1f33cff6d5..f91778d4a4e5 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -78,7 +78,7 @@ def _repr_dataclass(obj, *, default_values: Optional[Dict[str, Any]] = None) -> string = f"{obj.__class__.__name__}(" string += ", ".join( - f"{name}={repr(value)}" for name, value in non_default_values.items() + f"{name}={value!r}" for name, value in non_default_values.items() ) string += ")" From 83b3e89b0ec6fe2f1c1028fad6003f40c6ad51c5 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 12:26:21 -0700 Subject: [PATCH 15/30] Update predictors --- python/ray/train/huggingface/huggingface_predictor.py | 4 ++-- python/ray/train/lightgbm/lightgbm_predictor.py | 3 ++- python/ray/train/rl/rl_predictor.py | 5 ++++- python/ray/train/sklearn/sklearn_predictor.py | 4 ++-- python/ray/train/tensorflow/tensorflow_predictor.py | 9 +++++---- python/ray/train/torch/torch_predictor.py | 5 ++--- python/ray/train/xgboost/xgboost_predictor.py | 3 ++- 7 files changed, 19 insertions(+), 14 deletions(-) diff --git a/python/ray/train/huggingface/huggingface_predictor.py b/python/ray/train/huggingface/huggingface_predictor.py index dc8c2fac7346..f70692c05e84 100644 --- a/python/ray/train/huggingface/huggingface_predictor.py +++ b/python/ray/train/huggingface/huggingface_predictor.py @@ -38,8 +38,8 @@ def __init__( def __repr__(self): return ( - f"HuggingFacePredictor(pipeline={self.pipeline}, " - f"preprocessor={self._preprocessor})" + f"{self.__class__.__name__}(pipeline={self.pipeline!r}, " + f"preprocessor={self._preprocessor!r})" ) @classmethod diff --git a/python/ray/train/lightgbm/lightgbm_predictor.py b/python/ray/train/lightgbm/lightgbm_predictor.py index 5c9958183a09..5fc9dd46f0d8 100644 --- a/python/ray/train/lightgbm/lightgbm_predictor.py +++ b/python/ray/train/lightgbm/lightgbm_predictor.py @@ -32,7 +32,8 @@ def __init__( def __repr__(self): return ( - f"LightGBMPredictor(model={self.model}, 
preprocessor={self._preprocessor})" + f"{self.__class__.__name__}(model={self.model!r}, " + f"preprocessor={self._preprocessor!r})" ) @classmethod diff --git a/python/ray/train/rl/rl_predictor.py b/python/ray/train/rl/rl_predictor.py index e71aaab3f1da..0edc7f12f480 100644 --- a/python/ray/train/rl/rl_predictor.py +++ b/python/ray/train/rl/rl_predictor.py @@ -34,7 +34,10 @@ def __init__( super().__init__(preprocessor) def __repr__(self): - return f"RLPredictor(policy={self.policy}, preprocessor={self._preprocessor})" + return ( + f"{self.__class__.__name__}(policy={self.policy!r}, " + f"preprocessor={self._preprocessor!r})" + ) @classmethod def from_checkpoint( diff --git a/python/ray/train/sklearn/sklearn_predictor.py b/python/ray/train/sklearn/sklearn_predictor.py index d6796e7e77eb..42482e1110b3 100644 --- a/python/ray/train/sklearn/sklearn_predictor.py +++ b/python/ray/train/sklearn/sklearn_predictor.py @@ -37,8 +37,8 @@ def __init__( def __repr__(self): return ( - f"SklearnPredictor(estimator={self.estimator}, " - f"preprocessor={self._preprocessor})" + f"{self.__class__.__name__}(estimator={self.estimator!r}, " + f"preprocessor={self._preprocessor!r})" ) @classmethod diff --git a/python/ray/train/tensorflow/tensorflow_predictor.py b/python/ray/train/tensorflow/tensorflow_predictor.py index df4d47319628..8331f1222a92 100644 --- a/python/ray/train/tensorflow/tensorflow_predictor.py +++ b/python/ray/train/tensorflow/tensorflow_predictor.py @@ -77,10 +77,11 @@ def __init__( def __repr__(self): return ( - f"TensorflowPredictor(model_definition={self.model_definition}, " - f"preprocessor={self._preprocessor}, " - f"model_weights={self.model_weights}, " - f"use_gpu={self.use_gpu})" + f"{self.__class__.__name__}(" + f"model_definition={self.model_definition!r}, " + f"preprocessor={self._preprocessor!r}, " + f"model_weights={self.model_weights!r}, " + f"use_gpu={self.use_gpu!r})" ) @classmethod diff --git a/python/ray/train/torch/torch_predictor.py b/python/ray/train/torch/torch_predictor.py index 012c6296bb8f..add8d41c9808 100644 --- a/python/ray/train/torch/torch_predictor.py +++ b/python/ray/train/torch/torch_predictor.py @@ -63,9 +63,8 @@ def __init__( def __repr__(self): return ( - f"TorchPredictor(model={self.model}, " - f"preprocessor={self._preprocessor}, " - f"use_gpu={self.use_gpu})" + f"{self.__class__.__name__}(model={self.model!r}, " + f"preprocessor={self._preprocessor!r}, use_gpu={self.use_gpu!r})" ) @classmethod diff --git a/python/ray/train/xgboost/xgboost_predictor.py b/python/ray/train/xgboost/xgboost_predictor.py index 1ea9cad1aba6..7cf79d508070 100644 --- a/python/ray/train/xgboost/xgboost_predictor.py +++ b/python/ray/train/xgboost/xgboost_predictor.py @@ -31,7 +31,8 @@ def __init__( def __repr__(self): return ( - f"XGBoostPredictor(model={self.model}, preprocessor={self._preprocessor})" + f"{self.__class__.__name__}(model={self.model!r}, " + f"preprocessor={self._preprocessor!r})" ) @classmethod From 3cc78368f6474418367e3660037d527d94d13a13 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 12:29:37 -0700 Subject: [PATCH 16/30] Remove example --- python/ray/air/config.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/python/ray/air/config.py b/python/ray/air/config.py index f91778d4a4e5..e84b23fb7b2a 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -53,16 +53,6 @@ def _repr_dataclass(obj, *, default_values: Optional[Dict[str, Any]] = None) -> Returns: A representation of the dataclass. 
- - Examples: - >>> @dataclass - >>> class A: - >>> x: List = None - >>> def __post_init__(self): - >>> self.x = [] - >>> a = A() - >>> _repr_dataclass(a, default_values={"x": []}) - 'A()' """ if default_values is None: default_values = {} From a32763865282662ced7e026baa1709f0eb755fa1 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 14:01:22 -0700 Subject: [PATCH 17/30] Resolve merge issues --- python/ray/data/preprocessors/encoder.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/ray/data/preprocessors/encoder.py b/python/ray/data/preprocessors/encoder.py index 9867630ae7c6..980e2ba0fa7f 100644 --- a/python/ray/data/preprocessors/encoder.py +++ b/python/ray/data/preprocessors/encoder.py @@ -191,7 +191,8 @@ def _transform_pandas(self, df: pd.DataFrame): def __repr__(self): return ( - f"{self.__class__.__name__}(columns={self.columns!r}, limit={self.limit!r})" + f"{self.__class__.__name__}(columns={self.columns!r}, " + f"max_categories={self.limit!r})" ) @@ -278,7 +279,8 @@ def encode_list(element: list, *, name: str): def __repr__(self): return ( - f"{self.__class__.__name__}(columns={self.columns!r}, limit={self.limit!r})" + f"{self.__class__.__name__}(columns={self.columns!r}, " + f"max_categories={self.max_categories!r})" ) From 6ee74e31a6426063c65939bc7a6261f3a851e942 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 14:17:52 -0700 Subject: [PATCH 18/30] Add preprocessor repr tests --- python/ray/data/preprocessors/concatenator.py | 45 +++++++++++-------- python/ray/data/preprocessors/encoder.py | 2 +- python/ray/data/tests/test_preprocessors.py | 35 +++++++++++++++ 3 files changed, 62 insertions(+), 20 deletions(-) diff --git a/python/ray/data/preprocessors/concatenator.py b/python/ray/data/preprocessors/concatenator.py index 9a64c828967b..edca724b5825 100644 --- a/python/ray/data/preprocessors/concatenator.py +++ b/python/ray/data/preprocessors/concatenator.py @@ -56,24 +56,24 @@ def __init__( raise_if_missing: bool = False, ): self.output_column_name = output_column_name - self.included_columns = include - self.excluded_columns = exclude or [] + self.include = include + self.exclude = exclude or [] self.dtype = dtype self.raise_if_missing = raise_if_missing def _validate(self, df: pd.DataFrame): total_columns = set(df) - if self.excluded_columns and self.raise_if_missing: - missing_columns = set(self.excluded_columns) - total_columns.intersection( - set(self.excluded_columns) + if self.exclude and self.raise_if_missing: + missing_columns = set(self.exclude) - total_columns.intersection( + set(self.exclude) ) if missing_columns: raise ValueError( f"Missing columns specified in 'exclude': {missing_columns}" ) - if self.included_columns and self.raise_if_missing: - missing_columns = set(self.included_columns) - total_columns.intersection( - set(self.included_columns) + if self.include and self.raise_if_missing: + missing_columns = set(self.include) - total_columns.intersection( + set(self.include) ) if missing_columns: raise ValueError( @@ -84,10 +84,10 @@ def _transform_pandas(self, df: pd.DataFrame): self._validate(df) included_columns = set(df) - if self.included_columns: # subset of included columns - included_columns = set(self.included_columns) + if self.include: # subset of included columns + included_columns = set(self.include) - columns_to_concat = list(included_columns - set(self.excluded_columns)) + columns_to_concat = list(included_columns - set(self.exclude)) concatenated = 
df[columns_to_concat].to_numpy(dtype=self.dtype) df = df.drop(columns=columns_to_concat) try: @@ -98,11 +98,18 @@ def _transform_pandas(self, df: pd.DataFrame): return df def __repr__(self): - return ( - f"{self.__class__.__name__}(" - f"output_column_name={self.output_column_name!r}, " - f"include={self.included_columns!r}, " - f"exclude={self.excluded_columns!r}, " - f"dtype={self.dtype!r}, " - f"raise_if_missing={self.raise_if_missing!r})" - ) + default_values = { + "output_column_name": "concat_out", + "include": None, + "exclude": [], + "dtype": None, + "raise_if_missing": False, + } + + non_default_arguments = [] + for parameter, default_value in default_values.items(): + value = getattr(self, parameter) + if value != default_value: + non_default_arguments.append(f"{parameter}={value}") + + return f"{self.__class__.__name__}({', '.join(non_default_arguments)})" diff --git a/python/ray/data/preprocessors/encoder.py b/python/ray/data/preprocessors/encoder.py index 980e2ba0fa7f..58703f3cc72b 100644 --- a/python/ray/data/preprocessors/encoder.py +++ b/python/ray/data/preprocessors/encoder.py @@ -192,7 +192,7 @@ def _transform_pandas(self, df: pd.DataFrame): def __repr__(self): return ( f"{self.__class__.__name__}(columns={self.columns!r}, " - f"max_categories={self.limit!r})" + f"max_categories={self.max_categories!r})" ) diff --git a/python/ray/data/tests/test_preprocessors.py b/python/ray/data/tests/test_preprocessors.py index 09e108e3bf8e..f2498d8f0fd6 100644 --- a/python/ray/data/tests/test_preprocessors.py +++ b/python/ray/data/tests/test_preprocessors.py @@ -1,5 +1,6 @@ import warnings from collections import Counter +import re from unittest.mock import patch import numpy as np @@ -29,6 +30,8 @@ from ray.data.preprocessors.utils import simple_hash, simple_split_tokenizer from ray.data.preprocessors.vectorizer import CountVectorizer, HashingVectorizer +MAX_REPR_LENGTH = 100 + @pytest.fixture def create_dummy_preprocessors(): @@ -118,6 +121,38 @@ def test_standard_scaler(): assert pred_out_df.equals(pred_expected_df) +@pytest.mark.parametrize( + "preprocessor", + [ + BatchMapper(fn=lambda x: x), + Categorizer(columns=["X"]), + CountVectorizer(columns=["X"]), + Chain(StandardScaler(columns=["X"]), MinMaxScaler(columns=["X"])), + FeatureHasher(columns=["X"], num_features=1), + HashingVectorizer(columns=["X"], num_features=1), + LabelEncoder(label_column="X"), + MaxAbsScaler(columns=["X"]), + MinMaxScaler(columns=["X"]), + MultiHotEncoder(columns=["X"]), + Normalizer(columns=["X"]), + OneHotEncoder(columns=["X"]), + OrdinalEncoder(columns=["X"]), + PowerTransformer(columns=["X"], power=1), + RobustScaler(columns=["X"]), + SimpleImputer(columns=["X"]), + StandardScaler(columns=["X"]), + Concatenator(), + Tokenizer(columns=["X"]), + ], +) +def test_repr(preprocessor): + representation = repr(preprocessor) + + assert len(representation) < MAX_REPR_LENGTH + pattern = re.compile(f"^{preprocessor.__class__.__name__}\\((.*)\\)$") + assert pattern.match(representation) + + @patch.object(warnings, "warn") def test_fit_twice(mocked_warn): """Tests that a warning msg should be printed.""" From 13efed346788ebd6bc1f808b1c88a3a24cdfa94a Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 14:30:52 -0700 Subject: [PATCH 19/30] Add checkpoint tests --- python/ray/air/tests/test_checkpoints.py | 15 ++++++++++ python/ray/train/tests/test_checkpoints.py | 32 ++++++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 python/ray/train/tests/test_checkpoints.py diff --git 
a/python/ray/air/tests/test_checkpoints.py b/python/ray/air/tests/test_checkpoints.py index c00b3cc8af17..479a0d425b92 100644 --- a/python/ray/air/tests/test_checkpoints.py +++ b/python/ray/air/tests/test_checkpoints.py @@ -1,5 +1,7 @@ import os import pickle +import pytest +import re import shutil import tempfile import unittest @@ -20,6 +22,19 @@ def transform_batch(self, df): return df * self.multiplier +MAX_REPR_LENGTH = 100 + + +def test_repr(): + checkpoint = Checkpoint(data_dict={"foo": "bar"}) + + representation = repr(checkpoint) + + assert len(representation) < MAX_REPR_LENGTH + pattern = re.compile(f"^Checkpoint\\((.*)\\)$") + assert pattern.match(representation) + + class CheckpointsConversionTest(unittest.TestCase): def setUp(self): self.tmpdir = os.path.realpath(tempfile.mkdtemp()) diff --git a/python/ray/train/tests/test_checkpoints.py b/python/ray/train/tests/test_checkpoints.py new file mode 100644 index 000000000000..d64b125f392c --- /dev/null +++ b/python/ray/train/tests/test_checkpoints.py @@ -0,0 +1,32 @@ +import re + +import pytest + +from ray.train.huggingface import HuggingFaceCheckpoint +from ray.train.lightgbm import LightGBMCheckpoint +from ray.train.rl import RLCheckpoint +from ray.train.sklearn import SklearnCheckpoint +from ray.train.tensorflow import TensorflowCheckpoint +from ray.train.xgboost import XGBoostCheckpoint +from ray.train.torch import TorchCheckpoint + +MAX_REPR_LENGTH = 100 + + +@pytest.mark.parametrize("checkpoint", + [ + HuggingFaceCheckpoint(data_dict={"foo": "bar"}), + LightGBMCheckpoint(data_dict={"foo": "bar"}), + RLCheckpoint(data_dict={"foo": "bar"}), + SklearnCheckpoint(data_dict={"foo": "bar"}), + TensorflowCheckpoint(data_dict={"foo": "bar"}), + XGBoostCheckpoint(data_dict={"foo": "bar"}), + TorchCheckpoint(data_dict={"foo": "bar"}), + + ]) +def test_repr(checkpoint): + representation = repr(checkpoint) + + assert len(representation) < MAX_REPR_LENGTH + pattern = re.compile(f"^{checkpoint.__class__.__name__}\\((.*)\\)$") + assert pattern.match(representation) From c7bdad0177865eaad30be1a49699d5bfcae026be Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 14:31:22 -0700 Subject: [PATCH 20/30] Appease lint --- python/ray/air/tests/test_checkpoints.py | 3 +-- python/ray/train/tests/test_checkpoints.py | 7 ++++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/ray/air/tests/test_checkpoints.py b/python/ray/air/tests/test_checkpoints.py index 479a0d425b92..45d0afa5596b 100644 --- a/python/ray/air/tests/test_checkpoints.py +++ b/python/ray/air/tests/test_checkpoints.py @@ -1,6 +1,5 @@ import os import pickle -import pytest import re import shutil import tempfile @@ -31,7 +30,7 @@ def test_repr(): representation = repr(checkpoint) assert len(representation) < MAX_REPR_LENGTH - pattern = re.compile(f"^Checkpoint\\((.*)\\)$") + pattern = re.compile("^Checkpoint\\((.*)\\)$") assert pattern.match(representation) diff --git a/python/ray/train/tests/test_checkpoints.py b/python/ray/train/tests/test_checkpoints.py index d64b125f392c..e97f584beef9 100644 --- a/python/ray/train/tests/test_checkpoints.py +++ b/python/ray/train/tests/test_checkpoints.py @@ -13,7 +13,8 @@ MAX_REPR_LENGTH = 100 -@pytest.mark.parametrize("checkpoint", +@pytest.mark.parametrize( + "checkpoint", [ HuggingFaceCheckpoint(data_dict={"foo": "bar"}), LightGBMCheckpoint(data_dict={"foo": "bar"}), @@ -22,8 +23,8 @@ TensorflowCheckpoint(data_dict={"foo": "bar"}), XGBoostCheckpoint(data_dict={"foo": "bar"}), TorchCheckpoint(data_dict={"foo": 
"bar"}), - - ]) + ], +) def test_repr(checkpoint): representation = repr(checkpoint) From a54fdb67b3f99ebb8cccc374c3589784500aa510 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 15:01:22 -0700 Subject: [PATCH 21/30] Add some tests --- python/ray/air/constants.py | 4 ++++ python/ray/air/tests/test_checkpoints.py | 5 +---- python/ray/air/tests/test_configs.py | 24 +++++++++++++++++++++ python/ray/data/tests/test_preprocessors.py | 8 +------ python/ray/train/tests/test_checkpoints.py | 4 +--- 5 files changed, 31 insertions(+), 14 deletions(-) create mode 100644 python/ray/air/tests/test_configs.py diff --git a/python/ray/air/constants.py b/python/ray/air/constants.py index 65772677502e..2b5a5e7defdb 100644 --- a/python/ray/air/constants.py +++ b/python/ray/air/constants.py @@ -18,3 +18,7 @@ # Name to use for the column when representing tensors in table format. TENSOR_COLUMN_NAME = "__value__" + +# The maximum length of strings returned by `__repr__` for AIR objects constructed with +# default values. +MAX_REPR_LENGTH = 100 diff --git a/python/ray/air/tests/test_checkpoints.py b/python/ray/air/tests/test_checkpoints.py index 45d0afa5596b..199d7c3da9ae 100644 --- a/python/ray/air/tests/test_checkpoints.py +++ b/python/ray/air/tests/test_checkpoints.py @@ -9,7 +9,7 @@ import ray from ray.air._internal.remote_storage import delete_at_uri, _ensure_directory from ray.air.checkpoint import Checkpoint, _DICT_CHECKPOINT_ADDITIONAL_FILE_KEY -from ray.air.constants import PREPROCESSOR_KEY +from ray.air.constants import MAX_REPR_LENGTH, PREPROCESSOR_KEY from ray.data import Preprocessor @@ -21,9 +21,6 @@ def transform_batch(self, df): return df * self.multiplier -MAX_REPR_LENGTH = 100 - - def test_repr(): checkpoint = Checkpoint(data_dict={"foo": "bar"}) diff --git a/python/ray/air/tests/test_configs.py b/python/ray/air/tests/test_configs.py new file mode 100644 index 000000000000..712d2d9e1bc2 --- /dev/null +++ b/python/ray/air/tests/test_configs.py @@ -0,0 +1,24 @@ +import pytest + +from ray.air.config import ScalingConfig, DatasetConfig, FailureConfig, CheckpointConfig, RunConfig +from ray.air.constants import MAX_REPR_LENGTH + +@pytest.mark.parametrize("config", + [ + ScalingConfig(), + ScalingConfig(use_gpu=True), + DatasetConfig(), + DatasetConfig(fit=True), + FailureConfig(), + FailureConfig(max_failures=2), + CheckpointConfig(), + CheckpointConfig(num_to_keep=1), + RunConfig(), + RunConfig(name="experiment"), + RunConfig(failure_config=FailureConfig()) + ]) +def test_repr(config): + representation = repr(config) + + assert eval(representation) == config + assert len(representation) < MAX_REPR_LENGTH diff --git a/python/ray/data/tests/test_preprocessors.py b/python/ray/data/tests/test_preprocessors.py index f2498d8f0fd6..771a8bc08423 100644 --- a/python/ray/data/tests/test_preprocessors.py +++ b/python/ray/data/tests/test_preprocessors.py @@ -29,8 +29,7 @@ from ray.data.preprocessors.transformer import PowerTransformer from ray.data.preprocessors.utils import simple_hash, simple_split_tokenizer from ray.data.preprocessors.vectorizer import CountVectorizer, HashingVectorizer - -MAX_REPR_LENGTH = 100 +from ray.air.constants import MAX_REPR_LENGTH @pytest.fixture @@ -1232,11 +1231,6 @@ def test_concatenator(): for i, row in enumerate(new_ds.take()): assert np.array_equal(row["c"].to_numpy(), np.array([i + 1, i + 1])) - # Test repr - assert "c" in prep.__repr__() - assert "include" in prep.__repr__() - assert "exclude" in prep.__repr__() - df = pd.DataFrame({"a": [1, 2, 3, 
4]}) ds = ray.data.from_pandas(df) prep = Concatenator(output_column_name="c", exclude=["b"], raise_if_missing=True) diff --git a/python/ray/train/tests/test_checkpoints.py b/python/ray/train/tests/test_checkpoints.py index e97f584beef9..fab276742280 100644 --- a/python/ray/train/tests/test_checkpoints.py +++ b/python/ray/train/tests/test_checkpoints.py @@ -2,6 +2,7 @@ import pytest +from ray.air.constants import MAX_REPR_LENGTH from ray.train.huggingface import HuggingFaceCheckpoint from ray.train.lightgbm import LightGBMCheckpoint from ray.train.rl import RLCheckpoint @@ -10,9 +11,6 @@ from ray.train.xgboost import XGBoostCheckpoint from ray.train.torch import TorchCheckpoint -MAX_REPR_LENGTH = 100 - - @pytest.mark.parametrize( "checkpoint", [ From e3924b9211cb19d63acf45192241ea87f8e87d1b Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 15:49:59 -0700 Subject: [PATCH 22/30] Add tests --- python/ray/air/constants.py | 2 +- python/ray/air/tests/test_configs.py | 25 ++++++++++++++++--- python/ray/train/batch_predictor.py | 6 +++++ .../train/tensorflow/tensorflow_predictor.py | 3 ++- python/ray/train/tests/test_base_trainer.py | 18 +++++++++++++ .../ray/train/tests/test_batch_predictor.py | 16 +++++++++++- python/ray/train/tests/test_checkpoints.py | 9 +++++++ .../train/tests/test_huggingface_predictor.py | 12 +++++++++ .../train/tests/test_lightgbm_predictor.py | 13 +++++++++- python/ray/train/tests/test_rl_predictor.py | 13 ++++++++++ .../ray/train/tests/test_sklearn_predictor.py | 13 +++++++++- .../train/tests/test_tensorflow_predictor.py | 13 +++++++++- .../ray/train/tests/test_torch_predictor.py | 13 +++++++++- 13 files changed, 145 insertions(+), 11 deletions(-) diff --git a/python/ray/air/constants.py b/python/ray/air/constants.py index 2b5a5e7defdb..3706319fae73 100644 --- a/python/ray/air/constants.py +++ b/python/ray/air/constants.py @@ -21,4 +21,4 @@ # The maximum length of strings returned by `__repr__` for AIR objects constructed with # default values. 
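This constant backs the length check that the new repr tests in this series share. A minimal sketch of that shared check follows; the `check_repr` helper is a hypothetical consolidation for illustration, not part of the patch, and it assumes a Ray build containing these changes.

import re

from ray.air.config import ScalingConfig
from ray.air.constants import MAX_REPR_LENGTH


def check_repr(obj) -> None:
    representation = repr(obj)
    # Keep reprs short enough to read comfortably in a terminal.
    assert len(representation) < MAX_REPR_LENGTH
    # And keep them in the `ClassName(...)` form the tests match with a regex.
    assert re.match(rf"^{obj.__class__.__name__}\((.*)\)$", representation)


check_repr(ScalingConfig(use_gpu=True))

The hunk that follows raises the bound from 100 to int(80 * 1.5), that is, up to one and a half 80-column lines.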
-MAX_REPR_LENGTH = 100 +MAX_REPR_LENGTH = int(80 * 1.5) diff --git a/python/ray/air/tests/test_configs.py b/python/ray/air/tests/test_configs.py index 712d2d9e1bc2..71a3d4542d88 100644 --- a/python/ray/air/tests/test_configs.py +++ b/python/ray/air/tests/test_configs.py @@ -1,9 +1,17 @@ import pytest -from ray.air.config import ScalingConfig, DatasetConfig, FailureConfig, CheckpointConfig, RunConfig +from ray.air.config import ( + ScalingConfig, + DatasetConfig, + FailureConfig, + CheckpointConfig, + RunConfig, +) from ray.air.constants import MAX_REPR_LENGTH -@pytest.mark.parametrize("config", + +@pytest.mark.parametrize( + "config", [ ScalingConfig(), ScalingConfig(use_gpu=True), @@ -15,10 +23,19 @@ CheckpointConfig(num_to_keep=1), RunConfig(), RunConfig(name="experiment"), - RunConfig(failure_config=FailureConfig()) - ]) + RunConfig(failure_config=FailureConfig()), + ], +) def test_repr(config): representation = repr(config) assert eval(representation) == config assert len(representation) < MAX_REPR_LENGTH + + +if __name__ == "__main__": + import sys + + import pytest + + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/train/batch_predictor.py b/python/ray/train/batch_predictor.py index 78b3e9d45085..5d5ef8a74ced 100644 --- a/python/ray/train/batch_predictor.py +++ b/python/ray/train/batch_predictor.py @@ -32,6 +32,12 @@ def __init__( self._predictor_kwargs = predictor_kwargs self._override_preprocessor: Optional[Preprocessor] = None + def __repr__(self): + return ( + f"{self.__class__.__name__}(checkpoint={self._checkpoint}, " + f"predictor_cls={self._predictor_cls.__name__})" + ) + @classmethod def from_checkpoint( cls, checkpoint: Checkpoint, predictor_cls: Type[Predictor], **kwargs diff --git a/python/ray/train/tensorflow/tensorflow_predictor.py b/python/ray/train/tensorflow/tensorflow_predictor.py index 8331f1222a92..5a3c579908b8 100644 --- a/python/ray/train/tensorflow/tensorflow_predictor.py +++ b/python/ray/train/tensorflow/tensorflow_predictor.py @@ -76,9 +76,10 @@ def __init__( super().__init__(preprocessor) def __repr__(self): + fn_name = getattr(self.model_definition, "__name__", self.model_definition) return ( f"{self.__class__.__name__}(" - f"model_definition={self.model_definition!r}, " + f"model_definition={fn_name}, " f"preprocessor={self._preprocessor!r}, " f"model_weights={self.model_weights!r}, " f"use_gpu={self.use_gpu!r})" diff --git a/python/ray/train/tests/test_base_trainer.py b/python/ray/train/tests/test_base_trainer.py index 337f342df212..394d07729ce4 100644 --- a/python/ray/train/tests/test_base_trainer.py +++ b/python/ray/train/tests/test_base_trainer.py @@ -9,6 +9,7 @@ import ray from ray import tune +from ray.air.constants import MAX_REPR_LENGTH from ray.data.preprocessor import Preprocessor from ray.train import base_trainer from ray.train.data_parallel_trainer import DataParallelTrainer @@ -297,6 +298,23 @@ def test_trainable_name_is_overriden_gbdt_trainer(ray_start_4_cpus): _is_trainable_name_overriden(trainer) +def test_repr(): + def training_loop(self): + pass + + trainer = DummyTrainer( + training_loop, + datasets={ + "train": ray.data.from_items([1, 2, 3]), + }, + ) + + representation = repr(trainer) + + assert "DummyTrainer" in representation + assert len(representation) < MAX_REPR_LENGTH + + if __name__ == "__main__": import sys diff --git a/python/ray/train/tests/test_batch_predictor.py b/python/ray/train/tests/test_batch_predictor.py index 3457a98aeedc..b6978644094a 100644 --- 
a/python/ray/train/tests/test_batch_predictor.py +++ b/python/ray/train/tests/test_batch_predictor.py @@ -1,9 +1,10 @@ +import re import time from typing import Optional import pandas as pd import pytest -from ray.air.constants import PREPROCESSOR_KEY +from ray.air.constants import MAX_REPR_LENGTH, PREPROCESSOR_KEY import ray from ray.air.checkpoint import Checkpoint @@ -23,6 +24,19 @@ def _transform_pandas(self, df): return df * self.multiplier +def test_repr(): + predictor = BatchPredictor.from_checkpoint( + Checkpoint.from_dict({"factor": 2.0}), + DummyPredictorFS, + ) + + representation = repr(predictor) + + assert len(representation) < MAX_REPR_LENGTH + pattern = re.compile("^BatchPredictor\\((.*)\\)$") + assert pattern.match(representation) + + class DummyPredictor(Predictor): def __init__( self, diff --git a/python/ray/train/tests/test_checkpoints.py b/python/ray/train/tests/test_checkpoints.py index fab276742280..b99f42501e96 100644 --- a/python/ray/train/tests/test_checkpoints.py +++ b/python/ray/train/tests/test_checkpoints.py @@ -11,6 +11,7 @@ from ray.train.xgboost import XGBoostCheckpoint from ray.train.torch import TorchCheckpoint + @pytest.mark.parametrize( "checkpoint", [ @@ -29,3 +30,11 @@ def test_repr(checkpoint): assert len(representation) < MAX_REPR_LENGTH pattern = re.compile(f"^{checkpoint.__class__.__name__}\\((.*)\\)$") assert pattern.match(representation) + + +if __name__ == "__main__": + import sys + + import pytest + + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/train/tests/test_huggingface_predictor.py b/python/ray/train/tests/test_huggingface_predictor.py index 447cf3dcec70..cdb36985e546 100644 --- a/python/ray/train/tests/test_huggingface_predictor.py +++ b/python/ray/train/tests/test_huggingface_predictor.py @@ -1,10 +1,12 @@ import os +import re import tempfile import numpy as np import pandas as pd import pyarrow as pa import pytest +from ray.air.constants import MAX_REPR_LENGTH from ray.air.util.data_batch_conversion import convert_pandas_to_batch_type from ray.train.predictor import TYPE_TO_ENUM from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer @@ -41,6 +43,16 @@ def transform_batch(self, df): return df +def test_repr(tmpdir): + predictor = HuggingFacePredictor() + + representation = repr(predictor) + + assert len(representation) < MAX_REPR_LENGTH + pattern = re.compile("^HuggingFacePredictor\\((.*)\\)$") + assert pattern.match(representation) + + @pytest.mark.parametrize("batch_type", [np.ndarray, pd.DataFrame, pa.Table, dict]) def test_predict(tmpdir, ray_start_runtime_env, batch_type): dtype_prompts = convert_pandas_to_batch_type(prompts, type=TYPE_TO_ENUM[batch_type]) diff --git a/python/ray/train/tests/test_lightgbm_predictor.py b/python/ray/train/tests/test_lightgbm_predictor.py index deda0ae21193..0c7574685502 100644 --- a/python/ray/train/tests/test_lightgbm_predictor.py +++ b/python/ray/train/tests/test_lightgbm_predictor.py @@ -1,4 +1,5 @@ import os +import re import tempfile import lightgbm as lgbm @@ -9,7 +10,7 @@ from ray.air._internal.checkpointing import save_preprocessor_to_dir from ray.air.checkpoint import Checkpoint -from ray.air.constants import MODEL_KEY +from ray.air.constants import MAX_REPR_LENGTH, MODEL_KEY from ray.air.util.data_batch_conversion import convert_pandas_to_batch_type from ray.data.preprocessor import Preprocessor from ray.train.lightgbm import LightGBMCheckpoint, LightGBMPredictor @@ -31,6 +32,16 @@ def get_num_trees(booster: lgbm.Booster) -> int: return 
booster.current_iteration() +def test_repr(): + predictor = LightGBMPredictor(model=model) + + representation = repr(predictor) + + assert len(representation) < MAX_REPR_LENGTH + pattern = re.compile("^LightGBMPredictor\\((.*)\\)$") + assert pattern.match(representation) + + def test_init(): preprocessor = DummyPreprocessor() preprocessor.attr = 1 diff --git a/python/ray/train/tests/test_rl_predictor.py b/python/ray/train/tests/test_rl_predictor.py index 09122beb66a7..e1617d88b80b 100644 --- a/python/ray/train/tests/test_rl_predictor.py +++ b/python/ray/train/tests/test_rl_predictor.py @@ -1,3 +1,4 @@ +import re import tempfile from typing import Optional @@ -8,6 +9,7 @@ import pytest from ray.air.checkpoint import Checkpoint +from ray.air.constants import MAX_REPR_LENGTH from ray.air.util.data_batch_conversion import ( convert_pandas_to_batch_type, convert_batch_type_to_pandas, @@ -79,6 +81,17 @@ def create_checkpoint( return Checkpoint.from_dict(checkpoint_data) +def test_repr(): + checkpoint = create_checkpoint() + predictor = RLPredictor.from_checkpoint(checkpoint) + + representation = repr(predictor) + + assert len(representation) < MAX_REPR_LENGTH + pattern = re.compile("^RLPredictor\\((.*)\\)$") + assert pattern.match(representation) + + @pytest.mark.parametrize("batch_type", [np.ndarray, pd.DataFrame, pa.Table, dict]) @pytest.mark.parametrize("batch_size", [1, 20]) def test_predict_no_preprocessor(batch_type, batch_size): diff --git a/python/ray/train/tests/test_sklearn_predictor.py b/python/ray/train/tests/test_sklearn_predictor.py index 708699d7c9e9..8e42ada4debe 100644 --- a/python/ray/train/tests/test_sklearn_predictor.py +++ b/python/ray/train/tests/test_sklearn_predictor.py @@ -1,4 +1,5 @@ import os +import re import tempfile import numpy as np @@ -13,7 +14,7 @@ import ray.cloudpickle as cpickle from ray.air._internal.checkpointing import save_preprocessor_to_dir from ray.air.checkpoint import Checkpoint -from ray.air.constants import MODEL_KEY +from ray.air.constants import MAX_REPR_LENGTH, MODEL_KEY from ray.data.preprocessor import Preprocessor from ray.train.batch_predictor import BatchPredictor from ray.train.sklearn import SklearnCheckpoint, SklearnPredictor @@ -40,6 +41,16 @@ def transform_batch(self, df): ) +def test_repr(): + predictor = SklearnPredictor(estimator=model) + + representation = repr(predictor) + + assert len(representation) < MAX_REPR_LENGTH + pattern = re.compile("^SklearnPredictor\\((.*)\\)$") + assert pattern.match(representation) + + def test_init(): preprocessor = DummyPreprocessor() preprocessor.attr = 1 diff --git a/python/ray/train/tests/test_tensorflow_predictor.py b/python/ray/train/tests/test_tensorflow_predictor.py index 22caec6e720f..86bf49569402 100644 --- a/python/ray/train/tests/test_tensorflow_predictor.py +++ b/python/ray/train/tests/test_tensorflow_predictor.py @@ -1,3 +1,4 @@ +import re import numpy as np import pandas as pd import pyarrow as pa @@ -6,7 +7,7 @@ import ray from ray.air.checkpoint import Checkpoint -from ray.air.constants import MODEL_KEY, PREPROCESSOR_KEY +from ray.air.constants import MAX_REPR_LENGTH, MODEL_KEY, PREPROCESSOR_KEY from ray.air.util.data_batch_conversion import ( convert_pandas_to_batch_type, convert_batch_type_to_pandas, @@ -52,6 +53,16 @@ def build_model_multi_output() -> tf.keras.Model: weights = [np.array([[2.0]]), np.array([0.0])] +def test_repr(): + predictor = TensorflowPredictor(model_definition=build_model) + + representation = repr(predictor) + + assert len(representation) < MAX_REPR_LENGTH + 
pattern = re.compile("^TensorflowPredictor\\((.*)\\)$") + assert pattern.match(representation) + + def test_init(): preprocessor = DummyPreprocessor() predictor = TensorflowPredictor( diff --git a/python/ray/train/tests/test_torch_predictor.py b/python/ray/train/tests/test_torch_predictor.py index d71ffc9d1ee4..990a57810ba8 100644 --- a/python/ray/train/tests/test_torch_predictor.py +++ b/python/ray/train/tests/test_torch_predictor.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pandas as pd import pyarrow as pa @@ -5,7 +7,7 @@ import torch from ray.air.checkpoint import Checkpoint -from ray.air.constants import MODEL_KEY, PREPROCESSOR_KEY +from ray.air.constants import MAX_REPR_LENGTH, MODEL_KEY, PREPROCESSOR_KEY from ray.air.util.data_batch_conversion import ( convert_pandas_to_batch_type, convert_batch_type_to_pandas, @@ -45,6 +47,15 @@ def preprocessor(): return DummyPreprocessor() +def test_repr(model): + predictor = TorchPredictor(model=model) + + representation = repr(predictor) + + assert len(representation) < MAX_REPR_LENGTH + pattern = re.compile("^TorchPredictor\\((.*)\\)$") + assert pattern.match(representation) + def test_init(model, preprocessor): predictor = TorchPredictor(model=model, preprocessor=preprocessor) From e0682ffaacca9022dc91ddbb436cfdb7950466eb Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 15:50:25 -0700 Subject: [PATCH 23/30] Remove whitespace --- python/ray/train/tests/test_torch_predictor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/ray/train/tests/test_torch_predictor.py b/python/ray/train/tests/test_torch_predictor.py index 990a57810ba8..a97fdbc9795a 100644 --- a/python/ray/train/tests/test_torch_predictor.py +++ b/python/ray/train/tests/test_torch_predictor.py @@ -56,6 +56,7 @@ def test_repr(model): pattern = re.compile("^TorchPredictor\\((.*)\\)$") assert pattern.match(representation) + def test_init(model, preprocessor): predictor = TorchPredictor(model=model, preprocessor=preprocessor) From 74e297d3cf716e0bce8d3189c9c834ea2767d298 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 15:54:05 -0700 Subject: [PATCH 24/30] Add tests to `BUILD` --- python/ray/air/BUILD | 8 ++++++++ python/ray/train/BUILD | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/python/ray/air/BUILD b/python/ray/air/BUILD index 29b7cb099fe0..ed9f0fa19758 100644 --- a/python/ray/air/BUILD +++ b/python/ray/air/BUILD @@ -179,6 +179,14 @@ py_test( deps = [":ml_lib"] ) +py_test( + name = "test_configs", + size = "small", + srcs = ["tests/test_configs.py"], + tags = ["team:ml", "exclusive"], + deps = [":ml_lib"] +) + py_test( name = "test_data_batch_conversion", size = "small", diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index dce27aacb631..307257061726 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -137,6 +137,14 @@ py_test( deps = [":train_lib"] ) +py_test( + name = "test_checkpoints", + size = "small", + srcs = ["tests/test_checkpoints.py"], + tags = ["team:ml", "exclusive"], + deps = [":train_lib"] +) + py_test( name = "test_data_parallel_trainer", size = "medium", From 125e1165531329f50149a8790fa7061fee1951b3 Mon Sep 17 00:00:00 2001 From: Richard Liaw Date: Tue, 26 Jul 2022 19:36:32 -0700 Subject: [PATCH 25/30] Update python/ray/train/tests/test_batch_predictor.py --- python/ray/train/tests/test_batch_predictor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/train/tests/test_batch_predictor.py 
b/python/ray/train/tests/test_batch_predictor.py index b6978644094a..fee887f352fe 100644 --- a/python/ray/train/tests/test_batch_predictor.py +++ b/python/ray/train/tests/test_batch_predictor.py @@ -24,7 +24,7 @@ def _transform_pandas(self, df): return df * self.multiplier -def test_repr(): +def test_repr(shutdown_only): predictor = BatchPredictor.from_checkpoint( Checkpoint.from_dict({"factor": 2.0}), DummyPredictorFS, From 85b69669d5dd4bd449546d0fe1cc758564e6a61b Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 22:03:33 -0700 Subject: [PATCH 26/30] Fix dataset tests --- python/ray/data/tests/conftest.py | 8 +-- python/ray/data/tests/test_dataset.py | 36 +++++------ python/ray/data/tests/test_dataset_formats.py | 64 +++++++++---------- 3 files changed, 54 insertions(+), 54 deletions(-) diff --git a/python/ray/data/tests/conftest.py b/python/ray/data/tests/conftest.py index 1b6f3f0e429d..5ec54a620b06 100644 --- a/python/ray/data/tests/conftest.py +++ b/python/ray/data/tests/conftest.py @@ -205,12 +205,12 @@ def _assert_base_partitioned_ds( actual_input_files = ds.input_files() assert len(actual_input_files) == num_input_files, actual_input_files assert ( - str(ds) == f"Dataset(num_blocks={num_input_files}, num_rows={num_rows}, " - f"schema={schema})" + str(ds) == f"" ), ds assert ( - repr(ds) == f"Dataset(num_blocks={num_input_files}, num_rows={num_rows}, " - f"schema={schema})" + repr(ds) == f"" ), ds if num_computed is not None: assert ( diff --git a/python/ray/data/tests/test_dataset.py b/python/ray/data/tests/test_dataset.py index 10bd3b12125e..8afc8891e377 100644 --- a/python/ray/data/tests/test_dataset.py +++ b/python/ray/data/tests/test_dataset.py @@ -378,7 +378,7 @@ def test_batch_tensors(ray_start_regular_shared): import torch ds = ray.data.from_items([torch.tensor([0, 0]) for _ in range(40)], parallelism=40) - res = "Dataset(num_blocks=40, num_rows=40, schema=)" + res = ">" assert str(ds) == res, str(ds) with pytest.raises(pa.lib.ArrowInvalid): next(ds.iter_batches(batch_format="pyarrow")) @@ -626,8 +626,8 @@ def test_tensors_basic(ray_start_regular_shared): tensor_shape = (3, 5) ds = ray.data.range_tensor(6, shape=tensor_shape, parallelism=6) assert str(ds) == ( - "Dataset(num_blocks=6, num_rows=6, " - "schema={__value__: ArrowTensorType(shape=(3, 5), dtype=int64)})" + "" ) assert ds.size_bytes() == 5 * 3 * 6 * 8 @@ -817,8 +817,8 @@ def test_tensors_inferred_from_map(ray_start_regular_shared): # Test map. ds = ray.data.range(10, parallelism=10).map(lambda _: np.ones((4, 4))) assert str(ds) == ( - "Dataset(num_blocks=10, num_rows=10, " - "schema={__value__: ArrowTensorType(shape=(4, 4), dtype=double)})" + "" ) # Test map_batches. @@ -826,8 +826,8 @@ def test_tensors_inferred_from_map(ray_start_regular_shared): lambda _: np.ones((3, 4, 4)), batch_size=2 ) assert str(ds) == ( - "Dataset(num_blocks=4, num_rows=24, " - "schema={__value__: ArrowTensorType(shape=(4, 4), dtype=double)})" + "" ) # Test flat_map. @@ -835,8 +835,8 @@ def test_tensors_inferred_from_map(ray_start_regular_shared): lambda _: [np.ones((4, 4)), np.ones((4, 4))] ) assert str(ds) == ( - "Dataset(num_blocks=10, num_rows=20, " - "schema={__value__: ArrowTensorType(shape=(4, 4), dtype=double)})" + "" ) # Test map_batches ndarray column. 
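The assertions in these dataset tests compare complete repr strings. For orientation, a minimal sketch of producing such a string; the schema rendering in the comment is taken from assertions later in this series and may vary with the installed Ray and pandas versions.

import pandas as pd

import ray

df = pd.DataFrame({"1": [0.1, 0.2], "a": [0.3, 0.4]})
ds = ray.data.from_pandas(df)
# Expected to render along the lines of:
# Dataset(num_blocks=1, num_rows=2, schema={1: float64, a: float64})
print(repr(ds))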
@@ -844,15 +844,15 @@ def test_tensors_inferred_from_map(ray_start_regular_shared): lambda _: pd.DataFrame({"a": [np.ones((4, 4))] * 3}), batch_size=2 ) assert str(ds) == ( - "Dataset(num_blocks=4, num_rows=24, " - "schema={a: TensorDtype(shape=(4, 4), dtype=float64)})" + "" ) # Test map_batches ragged ndarray column falls back to opaque object-typed column. ds = ray.data.range(16, parallelism=4).map_batches( lambda _: pd.DataFrame({"a": [np.ones((2, 2)), np.ones((3, 3))]}), batch_size=2 ) - assert str(ds) == ("Dataset(num_blocks=4, num_rows=16, schema={a: object})") + assert str(ds) == ("") def test_tensors_in_tables_from_pandas(ray_start_regular_shared): @@ -1314,7 +1314,7 @@ def test_empty_dataset(ray_start_regular_shared): ds = ray.data.range(1) ds = ds.filter(lambda x: x > 1) - assert str(ds) == "Dataset(num_blocks=1, num_rows=0, schema=Unknown schema)" + assert str(ds) == "" # Test map on empty dataset. ds = ray.data.from_items([]) @@ -1332,11 +1332,11 @@ def test_schema(ray_start_regular_shared): ds2 = ray.data.range_table(10, parallelism=10) ds3 = ds2.repartition(5) ds4 = ds3.map(lambda x: {"a": "hi", "b": 1.0}).limit(5).repartition(1) - assert str(ds) == "Dataset(num_blocks=10, num_rows=10, schema=)" - assert str(ds2) == "Dataset(num_blocks=10, num_rows=10, schema={value: int64})" - assert str(ds3) == "Dataset(num_blocks=5, num_rows=10, schema={value: int64})" + assert str(ds) == ">" + assert str(ds2) == "" + assert str(ds3) == "" assert ( - str(ds4) == "Dataset(num_blocks=1, num_rows=5, schema={a: string, b: double})" + str(ds4) == "" ) @@ -4277,7 +4277,7 @@ def test_groupby_simple_multi_agg(ray_start_regular_shared, num_parts): def test_column_name_type_check(ray_start_regular_shared): df = pd.DataFrame({"1": np.random.rand(10), "a": np.random.rand(10)}) ds = ray.data.from_pandas(df) - expected_str = "Dataset(num_blocks=1, num_rows=10, schema={1: float64, a: float64})" + expected_str = "" assert str(ds) == expected_str, str(ds) df = pd.DataFrame({1: np.random.rand(10), "a": np.random.rand(10)}) with pytest.raises(ValueError): diff --git a/python/ray/data/tests/test_dataset_formats.py b/python/ray/data/tests/test_dataset_formats.py index cad0525eba97..05d90d6d6875 100644 --- a/python/ray/data/tests/test_dataset_formats.py +++ b/python/ray/data/tests/test_dataset_formats.py @@ -462,12 +462,12 @@ def test_parquet_read_basic(ray_start_regular_shared, fs, data_path): assert "test1.parquet" in str(input_files) assert "test2.parquet" in str(input_files) assert ( - str(ds) == "Dataset(num_blocks=2, num_rows=6, " - "schema={one: int64, two: string})" + str(ds) == "" ), ds assert ( - repr(ds) == "Dataset(num_blocks=2, num_rows=6, " - "schema={one: int64, two: string})" + repr(ds) == "" ), ds assert ds._plan.execute()._num_computed() == 1 @@ -533,12 +533,12 @@ def prefetch_file_metadata(self, pieces): assert "test1.parquet" in str(input_files) assert "test2.parquet" in str(input_files) assert ( - str(ds) == "Dataset(num_blocks=2, num_rows=6, " - "schema={one: int64, two: string})" + str(ds) == "" ), ds assert ( - repr(ds) == "Dataset(num_blocks=2, num_rows=6, " - "schema={one: int64, two: string})" + repr(ds) == "" ), ds assert ds._plan.execute()._num_computed() == 2 @@ -605,12 +605,12 @@ def test_parquet_read_bulk(ray_start_regular_shared, fs, data_path): assert "test1.parquet" in str(input_files) assert "test2.parquet" in str(input_files) assert ( - str(ds) == "Dataset(num_blocks=2, num_rows=6, " - "schema={one: int64, two: string})" + str(ds) == "" ), ds assert ( - repr(ds) == 
"Dataset(num_blocks=2, num_rows=6, " - "schema={one: int64, two: string})" + repr(ds) == "" ), ds assert ds._plan.execute()._num_computed() == 2 @@ -691,12 +691,12 @@ def test_parquet_read_bulk_meta_provider(ray_start_regular_shared, fs, data_path assert "test1.parquet" in str(input_files) assert "test2.parquet" in str(input_files) assert ( - str(ds) == "Dataset(num_blocks=2, num_rows=6, " - "schema={one: int64, two: string})" + str(ds) == "" ), ds assert ( - repr(ds) == "Dataset(num_blocks=2, num_rows=6, " - "schema={one: int64, two: string})" + repr(ds) == "" ), ds assert ds._plan.execute()._num_computed() == 2 @@ -744,14 +744,14 @@ def test_parquet_read_partitioned(ray_start_regular_shared, fs, data_path): input_files = ds.input_files() assert len(input_files) == 2, input_files assert ( - str(ds) == "Dataset(num_blocks=2, num_rows=6, " + str(ds) == "})" + "one: dictionary}>" ), ds assert ( - repr(ds) == "Dataset(num_blocks=2, num_rows=6, " + repr(ds) == "})" + "one: dictionary}>" ), ds assert ds._plan.execute()._num_computed() == 1 @@ -830,12 +830,12 @@ def test_parquet_read_partitioned_explicit(ray_start_regular_shared, tmp_path): input_files = ds.input_files() assert len(input_files) == 2, input_files assert ( - str(ds) == "Dataset(num_blocks=2, num_rows=6, " - "schema={two: string, one: int32})" + str(ds) == "" ), ds assert ( - repr(ds) == "Dataset(num_blocks=2, num_rows=6, " - "schema={two: string, one: int32})" + repr(ds) == "" ), ds assert ds._plan.execute()._num_computed() == 1 @@ -1210,8 +1210,8 @@ def test_numpy_roundtrip(ray_start_regular_shared, fs, data_path): ds.write_numpy(data_path, filesystem=fs) ds = ray.data.read_numpy(data_path, filesystem=fs) assert str(ds) == ( - "Dataset(num_blocks=2, num_rows=None, " - "schema={__value__: ArrowTensorType(shape=(1,), dtype=int64)})" + "" ) np.testing.assert_equal(ds.take(2), [np.array([0]), np.array([1])]) @@ -1222,8 +1222,8 @@ def test_numpy_read(ray_start_regular_shared, tmp_path): np.save(os.path.join(path, "test.npy"), np.expand_dims(np.arange(0, 10), 1)) ds = ray.data.read_numpy(path) assert str(ds) == ( - "Dataset(num_blocks=1, num_rows=10, " - "schema={__value__: ArrowTensorType(shape=(1,), dtype=int64)})" + "" ) np.testing.assert_equal(ds.take(2), [np.array([0]), np.array([1])]) @@ -1235,8 +1235,8 @@ def test_numpy_read(ray_start_regular_shared, tmp_path): assert ds.num_blocks() == 1 assert ds.count() == 10 assert str(ds) == ( - "Dataset(num_blocks=1, num_rows=10, " - "schema={__value__: ArrowTensorType(shape=(1,), dtype=int64)})" + "" ) assert [v.item() for v in ds.take(2)] == [0, 1] @@ -1248,8 +1248,8 @@ def test_numpy_read_meta_provider(ray_start_regular_shared, tmp_path): np.save(path, np.expand_dims(np.arange(0, 10), 1)) ds = ray.data.read_numpy(path, meta_provider=FastFileMetadataProvider()) assert str(ds) == ( - "Dataset(num_blocks=1, num_rows=10, " - "schema={__value__: ArrowTensorType(shape=(1,), dtype=int64)})" + "" ) np.testing.assert_equal(ds.take(2), [np.array([0]), np.array([1])]) From 9f33bc7034c0bc7ef2810ee0b322403545041857 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 22:09:31 -0700 Subject: [PATCH 27/30] Fix `DatasetPipeline` --- python/ray/air/tests/test_dataset_config.py | 8 ++-- python/ray/data/dataset_pipeline.py | 2 +- .../ray/data/tests/test_dataset_pipeline.py | 38 +++++++++---------- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/python/ray/air/tests/test_dataset_config.py b/python/ray/air/tests/test_dataset_config.py index 13b165c8929a..772f558d900f 100644 
--- a/python/ray/air/tests/test_dataset_config.py +++ b/python/ray/air/tests/test_dataset_config.py @@ -239,7 +239,7 @@ def checker(shard, results): assert len(results[0]) == 5, results assert results[0] == results[1], results stats = shard.stats() - assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard + assert str(shard) == "", shard assert "Stage 1 read->map_batches: 5/5 blocks executed " in stats, stats def rand(x): @@ -270,7 +270,7 @@ def checker(shard, results): assert len(results[0]) == 5, results assert results[0] != results[1], results stats = shard.stats() - assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard + assert str(shard) == "", shard assert ( "Stage 1 read->randomize_block_order->map_batches: 5/5 blocks executed " in stats @@ -291,7 +291,7 @@ def checker(shard, results): assert len(results[0]) == 5, results assert results[0] != results[1], results stats = shard.stats() - assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard + assert str(shard) == "", shard assert ( "Stage 1 read->randomize_block_order->map_batches: 1/1 blocks executed " in stats @@ -311,7 +311,7 @@ def checker(shard, results): assert len(results[0]) == 5, results assert results[0] != results[1], results stats = shard.stats() - assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard + assert str(shard) == "", shard assert "Stage 1 read->randomize_block_order->random_shuffle" in stats, stats ds = ray.data.range_table(5) diff --git a/python/ray/data/dataset_pipeline.py b/python/ray/data/dataset_pipeline.py index e7320a3a3cb6..e3588f4724ff 100644 --- a/python/ray/data/dataset_pipeline.py +++ b/python/ray/data/dataset_pipeline.py @@ -1123,7 +1123,7 @@ def from_iterable( return DatasetPipeline(iterable, False, length=length) def __repr__(self) -> str: - return "DatasetPipeline(num_windows={}, num_stages={})".format( + return "".format( self._length, 1 + len(self._stages) ) diff --git a/python/ray/data/tests/test_dataset_pipeline.py b/python/ray/data/tests/test_dataset_pipeline.py index 10c54932281a..c50f444a5a21 100644 --- a/python/ray/data/tests/test_dataset_pipeline.py +++ b/python/ray/data/tests/test_dataset_pipeline.py @@ -190,24 +190,24 @@ def test_window_by_bytes(ray_start_regular_shared): ray.data.range_table(10).window(blocks_per_window=2, bytes_per_window=2) pipe = ray.data.range_table(10000000, parallelism=100).window(blocks_per_window=2) - assert str(pipe) == "DatasetPipeline(num_windows=50, num_stages=2)" + assert str(pipe) == "" pipe = ray.data.range_table(10000000, parallelism=100).window( bytes_per_window=10 * 1024 * 1024 ) - assert str(pipe) == "DatasetPipeline(num_windows=8, num_stages=2)" + assert str(pipe) == "" dss = list(pipe.iter_datasets()) assert len(dss) == 8, dss for ds in dss[:-1]: assert ds.num_blocks() in [12, 13] pipe = ray.data.range_table(10000000, parallelism=100).window(bytes_per_window=1) - assert str(pipe) == "DatasetPipeline(num_windows=100, num_stages=2)" + assert str(pipe) == "" for ds in pipe.iter_datasets(): assert ds.num_blocks() == 1 pipe = ray.data.range_table(10000000, parallelism=100).window(bytes_per_window=1e9) - assert str(pipe) == "DatasetPipeline(num_windows=1, num_stages=2)" + assert str(pipe) == "" for ds in pipe.iter_datasets(): assert ds.num_blocks() == 100 @@ -217,7 +217,7 @@ def test_window_by_bytes(ray_start_regular_shared): .map_batches(lambda x: x) .window(bytes_per_window=10 * 1024 * 1024) ) - assert str(pipe) == "DatasetPipeline(num_windows=8, num_stages=1)" + 
assert str(pipe) == "" context = DatasetContext.get_current() old = context.optimize_fuse_read_stages @@ -291,31 +291,31 @@ def test_basic_pipeline(ray_start_regular_shared): ds = ray.data.range(10, parallelism=10) pipe = ds.window(blocks_per_window=1) - assert str(pipe) == "DatasetPipeline(num_windows=10, num_stages=2)" + assert str(pipe) == "" assert pipe.count() == 10 pipe = ds.window(blocks_per_window=1) pipe.show() pipe = ds.window(blocks_per_window=1).map(lambda x: x).map(lambda x: x) - assert str(pipe) == "DatasetPipeline(num_windows=10, num_stages=4)" + assert str(pipe) == "" assert pipe.take() == list(range(10)) pipe = ( ds.window(blocks_per_window=1).map(lambda x: x).flat_map(lambda x: [x, x + 1]) ) - assert str(pipe) == "DatasetPipeline(num_windows=10, num_stages=4)" + assert str(pipe) == "" assert pipe.count() == 20 pipe = ds.window(blocks_per_window=1).filter(lambda x: x % 2 == 0) - assert str(pipe) == "DatasetPipeline(num_windows=10, num_stages=3)" + assert str(pipe) == "" assert pipe.count() == 5 pipe = ds.window(blocks_per_window=999) - assert str(pipe) == "DatasetPipeline(num_windows=1, num_stages=2)" + assert str(pipe) == "" assert pipe.count() == 10 pipe = ds.repeat(10) - assert str(pipe) == "DatasetPipeline(num_windows=10, num_stages=2)" + assert str(pipe) == "" assert pipe.count() == 100 pipe = ds.repeat(10) assert pipe.sum() == 450 @@ -328,9 +328,9 @@ def test_window(ray_start_regular_shared): context.optimize_fuse_stages = True ds = ray.data.range(10, parallelism=10) pipe = ds.window(blocks_per_window=1) - assert str(pipe) == "DatasetPipeline(num_windows=10, num_stages=2)" + assert str(pipe) == "" pipe = pipe.rewindow(blocks_per_window=3) - assert str(pipe) == "DatasetPipeline(num_windows=None, num_stages=1)" + assert str(pipe) == "" datasets = list(pipe.iter_datasets()) assert len(datasets) == 4 assert datasets[0].take() == [0, 1, 2] @@ -340,9 +340,9 @@ def test_window(ray_start_regular_shared): ds = ray.data.range(10, parallelism=10) pipe = ds.window(blocks_per_window=5) - assert str(pipe) == "DatasetPipeline(num_windows=2, num_stages=2)" + assert str(pipe) == "" pipe = pipe.rewindow(blocks_per_window=3) - assert str(pipe) == "DatasetPipeline(num_windows=None, num_stages=1)" + assert str(pipe) == "" datasets = list(pipe.iter_datasets()) assert len(datasets) == 4 assert datasets[0].take() == [0, 1, 2] @@ -356,15 +356,15 @@ def test_repeat(ray_start_regular_shared): context.optimize_fuse_stages = True ds = ray.data.range(5, parallelism=5) pipe = ds.window(blocks_per_window=1) - assert str(pipe) == "DatasetPipeline(num_windows=5, num_stages=2)" + assert str(pipe) == "" pipe = pipe.repeat(2) - assert str(pipe) == "DatasetPipeline(num_windows=10, num_stages=2)" + assert str(pipe) == "" assert pipe.take() == (list(range(5)) + list(range(5))) ds = ray.data.range(5) pipe = ds.window(blocks_per_window=1) pipe = pipe.repeat() - assert str(pipe) == "DatasetPipeline(num_windows=inf, num_stages=2)" + assert str(pipe) == "" assert len(pipe.take(99)) == 99 pipe = ray.data.range(5).repeat() @@ -384,7 +384,7 @@ def test_repeat_forever(ray_start_regular_shared): context.optimize_fuse_stages = True ds = ray.data.range(10) pipe = ds.repeat() - assert str(pipe) == "DatasetPipeline(num_windows=inf, num_stages=2)" + assert str(pipe) == "" for i, v in enumerate(pipe.iter_rows()): assert v == i % 10, (v, i, i % 10) if i > 1000: From a304fc43bcedef1e9963803aac672e173d581a9e Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 26 Jul 2022 22:11:35 -0700 Subject: [PATCH 28/30] Format 
files --- python/ray/data/tests/test_dataset.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/ray/data/tests/test_dataset.py b/python/ray/data/tests/test_dataset.py index 8afc8891e377..03b20120fca4 100644 --- a/python/ray/data/tests/test_dataset.py +++ b/python/ray/data/tests/test_dataset.py @@ -1335,9 +1335,7 @@ def test_schema(ray_start_regular_shared): assert str(ds) == ">" assert str(ds2) == "" assert str(ds3) == "" - assert ( - str(ds4) == "" - ) + assert str(ds4) == "" def test_schema_lazy(ray_start_regular_shared): From 1cfb3b6483e24476b477d9ab05a1e2b4bc6d029d Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Wed, 27 Jul 2022 09:15:35 -0700 Subject: [PATCH 29/30] Revert dataset changes --- python/ray/data/dataset.py | 2 +- python/ray/data/dataset_pipeline.py | 2 +- python/ray/data/tests/conftest.py | 8 +-- python/ray/data/tests/test_dataset.py | 38 +++++------ python/ray/data/tests/test_dataset_formats.py | 64 +++++++++---------- .../ray/data/tests/test_dataset_pipeline.py | 38 +++++------ 6 files changed, 77 insertions(+), 75 deletions(-) diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py index bf085a2cd33d..fcae3991bacc 100644 --- a/python/ray/data/dataset.py +++ b/python/ray/data/dataset.py @@ -3586,7 +3586,7 @@ def __repr__(self) -> str: schema_str = ", ".join(schema_str) schema_str = "{" + schema_str + "}" count = self._meta_count() - return "".format( + return "Dataset(num_blocks={}, num_rows={}, schema={})".format( self._plan.initial_num_blocks(), count, schema_str ) diff --git a/python/ray/data/dataset_pipeline.py b/python/ray/data/dataset_pipeline.py index e3588f4724ff..e7320a3a3cb6 100644 --- a/python/ray/data/dataset_pipeline.py +++ b/python/ray/data/dataset_pipeline.py @@ -1123,7 +1123,7 @@ def from_iterable( return DatasetPipeline(iterable, False, length=length) def __repr__(self) -> str: - return "".format( + return "DatasetPipeline(num_windows={}, num_stages={})".format( self._length, 1 + len(self._stages) ) diff --git a/python/ray/data/tests/conftest.py b/python/ray/data/tests/conftest.py index 5ec54a620b06..1b6f3f0e429d 100644 --- a/python/ray/data/tests/conftest.py +++ b/python/ray/data/tests/conftest.py @@ -205,12 +205,12 @@ def _assert_base_partitioned_ds( actual_input_files = ds.input_files() assert len(actual_input_files) == num_input_files, actual_input_files assert ( - str(ds) == f"" + str(ds) == f"Dataset(num_blocks={num_input_files}, num_rows={num_rows}, " + f"schema={schema})" ), ds assert ( - repr(ds) == f"" + repr(ds) == f"Dataset(num_blocks={num_input_files}, num_rows={num_rows}, " + f"schema={schema})" ), ds if num_computed is not None: assert ( diff --git a/python/ray/data/tests/test_dataset.py b/python/ray/data/tests/test_dataset.py index 03b20120fca4..10bd3b12125e 100644 --- a/python/ray/data/tests/test_dataset.py +++ b/python/ray/data/tests/test_dataset.py @@ -378,7 +378,7 @@ def test_batch_tensors(ray_start_regular_shared): import torch ds = ray.data.from_items([torch.tensor([0, 0]) for _ in range(40)], parallelism=40) - res = ">" + res = "Dataset(num_blocks=40, num_rows=40, schema=)" assert str(ds) == res, str(ds) with pytest.raises(pa.lib.ArrowInvalid): next(ds.iter_batches(batch_format="pyarrow")) @@ -626,8 +626,8 @@ def test_tensors_basic(ray_start_regular_shared): tensor_shape = (3, 5) ds = ray.data.range_tensor(6, shape=tensor_shape, parallelism=6) assert str(ds) == ( - "" + "Dataset(num_blocks=6, num_rows=6, " + "schema={__value__: ArrowTensorType(shape=(3, 5), dtype=int64)})" ) assert 
ds.size_bytes() == 5 * 3 * 6 * 8 @@ -817,8 +817,8 @@ def test_tensors_inferred_from_map(ray_start_regular_shared): # Test map. ds = ray.data.range(10, parallelism=10).map(lambda _: np.ones((4, 4))) assert str(ds) == ( - "" + "Dataset(num_blocks=10, num_rows=10, " + "schema={__value__: ArrowTensorType(shape=(4, 4), dtype=double)})" ) # Test map_batches. @@ -826,8 +826,8 @@ def test_tensors_inferred_from_map(ray_start_regular_shared): lambda _: np.ones((3, 4, 4)), batch_size=2 ) assert str(ds) == ( - "" + "Dataset(num_blocks=4, num_rows=24, " + "schema={__value__: ArrowTensorType(shape=(4, 4), dtype=double)})" ) # Test flat_map. @@ -835,8 +835,8 @@ def test_tensors_inferred_from_map(ray_start_regular_shared): lambda _: [np.ones((4, 4)), np.ones((4, 4))] ) assert str(ds) == ( - "" + "Dataset(num_blocks=10, num_rows=20, " + "schema={__value__: ArrowTensorType(shape=(4, 4), dtype=double)})" ) # Test map_batches ndarray column. @@ -844,15 +844,15 @@ def test_tensors_inferred_from_map(ray_start_regular_shared): lambda _: pd.DataFrame({"a": [np.ones((4, 4))] * 3}), batch_size=2 ) assert str(ds) == ( - "" + "Dataset(num_blocks=4, num_rows=24, " + "schema={a: TensorDtype(shape=(4, 4), dtype=float64)})" ) # Test map_batches ragged ndarray column falls back to opaque object-typed column. ds = ray.data.range(16, parallelism=4).map_batches( lambda _: pd.DataFrame({"a": [np.ones((2, 2)), np.ones((3, 3))]}), batch_size=2 ) - assert str(ds) == ("") + assert str(ds) == ("Dataset(num_blocks=4, num_rows=16, schema={a: object})") def test_tensors_in_tables_from_pandas(ray_start_regular_shared): @@ -1314,7 +1314,7 @@ def test_empty_dataset(ray_start_regular_shared): ds = ray.data.range(1) ds = ds.filter(lambda x: x > 1) - assert str(ds) == "" + assert str(ds) == "Dataset(num_blocks=1, num_rows=0, schema=Unknown schema)" # Test map on empty dataset. 
ds = ray.data.from_items([]) @@ -1332,10 +1332,12 @@ def test_schema(ray_start_regular_shared): ds2 = ray.data.range_table(10, parallelism=10) ds3 = ds2.repartition(5) ds4 = ds3.map(lambda x: {"a": "hi", "b": 1.0}).limit(5).repartition(1) - assert str(ds) == ">" - assert str(ds2) == "" - assert str(ds3) == "" - assert str(ds4) == "" + assert str(ds) == "Dataset(num_blocks=10, num_rows=10, schema=)" + assert str(ds2) == "Dataset(num_blocks=10, num_rows=10, schema={value: int64})" + assert str(ds3) == "Dataset(num_blocks=5, num_rows=10, schema={value: int64})" + assert ( + str(ds4) == "Dataset(num_blocks=1, num_rows=5, schema={a: string, b: double})" + ) def test_schema_lazy(ray_start_regular_shared): @@ -4275,7 +4277,7 @@ def test_groupby_simple_multi_agg(ray_start_regular_shared, num_parts): def test_column_name_type_check(ray_start_regular_shared): df = pd.DataFrame({"1": np.random.rand(10), "a": np.random.rand(10)}) ds = ray.data.from_pandas(df) - expected_str = "" + expected_str = "Dataset(num_blocks=1, num_rows=10, schema={1: float64, a: float64})" assert str(ds) == expected_str, str(ds) df = pd.DataFrame({1: np.random.rand(10), "a": np.random.rand(10)}) with pytest.raises(ValueError): diff --git a/python/ray/data/tests/test_dataset_formats.py b/python/ray/data/tests/test_dataset_formats.py index ceca2053942e..72579ab144d9 100644 --- a/python/ray/data/tests/test_dataset_formats.py +++ b/python/ray/data/tests/test_dataset_formats.py @@ -462,12 +462,12 @@ def test_parquet_read_basic(ray_start_regular_shared, fs, data_path): assert "test1.parquet" in str(input_files) assert "test2.parquet" in str(input_files) assert ( - str(ds) == "" + str(ds) == "Dataset(num_blocks=2, num_rows=6, " + "schema={one: int64, two: string})" ), ds assert ( - repr(ds) == "" + repr(ds) == "Dataset(num_blocks=2, num_rows=6, " + "schema={one: int64, two: string})" ), ds assert ds._plan.execute()._num_computed() == 1 @@ -533,12 +533,12 @@ def prefetch_file_metadata(self, pieces): assert "test1.parquet" in str(input_files) assert "test2.parquet" in str(input_files) assert ( - str(ds) == "" + str(ds) == "Dataset(num_blocks=2, num_rows=6, " + "schema={one: int64, two: string})" ), ds assert ( - repr(ds) == "" + repr(ds) == "Dataset(num_blocks=2, num_rows=6, " + "schema={one: int64, two: string})" ), ds assert ds._plan.execute()._num_computed() == 2 @@ -605,12 +605,12 @@ def test_parquet_read_bulk(ray_start_regular_shared, fs, data_path): assert "test1.parquet" in str(input_files) assert "test2.parquet" in str(input_files) assert ( - str(ds) == "" + str(ds) == "Dataset(num_blocks=2, num_rows=6, " + "schema={one: int64, two: string})" ), ds assert ( - repr(ds) == "" + repr(ds) == "Dataset(num_blocks=2, num_rows=6, " + "schema={one: int64, two: string})" ), ds assert ds._plan.execute()._num_computed() == 2 @@ -691,12 +691,12 @@ def test_parquet_read_bulk_meta_provider(ray_start_regular_shared, fs, data_path assert "test1.parquet" in str(input_files) assert "test2.parquet" in str(input_files) assert ( - str(ds) == "" + str(ds) == "Dataset(num_blocks=2, num_rows=6, " + "schema={one: int64, two: string})" ), ds assert ( - repr(ds) == "" + repr(ds) == "Dataset(num_blocks=2, num_rows=6, " + "schema={one: int64, two: string})" ), ds assert ds._plan.execute()._num_computed() == 2 @@ -744,14 +744,14 @@ def test_parquet_read_partitioned(ray_start_regular_shared, fs, data_path): input_files = ds.input_files() assert len(input_files) == 2, input_files assert ( - str(ds) == "}>" + "one: dictionary})" ), ds assert ( - repr(ds) == 
"}>" + "one: dictionary})" ), ds assert ds._plan.execute()._num_computed() == 1 @@ -830,12 +830,12 @@ def test_parquet_read_partitioned_explicit(ray_start_regular_shared, tmp_path): input_files = ds.input_files() assert len(input_files) == 2, input_files assert ( - str(ds) == "" + str(ds) == "Dataset(num_blocks=2, num_rows=6, " + "schema={two: string, one: int32})" ), ds assert ( - repr(ds) == "" + repr(ds) == "Dataset(num_blocks=2, num_rows=6, " + "schema={two: string, one: int32})" ), ds assert ds._plan.execute()._num_computed() == 1 @@ -1218,8 +1218,8 @@ def test_numpy_roundtrip(ray_start_regular_shared, fs, data_path): ds.write_numpy(data_path, filesystem=fs) ds = ray.data.read_numpy(data_path, filesystem=fs) assert str(ds) == ( - "" + "Dataset(num_blocks=2, num_rows=None, " + "schema={__value__: ArrowTensorType(shape=(1,), dtype=int64)})" ) np.testing.assert_equal(ds.take(2), [np.array([0]), np.array([1])]) @@ -1230,8 +1230,8 @@ def test_numpy_read(ray_start_regular_shared, tmp_path): np.save(os.path.join(path, "test.npy"), np.expand_dims(np.arange(0, 10), 1)) ds = ray.data.read_numpy(path) assert str(ds) == ( - "" + "Dataset(num_blocks=1, num_rows=10, " + "schema={__value__: ArrowTensorType(shape=(1,), dtype=int64)})" ) np.testing.assert_equal(ds.take(2), [np.array([0]), np.array([1])]) @@ -1243,8 +1243,8 @@ def test_numpy_read(ray_start_regular_shared, tmp_path): assert ds.num_blocks() == 1 assert ds.count() == 10 assert str(ds) == ( - "" + "Dataset(num_blocks=1, num_rows=10, " + "schema={__value__: ArrowTensorType(shape=(1,), dtype=int64)})" ) assert [v.item() for v in ds.take(2)] == [0, 1] @@ -1256,8 +1256,8 @@ def test_numpy_read_meta_provider(ray_start_regular_shared, tmp_path): np.save(path, np.expand_dims(np.arange(0, 10), 1)) ds = ray.data.read_numpy(path, meta_provider=FastFileMetadataProvider()) assert str(ds) == ( - "" + "Dataset(num_blocks=1, num_rows=10, " + "schema={__value__: ArrowTensorType(shape=(1,), dtype=int64)})" ) np.testing.assert_equal(ds.take(2), [np.array([0]), np.array([1])]) diff --git a/python/ray/data/tests/test_dataset_pipeline.py b/python/ray/data/tests/test_dataset_pipeline.py index c50f444a5a21..10c54932281a 100644 --- a/python/ray/data/tests/test_dataset_pipeline.py +++ b/python/ray/data/tests/test_dataset_pipeline.py @@ -190,24 +190,24 @@ def test_window_by_bytes(ray_start_regular_shared): ray.data.range_table(10).window(blocks_per_window=2, bytes_per_window=2) pipe = ray.data.range_table(10000000, parallelism=100).window(blocks_per_window=2) - assert str(pipe) == "" + assert str(pipe) == "DatasetPipeline(num_windows=50, num_stages=2)" pipe = ray.data.range_table(10000000, parallelism=100).window( bytes_per_window=10 * 1024 * 1024 ) - assert str(pipe) == "" + assert str(pipe) == "DatasetPipeline(num_windows=8, num_stages=2)" dss = list(pipe.iter_datasets()) assert len(dss) == 8, dss for ds in dss[:-1]: assert ds.num_blocks() in [12, 13] pipe = ray.data.range_table(10000000, parallelism=100).window(bytes_per_window=1) - assert str(pipe) == "" + assert str(pipe) == "DatasetPipeline(num_windows=100, num_stages=2)" for ds in pipe.iter_datasets(): assert ds.num_blocks() == 1 pipe = ray.data.range_table(10000000, parallelism=100).window(bytes_per_window=1e9) - assert str(pipe) == "" + assert str(pipe) == "DatasetPipeline(num_windows=1, num_stages=2)" for ds in pipe.iter_datasets(): assert ds.num_blocks() == 100 @@ -217,7 +217,7 @@ def test_window_by_bytes(ray_start_regular_shared): .map_batches(lambda x: x) .window(bytes_per_window=10 * 1024 * 1024) ) - 
assert str(pipe) == "" + assert str(pipe) == "DatasetPipeline(num_windows=8, num_stages=1)" context = DatasetContext.get_current() old = context.optimize_fuse_read_stages @@ -291,31 +291,31 @@ def test_basic_pipeline(ray_start_regular_shared): ds = ray.data.range(10, parallelism=10) pipe = ds.window(blocks_per_window=1) - assert str(pipe) == "" + assert str(pipe) == "DatasetPipeline(num_windows=10, num_stages=2)" assert pipe.count() == 10 pipe = ds.window(blocks_per_window=1) pipe.show() pipe = ds.window(blocks_per_window=1).map(lambda x: x).map(lambda x: x) - assert str(pipe) == "" + assert str(pipe) == "DatasetPipeline(num_windows=10, num_stages=4)" assert pipe.take() == list(range(10)) pipe = ( ds.window(blocks_per_window=1).map(lambda x: x).flat_map(lambda x: [x, x + 1]) ) - assert str(pipe) == "" + assert str(pipe) == "DatasetPipeline(num_windows=10, num_stages=4)" assert pipe.count() == 20 pipe = ds.window(blocks_per_window=1).filter(lambda x: x % 2 == 0) - assert str(pipe) == "" + assert str(pipe) == "DatasetPipeline(num_windows=10, num_stages=3)" assert pipe.count() == 5 pipe = ds.window(blocks_per_window=999) - assert str(pipe) == "" + assert str(pipe) == "DatasetPipeline(num_windows=1, num_stages=2)" assert pipe.count() == 10 pipe = ds.repeat(10) - assert str(pipe) == "" + assert str(pipe) == "DatasetPipeline(num_windows=10, num_stages=2)" assert pipe.count() == 100 pipe = ds.repeat(10) assert pipe.sum() == 450 @@ -328,9 +328,9 @@ def test_window(ray_start_regular_shared): context.optimize_fuse_stages = True ds = ray.data.range(10, parallelism=10) pipe = ds.window(blocks_per_window=1) - assert str(pipe) == "" + assert str(pipe) == "DatasetPipeline(num_windows=10, num_stages=2)" pipe = pipe.rewindow(blocks_per_window=3) - assert str(pipe) == "" + assert str(pipe) == "DatasetPipeline(num_windows=None, num_stages=1)" datasets = list(pipe.iter_datasets()) assert len(datasets) == 4 assert datasets[0].take() == [0, 1, 2] @@ -340,9 +340,9 @@ def test_window(ray_start_regular_shared): ds = ray.data.range(10, parallelism=10) pipe = ds.window(blocks_per_window=5) - assert str(pipe) == "" + assert str(pipe) == "DatasetPipeline(num_windows=2, num_stages=2)" pipe = pipe.rewindow(blocks_per_window=3) - assert str(pipe) == "" + assert str(pipe) == "DatasetPipeline(num_windows=None, num_stages=1)" datasets = list(pipe.iter_datasets()) assert len(datasets) == 4 assert datasets[0].take() == [0, 1, 2] @@ -356,15 +356,15 @@ def test_repeat(ray_start_regular_shared): context.optimize_fuse_stages = True ds = ray.data.range(5, parallelism=5) pipe = ds.window(blocks_per_window=1) - assert str(pipe) == "" + assert str(pipe) == "DatasetPipeline(num_windows=5, num_stages=2)" pipe = pipe.repeat(2) - assert str(pipe) == "" + assert str(pipe) == "DatasetPipeline(num_windows=10, num_stages=2)" assert pipe.take() == (list(range(5)) + list(range(5))) ds = ray.data.range(5) pipe = ds.window(blocks_per_window=1) pipe = pipe.repeat() - assert str(pipe) == "" + assert str(pipe) == "DatasetPipeline(num_windows=inf, num_stages=2)" assert len(pipe.take(99)) == 99 pipe = ray.data.range(5).repeat() @@ -384,7 +384,7 @@ def test_repeat_forever(ray_start_regular_shared): context.optimize_fuse_stages = True ds = ray.data.range(10) pipe = ds.repeat() - assert str(pipe) == "" + assert str(pipe) == "DatasetPipeline(num_windows=inf, num_stages=2)" for i, v in enumerate(pipe.iter_rows()): assert v == i % 10, (v, i, i % 10) if i > 1000: From f6ed176b6e08194137722a8a7282a584ccffe4f5 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani 
Date: Wed, 27 Jul 2022 09:24:27 -0700 Subject: [PATCH 30/30] Update test_dataset_config.py --- python/ray/air/tests/test_dataset_config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/ray/air/tests/test_dataset_config.py b/python/ray/air/tests/test_dataset_config.py index 772f558d900f..13b165c8929a 100644 --- a/python/ray/air/tests/test_dataset_config.py +++ b/python/ray/air/tests/test_dataset_config.py @@ -239,7 +239,7 @@ def checker(shard, results): assert len(results[0]) == 5, results assert results[0] == results[1], results stats = shard.stats() - assert str(shard) == "", shard + assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard assert "Stage 1 read->map_batches: 5/5 blocks executed " in stats, stats def rand(x): @@ -270,7 +270,7 @@ def checker(shard, results): assert len(results[0]) == 5, results assert results[0] != results[1], results stats = shard.stats() - assert str(shard) == "", shard + assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard assert ( "Stage 1 read->randomize_block_order->map_batches: 5/5 blocks executed " in stats @@ -291,7 +291,7 @@ def checker(shard, results): assert len(results[0]) == 5, results assert results[0] != results[1], results stats = shard.stats() - assert str(shard) == "", shard + assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard assert ( "Stage 1 read->randomize_block_order->map_batches: 1/1 blocks executed " in stats @@ -311,7 +311,7 @@ def checker(shard, results): assert len(results[0]) == 5, results assert results[0] != results[1], results stats = shard.stats() - assert str(shard) == "", shard + assert str(shard) == "DatasetPipeline(num_windows=inf, num_stages=1)", shard assert "Stage 1 read->randomize_block_order->random_shuffle" in stats, stats ds = ray.data.range_table(5)
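With the dataset-side changes reverted above, the substance of the series is the config, checkpoint, and predictor reprs. As a closing sketch, the round-trip property that test_configs.py asserts, restated in isolation; the printed string is an expectation based on the dataclass repr helper, not a guarantee.

from ray.air.config import FailureConfig, RunConfig
from ray.air.constants import MAX_REPR_LENGTH

config = RunConfig(name="experiment", failure_config=FailureConfig(max_failures=2))
representation = repr(config)

# Only non-default fields are listed, so the string stays short...
assert len(representation) < MAX_REPR_LENGTH
# ...and every listed argument is itself repr'd, so the string evaluates back
# into an equal config, which is the exact check test_configs.py performs.
assert eval(representation) == config

print(representation)
# RunConfig(name='experiment', failure_config=FailureConfig(max_failures=2))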