From 69a780cab1a5dcab33c8c7a36f03bd2eb3367683 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Tue, 21 May 2024 12:09:42 +0200 Subject: [PATCH] feat: specify column names in constructor of table transformers (#795) ### Summary of Changes Specify the names of the columns that a table transformer should be applied to in its constructor instead of its `fit` method. This allows easier composition. --------- Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> --- docs/tutorials/classification.ipynb | 4 +-- docs/tutorials/data_processing.ipynb | 10 +++--- .../data/labeled/containers/_image_dataset.py | 2 +- src/safeds/data/tabular/containers/_table.py | 4 +-- .../tabular/transformation/_discretizer.py | 33 ++++++++++------- .../tabular/transformation/_label_encoder.py | 25 ++++++++----- .../transformation/_one_hot_encoder.py | 21 ++++++----- .../tabular/transformation/_range_scaler.py | 26 +++++++++----- .../tabular/transformation/_simple_imputer.py | 29 ++++++++++----- .../transformation/_standard_scaler.py | 31 +++++++++++----- .../transformation/_table_transformer.py | 33 +++++++++-------- src/safeds/ml/nn/_model.py | 2 +- .../labeled/containers/test_image_dataset.py | 2 +- .../_table/test_inverse_transform_table.py | 4 +-- .../containers/_table/test_transform_table.py | 4 +-- .../transformation/test_discretizer.py | 24 ++++++++----- .../transformation/test_label_encoder.py | 24 ++++++------- .../transformation/test_one_hot_encoder.py | 22 ++++++------ .../transformation/test_range_scaler.py | 35 +++++++++++-------- .../transformation/test_simple_imputer.py | 17 ++++----- .../transformation/test_standard_scaler.py | 29 ++++++++------- .../transformation/test_table_transformer.py | 20 +++++------ tests/safeds/ml/nn/test_cnn_workflow.py | 2 +- tests/safeds/ml/nn/test_forward_workflow.py | 6 ++-- tests/safeds/ml/nn/test_lstm_workflow.py | 4 +-- 25 files changed, 242 insertions(+), 171 deletions(-) diff --git a/docs/tutorials/classification.ipynb b/docs/tutorials/classification.ipynb index 778880c99..21229648e 100644 --- a/docs/tutorials/classification.ipynb +++ b/docs/tutorials/classification.ipynb @@ -75,7 +75,7 @@ "source": [ "from safeds.data.tabular.transformation import OneHotEncoder\n", "\n", - "encoder = OneHotEncoder().fit(train_table, [\"sex\"])" + "encoder = OneHotEncoder(column_names=\"sex\").fit(train_table)" ], "metadata": { "collapsed": false @@ -155,7 +155,6 @@ { "cell_type": "code", "source": [ - "encoder = OneHotEncoder().fit(test_table, [\"sex\"])\n", "transformed_test_table = encoder.transform(test_table)\n", "\n", "prediction = fitted_model.predict(\n", @@ -182,7 +181,6 @@ { "cell_type": "code", "source": [ - "encoder = OneHotEncoder().fit(test_table, [\"sex\"])\n", "testing_table = encoder.transform(testing_table)\n", "\n", "test_tabular_dataset = testing_table.to_tabular_dataset(\"survived\", extra_names=extra_names)\n", diff --git a/docs/tutorials/data_processing.ipynb b/docs/tutorials/data_processing.ipynb index d43319b20..f6e9d5b90 100644 --- a/docs/tutorials/data_processing.ipynb +++ b/docs/tutorials/data_processing.ipynb @@ -183,7 +183,7 @@ "source": [ "from safeds.data.tabular.transformation import SimpleImputer\n", "\n", - "imputer = SimpleImputer(SimpleImputer.Strategy.constant(0)).fit(titanic, [\"age\", \"fare\", \"cabin\", \"port_embarked\"])\n", + "imputer = SimpleImputer(SimpleImputer.Strategy.constant(0), column_names=[\"age\", \"fare\", \"cabin\", \"port_embarked\"]).fit(titanic)\n", "imputer.transform(titanic_slice)" ], 
"metadata": { @@ -206,7 +206,7 @@ "source": [ "from safeds.data.tabular.transformation import LabelEncoder\n", "\n", - "encoder = LabelEncoder().fit(titanic, [\"sex\", \"port_embarked\"])\n", + "encoder = LabelEncoder(column_names=[\"sex\", \"port_embarked\"]).fit(titanic)\n", "encoder.transform(titanic_slice)" ], "metadata": { @@ -229,7 +229,7 @@ "source": [ "from safeds.data.tabular.transformation import OneHotEncoder\n", "\n", - "encoder = OneHotEncoder().fit(titanic, [\"sex\", \"port_embarked\"])\n", + "encoder = OneHotEncoder(column_names=[\"sex\", \"port_embarked\"]).fit(titanic)\n", "encoder.transform(titanic_slice)" ], "metadata": { @@ -252,7 +252,7 @@ "source": [ "from safeds.data.tabular.transformation import RangeScaler\n", "\n", - "scaler = RangeScaler(0.0, 1.0).fit(titanic, [\"age\"])\n", + "scaler = RangeScaler(0.0, 1.0, column_names=\"age\").fit(titanic)\n", "scaler.transform(titanic_slice)" ], "metadata": { @@ -275,7 +275,7 @@ "source": [ "from safeds.data.tabular.transformation import StandardScaler\n", "\n", - "scaler = StandardScaler().fit(titanic, [\"age\", \"travel_class\"])\n", + "scaler = StandardScaler(column_names=[\"age\", \"travel_class\"]).fit(titanic)\n", "scaler.transform(titanic_slice)" ], "metadata": { diff --git a/src/safeds/data/labeled/containers/_image_dataset.py b/src/safeds/data/labeled/containers/_image_dataset.py index 61eebbf4e..ec50d9835 100644 --- a/src/safeds/data/labeled/containers/_image_dataset.py +++ b/src/safeds/data/labeled/containers/_image_dataset.py @@ -374,7 +374,7 @@ def __init__(self, column: Column) -> None: ) # TODO: should not one-hot-encode the target. label encoding without order is sufficient. should also not # be done automatically? - self._one_hot_encoder = OneHotEncoder().fit(column_as_table, [self._column_name]) + self._one_hot_encoder = OneHotEncoder(column_names=self._column_name).fit(column_as_table) self._tensor = torch.Tensor( self._one_hot_encoder.transform(column_as_table)._data_frame.to_torch(dtype=pl.Float32), ).to(_get_device()) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 68a541974..be09659f3 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -1688,7 +1688,7 @@ def inverse_transform_table(self, fitted_transformer: InvertibleTableTransformer >>> from safeds.data.tabular.containers import Table >>> from safeds.data.tabular.transformation import RangeScaler >>> table = Table({"a": [1, 2, 3]}) - >>> transformer, transformed_table = RangeScaler(min_=0, max_=1).fit_and_transform(table, ["a"]) + >>> transformer, transformed_table = RangeScaler(min_=0, max_=1, column_names="a").fit_and_transform(table) >>> transformed_table.inverse_transform_table(transformer) +---------+ | a | @@ -1726,7 +1726,7 @@ def transform_table(self, fitted_transformer: TableTransformer) -> Table: >>> from safeds.data.tabular.containers import Table >>> from safeds.data.tabular.transformation import RangeScaler >>> table = Table({"a": [1, 2, 3]}) - >>> transformer = RangeScaler(min_=0, max_=1).fit(table, ["a"]) + >>> transformer = RangeScaler(min_=0, max_=1, column_names="a").fit(table) >>> table.transform_table(transformer) +---------+ | a | diff --git a/src/safeds/data/tabular/transformation/_discretizer.py b/src/safeds/data/tabular/transformation/_discretizer.py index 5d487c5f1..b1fe56275 100644 --- a/src/safeds/data/tabular/transformation/_discretizer.py +++ b/src/safeds/data/tabular/transformation/_discretizer.py @@ 
-4,6 +4,7 @@ from safeds._utils import _structural_hash from safeds._validation import _check_bounds, _check_columns_exist, _ClosedBound +from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric from safeds.data.tabular.containers import Table from safeds.exceptions import ( NonNumericColumnError, @@ -24,6 +25,8 @@ class Discretizer(TableTransformer): ---------- bin_count: The number of bins to be created. + column_names: + The list of columns used to fit the transformer. If `None`, all numeric columns are used. Raises ------ @@ -35,8 +38,13 @@ class Discretizer(TableTransformer): # Dunder methods # ------------------------------------------------------------------------------------------------------------------ - def __init__(self, bin_count: int = 5) -> None: - TableTransformer.__init__(self) + def __init__( + self, + bin_count: int = 5, + *, + column_names: str | list[str] | None = None, + ) -> None: + TableTransformer.__init__(self, column_names) _check_bounds("bin_count", bin_count, lower_bound=_ClosedBound(2)) @@ -53,6 +61,10 @@ def __hash__(self) -> int: # Properties # ------------------------------------------------------------------------------------------------------------------ + @property + def is_fitted(self) -> bool: + return self._wrapped_transformer is not None + @property def bin_count(self) -> int: return self._bin_count @@ -61,7 +73,7 @@ def bin_count(self) -> int: # Learning and transformation # ------------------------------------------------------------------------------------------------------------------ - def fit(self, table: Table, column_names: list[str] | None) -> Discretizer: + def fit(self, table: Table) -> Discretizer: """ Learn a transformation for a set of columns in a table. @@ -71,8 +83,6 @@ def fit(self, table: Table, column_names: list[str] | None) -> Discretizer: ---------- table: The table used to fit the transformer. - column_names: - The list of columns from the table used to fit the transformer. If `None`, all columns are used. 
Returns ------- @@ -93,14 +103,12 @@ def fit(self, table: Table, column_names: list[str] | None) -> Discretizer: if table.row_count == 0: raise ValueError("The Discretizer cannot be fitted because the table contains 0 rows") - if column_names is None: - column_names = table.column_names + if self._column_names is None: + column_names = [name for name in table.column_names if table.get_column_type(name).is_numeric] else: + column_names = self._column_names _check_columns_exist(table, column_names) - - for column in column_names: - if not table.get_column(column).type.is_numeric: - raise NonNumericColumnError(f"{column} is of type {table.get_column(column).type}.") + _check_columns_are_numeric(table, column_names, operation="fit a Discretizer") wrapped_transformer = sk_KBinsDiscretizer(n_bins=self._bin_count, encode="ordinal") wrapped_transformer.set_output(transform="polars") @@ -108,9 +116,8 @@ def fit(self, table: Table, column_names: list[str] | None) -> Discretizer: table.remove_columns_except(column_names)._data_frame, ) - result = Discretizer(self._bin_count) + result = Discretizer(self._bin_count, column_names=column_names) result._wrapped_transformer = wrapped_transformer - result._column_names = column_names return result diff --git a/src/safeds/data/tabular/transformation/_label_encoder.py b/src/safeds/data/tabular/transformation/_label_encoder.py index c95d17b02..1cdf980c2 100644 --- a/src/safeds/data/tabular/transformation/_label_encoder.py +++ b/src/safeds/data/tabular/transformation/_label_encoder.py @@ -18,6 +18,8 @@ class LabelEncoder(InvertibleTableTransformer): Parameters ---------- + column_names: + The list of columns used to fit the transformer. If `None`, all non-numeric columns are used. partial_order: The partial order of the labels. The labels are encoded in the order of the given list. Additional values are assigned labels in the order they are encountered during fitting. @@ -27,8 +29,13 @@ class LabelEncoder(InvertibleTableTransformer): # Dunder methods # ------------------------------------------------------------------------------------------------------------------ - def __init__(self, *, partial_order: list[Any] | None = None) -> None: - super().__init__() + def __init__( + self, + *, + column_names: str | list[str] | None = None, + partial_order: list[Any] | None = None, + ) -> None: + super().__init__(column_names) if partial_order is None: partial_order = [] @@ -51,6 +58,10 @@ def __hash__(self) -> int: # Properties # ------------------------------------------------------------------------------------------------------------------ + @property + def is_fitted(self) -> bool: + return self._mapping is not None and self._inverse_mapping is not None + @property def partial_order(self) -> list[Any]: """The partial order of the labels.""" @@ -60,7 +71,7 @@ def partial_order(self) -> list[Any]: # Learning and transformation # ------------------------------------------------------------------------------------------------------------------ - def fit(self, table: Table, column_names: list[str] | None) -> LabelEncoder: + def fit(self, table: Table) -> LabelEncoder: """ Learn a transformation for a set of columns in a table. @@ -70,8 +81,6 @@ def fit(self, table: Table, column_names: list[str] | None) -> LabelEncoder: ---------- table: The table used to fit the transformer. - column_names: - The list of columns from the table used to fit the transformer. If `None`, all non-numeric columns are used. 
Returns ------- @@ -85,9 +94,10 @@ def fit(self, table: Table, column_names: list[str] | None) -> LabelEncoder: ValueError If the table contains 0 rows. """ - if column_names is None: + if self._column_names is None: column_names = [name for name in table.column_names if not table.get_column_type(name).is_numeric] else: + column_names = self._column_names _check_columns_exist(table, column_names) _warn_if_columns_are_numeric(table, column_names) @@ -111,8 +121,7 @@ def fit(self, table: Table, column_names: list[str] | None) -> LabelEncoder: reverse_mapping[name][label] = value # Create a copy with the learned transformation - result = LabelEncoder(partial_order=self._partial_order) - result._column_names = column_names + result = LabelEncoder(column_names=column_names, partial_order=self._partial_order) result._mapping = mapping result._inverse_mapping = reverse_mapping diff --git a/src/safeds/data/tabular/transformation/_one_hot_encoder.py b/src/safeds/data/tabular/transformation/_one_hot_encoder.py index ca488d8f8..c3035b956 100644 --- a/src/safeds/data/tabular/transformation/_one_hot_encoder.py +++ b/src/safeds/data/tabular/transformation/_one_hot_encoder.py @@ -43,6 +43,8 @@ class OneHotEncoder(InvertibleTableTransformer): Parameters ---------- + column_names: + The list of columns used to fit the transformer. If `None`, all non-numeric columns are used. separator: The separator used to separate the original column name from the value in the new column names. @@ -52,7 +54,7 @@ class OneHotEncoder(InvertibleTableTransformer): >>> from safeds.data.tabular.transformation import OneHotEncoder >>> table = Table({"col1": ["a", "b", "c", "a"]}) >>> transformer = OneHotEncoder() - >>> transformer.fit_and_transform(table, ["col1"])[1] + >>> transformer.fit_and_transform(table)[1] +---------+---------+---------+ | col1__a | col1__b | col1__c | | --- | --- | --- | @@ -72,9 +74,10 @@ class OneHotEncoder(InvertibleTableTransformer): def __init__( self, *, + column_names: str | list[str] | None = None, separator: str = "__", ) -> None: - super().__init__() + super().__init__(column_names) # Parameters self._separator = separator @@ -103,6 +106,10 @@ def __hash__(self) -> int: # Properties # ------------------------------------------------------------------------------------------------------------------ + @property + def is_fitted(self) -> bool: + return self._mapping is not None + @property def separator(self) -> str: """The separator used to separate the original column name from the value in the new column names.""" @@ -112,7 +119,7 @@ def separator(self) -> str: # Learning and transformation # ------------------------------------------------------------------------------------------------------------------ - def fit(self, table: Table, column_names: list[str] | None) -> OneHotEncoder: + def fit(self, table: Table) -> OneHotEncoder: """ Learn a transformation for a set of columns in a table. @@ -122,8 +129,6 @@ def fit(self, table: Table, column_names: list[str] | None) -> OneHotEncoder: ---------- table: The table used to fit the transformer. - column_names: - The list of columns from the table used to fit the transformer. If `None`, all columns are used. Returns ------- @@ -137,9 +142,10 @@ def fit(self, table: Table, column_names: list[str] | None) -> OneHotEncoder: ValueError If the table contains 0 rows. 
""" - if column_names is None: + if self._column_names is None: column_names = [name for name in table.column_names if not table.get_column_type(name).is_numeric] else: + column_names = self._column_names _check_columns_exist(table, column_names) _warn_if_columns_are_numeric(table, column_names) @@ -169,8 +175,7 @@ def fit(self, table: Table, column_names: list[str] | None) -> OneHotEncoder: mapping[name].append((new_name, value)) # Create a copy with the learned transformation - result = OneHotEncoder() - result._column_names = column_names + result = OneHotEncoder(column_names=column_names, separator=self._separator) result._new_column_names = new_column_names result._mapping = mapping diff --git a/src/safeds/data/tabular/transformation/_range_scaler.py b/src/safeds/data/tabular/transformation/_range_scaler.py index a07ad8cb6..c58cb64b0 100644 --- a/src/safeds/data/tabular/transformation/_range_scaler.py +++ b/src/safeds/data/tabular/transformation/_range_scaler.py @@ -24,6 +24,8 @@ class RangeScaler(InvertibleTableTransformer): The minimum of the new range after the transformation max_: The maximum of the new range after the transformation + column_names: + The list of columns used to fit the transformer. If `None`, all numeric columns are used. Raises ------ @@ -35,8 +37,14 @@ class RangeScaler(InvertibleTableTransformer): # Dunder methods # ------------------------------------------------------------------------------------------------------------------ - def __init__(self, min_: float = 0.0, max_: float = 1.0) -> None: - super().__init__() + def __init__( + self, + min_: float = 0.0, + max_: float = 1.0, + *, + column_names: str | list[str] | None = None, + ) -> None: + super().__init__(column_names) if min_ >= max_: raise ValueError('Parameter "max_" must be greater than parameter "min_".') @@ -61,6 +69,10 @@ def __hash__(self) -> int: # Properties # ------------------------------------------------------------------------------------------------------------------ + @property + def is_fitted(self) -> bool: + return self._data_min is not None and self._data_max is not None + @property def min(self) -> float: """The minimum of the new range after the transformation.""" @@ -75,7 +87,7 @@ def max(self) -> float: # Learning and transformation # ------------------------------------------------------------------------------------------------------------------ - def fit(self, table: Table, column_names: list[str] | None) -> RangeScaler: + def fit(self, table: Table) -> RangeScaler: """ Learn a transformation for a set of columns in a table. @@ -85,8 +97,6 @@ def fit(self, table: Table, column_names: list[str] | None) -> RangeScaler: ---------- table: The table used to fit the transformer. - column_names: - The list of columns from the table used to fit the transformer. If None, all numeric columns are used. Returns ------- @@ -102,9 +112,10 @@ def fit(self, table: Table, column_names: list[str] | None) -> RangeScaler: ValueError If the table contains 0 rows. 
""" - if column_names is None: + if self._column_names is None: column_names = [name for name in table.column_names if table.get_column_type(name).is_numeric] else: + column_names = self._column_names _check_columns_exist(table, column_names) _check_columns_are_numeric(table, column_names, operation="fit a RangeScaler") @@ -116,8 +127,7 @@ def fit(self, table: Table, column_names: list[str] | None) -> RangeScaler: _data_max = table._lazy_frame.select(column_names).max().collect() # Create a copy with the learned transformation - result = RangeScaler(min_=self._min, max_=self._max) - result._column_names = column_names + result = RangeScaler(min_=self._min, max_=self._max, column_names=column_names) result._data_min = _data_min result._data_max = _data_max diff --git a/src/safeds/data/tabular/transformation/_simple_imputer.py b/src/safeds/data/tabular/transformation/_simple_imputer.py index da6070aa3..63e2e1381 100644 --- a/src/safeds/data/tabular/transformation/_simple_imputer.py +++ b/src/safeds/data/tabular/transformation/_simple_imputer.py @@ -23,6 +23,8 @@ class SimpleImputer(TableTransformer): How to replace missing values. value_to_replace: The value that should be replaced. + column_names: + The list of columns used to fit the transformer. If `None`, all columns are used. Examples -------- @@ -90,8 +92,14 @@ def mode() -> SimpleImputer.Strategy: # Dunder methods # ------------------------------------------------------------------------------------------------------------------ - def __init__(self, strategy: SimpleImputer.Strategy, *, value_to_replace: float | str | None = None) -> None: - super().__init__() + def __init__( + self, + strategy: SimpleImputer.Strategy, + *, + column_names: str | list[str] | None = None, + value_to_replace: float | str | None = None, + ) -> None: + super().__init__(column_names) # Parameters self._strategy = strategy @@ -112,6 +120,10 @@ def __hash__(self) -> int: # Properties # ------------------------------------------------------------------------------------------------------------------ + @property + def is_fitted(self) -> bool: + return self._replacement is not None + @property def strategy(self) -> SimpleImputer.Strategy: """The strategy used to replace missing values.""" @@ -126,7 +138,7 @@ def value_to_replace(self) -> Any: # Learning and transformation # ------------------------------------------------------------------------------------------------------------------ - def fit(self, table: Table, column_names: list[str] | None) -> SimpleImputer: + def fit(self, table: Table) -> SimpleImputer: """ Learn a transformation for a set of columns in a table. @@ -136,8 +148,6 @@ def fit(self, table: Table, column_names: list[str] | None) -> SimpleImputer: ---------- table: The table used to fit the transformer. - column_names: - The list of columns from the table used to fit the transformer. If `None`, all columns are used. Returns ------- @@ -155,15 +165,17 @@ def fit(self, table: Table, column_names: list[str] | None) -> SimpleImputer: data. 
""" if isinstance(self._strategy, _Mean | _Median): - if column_names is None: + if self._column_names is None: column_names = [name for name in table.column_names if table.get_column_type(name).is_numeric] else: + column_names = self._column_names _check_columns_exist(table, column_names) _check_columns_are_numeric(table, column_names, operation="fit a SimpleImputer") else: # noqa: PLR5501 - if column_names is None: + if self._column_names is None: column_names = table.column_names else: + column_names = self._column_names _check_columns_exist(table, column_names) if table.row_count == 0: @@ -173,8 +185,7 @@ def fit(self, table: Table, column_names: list[str] | None) -> SimpleImputer: replacement = self._strategy._get_replacement(table) # Create a copy with the learned transformation - result = SimpleImputer(self._strategy, value_to_replace=self._value_to_replace) - result._column_names = column_names + result = SimpleImputer(self._strategy, column_names=column_names, value_to_replace=self._value_to_replace) result._replacement = replacement return result diff --git a/src/safeds/data/tabular/transformation/_standard_scaler.py b/src/safeds/data/tabular/transformation/_standard_scaler.py index bdae499ba..969c04e0a 100644 --- a/src/safeds/data/tabular/transformation/_standard_scaler.py +++ b/src/safeds/data/tabular/transformation/_standard_scaler.py @@ -14,14 +14,21 @@ class StandardScaler(InvertibleTableTransformer): - """The StandardScaler transforms column values to a range by removing the mean and scaling to unit variance.""" + """ + The StandardScaler transforms column values to a range by removing the mean and scaling to unit variance. + + Parameters + ---------- + column_names: + The list of columns used to fit the transformer. If `None`, all numeric columns are used. + """ # ------------------------------------------------------------------------------------------------------------------ # Dunder methods # ------------------------------------------------------------------------------------------------------------------ - def __init__(self) -> None: - super().__init__() + def __init__(self, *, column_names: str | list[str] | None = None) -> None: + super().__init__(column_names) # Internal state self._data_mean: pl.DataFrame | None = None @@ -31,11 +38,19 @@ def __hash__(self) -> int: # Leave out the internal state for faster hashing return super().__hash__() + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + + @property + def is_fitted(self) -> bool: + return self._data_mean is not None and self._data_standard_deviation is not None + # ------------------------------------------------------------------------------------------------------------------ # Learning and transformation # ------------------------------------------------------------------------------------------------------------------ - def fit(self, table: Table, column_names: list[str] | None) -> StandardScaler: + def fit(self, table: Table) -> StandardScaler: """ Learn a transformation for a set of columns in a table. @@ -45,8 +60,6 @@ def fit(self, table: Table, column_names: list[str] | None) -> StandardScaler: ---------- table: The table used to fit the transformer. - column_names: - The list of columns from the table used to fit the transformer. If `None`, all columns are used. 
Returns ------- @@ -62,9 +75,10 @@ def fit(self, table: Table, column_names: list[str] | None) -> StandardScaler: ValueError If the table contains 0 rows. """ - if column_names is None: + if self._column_names is None: column_names = [name for name in table.column_names if table.get_column_type(name).is_numeric] else: + column_names = self._column_names _check_columns_exist(table, column_names) _check_columns_are_numeric(table, column_names, operation="fit a StandardScaler") @@ -76,8 +90,7 @@ def fit(self, table: Table, column_names: list[str] | None) -> StandardScaler: _data_standard_deviation = table._lazy_frame.select(column_names).std(ddof=0).collect() # Create a copy with the learned transformation - result = StandardScaler() - result._column_names = column_names + result = StandardScaler(column_names=column_names) result._data_mean = _data_mean result._data_standard_deviation = _data_standard_deviation diff --git a/src/safeds/data/tabular/transformation/_table_transformer.py b/src/safeds/data/tabular/transformation/_table_transformer.py index 06e1c44ab..5e61616c2 100644 --- a/src/safeds/data/tabular/transformation/_table_transformer.py +++ b/src/safeds/data/tabular/transformation/_table_transformer.py @@ -10,7 +10,14 @@ class TableTransformer(ABC): - """Learn a transformation for a set of columns in a `Table` and transform another `Table` with the same columns.""" + """ + Learn a transformation for a set of columns in a `Table` and transform another `Table` with the same columns. + + Parameters + ---------- + column_names: + The list of columns used to fit the transformer. If `None`, all suitable columns are used. + """ # ------------------------------------------------------------------------------------------------------------------ # Dunder methods @@ -18,8 +25,11 @@ class TableTransformer(ABC): # The decorator is needed so the class really cannot be instantiated @abstractmethod - def __init__(self) -> None: - self._column_names: list[str] | None = None + def __init__(self, column_names: str | list[str] | None) -> None: + if isinstance(column_names, str): + column_names = [column_names] + + self._column_names: list[str] | None = column_names # The decorator ensures that the method is overridden in all subclasses @abstractmethod @@ -27,6 +37,7 @@ def __hash__(self) -> int: return _structural_hash( self.__class__.__qualname__, self._column_names, + self.is_fitted, ) # ------------------------------------------------------------------------------------------------------------------ @@ -34,16 +45,16 @@ def __hash__(self) -> int: # ------------------------------------------------------------------------------------------------------------------ @property + @abstractmethod def is_fitted(self) -> bool: """Whether the transformer is fitted.""" - return self._column_names is not None # ------------------------------------------------------------------------------------------------------------------ # Learning and transformation # ------------------------------------------------------------------------------------------------------------------ @abstractmethod - def fit(self, table: Table, column_names: list[str] | None) -> Self: + def fit(self, table: Table) -> Self: """ Learn a transformation for a set of columns in a table. @@ -53,8 +64,6 @@ def fit(self, table: Table, column_names: list[str] | None) -> Self: ---------- table: The table used to fit the transformer. - column_names: - The list of columns from the table used to fit the transformer. If `None`, all columns are used. 
Returns ------- @@ -85,11 +94,7 @@ def transform(self, table: Table) -> Table: If the transformer has not been fitted yet. """ - def fit_and_transform( - self, - table: Table, - column_names: list[str] | None = None, - ) -> tuple[Self, Table]: + def fit_and_transform(self, table: Table) -> tuple[Self, Table]: """ Learn a transformation for a set of columns in a table and apply the learned transformation to the same table. @@ -99,8 +104,6 @@ def fit_and_transform( ---------- table: The table used to fit the transformer. The transformer is then applied to this table. - column_names: - The list of columns from the table used to fit the transformer. If `None`, all columns are used. Returns ------- @@ -109,6 +112,6 @@ def fit_and_transform( transformed_table: The transformed table. """ - fitted_transformer = self.fit(table, column_names) + fitted_transformer = self.fit(table) transformed_table = fitted_transformer.transform(table) return fitted_transformer, transformed_table diff --git a/src/safeds/ml/nn/_model.py b/src/safeds/ml/nn/_model.py index 210f73e6a..754b29a83 100644 --- a/src/safeds/ml/nn/_model.py +++ b/src/safeds/ml/nn/_model.py @@ -433,7 +433,7 @@ def load_pretrained_model(huggingface_repo: str) -> NeuralNetworkClassifier: # label_dict: dict[str, str] = config.id2label column_name = "label" labels_table = Table({column_name: [label for _, label in label_dict.items()]}) - one_hot_encoder = OneHotEncoder().fit(labels_table, [column_name]) + one_hot_encoder = OneHotEncoder(column_names=[column_name]).fit(labels_table) in_conversion = InputConversionImageToColumn(input_size) diff --git a/tests/safeds/data/labeled/containers/test_image_dataset.py b/tests/safeds/data/labeled/containers/test_image_dataset.py index ab0cbe197..0c487d3b3 100644 --- a/tests/safeds/data/labeled/containers/test_image_dataset.py +++ b/tests/safeds/data/labeled/containers/test_image_dataset.py @@ -430,7 +430,7 @@ class TestColumnAsTensor: (torch.randn(10, 10), OneHotEncoder(), TransformerNotFittedError, r""), ( torch.randn(10, 10), - OneHotEncoder().fit(Table({"b": ["a", "b", "c"]}), None), + OneHotEncoder().fit(Table({"b": ["a", "b", "c"]})), ValueError, r"Tensor and one_hot_encoder have different amounts of classes \(10!=3\).", ), diff --git a/tests/safeds/data/tabular/containers/_table/test_inverse_transform_table.py b/tests/safeds/data/tabular/containers/_table/test_inverse_transform_table.py index ada2df6d4..98486dfa1 100644 --- a/tests/safeds/data/tabular/containers/_table/test_inverse_transform_table.py +++ b/tests/safeds/data/tabular/containers/_table/test_inverse_transform_table.py @@ -71,7 +71,7 @@ def test_should_return_original_table( column_names: list[str], table_to_transform: Table, ) -> None: - transformer = OneHotEncoder().fit(table_to_fit, column_names) + transformer = OneHotEncoder(column_names=column_names).fit(table_to_fit) transformed_table = transformer.transform(table_to_transform) result = transformed_table.inverse_transform_table(transformer) @@ -91,7 +91,7 @@ def test_should_not_change_transformed_table() -> None: }, ) - transformer = OneHotEncoder().fit(table, None) + transformer = OneHotEncoder().fit(table) transformed_table = transformer.transform(table) transformed_table.inverse_transform_table(transformer) diff --git a/tests/safeds/data/tabular/containers/_table/test_transform_table.py b/tests/safeds/data/tabular/containers/_table/test_transform_table.py index 04bda5b94..ab980ccdb 100644 --- a/tests/safeds/data/tabular/containers/_table/test_transform_table.py +++ 
b/tests/safeds/data/tabular/containers/_table/test_transform_table.py @@ -79,7 +79,7 @@ def test_should_return_transformed_table( column_names: list[str] | None, expected: Table, ) -> None: - transformer = OneHotEncoder().fit(table, column_names) + transformer = OneHotEncoder(column_names=column_names).fit(table) assert table.transform_table(transformer) == expected @@ -102,7 +102,7 @@ def test_should_raise_if_column_not_found(table_to_fit: Table) -> None: }, ) - transformer = OneHotEncoder().fit(table_to_fit, None) + transformer = OneHotEncoder().fit(table_to_fit) table_to_transform = Table( { diff --git a/tests/safeds/data/tabular/transformation/test_discretizer.py b/tests/safeds/data/tabular/transformation/test_discretizer.py index 98e3d4847..091100910 100644 --- a/tests/safeds/data/tabular/transformation/test_discretizer.py +++ b/tests/safeds/data/tabular/transformation/test_discretizer.py @@ -1,7 +1,13 @@ import pytest from safeds.data.tabular.containers import Table from safeds.data.tabular.transformation import Discretizer -from safeds.exceptions import ColumnNotFoundError, NonNumericColumnError, OutOfBoundsError, TransformerNotFittedError +from safeds.exceptions import ( + ColumnNotFoundError, + ColumnTypeError, + NonNumericColumnError, + OutOfBoundsError, + TransformerNotFittedError, +) class TestInit: @@ -45,8 +51,8 @@ class TestFit: }, ), ["col2"], - NonNumericColumnError, - "Tried to do a numerical operation on one or multiple non-numerical columns: \ncol2 is of type String.", + ColumnTypeError, + None, ), ], ids=["ColumnNotFoundError", "multiple missing columns", "ValueError", "NonNumericColumnError"], @@ -59,7 +65,7 @@ def test_should_raise_errors( error_message: str | None, ) -> None: with pytest.raises(error, match=error_message): - Discretizer().fit(table, columns) + Discretizer(column_names=columns).fit(table) def test_should_not_change_original_transformer(self) -> None: table = Table( @@ -69,7 +75,7 @@ def test_should_not_change_original_transformer(self) -> None: ) transformer = Discretizer() - transformer.fit(table, None) + transformer.fit(table) assert transformer._wrapped_transformer is None assert transformer._column_names is None @@ -127,7 +133,7 @@ def test_should_raise_errors( }, ) - transformer = Discretizer().fit(table_to_fit, columns) + transformer = Discretizer(column_names=columns).fit(table_to_fit) with pytest.raises(error, match=error_message): transformer.transform(table_to_transform) @@ -158,7 +164,7 @@ def test_should_return_true_after_fitting(self) -> None: ) transformer = Discretizer() - fitted_transformer = transformer.fit(table, None) + fitted_transformer = transformer.fit(table) assert fitted_transformer.is_fitted @@ -203,7 +209,7 @@ def test_should_return_fitted_transformer_and_transformed_table( column_names: list[str] | None, expected: Table, ) -> None: - fitted_transformer, transformed_table = Discretizer().fit_and_transform(table, column_names) + fitted_transformer, transformed_table = Discretizer(column_names=column_names).fit_and_transform(table) assert fitted_transformer.is_fitted assert transformed_table == expected @@ -245,7 +251,7 @@ def test_should_return_transformed_table_with_correct_number_of_bins( bin_count: int, expected: Table, ) -> None: - fitted_transformer, transformed_table = Discretizer(bin_count).fit_and_transform(table, ["col1"]) + fitted_transformer, transformed_table = Discretizer(bin_count, column_names="col1").fit_and_transform(table) assert fitted_transformer.is_fitted assert transformed_table == expected diff --git 
a/tests/safeds/data/tabular/transformation/test_label_encoder.py b/tests/safeds/data/tabular/transformation/test_label_encoder.py index 7004a0756..2d2fb05f9 100644 --- a/tests/safeds/data/tabular/transformation/test_label_encoder.py +++ b/tests/safeds/data/tabular/transformation/test_label_encoder.py @@ -13,11 +13,11 @@ def test_should_raise_if_column_not_found(self) -> None: ) with pytest.raises(ColumnNotFoundError): - LabelEncoder().fit(table, ["col2", "col3"]) + LabelEncoder(column_names=["col2", "col3"]).fit(table) def test_should_raise_if_table_contains_no_rows(self) -> None: with pytest.raises(ValueError, match=r"The LabelEncoder cannot be fitted because the table contains 0 rows"): - LabelEncoder().fit(Table({"col1": []}), ["col1"]) + LabelEncoder(column_names="col1").fit(Table({"col1": []})) def test_should_warn_if_table_contains_numerical_data(self) -> None: with pytest.warns( @@ -27,7 +27,7 @@ def test_should_warn_if_table_contains_numerical_data(self) -> None: r" values into numerical values" ), ): - LabelEncoder().fit(Table({"col1": [1, 2]}), ["col1"]) + LabelEncoder(column_names="col1").fit(Table({"col1": [1, 2]})) def test_should_not_change_original_transformer(self) -> None: table = Table( @@ -37,7 +37,7 @@ def test_should_not_change_original_transformer(self) -> None: ) transformer = LabelEncoder() - transformer.fit(table, None) + transformer.fit(table) assert transformer._column_names is None assert transformer._mapping is None @@ -53,7 +53,7 @@ def test_should_raise_if_column_not_found(self) -> None: }, ) - transformer = LabelEncoder().fit(table_to_fit, None) + transformer = LabelEncoder().fit(table_to_fit) table_to_transform = Table( { @@ -90,7 +90,7 @@ def test_should_return_true_after_fitting(self) -> None: ) transformer = LabelEncoder() - fitted_transformer = transformer.fit(table, None) + fitted_transformer = transformer.fit(table) assert fitted_transformer.is_fitted @@ -135,7 +135,7 @@ def test_should_return_fitted_transformer_and_transformed_table( column_names: list[str] | None, expected: Table, ) -> None: - fitted_transformer, transformed_table = LabelEncoder().fit_and_transform(table, column_names) + fitted_transformer, transformed_table = LabelEncoder(column_names=column_names).fit_and_transform(table) assert fitted_transformer.is_fitted assert transformed_table == expected @@ -170,7 +170,7 @@ class TestInverseTransform: ids=["no_column_names"], ) def test_should_return_original_table(self, table: Table) -> None: - transformer = LabelEncoder().fit(table, None) + transformer = LabelEncoder().fit(table) assert transformer.inverse_transform(transformer.transform(table)) == table @@ -181,7 +181,7 @@ def test_should_not_change_transformed_table(self) -> None: }, ) - transformer = LabelEncoder().fit(table, None) + transformer = LabelEncoder().fit(table) transformed_table = transformer.transform(table) transformer.inverse_transform(transformed_table) @@ -207,14 +207,12 @@ def test_should_raise_if_not_fitted(self) -> None: def test_should_raise_if_column_not_found(self) -> None: with pytest.raises(ColumnNotFoundError): - LabelEncoder().fit( + LabelEncoder(column_names=["col1", "col2"]).fit( Table({"col1": ["one", "two"], "col2": ["three", "four"]}), - ["col1", "col2"], ).inverse_transform(Table({"col3": [1.0, 0.0]})) def test_should_raise_if_table_contains_non_numerical_data(self) -> None: with pytest.raises(ColumnTypeError): - LabelEncoder().fit( + LabelEncoder(column_names=["col1", "col2"]).fit( Table({"col1": ["one", "two"], "col2": ["three", "four"]}), - ["col1", 
"col2"], ).inverse_transform(Table({"col1": ["1", "null"], "col2": ["2", "apple"]})) diff --git a/tests/safeds/data/tabular/transformation/test_one_hot_encoder.py b/tests/safeds/data/tabular/transformation/test_one_hot_encoder.py index ce1fbe009..9bd62a685 100644 --- a/tests/safeds/data/tabular/transformation/test_one_hot_encoder.py +++ b/tests/safeds/data/tabular/transformation/test_one_hot_encoder.py @@ -21,11 +21,11 @@ def test_should_raise_if_column_not_found(self) -> None: ) with pytest.raises(ColumnNotFoundError): - OneHotEncoder().fit(table, ["col2", "col3"]) + OneHotEncoder(column_names=["col2", "col3"]).fit(table) def test_should_raise_if_table_contains_no_rows(self) -> None: with pytest.raises(ValueError, match=r"The OneHotEncoder cannot be fitted because the table contains 0 rows"): - OneHotEncoder().fit(Table({"col1": []}), ["col1"]) + OneHotEncoder(column_names="col1").fit(Table({"col1": []})) def test_should_warn_if_table_contains_numerical_data(self) -> None: with pytest.warns( @@ -35,7 +35,7 @@ def test_should_warn_if_table_contains_numerical_data(self) -> None: r" values into numerical values" ), ): - OneHotEncoder().fit(Table({"col1": [1, 2, 3]}), ["col1"]) + OneHotEncoder(column_names="col1").fit(Table({"col1": [1, 2, 3]})) @pytest.mark.parametrize( "table", @@ -55,7 +55,7 @@ def test_should_warn_if_table_contains_numerical_data(self) -> None: ) def test_should_not_change_original_transformer(self, table: Table) -> None: transformer = OneHotEncoder() - transformer.fit(table, None) + transformer.fit(table) assert transformer._column_names is None assert transformer._new_column_names is None @@ -71,7 +71,7 @@ def test_should_raise_if_column_not_found(self) -> None: }, ) - transformer = OneHotEncoder().fit(table_to_fit, None) + transformer = OneHotEncoder().fit(table_to_fit) table_to_transform = Table( { @@ -127,7 +127,7 @@ def test_should_return_true_after_fitting(self, table: Table) -> None: ), category=UserWarning, ) - fitted_transformer = transformer.fit(table, None) + fitted_transformer = transformer.fit(table) assert fitted_transformer.is_fitted @@ -247,7 +247,7 @@ def test_should_return_fitted_transformer_and_transformed_table( column_names: list[str] | None, expected: Table, ) -> None: - fitted_transformer, transformed_table = OneHotEncoder().fit_and_transform(table, column_names) + fitted_transformer, transformed_table = OneHotEncoder(column_names=column_names).fit_and_transform(table) assert fitted_transformer.is_fitted assert transformed_table == expected @@ -339,7 +339,7 @@ def test_should_return_original_table( column_names: list[str], table_to_transform: Table, ) -> None: - transformer = OneHotEncoder().fit(table_to_fit, column_names) + transformer = OneHotEncoder(column_names=column_names).fit(table_to_fit) result = transformer.inverse_transform(transformer.transform(table_to_transform)) @@ -357,7 +357,7 @@ def test_should_not_change_transformed_table(self) -> None: }, ) - transformer = OneHotEncoder().fit(table, None) + transformer = OneHotEncoder().fit(table) transformed_table = transformer.transform(table) transformer.inverse_transform(transformed_table) @@ -387,12 +387,12 @@ def test_should_raise_if_not_fitted(self) -> None: def test_should_raise_if_column_not_found(self) -> None: with pytest.raises(ColumnNotFoundError): - OneHotEncoder().fit(Table({"col1": ["one", "two"]}), ["col1"]).inverse_transform( + OneHotEncoder(column_names="col1").fit(Table({"col1": ["one", "two"]})).inverse_transform( Table({"col1": [1.0, 0.0]}), ) def 
test_should_raise_if_table_contains_non_numerical_data(self) -> None: with pytest.raises(ColumnTypeError): - OneHotEncoder().fit(Table({"col1": ["one", "two"]}), ["col1"]).inverse_transform( + OneHotEncoder(column_names="col1").fit(Table({"col1": ["one", "two"]})).inverse_transform( Table({"col1__one": ["1", "null"], "col1__two": ["2", "ok"]}), ) diff --git a/tests/safeds/data/tabular/transformation/test_range_scaler.py b/tests/safeds/data/tabular/transformation/test_range_scaler.py index 2204e93fa..3aaa4c889 100644 --- a/tests/safeds/data/tabular/transformation/test_range_scaler.py +++ b/tests/safeds/data/tabular/transformation/test_range_scaler.py @@ -19,15 +19,15 @@ def test_should_raise_if_column_not_found(self) -> None: ) with pytest.raises(ColumnNotFoundError): - RangeScaler().fit(table, ["col2", "col3"]) + RangeScaler(column_names=["col2", "col3"]).fit(table) def test_should_raise_if_table_contains_non_numerical_data(self) -> None: with pytest.raises(ColumnTypeError): - RangeScaler().fit(Table({"col1": ["a", "b"], "col2": [1, "c"]}), ["col1", "col2"]) + RangeScaler(column_names=["col1", "col2"]).fit(Table({"col1": ["a", "b"], "col2": [1, "c"]})) def test_should_raise_if_table_contains_no_rows(self) -> None: with pytest.raises(ValueError, match=r"The RangeScaler cannot be fitted because the table contains 0 rows"): - RangeScaler().fit(Table({"col1": []}), None) + RangeScaler().fit(Table({"col1": []})) def test_should_not_change_original_transformer(self) -> None: table = Table( @@ -37,7 +37,7 @@ def test_should_not_change_original_transformer(self) -> None: ) transformer = RangeScaler() - transformer.fit(table, None) + transformer.fit(table) assert transformer._column_names is None assert transformer._data_min is None @@ -53,7 +53,7 @@ def test_should_raise_if_column_not_found(self) -> None: }, ) - transformer = RangeScaler().fit(table_to_fit, None) + transformer = RangeScaler().fit(table_to_fit) table_to_transform = Table( { @@ -78,7 +78,7 @@ def test_should_raise_if_not_fitted(self) -> None: def test_should_raise_if_table_contains_non_numerical_data(self) -> None: with pytest.raises(ColumnTypeError): - RangeScaler().fit(Table({"col1": [1, 2, 3], "col2": [2, 3, 4]}), ["col1", "col2"]).transform( + RangeScaler(column_names=["col1", "col2"]).fit(Table({"col1": [1, 2, 3], "col2": [2, 3, 4]})).transform( Table({"col1": ["a", "b", "c"], "col2": ["c", "d", "e"]}), ) @@ -96,7 +96,7 @@ def test_should_return_true_after_fitting(self) -> None: ) transformer = RangeScaler() - fitted_transformer = transformer.fit(table, None) + fitted_transformer = transformer.fit(table) assert fitted_transformer.is_fitted @@ -141,7 +141,7 @@ def test_should_return_fitted_transformer_and_transformed_table( column_names: list[str] | None, expected: Table, ) -> None: - fitted_transformer, transformed_table = RangeScaler().fit_and_transform(table, column_names) + fitted_transformer, transformed_table = RangeScaler(column_names=column_names).fit_and_transform(table) assert fitted_transformer.is_fitted assert transformed_table == expected @@ -185,9 +185,12 @@ def test_should_return_fitted_transformer_and_transformed_table_with_correct_ran column_names: list[str] | None, expected: Table, ) -> None: - fitted_transformer, transformed_table = RangeScaler(min_=-10.0, max_=10.0).fit_and_transform( + fitted_transformer, transformed_table = RangeScaler( + min_=-10.0, + max_=10.0, + column_names=column_names, + ).fit_and_transform( table, - column_names, ) assert fitted_transformer.is_fitted assert transformed_table == 
expected @@ -222,7 +225,7 @@ class TestInverseTransform: ], ) def test_should_return_original_table(self, table: Table) -> None: - transformer = RangeScaler().fit(table, None) + transformer = RangeScaler().fit(table) assert transformer.inverse_transform(transformer.transform(table)) == table @@ -233,7 +236,7 @@ def test_should_not_change_transformed_table(self) -> None: }, ) - transformer = RangeScaler().fit(table, None) + transformer = RangeScaler().fit(table) transformed_table = transformer.transform(table) transformer.inverse_transform(transformed_table) @@ -259,12 +262,16 @@ def test_should_raise_if_not_fitted(self) -> None: def test_should_raise_if_column_not_found(self) -> None: with pytest.raises(ColumnNotFoundError): - RangeScaler().fit(Table({"col1": [1, 2, 3], "col2": [2, 3, 4]}), ["col1", "col2"]).inverse_transform( + RangeScaler(column_names=["col1", "col2"]).fit( + Table({"col1": [1, 2, 3], "col2": [2, 3, 4]}), + ).inverse_transform( Table({"col3": [1, 2, 3]}), ) def test_should_raise_if_table_contains_non_numerical_data(self) -> None: with pytest.raises(ColumnTypeError): - RangeScaler().fit(Table({"col1": [1, 2, 3], "col2": [2, 3, 4]}), ["col1", "col2"]).inverse_transform( + RangeScaler(column_names=["col1", "col2"]).fit( + Table({"col1": [1, 2, 3], "col2": [2, 3, 4]}), + ).inverse_transform( Table({"col1": ["1", "2", "three"], "col2": [1, 2, "four"]}), ) diff --git a/tests/safeds/data/tabular/transformation/test_simple_imputer.py b/tests/safeds/data/tabular/transformation/test_simple_imputer.py index 5ff579ad2..b0cfb4fc1 100644 --- a/tests/safeds/data/tabular/transformation/test_simple_imputer.py +++ b/tests/safeds/data/tabular/transformation/test_simple_imputer.py @@ -174,12 +174,12 @@ def test_should_raise_if_column_not_found(self, strategy: SimpleImputer.Strategy ) with pytest.raises(ColumnNotFoundError): - SimpleImputer(strategy).fit(table, ["b", "c"]) + SimpleImputer(strategy, column_names=["b", "c"]).fit(table) @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) def test_should_raise_if_table_contains_no_rows(self, strategy: SimpleImputer.Strategy) -> None: with pytest.raises(ValueError, match=r"The SimpleImputer cannot be fitted because the table contains 0 rows"): - SimpleImputer(strategy).fit(Table({"col1": []}), None) + SimpleImputer(strategy).fit(Table({"col1": []})) @pytest.mark.parametrize( ("table", "col_names", "strategy"), @@ -196,7 +196,7 @@ def test_should_raise_if_table_contains_non_numerical_data( strategy: SimpleImputer.Strategy, ) -> None: with pytest.raises(ColumnTypeError): - SimpleImputer(strategy).fit(table, col_names) + SimpleImputer(strategy, column_names=col_names).fit(table) @pytest.mark.parametrize("strategy", strategies(), ids=lambda x: x.__class__.__name__) def test_should_not_change_original_transformer(self, strategy: SimpleImputer.Strategy) -> None: @@ -207,7 +207,7 @@ def test_should_not_change_original_transformer(self, strategy: SimpleImputer.St ) transformer = SimpleImputer(strategy) - transformer.fit(table, None) + transformer.fit(table) assert transformer._column_names is None assert transformer._replacement is None @@ -230,9 +230,9 @@ def test_should_raise_if_column_not_found(self, strategy: SimpleImputer.Strategy message=r"There are multiple most frequent values in a column given to the Imputer\..*", category=UserWarning, ) - transformer = SimpleImputer(strategy).fit(table_to_fit, None) + transformer = SimpleImputer(strategy).fit(table_to_fit) else: - transformer = 
SimpleImputer(strategy).fit(table_to_fit, None) + transformer = SimpleImputer(strategy).fit(table_to_fit) table_to_transform = Table( { @@ -272,7 +272,7 @@ def test_should_return_true_after_fitting(self, strategy: SimpleImputer.Strategy ) transformer = SimpleImputer(strategy) - fitted_transformer = transformer.fit(table, None) + fitted_transformer = transformer.fit(table) assert fitted_transformer.is_fitted @@ -410,8 +410,9 @@ def test_should_return_fitted_transformer_and_transformed_table( ) fitted_transformer, transformed_table = SimpleImputer( strategy, + column_names=column_names, value_to_replace=value_to_replace, - ).fit_and_transform(table, column_names) + ).fit_and_transform(table) assert fitted_transformer.is_fitted assert transformed_table == expected diff --git a/tests/safeds/data/tabular/transformation/test_standard_scaler.py b/tests/safeds/data/tabular/transformation/test_standard_scaler.py index 911e82bca..0e731fa15 100644 --- a/tests/safeds/data/tabular/transformation/test_standard_scaler.py +++ b/tests/safeds/data/tabular/transformation/test_standard_scaler.py @@ -15,18 +15,17 @@ def test_should_raise_if_column_not_found(self) -> None: ) with pytest.raises(ColumnNotFoundError): - StandardScaler().fit(table, ["col2", "col3"]) + StandardScaler(column_names=["col2", "col3"]).fit(table) def test_should_raise_if_table_contains_non_numerical_data(self) -> None: with pytest.raises(ColumnTypeError): - StandardScaler().fit( + StandardScaler(column_names=["col1", "col2"]).fit( Table({"col1": ["one", "two", "apple"], "col2": ["three", "four", "banana"]}), - ["col1", "col2"], ) def test_should_raise_if_table_contains_no_rows(self) -> None: with pytest.raises(ValueError, match=r"The StandardScaler cannot be fitted because the table contains 0 rows"): - StandardScaler().fit(Table({"col1": []}), None) + StandardScaler().fit(Table({"col1": []})) def test_should_not_change_original_transformer(self) -> None: table = Table( @@ -36,7 +35,7 @@ def test_should_not_change_original_transformer(self) -> None: ) transformer = StandardScaler() - transformer.fit(table, None) + transformer.fit(table) assert transformer._column_names is None assert transformer._data_mean is None @@ -52,7 +51,7 @@ def test_should_raise_if_column_not_found(self) -> None: }, ) - transformer = StandardScaler().fit(table_to_fit, None) + transformer = StandardScaler().fit(table_to_fit) table_to_transform = Table( { @@ -77,7 +76,7 @@ def test_should_raise_if_not_fitted(self) -> None: def test_should_raise_if_table_contains_non_numerical_data(self) -> None: with pytest.raises(ColumnTypeError): - StandardScaler().fit(Table({"col1": [1, 2, 3], "col2": [2, 3, 4]}), ["col1", "col2"]).transform( + StandardScaler(column_names=["col1", "col2"]).fit(Table({"col1": [1, 2, 3], "col2": [2, 3, 4]})).transform( Table({"col1": ["a", "b", "c"], "col2": ["b", "c", "e"]}), ) @@ -95,7 +94,7 @@ def test_should_return_true_after_fitting(self) -> None: ) transformer = StandardScaler() - fitted_transformer = transformer.fit(table, None) + fitted_transformer = transformer.fit(table) assert fitted_transformer.is_fitted @@ -127,7 +126,7 @@ def test_should_return_fitted_transformer_and_transformed_table( column_names: list[str] | None, expected: Table, ) -> None: - fitted_transformer, transformed_table = StandardScaler().fit_and_transform(table, column_names) + fitted_transformer, transformed_table = StandardScaler(column_names=column_names).fit_and_transform(table) assert fitted_transformer.is_fitted assert_tables_equal(transformed_table, expected) @@ 
-162,7 +161,7 @@ class TestInverseTransform: ids=["one_column"], ) def test_should_return_original_table(self, table: Table) -> None: - transformer = StandardScaler().fit(table, None) + transformer = StandardScaler().fit(table) assert transformer.inverse_transform(transformer.transform(table)) == table @@ -173,7 +172,7 @@ def test_should_not_change_transformed_table(self) -> None: }, ) - transformer = StandardScaler().fit(table, None) + transformer = StandardScaler().fit(table) transformed_table = transformer.transform(table) transformed_table = transformer.inverse_transform(transformed_table) @@ -199,12 +198,16 @@ def test_should_raise_if_not_fitted(self) -> None: def test_should_raise_if_column_not_found(self) -> None: with pytest.raises(ColumnNotFoundError): - StandardScaler().fit(Table({"col1": [1, 2, 4], "col2": [2, 3, 4]}), ["col1", "col2"]).inverse_transform( + StandardScaler(column_names=["col1", "col2"]).fit( + Table({"col1": [1, 2, 4], "col2": [2, 3, 4]}), + ).inverse_transform( Table({"col3": [0, 1, 2]}), ) def test_should_raise_if_table_contains_non_numerical_data(self) -> None: with pytest.raises(ColumnTypeError): - StandardScaler().fit(Table({"col1": [1, 2, 4], "col2": [2, 3, 4]}), ["col1", "col2"]).inverse_transform( + StandardScaler(column_names=["col1", "col2"]).fit( + Table({"col1": [1, 2, 4], "col2": [2, 3, 4]}), + ).inverse_transform( Table({"col1": ["one", "two", "apple"], "col2": ["three", "four", "banana"]}), ) diff --git a/tests/safeds/data/tabular/transformation/test_table_transformer.py b/tests/safeds/data/tabular/transformation/test_table_transformer.py index ff80701a2..83c374bd3 100644 --- a/tests/safeds/data/tabular/transformation/test_table_transformer.py +++ b/tests/safeds/data/tabular/transformation/test_table_transformer.py @@ -26,9 +26,9 @@ def transformers_numeric() -> list[TableTransformer]: The list of numeric transformers to test. """ return [ - StandardScaler(), - RangeScaler(), - Discretizer(), + StandardScaler(column_names="col1"), + RangeScaler(column_names="col1"), + Discretizer(column_names="col1"), ] @@ -45,8 +45,8 @@ def transformers_non_numeric() -> list[TableTransformer]: The list of non-numeric transformers to test. 
""" return [ - OneHotEncoder(), - LabelEncoder(), + OneHotEncoder(column_names="col1"), + LabelEncoder(column_names="col1"), ] @@ -129,7 +129,7 @@ def test_should_return_different_hash_for_same_numeric_transformer_fit( transformer1: TableTransformer, valid_data_numeric: Table, ) -> None: - transformer1_fit = transformer1.fit(valid_data_numeric, ["col1"]) + transformer1_fit = transformer1.fit(valid_data_numeric) assert hash(transformer1) != hash(transformer1_fit) @pytest.mark.parametrize("transformer1", transformers_non_numeric(), ids=lambda x: x.__class__.__name__) @@ -138,7 +138,7 @@ def test_should_return_different_hash_for_same_non_numeric_transformer_fit( transformer1: TableTransformer, valid_data_non_numeric: Table, ) -> None: - transformer1_fit = transformer1.fit(valid_data_non_numeric, ["col1"]) + transformer1_fit = transformer1.fit(valid_data_non_numeric) assert hash(transformer1) != hash(transformer1_fit) @pytest.mark.parametrize( @@ -152,7 +152,7 @@ def test_should_return_different_hash_for_numeric_transformer_fit( transformer2: TableTransformer, valid_data_numeric: Table, ) -> None: - transformer1_fit = transformer1.fit(valid_data_numeric, ["col1"]) + transformer1_fit = transformer1.fit(valid_data_numeric) assert hash(transformer2) != hash(transformer1_fit) @pytest.mark.parametrize( @@ -166,7 +166,7 @@ def test_should_return_different_hash_for_non_numeric_transformer_fit( transformer2: TableTransformer, valid_data_non_numeric: Table, ) -> None: - transformer1_fit = transformer1.fit(valid_data_non_numeric, ["col1"]) + transformer1_fit = transformer1.fit(valid_data_non_numeric) assert hash(transformer2) != hash(transformer1_fit) @pytest.mark.parametrize("transformer2", transformers(), ids=lambda x: x.__class__.__name__) @@ -176,5 +176,5 @@ def test_should_return_different_hash_for_imputer_fit( valid_data_imputer: Table, ) -> None: transformer1 = SimpleImputer(strategy=SimpleImputer.Strategy.mode()) - transformer1_fit = transformer1.fit(valid_data_imputer, ["col1"]) + transformer1_fit = transformer1.fit(valid_data_imputer) assert hash(transformer2) != hash(transformer1_fit) diff --git a/tests/safeds/ml/nn/test_cnn_workflow.py b/tests/safeds/ml/nn/test_cnn_workflow.py index b77c4c7b9..16a36757e 100644 --- a/tests/safeds/ml/nn/test_cnn_workflow.py +++ b/tests/safeds/ml/nn/test_cnn_workflow.py @@ -79,7 +79,7 @@ def test_should_train_and_predict_model( if groups is not None: classes.append(groups.group(2)) image_classes = Table({"class": classes}) - one_hot_encoder = OneHotEncoder().fit(image_classes, ["class"]) + one_hot_encoder = OneHotEncoder(column_names="class").fit(image_classes) image_classes_one_hot_encoded = one_hot_encoder.transform(image_classes) image_dataset = ImageDataset(image_list, image_classes_one_hot_encoded) num_of_classes: int = image_dataset.output_size if isinstance(image_dataset.output_size, int) else 0 diff --git a/tests/safeds/ml/nn/test_forward_workflow.py b/tests/safeds/ml/nn/test_forward_workflow.py index e622bd073..66aaa1a50 100644 --- a/tests/safeds/ml/nn/test_forward_workflow.py +++ b/tests/safeds/ml/nn/test_forward_workflow.py @@ -30,9 +30,9 @@ def test_forward_model(device: Device) -> None: table_2 = table_2.add_columns([(table_1.slice_rows(start=14)).get_column("value").rename("target")]) train_table, test_table = table_2.split_rows(0.8) - ss = StandardScaler() - _, train_table = ss.fit_and_transform(train_table, ["value"]) - _, test_table = ss.fit_and_transform(test_table, ["value"]) + ss = StandardScaler(column_names="value") + _, train_table = 
ss.fit_and_transform(train_table) + _, test_table = ss.fit_and_transform(test_table) model = NeuralNetworkRegressor( InputConversionTable(prediction_name="predicted"), [ForwardLayer(input_size=1, output_size=1)], diff --git a/tests/safeds/ml/nn/test_lstm_workflow.py b/tests/safeds/ml/nn/test_lstm_workflow.py index 85e396222..6a13a86bc 100644 --- a/tests/safeds/ml/nn/test_lstm_workflow.py +++ b/tests/safeds/ml/nn/test_lstm_workflow.py @@ -24,8 +24,8 @@ def test_lstm_model(device: Device) -> None: # Create a DataFrame _inflation_path = "_datas/US_Inflation_rates.csv" table = Table.from_csv_file(path=resolve_resource_path(_inflation_path)) - rs = RangeScaler() - _, table = rs.fit_and_transform(table, ["value"]) + rs = RangeScaler(column_names="value") + _, table = rs.fit_and_transform(table) train_table, test_table = table.split_rows(0.8) model = NeuralNetworkRegressor(
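
For reference, below is a minimal sketch of the constructor-based API introduced by this patch, pieced together from the tutorial and docstring examples above. The toy tables, column values, and variable names are made up for illustration; only the `column_names` parameter and the `fit`, `transform`, and `fit_and_transform` calls come from the patch itself.

```python
from safeds.data.tabular.containers import Table
from safeds.data.tabular.transformation import OneHotEncoder, RangeScaler

# Toy data, made up for illustration.
train_table = Table({"sex": ["m", "f", "f"], "age": [22.0, 38.0, 26.0]})
test_table = Table({"sex": ["f", "m"], "age": [30.0, 41.0]})

# Old API (before this patch): the columns were passed to `fit`:
#     encoder = OneHotEncoder().fit(train_table, ["sex"])
# New API: the columns are fixed in the constructor. `column_names` accepts a
# single name, a list of names, or None (all suitable columns).
encoder = OneHotEncoder(column_names="sex")
scaler = RangeScaler(min_=0.0, max_=1.0, column_names="age")

# `fit` returns a fitted copy; `fit_and_transform` additionally applies it.
fitted_encoder, encoded_train = encoder.fit_and_transform(train_table)
fitted_scaler, scaled_train = scaler.fit_and_transform(encoded_train)

# The fitted copies can be reused on other tables with the same columns.
scaled_test = fitted_scaler.transform(fitted_encoder.transform(test_table))
```

Because the column selection now lives on the unfitted transformer, a configured instance can be fitted on the training table and its fitted copy reused on the test table, which is why the updated classification tutorial no longer refits the encoder on `test_table`.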