feat: specify column names in constructor of table transformers (#795)

### Summary of Changes

Specify the names of the columns that a table transformer should be applied to in its constructor instead of its `fit` method. This allows easier composition.

---------

Co-authored-by: megalinter-bot <[email protected]>
Safe-DS · May 21, 2024 · 69a780c · 69a780c
1 parent f07bc5a
commit 69a780c
Show file tree

Hide file tree

Showing 25 changed files with 242 additions and 171 deletions.
diff --git a/docs/tutorials/classification.ipynb b/docs/tutorials/classification.ipynb
@@ -75,7 +75,7 @@
    "source": [
     "from safeds.data.tabular.transformation import OneHotEncoder\n",
     "\n",
-    "encoder = OneHotEncoder().fit(train_table, [\"sex\"])"
+    "encoder = OneHotEncoder(column_names=\"sex\").fit(train_table)"
    ],
    "metadata": {
     "collapsed": false
@@ -155,7 +155,6 @@
   {
    "cell_type": "code",
    "source": [
-    "encoder = OneHotEncoder().fit(test_table, [\"sex\"])\n",
     "transformed_test_table = encoder.transform(test_table)\n",
     "\n",
     "prediction = fitted_model.predict(\n",
@@ -182,7 +181,6 @@
   {
    "cell_type": "code",
    "source": [
-    "encoder = OneHotEncoder().fit(test_table, [\"sex\"])\n",
     "testing_table = encoder.transform(testing_table)\n",
     "\n",
     "test_tabular_dataset = testing_table.to_tabular_dataset(\"survived\", extra_names=extra_names)\n",

diff --git a/docs/tutorials/data_processing.ipynb b/docs/tutorials/data_processing.ipynb
@@ -183,7 +183,7 @@
    "source": [
     "from safeds.data.tabular.transformation import SimpleImputer\n",
     "\n",
-    "imputer = SimpleImputer(SimpleImputer.Strategy.constant(0)).fit(titanic, [\"age\", \"fare\", \"cabin\", \"port_embarked\"])\n",
+    "imputer = SimpleImputer(SimpleImputer.Strategy.constant(0), column_names=[\"age\", \"fare\", \"cabin\", \"port_embarked\"]).fit(titanic)\n",
     "imputer.transform(titanic_slice)"
    ],
    "metadata": {
@@ -206,7 +206,7 @@
    "source": [
     "from safeds.data.tabular.transformation import LabelEncoder\n",
     "\n",
-    "encoder = LabelEncoder().fit(titanic, [\"sex\", \"port_embarked\"])\n",
+    "encoder = LabelEncoder(column_names=[\"sex\", \"port_embarked\"]).fit(titanic)\n",
     "encoder.transform(titanic_slice)"
    ],
    "metadata": {
@@ -229,7 +229,7 @@
    "source": [
     "from safeds.data.tabular.transformation import OneHotEncoder\n",
     "\n",
-    "encoder = OneHotEncoder().fit(titanic, [\"sex\", \"port_embarked\"])\n",
+    "encoder = OneHotEncoder(column_names=[\"sex\", \"port_embarked\"]).fit(titanic)\n",
     "encoder.transform(titanic_slice)"
    ],
    "metadata": {
@@ -252,7 +252,7 @@
    "source": [
     "from safeds.data.tabular.transformation import RangeScaler\n",
     "\n",
-    "scaler = RangeScaler(0.0, 1.0).fit(titanic, [\"age\"])\n",
+    "scaler = RangeScaler(0.0, 1.0, column_names=\"age\").fit(titanic)\n",
     "scaler.transform(titanic_slice)"
    ],
    "metadata": {
@@ -275,7 +275,7 @@
    "source": [
     "from safeds.data.tabular.transformation import StandardScaler\n",
     "\n",
-    "scaler = StandardScaler().fit(titanic, [\"age\", \"travel_class\"])\n",
+    "scaler = StandardScaler(column_names=[\"age\", \"travel_class\"]).fit(titanic)\n",
     "scaler.transform(titanic_slice)"
    ],
    "metadata": {

diff --git a/src/safeds/data/labeled/containers/_image_dataset.py b/src/safeds/data/labeled/containers/_image_dataset.py
@@ -374,7 +374,7 @@ def __init__(self, column: Column) -> None:
             )
             # TODO: should not one-hot-encode the target. label encoding without order is sufficient. should also not
             #  be done automatically?
-            self._one_hot_encoder = OneHotEncoder().fit(column_as_table, [self._column_name])
+            self._one_hot_encoder = OneHotEncoder(column_names=self._column_name).fit(column_as_table)
         self._tensor = torch.Tensor(
             self._one_hot_encoder.transform(column_as_table)._data_frame.to_torch(dtype=pl.Float32),
         ).to(_get_device())

diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py
@@ -1688,7 +1688,7 @@ def inverse_transform_table(self, fitted_transformer: InvertibleTableTransformer
         >>> from safeds.data.tabular.containers import Table
         >>> from safeds.data.tabular.transformation import RangeScaler
         >>> table = Table({"a": [1, 2, 3]})
-        >>> transformer, transformed_table = RangeScaler(min_=0, max_=1).fit_and_transform(table, ["a"])
+        >>> transformer, transformed_table = RangeScaler(min_=0, max_=1, column_names="a").fit_and_transform(table)
         >>> transformed_table.inverse_transform_table(transformer)
         +---------+
         |       a |
@@ -1726,7 +1726,7 @@ def transform_table(self, fitted_transformer: TableTransformer) -> Table:
         >>> from safeds.data.tabular.containers import Table
         >>> from safeds.data.tabular.transformation import RangeScaler
         >>> table = Table({"a": [1, 2, 3]})
-        >>> transformer = RangeScaler(min_=0, max_=1).fit(table, ["a"])
+        >>> transformer = RangeScaler(min_=0, max_=1, column_names="a").fit(table)
         >>> table.transform_table(transformer)
         +---------+
         |       a |

diff --git a/src/safeds/data/tabular/transformation/_discretizer.py b/src/safeds/data/tabular/transformation/_discretizer.py
@@ -4,6 +4,7 @@
 
 from safeds._utils import _structural_hash
 from safeds._validation import _check_bounds, _check_columns_exist, _ClosedBound
+from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric
 from safeds.data.tabular.containers import Table
 from safeds.exceptions import (
     NonNumericColumnError,
@@ -24,6 +25,8 @@ class Discretizer(TableTransformer):
     ----------
     bin_count:
         The number of bins to be created.
+    column_names:
+        The list of columns used to fit the transformer. If `None`, all numeric columns are used.
 
     Raises
     ------
@@ -35,8 +38,13 @@ class Discretizer(TableTransformer):
     # Dunder methods
     # ------------------------------------------------------------------------------------------------------------------
 
-    def __init__(self, bin_count: int = 5) -> None:
-        TableTransformer.__init__(self)
+    def __init__(
+        self,
+        bin_count: int = 5,
+        *,
+        column_names: str | list[str] | None = None,
+    ) -> None:
+        TableTransformer.__init__(self, column_names)
 
         _check_bounds("bin_count", bin_count, lower_bound=_ClosedBound(2))
 
@@ -53,6 +61,10 @@ def __hash__(self) -> int:
     # Properties
     # ------------------------------------------------------------------------------------------------------------------
 
+    @property
+    def is_fitted(self) -> bool:
+        return self._wrapped_transformer is not None
+
     @property
     def bin_count(self) -> int:
         return self._bin_count
@@ -61,7 +73,7 @@ def bin_count(self) -> int:
     # Learning and transformation
     # ------------------------------------------------------------------------------------------------------------------
 
-    def fit(self, table: Table, column_names: list[str] | None) -> Discretizer:
+    def fit(self, table: Table) -> Discretizer:
         """
         Learn a transformation for a set of columns in a table.
 
@@ -71,8 +83,6 @@ def fit(self, table: Table, column_names: list[str] | None) -> Discretizer:
         ----------
         table:
             The table used to fit the transformer.
-        column_names:
-            The list of columns from the table used to fit the transformer. If `None`, all columns are used.
 
         Returns
         -------
@@ -93,24 +103,21 @@ def fit(self, table: Table, column_names: list[str] | None) -> Discretizer:
         if table.row_count == 0:
             raise ValueError("The Discretizer cannot be fitted because the table contains 0 rows")
 
-        if column_names is None:
-            column_names = table.column_names
+        if self._column_names is None:
+            column_names = [name for name in table.column_names if table.get_column_type(name).is_numeric]
         else:
+            column_names = self._column_names
             _check_columns_exist(table, column_names)
-
-            for column in column_names:
-                if not table.get_column(column).type.is_numeric:
-                    raise NonNumericColumnError(f"{column} is of type {table.get_column(column).type}.")
+            _check_columns_are_numeric(table, column_names, operation="fit a Discretizer")
 
         wrapped_transformer = sk_KBinsDiscretizer(n_bins=self._bin_count, encode="ordinal")
         wrapped_transformer.set_output(transform="polars")
         wrapped_transformer.fit(
             table.remove_columns_except(column_names)._data_frame,
         )
 
-        result = Discretizer(self._bin_count)
+        result = Discretizer(self._bin_count, column_names=column_names)
         result._wrapped_transformer = wrapped_transformer
-        result._column_names = column_names
 
         return result
 

diff --git a/src/safeds/data/tabular/transformation/_label_encoder.py b/src/safeds/data/tabular/transformation/_label_encoder.py
@@ -18,6 +18,8 @@ class LabelEncoder(InvertibleTableTransformer):
 
     Parameters
     ----------
+    column_names:
+        The list of columns used to fit the transformer. If `None`, all non-numeric columns are used.
     partial_order:
         The partial order of the labels. The labels are encoded in the order of the given list. Additional values are
         assigned labels in the order they are encountered during fitting.
@@ -27,8 +29,13 @@ class LabelEncoder(InvertibleTableTransformer):
     # Dunder methods
     # ------------------------------------------------------------------------------------------------------------------
 
-    def __init__(self, *, partial_order: list[Any] | None = None) -> None:
-        super().__init__()
+    def __init__(
+        self,
+        *,
+        column_names: str | list[str] | None = None,
+        partial_order: list[Any] | None = None,
+    ) -> None:
+        super().__init__(column_names)
 
         if partial_order is None:
             partial_order = []
@@ -51,6 +58,10 @@ def __hash__(self) -> int:
     # Properties
     # ------------------------------------------------------------------------------------------------------------------
 
+    @property
+    def is_fitted(self) -> bool:
+        return self._mapping is not None and self._inverse_mapping is not None
+
     @property
     def partial_order(self) -> list[Any]:
         """The partial order of the labels."""
@@ -60,7 +71,7 @@ def partial_order(self) -> list[Any]:
     # Learning and transformation
     # ------------------------------------------------------------------------------------------------------------------
 
-    def fit(self, table: Table, column_names: list[str] | None) -> LabelEncoder:
+    def fit(self, table: Table) -> LabelEncoder:
         """
         Learn a transformation for a set of columns in a table.
 
@@ -70,8 +81,6 @@ def fit(self, table: Table, column_names: list[str] | None) -> LabelEncoder:
         ----------
         table:
             The table used to fit the transformer.
-        column_names:
-            The list of columns from the table used to fit the transformer. If `None`, all non-numeric columns are used.
 
         Returns
         -------
@@ -85,9 +94,10 @@ def fit(self, table: Table, column_names: list[str] | None) -> LabelEncoder:
         ValueError
             If the table contains 0 rows.
         """
-        if column_names is None:
+        if self._column_names is None:
             column_names = [name for name in table.column_names if not table.get_column_type(name).is_numeric]
         else:
+            column_names = self._column_names
             _check_columns_exist(table, column_names)
             _warn_if_columns_are_numeric(table, column_names)
 
@@ -111,8 +121,7 @@ def fit(self, table: Table, column_names: list[str] | None) -> LabelEncoder:
                     reverse_mapping[name][label] = value
 
         # Create a copy with the learned transformation
-        result = LabelEncoder(partial_order=self._partial_order)
-        result._column_names = column_names
+        result = LabelEncoder(column_names=column_names, partial_order=self._partial_order)
         result._mapping = mapping
         result._inverse_mapping = reverse_mapping
 

diff --git a/src/safeds/data/tabular/transformation/_one_hot_encoder.py b/src/safeds/data/tabular/transformation/_one_hot_encoder.py
@@ -43,6 +43,8 @@ class OneHotEncoder(InvertibleTableTransformer):
 
     Parameters
     ----------
+    column_names:
+        The list of columns used to fit the transformer. If `None`, all non-numeric columns are used.
     separator:
         The separator used to separate the original column name from the value in the new column names.
 
@@ -52,7 +54,7 @@ class OneHotEncoder(InvertibleTableTransformer):
     >>> from safeds.data.tabular.transformation import OneHotEncoder
     >>> table = Table({"col1": ["a", "b", "c", "a"]})
     >>> transformer = OneHotEncoder()
-    >>> transformer.fit_and_transform(table, ["col1"])[1]
+    >>> transformer.fit_and_transform(table)[1]
     +---------+---------+---------+
     | col1__a | col1__b | col1__c |
     |     --- |     --- |     --- |
@@ -72,9 +74,10 @@ class OneHotEncoder(InvertibleTableTransformer):
     def __init__(
         self,
         *,
+        column_names: str | list[str] | None = None,
         separator: str = "__",
     ) -> None:
-        super().__init__()
+        super().__init__(column_names)
 
         # Parameters
         self._separator = separator
@@ -103,6 +106,10 @@ def __hash__(self) -> int:
     # Properties
     # ------------------------------------------------------------------------------------------------------------------
 
+    @property
+    def is_fitted(self) -> bool:
+        return self._mapping is not None
+
     @property
     def separator(self) -> str:
         """The separator used to separate the original column name from the value in the new column names."""
@@ -112,7 +119,7 @@ def separator(self) -> str:
     # Learning and transformation
     # ------------------------------------------------------------------------------------------------------------------
 
-    def fit(self, table: Table, column_names: list[str] | None) -> OneHotEncoder:
+    def fit(self, table: Table) -> OneHotEncoder:
         """
         Learn a transformation for a set of columns in a table.
 
@@ -122,8 +129,6 @@ def fit(self, table: Table, column_names: list[str] | None) -> OneHotEncoder:
         ----------
         table:
             The table used to fit the transformer.
-        column_names:
-            The list of columns from the table used to fit the transformer. If `None`, all columns are used.
 
         Returns
         -------
@@ -137,9 +142,10 @@ def fit(self, table: Table, column_names: list[str] | None) -> OneHotEncoder:
         ValueError
             If the table contains 0 rows.
         """
-        if column_names is None:
+        if self._column_names is None:
             column_names = [name for name in table.column_names if not table.get_column_type(name).is_numeric]
         else:
+            column_names = self._column_names
             _check_columns_exist(table, column_names)
             _warn_if_columns_are_numeric(table, column_names)
 
@@ -169,8 +175,7 @@ def fit(self, table: Table, column_names: list[str] | None) -> OneHotEncoder:
                 mapping[name].append((new_name, value))
 
         # Create a copy with the learned transformation
-        result = OneHotEncoder()
-        result._column_names = column_names
+        result = OneHotEncoder(column_names=column_names, separator=self._separator)
         result._new_column_names = new_column_names
         result._mapping = mapping