diff --git a/src/safeds/data/tabular/transformation/__init__.py b/src/safeds/data/tabular/transformation/__init__.py
index bf705e10d..536927ee4 100644
--- a/src/safeds/data/tabular/transformation/__init__.py
+++ b/src/safeds/data/tabular/transformation/__init__.py
@@ -13,6 +13,7 @@
     from ._one_hot_encoder import OneHotEncoder
     from ._range_scaler import RangeScaler
     from ._robust_scaler import RobustScaler
+    from ._sequential_table_transformer import SequentialTableTransformer
     from ._simple_imputer import SimpleImputer
     from ._standard_scaler import StandardScaler
     from ._table_transformer import TableTransformer
@@ -27,6 +28,7 @@
         "LabelEncoder": "._label_encoder:LabelEncoder",
         "OneHotEncoder": "._one_hot_encoder:OneHotEncoder",
         "RangeScaler": "._range_scaler:RangeScaler",
         "RobustScaler": "._robust_scaler:RobustScaler",
+        "SequentialTableTransformer": "._sequential_table_transformer:SequentialTableTransformer",
         "SimpleImputer": "._simple_imputer:SimpleImputer",
         "StandardScaler": "._standard_scaler:StandardScaler",
@@ -42,6 +44,7 @@
     "LabelEncoder",
     "OneHotEncoder",
     "RangeScaler",
     "RobustScaler",
+    "SequentialTableTransformer",
     "SimpleImputer",
     "StandardScaler",
diff --git a/src/safeds/data/tabular/transformation/_sequential_table_transformer.py b/src/safeds/data/tabular/transformation/_sequential_table_transformer.py
new file mode 100644
index 000000000..5d26c2101
--- /dev/null
+++ b/src/safeds/data/tabular/transformation/_sequential_table_transformer.py
@@ -0,0 +1,169 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from warnings import warn
+
+from safeds._utils import _structural_hash
+from safeds.exceptions import TransformerNotFittedError, TransformerNotInvertibleError
+
+from ._invertible_table_transformer import InvertibleTableTransformer
+
+if TYPE_CHECKING:
+    from safeds.data.tabular.containers import Table
+
+    from ._table_transformer import TableTransformer
+
+
+class SequentialTableTransformer(InvertibleTableTransformer):
+    """
+    The SequentialTableTransformer transforms a table using multiple transformers in sequence.
+
+    Parameters
+    ----------
+    transformers:
+        The list of transformers used to transform the table. They are applied in the order given.
+    """
+
+    def __init__(
+        self,
+        transformers: list[TableTransformer],
+    ) -> None:
+        super().__init__(None)
+
+        # Warn (rather than fail) when there is nothing to apply; `None` is treated like an empty list.
+        if not transformers:
+            warn(
+                "transformers should contain at least 1 transformer",
+                UserWarning,
+                stacklevel=2,
+            )
+
+        # Parameters
+        # Copy the list so that later changes to the caller's list cannot alter this transformer.
+        self._transformers: list[TableTransformer] = list(transformers) if transformers else []
+
+        # Internal State
+        self._is_fitted: bool = False
+
+    def __hash__(self) -> int:
+        return _structural_hash(
+            super().__hash__(),
+            self._transformers,
+            self._is_fitted,
+        )
+
+    @property
+    def is_fitted(self) -> bool:
+        """Whether the transformer is fitted."""
+        return self._is_fitted
+
+    def fit(self, table: Table) -> SequentialTableTransformer:
+        """
+        Fit all the transformers in order.
+
+        Each transformer is fitted on the output of its predecessor. The fitted transformers are returned in a new
+        SequentialTableTransformer; this instance is not marked as fitted.
+
+        Parameters
+        ----------
+        table:
+            The table used to fit the transformers.
+
+        Returns
+        -------
+        fitted_transformer:
+            The fitted transformer.
+
+        Raises
+        ------
+        ValueError:
+            Raises a ValueError if the table has no rows.
+        """
+        if table.row_count == 0:
+            raise ValueError("The SequentialTableTransformer cannot be fitted because the table contains 0 rows.")
+
+        current_table: Table = table
+        fitted_transformers: list[TableTransformer] = []
+
+        for transformer in self._transformers:
+            fitted_transformer = transformer.fit(current_table)
+            fitted_transformers.append(fitted_transformer)
+            current_table = fitted_transformer.transform(current_table)
+
+        result: SequentialTableTransformer = SequentialTableTransformer(
+            transformers=fitted_transformers,
+        )
+
+        result._is_fitted = True
+        return result
+
+    def transform(self, table: Table) -> Table:
+        """
+        Transform the table using all the transformers sequentially.
+
+        Might change the order and type of columns based on the transformers used.
+
+        Parameters
+        ----------
+        table:
+            The table to be transformed.
+
+        Returns
+        -------
+        transformed_table:
+            The transformed table.
+
+        Raises
+        ------
+        TransformerNotFittedError:
+            Raises a TransformerNotFittedError if the transformer isn't fitted.
+        """
+        if not self._is_fitted:
+            raise TransformerNotFittedError
+
+        current_table: Table = table
+        for transformer in self._transformers:
+            current_table = transformer.transform(current_table)
+
+        return current_table
+
+    def inverse_transform(self, transformed_table: Table) -> Table:
+        """
+        Inversely transforms the table using all the transformers sequentially in inverse order.
+
+        Might change the order and type of columns based on the transformers used.
+
+        Parameters
+        ----------
+        transformed_table:
+            The table to be transformed back.
+
+        Returns
+        -------
+        original_table:
+            The original table.
+
+        Raises
+        ------
+        TransformerNotFittedError:
+            Raises a TransformerNotFittedError if the transformer isn't fitted.
+        TransformerNotInvertibleError:
+            Raises a TransformerNotInvertibleError if one of the transformers isn't invertible.
+        """
+        if not self._is_fitted:
+            raise TransformerNotFittedError
+
+        # Validate the whole pipeline before touching the table, so that a non-invertible transformer is reported
+        # without first running the (possibly expensive) inverse transformations of the steps after it.
+        invertible_transformers: list[InvertibleTableTransformer] = []
+        for transformer in reversed(self._transformers):
+            if not isinstance(transformer, InvertibleTableTransformer):
+                raise TransformerNotInvertibleError(str(type(transformer)))
+            invertible_transformers.append(transformer)
+
+        # Undo the transformations one by one, starting with the transformer that was applied last.
+        current_table: Table = transformed_table
+        for invertible_transformer in invertible_transformers:
+            current_table = invertible_transformer.inverse_transform(current_table)
+
+        return current_table
diff --git a/src/safeds/exceptions/__init__.py b/src/safeds/exceptions/__init__.py
index dabbc3afa..b885a746e 100644
--- a/src/safeds/exceptions/__init__.py
+++ b/src/safeds/exceptions/__init__.py
@@ -11,6 +11,7 @@
     NonNumericColumnError,
     OutputLengthMismatchError,
     TransformerNotFittedError,
+    TransformerNotInvertibleError,
     ValueNotPresentWhenFittedError,
 )
 from ._ml import (
@@ -66,6 +67,7 @@ class OutOfBoundsError(SafeDsError):
     "NonNumericColumnError",
     "OutputLengthMismatchError",
     "TransformerNotFittedError",
+    "TransformerNotInvertibleError",
     "ValueNotPresentWhenFittedError",
     # ML exceptions
     "DatasetMissesDataError",
diff --git a/src/safeds/exceptions/_data.py b/src/safeds/exceptions/_data.py
index dc729dae0..775cc1847 100644
--- a/src/safeds/exceptions/_data.py
+++ b/src/safeds/exceptions/_data.py
@@ -111,6 +111,13 @@ def __init__(self) -> None:
         super().__init__("The transformer has not been fitted yet.")
 
 
+class TransformerNotInvertibleError(Exception):
+    """Raised when a function tries to invert a non-invertible transformer."""
+
+    def __init__(self, transformer_type: str) -> None:
+        super().__init__(f"{transformer_type} is not invertible.")
+
+
 class ValueNotPresentWhenFittedError(Exception):
     """Exception raised when attempting to one-hot-encode a table containing values not present in the fitting phase."""
 
diff --git a/tests/helpers/_assertions.py b/tests/helpers/_assertions.py
index 4c4847c6e..16cb4f3cf 100644
--- a/tests/helpers/_assertions.py
+++ b/tests/helpers/_assertions.py
@@ -6,7 +6,15 @@
 from safeds.data.tabular.containers import Cell, Column, Table
 
 
-def assert_tables_equal(table1: Table, table2: Table) -> None:
+def assert_tables_equal(
+    table1: Table,
+    table2: Table,
+    *,
+    ignore_column_order: bool = False,
+    ignore_row_order: bool = False,
+    ignore_types: bool = False,
+    ignore_float_imprecision: bool = True,
+) -> None:
     """
     Assert that two tables are almost equal.
 
@@ -16,8 +24,23 @@ def assert_tables_equal(table1: Table, table2: Table) -> None:
         The first table.
     table2:
         The table to compare the first table to.
+    ignore_column_order:
+        Ignore the column order when True. The assertion passes even when the column order is different.
+    ignore_row_order:
+        Ignore the row order when True. The assertion passes even when the row order is different.
+    ignore_types:
+        Ignore differing data types when True. The assertion passes even when columns have differing data types.
+    ignore_float_imprecision:
+        Tolerate small floating point differences when True. If False, floating point values must match EXACTLY.
     """
-    assert_frame_equal(table1._data_frame, table2._data_frame)
+    assert_frame_equal(
+        table1._data_frame,
+        table2._data_frame,
+        check_row_order=not ignore_row_order,
+        check_column_order=not ignore_column_order,
+        check_dtypes=not ignore_types,
+        check_exact=not ignore_float_imprecision,
+    )
 
 
 def assert_that_tabular_datasets_are_equal(table1: TabularDataset, table2: TabularDataset) -> None:
diff --git a/tests/safeds/data/tabular/transformation/test_sequential_table_transformer.py b/tests/safeds/data/tabular/transformation/test_sequential_table_transformer.py
new file mode 100644
index 000000000..634c59aeb
--- /dev/null
+++ b/tests/safeds/data/tabular/transformation/test_sequential_table_transformer.py
@@ -0,0 +1,212 @@
+import pytest
+from safeds.data.tabular.containers import Table
+from safeds.data.tabular.transformation import (
+    Discretizer,
+    LabelEncoder,
+    OneHotEncoder,
+    SequentialTableTransformer,
+    SimpleImputer,
+    StandardScaler,
+    TableTransformer,
+)
+from safeds.exceptions import TransformerNotFittedError, TransformerNotInvertibleError
+
+from tests.helpers import assert_tables_equal
+
+
+class TestInit:
+
+    def test_should_warn_on_empty_list(self) -> None:
+        with pytest.warns(UserWarning, match=("transformers should contain at least 1 transformer")):
+            SequentialTableTransformer(transformers=[])  # type: ignore[attr-defined]
+
+
+class TestFit:
+    def test_should_raise_value_error_on_empty_table(self) -> None:
+        test_table = Table(
+            {
+                "col1": [],
+                "col2": [],
+            },
+        )
+        sequential_table_transformer = SequentialTableTransformer([SimpleImputer(SimpleImputer.Strategy.constant(0))])
+        with pytest.raises(
+            ValueError,
+            match=("The SequentialTableTransformer cannot be fitted because the table contains 0 rows."),
+        ):
+            sequential_table_transformer.fit(test_table)
+
+    def test_should_not_change_original_transformer(self) -> None:
+        one_hot = OneHotEncoder()
+        imputer = SimpleImputer(SimpleImputer.Strategy.constant(0))
+        transformer_list = [one_hot, imputer]
+        test_table = Table(
+            {
+                "col1": [1, 2, None],
+                "col2": ["a", "b", "a"],
+            },
+        )
+        sequential_table_transformer = SequentialTableTransformer(transformers=transformer_list)
+        old_hash = hash(sequential_table_transformer)
+        sequential_table_transformer.fit(test_table)
+        assert old_hash == hash(sequential_table_transformer)
+
+
+class TestTransform:
+    def test_should_raise_if_not_fitted(self) -> None:
+        one_hot = OneHotEncoder()
+        imputer = SimpleImputer(SimpleImputer.Strategy.constant(0))
+        transformers = [one_hot, imputer]
+        test_table = Table(
+            {
+                "col1": [1, 2, None],
+                "col2": ["a", "b", "a"],
+            },
+        )
+        sequential_table_transformer = SequentialTableTransformer(transformers)
+        with pytest.raises(TransformerNotFittedError, match=r"The transformer has not been fitted yet."):
+            sequential_table_transformer.transform(test_table)
+
+    @pytest.mark.parametrize(
+        "transformer",
+        [
+            OneHotEncoder(),
+            SimpleImputer(SimpleImputer.Strategy.constant(0)),
+            LabelEncoder(),
+            SimpleImputer(SimpleImputer.Strategy.mean()),
+        ],
+        ids=["OneHotEncoder", "Imputer with Constant", "LabelEncoder", "Mean Imputer"],
+    )
+    def test_should_do_same_as_transformer_with_single_transformer(self, transformer: TableTransformer) -> None:
+        sequential_transformer = SequentialTableTransformer([transformer])
+        test_table = Table(
+            {
+                "col1": [1, 2, None],
+                "col2": ["a", "b", "a"],
+            },
+        )
+        sequential_transformer = sequential_transformer.fit(test_table)
+        transformer = transformer.fit(test_table)
+        test_table_normal = transformer.transform(test_table)
+        test_table_sequential = sequential_transformer.transform(test_table)
+        assert_tables_equal(test_table_normal, test_table_sequential)
+
+    def test_should_transform_with_multiple_transformers(self) -> None:
+        one_hot = OneHotEncoder()
+        imputer = SimpleImputer(SimpleImputer.Strategy.constant(0))
+        transformers = [one_hot, imputer]
+        test_table = Table(
+            {
+                "col1": [1, 2, None],
+                "col2": ["a", "b", "a"],
+            },
+        )
+        sequential_table_transformer = SequentialTableTransformer(transformers)
+        fitted_sequential_table_transformer = sequential_table_transformer.fit(test_table)
+        transformed_table_sequential = fitted_sequential_table_transformer.transform(test_table)
+
+        one_hot = one_hot.fit(test_table)
+        transformed_table_individual = one_hot.transform(test_table)
+        imputer = imputer.fit(transformed_table_individual)
+        transformed_table_individual = imputer.transform(transformed_table_individual)
+
+        assert_tables_equal(transformed_table_sequential, transformed_table_individual)
+
+
+class TestIsFitted:
+    def test_should_return_false_before_fitting(self) -> None:
+        one_hot = OneHotEncoder()
+        imputer = SimpleImputer(SimpleImputer.Strategy.constant(0))
+        transformers = [one_hot, imputer]
+        sequential_table_transformer = SequentialTableTransformer(transformers)
+        assert sequential_table_transformer.is_fitted is False
+
+    def test_should_return_true_after_fitting(self) -> None:
+        one_hot = OneHotEncoder()
+        imputer = SimpleImputer(SimpleImputer.Strategy.constant(0))
+        transformers = [one_hot, imputer]
+        test_table = Table(
+            {
+                "col1": [1, 2, None],
+                "col2": ["a", "b", "a"],
+            },
+        )
+        sequential_table_transformer = SequentialTableTransformer(transformers)
+        sequential_table_transformer = sequential_table_transformer.fit(test_table)
+        assert sequential_table_transformer.is_fitted is True
+
+
+class TestInverseTransform:
+
+    @pytest.mark.parametrize(
+        "transformers",
+        [
+            [Discretizer(bin_count=3, column_names="col1")],
+            [SimpleImputer(SimpleImputer.Strategy.constant(0))],
+            [SimpleImputer(SimpleImputer.Strategy.constant(0)), Discretizer(bin_count=3)],
+            [
+                LabelEncoder(column_names="col2", partial_order=["a", "b", "c"]),
+                SimpleImputer(SimpleImputer.Strategy.mean()),
+            ],
+        ],
+        ids=["Discretizer", "SimpleImputer", "Multiple non-invertible", "invertible and non-invertible"],
+    )
+    def test_should_raise_transformer_not_invertible_error_on_non_invertible_transformers(
+        self,
+        transformers: list[TableTransformer],
+    ) -> None:
+        test_table = Table(
+            {
+                "col1": [0.1, 0.113, 0.232, 1.199, 2.33, 2.01, 2.99],
+                "col2": ["a", "a", "c", "b", "a", "a", "c"],
+                "col3": [1, 1, None, 3, 14, None, 7],
+            },
+        )
+        sequential_table_transformer = SequentialTableTransformer(transformers)
+        sequential_table_transformer = sequential_table_transformer.fit(test_table)
+        transformed_table = sequential_table_transformer.transform(test_table)
+        with pytest.raises(TransformerNotInvertibleError, match=r".*is not invertible."):
+            sequential_table_transformer.inverse_transform(transformed_table)
+
+    @pytest.mark.parametrize(
+        "transformers",
+        [
+            [OneHotEncoder()],
+            [OneHotEncoder(), StandardScaler(column_names=["col1", "col3"])],
+            [
+                LabelEncoder(column_names="col2", partial_order=["a", "b", "c"]),
+                OneHotEncoder(),
+                StandardScaler(column_names=["col1", "col3"]),
+            ],
+            [LabelEncoder(), LabelEncoder()],
+        ],
+        ids=["1 Transformer", "2 Transformers", "3 Transformers", "Duplicate Transformers"],
+    )
+    def test_should_return_original_table(self, transformers: list[TableTransformer]) -> None:
+        test_table = Table(
+            {
+                "col1": [0.1, 0.113, 0.232, 1.199, 2.33, 2.01, 2.99],
+                "col2": ["a", "a", "c", "b", "a", "a", "c"],
+                "col3": [1.0, 1.0, 0.0, 3.0, 14.0, 0.0, 7.0],
+                "col4": ["one", "two", "one", "two", "one", "two", "one"],
+            },
+        )
+        sequential_table_transformer = SequentialTableTransformer(transformers)
+        sequential_table_transformer = sequential_table_transformer.fit(test_table)
+        transformed_table = sequential_table_transformer.transform(test_table)
+        inverse_transformed_table = sequential_table_transformer.inverse_transform(transformed_table)
+        assert_tables_equal(test_table, inverse_transformed_table, ignore_column_order=True, ignore_types=True)
+
+    def test_should_raise_transformer_not_fitted_error_if_not_fitted(self) -> None:
+        one_hot = OneHotEncoder()
+        imputer = SimpleImputer(SimpleImputer.Strategy.constant(0))
+        transformers = [one_hot, imputer]
+        sequential_table_transformer = SequentialTableTransformer(transformers)
+        test_table = Table(
+            {
+                "col1": [1, 2, None],
+                "col2": ["a", "b", "a"],
+            },
+        )
+        with pytest.raises(TransformerNotFittedError, match=r"The transformer has not been fitted yet."):
+            sequential_table_transformer.inverse_transform(test_table)