From 04629ab4b7f88b93e5f68f2f812ba35ffb1d0678 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Tue, 7 May 2024 23:20:36 +0200 Subject: [PATCH 01/40] refactor: make fields internal --- .../data/tabular/plotting/_experimental_column_plotter.py | 2 +- src/safeds/data/tabular/plotting/_experimental_table_plotter.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/safeds/data/tabular/plotting/_experimental_column_plotter.py b/src/safeds/data/tabular/plotting/_experimental_column_plotter.py index 866229992..970103426 100644 --- a/src/safeds/data/tabular/plotting/_experimental_column_plotter.py +++ b/src/safeds/data/tabular/plotting/_experimental_column_plotter.py @@ -9,7 +9,7 @@ class ExperimentalColumnPlotter: def __init__(self, column: ExperimentalColumn): - self.column: ExperimentalColumn = column + self._column: ExperimentalColumn = column def box_plot(self) -> Image: raise NotImplementedError diff --git a/src/safeds/data/tabular/plotting/_experimental_table_plotter.py b/src/safeds/data/tabular/plotting/_experimental_table_plotter.py index 7c8ca79be..253853c9d 100644 --- a/src/safeds/data/tabular/plotting/_experimental_table_plotter.py +++ b/src/safeds/data/tabular/plotting/_experimental_table_plotter.py @@ -9,7 +9,7 @@ class ExperimentalTablePlotter: def __init__(self, table: ExperimentalTable): - self.table: ExperimentalTable = table + self._table: ExperimentalTable = table def box_plots(self) -> Image: raise NotImplementedError From 2835bd44620de4bf57986d6a389838593bbe5860 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Tue, 7 May 2024 23:29:00 +0200 Subject: [PATCH 02/40] test: store polars benchmark in polars table --- benchmarks/table/row_operations_polars.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/table/row_operations_polars.py b/benchmarks/table/row_operations_polars.py index 9fc014cb8..9419f7810 100644 --- a/benchmarks/table/row_operations_polars.py +++ 
b/benchmarks/table/row_operations_polars.py @@ -1,6 +1,6 @@ from timeit import timeit -from safeds.data.tabular.containers import Table +from safeds.data.tabular.containers import ExperimentalTable from benchmarks.table.utils import create_synthetic_table_polars @@ -99,7 +99,7 @@ def _run_transform_column() -> None: # Print the timings print( - Table( + ExperimentalTable( { "method": list(timings.keys()), "timing": list(timings.values()), From 5cd12295d8717bcbad512dd02f4c71a9469beab3 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Tue, 7 May 2024 23:44:31 +0200 Subject: [PATCH 03/40] style: add comma --- src/safeds/data/tabular/containers/_experimental_column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/safeds/data/tabular/containers/_experimental_column.py b/src/safeds/data/tabular/containers/_experimental_column.py index 3f2533880..5ee4e58fd 100644 --- a/src/safeds/data/tabular/containers/_experimental_column.py +++ b/src/safeds/data/tabular/containers/_experimental_column.py @@ -458,7 +458,7 @@ def summarize_statistics(self) -> ExperimentalTable: self.idness(), self.stability(), ], - } + }, ) def correlation_with(self, other: ExperimentalColumn) -> float: From 9844d652513327c8026495273bbf4d71940e77ba Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 11:20:53 +0200 Subject: [PATCH 04/40] feat: implement some table operations --- benchmarks/table/column_operations_polars.py | 42 ++++ benchmarks/table/row_operations_polars.py | 10 +- .../containers/_experimental_column.py | 4 +- .../tabular/containers/_experimental_table.py | 193 +++++++++++++++--- 4 files changed, 219 insertions(+), 30 deletions(-) create mode 100644 benchmarks/table/column_operations_polars.py diff --git a/benchmarks/table/column_operations_polars.py b/benchmarks/table/column_operations_polars.py new file mode 100644 index 000000000..00ca129c3 --- /dev/null +++ b/benchmarks/table/column_operations_polars.py @@ -0,0 +1,42 @@ +from timeit import timeit 
+ +from safeds.data.tabular.containers import ExperimentalTable + +from benchmarks.table.utils import create_synthetic_table_polars + +REPETITIONS = 10 + + +def _run_remove_columns_with_missing_values() -> None: + table.remove_columns_with_missing_values()._lazy_frame.collect() + + +def _run_remove_non_numeric_columns() -> None: + table.remove_non_numeric_columns()._lazy_frame.collect() + + +if __name__ == "__main__": + # Create a synthetic Table + table = create_synthetic_table_polars(100, 50000) + + # Run the benchmarks + timings: dict[str, float] = { + "remove_columns_with_missing_values": timeit( + _run_remove_columns_with_missing_values, + number=REPETITIONS, + ), + "remove_non_numeric_columns": timeit( + _run_remove_non_numeric_columns, + number=REPETITIONS, + ), + } + + # Print the timings + print( + ExperimentalTable( + { + "method": list(timings.keys()), + "timing": list(timings.values()), + } + ) + ) diff --git a/benchmarks/table/row_operations_polars.py b/benchmarks/table/row_operations_polars.py index 9419f7810..e6fc14aeb 100644 --- a/benchmarks/table/row_operations_polars.py +++ b/benchmarks/table/row_operations_polars.py @@ -23,6 +23,10 @@ def _run_remove_rows() -> None: table.remove_rows(lambda row: row.get_value("column_0") % 2 == 0)._lazy_frame.collect() +def _run_remove_rows_by_column() -> None: + table.remove_rows_by_column("column_0", lambda cell: cell % 2 == 0)._lazy_frame.collect() + + def _run_shuffle_rows() -> None: table.shuffle_rows()._lazy_frame.collect() @@ -51,7 +55,7 @@ def _run_transform_column() -> None: if __name__ == "__main__": # Create a synthetic Table - table = create_synthetic_table_polars(1000, 50) + table = create_synthetic_table_polars(100000, 50) # Run the benchmarks timings: dict[str, float] = { @@ -71,6 +75,10 @@ def _run_transform_column() -> None: _run_remove_rows, number=REPETITIONS, ), + "remove_rows_by_column": timeit( + _run_remove_rows_by_column, + number=REPETITIONS, + ), "shuffle_rows": timeit( 
_run_shuffle_rows, number=REPETITIONS, diff --git a/src/safeds/data/tabular/containers/_experimental_column.py b/src/safeds/data/tabular/containers/_experimental_column.py index 5ee4e58fd..7ef543712 100644 --- a/src/safeds/data/tabular/containers/_experimental_column.py +++ b/src/safeds/data/tabular/containers/_experimental_column.py @@ -494,7 +494,7 @@ def correlation_with(self, other: ExperimentalColumn) -> float: """ import polars as pl - return pl.DataFrame({"a": self._series, "b": other._series}).corr()["a"][1] + return pl.DataFrame({"a": self._series, "b": other._series}).corr().item(row=1, column="a") def distinct_value_count(self) -> int: """ @@ -805,7 +805,7 @@ def to_table(self) -> ExperimentalTable: """ from ._experimental_table import ExperimentalTable - return ExperimentalTable._from_polars_dataframe(self._series.to_frame()) + return ExperimentalTable._from_polars_data_frame(self._series.to_frame()) def temporary_to_old_column(self) -> Column: """ diff --git a/src/safeds/data/tabular/containers/_experimental_table.py b/src/safeds/data/tabular/containers/_experimental_table.py index 4c515a2d9..58cd891c0 100644 --- a/src/safeds/data/tabular/containers/_experimental_table.py +++ b/src/safeds/data/tabular/containers/_experimental_table.py @@ -25,7 +25,7 @@ from collections.abc import Callable, Mapping, Sequence from pathlib import Path - from polars import DataFrame, LazyFrame + import polars as pl from safeds.data.labeled.containers import TabularDataset from safeds.data.tabular.transformation import InvertibleTableTransformer, TableTransformer @@ -72,7 +72,14 @@ class ExperimentalTable: @staticmethod def from_columns(columns: ExperimentalColumn | list[ExperimentalColumn]) -> ExperimentalTable: - raise NotImplementedError + import polars as pl + + if isinstance(columns, ExperimentalColumn): + columns = [columns] + + return ExperimentalTable._from_polars_lazy_frame( + pl.LazyFrame([column._series for column in columns]), + ) @staticmethod def 
from_csv_file(path: str | Path) -> ExperimentalTable: @@ -155,21 +162,95 @@ def from_dict(data: dict[str, list[Any]]) -> ExperimentalTable: @staticmethod def from_json_file(path: str | Path) -> ExperimentalTable: - raise NotImplementedError + """ + Create a table from a JSON file. + + Parameters + ---------- + path: + The path to the JSON file. If the file extension is omitted, it is assumed to be ".json". + + Returns + ------- + table: + The created table. + + Raises + ------ + FileNotFoundError + If no file exists at the given path. + ValueError + If the path has an extension that is not ".json". + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> ExperimentalTable.from_json_file("./src/resources/from_json_file.json") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 1 │ + │ 0 ┆ 0 ┆ 7 │ + └─────┴─────┴─────┘ + """ + import polars as pl + + path = _check_and_normalize_file_path(path, ".json", [".json"], check_if_file_exists=True) + return ExperimentalTable._from_polars_data_frame(pl.read_json(path)) @staticmethod def from_parquet_file(path: str | Path) -> ExperimentalTable: - raise NotImplementedError + """ + Create a table from a Parquet file. + + Parameters + ---------- + path: + The path to the Parquet file. If the file extension is omitted, it is assumed to be ".parquet". + + Returns + ------- + table: + The created table. + + Raises + ------ + FileNotFoundError + If no file exists at the given path. + ValueError + If the path has an extension that is not ".parquet". 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> ExperimentalTable.from_parquet_file("./src/resources/from_parquet_file.parquet") + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 2 ┆ 1 │ + │ 0 ┆ 0 ┆ 7 │ + └─────┴─────┴─────┘ + """ + import polars as pl + + path = _check_and_normalize_file_path(path, ".parquet", [".parquet"], check_if_file_exists=True) + return ExperimentalTable._from_polars_lazy_frame(pl.scan_parquet(path)) @staticmethod - def _from_polars_dataframe(data: DataFrame) -> ExperimentalTable: + def _from_polars_data_frame(data: pl.DataFrame) -> ExperimentalTable: result = object.__new__(ExperimentalTable) result._lazy_frame = data.lazy() result._data_frame = data return result @staticmethod - def _from_polars_lazy_frame(data: LazyFrame) -> ExperimentalTable: + def _from_polars_lazy_frame(data: pl.LazyFrame) -> ExperimentalTable: result = object.__new__(ExperimentalTable) result._lazy_frame = data result._data_frame = None @@ -300,7 +381,18 @@ def add_columns( self, columns: ExperimentalColumn | list[ExperimentalColumn], ) -> ExperimentalTable: - raise NotImplementedError + if isinstance(columns, ExperimentalColumn): + columns = [columns] + + if len(columns) == 0: + return self + + if self._data_frame is None: + self._data_frame = self._lazy_frame.collect() + + return ExperimentalTable._from_polars_data_frame( + self._data_frame.hstack([column._series for column in columns]), + ) def compute_column( self, @@ -330,26 +422,41 @@ def get_column_type(self, name: str) -> ExperimentalDataType: def has_column(self, name: str) -> bool: return name in self.column_names - def remove_columns(self, names: str | list[str]) -> ExperimentalTable: + def remove_columns_by_name( + self, + names: str | list[str], + *, + keep_only_listed: bool = False, + ) -> ExperimentalTable: if isinstance(names, str): names = [names] + if keep_only_listed: + 
names_set = set(names) + names = [name for name in self.column_names if name not in names_set] + return ExperimentalTable._from_polars_lazy_frame( self._lazy_frame.drop(names), ) - def remove_columns_except(self, names: str | list[str]) -> ExperimentalTable: - if isinstance(names, str): - names = [names] + def remove_columns_with_missing_values(self) -> ExperimentalTable: + import polars as pl - names_set = set(names) - return self.remove_columns([name for name in self.column_names if name not in names_set]) + if self._data_frame is None: + self._data_frame = self._lazy_frame.collect() - def remove_columns_with_missing_values(self) -> ExperimentalTable: - raise NotImplementedError + return ExperimentalTable._from_polars_lazy_frame( + pl.LazyFrame( + [series for series in self._data_frame.get_columns() if series.null_count() == 0], + ), + ) - def remove_columns_with_non_numeric_values(self) -> ExperimentalTable: - raise NotImplementedError + def remove_non_numeric_columns(self) -> ExperimentalTable: + import polars.selectors as cs + + return ExperimentalTable._from_polars_lazy_frame( + self._lazy_frame.select(cs.numeric()), + ) def rename_column(self, old_name: str, new_name: str) -> ExperimentalTable: """ @@ -399,7 +506,7 @@ def replace_column( new_columns = [new_columns] if len(new_columns) == 0: - return self.remove_columns(old_name) + return self.remove_columns_by_name(old_name) if self._data_frame is None: self._data_frame = self._lazy_frame.collect() @@ -408,14 +515,14 @@ def replace_column( index = new_frame.get_column_index(old_name) if len(new_columns) == 1: - return ExperimentalTable._from_polars_dataframe( + return ExperimentalTable._from_polars_data_frame( new_frame.replace_column(index, new_columns[0]._series), ) prefix = new_frame.select(self.column_names[:index]) - suffix = new_frame.select(self.column_names[index + 1 :]) + suffix = new_frame.select(self.column_names[index + 1:]) - return ExperimentalTable._from_polars_dataframe( + return 
ExperimentalTable._from_polars_data_frame( prefix.hstack([column._series for column in new_columns]).hstack(suffix), ) @@ -487,7 +594,18 @@ def remove_rows_by_column( name: str, query: Callable[[ExperimentalCell], ExperimentalCell[bool]], ) -> ExperimentalTable: - raise NotImplementedError + import polars as pl + + if not self.has_column(name): + raise UnknownColumnNameError([name]) + + mask = query(_LazyCell(pl.col(name))) + if not isinstance(mask, _LazyCell): + raise TypeError("The query must return a cell.") + + return ExperimentalTable._from_polars_lazy_frame( + self._lazy_frame.filter(mask._expression), + ) def remove_rows_with_missing_values( self, @@ -536,7 +654,7 @@ def shuffle_rows(self) -> ExperimentalTable: if self._data_frame is None: self._data_frame = self._lazy_frame.collect() - return ExperimentalTable._from_polars_dataframe( + return ExperimentalTable._from_polars_data_frame( self._data_frame.sample( fraction=1, shuffle=True, @@ -608,15 +726,27 @@ def split_rows( # ------------------------------------------------------------------------------------------------------------------ def add_table_as_columns(self, other: ExperimentalTable) -> ExperimentalTable: - raise NotImplementedError + if self._data_frame is None: + self._data_frame = self._lazy_frame.collect() + + return ExperimentalTable._from_polars_data_frame( + self._data_frame.hstack(other._data_frame), + ) def add_table_as_rows(self, other: ExperimentalTable) -> ExperimentalTable: - raise NotImplementedError + if self._data_frame is None: + self._data_frame = self._lazy_frame.collect() + + return ExperimentalTable._from_polars_data_frame( + self._data_frame.vstack(other._data_frame), + ) def inverse_transform_table(self, fitted_transformer: InvertibleTableTransformer) -> ExperimentalTable: + """Not implemented yet. 
Convert to the old table implementation first using `temporary_to_old_table`.""" raise NotImplementedError def transform_table(self, fitted_transformer: TableTransformer) -> ExperimentalTable: + """Not implemented yet. Convert to the old table implementation first using `temporary_to_old_table`.""" raise NotImplementedError # ------------------------------------------------------------------------------------------------------------------ @@ -631,7 +761,10 @@ def summarize_statistics(self) -> ExperimentalTable: # ------------------------------------------------------------------------------------------------------------------ def to_columns(self) -> list[ExperimentalColumn]: - raise NotImplementedError + if self._data_frame is None: + self._data_frame = self._lazy_frame.collect() + + return [ExperimentalColumn._from_polars_series(column) for column in self._data_frame.get_columns()] def to_csv_file(self, path: str | Path) -> None: """ @@ -763,7 +896,13 @@ def to_tabular_dataset(self, target_name: str, extra_names: list[str] | None = N Examples -------- >>> from safeds.data.tabular.containers import ExperimentalTable - >>> table = ExperimentalTable({"item": ["apple", "milk", "beer"], "price": [1.10, 1.19, 1.79], "amount_bought": [74, 72, 51]}) + >>> table = ExperimentalTable( + ... { + ... "item": ["apple", "milk", "beer"], + ... "price": [1.10, 1.19, 1.79], + ... "amount_bought": [74, 72, 51], + ... } + ... 
) >>> dataset = table.to_tabular_dataset(target_name="amount_bought", extra_names=["item"]) """ from safeds.data.labeled.containers import TabularDataset From d4023ff0564499bfaac9f13f9a1f55a61b7681b8 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 11:45:15 +0200 Subject: [PATCH 05/40] docs: minor changes --- .../tabular/containers/_experimental_cell.py | 6 +++- .../containers/_experimental_column.py | 2 +- .../containers/_experimental_lazy_cell.py | 2 +- .../_experimental_lazy_vectorized_row.py | 2 +- .../tabular/containers/_experimental_row.py | 6 +++- .../tabular/containers/_experimental_table.py | 36 +++++++++++++++++-- .../_experimental_vectorized_cell.py | 2 +- 7 files changed, 47 insertions(+), 9 deletions(-) diff --git a/src/safeds/data/tabular/containers/_experimental_cell.py b/src/safeds/data/tabular/containers/_experimental_cell.py index c5fe6ce57..ce213947f 100644 --- a/src/safeds/data/tabular/containers/_experimental_cell.py +++ b/src/safeds/data/tabular/containers/_experimental_cell.py @@ -9,7 +9,11 @@ class ExperimentalCell(ABC, Generic[T]): - """A cell is a single value in a table.""" + """ + A single value in a table. + + This class cannot be instantiated directly. It is only used for arguments of callbacks. + """ # ------------------------------------------------------------------------------------------------------------------ # Dunder methods diff --git a/src/safeds/data/tabular/containers/_experimental_column.py b/src/safeds/data/tabular/containers/_experimental_column.py index 7ef543712..8dba021e2 100644 --- a/src/safeds/data/tabular/containers/_experimental_column.py +++ b/src/safeds/data/tabular/containers/_experimental_column.py @@ -26,7 +26,7 @@ class ExperimentalColumn(Sequence[T]): """ - A column is a named, one-dimensional collection of homogeneous values. + A named, one-dimensional collection of homogeneous values. 
Parameters ---------- diff --git a/src/safeds/data/tabular/containers/_experimental_lazy_cell.py b/src/safeds/data/tabular/containers/_experimental_lazy_cell.py index 88b2e7276..acd7472be 100644 --- a/src/safeds/data/tabular/containers/_experimental_lazy_cell.py +++ b/src/safeds/data/tabular/containers/_experimental_lazy_cell.py @@ -16,7 +16,7 @@ class _LazyCell(ExperimentalCell[T]): """ - A cell is a single value in a table. + A single value in a table. This implementation only builds an expression that will be evaluated when needed. """ diff --git a/src/safeds/data/tabular/containers/_experimental_lazy_vectorized_row.py b/src/safeds/data/tabular/containers/_experimental_lazy_vectorized_row.py index 3b328dd07..7f29db99c 100644 --- a/src/safeds/data/tabular/containers/_experimental_lazy_vectorized_row.py +++ b/src/safeds/data/tabular/containers/_experimental_lazy_vectorized_row.py @@ -16,7 +16,7 @@ class _LazyVectorizedRow(ExperimentalRow): """ - A row is a one-dimensional collection of named, heterogeneous values. + A one-dimensional collection of named, heterogeneous values. This implementation treats an entire table as a row, where each column is a "cell" in the row. This greatly speeds up operations on the row. diff --git a/src/safeds/data/tabular/containers/_experimental_row.py b/src/safeds/data/tabular/containers/_experimental_row.py index 92c4aaf02..49a1deb8b 100644 --- a/src/safeds/data/tabular/containers/_experimental_row.py +++ b/src/safeds/data/tabular/containers/_experimental_row.py @@ -12,7 +12,11 @@ class ExperimentalRow(ABC, Mapping[str, Any]): - """A row is a one-dimensional collection of named, heterogeneous values.""" + """ + A one-dimensional collection of named, heterogeneous values. + + This class cannot be instantiated directly. It is only used for arguments of callbacks. 
+ """ # ------------------------------------------------------------------------------------------------------------------ # Dunder methods diff --git a/src/safeds/data/tabular/containers/_experimental_table.py b/src/safeds/data/tabular/containers/_experimental_table.py index 58cd891c0..4b8c6d5cf 100644 --- a/src/safeds/data/tabular/containers/_experimental_table.py +++ b/src/safeds/data/tabular/containers/_experimental_table.py @@ -38,7 +38,7 @@ class ExperimentalTable: """ - A table is a two-dimensional collection of data. It can either be seen as a list of rows or as a list of columns. + A two-dimensional collection of data. It can either be seen as a list of rows or as a list of columns. To create a `Table` call the constructor or use one of the following static methods: @@ -754,7 +754,12 @@ def transform_table(self, fitted_transformer: TableTransformer) -> ExperimentalT # ------------------------------------------------------------------------------------------------------------------ def summarize_statistics(self) -> ExperimentalTable: - raise NotImplementedError + if not self._data_frame: + self._data_frame = self._lazy_frame.collect() + + return ExperimentalTable._from_polars_data_frame( + [] + ) # ------------------------------------------------------------------------------------------------------------------ # Export @@ -859,7 +864,32 @@ def to_json_file( self._data_frame.write_json(path, row_oriented=(orientation == "row")) def to_parquet_file(self, path: str | Path) -> None: - raise NotImplementedError + """ + Write the table to a Parquet file. + + If the file and/or the parent directories do not exist, they will be created. If the file exists already, it + will be overwritten. + + Parameters + ---------- + path: + The path to the Parquet file. If the file extension is omitted, it is assumed to be ".parquet". + + Raises + ------ + ValueError + If the path has an extension that is not ".parquet". 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.to_parquet_file("./src/resources/to_parquet_file.parquet") + """ + path = _check_and_normalize_file_path(path, ".parquet", [".parquet"]) + path.parent.mkdir(parents=True, exist_ok=True) + + self._lazy_frame.sink_parquet(path) def to_tabular_dataset(self, target_name: str, extra_names: list[str] | None = None) -> TabularDataset: """ diff --git a/src/safeds/data/tabular/containers/_experimental_vectorized_cell.py b/src/safeds/data/tabular/containers/_experimental_vectorized_cell.py index 34ee1893f..85744099e 100644 --- a/src/safeds/data/tabular/containers/_experimental_vectorized_cell.py +++ b/src/safeds/data/tabular/containers/_experimental_vectorized_cell.py @@ -21,7 +21,7 @@ class _VectorizedCell(ExperimentalCell[T]): """ - A cell is a single value in a table. + A single value in a table. This implementation treats an entire column as a cell. This greatly speeds up operations on the cell. 
""" From 61e361f0da592b54283dcb1bce18bc9881d0bdd1 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 12:04:19 +0200 Subject: [PATCH 06/40] feat: summarize_statistics --- benchmarks/table/column_operations_polars.py | 10 +++- .../containers/_experimental_column.py | 57 ++++++++++++------- .../tabular/containers/_experimental_table.py | 12 +++- 3 files changed, 53 insertions(+), 26 deletions(-) diff --git a/benchmarks/table/column_operations_polars.py b/benchmarks/table/column_operations_polars.py index 00ca129c3..c452edca5 100644 --- a/benchmarks/table/column_operations_polars.py +++ b/benchmarks/table/column_operations_polars.py @@ -15,9 +15,13 @@ def _run_remove_non_numeric_columns() -> None: table.remove_non_numeric_columns()._lazy_frame.collect() +def _run_summarize_statistics() -> None: + table.summarize_statistics()._lazy_frame.collect() + + if __name__ == "__main__": # Create a synthetic Table - table = create_synthetic_table_polars(100, 50000) + table = create_synthetic_table_polars(100, 5000) # Run the benchmarks timings: dict[str, float] = { @@ -29,6 +33,10 @@ def _run_remove_non_numeric_columns() -> None: _run_remove_non_numeric_columns, number=REPETITIONS, ), + "summarize_statistics": timeit( + _run_summarize_statistics, + number=REPETITIONS, + ), } # Print the timings diff --git a/src/safeds/data/tabular/containers/_experimental_column.py b/src/safeds/data/tabular/containers/_experimental_column.py index 8dba021e2..f8423ea9e 100644 --- a/src/safeds/data/tabular/containers/_experimental_column.py +++ b/src/safeds/data/tabular/containers/_experimental_column.py @@ -12,7 +12,7 @@ from ._experimental_vectorized_cell import _VectorizedCell if TYPE_CHECKING: - from polars import Series + from polars import Series, InvalidOperationError from safeds.data.tabular.typing._experimental_data_type import ExperimentalDataType @@ -412,7 +412,7 @@ def summarize_statistics(self) -> ExperimentalTable: >>> from safeds.data.tabular.containers import 
ExperimentalColumn >>> column = ExperimentalColumn("a", [1, 3]) >>> column.summarize_statistics() - shape: (10, 2) + shape: (9, 2) ┌──────────────────────┬──────────┐ │ metric ┆ a │ │ --- ┆ --- │ @@ -424,14 +424,17 @@ def summarize_statistics(self) -> ExperimentalTable: │ median ┆ 2.0 │ │ standard deviation ┆ 1.414214 │ │ distinct value count ┆ 2.0 │ - │ missing value count ┆ 0.0 │ - │ missing value ratio ┆ 0.0 │ │ idness ┆ 1.0 │ + │ missing value ratio ┆ 0.0 │ │ stability ┆ 0.5 │ └──────────────────────┴──────────┘ """ from ._experimental_table import ExperimentalTable + mean = self.mean() or "-" + median = self.median() or "-" + standard_deviation = self.standard_deviation() or "-" + return ExperimentalTable( { "metric": [ @@ -441,22 +444,20 @@ def summarize_statistics(self) -> ExperimentalTable: "median", "standard deviation", "distinct value count", - "missing value count", - "missing value ratio", "idness", + "missing value ratio", "stability", ], self.name: [ - self.min(), - self.max(), - self.mean(), - self.median(), - self.standard_deviation(), - self.distinct_value_count(), - self.missing_value_count(), - self.missing_value_ratio(), - self.idness(), - self.stability(), + str(self.min()), + str(self.max()), + str(mean), + str(median), + str(standard_deviation), + str(self.distinct_value_count()), + str(self.idness()), + str(self.missing_value_ratio()), + str(self.stability()), ], }, ) @@ -716,7 +717,7 @@ def stability(self) -> float: return mode_count / non_missing.len() - def standard_deviation(self) -> float: + def standard_deviation(self) -> float | None: """ Return the standard deviation of the values in the column. @@ -725,7 +726,8 @@ def standard_deviation(self) -> float: Returns ------- standard_deviation: - The standard deviation of the values in the column. + The standard deviation of the values in the column. If no standard deviation can be calculated due to the + type of the column, None is returned. 
Examples -------- @@ -734,9 +736,14 @@ def standard_deviation(self) -> float: >>> column.standard_deviation() 1.0 """ - return self._series.std() + from polars.exceptions import InvalidOperationError + + try: + return self._series.std() + except InvalidOperationError: + return None - def variance(self) -> float: + def variance(self) -> float | None: """ Return the variance of the values in the column. @@ -745,7 +752,8 @@ def variance(self) -> float: Returns ------- variance: - The variance of the values in the column. + The variance of the values in the column. If no variance can be calculated due to the type of the column, + None is returned. Examples -------- @@ -754,7 +762,12 @@ def variance(self) -> float: >>> column.variance() 1.0 """ - return self._series.var() + from polars.exceptions import InvalidOperationError + + try: + return self._series.var() + except InvalidOperationError: + return None # ------------------------------------------------------------------------------------------------------------------ # Export diff --git a/src/safeds/data/tabular/containers/_experimental_table.py b/src/safeds/data/tabular/containers/_experimental_table.py index 4b8c6d5cf..e76868c62 100644 --- a/src/safeds/data/tabular/containers/_experimental_table.py +++ b/src/safeds/data/tabular/containers/_experimental_table.py @@ -754,11 +754,17 @@ def transform_table(self, fitted_transformer: TableTransformer) -> ExperimentalT # ------------------------------------------------------------------------------------------------------------------ def summarize_statistics(self) -> ExperimentalTable: - if not self._data_frame: - self._data_frame = self._lazy_frame.collect() + if self.number_of_columns == 0: + return ExperimentalTable() + + head = self.get_column(self.column_names[0]).summarize_statistics() + tail = [ + self.get_column(name).summarize_statistics().get_column(name)._series + for name in self.column_names[1:] + ] return ExperimentalTable._from_polars_data_frame( - [] + 
head._lazy_frame.collect().hstack(tail, in_place=True), ) # ------------------------------------------------------------------------------------------------------------------ From e7eeb8641d99dd7b945fee21c375258ede9e57f9 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 14:27:24 +0200 Subject: [PATCH 07/40] feat: tabular dataset backed by polars --- .../data/labeled/containers/__init__.py | 3 + .../_experimental_tabular_dataset.py | 149 ++++++++++++++++++ .../labeled/containers/_tabular_dataset.py | 4 +- .../tabular/containers/_experimental_table.py | 22 +-- 4 files changed, 165 insertions(+), 13 deletions(-) create mode 100644 src/safeds/data/labeled/containers/_experimental_tabular_dataset.py diff --git a/src/safeds/data/labeled/containers/__init__.py b/src/safeds/data/labeled/containers/__init__.py index e6237ec24..8eed70294 100644 --- a/src/safeds/data/labeled/containers/__init__.py +++ b/src/safeds/data/labeled/containers/__init__.py @@ -5,6 +5,7 @@ import apipkg if TYPE_CHECKING: + from ._experimental_tabular_dataset import ExperimentalTabularDataset from ._image_dataset import ImageDataset from ._tabular_dataset import TabularDataset from ._time_series_dataset import TimeSeriesDataset @@ -12,6 +13,7 @@ apipkg.initpkg( __name__, { + "ExperimentalTabularDataset": "._experimental_tabular_dataset:ExperimentalTabularDataset", "ImageDataset": "._image_dataset:ImageDataset", "TabularDataset": "._tabular_dataset:TabularDataset", "TimeSeriesDataset": "._time_series_dataset:TimeSeriesDataset", @@ -19,6 +21,7 @@ ) __all__ = [ + "ExperimentalTabularDataset", "ImageDataset", "TabularDataset", "TimeSeriesDataset", diff --git a/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py b/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py new file mode 100644 index 000000000..7dbfa5950 --- /dev/null +++ b/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py @@ -0,0 +1,149 @@ +from __future__ import annotations + 
import sys
from typing import TYPE_CHECKING

from safeds._utils import _structural_hash

if TYPE_CHECKING:
    from safeds.data.tabular.containers import ExperimentalColumn, ExperimentalTable


class ExperimentalTabularDataset:
    """
    A dataset containing tabular data. It can be used to train machine learning models.

    Columns in a tabular dataset are divided into three categories:

    * The target column is the column that a model should predict.
    * Feature columns are columns that a model should use to make predictions.
    * Extra columns are columns that are neither feature nor target. They can be used to provide additional context,
      like an ID column.

    Feature columns are implicitly defined as all columns except the target and extra columns. If no extra columns
    are specified, all columns except the target column are used as features.

    Parameters
    ----------
    data:
        The data.
    target_name:
        Name of the target column.
    extra_names:
        Names of the columns that are neither features nor target. If None, no extra columns are used, i.e. all but
        the target column are used as features.

    Raises
    ------
    KeyError
        If a column name is not found in the data.
    ValueError
        If the target column is also an extra column.
    ValueError
        If no feature column remains.

    Examples
    --------
    >>> from safeds.data.labeled.containers import ExperimentalTabularDataset
    >>> from safeds.data.tabular.containers import ExperimentalTable
    >>> table = ExperimentalTable({"id": [1, 2, 3], "feature": [4, 5, 6], "target": [1, 2, 3]})
    >>> dataset = ExperimentalTabularDataset(
    ...     table,
    ...     target_name="target",
    ...     extra_names=["id"],
    ... )
    """

    # ------------------------------------------------------------------------------------------------------------------
    # Dunder methods
    # ------------------------------------------------------------------------------------------------------------------

    def __init__(
        self,
        data: ExperimentalTable,
        target_name: str,
        extra_names: list[str] | None = None,
    ):
        # Preprocess inputs
        if extra_names is None:
            extra_names = []

        # Validate inputs early, before building any derived tables
        if target_name in extra_names:
            raise ValueError(f"Column '{target_name}' cannot be both target and extra.")

        # Derive feature names
        non_feature_names = {target_name, *extra_names}  # perf: comprehensions evaluate their condition every iteration
        feature_names = [name for name in data.column_names if name not in non_feature_names]
        if len(feature_names) == 0:
            raise ValueError("At least one feature column must remain.")

        # Set attributes
        self._table: ExperimentalTable = data
        # keep_only_listed=True keeps exactly the listed columns and drops the rest
        self._features: ExperimentalTable = data.remove_columns_by_name(feature_names, keep_only_listed=True)
        self._target: ExperimentalColumn = data.get_column(target_name)
        self._extras: ExperimentalTable = data.remove_columns_by_name(extra_names, keep_only_listed=True)

    def __eq__(self, other: object) -> bool:
        """Two tabular datasets are equal if their targets, features, and extras are equal."""
        if not isinstance(other, ExperimentalTabularDataset):
            return NotImplemented
        if self is other:
            return True
        return self.target == other.target and self.features == other.features and self._extras == other._extras

    def __hash__(self) -> int:
        # Must stay consistent with __eq__: only target, features, and extras participate.
        return _structural_hash(self.target, self.features, self._extras)

    def __sizeof__(self) -> int:
        # Only the derived parts are measured; self._table is not included in the sum.
        return sys.getsizeof(self._target) + sys.getsizeof(self._features) + sys.getsizeof(self._extras)

    # ------------------------------------------------------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------------------------------------------------------

    @property
    def features(self) -> ExperimentalTable:
        """The feature columns of the tabular dataset."""
        return self._features

    @property
    def target(self) -> ExperimentalColumn:
        """The target column of the tabular dataset."""
        return self._target

    @property
    def extras(self) -> ExperimentalTable:
        """
        Additional columns of the tabular dataset that are neither features nor target.

        These can be used to store additional information about instances, such as IDs.
        """
        return self._extras

    # ------------------------------------------------------------------------------------------------------------------
    # Conversion
    # ------------------------------------------------------------------------------------------------------------------

    def to_table(self) -> ExperimentalTable:
        """
        Return a table containing all columns of the tabular dataset.

        Returns
        -------
        table:
            A table containing all columns of the tabular dataset.
        """
        return self._table

    # ------------------------------------------------------------------------------------------------------------------
    # IPython integration
    # ------------------------------------------------------------------------------------------------------------------

    def _repr_html_(self) -> str:
        """
        Return an HTML representation of the tabular dataset.

        Returns
        -------
        html:
            The generated HTML.
        """
        return self._table._repr_html_()
safeds.data.tabular.typing import ExperimentalSchema from safeds.data.tabular.typing._experimental_data_type import ExperimentalDataType @@ -432,8 +432,8 @@ def remove_columns_by_name( names = [names] if keep_only_listed: - names_set = set(names) - names = [name for name in self.column_names if name not in names_set] + names_to_keep = set(names) # perf: Comprehensions evaluate their condition every iteration + names = [name for name in self.column_names if name not in names_to_keep] return ExperimentalTable._from_polars_lazy_frame( self._lazy_frame.drop(names), @@ -742,11 +742,14 @@ def add_table_as_rows(self, other: ExperimentalTable) -> ExperimentalTable: ) def inverse_transform_table(self, fitted_transformer: InvertibleTableTransformer) -> ExperimentalTable: - """Not implemented yet. Convert to the old table implementation first using `temporary_to_old_table`.""" + # TODO: more efficient implementation + # old_table = self.temporary_to_old_table().inverse_transform_table(fitted_transformer) + # return ExperimentalTable._from_polars_data_frame( + # pl.DataFrame(old_table.) + # ) raise NotImplementedError def transform_table(self, fitted_transformer: TableTransformer) -> ExperimentalTable: - """Not implemented yet. Convert to the old table implementation first using `temporary_to_old_table`.""" raise NotImplementedError # ------------------------------------------------------------------------------------------------------------------ @@ -897,14 +900,14 @@ def to_parquet_file(self, path: str | Path) -> None: self._lazy_frame.sink_parquet(path) - def to_tabular_dataset(self, target_name: str, extra_names: list[str] | None = None) -> TabularDataset: + def to_tabular_dataset(self, target_name: str, extra_names: list[str] | None = None) -> ExperimentalTabularDataset: """ Return a new `TabularDataset` with columns marked as a target, feature, or extra. * The target column is the column that a model should predict. 
* Feature columns are columns that a model should use to make predictions. * Extra columns are columns that are neither feature nor target. They can be used to provide additional context, - like an ID or name column. + like an ID column. Feature columns are implicitly defined as all columns except the target and extra columns. If no extra columns are specified, all columns except the target column are used as features. @@ -941,10 +944,7 @@ def to_tabular_dataset(self, target_name: str, extra_names: list[str] | None = N ... ) >>> dataset = table.to_tabular_dataset(target_name="amount_bought", extra_names=["item"]) """ - from safeds.data.labeled.containers import TabularDataset - - # TODO: more efficient implementation - return TabularDataset(self.temporary_to_old_table(), target_name, extra_names) + return ExperimentalTabularDataset(self, target_name, extra_names) def temporary_to_old_table(self) -> Table: """ From 7b6cd5d54d89da33ba92446e19f723686e1962c6 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 14:39:04 +0200 Subject: [PATCH 08/40] refactor: get polars `IntoExpr` for any cell implementation --- .../tabular/containers/_experimental_cell.py | 10 +++++++++- .../containers/_experimental_lazy_cell.py | 4 ++++ .../tabular/containers/_experimental_table.py | 16 ++++------------ .../_experimental_vectorized_cell.py | 4 ++++ .../classical/classification/_classifier.py | 19 ++++++++++++++----- 5 files changed, 35 insertions(+), 18 deletions(-) diff --git a/src/safeds/data/tabular/containers/_experimental_cell.py b/src/safeds/data/tabular/containers/_experimental_cell.py index ce213947f..bae73d679 100644 --- a/src/safeds/data/tabular/containers/_experimental_cell.py +++ b/src/safeds/data/tabular/containers/_experimental_cell.py @@ -1,7 +1,10 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import Any, Generic, TypeVar +from typing import TYPE_CHECKING, Any, Generic, TypeVar + +if TYPE_CHECKING: + import polars as 
pl T = TypeVar("T") P = TypeVar("P") @@ -129,6 +132,11 @@ def __sizeof__(self) -> int: ... # Internal # ------------------------------------------------------------------------------------------------------------------ + @property + @abstractmethod + def _polars_expression(self) -> pl.Expr | pl.Series: + """The Polars expression that corresponds to this cell.""" + @abstractmethod def _equals(self, other: object) -> bool: """ diff --git a/src/safeds/data/tabular/containers/_experimental_lazy_cell.py b/src/safeds/data/tabular/containers/_experimental_lazy_cell.py index acd7472be..2ece9095d 100644 --- a/src/safeds/data/tabular/containers/_experimental_lazy_cell.py +++ b/src/safeds/data/tabular/containers/_experimental_lazy_cell.py @@ -164,6 +164,10 @@ def __sizeof__(self) -> int: # Internal # ------------------------------------------------------------------------------------------------------------------ + @property + def _polars_expression(self) -> pl.Expr: + return self._expression + def _equals(self, other: object) -> bool: if not isinstance(other, _LazyCell): return NotImplemented diff --git a/src/safeds/data/tabular/containers/_experimental_table.py b/src/safeds/data/tabular/containers/_experimental_table.py index 4c42c252c..53c02b5c3 100644 --- a/src/safeds/data/tabular/containers/_experimental_table.py +++ b/src/safeds/data/tabular/containers/_experimental_table.py @@ -403,11 +403,9 @@ def compute_column( raise DuplicateColumnNameError(name) computed_column = computer(_LazyVectorizedRow(self)) - if not isinstance(computed_column, _LazyCell): - raise TypeError("The computer must return a cell.") return self._from_polars_lazy_frame( - self._lazy_frame.with_columns(name, computed_column._expression), + self._lazy_frame.with_columns(name, computed_column._polars_expression), ) def get_column(self, name: str) -> ExperimentalColumn: @@ -582,11 +580,9 @@ def remove_rows( query: Callable[[ExperimentalRow], ExperimentalCell[bool]], ) -> ExperimentalTable: mask = 
query(_LazyVectorizedRow(self)) - if not isinstance(mask, _LazyCell): - raise TypeError("The query must return a boolean cell.") return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.filter(mask._expression), + self._lazy_frame.filter(mask._polars_expression), ) def remove_rows_by_column( @@ -600,11 +596,9 @@ def remove_rows_by_column( raise UnknownColumnNameError([name]) mask = query(_LazyCell(pl.col(name))) - if not isinstance(mask, _LazyCell): - raise TypeError("The query must return a cell.") return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.filter(mask._expression), + self._lazy_frame.filter(mask._polars_expression), ) def remove_rows_with_missing_values( @@ -674,12 +668,10 @@ def sort_rows( descending: bool = False, ) -> ExperimentalTable: key = key_selector(_LazyVectorizedRow(self)) - if not isinstance(key, _LazyCell): - raise TypeError("The key selector must return a cell.") return ExperimentalTable._from_polars_lazy_frame( self._lazy_frame.sort( - key._expression, + key._polars_expression, descending=descending, maintain_order=True, ), diff --git a/src/safeds/data/tabular/containers/_experimental_vectorized_cell.py b/src/safeds/data/tabular/containers/_experimental_vectorized_cell.py index 85744099e..24b3049e2 100644 --- a/src/safeds/data/tabular/containers/_experimental_vectorized_cell.py +++ b/src/safeds/data/tabular/containers/_experimental_vectorized_cell.py @@ -220,6 +220,10 @@ def type(self) -> ExperimentalDataType: # Internal # ------------------------------------------------------------------------------------------------------------------ + @property + def _polars_expression(self) -> pl.Series: + return self._series + def _equals(self, other: object) -> bool: if not isinstance(other, _VectorizedCell): return NotImplemented diff --git a/src/safeds/ml/classical/classification/_classifier.py b/src/safeds/ml/classical/classification/_classifier.py index c9a05cff3..ca08c241a 100644 --- 
a/src/safeds/ml/classical/classification/_classifier.py +++ b/src/safeds/ml/classical/classification/_classifier.py @@ -1,11 +1,11 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, overload from safeds._utils import _structural_hash -from safeds.data.labeled.containers import TabularDataset -from safeds.data.tabular.containers import Table +from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset +from safeds.data.tabular.containers import Table, ExperimentalTable from safeds.exceptions import PlainTableError if TYPE_CHECKING: @@ -29,7 +29,7 @@ def __hash__(self) -> int: return _structural_hash(self.__class__.__qualname__, self.is_fitted) @abstractmethod - def fit(self, training_set: TabularDataset) -> Classifier: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> Classifier: """ Create a copy of this classifier and fit it with the given training data. @@ -51,8 +51,17 @@ def fit(self, training_set: TabularDataset) -> Classifier: If the training data contains invalid values or if the training failed. """ + @overload + def predict(self, dataset: Table) -> TabularDataset: ... + + @overload + def predict(self, dataset: ExperimentalTable | ExperimentalTabularDataset) -> ExperimentalTabularDataset: ... + @abstractmethod - def predict(self, dataset: Table) -> TabularDataset: + def predict( + self, + dataset: Table | ExperimentalTable | ExperimentalTabularDataset, + ) -> TabularDataset | ExperimentalTabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
From f9c62640d7bfb9dcba7294c9ef586f5d17c7e218 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 14:41:30 +0200 Subject: [PATCH 09/40] feat: lazy `transform_column` --- .../data/tabular/containers/_experimental_table.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/safeds/data/tabular/containers/_experimental_table.py b/src/safeds/data/tabular/containers/_experimental_table.py index 53c02b5c3..e894a23b1 100644 --- a/src/safeds/data/tabular/containers/_experimental_table.py +++ b/src/safeds/data/tabular/containers/_experimental_table.py @@ -19,7 +19,6 @@ from ._experimental_column import ExperimentalColumn from ._experimental_lazy_cell import _LazyCell from ._experimental_lazy_vectorized_row import _LazyVectorizedRow -from ._experimental_vectorized_cell import _VectorizedCell from ._table import Table if TYPE_CHECKING: @@ -532,13 +531,12 @@ def transform_column( if not self.has_column(name): raise UnknownColumnNameError([name]) # TODO: in the error, compute similar column names - transformed_column = transformer(_VectorizedCell(self.get_column(name))) - if not isinstance(transformed_column, _VectorizedCell): - raise TypeError("The transformer must return a cell.") + import polars as pl + + transformed_column = transformer(_LazyCell(pl.col(name))) - return self.replace_column( - name, - ExperimentalColumn._from_polars_series(transformed_column._series), + return ExperimentalTable._from_polars_lazy_frame( + self._lazy_frame.with_columns(transformed_column._polars_expression), ) # ------------------------------------------------------------------------------------------------------------------ From f9bcba044d70c27528839dc6626982818847fb1a Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 15:39:39 +0200 Subject: [PATCH 10/40] feat: handle new data structures in models --- .../_experimental_tabular_dataset.py | 2 +- src/safeds/ml/classical/_util_sklearn.py | 184 ++++++++++++------ 
.../ml/classical/classification/_ada_boost.py | 8 +- .../classical/classification/_classifier.py | 47 +++-- .../classification/_decision_tree.py | 8 +- .../classification/_gradient_boosting.py | 8 +- .../classification/_k_nearest_neighbors.py | 8 +- .../classification/_logistic_regression.py | 8 +- .../classification/_random_forest.py | 8 +- .../classification/_support_vector_machine.py | 10 +- .../ml/classical/regression/_ada_boost.py | 8 +- .../ml/classical/regression/_decision_tree.py | 8 +- .../regression/_elastic_net_regression.py | 8 +- .../regression/_gradient_boosting.py | 8 +- .../regression/_k_nearest_neighbors.py | 8 +- .../classical/regression/_lasso_regression.py | 8 +- .../regression/_linear_regression.py | 8 +- .../ml/classical/regression/_random_forest.py | 8 +- .../ml/classical/regression/_regressor.py | 68 +++++-- .../classical/regression/_ridge_regression.py | 8 +- .../regression/_support_vector_machine.py | 10 +- 21 files changed, 276 insertions(+), 165 deletions(-) diff --git a/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py b/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py index 7dbfa5950..20f49023c 100644 --- a/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py +++ b/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py @@ -139,7 +139,7 @@ def to_table(self) -> ExperimentalTable: def _repr_html_(self) -> str: """ - Return an HTML representation of the tabular dataset. + Return a compact HTML representation of the tabular dataset for IPython. 
Returns ------- diff --git a/src/safeds/ml/classical/_util_sklearn.py b/src/safeds/ml/classical/_util_sklearn.py index 12025d585..6f0cabae5 100644 --- a/src/safeds/ml/classical/_util_sklearn.py +++ b/src/safeds/ml/classical/_util_sklearn.py @@ -1,8 +1,8 @@ import warnings from typing import Any -from safeds.data.labeled.containers import TabularDataset -from safeds.data.tabular.containers import Table +from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset +from safeds.data.tabular.containers import ExperimentalColumn, ExperimentalTable, Table from safeds.exceptions import ( DatasetMissesDataError, DatasetMissesFeaturesError, @@ -15,8 +15,7 @@ ) -# noinspection PyProtectedMember -def fit(model: Any, tabular_dataset: TabularDataset) -> None: +def fit(model: Any, tabular_dataset: TabularDataset | ExperimentalTabularDataset) -> None: """ Fit a model for a given tabular dataset. @@ -46,9 +45,14 @@ def fit(model: Any, tabular_dataset: TabularDataset) -> None: if tabular_dataset._table.number_of_rows == 0: raise DatasetMissesDataError - non_numerical_column_names = set(tabular_dataset.features.column_names) - set( - tabular_dataset.features.remove_columns_with_non_numerical_values().column_names, - ) + if isinstance(tabular_dataset, TabularDataset): + non_numerical_column_names = set(tabular_dataset.features.column_names) - set( + tabular_dataset.features.remove_columns_with_non_numerical_values().column_names, + ) + else: # pragma: no cover + non_numerical_column_names = set(tabular_dataset.features.column_names) - set( + tabular_dataset.features.remove_non_numeric_columns().column_names, + ) if len(non_numerical_column_names) != 0: raise NonNumericColumnError( str(non_numerical_column_names), @@ -68,16 +72,26 @@ def fit(model: Any, tabular_dataset: TabularDataset) -> None: ) try: - model.fit( - tabular_dataset.features._data, - tabular_dataset.target._data, - ) + if isinstance(tabular_dataset, TabularDataset): + model.fit( + 
tabular_dataset.features._data, + tabular_dataset.target._data, + ) + else: # pragma: no cover + model.fit( + tabular_dataset.features.__dataframe__(), + tabular_dataset.target._series, + ) except ValueError as exception: raise LearningError(str(exception)) from exception # noinspection PyProtectedMember -def predict(model: Any, dataset: Table, feature_names: list[str] | None, target_name: str | None) -> TabularDataset: +def predict( + model: Any, + dataset: Table | ExperimentalTable | ExperimentalTabularDataset, + feature_names: list[str] | None, target_name: str | None, +) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. @@ -115,57 +129,105 @@ def predict(model: Any, dataset: Table, feature_names: list[str] | None, target_ # Validation if model is None or target_name is None or feature_names is None: raise ModelNotFittedError - missing_feature_names = [feature_name for feature_name in feature_names if not dataset.has_column(feature_name)] - if missing_feature_names: - raise DatasetMissesFeaturesError(missing_feature_names) - if isinstance(dataset, TabularDataset): - dataset = dataset.features # Cast to Table type, so Python will call the right methods... 
+ if isinstance(dataset, ExperimentalTabularDataset): # pragma: no cover + dataset = dataset.features - if dataset.number_of_rows == 0: - raise DatasetMissesDataError + if isinstance(dataset, Table): + missing_feature_names = [feature_name for feature_name in feature_names if not dataset.has_column(feature_name)] + if missing_feature_names: + raise DatasetMissesFeaturesError(missing_feature_names) - non_numerical_column_names = set(dataset.keep_only_columns(feature_names).column_names) - set( - dataset.keep_only_columns(feature_names).remove_columns_with_non_numerical_values().column_names, - ) - if len(non_numerical_column_names) != 0: - raise NonNumericColumnError( - str(non_numerical_column_names), - "You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical" - " data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too many" - " different values\nor is ordinal, you should use the LabelEncoder.\n", - ) + if dataset.number_of_rows == 0: + raise DatasetMissesDataError - null_containing_column_names = set(dataset.keep_only_columns(feature_names).column_names) - set( - dataset.keep_only_columns(feature_names).remove_columns_with_missing_values().column_names, - ) - if len(null_containing_column_names) != 0: - raise MissingValuesColumnError( - str(null_containing_column_names), - "You can use the Imputer to replace the missing values based on different strategies.\nIf you want to" - " remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`.", + non_numerical_column_names = set(dataset.keep_only_columns(feature_names).column_names) - set( + dataset.keep_only_columns(feature_names).remove_columns_with_non_numerical_values().column_names, ) - - dataset_df = dataset.keep_only_columns(feature_names)._data - dataset_df.columns = feature_names - - result_set = dataset._data.reset_index(drop=True) - result_set.columns = dataset.column_names - - try: - with 
warnings.catch_warnings(): - warnings.filterwarnings("ignore", message="X does not have valid feature names") - predicted_target_vector = model.predict(dataset_df.values) - result_set[target_name] = predicted_target_vector - - extra_names = [ - column_name - for column_name in dataset.column_names - if column_name != target_name and column_name not in feature_names - ] - - return Table._from_pandas_dataframe(result_set).to_tabular_dataset( - target_name=target_name, - extra_names=extra_names, + if len(non_numerical_column_names) != 0: + raise NonNumericColumnError( + str(non_numerical_column_names), + "You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical" + " data.\nThe OneHotEncoder should be used if you work with nominal data. If your data contains too many" + " different values\nor is ordinal, you should use the LabelEncoder.\n", + ) + + null_containing_column_names = set(dataset.keep_only_columns(feature_names).column_names) - set( + dataset.keep_only_columns(feature_names).remove_columns_with_missing_values().column_names, ) - except ValueError as exception: - raise PredictionError(str(exception)) from exception + if len(null_containing_column_names) != 0: + raise MissingValuesColumnError( + str(null_containing_column_names), + "You can use the Imputer to replace the missing values based on different strategies.\nIf you want to" + " remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`.", + ) + + dataset_df = dataset.keep_only_columns(feature_names)._data + dataset_df.columns = feature_names + + result_set = dataset._data.reset_index(drop=True) + result_set.columns = dataset.column_names + + try: + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="X does not have valid feature names") + predicted_target_vector = model.predict(dataset_df.values) + result_set[target_name] = predicted_target_vector + + extra_names = [ + column_name + for column_name 
in dataset.column_names + if column_name != target_name and column_name not in feature_names + ] + + return Table._from_pandas_dataframe(result_set).to_tabular_dataset( + target_name=target_name, + extra_names=extra_names, + ) + except ValueError as exception: + raise PredictionError(str(exception)) from exception + elif isinstance(dataset, ExperimentalTable): # pragma: no cover + missing_feature_names = [feature_name for feature_name in feature_names if not dataset.has_column(feature_name)] + if missing_feature_names: + raise DatasetMissesFeaturesError(missing_feature_names) + + if dataset.number_of_rows == 0: + raise DatasetMissesDataError + + non_numerical_column_names_2 = dataset.remove_non_numeric_columns().column_names + if len(non_numerical_column_names_2) != 0: + raise NonNumericColumnError( + str(non_numerical_column_names_2), + "You can use the LabelEncoder or OneHotEncoder to transform your non-numerical data to numerical" + " data.\nThe OneHotEncoder should be used if you work with nominal data. 
If your data contains too many" + " different values\nor is ordinal, you should use the LabelEncoder.\n", + ) + + null_containing_column_names_2 = dataset.remove_columns_with_missing_values().column_names + if len(null_containing_column_names_2) != 0: + raise MissingValuesColumnError( + str(null_containing_column_names_2), + "You can use the Imputer to replace the missing values based on different strategies.\nIf you want to" + " remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`.", + ) + + dataset_df = dataset.remove_columns_by_name(feature_names, keep_only_listed=True) + + try: + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message="X does not have valid feature names") + predicted_target_vector = model.predict(dataset_df.__dataframe__()) + dataset_df.add_columns(ExperimentalColumn(target_name, predicted_target_vector)) + + extra_names = [ + column_name + for column_name in dataset.column_names + if column_name != target_name and column_name not in feature_names + ] + + return TabularDataset( + dataset_df.to_dict(), + target_name=target_name, + extra_names=extra_names, + ) + except ValueError as exception: + raise PredictionError(str(exception)) from exception diff --git a/src/safeds/ml/classical/classification/_ada_boost.py b/src/safeds/ml/classical/classification/_ada_boost.py index d251e542c..20c1b2304 100644 --- a/src/safeds/ml/classical/classification/_ada_boost.py +++ b/src/safeds/ml/classical/classification/_ada_boost.py @@ -12,8 +12,8 @@ from sklearn.base import ClassifierMixin from sklearn.ensemble import AdaBoostClassifier as sk_AdaBoostClassifier - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class AdaBoostClassifier(Classifier): @@ -109,7 +109,7 @@ def 
learning_rate(self) -> float: """ return self._learning_rate - def fit(self, training_set: TabularDataset) -> AdaBoostClassifier: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> AdaBoostClassifier: """ Create a copy of this classifier and fit it with the given training data. @@ -152,7 +152,7 @@ def fit(self, training_set: TabularDataset) -> AdaBoostClassifier: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. diff --git a/src/safeds/ml/classical/classification/_classifier.py b/src/safeds/ml/classical/classification/_classifier.py index ca08c241a..614428092 100644 --- a/src/safeds/ml/classical/classification/_classifier.py +++ b/src/safeds/ml/classical/classification/_classifier.py @@ -1,11 +1,11 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, overload +from typing import TYPE_CHECKING from safeds._utils import _structural_hash from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset -from safeds.data.tabular.containers import Table, ExperimentalTable +from safeds.data.tabular.containers import ExperimentalTable, Table from safeds.exceptions import PlainTableError if TYPE_CHECKING: @@ -51,17 +51,11 @@ def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> Clas If the training data contains invalid values or if the training failed. """ - @overload - def predict(self, dataset: Table) -> TabularDataset: ... - - @overload - def predict(self, dataset: ExperimentalTable | ExperimentalTabularDataset) -> ExperimentalTabularDataset: ... 
- @abstractmethod def predict( self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset, - ) -> TabularDataset | ExperimentalTabularDataset: + ) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. @@ -105,7 +99,11 @@ def _get_sklearn_classifier(self) -> ClassifierMixin: # Metrics # ------------------------------------------------------------------------------------------------------------------ - def summarize_metrics(self, validation_or_test_set: TabularDataset, positive_class: Any) -> Table: + def summarize_metrics( + self, + validation_or_test_set: TabularDataset | ExperimentalTabularDataset, + positive_class: Any, + ) -> Table: """ Summarize the classifier's metrics on the given data. @@ -138,7 +136,7 @@ def summarize_metrics(self, validation_or_test_set: TabularDataset, positive_cla }, ) - def accuracy(self, validation_or_test_set: TabularDataset) -> float: + def accuracy(self, validation_or_test_set: TabularDataset | ExperimentalTabularDataset) -> float: """ Compute the accuracy of the classifier on the given data. 
@@ -162,12 +160,20 @@ def accuracy(self, validation_or_test_set: TabularDataset) -> float: if not isinstance(validation_or_test_set, TabularDataset) and isinstance(validation_or_test_set, Table): raise PlainTableError - expected_values = validation_or_test_set.target - predicted_values = self.predict(validation_or_test_set.features).target + if isinstance(validation_or_test_set, TabularDataset): + expected_values = validation_or_test_set.target + else: # pragma: no cover + expected_values = validation_or_test_set.target._series + predicted_values = self.predict(validation_or_test_set.features).target._data - return sk_accuracy_score(expected_values._data, predicted_values._data) + # TODO: more efficient implementation using polars + return sk_accuracy_score(expected_values._data, predicted_values) - def precision(self, validation_or_test_set: TabularDataset, positive_class: Any) -> float: + def precision( + self, + validation_or_test_set: TabularDataset | ExperimentalTabularDataset, + positive_class: Any, + ) -> float: """ Compute the classifier's precision on the given data. @@ -193,6 +199,7 @@ def precision(self, validation_or_test_set: TabularDataset, positive_class: Any) n_true_positives = 0 n_false_positives = 0 + # TODO: more efficient implementation using polars for expected_value, predicted_value in zip(expected_values, predicted_values, strict=True): if predicted_value == positive_class: if expected_value == positive_class: @@ -204,7 +211,7 @@ def precision(self, validation_or_test_set: TabularDataset, positive_class: Any) return 1.0 return n_true_positives / (n_true_positives + n_false_positives) - def recall(self, validation_or_test_set: TabularDataset, positive_class: Any) -> float: + def recall(self, validation_or_test_set: TabularDataset | ExperimentalTabularDataset, positive_class: Any) -> float: """ Compute the classifier's recall on the given data. 
@@ -230,6 +237,7 @@ def recall(self, validation_or_test_set: TabularDataset, positive_class: Any) -> n_true_positives = 0 n_false_negatives = 0 + # TODO: more efficient implementation using polars for expected_value, predicted_value in zip(expected_values, predicted_values, strict=True): if predicted_value == positive_class: if expected_value == positive_class: @@ -241,7 +249,11 @@ def recall(self, validation_or_test_set: TabularDataset, positive_class: Any) -> return 1.0 return n_true_positives / (n_true_positives + n_false_negatives) - def f1_score(self, validation_or_test_set: TabularDataset, positive_class: Any) -> float: + def f1_score( + self, + validation_or_test_set: TabularDataset | ExperimentalTabularDataset, + positive_class: Any, + ) -> float: """ Compute the classifier's $F_1$-score on the given data. @@ -268,6 +280,7 @@ def f1_score(self, validation_or_test_set: TabularDataset, positive_class: Any) n_false_negatives = 0 n_false_positives = 0 + # TODO: more efficient implementation using polars for expected_value, predicted_value in zip(expected_values, predicted_values, strict=True): if predicted_value == positive_class: if expected_value == positive_class: diff --git a/src/safeds/ml/classical/classification/_decision_tree.py b/src/safeds/ml/classical/classification/_decision_tree.py index ca7cd8d5b..e9a43466c 100644 --- a/src/safeds/ml/classical/classification/_decision_tree.py +++ b/src/safeds/ml/classical/classification/_decision_tree.py @@ -12,8 +12,8 @@ from sklearn.base import ClassifierMixin from sklearn.tree import DecisionTreeClassifier as sk_DecisionTreeClassifier - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class DecisionTreeClassifier(Classifier): @@ -77,7 +77,7 @@ def minimum_number_of_samples_in_leaves(self) -> int: 
"""The minimum number of samples that must remain in the leaves of the tree.""" return self._minimum_number_of_samples_in_leaves - def fit(self, training_set: TabularDataset) -> DecisionTreeClassifier: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> DecisionTreeClassifier: """ Create a copy of this classifier and fit it with the given training data. @@ -119,7 +119,7 @@ def fit(self, training_set: TabularDataset) -> DecisionTreeClassifier: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. diff --git a/src/safeds/ml/classical/classification/_gradient_boosting.py b/src/safeds/ml/classical/classification/_gradient_boosting.py index 56545d345..869c77028 100644 --- a/src/safeds/ml/classical/classification/_gradient_boosting.py +++ b/src/safeds/ml/classical/classification/_gradient_boosting.py @@ -12,8 +12,8 @@ from sklearn.base import ClassifierMixin from sklearn.ensemble import GradientBoostingClassifier as sk_GradientBoostingClassifier - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class GradientBoostingClassifier(Classifier): @@ -84,7 +84,7 @@ def learning_rate(self) -> float: """ return self._learning_rate - def fit(self, training_set: TabularDataset) -> GradientBoostingClassifier: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> GradientBoostingClassifier: """ Create a copy of this classifier and fit it with the given training data. 
@@ -123,7 +123,7 @@ def fit(self, training_set: TabularDataset) -> GradientBoostingClassifier: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. diff --git a/src/safeds/ml/classical/classification/_k_nearest_neighbors.py b/src/safeds/ml/classical/classification/_k_nearest_neighbors.py index 82c6cf920..974a4f9a5 100644 --- a/src/safeds/ml/classical/classification/_k_nearest_neighbors.py +++ b/src/safeds/ml/classical/classification/_k_nearest_neighbors.py @@ -3,8 +3,8 @@ from typing import TYPE_CHECKING from safeds._utils import _structural_hash -from safeds.data.labeled.containers import TabularDataset -from safeds.data.tabular.containers import Table +from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset +from safeds.data.tabular.containers import ExperimentalTable, Table from safeds.exceptions import ClosedBound, DatasetMissesDataError, OutOfBoundsError, PlainTableError from safeds.ml.classical._util_sklearn import fit, predict @@ -64,7 +64,7 @@ def number_of_neighbors(self) -> int: """ return self._number_of_neighbors - def fit(self, training_set: TabularDataset) -> KNearestNeighborsClassifier: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> KNearestNeighborsClassifier: """ Create a copy of this classifier and fit it with the given training data. @@ -116,7 +116,7 @@ def fit(self, training_set: TabularDataset) -> KNearestNeighborsClassifier: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/classification/_logistic_regression.py b/src/safeds/ml/classical/classification/_logistic_regression.py index c3e0b09d0..22a6bcd00 100644 --- a/src/safeds/ml/classical/classification/_logistic_regression.py +++ b/src/safeds/ml/classical/classification/_logistic_regression.py @@ -11,8 +11,8 @@ from sklearn.base import ClassifierMixin from sklearn.linear_model import LogisticRegression as sk_LogisticRegression - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class LogisticRegressionClassifier(Classifier): @@ -27,7 +27,7 @@ def __init__(self) -> None: self._feature_names: list[str] | None = None self._target_name: str | None = None - def fit(self, training_set: TabularDataset) -> LogisticRegressionClassifier: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> LogisticRegressionClassifier: """ Create a copy of this classifier and fit it with the given training data. @@ -66,7 +66,7 @@ def fit(self, training_set: TabularDataset) -> LogisticRegressionClassifier: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/classification/_random_forest.py b/src/safeds/ml/classical/classification/_random_forest.py index 567106c3d..ed5bb2681 100644 --- a/src/safeds/ml/classical/classification/_random_forest.py +++ b/src/safeds/ml/classical/classification/_random_forest.py @@ -12,8 +12,8 @@ from sklearn.base import ClassifierMixin from sklearn.ensemble import RandomForestClassifier as sk_RandomForestClassifier - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class RandomForestClassifier(Classifier): @@ -93,7 +93,7 @@ def minimum_number_of_samples_in_leaves(self) -> int: """The minimum number of samples that must remain in the leaves of each tree.""" return self._minimum_number_of_samples_in_leaves - def fit(self, training_set: TabularDataset) -> RandomForestClassifier: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> RandomForestClassifier: """ Create a copy of this classifier and fit it with the given training data. @@ -136,7 +136,7 @@ def fit(self, training_set: TabularDataset) -> RandomForestClassifier: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/classification/_support_vector_machine.py b/src/safeds/ml/classical/classification/_support_vector_machine.py index 6890ebfd7..33499d782 100644 --- a/src/safeds/ml/classical/classification/_support_vector_machine.py +++ b/src/safeds/ml/classical/classification/_support_vector_machine.py @@ -13,15 +13,15 @@ from sklearn.base import ClassifierMixin from sklearn.svm import SVC as sk_SVC # noqa: N811 - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class SupportVectorMachineKernel(ABC): """The abstract base class of the different subclasses supported by the `Kernel`.""" @abstractmethod - def _get_sklearn_arguments(self) -> dict[str, Any]: + def _get_sklearn_arguments(self) -> dict[str, Any]: # TODO: use apply pattern (imputer strategy) instead """Return the arguments to pass to scikit-learn.""" @abstractmethod @@ -188,7 +188,7 @@ def __eq__(self, other: object) -> bool: __hash__ = SupportVectorMachineKernel.__hash__ - def fit(self, training_set: TabularDataset) -> SupportVectorMachineClassifier: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> SupportVectorMachineClassifier: """ Create a copy of this classifier and fit it with the given training data. @@ -227,7 +227,7 @@ def fit(self, training_set: TabularDataset) -> SupportVectorMachineClassifier: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_ada_boost.py b/src/safeds/ml/classical/regression/_ada_boost.py index dd27e266d..3b85df127 100644 --- a/src/safeds/ml/classical/regression/_ada_boost.py +++ b/src/safeds/ml/classical/regression/_ada_boost.py @@ -12,8 +12,8 @@ from sklearn.base import RegressorMixin from sklearn.ensemble import AdaBoostRegressor as sk_AdaBoostRegressor - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class AdaBoostRegressor(Regressor): @@ -109,7 +109,7 @@ def learning_rate(self) -> float: """ return self._learning_rate - def fit(self, training_set: TabularDataset) -> AdaBoostRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> AdaBoostRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -152,7 +152,7 @@ def fit(self, training_set: TabularDataset) -> AdaBoostRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_decision_tree.py b/src/safeds/ml/classical/regression/_decision_tree.py index d8a066973..33c40d1e6 100644 --- a/src/safeds/ml/classical/regression/_decision_tree.py +++ b/src/safeds/ml/classical/regression/_decision_tree.py @@ -12,8 +12,8 @@ from sklearn.base import RegressorMixin from sklearn.tree import DecisionTreeRegressor as sk_DecisionTreeRegressor - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class DecisionTreeRegressor(Regressor): @@ -77,7 +77,7 @@ def minimum_number_of_samples_in_leaves(self) -> int: """The minimum number of samples that must remain in the leaves of the tree.""" return self._minimum_number_of_samples_in_leaves - def fit(self, training_set: TabularDataset) -> DecisionTreeRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> DecisionTreeRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -119,7 +119,7 @@ def fit(self, training_set: TabularDataset) -> DecisionTreeRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_elastic_net_regression.py b/src/safeds/ml/classical/regression/_elastic_net_regression.py index 125f49e7a..45b3069f4 100644 --- a/src/safeds/ml/classical/regression/_elastic_net_regression.py +++ b/src/safeds/ml/classical/regression/_elastic_net_regression.py @@ -14,8 +14,8 @@ from sklearn.base import RegressorMixin from sklearn.linear_model import ElasticNet as sk_ElasticNet - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class ElasticNetRegressor(Regressor): @@ -114,7 +114,7 @@ def lasso_ratio(self) -> float: """ return self._lasso_ratio - def fit(self, training_set: TabularDataset) -> ElasticNetRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> ElasticNetRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -153,7 +153,7 @@ def fit(self, training_set: TabularDataset) -> ElasticNetRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_gradient_boosting.py b/src/safeds/ml/classical/regression/_gradient_boosting.py index 34ec419ab..4cf46bc97 100644 --- a/src/safeds/ml/classical/regression/_gradient_boosting.py +++ b/src/safeds/ml/classical/regression/_gradient_boosting.py @@ -12,8 +12,8 @@ from sklearn.base import RegressorMixin from sklearn.ensemble import GradientBoostingRegressor as sk_GradientBoostingRegressor - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class GradientBoostingRegressor(Regressor): @@ -84,7 +84,7 @@ def learning_rate(self) -> float: """ return self._learning_rate - def fit(self, training_set: TabularDataset) -> GradientBoostingRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> GradientBoostingRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -123,7 +123,7 @@ def fit(self, training_set: TabularDataset) -> GradientBoostingRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_k_nearest_neighbors.py b/src/safeds/ml/classical/regression/_k_nearest_neighbors.py index 8a96b3a62..aa6198de3 100644 --- a/src/safeds/ml/classical/regression/_k_nearest_neighbors.py +++ b/src/safeds/ml/classical/regression/_k_nearest_neighbors.py @@ -3,8 +3,8 @@ from typing import TYPE_CHECKING from safeds._utils import _structural_hash -from safeds.data.labeled.containers import TabularDataset -from safeds.data.tabular.containers import Table +from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset +from safeds.data.tabular.containers import ExperimentalTable, Table from safeds.exceptions import ClosedBound, DatasetMissesDataError, OutOfBoundsError, PlainTableError from safeds.ml.classical._util_sklearn import fit, predict @@ -64,7 +64,7 @@ def number_of_neighbors(self) -> int: """ return self._number_of_neighbors - def fit(self, training_set: TabularDataset) -> KNearestNeighborsRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> KNearestNeighborsRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -118,7 +118,7 @@ def fit(self, training_set: TabularDataset) -> KNearestNeighborsRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_lasso_regression.py b/src/safeds/ml/classical/regression/_lasso_regression.py index e912c4aba..2a74cc244 100644 --- a/src/safeds/ml/classical/regression/_lasso_regression.py +++ b/src/safeds/ml/classical/regression/_lasso_regression.py @@ -13,8 +13,8 @@ from sklearn.base import RegressorMixin from sklearn.linear_model import Lasso as sk_Lasso - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class LassoRegressor(Regressor): @@ -68,7 +68,7 @@ def alpha(self) -> float: """ return self._alpha - def fit(self, training_set: TabularDataset) -> LassoRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> LassoRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -107,7 +107,7 @@ def fit(self, training_set: TabularDataset) -> LassoRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_linear_regression.py b/src/safeds/ml/classical/regression/_linear_regression.py index 101fec7a5..513f44ad7 100644 --- a/src/safeds/ml/classical/regression/_linear_regression.py +++ b/src/safeds/ml/classical/regression/_linear_regression.py @@ -11,8 +11,8 @@ from sklearn.base import RegressorMixin from sklearn.linear_model import LinearRegression as sk_LinearRegression - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class LinearRegressionRegressor(Regressor): @@ -27,7 +27,7 @@ def __init__(self) -> None: self._feature_names: list[str] | None = None self._target_name: str | None = None - def fit(self, training_set: TabularDataset) -> LinearRegressionRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> LinearRegressionRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -66,7 +66,7 @@ def fit(self, training_set: TabularDataset) -> LinearRegressionRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_random_forest.py b/src/safeds/ml/classical/regression/_random_forest.py index 1d807d3b9..2d4a8ad98 100644 --- a/src/safeds/ml/classical/regression/_random_forest.py +++ b/src/safeds/ml/classical/regression/_random_forest.py @@ -12,8 +12,8 @@ from sklearn.base import RegressorMixin from sklearn.ensemble import RandomForestRegressor as sk_RandomForestRegressor - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class RandomForestRegressor(Regressor): @@ -93,7 +93,7 @@ def minimum_number_of_samples_in_leaves(self) -> int: """The minimum number of samples that must remain in the leaves of each tree.""" return self._minimum_number_of_samples_in_leaves - def fit(self, training_set: TabularDataset) -> RandomForestRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> RandomForestRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -136,7 +136,7 @@ def fit(self, training_set: TabularDataset) -> RandomForestRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_regressor.py b/src/safeds/ml/classical/regression/_regressor.py index 1779bbb0e..d1ac75c2c 100644 --- a/src/safeds/ml/classical/regression/_regressor.py +++ b/src/safeds/ml/classical/regression/_regressor.py @@ -4,8 +4,8 @@ from typing import TYPE_CHECKING from safeds._utils import _structural_hash -from safeds.data.labeled.containers import TabularDataset -from safeds.data.tabular.containers import Column, Table +from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset +from safeds.data.tabular.containers import Column, ExperimentalColumn, ExperimentalTable, Table from safeds.exceptions import ColumnLengthMismatchError, PlainTableError if TYPE_CHECKING: @@ -27,7 +27,7 @@ def __hash__(self) -> int: return _structural_hash(self.__class__.__qualname__, self.is_fitted) @abstractmethod - def fit(self, training_set: TabularDataset) -> Regressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> Regressor: """ Create a copy of this regressor and fit it with the given training data. @@ -50,7 +50,7 @@ def fit(self, training_set: TabularDataset) -> Regressor: """ @abstractmethod - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. @@ -94,7 +94,7 @@ def _get_sklearn_regressor(self) -> RegressorMixin: # Metrics # ------------------------------------------------------------------------------------------------------------------ - def summarize_metrics(self, validation_or_test_set: TabularDataset) -> Table: + def summarize_metrics(self, validation_or_test_set: TabularDataset | ExperimentalTabularDataset) -> Table: """ Summarize the regressor's metrics on the given data. 
@@ -123,7 +123,7 @@ def summarize_metrics(self, validation_or_test_set: TabularDataset) -> Table: }, ) - def mean_absolute_error(self, validation_or_test_set: TabularDataset) -> float: + def mean_absolute_error(self, validation_or_test_set: TabularDataset | ExperimentalTabularDataset) -> float: """ Compute the mean absolute error (MAE) of the regressor on the given data. @@ -146,14 +146,24 @@ def mean_absolute_error(self, validation_or_test_set: TabularDataset) -> float: if not isinstance(validation_or_test_set, TabularDataset) and isinstance(validation_or_test_set, Table): raise PlainTableError - expected = validation_or_test_set.target - predicted = self.predict(validation_or_test_set.features).target - _check_metrics_preconditions(predicted, expected) - return sk_mean_absolute_error(expected._data, predicted._data) + if isinstance(validation_or_test_set, TabularDataset): + expected = validation_or_test_set.target + predicted = self.predict(validation_or_test_set.features).target + + # TODO: more efficient implementation using polars + _check_metrics_preconditions(predicted, expected) + return sk_mean_absolute_error(expected._data, predicted._data) + elif isinstance(validation_or_test_set, ExperimentalTabularDataset): # pragma: no cover + expected_2 = validation_or_test_set.target + predicted_2 = self.predict(validation_or_test_set.features).target + + # TODO: more efficient implementation using polars + _check_metrics_preconditions_experimental(predicted_2, expected_2) + return sk_mean_absolute_error(expected_2._series, predicted_2._data) # noinspection PyProtectedMember - def mean_squared_error(self, validation_or_test_set: TabularDataset) -> float: + def mean_squared_error(self, validation_or_test_set: TabularDataset | ExperimentalTabularDataset) -> float: """ Compute the mean squared error (MSE) on the given data. 
@@ -176,14 +186,23 @@ def mean_squared_error(self, validation_or_test_set: TabularDataset) -> float: if not isinstance(validation_or_test_set, TabularDataset) and isinstance(validation_or_test_set, Table): raise PlainTableError - expected = validation_or_test_set.target - predicted = self.predict(validation_or_test_set.features).target - _check_metrics_preconditions(predicted, expected) - return sk_mean_squared_error(expected._data, predicted._data) + if isinstance(validation_or_test_set, TabularDataset): + expected = validation_or_test_set.target + predicted = self.predict(validation_or_test_set.features).target + + # TODO: more efficient implementation using polars + _check_metrics_preconditions(predicted, expected) + return sk_mean_squared_error(expected._data, predicted._data) + elif isinstance(validation_or_test_set, ExperimentalTabularDataset): # pragma: no cover + expected_2 = validation_or_test_set.target + predicted_2 = self.predict(validation_or_test_set.features).target + + # TODO: more efficient implementation using polars + _check_metrics_preconditions_experimental(predicted_2, expected_2) + return sk_mean_squared_error(expected_2._series, predicted_2._data) -# noinspection PyProtectedMember def _check_metrics_preconditions(actual: Column, expected: Column) -> None: if not actual.type.is_numeric(): raise TypeError(f"Column 'actual' is not numerical but {actual.type}.") @@ -194,3 +213,20 @@ def _check_metrics_preconditions(actual: Column, expected: Column) -> None: raise ColumnLengthMismatchError( "\n".join([f"{column.name}: {column._data.size}" for column in [actual, expected]]), ) + + +def _check_metrics_preconditions_experimental(actual: Column, expected: ExperimentalColumn) -> None: # pragma: no cover + if not actual.type.is_numeric(): + raise TypeError(f"Column 'actual' is not numerical but {actual.type}.") + if not expected.type.is_numeric: + raise TypeError(f"Column 'expected' is not numerical but {expected.type}.") + + if actual.number_of_rows 
!= expected.number_of_rows: + raise ColumnLengthMismatchError( + "\n".join( + [ + f"{actual.name}: {actual.number_of_rows}", + f"{expected.name}: {expected.number_of_rows}", + ], + ), + ) diff --git a/src/safeds/ml/classical/regression/_ridge_regression.py b/src/safeds/ml/classical/regression/_ridge_regression.py index de1d5cfc1..9a9b8f706 100644 --- a/src/safeds/ml/classical/regression/_ridge_regression.py +++ b/src/safeds/ml/classical/regression/_ridge_regression.py @@ -13,8 +13,8 @@ from sklearn.base import RegressorMixin from sklearn.linear_model import Ridge as sk_Ridge - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class RidgeRegressor(Regressor): @@ -69,7 +69,7 @@ def alpha(self) -> float: """ return self._alpha - def fit(self, training_set: TabularDataset) -> RidgeRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> RidgeRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -108,7 +108,7 @@ def fit(self, training_set: TabularDataset) -> RidgeRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
diff --git a/src/safeds/ml/classical/regression/_support_vector_machine.py b/src/safeds/ml/classical/regression/_support_vector_machine.py index c1425530c..56627e978 100644 --- a/src/safeds/ml/classical/regression/_support_vector_machine.py +++ b/src/safeds/ml/classical/regression/_support_vector_machine.py @@ -13,15 +13,15 @@ from sklearn.base import RegressorMixin from sklearn.svm import SVC as sk_SVR # noqa: N811 - from safeds.data.labeled.containers import TabularDataset - from safeds.data.tabular.containers import Table + from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset + from safeds.data.tabular.containers import ExperimentalTable, Table class SupportVectorMachineKernel(ABC): """The abstract base class of the different subclasses supported by the `Kernel`.""" @abstractmethod - def _get_sklearn_arguments(self) -> dict[str, Any]: + def _get_sklearn_arguments(self) -> dict[str, Any]: # TODO: use apply pattern (imputer strategy) instead """Return the arguments to pass to scikit-learn.""" @abstractmethod @@ -188,7 +188,7 @@ def __eq__(self, other: object) -> bool: __hash__ = SupportVectorMachineKernel.__hash__ - def fit(self, training_set: TabularDataset) -> SupportVectorMachineRegressor: + def fit(self, training_set: TabularDataset | ExperimentalTabularDataset) -> SupportVectorMachineRegressor: """ Create a copy of this regressor and fit it with the given training data. @@ -227,7 +227,7 @@ def fit(self, training_set: TabularDataset) -> SupportVectorMachineRegressor: return result - def predict(self, dataset: Table) -> TabularDataset: + def predict(self, dataset: Table | ExperimentalTable | ExperimentalTabularDataset) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. 
From a6121981a640f19d4087e5d9e04c989933518a75 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 16:04:19 +0200 Subject: [PATCH 11/40] feat: experimental table transformers (just buggy copies) --- .../data/tabular/transformation/__init__.py | 23 + .../_experimental_discretizer.py | 211 +++++++++ .../transformation/_experimental_imputer.py | 390 +++++++++++++++++ .../_experimental_label_encoder.py | 236 ++++++++++ .../_experimental_one_hot_encoder.py | 412 ++++++++++++++++++ .../_experimental_range_scaler.py | 285 ++++++++++++ .../_experimental_standard_scaler.py | 267 ++++++++++++ .../_experimental_table_transformer.py | 172 ++++++++ 8 files changed, 1996 insertions(+) create mode 100644 src/safeds/data/tabular/transformation/_experimental_discretizer.py create mode 100644 src/safeds/data/tabular/transformation/_experimental_imputer.py create mode 100644 src/safeds/data/tabular/transformation/_experimental_label_encoder.py create mode 100644 src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py create mode 100644 src/safeds/data/tabular/transformation/_experimental_range_scaler.py create mode 100644 src/safeds/data/tabular/transformation/_experimental_standard_scaler.py create mode 100644 src/safeds/data/tabular/transformation/_experimental_table_transformer.py diff --git a/src/safeds/data/tabular/transformation/__init__.py b/src/safeds/data/tabular/transformation/__init__.py index 3ae5fc572..682ad4513 100644 --- a/src/safeds/data/tabular/transformation/__init__.py +++ b/src/safeds/data/tabular/transformation/__init__.py @@ -6,6 +6,13 @@ if TYPE_CHECKING: from ._discretizer import Discretizer + from ._experimental_discretizer import ExperimentalDiscretizer + from ._experimental_imputer import ExperimentalImputer + from ._experimental_label_encoder import ExperimentalLabelEncoder + from ._experimental_one_hot_encoder import ExperimentalOneHotEncoder + from ._experimental_range_scaler import ExperimentalRangeScaler + from 
._experimental_standard_scaler import ExperimentalStandardScaler + from ._experimental_table_transformer import ExperimentalInvertibleTableTransformer, ExperimentalTableTransformer from ._imputer import Imputer from ._label_encoder import LabelEncoder from ._one_hot_encoder import OneHotEncoder @@ -17,6 +24,14 @@ __name__, { "Discretizer": "._discretizer:Discretizer", + "ExperimentalDiscretizer": "._experimental_discretizer:ExperimentalDiscretizer", + "ExperimentalImputer": "._experimental_imputer:ExperimentalImputer", + "ExperimentalLabelEncoder": "._experimental_label_encoder:ExperimentalLabelEncoder", + "ExperimentalOneHotEncoder": "._experimental_one_hot_encoder:Experimental", + "ExperimentalRangeScaler": "._experimental_range_scaler:ExperimentalRangeScaler", + "ExperimentalStandardScaler": "._experimental_standard_scaler:ExperimentalStandardScaler", + "ExperimentalTableTransformer": "._experimental_table_transformer:ExperimentalTableTransformer", + "ExperimentalInvertibleTableTransformer": "._experimental_table_transformer:ExperimentalInvertibleTableTransformer", "Imputer": "._imputer:Imputer", "InvertibleTableTransformer": "._table_transformer:InvertibleTableTransformer", "LabelEncoder": "._label_encoder:LabelEncoder", @@ -29,6 +44,14 @@ __all__ = [ "Discretizer", + "ExperimentalDiscretizer", + "ExperimentalImputer", + "ExperimentalLabelEncoder", + "ExperimentalOneHotEncoder", + "ExperimentalRangeScaler", + "ExperimentalStandardScaler", + "ExperimentalTableTransformer", + "ExperimentalInvertibleTableTransformer", "Imputer", "InvertibleTableTransformer", "LabelEncoder", diff --git a/src/safeds/data/tabular/transformation/_experimental_discretizer.py b/src/safeds/data/tabular/transformation/_experimental_discretizer.py new file mode 100644 index 000000000..75f1c1cd1 --- /dev/null +++ b/src/safeds/data/tabular/transformation/_experimental_discretizer.py @@ -0,0 +1,211 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from 
safeds.data.tabular.containers import ExperimentalTable, Table +from safeds.exceptions import ( + ClosedBound, + NonNumericColumnError, + OutOfBoundsError, + TransformerNotFittedError, + UnknownColumnNameError, +) + +from ._experimental_table_transformer import ExperimentalTableTransformer + +if TYPE_CHECKING: + from sklearn.preprocessing import KBinsDiscretizer as sk_KBinsDiscretizer + + +class ExperimentalDiscretizer(ExperimentalTableTransformer): + """ + The Discretizer bins continuous data into intervals. + + Parameters + ---------- + number_of_bins: + The number of bins to be created. + + Raises + ------ + OutOfBoundsError + If the given number_of_bins is less than 2. + """ + + def __init__(self, number_of_bins: int = 5): + self._column_names: list[str] | None = None + self._wrapped_transformer: sk_KBinsDiscretizer | None = None + + if number_of_bins < 2: + raise OutOfBoundsError(number_of_bins, name="number_of_bins", lower_bound=ClosedBound(2)) + self._number_of_bins = number_of_bins + + def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalDiscretizer: + """ + Learn a transformation for a set of columns in a table. + + This transformer is not modified. + + Parameters + ---------- + table: + The table used to fit the transformer. + column_names: + The list of columns from the table used to fit the transformer. If `None`, all columns are used. + + Returns + ------- + fitted_transformer: + The fitted transformer. + + Raises + ------ + ValueError + If the table is empty. + NonNumericColumnError + If one of the columns, that should be fitted is non-numeric. + UnknownColumnNameError + If one of the columns, that should be fitted is not in the table. 
+ """ + from sklearn.preprocessing import KBinsDiscretizer as sk_KBinsDiscretizer + + if table.number_of_rows == 0: + raise ValueError("The Discretizer cannot be fitted because the table contains 0 rows") + + if column_names is None: + column_names = table.column_names + else: + missing_columns = set(column_names) - set(table.column_names) + if len(missing_columns) > 0: + raise UnknownColumnNameError( + sorted( + missing_columns, + key={val: ix for ix, val in enumerate(column_names)}.__getitem__, + ), + ) + + for column in column_names: + if not table.get_column(column).type.is_numeric(): + raise NonNumericColumnError(f"{column} is of type {table.get_column(column).type}.") + + wrapped_transformer = sk_KBinsDiscretizer(n_bins=self._number_of_bins, encode="ordinal") + wrapped_transformer.fit(table._data[column_names]) + + result = ExperimentalDiscretizer(self._number_of_bins) + result._wrapped_transformer = wrapped_transformer + result._column_names = column_names + + return result + + def transform(self, table: ExperimentalTable) -> ExperimentalTable: + """ + Apply the learned transformation to a table. + + The table is not modified. + + Parameters + ---------- + table: + The table to which the learned transformation is applied. + + Returns + ------- + transformed_table: + The transformed table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + ValueError + If the table is empty. + UnknownColumnNameError + If one of the columns, that should be transformed is not in the table. + NonNumericColumnError + If one of the columns, that should be fitted is non-numeric. 
+ """ + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise TransformerNotFittedError + + if table.number_of_rows == 0: + raise ValueError("The table cannot be transformed because it contains 0 rows") + + # Input table does not contain all columns used to fit the transformer + missing_columns = set(self._column_names) - set(table.column_names) + if len(missing_columns) > 0: + raise UnknownColumnNameError( + sorted( + missing_columns, + key={val: ix for ix, val in enumerate(self._column_names)}.__getitem__, + ), + ) + + for column in self._column_names: + if not table.get_column(column).type.is_numeric(): + raise NonNumericColumnError(f"{column} is of type {table.get_column(column).type}.") + + data = table._data.reset_index(drop=True) + data.columns = table.column_names + data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names]) + return Table._from_pandas_dataframe(data) + + @property + def is_fitted(self) -> bool: + """Whether the transformer is fitted.""" + return self._wrapped_transformer is not None + + def get_names_of_added_columns(self) -> list[str]: + """ + Get the names of all new columns that have been added by the Discretizer. + + Returns + ------- + added_columns: + A list of names of the added columns, ordered as they will appear in the table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] + + # (Must implement abstract method, cannot instantiate class otherwise.) + def get_names_of_changed_columns(self) -> list[str]: + """ + Get the names of all columns that may have been changed by the Discretizer. + + Returns + ------- + changed_columns: + The list of (potentially) changed column names, as passed to fit. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. 
+ """ + if self._column_names is None: + raise TransformerNotFittedError + return self._column_names + + def get_names_of_removed_columns(self) -> list[str]: + """ + Get the names of all columns that have been removed by the Discretizer. + + Returns + ------- + removed_columns: + A list of names of the removed columns, ordered as they appear in the table the Discretizer was fitted on. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] diff --git a/src/safeds/data/tabular/transformation/_experimental_imputer.py b/src/safeds/data/tabular/transformation/_experimental_imputer.py new file mode 100644 index 000000000..658842e9d --- /dev/null +++ b/src/safeds/data/tabular/transformation/_experimental_imputer.py @@ -0,0 +1,390 @@ +from __future__ import annotations + +import sys +import warnings +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any + +import pandas as pd + +from safeds._utils import _structural_hash +from safeds.data.tabular.containers import ExperimentalTable, Table +from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError + +from ._experimental_table_transformer import ExperimentalTableTransformer + +if TYPE_CHECKING: + from sklearn.impute import SimpleImputer as sk_SimpleImputer + + +class ExperimentalImputer(ExperimentalTableTransformer): + """ + Replace missing values using the given strategy. + + Parameters + ---------- + strategy: + How to replace missing values. + value_to_replace: + The value that should be replaced. + + Examples + -------- + >>> from safeds.data.tabular.containers import Column, Table + >>> from safeds.data.tabular.transformation import Imputer + >>> + >>> table = Table.from_columns( + ... [ + ... Column("a", [1, 3, None]), + ... Column("b", [None, 2, 3]), + ... ], + ... 
) + >>> transformer = Imputer(Imputer.Strategy.Constant(0)) + >>> transformed_table = transformer.fit_and_transform(table) + """ + + class Strategy(ABC): + """Various strategies to replace missing values. Use the static methods to create instances of this class.""" + + @abstractmethod + def __eq__(self, other: object) -> bool: + pass # pragma: no cover + + @abstractmethod + def __hash__(self) -> int: + pass # pragma: no cover + + @abstractmethod + def _apply(self, imputer: sk_SimpleImputer) -> None: + """ + Set the imputer strategy of the given imputer. + + Parameters + ---------- + imputer: + The imputer to augment. + """ + + @staticmethod + def Constant(value: Any) -> ExperimentalImputer.Strategy: # noqa: N802 + """ + Replace missing values with the given constant value. + + Parameters + ---------- + value: + The value to replace missing values. + """ + return _Constant(value) # pragma: no cover + + @staticmethod + def Mean() -> ExperimentalImputer.Strategy: # noqa: N802 + """Replace missing values with the mean of each column.""" + return _Mean() # pragma: no cover + + @staticmethod + def Median() -> ExperimentalImputer.Strategy: # noqa: N802 + """Replace missing values with the median of each column.""" + return _Median() # pragma: no cover + + @staticmethod + def Mode() -> ExperimentalImputer.Strategy: # noqa: N802 + """Replace missing values with the mode of each column.""" + return _Mode() # pragma: no cover + + def __init__(self, strategy: ExperimentalImputer.Strategy, *, value_to_replace: float | str | None = None): + if value_to_replace is None: + value_to_replace = pd.NA + + self._strategy = strategy + self._value_to_replace = value_to_replace + + self._wrapped_transformer: sk_SimpleImputer | None = None + self._column_names: list[str] | None = None + + @property + def strategy(self) -> ExperimentalImputer.Strategy: + """The strategy used to replace missing values.""" + return self._strategy + + @property + def value_to_replace(self) -> Any: + """The 
value that should be replaced.""" + return self._value_to_replace + + @property + def is_fitted(self) -> bool: + """Whether the transformer is fitted.""" + return self._wrapped_transformer is not None + + def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalImputer: + """ + Learn a transformation for a set of columns in a table. + + This transformer is not modified. + + Parameters + ---------- + table: + The table used to fit the transformer. + column_names: + The list of columns from the table used to fit the transformer. If `None`, all columns are used. + + Returns + ------- + fitted_transformer: + The fitted transformer. + + Raises + ------ + UnknownColumnNameError + If column_names contain a column name that is missing in the table + ValueError + If the table contains 0 rows + NonNumericColumnError + If the strategy is set to either Mean or Median and the specified columns of the table contain non-numerical data. + """ + from sklearn.impute import SimpleImputer as sk_SimpleImputer + + if column_names is None: + column_names = table.column_names + else: + missing_columns = sorted(set(column_names) - set(table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if table.number_of_rows == 0: + raise ValueError("The Imputer cannot be fitted because the table contains 0 rows") + + if (isinstance(self._strategy, _Mean | _Median)) and table.keep_only_columns( + column_names, + ).remove_columns_with_non_numerical_values().number_of_columns < len( + column_names, + ): + raise NonNumericColumnError( + str( + sorted( + set(table.keep_only_columns(column_names).column_names) + - set( + table.keep_only_columns(column_names) + .remove_columns_with_non_numerical_values() + .column_names, + ), + ), + ), + ) + + if isinstance(self._strategy, _Mode): + multiple_most_frequent = {} + for name in column_names: + if len(table.get_column(name).mode()) > 1: + multiple_most_frequent[name] = 
table.get_column(name).mode() + if len(multiple_most_frequent) > 0: + warnings.warn( + "There are multiple most frequent values in a column given to the Imputer.\nThe lowest values" + " are being chosen in this cases. The following columns have multiple most frequent" + f" values:\n{multiple_most_frequent}", + UserWarning, + stacklevel=2, + ) + + wrapped_transformer = sk_SimpleImputer() + self._strategy._apply(wrapped_transformer) + wrapped_transformer.missing_values = self._value_to_replace + wrapped_transformer.fit(table._data[column_names]) + + result = ExperimentalImputer(self._strategy) + result._wrapped_transformer = wrapped_transformer + result._column_names = column_names + + return result + + def transform(self, table: ExperimentalTable) -> ExperimentalTable: + """ + Apply the learned transformation to a table. + + The table is not modified. + + Parameters + ---------- + table: + The table to which the learned transformation is applied. + + Returns + ------- + transformed_table: + The transformed table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + UnknownColumnNameError + If the input table does not contain all columns used to fit the transformer. + ValueError + If the table contains 0 rows. 
+ """ + import pandas as pd + + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise TransformerNotFittedError + + # Input table does not contain all columns used to fit the transformer + missing_columns = sorted(set(self._column_names) - set(table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if table.number_of_rows == 0: + raise ValueError("The Imputer cannot transform the table because it contains 0 rows") + + data = table._data.reset_index(drop=True) + data[self._column_names] = pd.DataFrame( + self._wrapped_transformer.transform(data[self._column_names]), + columns=self._column_names, + ) + return Table._from_pandas_dataframe(data, table.schema) + + def get_names_of_added_columns(self) -> list[str]: + """ + Get the names of all new columns that have been added by the Imputer. + + Returns + ------- + added_columns: + A list of names of the added columns, ordered as they will appear in the table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] + + def get_names_of_changed_columns(self) -> list[str]: + """ + Get the names of all columns that may have been changed by the Imputer. + + Returns + ------- + changed_columns: + The list of (potentially) changed column names, as passed to fit. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if self._column_names is None: + raise TransformerNotFittedError + return self._column_names + + def get_names_of_removed_columns(self) -> list[str]: + """ + Get the names of all columns that have been removed by the Imputer. + + Returns + ------- + removed_columns: + A list of names of the removed columns, ordered as they appear in the table the Imputer was fitted on. 
+ + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] + + +# ---------------------------------------------------------------------------------------------------------------------- +# Imputation strategies +# ---------------------------------------------------------------------------------------------------------------------- + + +class _Constant(ExperimentalImputer.Strategy): + def __init__(self, value: Any): + self._value = value + + @property + def value(self) -> Any: + return self._value + + def __eq__(self, other: object) -> bool: + if not isinstance(other, _Constant): + return NotImplemented + if self is other: + return True + return self._value == other._value + + def __hash__(self) -> int: + return _structural_hash(str(self)) + + def __sizeof__(self) -> int: + return sys.getsizeof(self._value) + + def __str__(self) -> str: + return f"Constant({self._value})" + + def _apply(self, imputer: sk_SimpleImputer) -> None: + imputer.strategy = "constant" + imputer.fill_value = self._value + + +class _Mean(ExperimentalImputer.Strategy): + def __eq__(self, other: object) -> bool: + if not isinstance(other, _Mean): + return NotImplemented + return True + + def __hash__(self) -> int: + return _structural_hash(str(self)) + + def __str__(self) -> str: + return "Mean" + + def _apply(self, imputer: sk_SimpleImputer) -> None: + imputer.strategy = "mean" + + +class _Median(ExperimentalImputer.Strategy): + def __eq__(self, other: object) -> bool: + if not isinstance(other, _Median): + return NotImplemented + return True + + def __hash__(self) -> int: + return _structural_hash(str(self)) + + def __str__(self) -> str: + return "Median" + + def _apply(self, imputer: sk_SimpleImputer) -> None: + imputer.strategy = "median" + + +class _Mode(ExperimentalImputer.Strategy): + def __eq__(self, other: object) -> bool: + if not isinstance(other, _Mode): + return 
NotImplemented + return True + + def __hash__(self) -> int: + return _structural_hash(str(self)) + + def __str__(self) -> str: + return "Mode" + + def _apply(self, imputer: sk_SimpleImputer) -> None: + imputer.strategy = "most_frequent" + + +# Override the methods with classes, so they can be used in `isinstance` calls. Unlike methods, classes define a type. +# This is needed for the DSL, where imputer strategies are variants of an enum. +ExperimentalImputer.Strategy.Constant = _Constant # type: ignore[method-assign] +ExperimentalImputer.Strategy.Mean = _Mean # type: ignore[method-assign] +ExperimentalImputer.Strategy.Median = _Median # type: ignore[method-assign] +ExperimentalImputer.Strategy.Mode = _Mode # type: ignore[method-assign] diff --git a/src/safeds/data/tabular/transformation/_experimental_label_encoder.py b/src/safeds/data/tabular/transformation/_experimental_label_encoder.py new file mode 100644 index 000000000..8b5e11d0b --- /dev/null +++ b/src/safeds/data/tabular/transformation/_experimental_label_encoder.py @@ -0,0 +1,236 @@ +from __future__ import annotations + +import warnings +from typing import TYPE_CHECKING + +from safeds.data.tabular.containers import ExperimentalTable, Table +from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError + +from ._experimental_table_transformer import ExperimentalInvertibleTableTransformer + +if TYPE_CHECKING: + from sklearn.preprocessing import OrdinalEncoder as sk_OrdinalEncoder + + +class ExperimentalLabelEncoder(ExperimentalInvertibleTableTransformer): + """The LabelEncoder encodes one or more given columns into labels.""" + + def __init__(self) -> None: + self._wrapped_transformer: sk_OrdinalEncoder | None = None + self._column_names: list[str] | None = None + + def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalLabelEncoder: + """ + Learn a transformation for a set of columns in a table. + + This transformer is not modified. 
+ + Parameters + ---------- + table: + The table used to fit the transformer. + column_names: + The list of columns from the table used to fit the transformer. If `None`, all columns are used. + + Returns + ------- + fitted_transformer: + The fitted transformer. + + Raises + ------ + UnknownColumnNameError + If column_names contain a column name that is missing in the table. + ValueError + If the table contains 0 rows. + """ + from sklearn.preprocessing import OrdinalEncoder as sk_OrdinalEncoder + + if column_names is None: + column_names = table.column_names + else: + missing_columns = sorted(set(column_names) - set(table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if table.number_of_rows == 0: + raise ValueError("The LabelEncoder cannot transform the table because it contains 0 rows") + + if table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().number_of_columns > 0: + warnings.warn( + "The columns" + f" {table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().column_names} contain" + " numerical data. The LabelEncoder is designed to encode non-numerical values into numerical values", + UserWarning, + stacklevel=2, + ) + + wrapped_transformer = sk_OrdinalEncoder() + wrapped_transformer.fit(table._data[column_names]) + + result = ExperimentalLabelEncoder() + result._wrapped_transformer = wrapped_transformer + result._column_names = column_names + + return result + + def transform(self, table: ExperimentalTable) -> ExperimentalTable: + """ + Apply the learned transformation to a table. + + The table is not modified. + + Parameters + ---------- + table: + The table to which the learned transformation is applied. + + Returns + ------- + transformed_table: + The transformed table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. 
+ UnknownColumnNameError + If the input table does not contain all columns used to fit the transformer. + ValueError + If the table contains 0 rows. + """ + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise TransformerNotFittedError + + # Input table does not contain all columns used to fit the transformer + missing_columns = sorted(set(self._column_names) - set(table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if table.number_of_rows == 0: + raise ValueError("The LabelEncoder cannot transform the table because it contains 0 rows") + + data = table._data.reset_index(drop=True) + data.columns = table.column_names + data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names]) + return Table._from_pandas_dataframe(data) + + def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: + """ + Undo the learned transformation. + + The table is not modified. + + Parameters + ---------- + transformed_table: + The table to be transformed back to the original version. + + Returns + ------- + table: + The original table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + UnknownColumnNameError + If the input table does not contain all columns used to fit the transformer. + NonNumericColumnError + If the specified columns of the input table contain non-numerical data. + ValueError + If the table contains 0 rows. 
+ """ + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise TransformerNotFittedError + + missing_columns = sorted(set(self._column_names) - set(transformed_table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if transformed_table.number_of_rows == 0: + raise ValueError("The LabelEncoder cannot inverse transform the table because it contains 0 rows") + + if transformed_table.keep_only_columns( + self._column_names, + ).remove_columns_with_non_numerical_values().number_of_columns < len(self._column_names): + raise NonNumericColumnError( + str( + sorted( + set(self._column_names) + - set( + transformed_table.keep_only_columns(self._column_names) + .remove_columns_with_non_numerical_values() + .column_names, + ), + ), + ), + ) + + data = transformed_table._data.reset_index(drop=True) + data.columns = transformed_table.column_names + data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names]) + return Table._from_pandas_dataframe(data) + + @property + def is_fitted(self) -> bool: + """Whether the transformer is fitted.""" + return self._wrapped_transformer is not None + + def get_names_of_added_columns(self) -> list[str]: + """ + Get the names of all new columns that have been added by the LabelEncoder. + + Returns + ------- + added_columns: + A list of names of the added columns, ordered as they will appear in the table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] + + # (Must implement abstract method, cannot instantiate class otherwise.) + def get_names_of_changed_columns(self) -> list[str]: + """ + Get the names of all columns that may have been changed by the LabelEncoder. + + Returns + ------- + changed_columns: + The list of (potentially) changed column names, as passed to fit. 
+ + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if self._column_names is None: + raise TransformerNotFittedError + return self._column_names + + def get_names_of_removed_columns(self) -> list[str]: + """ + Get the names of all columns that have been removed by the LabelEncoder. + + Returns + ------- + removed_columns: + A list of names of the removed columns, ordered as they appear in the table the LabelEncoder was fitted on. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] diff --git a/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py b/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py new file mode 100644 index 000000000..e16772cb6 --- /dev/null +++ b/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py @@ -0,0 +1,412 @@ +from __future__ import annotations + +import warnings +from collections import Counter +from typing import Any + +from safeds.data.tabular.containers import Column, ExperimentalTable, Table +from safeds.exceptions import ( + NonNumericColumnError, + TransformerNotFittedError, + UnknownColumnNameError, + ValueNotPresentWhenFittedError, +) + +from ._experimental_table_transformer import ExperimentalInvertibleTableTransformer + + +class ExperimentalOneHotEncoder(ExperimentalInvertibleTableTransformer): + """ + A way to deal with categorical features that is particularly useful for unordered (i.e. nominal) data. + + It replaces a column with a set of columns, each representing a unique value in the original column. The value of + each new column is 1 if the original column had that value, and 0 otherwise. 
Take the following table as an + example: + + | col1 | + |------| + | "a" | + | "b" | + | "c" | + | "a" | + + The one-hot encoding of this table is: + + | col1__a | col1__b | col1__c | + |---------|---------|---------| + | 1 | 0 | 0 | + | 0 | 1 | 0 | + | 0 | 0 | 1 | + | 1 | 0 | 0 | + + The name "one-hot" comes from the fact that each row has exactly one 1 in it, and the rest of the values are 0s. + One-hot encoding is closely related to dummy variable / indicator variables, which are used in statistics. + + Examples + -------- + >>> from safeds.data.tabular.containers import Table + >>> from safeds.data.tabular.transformation import OneHotEncoder + >>> table = Table({"col1": ["a", "b", "c", "a"]}) + >>> transformer = OneHotEncoder() + >>> transformer.fit_and_transform(table, ["col1"])[1] + col1__a col1__b col1__c + 0 1.0 0.0 0.0 + 1 0.0 1.0 0.0 + 2 0.0 0.0 1.0 + 3 1.0 0.0 0.0 + """ + + def __init__(self) -> None: + # Maps each old column to (list of) new columns created from it: + self._column_names: dict[str, list[str]] | None = None + # Maps concrete values (tuples of old column and value) to corresponding new column names: + self._value_to_column: dict[tuple[str, Any], str] | None = None + # Maps nan values (str of old column) to corresponding new column name + self._value_to_column_nans: dict[str, str] | None = None + + def __hash__(self) -> int: + return super().__hash__() + + def __eq__(self, other: object) -> bool: + if not isinstance(other, ExperimentalOneHotEncoder): + return NotImplemented + return ( + self._column_names == other._column_names + and self._value_to_column == other._value_to_column + and self._value_to_column_nans == other._value_to_column_nans + ) + + def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalOneHotEncoder: + """ + Learn a transformation for a set of columns in a table. + + This transformer is not modified. + + Parameters + ---------- + table: + The table used to fit the transformer. 
+ column_names: + The list of columns from the table used to fit the transformer. If `None`, all columns are used. + + Returns + ------- + fitted_transformer: + The fitted transformer. + + Raises + ------ + UnknownColumnNameError + If column_names contain a column name that is missing in the table. + ValueError + If the table contains 0 rows. + """ + import numpy as np + + if column_names is None: + column_names = table.column_names + else: + missing_columns = sorted(set(column_names) - set(table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if table.number_of_rows == 0: + raise ValueError("The OneHotEncoder cannot be fitted because the table contains 0 rows") + + if ( + table._as_table() + .keep_only_columns(column_names) + .remove_columns_with_non_numerical_values() + .number_of_columns + > 0 + ): + warnings.warn( + "The columns" + f" {table._as_table().keep_only_columns(column_names).remove_columns_with_non_numerical_values().column_names} contain" + " numerical data. 
The OneHotEncoder is designed to encode non-numerical values into numerical values", + UserWarning, + stacklevel=2, + ) + + data = table._data.reset_index(drop=True) + data.columns = table.column_names + + result = ExperimentalOneHotEncoder() + + result._column_names = {} + result._value_to_column = {} + result._value_to_column_nans = {} + + # Keep track of number of occurrences of column names; + # initially all old column names appear exactly once: + name_counter = Counter(data.columns) + + # Iterate through all columns to-be-changed: + for column in column_names: + result._column_names[column] = [] + for element in table.get_column(column).get_unique_values(): + base_name = f"{column}__{element}" + name_counter[base_name] += 1 + new_column_name = base_name + # Check if newly created name matches some other existing column name: + if name_counter[base_name] > 1: + new_column_name += f"#{name_counter[base_name]}" + # Update dictionary entries: + result._column_names[column] += [new_column_name] + if isinstance(element, float) and np.isnan(element): + result._value_to_column_nans[column] = new_column_name + else: + result._value_to_column[(column, element)] = new_column_name + + return result + + def transform(self, table: ExperimentalTable) -> ExperimentalTable: + """ + Apply the learned transformation to a table. + + The table is not modified. + + Parameters + ---------- + table: + The table to which the learned transformation is applied. + + Returns + ------- + transformed_table: + The transformed table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + UnknownColumnNameError + If the input table does not contain all columns used to fit the transformer. + ValueError + If the table contains 0 rows. + ValueNotPresentWhenFittedError + If a column in the to-be-transformed table contains a new value that was not already present in the table the OneHotEncoder was fitted on. 
+        """
+        import numpy as np
+
+        # Transformer has not been fitted yet
+        if self._column_names is None or self._value_to_column is None or self._value_to_column_nans is None:
+            raise TransformerNotFittedError
+
+        # Input table does not contain all columns used to fit the transformer
+        missing_columns = sorted(set(self._column_names.keys()) - set(table.column_names))
+        if len(missing_columns) > 0:
+            raise UnknownColumnNameError(missing_columns)
+
+        if table.number_of_rows == 0:
+            raise ValueError("The OneHotEncoder cannot transform the table because it contains 0 rows")
+
+        encoded_values = {}
+        for new_column_name in self._value_to_column.values():
+            encoded_values[new_column_name] = [0.0 for _ in range(table.number_of_rows)]
+        for new_column_name in self._value_to_column_nans.values():
+            encoded_values[new_column_name] = [0.0 for _ in range(table.number_of_rows)]
+
+        values_not_present_when_fitted = []
+        for old_column_name in self._column_names:
+            for i in range(table.number_of_rows):
+                value = table.get_column(old_column_name).get_value(i)
+                try:
+                    if isinstance(value, float) and np.isnan(value):
+                        new_column_name = self._value_to_column_nans[old_column_name]
+                    else:
+                        new_column_name = self._value_to_column[(old_column_name, value)]
+                    encoded_values[new_column_name][i] = 1.0
+                except KeyError:
+                    # This happens when a column in the to-be-transformed table contains a new value that was not
+                    # already present in the table the OneHotEncoder was fitted on.
+                    values_not_present_when_fitted.append((value, old_column_name))
+
+            for new_column in self._column_names[old_column_name]:
+                table = table.add_columns([Column(new_column, encoded_values[new_column])])
+
+        if len(values_not_present_when_fitted) > 0:
+            raise ValueNotPresentWhenFittedError(values_not_present_when_fitted)
+
+        # New columns may not be sorted:
+        column_names = []
+        for name in table.column_names:
+            if name not in self._column_names:
+                column_names.append(name)
+            else:
+                column_names.extend(
+                    [f_name for f_name in self._value_to_column.values() if f_name.startswith(name)]
+                    + [f_name for f_name in self._value_to_column_nans.values() if f_name.startswith(name)],
+                )
+
+        # Drop old, non-encoded columns:
+        # (Don't do this earlier - we need the old column names for sorting,
+        # plus we need to prevent the table from possibly having 0 columns temporarily.)
+        table = table.remove_columns(list(self._column_names.keys()))
+
+        # Apply sorting and return:
+        return table.sort_columns(lambda col1, col2: column_names.index(col1.name) - column_names.index(col2.name))
+
+    def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable:
+        """
+        Undo the learned transformation.
+
+        The table is not modified.
+
+        Parameters
+        ----------
+        transformed_table:
+            The table to be transformed back to the original version.
+
+        Returns
+        -------
+        table:
+            The original table.
+
+        Raises
+        ------
+        TransformerNotFittedError
+            If the transformer has not been fitted yet.
+        UnknownColumnNameError
+            If the input table does not contain all columns used to fit the transformer.
+        NonNumericColumnError
+            If the transformed columns of the input table contain non-numerical data.
+        ValueError
+            If the table contains 0 rows.
+ """ + import numpy as np + + # Transformer has not been fitted yet + if self._column_names is None or self._value_to_column is None or self._value_to_column_nans is None: + raise TransformerNotFittedError + + _transformed_column_names = [item for sublist in self._column_names.values() for item in sublist] + + missing_columns = sorted(set(_transformed_column_names) - set(transformed_table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if transformed_table.number_of_rows == 0: + raise ValueError("The OneHotEncoder cannot inverse transform the table because it contains 0 rows") + + if transformed_table._as_table().keep_only_columns( + _transformed_column_names, + ).remove_columns_with_non_numerical_values().number_of_columns < len(_transformed_column_names): + raise NonNumericColumnError( + str( + sorted( + set(_transformed_column_names) + - set( + transformed_table.keep_only_columns(_transformed_column_names) + .remove_columns_with_non_numerical_values() + .column_names, + ), + ), + ), + ) + + original_columns = {} + for original_column_name in self._column_names: + original_columns[original_column_name] = [None for _ in range(transformed_table.number_of_rows)] + + for original_column_name, value in self._value_to_column: + constructed_column = self._value_to_column[(original_column_name, value)] + for i in range(transformed_table.number_of_rows): + if transformed_table.get_column(constructed_column)[i] == 1.0: + original_columns[original_column_name][i] = value + + for original_column_name in self._value_to_column_nans: + constructed_column = self._value_to_column_nans[original_column_name] + for i in range(transformed_table.number_of_rows): + if transformed_table.get_column(constructed_column)[i] == 1.0: + original_columns[original_column_name][i] = np.nan + + table = transformed_table + + for column_name, encoded_column in original_columns.items(): + table = table.add_column(Column(column_name, encoded_column)) 
+ + column_names = [ + ( + name + if name not in [value for value_list in list(self._column_names.values()) for value in value_list] + else list(self._column_names.keys())[ + next( + list(self._column_names.values()).index(value) + for value in list(self._column_names.values()) + if name in value + ) + ] + ) + for name in table.column_names + ] + + # Drop old column names: + table = table.remove_columns(list(self._value_to_column.values())) + table = table.remove_columns(list(self._value_to_column_nans.values())) + + return table.sort_columns(lambda col1, col2: column_names.index(col1.name) - column_names.index(col2.name)) + + @property + def is_fitted(self) -> bool: + """Whether the transformer is fitted.""" + return ( + self._column_names is not None + and self._value_to_column is not None + and self._value_to_column_nans is not None + ) + + def get_names_of_added_columns(self) -> list[str]: + """ + Get the names of all new columns that have been added by the OneHotEncoder. + + Returns + ------- + added_columns: + A list of names of the added columns, ordered as they will appear in the table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if self._column_names is None: + raise TransformerNotFittedError + return [name for column_names in self._column_names.values() for name in column_names] + + # (Must implement abstract method, cannot instantiate class otherwise.) + def get_names_of_changed_columns(self) -> list[str]: + """ + Get the names of all columns that have been changed by the OneHotEncoder (none). + + Returns + ------- + changed_columns: + The empty list. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] + + def get_names_of_removed_columns(self) -> list[str]: + """ + Get the names of all columns that have been removed by the OneHotEncoder. 
+ + Returns + ------- + removed_columns: + A list of names of the removed columns, ordered as they appear in the table the OneHotEncoder was fitted on. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if self._column_names is None: + raise TransformerNotFittedError + return list(self._column_names.keys()) diff --git a/src/safeds/data/tabular/transformation/_experimental_range_scaler.py b/src/safeds/data/tabular/transformation/_experimental_range_scaler.py new file mode 100644 index 000000000..0514a3094 --- /dev/null +++ b/src/safeds/data/tabular/transformation/_experimental_range_scaler.py @@ -0,0 +1,285 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds.data.tabular.containers import ExperimentalTable, Table +from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError + +from ._experimental_table_transformer import ExperimentalInvertibleTableTransformer + +if TYPE_CHECKING: + from sklearn.preprocessing import MinMaxScaler as sk_MinMaxScaler + + +class ExperimentalRangeScaler(ExperimentalInvertibleTableTransformer): + """ + The RangeScaler transforms column values by scaling each value to a given range. 
+ + Parameters + ---------- + minimum: + The minimum of the new range after the transformation + maximum: + The maximum of the new range after the transformation + + Raises + ------ + ValueError + If the given minimum is greater or equal to the given maximum + """ + + def __init__(self, minimum: float = 0.0, maximum: float = 1.0): + self._column_names: list[str] | None = None + self._wrapped_transformer: sk_MinMaxScaler | None = None + if minimum >= maximum: + raise ValueError('Parameter "maximum" must be higher than parameter "minimum".') + self._minimum = minimum + self._maximum = maximum + + def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalRangeScaler: + """ + Learn a transformation for a set of columns in a table. + + This transformer is not modified. + + Parameters + ---------- + table: + The table used to fit the transformer. + column_names: + The list of columns from the table used to fit the transformer. If `None`, all columns are used. + + Returns + ------- + fitted_transformer: + The fitted transformer. + + Raises + ------ + UnknownColumnNameError + If column_names contain a column name that is missing in the table. + NonNumericColumnError + If at least one of the specified columns in the table contains non-numerical data. + ValueError + If the table contains 0 rows. 
+ """ + from sklearn.preprocessing import MinMaxScaler as sk_MinMaxScaler + + if column_names is None: + column_names = table.column_names + else: + missing_columns = sorted(set(column_names) - set(table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if table.number_of_rows == 0: + raise ValueError("The RangeScaler cannot be fitted because the table contains 0 rows") + + if ( + table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().number_of_columns + < table.keep_only_columns(column_names).number_of_columns + ): + raise NonNumericColumnError( + str( + sorted( + set(table.keep_only_columns(column_names).column_names) + - set( + table.keep_only_columns(column_names) + .remove_columns_with_non_numerical_values() + .column_names, + ), + ), + ), + ) + + wrapped_transformer = sk_MinMaxScaler((self._minimum, self._maximum)) + wrapped_transformer.fit(table._data[column_names]) + + result = ExperimentalRangeScaler() + result._wrapped_transformer = wrapped_transformer + result._column_names = column_names + + return result + + def transform(self, table: ExperimentalTable) -> ExperimentalTable: + """ + Apply the learned transformation to a table. + + The table is not modified. + + Parameters + ---------- + table: + The table to which the learned transformation is applied. + + Returns + ------- + transformed_table: + The transformed table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + UnknownColumnNameError + If the input table does not contain all columns used to fit the transformer. + NonNumericColumnError + If at least one of the columns in the input table that is used to fit contains non-numerical data. + ValueError + If the table contains 0 rows. 
+ """ + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise TransformerNotFittedError + + # Input table does not contain all columns used to fit the transformer + missing_columns = sorted(set(self._column_names) - set(table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if table.number_of_rows == 0: + raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows") + + if ( + table.keep_only_columns(self._column_names).remove_columns_with_non_numerical_values().number_of_columns + < table.keep_only_columns(self._column_names).number_of_columns + ): + raise NonNumericColumnError( + str( + sorted( + set(table.keep_only_columns(self._column_names).column_names) + - set( + table.keep_only_columns(self._column_names) + .remove_columns_with_non_numerical_values() + .column_names, + ), + ), + ), + ) + + data = table._data.reset_index(drop=True) + data.columns = table.column_names + data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names]) + return Table._from_pandas_dataframe(data) + + def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: + """ + Undo the learned transformation. + + The table is not modified. + + Parameters + ---------- + transformed_table: + The table to be transformed back to the original version. + + Returns + ------- + table: + The original table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + UnknownColumnNameError + If the input table does not contain all columns used to fit the transformer. + NonNumericColumnError + If the transformed columns of the input table contain non-numerical data. + ValueError + If the table contains 0 rows. 
+ """ + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise TransformerNotFittedError + + missing_columns = sorted(set(self._column_names) - set(transformed_table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if transformed_table.number_of_rows == 0: + raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows") + + if ( + transformed_table.keep_only_columns(self._column_names) + .remove_columns_with_non_numerical_values() + .number_of_columns + < transformed_table.keep_only_columns(self._column_names).number_of_columns + ): + raise NonNumericColumnError( + str( + sorted( + set(transformed_table.keep_only_columns(self._column_names).column_names) + - set( + transformed_table.keep_only_columns(self._column_names) + .remove_columns_with_non_numerical_values() + .column_names, + ), + ), + ), + ) + + data = transformed_table._data.reset_index(drop=True) + data.columns = transformed_table.column_names + data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names]) + return Table._from_pandas_dataframe(data) + + @property + def is_fitted(self) -> bool: + """Whether the transformer is fitted.""" + return self._wrapped_transformer is not None + + def get_names_of_added_columns(self) -> list[str]: + """ + Get the names of all new columns that have been added by the RangeScaler. + + Returns + ------- + added_columns: + A list of names of the added columns, ordered as they will appear in the table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] + + # (Must implement abstract method, cannot instantiate class otherwise.) + def get_names_of_changed_columns(self) -> list[str]: + """ + Get the names of all columns that may have been changed by the RangeScaler. 
+ + Returns + ------- + changed_columns: + The list of (potentially) changed column names, as passed to fit. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if self._column_names is None: + raise TransformerNotFittedError + return self._column_names + + def get_names_of_removed_columns(self) -> list[str]: + """ + Get the names of all columns that have been removed by the RangeScaler. + + Returns + ------- + removed_columns: + A list of names of the removed columns, ordered as they appear in the table the RangeScaler was fitted on. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] diff --git a/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py b/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py new file mode 100644 index 000000000..ac6818fe6 --- /dev/null +++ b/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py @@ -0,0 +1,267 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from safeds.data.tabular.containers import ExperimentalTable, Table +from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError + +from ._experimental_table_transformer import ExperimentalInvertibleTableTransformer + +if TYPE_CHECKING: + from sklearn.preprocessing import StandardScaler as sk_StandardScaler + + +class ExperimentalStandardScaler(ExperimentalInvertibleTableTransformer): + """The StandardScaler transforms column values to a range by removing the mean and scaling to unit variance.""" + + def __init__(self) -> None: + self._column_names: list[str] | None = None + self._wrapped_transformer: sk_StandardScaler | None = None + + def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalStandardScaler: + """ + Learn a transformation for a set of columns in 
a table. + + This transformer is not modified. + + Parameters + ---------- + table: + The table used to fit the transformer. + column_names: + The list of columns from the table used to fit the transformer. If `None`, all columns are used. + + Returns + ------- + fitted_transformer: + The fitted transformer. + + Raises + ------ + UnknownColumnNameError + If column_names contain a column name that is missing in the table. + NonNumericColumnError + If at least one of the specified columns in the table contains non-numerical data. + ValueError + If the table contains 0 rows. + """ + from sklearn.preprocessing import StandardScaler as sk_StandardScaler + + if column_names is None: + column_names = table.column_names + else: + missing_columns = sorted(set(column_names) - set(table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if table.number_of_rows == 0: + raise ValueError("The StandardScaler cannot be fitted because the table contains 0 rows") + + if ( + table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().number_of_columns + < table.keep_only_columns(column_names).number_of_columns + ): + raise NonNumericColumnError( + str( + sorted( + set(table.keep_only_columns(column_names).column_names) + - set( + table.keep_only_columns(column_names) + .remove_columns_with_non_numerical_values() + .column_names, + ), + ), + ), + ) + + wrapped_transformer = sk_StandardScaler() + wrapped_transformer.fit(table._data[column_names]) + + result = ExperimentalStandardScaler() + result._wrapped_transformer = wrapped_transformer + result._column_names = column_names + + return result + + def transform(self, table: ExperimentalTable) -> ExperimentalTable: + """ + Apply the learned transformation to a table. + + The table is not modified. + + Parameters + ---------- + table: + The table to which the learned transformation is applied. + + Returns + ------- + transformed_table: + The transformed table. 
+ + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + UnknownColumnNameError + If the input table does not contain all columns used to fit the transformer. + NonNumericColumnError + If at least one of the columns in the input table that is used to fit contains non-numerical data. + ValueError + If the table contains 0 rows. + """ + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise TransformerNotFittedError + + # Input table does not contain all columns used to fit the transformer + missing_columns = sorted(set(self._column_names) - set(table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if table.number_of_rows == 0: + raise ValueError("The StandardScaler cannot transform the table because it contains 0 rows") + + if ( + table.keep_only_columns(self._column_names).remove_columns_with_non_numerical_values().number_of_columns + < table.keep_only_columns(self._column_names).number_of_columns + ): + raise NonNumericColumnError( + str( + sorted( + set(table.keep_only_columns(self._column_names).column_names) + - set( + table.keep_only_columns(self._column_names) + .remove_columns_with_non_numerical_values() + .column_names, + ), + ), + ), + ) + + data = table._data.reset_index(drop=True) + data.columns = table.column_names + data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names]) + return Table._from_pandas_dataframe(data) + + def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: + """ + Undo the learned transformation. + + The table is not modified. + + Parameters + ---------- + transformed_table: + The table to be transformed back to the original version. + + Returns + ------- + table: + The original table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. 
+ UnknownColumnNameError + If the input table does not contain all columns used to fit the transformer. + NonNumericColumnError + If the transformed columns of the input table contain non-numerical data. + ValueError + If the table contains 0 rows. + """ + # Transformer has not been fitted yet + if self._wrapped_transformer is None or self._column_names is None: + raise TransformerNotFittedError + + missing_columns = sorted(set(self._column_names) - set(transformed_table.column_names)) + if len(missing_columns) > 0: + raise UnknownColumnNameError(missing_columns) + + if transformed_table.number_of_rows == 0: + raise ValueError("The StandardScaler cannot transform the table because it contains 0 rows") + + if ( + transformed_table.keep_only_columns(self._column_names) + .remove_columns_with_non_numerical_values() + .number_of_columns + < transformed_table.keep_only_columns(self._column_names).number_of_columns + ): + raise NonNumericColumnError( + str( + sorted( + set(transformed_table.keep_only_columns(self._column_names).column_names) + - set( + transformed_table.keep_only_columns(self._column_names) + .remove_columns_with_non_numerical_values() + .column_names, + ), + ), + ), + ) + + data = transformed_table._data.reset_index(drop=True) + data.columns = transformed_table.column_names + data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names]) + return Table._from_pandas_dataframe(data) + + @property + def is_fitted(self) -> bool: + """Whether the transformer is fitted.""" + return self._wrapped_transformer is not None + + def get_names_of_added_columns(self) -> list[str]: + """ + Get the names of all new columns that have been added by the StandardScaler. + + Returns + ------- + added_columns: + A list of names of the added columns, ordered as they will appear in the table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. 
+ """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] + + # (Must implement abstract method, cannot instantiate class otherwise.) + def get_names_of_changed_columns(self) -> list[str]: + """ + Get the names of all columns that may have been changed by the StandardScaler. + + Returns + ------- + changed_columns: + The list of (potentially) changed column names, as passed to fit. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if self._column_names is None: + raise TransformerNotFittedError + return self._column_names + + def get_names_of_removed_columns(self) -> list[str]: + """ + Get the names of all columns that have been removed by the StandardScaler. + + Returns + ------- + removed_columns: + A list of names of the removed columns, ordered as they appear in the table the StandardScaler was fitted on. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + if not self.is_fitted: + raise TransformerNotFittedError + return [] diff --git a/src/safeds/data/tabular/transformation/_experimental_table_transformer.py b/src/safeds/data/tabular/transformation/_experimental_table_transformer.py new file mode 100644 index 000000000..2b9f4d290 --- /dev/null +++ b/src/safeds/data/tabular/transformation/_experimental_table_transformer.py @@ -0,0 +1,172 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Self + +from safeds._utils import _structural_hash + +if TYPE_CHECKING: + from safeds.data.tabular.containers import ExperimentalTable + + +class ExperimentalTableTransformer(ABC): + """Learn a transformation for a set of columns in a `Table` and transform another `Table` with the same columns.""" + + def __hash__(self) -> int: + """ + Return a deterministic hash value for a table transformer. + + Returns + ------- + hash: + The hash value. 
+ """ + added = self.get_names_of_added_columns() if self.is_fitted else [] + changed = self.get_names_of_changed_columns() if self.is_fitted else [] + removed = self.get_names_of_removed_columns() if self.is_fitted else [] + return _structural_hash(self.__class__.__qualname__, self.is_fitted, added, changed, removed) + + @property + @abstractmethod + def is_fitted(self) -> bool: + """Whether the transformer is fitted.""" + + @abstractmethod + def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Self: + """ + Learn a transformation for a set of columns in a table. + + This transformer is not modified. + + Parameters + ---------- + table: + The table used to fit the transformer. + column_names: + The list of columns from the table used to fit the transformer. If `None`, all columns are used. + + Returns + ------- + fitted_transformer: + The fitted transformer. + """ + + @abstractmethod + def transform(self, table: ExperimentalTable) -> ExperimentalTable: + """ + Apply the learned transformation to a table. + + The table is not modified. + + Parameters + ---------- + table: + The table to which the learned transformation is applied. + + Returns + ------- + transformed_table: + The transformed table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + + @abstractmethod + def get_names_of_added_columns(self) -> list[str]: + """ + Get the names of all new columns that have been added by the transformer. + + Returns + ------- + added_columns: + A list of names of the added columns, ordered as they will appear in the table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ + + @abstractmethod + def get_names_of_changed_columns(self) -> list[str]: + """ + Get the names of all columns that have been changed by the transformer. + + Returns + ------- + changed_columns: + A list of names of changed columns, ordered as they appear in the table. 
+
+        Raises
+        ------
+        TransformerNotFittedError
+            If the transformer has not been fitted yet.
+        """
+
+    @abstractmethod
+    def get_names_of_removed_columns(self) -> list[str]:
+        """
+        Get the names of all columns that have been removed by the transformer.
+
+        Returns
+        -------
+        removed_columns:
+            A list of names of the removed columns, ordered as they appear in the table the transformer was fitted on.
+
+        Raises
+        ------
+        TransformerNotFittedError
+            If the transformer has not been fitted yet.
+        """
+
+    def fit_and_transform(self, table: ExperimentalTable, column_names: list[str] | None = None) -> tuple[Self, ExperimentalTable]:
+        """
+        Learn a transformation for a set of columns in a table and apply the learned transformation to the same table.
+
+        Neither the transformer nor the table are modified.
+
+        Parameters
+        ----------
+        table:
+            The table used to fit the transformer. The transformer is then applied to this table.
+        column_names:
+            The list of columns from the table used to fit the transformer. If `None`, all columns are used.
+
+        Returns
+        -------
+        fitted_transformer, transformed_table:
+            The fitted transformer and the transformed table.
+        """
+        fitted_transformer = self.fit(table, column_names)
+        transformed_table = fitted_transformer.transform(table)
+        return fitted_transformer, transformed_table
+
+
+class ExperimentalInvertibleTableTransformer(ExperimentalTableTransformer):
+    """A `TableTransformer` that can also undo the learned transformation after it has been applied."""
+
+    @abstractmethod
+    def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable:
+        """
+        Undo the learned transformation.
+
+        The table is not modified.
+
+        Parameters
+        ----------
+        transformed_table:
+            The table to be transformed back to the original version.
+
+        Returns
+        -------
+        table:
+            The original table.
+
+        Raises
+        ------
+        TransformerNotFittedError
+            If the transformer has not been fitted yet.
+ """ From d53fd09fe98521102ffa8ae25f9f1311961e69bf Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 16:05:36 +0200 Subject: [PATCH 12/40] feat: `ExperimentalTable.transform` and `ExperimentalTable.inverse_transform` --- .../tabular/containers/_experimental_table.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/safeds/data/tabular/containers/_experimental_table.py b/src/safeds/data/tabular/containers/_experimental_table.py index e894a23b1..61889a4ff 100644 --- a/src/safeds/data/tabular/containers/_experimental_table.py +++ b/src/safeds/data/tabular/containers/_experimental_table.py @@ -27,7 +27,10 @@ import polars as pl - from safeds.data.tabular.transformation import InvertibleTableTransformer, TableTransformer + from safeds.data.tabular.transformation import ( + ExperimentalInvertibleTableTransformer, + ExperimentalTableTransformer, + ) from safeds.data.tabular.typing import ExperimentalSchema from safeds.data.tabular.typing._experimental_data_type import ExperimentalDataType @@ -731,16 +734,11 @@ def add_table_as_rows(self, other: ExperimentalTable) -> ExperimentalTable: self._data_frame.vstack(other._data_frame), ) - def inverse_transform_table(self, fitted_transformer: InvertibleTableTransformer) -> ExperimentalTable: - # TODO: more efficient implementation - # old_table = self.temporary_to_old_table().inverse_transform_table(fitted_transformer) - # return ExperimentalTable._from_polars_data_frame( - # pl.DataFrame(old_table.) 
- # ) - raise NotImplementedError + def inverse_transform_table(self, fitted_transformer: ExperimentalInvertibleTableTransformer) -> ExperimentalTable: + return fitted_transformer.inverse_transform(self) - def transform_table(self, fitted_transformer: TableTransformer) -> ExperimentalTable: - raise NotImplementedError + def transform_table(self, fitted_transformer: ExperimentalTableTransformer) -> ExperimentalTable: + return fitted_transformer.transform(self) # ------------------------------------------------------------------------------------------------------------------ # Statistics From e4dcd9d6f6f54a32b3e749bdcbf21ff7a5600851 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 16:13:28 +0200 Subject: [PATCH 13/40] refactor: move invertible transformer to own file --- .../data/tabular/transformation/__init__.py | 7 ++-- .../_experimental_discretizer.py | 8 ++--- .../transformation/_experimental_imputer.py | 3 +- ...perimental_invertible_table_transformer.py | 36 +++++++++++++++++++ .../_experimental_label_encoder.py | 5 +-- .../_experimental_one_hot_encoder.py | 4 +-- .../_experimental_range_scaler.py | 5 +-- .../_experimental_standard_scaler.py | 5 +-- .../_experimental_table_transformer.py | 27 -------------- 9 files changed, 57 insertions(+), 43 deletions(-) create mode 100644 src/safeds/data/tabular/transformation/_experimental_invertible_table_transformer.py diff --git a/src/safeds/data/tabular/transformation/__init__.py b/src/safeds/data/tabular/transformation/__init__.py index 682ad4513..dd6a519f3 100644 --- a/src/safeds/data/tabular/transformation/__init__.py +++ b/src/safeds/data/tabular/transformation/__init__.py @@ -8,11 +8,12 @@ from ._discretizer import Discretizer from ._experimental_discretizer import ExperimentalDiscretizer from ._experimental_imputer import ExperimentalImputer + from ._experimental_invertible_table_transformer import ExperimentalInvertibleTableTransformer from ._experimental_label_encoder import 
ExperimentalLabelEncoder from ._experimental_one_hot_encoder import ExperimentalOneHotEncoder from ._experimental_range_scaler import ExperimentalRangeScaler from ._experimental_standard_scaler import ExperimentalStandardScaler - from ._experimental_table_transformer import ExperimentalInvertibleTableTransformer, ExperimentalTableTransformer + from ._experimental_table_transformer import ExperimentalTableTransformer from ._imputer import Imputer from ._label_encoder import LabelEncoder from ._one_hot_encoder import OneHotEncoder @@ -26,12 +27,12 @@ "Discretizer": "._discretizer:Discretizer", "ExperimentalDiscretizer": "._experimental_discretizer:ExperimentalDiscretizer", "ExperimentalImputer": "._experimental_imputer:ExperimentalImputer", + "ExperimentalInvertibleTableTransformer": "._experimental_invertible_table_transformer:ExperimentalInvertibleTableTransformer", "ExperimentalLabelEncoder": "._experimental_label_encoder:ExperimentalLabelEncoder", "ExperimentalOneHotEncoder": "._experimental_one_hot_encoder:Experimental", "ExperimentalRangeScaler": "._experimental_range_scaler:ExperimentalRangeScaler", "ExperimentalStandardScaler": "._experimental_standard_scaler:ExperimentalStandardScaler", "ExperimentalTableTransformer": "._experimental_table_transformer:ExperimentalTableTransformer", - "ExperimentalInvertibleTableTransformer": "._experimental_table_transformer:ExperimentalInvertibleTableTransformer", "Imputer": "._imputer:Imputer", "InvertibleTableTransformer": "._table_transformer:InvertibleTableTransformer", "LabelEncoder": "._label_encoder:LabelEncoder", @@ -46,12 +47,12 @@ "Discretizer", "ExperimentalDiscretizer", "ExperimentalImputer", + "ExperimentalInvertibleTableTransformer", "ExperimentalLabelEncoder", "ExperimentalOneHotEncoder", "ExperimentalRangeScaler", "ExperimentalStandardScaler", "ExperimentalTableTransformer", - "ExperimentalInvertibleTableTransformer", "Imputer", "InvertibleTableTransformer", "LabelEncoder", diff --git 
a/src/safeds/data/tabular/transformation/_experimental_discretizer.py b/src/safeds/data/tabular/transformation/_experimental_discretizer.py index 75f1c1cd1..82d9ccf2d 100644 --- a/src/safeds/data/tabular/transformation/_experimental_discretizer.py +++ b/src/safeds/data/tabular/transformation/_experimental_discretizer.py @@ -2,7 +2,6 @@ from typing import TYPE_CHECKING -from safeds.data.tabular.containers import ExperimentalTable, Table from safeds.exceptions import ( ClosedBound, NonNumericColumnError, @@ -16,6 +15,8 @@ if TYPE_CHECKING: from sklearn.preprocessing import KBinsDiscretizer as sk_KBinsDiscretizer + from safeds.data.tabular.containers import ExperimentalTable + class ExperimentalDiscretizer(ExperimentalTableTransformer): """ @@ -85,7 +86,7 @@ def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Exper ) for column in column_names: - if not table.get_column(column).type.is_numeric(): + if not table.get_column(column).type.is_numeric: raise NonNumericColumnError(f"{column} is of type {table.get_column(column).type}.") wrapped_transformer = sk_KBinsDiscretizer(n_bins=self._number_of_bins, encode="ordinal") @@ -142,7 +143,7 @@ def transform(self, table: ExperimentalTable) -> ExperimentalTable: ) for column in self._column_names: - if not table.get_column(column).type.is_numeric(): + if not table.get_column(column).type.is_numeric: raise NonNumericColumnError(f"{column} is of type {table.get_column(column).type}.") data = table._data.reset_index(drop=True) @@ -173,7 +174,6 @@ def get_names_of_added_columns(self) -> list[str]: raise TransformerNotFittedError return [] - # (Must implement abstract method, cannot instantiate class otherwise.) def get_names_of_changed_columns(self) -> list[str]: """ Get the names of all columns that may have been changed by the Discretizer. 
diff --git a/src/safeds/data/tabular/transformation/_experimental_imputer.py b/src/safeds/data/tabular/transformation/_experimental_imputer.py index 658842e9d..5cf5f40ab 100644 --- a/src/safeds/data/tabular/transformation/_experimental_imputer.py +++ b/src/safeds/data/tabular/transformation/_experimental_imputer.py @@ -8,7 +8,6 @@ import pandas as pd from safeds._utils import _structural_hash -from safeds.data.tabular.containers import ExperimentalTable, Table from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError from ._experimental_table_transformer import ExperimentalTableTransformer @@ -16,6 +15,8 @@ if TYPE_CHECKING: from sklearn.impute import SimpleImputer as sk_SimpleImputer + from safeds.data.tabular.containers import ExperimentalTable + class ExperimentalImputer(ExperimentalTableTransformer): """ diff --git a/src/safeds/data/tabular/transformation/_experimental_invertible_table_transformer.py b/src/safeds/data/tabular/transformation/_experimental_invertible_table_transformer.py new file mode 100644 index 000000000..a77429f12 --- /dev/null +++ b/src/safeds/data/tabular/transformation/_experimental_invertible_table_transformer.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +from abc import abstractmethod +from typing import TYPE_CHECKING + +from ._experimental_table_transformer import ExperimentalTableTransformer + +if TYPE_CHECKING: + from safeds.data.tabular.containers import ExperimentalTable + + +class ExperimentalInvertibleTableTransformer(ExperimentalTableTransformer): + """A `TableTransformer` that can also undo the learned transformation after it has been applied.""" + + @abstractmethod + def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: + """ + Undo the learned transformation. + + The table is not modified. + + Parameters + ---------- + transformed_table: + The table to be transformed back to the original version. 
+ + Returns + ------- + table: + The original table. + + Raises + ------ + TransformerNotFittedError + If the transformer has not been fitted yet. + """ diff --git a/src/safeds/data/tabular/transformation/_experimental_label_encoder.py b/src/safeds/data/tabular/transformation/_experimental_label_encoder.py index 8b5e11d0b..0ad781e7a 100644 --- a/src/safeds/data/tabular/transformation/_experimental_label_encoder.py +++ b/src/safeds/data/tabular/transformation/_experimental_label_encoder.py @@ -3,14 +3,15 @@ import warnings from typing import TYPE_CHECKING -from safeds.data.tabular.containers import ExperimentalTable, Table from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError -from ._experimental_table_transformer import ExperimentalInvertibleTableTransformer +from ._experimental_invertible_table_transformer import ExperimentalInvertibleTableTransformer if TYPE_CHECKING: from sklearn.preprocessing import OrdinalEncoder as sk_OrdinalEncoder + from safeds.data.tabular.containers import ExperimentalTable + class ExperimentalLabelEncoder(ExperimentalInvertibleTableTransformer): """The LabelEncoder encodes one or more given columns into labels.""" diff --git a/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py b/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py index e16772cb6..300f42894 100644 --- a/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py +++ b/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py @@ -4,7 +4,7 @@ from collections import Counter from typing import Any -from safeds.data.tabular.containers import Column, ExperimentalTable, Table +from safeds.data.tabular.containers import Column, ExperimentalTable from safeds.exceptions import ( NonNumericColumnError, TransformerNotFittedError, @@ -12,7 +12,7 @@ ValueNotPresentWhenFittedError, ) -from ._experimental_table_transformer import ExperimentalInvertibleTableTransformer 
+from ._experimental_invertible_table_transformer import ExperimentalInvertibleTableTransformer class ExperimentalOneHotEncoder(ExperimentalInvertibleTableTransformer): diff --git a/src/safeds/data/tabular/transformation/_experimental_range_scaler.py b/src/safeds/data/tabular/transformation/_experimental_range_scaler.py index 0514a3094..e701921c2 100644 --- a/src/safeds/data/tabular/transformation/_experimental_range_scaler.py +++ b/src/safeds/data/tabular/transformation/_experimental_range_scaler.py @@ -2,14 +2,15 @@ from typing import TYPE_CHECKING -from safeds.data.tabular.containers import ExperimentalTable, Table from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError -from ._experimental_table_transformer import ExperimentalInvertibleTableTransformer +from ._experimental_invertible_table_transformer import ExperimentalInvertibleTableTransformer if TYPE_CHECKING: from sklearn.preprocessing import MinMaxScaler as sk_MinMaxScaler + from safeds.data.tabular.containers import ExperimentalTable + class ExperimentalRangeScaler(ExperimentalInvertibleTableTransformer): """ diff --git a/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py b/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py index ac6818fe6..a17e5d941 100644 --- a/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py +++ b/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py @@ -2,14 +2,15 @@ from typing import TYPE_CHECKING -from safeds.data.tabular.containers import ExperimentalTable, Table from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError -from ._experimental_table_transformer import ExperimentalInvertibleTableTransformer +from ._experimental_invertible_table_transformer import ExperimentalInvertibleTableTransformer if TYPE_CHECKING: from sklearn.preprocessing import StandardScaler as sk_StandardScaler + from 
safeds.data.tabular.containers import ExperimentalTable + class ExperimentalStandardScaler(ExperimentalInvertibleTableTransformer): """The StandardScaler transforms column values to a range by removing the mean and scaling to unit variance.""" diff --git a/src/safeds/data/tabular/transformation/_experimental_table_transformer.py b/src/safeds/data/tabular/transformation/_experimental_table_transformer.py index 2b9f4d290..a8a7d1cd8 100644 --- a/src/safeds/data/tabular/transformation/_experimental_table_transformer.py +++ b/src/safeds/data/tabular/transformation/_experimental_table_transformer.py @@ -143,30 +143,3 @@ def fit_and_transform(self, table: ExperimentalTable, column_names: list[str] | fitted_transformer = self.fit(table, column_names) transformed_table = fitted_transformer.transform(table) return fitted_transformer, transformed_table - - -class ExperimentalInvertibleTableTransformer(ExperimentalTableTransformer): - """A `TableTransformer` that can also undo the learned transformation after it has been applied.""" - - @abstractmethod - def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: - """ - Undo the learned transformation. - - The table is not modified. - - Parameters - ---------- - transformed_table: - The table to be transformed back to the original version. - - Returns - ------- - table: - The original table. - - Raises - ------ - TransformerNotFittedError - If the transformer has not been fitted yet. 
- """ From 3ea43f2d89b570e7e3c3a3f6780459b5ded398ad Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 16:27:01 +0200 Subject: [PATCH 14/40] refactor: data frame field as lazy property --- .../tabular/containers/_experimental_table.py | 69 +++---------------- 1 file changed, 10 insertions(+), 59 deletions(-) diff --git a/src/safeds/data/tabular/containers/_experimental_table.py b/src/safeds/data/tabular/containers/_experimental_table.py index 61889a4ff..52b429356 100644 --- a/src/safeds/data/tabular/containers/_experimental_table.py +++ b/src/safeds/data/tabular/containers/_experimental_table.py @@ -248,14 +248,14 @@ def from_parquet_file(path: str | Path) -> ExperimentalTable: def _from_polars_data_frame(data: pl.DataFrame) -> ExperimentalTable: result = object.__new__(ExperimentalTable) result._lazy_frame = data.lazy() - result._data_frame = data + result._data_frame_cache = data return result @staticmethod def _from_polars_lazy_frame(data: pl.LazyFrame) -> ExperimentalTable: result = object.__new__(ExperimentalTable) result._lazy_frame = data - result._data_frame = None + result._data_frame_cache = None return result # ------------------------------------------------------------------------------------------------------------------ @@ -280,7 +280,7 @@ def __init__(self, data: Mapping[str, Sequence[Any]] | None = None) -> None: # Implementation self._lazy_frame: pl.LazyFrame = pl.LazyFrame(data) - self._data_frame: pl.DataFrame | None = None + self._data_frame_cache: pl.DataFrame | None = None def __eq__(self, other: object) -> bool: if not isinstance(other, ExperimentalTable): @@ -288,38 +288,31 @@ def __eq__(self, other: object) -> bool: if self is other: return True - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - if other._data_frame is None: - other._data_frame = other._lazy_frame.collect() - return self._data_frame.frame_equal(other._data_frame) def __hash__(self) -> int: return _structural_hash(self.schema, 
self.number_of_rows) def __repr__(self) -> str: - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return self._data_frame.__repr__() def __sizeof__(self) -> int: - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return self._data_frame.estimated_size() def __str__(self) -> str: - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return self._data_frame.__str__() # ------------------------------------------------------------------------------------------------------------------ # Properties # ------------------------------------------------------------------------------------------------------------------ + @property + def _data_frame(self) -> pl.DataFrame: + if self._data_frame_cache is None: + self._data_frame_cache = self._lazy_frame.collect() + + return self._data_frame_cache + @property def column_names(self) -> list[str]: """ @@ -362,9 +355,6 @@ def number_of_rows(self) -> int: >>> table.number_of_rows 3 """ - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return self._data_frame.height @property @@ -389,9 +379,6 @@ def add_columns( if len(columns) == 0: return self - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return ExperimentalTable._from_polars_data_frame( self._data_frame.hstack([column._series for column in columns]), ) @@ -411,9 +398,6 @@ def compute_column( ) def get_column(self, name: str) -> ExperimentalColumn: - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return ExperimentalColumn._from_polars_series(self._data_frame.get_column(name)) def get_column_type(self, name: str) -> ExperimentalDataType: @@ -442,9 +426,6 @@ def remove_columns_by_name( def remove_columns_with_missing_values(self) -> ExperimentalTable: import polars as pl - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return ExperimentalTable._from_polars_lazy_frame( 
pl.LazyFrame( [series for series in self._data_frame.get_columns() if series.null_count() == 0], @@ -508,9 +489,6 @@ def replace_column( if len(new_columns) == 0: return self.remove_columns_by_name(old_name) - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - new_frame = self._data_frame index = new_frame.get_column_index(old_name) @@ -646,9 +624,6 @@ def remove_rows_with_outliers( raise NotImplementedError def shuffle_rows(self) -> ExperimentalTable: - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return ExperimentalTable._from_polars_data_frame( self._data_frame.sample( fraction=1, @@ -719,17 +694,11 @@ def split_rows( # ------------------------------------------------------------------------------------------------------------------ def add_table_as_columns(self, other: ExperimentalTable) -> ExperimentalTable: - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return ExperimentalTable._from_polars_data_frame( self._data_frame.hstack(other._data_frame), ) def add_table_as_rows(self, other: ExperimentalTable) -> ExperimentalTable: - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return ExperimentalTable._from_polars_data_frame( self._data_frame.vstack(other._data_frame), ) @@ -763,9 +732,6 @@ def summarize_statistics(self) -> ExperimentalTable: # ------------------------------------------------------------------------------------------------------------------ def to_columns(self) -> list[ExperimentalColumn]: - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return [ExperimentalColumn._from_polars_series(column) for column in self._data_frame.get_columns()] def to_csv_file(self, path: str | Path) -> None: @@ -812,9 +778,6 @@ def to_dict(self) -> dict[str, list[Any]]: >>> table.to_dict() {'a': [1, 2, 3], 'b': [4, 5, 6]} """ - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return 
self._data_frame.to_dict(as_series=False) def to_json_file( @@ -855,9 +818,6 @@ def to_json_file( path.parent.mkdir(parents=True, exist_ok=True) # Write JSON to file - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - self._data_frame.write_json(path, row_oriented=(orientation == "row")) def to_parquet_file(self, path: str | Path) -> None: @@ -949,9 +909,6 @@ def temporary_to_old_table(self) -> Table: >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) >>> old_table = table.temporary_to_old_table() """ - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return Table._from_pandas_dataframe(self._data_frame.to_pandas()) # ------------------------------------------------------------------------------------------------------------------ @@ -985,9 +942,6 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): # dataframe: A dataframe object that conforms to the dataframe interchange protocol. """ - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return self._data_frame.__dataframe__(allow_copy=allow_copy) # ------------------------------------------------------------------------------------------------------------------ @@ -1005,7 +959,4 @@ def _repr_html_(self) -> str: html: The generated HTML. """ - if self._data_frame is None: - self._data_frame = self._lazy_frame.collect() - return self._data_frame._repr_html_() From f8b1f8416b52d32c16410b0e10ee0fa6bc683044 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 16:52:45 +0200 Subject: [PATCH 15/40] refactor: bring back `ExperimentalTable.remove_columns_except` The other version was way too verbose. 
--- .../_experimental_tabular_dataset.py | 4 ++-- .../containers/_time_series_dataset.py | 2 +- .../containers/_experimental_column.py | 2 +- .../tabular/containers/_experimental_table.py | 23 ++++++++++++------- src/safeds/data/tabular/containers/_table.py | 2 +- 5 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py b/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py index 20f49023c..8fc523e5a 100644 --- a/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py +++ b/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py @@ -78,9 +78,9 @@ def __init__( # Set attributes self._table: ExperimentalTable = data - self._features: ExperimentalTable = data.remove_columns_by_name(feature_names, keep_only_listed=True) + self._features: ExperimentalTable = data.remove_columns_except(feature_names) self._target: ExperimentalColumn = data.get_column(target_name) - self._extras: ExperimentalTable = data.remove_columns_by_name(extra_names, keep_only_listed=True) + self._extras: ExperimentalTable = data.remove_columns_except(extra_names) def __eq__(self, other: object) -> bool: if not isinstance(other, ExperimentalTabularDataset): diff --git a/src/safeds/data/labeled/containers/_time_series_dataset.py b/src/safeds/data/labeled/containers/_time_series_dataset.py index 529f789d4..1603d21f6 100644 --- a/src/safeds/data/labeled/containers/_time_series_dataset.py +++ b/src/safeds/data/labeled/containers/_time_series_dataset.py @@ -6,7 +6,7 @@ from safeds._config import _init_default_device from safeds._utils import _structural_hash from safeds.data.tabular.containers import Column, Table -from safeds.exceptions import OutOfBoundsError, ClosedBound +from safeds.exceptions import ClosedBound, OutOfBoundsError if TYPE_CHECKING: from collections.abc import Mapping, Sequence diff --git a/src/safeds/data/tabular/containers/_experimental_column.py 
b/src/safeds/data/tabular/containers/_experimental_column.py index f8423ea9e..4359b0d09 100644 --- a/src/safeds/data/tabular/containers/_experimental_column.py +++ b/src/safeds/data/tabular/containers/_experimental_column.py @@ -12,7 +12,7 @@ from ._experimental_vectorized_cell import _VectorizedCell if TYPE_CHECKING: - from polars import Series, InvalidOperationError + from polars import Series from safeds.data.tabular.typing._experimental_data_type import ExperimentalDataType diff --git a/src/safeds/data/tabular/containers/_experimental_table.py b/src/safeds/data/tabular/containers/_experimental_table.py index 52b429356..dac8a0efb 100644 --- a/src/safeds/data/tabular/containers/_experimental_table.py +++ b/src/safeds/data/tabular/containers/_experimental_table.py @@ -406,23 +406,30 @@ def get_column_type(self, name: str) -> ExperimentalDataType: def has_column(self, name: str) -> bool: return name in self.column_names - def remove_columns_by_name( + def remove_columns( self, names: str | list[str], - *, - keep_only_listed: bool = False, + /, ) -> ExperimentalTable: if isinstance(names, str): names = [names] - if keep_only_listed: - names_to_keep = set(names) # perf: Comprehensions evaluate their condition every iteration - names = [name for name in self.column_names if name not in names_to_keep] - return ExperimentalTable._from_polars_lazy_frame( self._lazy_frame.drop(names), ) + def remove_columns_except( + self, + names: str | list[str], + /, + ) -> ExperimentalTable: + if isinstance(names, str): + names = [names] + + return ExperimentalTable._from_polars_lazy_frame( + self._lazy_frame.select(names), + ) + def remove_columns_with_missing_values(self) -> ExperimentalTable: import polars as pl @@ -487,7 +494,7 @@ def replace_column( new_columns = [new_columns] if len(new_columns) == 0: - return self.remove_columns_by_name(old_name) + return self.remove_columns(old_name) new_frame = self._data_frame index = new_frame.get_column_index(old_name) diff --git 
a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 67aa79e0c..91bbbf347 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, TypeVar -from safeds._config import _init_default_device, _get_device +from safeds._config import _get_device, _init_default_device from safeds._utils import _structural_hash from safeds.data.image.containers import Image from safeds.data.tabular.typing import ColumnType, Schema From b8b975b56980e210bfa9860f9af0cd9df54fda5e Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 17:45:32 +0200 Subject: [PATCH 16/40] fix: errors in transformers --- .../containers/_experimental_column.py | 18 +++++ .../tabular/containers/_experimental_table.py | 12 ++-- .../data/tabular/transformation/__init__.py | 8 +-- .../_experimental_discretizer.py | 18 +++-- ...perimental_invertible_table_transformer.py | 2 +- .../_experimental_label_encoder.py | 45 ++++++++----- .../_experimental_one_hot_encoder.py | 51 ++++----------- .../_experimental_range_scaler.py | 59 +++++++++-------- ...ter.py => _experimental_simple_imputer.py} | 65 +++++++++---------- .../_experimental_standard_scaler.py | 62 ++++++++++-------- .../_experimental_table_transformer.py | 23 ++++++- src/safeds/ml/classical/_util_sklearn.py | 2 +- 12 files changed, 200 insertions(+), 165 deletions(-) rename src/safeds/data/tabular/transformation/{_experimental_imputer.py => _experimental_simple_imputer.py} (84%) diff --git a/src/safeds/data/tabular/containers/_experimental_column.py b/src/safeds/data/tabular/containers/_experimental_column.py index 4359b0d09..8507c5266 100644 --- a/src/safeds/data/tabular/containers/_experimental_column.py +++ b/src/safeds/data/tabular/containers/_experimental_column.py @@ -150,6 +150,24 @@ def type(self) -> ExperimentalDataType: # Value operations # 
------------------------------------------------------------------------------------------------------------------ + def get_distinct_values(self) -> list[T]: + """ + Return the distinct values in the column. + + Returns + ------- + distinct_values: + The distinct values in the column. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("test", [1, 2, 3, 2]) + >>> column.get_distinct_values() + [1, 2, 3] + """ + return self._series.unique().sort().to_list() + def get_value(self, index: int) -> T: """ Return the column value at specified index. Indexing starts at 0. diff --git a/src/safeds/data/tabular/containers/_experimental_table.py b/src/safeds/data/tabular/containers/_experimental_table.py index dac8a0efb..9fc32debb 100644 --- a/src/safeds/data/tabular/containers/_experimental_table.py +++ b/src/safeds/data/tabular/containers/_experimental_table.py @@ -248,14 +248,14 @@ def from_parquet_file(path: str | Path) -> ExperimentalTable: def _from_polars_data_frame(data: pl.DataFrame) -> ExperimentalTable: result = object.__new__(ExperimentalTable) result._lazy_frame = data.lazy() - result._data_frame_cache = data + result.__data_frame_cache = data return result @staticmethod def _from_polars_lazy_frame(data: pl.LazyFrame) -> ExperimentalTable: result = object.__new__(ExperimentalTable) result._lazy_frame = data - result._data_frame_cache = None + result.__data_frame_cache = None return result # ------------------------------------------------------------------------------------------------------------------ @@ -280,7 +280,7 @@ def __init__(self, data: Mapping[str, Sequence[Any]] | None = None) -> None: # Implementation self._lazy_frame: pl.LazyFrame = pl.LazyFrame(data) - self._data_frame_cache: pl.DataFrame | None = None + self.__data_frame_cache: pl.DataFrame | None = None def __eq__(self, other: object) -> bool: if not isinstance(other, ExperimentalTable): @@ -308,10 +308,10 @@ def 
__str__(self) -> str: @property def _data_frame(self) -> pl.DataFrame: - if self._data_frame_cache is None: - self._data_frame_cache = self._lazy_frame.collect() + if self.__data_frame_cache is None: + self.__data_frame_cache = self._lazy_frame.collect() - return self._data_frame_cache + return self.__data_frame_cache @property def column_names(self) -> list[str]: diff --git a/src/safeds/data/tabular/transformation/__init__.py b/src/safeds/data/tabular/transformation/__init__.py index dd6a519f3..098af9adf 100644 --- a/src/safeds/data/tabular/transformation/__init__.py +++ b/src/safeds/data/tabular/transformation/__init__.py @@ -7,11 +7,11 @@ if TYPE_CHECKING: from ._discretizer import Discretizer from ._experimental_discretizer import ExperimentalDiscretizer - from ._experimental_imputer import ExperimentalImputer from ._experimental_invertible_table_transformer import ExperimentalInvertibleTableTransformer from ._experimental_label_encoder import ExperimentalLabelEncoder from ._experimental_one_hot_encoder import ExperimentalOneHotEncoder from ._experimental_range_scaler import ExperimentalRangeScaler + from ._experimental_simple_imputer import ExperimentalSimpleImputer from ._experimental_standard_scaler import ExperimentalStandardScaler from ._experimental_table_transformer import ExperimentalTableTransformer from ._imputer import Imputer @@ -26,11 +26,11 @@ { "Discretizer": "._discretizer:Discretizer", "ExperimentalDiscretizer": "._experimental_discretizer:ExperimentalDiscretizer", - "ExperimentalImputer": "._experimental_imputer:ExperimentalImputer", "ExperimentalInvertibleTableTransformer": "._experimental_invertible_table_transformer:ExperimentalInvertibleTableTransformer", "ExperimentalLabelEncoder": "._experimental_label_encoder:ExperimentalLabelEncoder", - "ExperimentalOneHotEncoder": "._experimental_one_hot_encoder:Experimental", + "ExperimentalOneHotEncoder": "._experimental_one_hot_encoder:ExperimentalOneHotEncoder", "ExperimentalRangeScaler": 
"._experimental_range_scaler:ExperimentalRangeScaler", + "ExperimentalSimpleImputer": "._experimental_simple_imputer:ExperimentalSimpleImputer", "ExperimentalStandardScaler": "._experimental_standard_scaler:ExperimentalStandardScaler", "ExperimentalTableTransformer": "._experimental_table_transformer:ExperimentalTableTransformer", "Imputer": "._imputer:Imputer", @@ -46,11 +46,11 @@ __all__ = [ "Discretizer", "ExperimentalDiscretizer", - "ExperimentalImputer", "ExperimentalInvertibleTableTransformer", "ExperimentalLabelEncoder", "ExperimentalOneHotEncoder", "ExperimentalRangeScaler", + "ExperimentalSimpleImputer", "ExperimentalStandardScaler", "ExperimentalTableTransformer", "Imputer", diff --git a/src/safeds/data/tabular/transformation/_experimental_discretizer.py b/src/safeds/data/tabular/transformation/_experimental_discretizer.py index 82d9ccf2d..74ee3052f 100644 --- a/src/safeds/data/tabular/transformation/_experimental_discretizer.py +++ b/src/safeds/data/tabular/transformation/_experimental_discretizer.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING +from safeds.data.tabular.containers import ExperimentalTable from safeds.exceptions import ( ClosedBound, NonNumericColumnError, @@ -15,8 +16,6 @@ if TYPE_CHECKING: from sklearn.preprocessing import KBinsDiscretizer as sk_KBinsDiscretizer - from safeds.data.tabular.containers import ExperimentalTable - class ExperimentalDiscretizer(ExperimentalTableTransformer): """ @@ -90,7 +89,10 @@ def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Exper raise NonNumericColumnError(f"{column} is of type {table.get_column(column).type}.") wrapped_transformer = sk_KBinsDiscretizer(n_bins=self._number_of_bins, encode="ordinal") - wrapped_transformer.fit(table._data[column_names]) + wrapped_transformer.set_output(transform="polars") + wrapped_transformer.fit( + table.remove_columns_except(column_names)._data_frame, + ) result = ExperimentalDiscretizer(self._number_of_bins) result._wrapped_transformer = 
wrapped_transformer @@ -146,10 +148,12 @@ def transform(self, table: ExperimentalTable) -> ExperimentalTable: if not table.get_column(column).type.is_numeric: raise NonNumericColumnError(f"{column} is of type {table.get_column(column).type}.") - data = table._data.reset_index(drop=True) - data.columns = table.column_names - data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names]) - return Table._from_pandas_dataframe(data) + new_data = self._wrapped_transformer.transform( + table.remove_columns_except(self._column_names)._data_frame, + ) + return ExperimentalTable._from_polars_lazy_frame( + table._lazy_frame.update(new_data), + ) @property def is_fitted(self) -> bool: diff --git a/src/safeds/data/tabular/transformation/_experimental_invertible_table_transformer.py b/src/safeds/data/tabular/transformation/_experimental_invertible_table_transformer.py index a77429f12..9e240c050 100644 --- a/src/safeds/data/tabular/transformation/_experimental_invertible_table_transformer.py +++ b/src/safeds/data/tabular/transformation/_experimental_invertible_table_transformer.py @@ -26,7 +26,7 @@ def inverse_transform(self, transformed_table: ExperimentalTable) -> Experimenta Returns ------- - table: + original_table: The original table. 
Raises diff --git a/src/safeds/data/tabular/transformation/_experimental_label_encoder.py b/src/safeds/data/tabular/transformation/_experimental_label_encoder.py index 0ad781e7a..3156edd96 100644 --- a/src/safeds/data/tabular/transformation/_experimental_label_encoder.py +++ b/src/safeds/data/tabular/transformation/_experimental_label_encoder.py @@ -3,6 +3,7 @@ import warnings from typing import TYPE_CHECKING +from safeds.data.tabular.containers import ExperimentalTable from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError from ._experimental_invertible_table_transformer import ExperimentalInvertibleTableTransformer @@ -10,8 +11,6 @@ if TYPE_CHECKING: from sklearn.preprocessing import OrdinalEncoder as sk_OrdinalEncoder - from safeds.data.tabular.containers import ExperimentalTable - class ExperimentalLabelEncoder(ExperimentalInvertibleTableTransformer): """The LabelEncoder encodes one or more given columns into labels.""" @@ -57,17 +56,23 @@ def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Exper if table.number_of_rows == 0: raise ValueError("The LabelEncoder cannot transform the table because it contains 0 rows") - if table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().number_of_columns > 0: + if table.remove_columns_except(column_names).remove_non_numeric_columns().number_of_columns > 0: warnings.warn( "The columns" - f" {table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().column_names} contain" + f" {table.remove_columns_except(column_names).remove_non_numeric_columns().column_names} contain" " numerical data. 
The LabelEncoder is designed to encode non-numerical values into numerical values", UserWarning, stacklevel=2, ) + # TODO: use polars Enum type instead: + # my_enum = pl.Enum(['A', 'B', 'C']) <-- create this from the given order + # my_data = pl.Series(['A', 'A', 'B'], dtype=my_enum) wrapped_transformer = sk_OrdinalEncoder() - wrapped_transformer.fit(table._data[column_names]) + wrapped_transformer.set_output(transform="polars") + wrapped_transformer.fit( + table.remove_columns_except(column_names)._data_frame, + ) result = ExperimentalLabelEncoder() result._wrapped_transformer = wrapped_transformer @@ -112,10 +117,12 @@ def transform(self, table: ExperimentalTable) -> ExperimentalTable: if table.number_of_rows == 0: raise ValueError("The LabelEncoder cannot transform the table because it contains 0 rows") - data = table._data.reset_index(drop=True) - data.columns = table.column_names - data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names]) - return Table._from_pandas_dataframe(data) + new_data = self._wrapped_transformer.transform( + table.remove_columns_except(self._column_names)._data_frame, + ) + return ExperimentalTable._from_polars_lazy_frame( + table._lazy_frame.update(new_data), + ) def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: """ @@ -130,7 +137,7 @@ def inverse_transform(self, transformed_table: ExperimentalTable) -> Experimenta Returns ------- - table: + original_table: The original table. 
Raises @@ -155,26 +162,28 @@ def inverse_transform(self, transformed_table: ExperimentalTable) -> Experimenta if transformed_table.number_of_rows == 0: raise ValueError("The LabelEncoder cannot inverse transform the table because it contains 0 rows") - if transformed_table.keep_only_columns( + if transformed_table.remove_columns_except( self._column_names, - ).remove_columns_with_non_numerical_values().number_of_columns < len(self._column_names): + ).remove_non_numeric_columns().number_of_columns < len(self._column_names): raise NonNumericColumnError( str( sorted( set(self._column_names) - set( - transformed_table.keep_only_columns(self._column_names) - .remove_columns_with_non_numerical_values() + transformed_table.remove_columns_except(self._column_names) + .remove_non_numeric_columns() .column_names, ), ), ), ) - data = transformed_table._data.reset_index(drop=True) - data.columns = transformed_table.column_names - data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names]) - return Table._from_pandas_dataframe(data) + new_data = self._wrapped_transformer.inverse_transform( + transformed_table.remove_columns_except(self._column_names)._data_frame, + ) + return ExperimentalTable._from_polars_lazy_frame( + transformed_table._lazy_frame.update(new_data), + ) @property def is_fitted(self) -> bool: diff --git a/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py b/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py index 300f42894..6482d2976 100644 --- a/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py +++ b/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py @@ -4,7 +4,7 @@ from collections import Counter from typing import Any -from safeds.data.tabular.containers import Column, ExperimentalTable +from safeds.data.tabular.containers import Column, ExperimentalColumn, ExperimentalTable from safeds.exceptions import ( NonNumericColumnError, 
TransformerNotFittedError, @@ -114,23 +114,20 @@ def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Exper raise ValueError("The OneHotEncoder cannot be fitted because the table contains 0 rows") if ( - table._as_table() - .keep_only_columns(column_names) - .remove_columns_with_non_numerical_values() + table + .remove_columns_except(column_names) + .remove_non_numeric_columns() .number_of_columns > 0 ): warnings.warn( "The columns" - f" {table._as_table().keep_only_columns(column_names).remove_columns_with_non_numerical_values().column_names} contain" + f" {table.remove_columns_except(column_names).remove_non_numeric_columns().column_names} contain" " numerical data. The OneHotEncoder is designed to encode non-numerical values into numerical values", UserWarning, stacklevel=2, ) - data = table._data.reset_index(drop=True) - data.columns = table.column_names - result = ExperimentalOneHotEncoder() result._column_names = {} @@ -139,12 +136,12 @@ def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Exper # Keep track of number of occurrences of column names; # initially all old column names appear exactly once: - name_counter = Counter(data.columns) + name_counter = Counter(table.column_names) # Iterate through all columns to-be-changed: for column in column_names: result._column_names[column] = [] - for element in table.get_column(column).get_unique_values(): + for element in table.get_column(column).get_distinct_values(): base_name = f"{column}__{element}" name_counter[base_name] += 1 new_column_name = base_name @@ -223,7 +220,7 @@ def transform(self, table: ExperimentalTable) -> ExperimentalTable: values_not_present_when_fitted.append((value, old_column_name)) for new_column in self._column_names[old_column_name]: - table = table.add_columns([Column(new_column, encoded_values[new_column])]) + table = table.add_columns([ExperimentalColumn(new_column, encoded_values[new_column])]) if len(values_not_present_when_fitted) > 0: 
raise ValueNotPresentWhenFittedError(values_not_present_when_fitted) @@ -242,10 +239,7 @@ def transform(self, table: ExperimentalTable) -> ExperimentalTable: # Drop old, non-encoded columns: # (Don't do this earlier - we need the old column nams for sorting, # plus we need to prevent the table from possibly having 0 columns temporarily.) - table = table.remove_columns(list(self._column_names.keys())) - - # Apply sorting and return: - return table.sort_columns(lambda col1, col2: column_names.index(col1.name) - column_names.index(col2.name)) + return table.remove_columns(list(self._column_names.keys())) def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: """ @@ -289,16 +283,16 @@ def inverse_transform(self, transformed_table: ExperimentalTable) -> Experimenta if transformed_table.number_of_rows == 0: raise ValueError("The OneHotEncoder cannot inverse transform the table because it contains 0 rows") - if transformed_table._as_table().keep_only_columns( + if transformed_table.remove_columns_except( _transformed_column_names, - ).remove_columns_with_non_numerical_values().number_of_columns < len(_transformed_column_names): + ).remove_non_numeric_columns().number_of_columns < len(_transformed_column_names): raise NonNumericColumnError( str( sorted( set(_transformed_column_names) - set( - transformed_table.keep_only_columns(_transformed_column_names) - .remove_columns_with_non_numerical_values() + transformed_table.remove_columns_except(_transformed_column_names) + .remove_non_numeric_columns() .column_names, ), ), @@ -326,26 +320,9 @@ def inverse_transform(self, transformed_table: ExperimentalTable) -> Experimenta for column_name, encoded_column in original_columns.items(): table = table.add_column(Column(column_name, encoded_column)) - column_names = [ - ( - name - if name not in [value for value_list in list(self._column_names.values()) for value in value_list] - else list(self._column_names.keys())[ - next( - 
list(self._column_names.values()).index(value) - for value in list(self._column_names.values()) - if name in value - ) - ] - ) - for name in table.column_names - ] - # Drop old column names: table = table.remove_columns(list(self._value_to_column.values())) - table = table.remove_columns(list(self._value_to_column_nans.values())) - - return table.sort_columns(lambda col1, col2: column_names.index(col1.name) - column_names.index(col2.name)) + return table.remove_columns(list(self._value_to_column_nans.values())) @property def is_fitted(self) -> bool: diff --git a/src/safeds/data/tabular/transformation/_experimental_range_scaler.py b/src/safeds/data/tabular/transformation/_experimental_range_scaler.py index e701921c2..f1a6b0be4 100644 --- a/src/safeds/data/tabular/transformation/_experimental_range_scaler.py +++ b/src/safeds/data/tabular/transformation/_experimental_range_scaler.py @@ -77,16 +77,16 @@ def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Exper raise ValueError("The RangeScaler cannot be fitted because the table contains 0 rows") if ( - table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().number_of_columns - < table.keep_only_columns(column_names).number_of_columns + table.remove_columns_except(column_names).remove_non_numeric_columns().number_of_columns + < table.remove_columns_except(column_names).number_of_columns ): raise NonNumericColumnError( str( sorted( - set(table.keep_only_columns(column_names).column_names) + set(table.remove_columns_except(column_names).column_names) - set( - table.keep_only_columns(column_names) - .remove_columns_with_non_numerical_values() + table.remove_columns_except(column_names) + .remove_non_numeric_columns() .column_names, ), ), @@ -94,7 +94,10 @@ def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Exper ) wrapped_transformer = sk_MinMaxScaler((self._minimum, self._maximum)) - wrapped_transformer.fit(table._data[column_names]) + 
wrapped_transformer.set_output(transform="polars") + wrapped_transformer.fit( + table.remove_columns_except(column_names)._data_frame, + ) result = ExperimentalRangeScaler() result._wrapped_transformer = wrapped_transformer @@ -142,26 +145,28 @@ def transform(self, table: ExperimentalTable) -> ExperimentalTable: raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows") if ( - table.keep_only_columns(self._column_names).remove_columns_with_non_numerical_values().number_of_columns - < table.keep_only_columns(self._column_names).number_of_columns + table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns + < table.remove_columns_except(self._column_names).number_of_columns ): raise NonNumericColumnError( str( sorted( - set(table.keep_only_columns(self._column_names).column_names) + set(table.remove_columns_except(self._column_names).column_names) - set( - table.keep_only_columns(self._column_names) - .remove_columns_with_non_numerical_values() + table.remove_columns_except(self._column_names) + .remove_non_numeric_columns() .column_names, ), ), ), ) - data = table._data.reset_index(drop=True) - data.columns = table.column_names - data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names]) - return Table._from_pandas_dataframe(data) + new_data = self._wrapped_transformer.transform( + table.remove_columns_except(self._column_names)._data_frame, + ) + return ExperimentalTable._from_polars_lazy_frame( + table._lazy_frame.update(new_data), + ) def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: """ @@ -176,7 +181,7 @@ def inverse_transform(self, transformed_table: ExperimentalTable) -> Experimenta Returns ------- - table: + original_table: The original table. 
Raises @@ -202,28 +207,30 @@ def inverse_transform(self, transformed_table: ExperimentalTable) -> Experimenta raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows") if ( - transformed_table.keep_only_columns(self._column_names) - .remove_columns_with_non_numerical_values() + transformed_table.remove_columns_except(self._column_names) + .remove_non_numeric_columns() .number_of_columns - < transformed_table.keep_only_columns(self._column_names).number_of_columns + < transformed_table.remove_columns_except(self._column_names).number_of_columns ): raise NonNumericColumnError( str( sorted( - set(transformed_table.keep_only_columns(self._column_names).column_names) + set(transformed_table.remove_columns_except(self._column_names).column_names) - set( - transformed_table.keep_only_columns(self._column_names) - .remove_columns_with_non_numerical_values() + transformed_table.remove_columns_except(self._column_names) + .remove_non_numeric_columns() .column_names, ), ), ), ) - data = transformed_table._data.reset_index(drop=True) - data.columns = transformed_table.column_names - data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names]) - return Table._from_pandas_dataframe(data) + new_data = self._wrapped_transformer.inverse_transform( + transformed_table.remove_columns_except(self._column_names)._data_frame, + ) + return ExperimentalTable._from_polars_lazy_frame( + transformed_table._lazy_frame.update(new_data), + ) @property def is_fitted(self) -> bool: diff --git a/src/safeds/data/tabular/transformation/_experimental_imputer.py b/src/safeds/data/tabular/transformation/_experimental_simple_imputer.py similarity index 84% rename from src/safeds/data/tabular/transformation/_experimental_imputer.py rename to src/safeds/data/tabular/transformation/_experimental_simple_imputer.py index 5cf5f40ab..4df15e978 100644 --- a/src/safeds/data/tabular/transformation/_experimental_imputer.py +++ 
b/src/safeds/data/tabular/transformation/_experimental_simple_imputer.py @@ -8,6 +8,7 @@ import pandas as pd from safeds._utils import _structural_hash +from safeds.data.tabular.containers import ExperimentalTable from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError from ._experimental_table_transformer import ExperimentalTableTransformer @@ -15,10 +16,8 @@ if TYPE_CHECKING: from sklearn.impute import SimpleImputer as sk_SimpleImputer - from safeds.data.tabular.containers import ExperimentalTable - -class ExperimentalImputer(ExperimentalTableTransformer): +class ExperimentalSimpleImputer(ExperimentalTableTransformer): """ Replace missing values using the given strategy. @@ -67,7 +66,7 @@ def _apply(self, imputer: sk_SimpleImputer) -> None: """ @staticmethod - def Constant(value: Any) -> ExperimentalImputer.Strategy: # noqa: N802 + def Constant(value: Any) -> ExperimentalSimpleImputer.Strategy: # noqa: N802 """ Replace missing values with the given constant value. 
@@ -79,21 +78,21 @@ def Constant(value: Any) -> ExperimentalImputer.Strategy: # noqa: N802 return _Constant(value) # pragma: no cover @staticmethod - def Mean() -> ExperimentalImputer.Strategy: # noqa: N802 + def Mean() -> ExperimentalSimpleImputer.Strategy: # noqa: N802 """Replace missing values with the mean of each column.""" return _Mean() # pragma: no cover @staticmethod - def Median() -> ExperimentalImputer.Strategy: # noqa: N802 + def Median() -> ExperimentalSimpleImputer.Strategy: # noqa: N802 """Replace missing values with the median of each column.""" return _Median() # pragma: no cover @staticmethod - def Mode() -> ExperimentalImputer.Strategy: # noqa: N802 + def Mode() -> ExperimentalSimpleImputer.Strategy: # noqa: N802 """Replace missing values with the mode of each column.""" return _Mode() # pragma: no cover - def __init__(self, strategy: ExperimentalImputer.Strategy, *, value_to_replace: float | str | None = None): + def __init__(self, strategy: ExperimentalSimpleImputer.Strategy, *, value_to_replace: float | str | None = None): if value_to_replace is None: value_to_replace = pd.NA @@ -104,7 +103,7 @@ def __init__(self, strategy: ExperimentalImputer.Strategy, *, value_to_replace: self._column_names: list[str] | None = None @property - def strategy(self) -> ExperimentalImputer.Strategy: + def strategy(self) -> ExperimentalSimpleImputer.Strategy: """The strategy used to replace missing values.""" return self._strategy @@ -118,7 +117,7 @@ def is_fitted(self) -> bool: """Whether the transformer is fitted.""" return self._wrapped_transformer is not None - def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalImputer: + def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalSimpleImputer: """ Learn a transformation for a set of columns in a table. 
@@ -157,18 +156,18 @@ def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Exper if table.number_of_rows == 0: raise ValueError("The Imputer cannot be fitted because the table contains 0 rows") - if (isinstance(self._strategy, _Mean | _Median)) and table.keep_only_columns( + if (isinstance(self._strategy, _Mean | _Median)) and table.remove_columns_except( column_names, - ).remove_columns_with_non_numerical_values().number_of_columns < len( + ).remove_non_numeric_columns().number_of_columns < len( column_names, ): raise NonNumericColumnError( str( sorted( - set(table.keep_only_columns(column_names).column_names) + set(table.remove_columns_except(column_names).column_names) - set( - table.keep_only_columns(column_names) - .remove_columns_with_non_numerical_values() + table.remove_columns_except(column_names) + .remove_non_numeric_columns() .column_names, ), ), @@ -189,12 +188,14 @@ def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Exper stacklevel=2, ) - wrapped_transformer = sk_SimpleImputer() + wrapped_transformer = sk_SimpleImputer(missing_values=self._value_to_replace) self._strategy._apply(wrapped_transformer) - wrapped_transformer.missing_values = self._value_to_replace - wrapped_transformer.fit(table._data[column_names]) + wrapped_transformer.set_output(transform="polars") + wrapped_transformer.fit( + table.remove_columns_except(column_names)._data_frame, + ) - result = ExperimentalImputer(self._strategy) + result = ExperimentalSimpleImputer(self._strategy) result._wrapped_transformer = wrapped_transformer result._column_names = column_names @@ -225,8 +226,6 @@ def transform(self, table: ExperimentalTable) -> ExperimentalTable: ValueError If the table contains 0 rows. 
""" - import pandas as pd - # Transformer has not been fitted yet if self._wrapped_transformer is None or self._column_names is None: raise TransformerNotFittedError @@ -239,12 +238,10 @@ def transform(self, table: ExperimentalTable) -> ExperimentalTable: if table.number_of_rows == 0: raise ValueError("The Imputer cannot transform the table because it contains 0 rows") - data = table._data.reset_index(drop=True) - data[self._column_names] = pd.DataFrame( - self._wrapped_transformer.transform(data[self._column_names]), - columns=self._column_names, + new_data = self._wrapped_transformer.transform(table.remove_columns_except(self._column_names)._data_frame) + return ExperimentalTable._from_polars_lazy_frame( + table._lazy_frame.update(new_data), ) - return Table._from_pandas_dataframe(data, table.schema) def get_names_of_added_columns(self) -> list[str]: """ @@ -306,7 +303,7 @@ def get_names_of_removed_columns(self) -> list[str]: # ---------------------------------------------------------------------------------------------------------------------- -class _Constant(ExperimentalImputer.Strategy): +class _Constant(ExperimentalSimpleImputer.Strategy): def __init__(self, value: Any): self._value = value @@ -335,7 +332,7 @@ def _apply(self, imputer: sk_SimpleImputer) -> None: imputer.fill_value = self._value -class _Mean(ExperimentalImputer.Strategy): +class _Mean(ExperimentalSimpleImputer.Strategy): def __eq__(self, other: object) -> bool: if not isinstance(other, _Mean): return NotImplemented @@ -351,7 +348,7 @@ def _apply(self, imputer: sk_SimpleImputer) -> None: imputer.strategy = "mean" -class _Median(ExperimentalImputer.Strategy): +class _Median(ExperimentalSimpleImputer.Strategy): def __eq__(self, other: object) -> bool: if not isinstance(other, _Median): return NotImplemented @@ -367,7 +364,7 @@ def _apply(self, imputer: sk_SimpleImputer) -> None: imputer.strategy = "median" -class _Mode(ExperimentalImputer.Strategy): +class 
_Mode(ExperimentalSimpleImputer.Strategy): def __eq__(self, other: object) -> bool: if not isinstance(other, _Mode): return NotImplemented @@ -385,7 +382,7 @@ def _apply(self, imputer: sk_SimpleImputer) -> None: # Override the methods with classes, so they can be used in `isinstance` calls. Unlike methods, classes define a type. # This is needed for the DSL, where imputer strategies are variants of an enum. -ExperimentalImputer.Strategy.Constant = _Constant # type: ignore[method-assign] -ExperimentalImputer.Strategy.Mean = _Mean # type: ignore[method-assign] -ExperimentalImputer.Strategy.Median = _Median # type: ignore[method-assign] -ExperimentalImputer.Strategy.Mode = _Mode # type: ignore[method-assign] +ExperimentalSimpleImputer.Strategy.Constant = _Constant # type: ignore[method-assign] +ExperimentalSimpleImputer.Strategy.Mean = _Mean # type: ignore[method-assign] +ExperimentalSimpleImputer.Strategy.Median = _Median # type: ignore[method-assign] +ExperimentalSimpleImputer.Strategy.Mode = _Mode # type: ignore[method-assign] diff --git a/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py b/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py index a17e5d941..ba1fc8c34 100644 --- a/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py +++ b/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING +from safeds.data.tabular.containers import ExperimentalTable from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError from ._experimental_invertible_table_transformer import ExperimentalInvertibleTableTransformer @@ -9,8 +10,6 @@ if TYPE_CHECKING: from sklearn.preprocessing import StandardScaler as sk_StandardScaler - from safeds.data.tabular.containers import ExperimentalTable - class ExperimentalStandardScaler(ExperimentalInvertibleTableTransformer): """The StandardScaler transforms column values to a 
range by removing the mean and scaling to unit variance.""" @@ -59,16 +58,16 @@ def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Exper raise ValueError("The StandardScaler cannot be fitted because the table contains 0 rows") if ( - table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().number_of_columns - < table.keep_only_columns(column_names).number_of_columns + table.remove_columns_except(column_names).remove_non_numeric_columns().number_of_columns + < table.remove_columns_except(column_names).number_of_columns ): raise NonNumericColumnError( str( sorted( - set(table.keep_only_columns(column_names).column_names) + set(table.remove_columns_except(column_names).column_names) - set( - table.keep_only_columns(column_names) - .remove_columns_with_non_numerical_values() + table.remove_columns_except(column_names) + .remove_non_numeric_columns() .column_names, ), ), @@ -76,7 +75,10 @@ def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Exper ) wrapped_transformer = sk_StandardScaler() - wrapped_transformer.fit(table._data[column_names]) + wrapped_transformer.set_output(transform="polars") + wrapped_transformer.fit( + table.remove_columns_except(column_names)._data_frame, + ) result = ExperimentalStandardScaler() result._wrapped_transformer = wrapped_transformer @@ -124,26 +126,28 @@ def transform(self, table: ExperimentalTable) -> ExperimentalTable: raise ValueError("The StandardScaler cannot transform the table because it contains 0 rows") if ( - table.keep_only_columns(self._column_names).remove_columns_with_non_numerical_values().number_of_columns - < table.keep_only_columns(self._column_names).number_of_columns + table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns + < table.remove_columns_except(self._column_names).number_of_columns ): raise NonNumericColumnError( str( sorted( - set(table.keep_only_columns(self._column_names).column_names) + 
set(table.remove_columns_except(self._column_names).column_names) - set( - table.keep_only_columns(self._column_names) - .remove_columns_with_non_numerical_values() + table.remove_columns_except(self._column_names) + .remove_non_numeric_columns() .column_names, ), ), ), ) - data = table._data.reset_index(drop=True) - data.columns = table.column_names - data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names]) - return Table._from_pandas_dataframe(data) + new_data = self._wrapped_transformer.transform( + table.remove_columns_except(self._column_names)._data_frame, + ) + return ExperimentalTable._from_polars_lazy_frame( + table._lazy_frame.update(new_data), + ) def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: """ @@ -158,7 +162,7 @@ def inverse_transform(self, transformed_table: ExperimentalTable) -> Experimenta Returns ------- - table: + original_table: The original table. Raises @@ -184,28 +188,30 @@ def inverse_transform(self, transformed_table: ExperimentalTable) -> Experimenta raise ValueError("The StandardScaler cannot transform the table because it contains 0 rows") if ( - transformed_table.keep_only_columns(self._column_names) - .remove_columns_with_non_numerical_values() + transformed_table.remove_columns_except(self._column_names) + .remove_non_numeric_columns() .number_of_columns - < transformed_table.keep_only_columns(self._column_names).number_of_columns + < transformed_table.remove_columns_except(self._column_names).number_of_columns ): raise NonNumericColumnError( str( sorted( - set(transformed_table.keep_only_columns(self._column_names).column_names) + set(transformed_table.remove_columns_except(self._column_names).column_names) - set( - transformed_table.keep_only_columns(self._column_names) - .remove_columns_with_non_numerical_values() + transformed_table.remove_columns_except(self._column_names) + .remove_non_numeric_columns() .column_names, ), ), ), ) - data = 
transformed_table._data.reset_index(drop=True) - data.columns = transformed_table.column_names - data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names]) - return Table._from_pandas_dataframe(data) + new_data = self._wrapped_transformer.inverse_transform( + transformed_table.remove_columns_except(self._column_names)._data_frame, + ) + return ExperimentalTable._from_polars_lazy_frame( + transformed_table._lazy_frame.update(new_data), + ) @property def is_fitted(self) -> bool: diff --git a/src/safeds/data/tabular/transformation/_experimental_table_transformer.py b/src/safeds/data/tabular/transformation/_experimental_table_transformer.py index a8a7d1cd8..a853b56d6 100644 --- a/src/safeds/data/tabular/transformation/_experimental_table_transformer.py +++ b/src/safeds/data/tabular/transformation/_experimental_table_transformer.py @@ -12,6 +12,10 @@ class ExperimentalTableTransformer(ABC): """Learn a transformation for a set of columns in a `Table` and transform another `Table` with the same columns.""" + # ------------------------------------------------------------------------------------------------------------------ + # Dunder methods + # ------------------------------------------------------------------------------------------------------------------ + def __hash__(self) -> int: """ Return a deterministic hash value for a table transformer. 
@@ -26,11 +30,19 @@ def __hash__(self) -> int: removed = self.get_names_of_removed_columns() if self.is_fitted else [] return _structural_hash(self.__class__.__qualname__, self.is_fitted, added, changed, removed) + # ------------------------------------------------------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------------------------------------------------------ + @property @abstractmethod def is_fitted(self) -> bool: """Whether the transformer is fitted.""" + # ------------------------------------------------------------------------------------------------------------------ + # Methods + # ------------------------------------------------------------------------------------------------------------------ + @abstractmethod def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Self: """ @@ -122,7 +134,10 @@ def get_names_of_removed_columns(self) -> list[str]: If the transformer has not been fitted yet. """ - def fit_and_transform(self, table: ExperimentalTable, column_names: list[str] | None = None) -> tuple[Self, ExperimentalTable]: + def fit_and_transform( + self, + table: ExperimentalTable, column_names: list[str] | None = None + ) -> tuple[Self, ExperimentalTable]: """ Learn a transformation for a set of columns in a table and apply the learned transformation to the same table. @@ -137,8 +152,10 @@ def fit_and_transform(self, table: ExperimentalTable, column_names: list[str] | Returns ------- - fitted_transformer, transformed_table: - The fitted transformer and the transformed table.: + fitted_transformer: + The fitted transformer. + transformed_table: + The transformed table. 
""" fitted_transformer = self.fit(table, column_names) transformed_table = fitted_transformer.transform(table) diff --git a/src/safeds/ml/classical/_util_sklearn.py b/src/safeds/ml/classical/_util_sklearn.py index 6f0cabae5..65be0f14d 100644 --- a/src/safeds/ml/classical/_util_sklearn.py +++ b/src/safeds/ml/classical/_util_sklearn.py @@ -210,7 +210,7 @@ def predict( " remove the missing values entirely you can use the method `Table.remove_rows_with_missing_values`.", ) - dataset_df = dataset.remove_columns_by_name(feature_names, keep_only_listed=True) + dataset_df = dataset.remove_columns_except(feature_names) try: with warnings.catch_warnings(): From 6e3843a868084cf58bc47f0642a45d4f808084a9 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 19:24:58 +0200 Subject: [PATCH 17/40] ci: ruff config in MegaLinter? --- .mega-linter.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mega-linter.yml b/.mega-linter.yml index 87d022827..df7608fd1 100644 --- a/.mega-linter.yml +++ b/.mega-linter.yml @@ -15,6 +15,8 @@ JSON_PRETTIER_FILE_EXTENSIONS: - .html # - .md +PYTHON_RUFF_CONFIG_FILE: pyproject.toml + # Commands PRE_COMMANDS: - command: npm i @lars-reimann/prettier-config From 27f84951448984d77221dcf44d624af296277314 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 19:25:12 +0200 Subject: [PATCH 18/40] feat: use ASCII to format tables --- .../table/utils/create_synthetic_table.py | 3 +- .../utils/create_synthetic_table_polars.py | 3 +- src/resources/from_json_file_2.json | 1 + src/resources/from_parquet_file.parquet | Bin 0 -> 1686 bytes src/resources/to_excel_file.xlsx | Bin 4968 -> 4969 bytes src/resources/to_json_file.json | 2 +- src/resources/to_json_file_2.json | 1 + src/resources/to_parquet_file.parquet | Bin 0 -> 1686 bytes src/safeds/_config/__init__.py | 6 +- src/safeds/_config/_polars.py | 17 +++ src/safeds/_config/{_device.py => _torch.py} | 4 +- .../containers/_experimental_column.py | 100 ++++++++----- 
.../tabular/containers/_experimental_table.py | 134 +++++++++--------- .../_experimental_one_hot_encoder.py | 2 +- 14 files changed, 157 insertions(+), 116 deletions(-) create mode 100644 src/resources/from_json_file_2.json create mode 100644 src/resources/from_parquet_file.parquet create mode 100644 src/resources/to_json_file_2.json create mode 100644 src/resources/to_parquet_file.parquet create mode 100644 src/safeds/_config/_polars.py rename src/safeds/_config/{_device.py => _torch.py} (88%) diff --git a/benchmarks/table/utils/create_synthetic_table.py b/benchmarks/table/utils/create_synthetic_table.py index d1ad47d6e..9c201a098 100644 --- a/benchmarks/table/utils/create_synthetic_table.py +++ b/benchmarks/table/utils/create_synthetic_table.py @@ -10,7 +10,8 @@ def create_synthetic_table( min_value: int = 0, max_value: int = 1000, ) -> Table: - """Create a synthetic Table with random numerical data. + """ + Create a synthetic Table with random numerical data. Parameters ---------- diff --git a/benchmarks/table/utils/create_synthetic_table_polars.py b/benchmarks/table/utils/create_synthetic_table_polars.py index d1425c851..34a354b13 100644 --- a/benchmarks/table/utils/create_synthetic_table_polars.py +++ b/benchmarks/table/utils/create_synthetic_table_polars.py @@ -10,7 +10,8 @@ def create_synthetic_table_polars( min_value: int = 0, max_value: int = 1000, ) -> ExperimentalTable: - """Create a synthetic Table with random numerical data. + """ + Create a synthetic Table with random numerical data. 
Parameters ---------- diff --git a/src/resources/from_json_file_2.json b/src/resources/from_json_file_2.json new file mode 100644 index 000000000..8be2e957d --- /dev/null +++ b/src/resources/from_json_file_2.json @@ -0,0 +1 @@ +{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"b","datatype":"Int64","bit_settings":"","values":[4,5,6]}]} diff --git a/src/resources/from_parquet_file.parquet b/src/resources/from_parquet_file.parquet new file mode 100644 index 0000000000000000000000000000000000000000..75234fd67f216314b8c3414fe05e6fda32ab8dde GIT binary patch literal 1686 zcmaJ?&ui2`6rN17G3vG;I70$CcxfoOxC&ycEW)HNwTLub1WAy`L90J#&9N|T4%#T86DB6x9c7DG8g%W}k2a@OjmEiFeW zcAGjmw~`~YEJsWwXRU7GYdJ~@148v5)34+REz1#8$yuw*i=#p*VL*tn=o*jA0koiC zktCY78fbzxO`8NXL7S$J3N%5RrlkW-(5A^M&;(05mVuZ!V3I|#oI3{3T!W`AgBQNR zIn@dt^&WdiqtQpShM~@iFo{N*$DW09E=FgA^aT zCy=zMn=n9x=yJ_x1p%3l?y9S`Ud;Kj6TvsY_);MF literal 0 HcmV?d00001 diff --git a/src/resources/to_excel_file.xlsx b/src/resources/to_excel_file.xlsx index d28e5f48f9af4271c75c38d275d54a0b24cc28d6..42a6af394d3b16894cd955e1ab3cf0e2677482e8 100644 GIT binary patch delta 576 zcmaE%_EL>Ez?+#xgn@y9gMoeaiix~yIe}C}#6y(_ZzdkrtY4-r-?b@bMi~=lr>8<* z;GSzeycV)H1cX?<>cw#qsxx3p0)W`ZOPwP`#IJc zpIx`}P=%NWWBothIz|-VN6d_wqb$J2z%WOhfk7A;CKa1sGU~7~14Cr91jk-RFyjX2 zc_t8J@+zKZV1dcJQDB+LANU@D1+MaYLj;rstr4L)IbF~UtZ#v!G~?;XdjyrhyoW#@ z&*Z;?>QSI@|NgC?b2$?OLoPQ112514Fwnqwh7;(V9R1>w%AC|G(Avpz*WyY@`?^+8~_nVP{K?<&( hfuVu%3E$-Zd@}rKR!mk9mS#LT*+y86O;ix19sso|(Y62p delta 575 zcmaE<_Ck#}z?+#xgn@y9gMn}H@`=1_Ie}Ef1JL)gQ^hpDhz zCVzE?-4uzfzrSxueeF>2SYm&_{OztI3K!g~y6gg;v|U=XeXTdsVXNE5?2Ho%{KJ+f zDcljrtKBlIJ^ha3g&*t6SxO`0npCg3ZEg#k(SCD=>{Pb&)E^yhRSY<;Oj140xoT09 z@oACVUz-!X(l+^@^(n274KF#}DQ&ju!Pk!uE2Hf4W0KxZFjD2^>REEr-84>wuU7kE zbf8>%z32&@HqIvg#%nh6;@qe8y_e^d- zlRq}u!Sl!0{8z##o{yLrGe=o~je%i~Is=0+Fi0wLHos=nW@841$Yx27y^LVSP0sU7 zAjag?JkP)aQ+T7mGLt{@Jpv0{F;PDx;-Tigf0iaSbpmv~cIA~y;JGov+3hbacLeh+FlXnQofq6HD eq?sP`P2!i~M>A`(qOdgMiOIIYYHT8cAcFwz#mf=^ diff 
--git a/src/resources/to_json_file.json b/src/resources/to_json_file.json index 5965ef149..b45718124 100644 --- a/src/resources/to_json_file.json +++ b/src/resources/to_json_file.json @@ -1 +1 @@ -{ "a": { "0": 1, "1": 2, "2": 3 }, "b": { "0": 4, "1": 5, "2": 6 } } +{"a":{"0":1,"1":2,"2":3},"b":{"0":4,"1":5,"2":6}} \ No newline at end of file diff --git a/src/resources/to_json_file_2.json b/src/resources/to_json_file_2.json new file mode 100644 index 000000000..29d04896f --- /dev/null +++ b/src/resources/to_json_file_2.json @@ -0,0 +1 @@ +{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"b","datatype":"Int64","bit_settings":"","values":[4,5,6]}]} \ No newline at end of file diff --git a/src/resources/to_parquet_file.parquet b/src/resources/to_parquet_file.parquet new file mode 100644 index 0000000000000000000000000000000000000000..75234fd67f216314b8c3414fe05e6fda32ab8dde GIT binary patch literal 1686 zcmaJ?&ui2`6rN17G3vG;I70$CcxfoOxC&ycEW)HNwTLub1WAy`L90J#&9N|T4%#T86DB6x9c7DG8g%W}k2a@OjmEiFeW zcAGjmw~`~YEJsWwXRU7GYdJ~@148v5)34+REz1#8$yuw*i=#p*VL*tn=o*jA0koiC zktCY78fbzxO`8NXL7S$J3N%5RrlkW-(5A^M&;(05mVuZ!V3I|#oI3{3T!W`AgBQNR zIn@dt^&WdiqtQpShM~@iFo{N*$DW09E=FgA^aT zCy=zMn=n9x=yJ_x1p%3l?y9S`Ud;Kj6TvsY_);MF literal 0 HcmV?d00001 diff --git a/src/safeds/_config/__init__.py b/src/safeds/_config/__init__.py index bd7b5188a..3ebf733ba 100644 --- a/src/safeds/_config/__init__.py +++ b/src/safeds/_config/__init__.py @@ -5,13 +5,13 @@ import apipkg if TYPE_CHECKING: - from ._device import _get_device, _init_default_device + from ._torch import _get_device, _init_default_device apipkg.initpkg( __name__, { - "_get_device": "._device:_get_device", - "_init_default_device": "._device:_init_default_device", + "_get_device": "._torch:_get_device", + "_init_default_device": "._torch:_init_default_device", }, ) diff --git a/src/safeds/_config/_polars.py b/src/safeds/_config/_polars.py new file mode 100644 index 000000000..02b595994 --- /dev/null 
+++ b/src/safeds/_config/_polars.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import polars as pl + + +def _get_polars_config() -> pl.Config: + import polars as pl + + return pl.Config( + float_precision=5, + tbl_cell_numeric_alignment="RIGHT", + tbl_formatting="ASCII_FULL_CONDENSED", + tbl_hide_dataframe_shape=True, + ) diff --git a/src/safeds/_config/_device.py b/src/safeds/_config/_torch.py similarity index 88% rename from src/safeds/_config/_device.py rename to src/safeds/_config/_torch.py index 3fc1db282..63b79ea80 100644 --- a/src/safeds/_config/_device.py +++ b/src/safeds/_config/_torch.py @@ -18,7 +18,7 @@ def _get_device() -> Device: def _init_default_device() -> None: import torch - global _default_device + global _default_device # noqa: PLW0603 if _default_device is None: _default_device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") @@ -28,7 +28,7 @@ def _init_default_device() -> None: def _set_default_device(device: Device) -> None: # This changes all future tensors, but not any tensor that already exists - global _default_device + global _default_device # noqa: PLW0603 _default_device = device _init_default_device() diff --git a/src/safeds/data/tabular/containers/_experimental_column.py b/src/safeds/data/tabular/containers/_experimental_column.py index 8507c5266..95628e0fc 100644 --- a/src/safeds/data/tabular/containers/_experimental_column.py +++ b/src/safeds/data/tabular/containers/_experimental_column.py @@ -39,7 +39,15 @@ class ExperimentalColumn(Sequence[T]): -------- >>> from safeds.data.tabular.containers import ExperimentalColumn >>> ExperimentalColumn("test", [1, 2, 3]) - Column('test', [1, 2, 3]) + +------+ + | test | + | --- | + | i64 | + +======+ + | 1 | + | 2 | + | 3 | + +------+ """ # ------------------------------------------------------------------------------------------------------------------ @@ -97,20 +105,13 @@ def __len__(self) -> 
int: return self.number_of_rows def __repr__(self) -> str: - import polars as pl - - if self.number_of_rows <= 50: - data = self._series.to_list() - else: - data = f"[{', '.join(self._series.slice(0, 50).cast(pl.String).to_list())}, ...]" - - return f"Column({self.name!r}, {data})" + return self.to_table().__repr__() def __sizeof__(self) -> int: return self._series.estimated_size() def __str__(self) -> str: - return self.__repr__() + return self.to_table().__str__() # ------------------------------------------------------------------------------------------------------------------ # Properties @@ -376,7 +377,15 @@ def rename(self, new_name: str) -> ExperimentalColumn[T]: >>> from safeds.data.tabular.containers import ExperimentalColumn >>> column = ExperimentalColumn("test", [1, 2, 3]) >>> column.rename("new_name") - Column('new_name', [1, 2, 3]) + +----------+ + | new_name | + | --- | + | i64 | + +==========+ + | 1 | + | 2 | + | 3 | + +----------+ """ return self._from_polars_series(self._series.rename(new_name)) @@ -404,7 +413,15 @@ def transform( >>> from safeds.data.tabular.containers import ExperimentalColumn >>> column = ExperimentalColumn("test", [1, 2, 3]) >>> column.transform(lambda cell: 2 * cell) - Column('test', [2, 4, 6]) + +------+ + | test | + | --- | + | i64 | + +======+ + | 2 | + | 4 | + | 6 | + +------+ """ result = transformer(_VectorizedCell(self)) if not isinstance(result, _VectorizedCell): @@ -430,22 +447,21 @@ def summarize_statistics(self) -> ExperimentalTable: >>> from safeds.data.tabular.containers import ExperimentalColumn >>> column = ExperimentalColumn("a", [1, 3]) >>> column.summarize_statistics() - shape: (9, 2) - ┌──────────────────────┬──────────┐ - │ metric ┆ a │ - │ --- ┆ --- │ - │ str ┆ f64 │ - ╞══════════════════════╪══════════╡ - │ min ┆ 1.0 │ - │ max ┆ 3.0 │ - │ mean ┆ 2.0 │ - │ median ┆ 2.0 │ - │ standard deviation ┆ 1.414214 │ - │ distinct value count ┆ 2.0 │ - │ idness ┆ 1.0 │ - │ missing value ratio ┆ 0.0 │ - │ stability ┆ 
0.5 │ - └──────────────────────┴──────────┘ + +----------------------+--------------------+ + | metric | a | + | --- | --- | + | str | str | + +===========================================+ + | min | 1 | + | max | 3 | + | mean | 2.0 | + | median | 2.0 | + | standard deviation | 1.4142135623730951 | + | distinct value count | 2 | + | idness | 1.0 | + | missing value ratio | 0.0 | + | stability | 0.5 | + +----------------------+--------------------+ """ from ._experimental_table import ExperimentalTable @@ -701,7 +717,14 @@ def mode(self) -> ExperimentalColumn[T]: >>> from safeds.data.tabular.containers import ExperimentalColumn >>> column = ExperimentalColumn("test", [3, 1, 2, 1, 3]) >>> column.mode() - Column('test', [1, 3]) + +------+ + | test | + | --- | + | i64 | + +======+ + | 1 | + | 3 | + +------+ """ return self._from_polars_series(self._series.mode().sort()) @@ -823,16 +846,15 @@ def to_table(self) -> ExperimentalTable: >>> from safeds.data.tabular.containers import ExperimentalColumn >>> column = ExperimentalColumn("test", [1, 2, 3]) >>> column.to_table() - shape: (3, 1) - ┌──────┐ - │ test │ - │ --- │ - │ i64 │ - ╞══════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └──────┘ + +------+ + | test | + | --- | + | i64 | + +======+ + | 1 | + | 2 | + | 3 | + +------+ """ from ._experimental_table import ExperimentalTable diff --git a/src/safeds/data/tabular/containers/_experimental_table.py b/src/safeds/data/tabular/containers/_experimental_table.py index 9fc32debb..2c52e0b88 100644 --- a/src/safeds/data/tabular/containers/_experimental_table.py +++ b/src/safeds/data/tabular/containers/_experimental_table.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING, Any, Literal +from safeds._config._polars import _get_polars_config from safeds._utils import _check_and_normalize_file_path, _structural_hash from safeds._utils._random import _get_random_seed from safeds.data.labeled.containers import ExperimentalTabularDataset @@ -109,15 +110,14 @@ def from_csv_file(path: str | Path) -> 
ExperimentalTable: -------- >>> from safeds.data.tabular.containers import ExperimentalTable >>> ExperimentalTable.from_csv_file("./src/resources/from_csv_file.csv") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 2 ┆ 1 │ - │ 0 ┆ 0 ┆ 7 │ - └─────┴─────┴─────┘ + +-----+-----+-----+ + | a | b | c | + | --- | --- | --- | + | i64 | i64 | i64 | + +=================+ + | 1 | 2 | 1 | + | 0 | 0 | 7 | + +-----+-----+-----+ """ import polars as pl @@ -149,16 +149,15 @@ def from_dict(data: dict[str, list[Any]]) -> ExperimentalTable: >>> from safeds.data.tabular.containers import ExperimentalTable >>> data = {'a': [1, 2, 3], 'b': [4, 5, 6]} >>> ExperimentalTable.from_dict(data) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 5 │ - │ 3 ┆ 6 │ - └─────┴─────┘ + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ """ return ExperimentalTable(data) @@ -187,16 +186,16 @@ def from_json_file(path: str | Path) -> ExperimentalTable: Examples -------- >>> from safeds.data.tabular.containers import ExperimentalTable - >>> ExperimentalTable.from_json_file("./src/resources/from_json_file.json") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 2 ┆ 1 │ - │ 0 ┆ 0 ┆ 7 │ - └─────┴─────┴─────┘ + >>> ExperimentalTable.from_json_file("./src/resources/from_json_file_2.json") + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ """ import polars as pl @@ -229,15 +228,15 @@ def from_parquet_file(path: str | Path) -> ExperimentalTable: -------- >>> from safeds.data.tabular.containers import ExperimentalTable >>> ExperimentalTable.from_parquet_file("./src/resources/from_parquet_file.parquet") - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ a ┆ 
b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 2 ┆ 1 │ - │ 0 ┆ 0 ┆ 7 │ - └─────┴─────┴─────┘ + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ """ import polars as pl @@ -294,13 +293,15 @@ def __hash__(self) -> int: return _structural_hash(self.schema, self.number_of_rows) def __repr__(self) -> str: - return self._data_frame.__repr__() + with _get_polars_config(): + return self._data_frame.__repr__() def __sizeof__(self) -> int: return self._data_frame.estimated_size() def __str__(self) -> str: - return self._data_frame.__str__() + with _get_polars_config(): + return self._data_frame.__str__() # ------------------------------------------------------------------------------------------------------------------ # Properties @@ -469,16 +470,15 @@ def rename_column(self, old_name: str, new_name: str) -> ExperimentalTable: >>> from safeds.data.tabular.containers import ExperimentalTable >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) >>> table.rename_column("a", "A") - shape: (3, 2) - ┌─────┬─────┐ - │ A ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 5 │ - │ 3 ┆ 6 │ - └─────┴─────┘ + +-----+-----+ + | A | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ """ # TODO: raises? 
return ExperimentalTable._from_polars_lazy_frame( @@ -547,15 +547,14 @@ def remove_duplicate_rows(self) -> ExperimentalTable: >>> from safeds.data.tabular.containers import ExperimentalTable >>> table = ExperimentalTable({"a": [1, 2, 2], "b": [4, 5, 5]}) >>> table.remove_duplicate_rows() - shape: (2, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - │ 2 ┆ 5 │ - └─────┴─────┘ + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + +-----+-----+ """ return ExperimentalTable._from_polars_lazy_frame( self._lazy_frame.unique(maintain_order=True), @@ -611,14 +610,13 @@ def remove_rows_with_missing_values( >>> from safeds.data.tabular.containers import ExperimentalTable >>> table = ExperimentalTable({"a": [1, None, 3], "b": [4, 5, None]}) >>> table.remove_rows_with_missing_values() - shape: (1, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 4 │ - └─────┴─────┘ + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + +-----+-----+ """ return ExperimentalTable._from_polars_lazy_frame( self._lazy_frame.drop_nulls(subset=subset_names), @@ -819,7 +817,7 @@ def to_json_file( -------- >>> from safeds.data.tabular.containers import ExperimentalTable >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.to_json_file("./src/resources/to_json_file.json") + >>> table.to_json_file("./src/resources/to_json_file_2.json") """ path = _check_and_normalize_file_path(path, ".json", [".json"]) path.parent.mkdir(parents=True, exist_ok=True) diff --git a/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py b/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py index 6482d2976..0e8bc7b85 100644 --- a/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py +++ b/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py @@ -318,7 +318,7 @@ def 
inverse_transform(self, transformed_table: ExperimentalTable) -> Experimenta table = transformed_table for column_name, encoded_column in original_columns.items(): - table = table.add_column(Column(column_name, encoded_column)) + table = table.add_columns(Column(column_name, encoded_column)) # Drop old column names: table = table.remove_columns(list(self._value_to_column.values())) From 3eefe18867ed003dfda144e6ce120e40ebba1b45 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 20:12:58 +0200 Subject: [PATCH 19/40] feat: `remove_rows_with_outliers` --- .../tabular/containers/_experimental_table.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/safeds/data/tabular/containers/_experimental_table.py b/src/safeds/data/tabular/containers/_experimental_table.py index 2c52e0b88..f618ce6cd 100644 --- a/src/safeds/data/tabular/containers/_experimental_table.py +++ b/src/safeds/data/tabular/containers/_experimental_table.py @@ -626,7 +626,23 @@ def remove_rows_with_outliers( self, subset_names: list[str] | None = None, ) -> ExperimentalTable: - raise NotImplementedError + if subset_names is None: + subset_names = self.column_names + + import polars as pl + import polars.selectors as cs + + non_outlier_mask = pl.all_horizontal( + self._data_frame + .select(cs.numeric() & cs.by_name(subset_names)) + .select( + ((pl.all() - pl.all().mean()) / pl.all().std()) <= 3, + ), + ) + + return ExperimentalTable._from_polars_lazy_frame( + self._lazy_frame.filter(non_outlier_mask), + ) def shuffle_rows(self) -> ExperimentalTable: return ExperimentalTable._from_polars_data_frame( From ae3ac76adab357b0c46e40344bc70355c54dcb0c Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 21:24:13 +0200 Subject: [PATCH 20/40] feat: column plots --- .../plotting/_experimental_column_plotter.py | 110 +++++- .../plotting/_experimental_table_plotter.py | 312 ++++++++++++++++++ 2 files changed, 416 insertions(+), 6 deletions(-) diff --git 
a/src/safeds/data/tabular/plotting/_experimental_column_plotter.py b/src/safeds/data/tabular/plotting/_experimental_column_plotter.py index 970103426..df5ce13ea 100644 --- a/src/safeds/data/tabular/plotting/_experimental_column_plotter.py +++ b/src/safeds/data/tabular/plotting/_experimental_column_plotter.py @@ -1,9 +1,12 @@ from __future__ import annotations +import io from typing import TYPE_CHECKING +from safeds.data.image.containers import Image +from safeds.exceptions import NonNumericColumnError + if TYPE_CHECKING: - from safeds.data.image.containers import Image from safeds.data.tabular.containers import ExperimentalColumn @@ -12,10 +15,105 @@ def __init__(self, column: ExperimentalColumn): self._column: ExperimentalColumn = column def box_plot(self) -> Image: - raise NotImplementedError + """ + Create a box plot for the values in the column. This is only possible for numeric columns. + + Returns + ------- + box_plot: + The box plot as an image. + + Raises + ------ + TypeError + If the column is not numeric. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("test", [1, 2, 3]) + >>> boxplot = column.plot.box_plot() + """ + import matplotlib.pyplot as plt + import seaborn as sns + + if not self._column.is_numeric: + raise NonNumericColumnError(f"{self._column.name} is of type {self._column.type}.") + + fig = plt.figure() + ax = sns.boxplot(data=self._column) + ax.set(title=self._column.name) + ax.set_xticks([]) + ax.set_ylabel("") + plt.tight_layout() + + buffer = io.BytesIO() + fig.savefig(buffer, format="png") + plt.close() # Prevents the figure from being displayed directly + buffer.seek(0) + return Image.from_bytes(buffer.read()) + + def histogram(self, *, number_of_bins: int = 10) -> Image: + """ + Create a histogram for the values in the column. + + Parameters + ---------- + number_of_bins: + The number of bins to use in the histogram. Default is 10. 
+ + Returns + ------- + histogram: + The plot as an image. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("test", [1, 2, 3]) + >>> histogram = column.plot.histogram() + """ + return self._column.to_table().plot.histograms(number_of_bins=number_of_bins) + + def lag_plot(self, lag: int) -> Image: + """ + Create a lag plot for the values in the column. + + Parameters + ---------- + lag: + The amount of lag. + + Returns + ------- + lag_plot: + The plot as an image. + + Raises + ------ + TypeError + If the column is not numeric. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> table = ExperimentalColumn("values", [1,2,3,4,3,2]) + >>> image = table.plot.lag_plot(2) + """ + import matplotlib.pyplot as plt + + if not self._column.is_numeric: + raise NonNumericColumnError("This time series target contains non-numerical columns.") + + x_column_name = "y(t)" + y_column_name = f"y(t + {lag})" - def histogram(self) -> Image: - raise NotImplementedError + fig, ax = plt.subplots() + ax.scatter(x=self._column[:-lag], y=self._column[lag:]) + ax.set(xlabel=x_column_name, ylabel=y_column_name) - def lag_plot(self) -> Image: - raise NotImplementedError + buffer = io.BytesIO() + fig.savefig(buffer, format="png") + plt.close() # Prevents the figure from being displayed directly + buffer.seek(0) + return Image.from_bytes(buffer.read()) diff --git a/src/safeds/data/tabular/plotting/_experimental_table_plotter.py b/src/safeds/data/tabular/plotting/_experimental_table_plotter.py index 253853c9d..26e2c6afb 100644 --- a/src/safeds/data/tabular/plotting/_experimental_table_plotter.py +++ b/src/safeds/data/tabular/plotting/_experimental_table_plotter.py @@ -27,3 +27,315 @@ def scatter_plot(self, x_name: str, y_name: str) -> Image: raise NotImplementedError # TODO: equivalent to Column.plot_compare_columns that takes a list of column names (index_plot)? 
+ + def plot_correlation_heatmap(self) -> Image: + """ + Plot a correlation heatmap for all numerical columns of this `Table`. + + Returns + ------- + plot: + The plot as an image. + + Examples + -------- + >>> from safeds.data.tabular.containers import Table + >>> table = Table.from_dict({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) + >>> image = table.plot_correlation_heatmap() + """ + import matplotlib.pyplot as plt + import seaborn as sns + + only_numerical = self.remove_columns_with_non_numerical_values() + + if self.number_of_rows == 0: + warnings.warn( + "An empty table has been used. A correlation heatmap on an empty table will show nothing.", + stacklevel=2, + ) + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=( + "Attempting to set identical low and high (xlims|ylims) makes transformation singular;" + " automatically expanding." + ), + ) + fig = plt.figure() + sns.heatmap( + data=only_numerical._data.corr(), + vmin=-1, + vmax=1, + xticklabels=only_numerical.column_names, + yticklabels=only_numerical.column_names, + cmap="vlag", + ) + plt.tight_layout() + else: + fig = plt.figure() + sns.heatmap( + data=only_numerical._data.corr(), + vmin=-1, + vmax=1, + xticklabels=only_numerical.column_names, + yticklabels=only_numerical.column_names, + cmap="vlag", + ) + plt.tight_layout() + + buffer = io.BytesIO() + fig.savefig(buffer, format="png") + plt.close() # Prevents the figure from being displayed directly + buffer.seek(0) + return Image.from_bytes(buffer.read()) + + def plot_lineplot(self, x_column_name: str, y_column_name: str) -> Image: + """ + Plot two columns against each other in a lineplot. + + If there are multiple x-values for a y-value, the resulting plot will consist of a line representing the mean + and the lower-transparency area around the line representing the 95% confidence interval. 
+ + Parameters + ---------- + x_column_name: + The column name of the column to be plotted on the x-Axis. + y_column_name: + The column name of the column to be plotted on the y-Axis. + + Returns + ------- + plot: + The plot as an image. + + Raises + ------ + UnknownColumnNameError + If either of the columns do not exist. + + Examples + -------- + >>> from safeds.data.tabular.containers import Table + >>> table = Table.from_dict({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) + >>> image = table.plot_lineplot("temperature", "sales") + """ + import matplotlib.pyplot as plt + import seaborn as sns + + if not self.has_column(x_column_name) or not self.has_column(y_column_name): + similar_columns_x = self._get_similar_columns(x_column_name) + similar_columns_y = self._get_similar_columns(y_column_name) + raise UnknownColumnNameError( + ([x_column_name] if not self.has_column(x_column_name) else []) + + ([y_column_name] if not self.has_column(y_column_name) else []), + (similar_columns_x if not self.has_column(x_column_name) else []) + + (similar_columns_y if not self.has_column(y_column_name) else []), + ) + + fig = plt.figure() + ax = sns.lineplot( + data=self._data, + x=x_column_name, + y=y_column_name, + ) + ax.set(xlabel=x_column_name, ylabel=y_column_name) + ax.set_xticks(ax.get_xticks()) + ax.set_xticklabels( + ax.get_xticklabels(), + rotation=45, + horizontalalignment="right", + ) # rotate the labels of the x Axis to prevent the chance of overlapping of the labels + plt.tight_layout() + + buffer = io.BytesIO() + fig.savefig(buffer, format="png") + plt.close() # Prevents the figure from being displayed directly + buffer.seek(0) + return Image.from_bytes(buffer.read()) + + def plot_scatterplot(self, x_column_name: str, y_column_name: str) -> Image: + """ + Plot two columns against each other in a scatterplot. + + Parameters + ---------- + x_column_name: + The column name of the column to be plotted on the x-Axis. 
+ y_column_name: + The column name of the column to be plotted on the y-Axis. + + Returns + ------- + plot: + The plot as an image. + + Raises + ------ + UnknownColumnNameError + If either of the columns do not exist. + + Examples + -------- + >>> from safeds.data.tabular.containers import Table + >>> table = Table.from_dict({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) + >>> image = table.plot_scatterplot("temperature", "sales") + """ + import matplotlib.pyplot as plt + import seaborn as sns + + if not self.has_column(x_column_name) or not self.has_column(y_column_name): + similar_columns_x = self._get_similar_columns(x_column_name) + similar_columns_y = self._get_similar_columns(y_column_name) + raise UnknownColumnNameError( + ([x_column_name] if not self.has_column(x_column_name) else []) + + ([y_column_name] if not self.has_column(y_column_name) else []), + (similar_columns_x if not self.has_column(x_column_name) else []) + + (similar_columns_y if not self.has_column(y_column_name) else []), + ) + + fig = plt.figure() + ax = sns.scatterplot( + data=self._data, + x=x_column_name, + y=y_column_name, + ) + ax.set(xlabel=x_column_name, ylabel=y_column_name) + ax.set_xticks(ax.get_xticks()) + ax.set_xticklabels( + ax.get_xticklabels(), + rotation=45, + horizontalalignment="right", + ) # rotate the labels of the x Axis to prevent the chance of overlapping of the labels + plt.tight_layout() + + buffer = io.BytesIO() + fig.savefig(buffer, format="png") + plt.close() # Prevents the figure from being displayed directly + buffer.seek(0) + return Image.from_bytes(buffer.read()) + + def plot_boxplots(self) -> Image: + """ + Plot a boxplot for every numerical column. + + Returns + ------- + plot: + The plot as an image. + + Raises + ------ + NonNumericColumnError + If the table contains only non-numerical columns. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import Table + >>> table = Table({"a":[1, 2], "b": [3, 42]}) + >>> image = table.plot_boxplots() + """ + import matplotlib.pyplot as plt + import pandas as pd + import seaborn as sns + + numerical_table = self.remove_columns_with_non_numerical_values() + if numerical_table.number_of_columns == 0: + raise NonNumericColumnError("This table contains only non-numerical columns.") + col_wrap = min(numerical_table.number_of_columns, 3) + + data = pd.melt(numerical_table._data, value_vars=numerical_table.column_names) + grid = sns.FacetGrid(data, col="variable", col_wrap=col_wrap, sharex=False, sharey=False) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message="Using the boxplot function without specifying `order` is likely to produce an incorrect plot.", + ) + grid.map(sns.boxplot, "variable", "value") + grid.set_xlabels("") + grid.set_ylabels("") + grid.set_titles("{col_name}") + for axes in grid.axes.flat: + axes.set_xticks([]) + plt.tight_layout() + fig = grid.fig + + buffer = io.BytesIO() + fig.savefig(buffer, format="png") + plt.close() # Prevents the figure from being displayed directly + buffer.seek(0) + return Image.from_bytes(buffer.read()) + + def plot_histograms(self, *, number_of_bins: int = 10) -> Image: + """ + Plot a histogram for every column. + + Parameters + ---------- + number_of_bins: + The number of bins to use in the histogram. Default is 10. + + Returns + ------- + plot: + The plot as an image. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import Table + >>> table = Table({"a": [2, 3, 5, 1], "b": [54, 74, 90, 2014]}) + >>> image = table.plot_histograms() + """ + import matplotlib.pyplot as plt + import numpy as np + import pandas as pd + + n_cols = min(3, self.number_of_columns) + n_rows = 1 + (self.number_of_columns - 1) // n_cols + + if n_cols == 1 and n_rows == 1: + fig, axs = plt.subplots(1, 1, tight_layout=True) + one_col = True + else: + fig, axs = plt.subplots(n_rows, n_cols, tight_layout=True, figsize=(n_cols * 3, n_rows * 3)) + one_col = False + + col_names = self.column_names + for col_name, ax in zip(col_names, axs.flatten() if not one_col else [axs], strict=False): + np_col = np.array(self.get_column(col_name)) + bins = min(number_of_bins, len(pd.unique(np_col))) + + ax.set_title(col_name) + ax.set_xlabel("") + ax.set_ylabel("") + + if self.get_column(col_name).type.is_numeric(): + np_col = np_col[~np.isnan(np_col)] + + if bins < len(pd.unique(np_col)): + min_val = np.min(np_col) + max_val = np.max(np_col) + hist, bin_edges = np.histogram(self.get_column(col_name), bins, range=(min_val, max_val)) + + bars = np.array([]) + for i in range(len(hist)): + bars = np.append(bars, f"{round(bin_edges[i], 2)}-{round(bin_edges[i + 1], 2)}") + + ax.bar(bars, hist, edgecolor="black") + ax.set_xticks(np.arange(len(hist)), bars, rotation=45, horizontalalignment="right") + continue + + np_col = np_col.astype(str) + unique_values = np.unique(np_col) + hist = np.array([np.sum(np_col == value) for value in unique_values]) + ax.bar(unique_values, hist, edgecolor="black") + ax.set_xticks(np.arange(len(unique_values)), unique_values, rotation=45, horizontalalignment="right") + + for i in range(len(col_names), n_rows * n_cols): + fig.delaxes(axs.flatten()[i]) # Remove empty subplots + + buffer = io.BytesIO() + fig.savefig(buffer, format="png") + plt.close() # Prevents the figure from being displayed directly + buffer.seek(0) + return 
Image.from_bytes(buffer.read()) From 1ba620e0cd03ee912c2f4a9599c5615c5316b922 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 22:19:32 +0200 Subject: [PATCH 21/40] perf: faster lag plot --- .../plotting/_experimental_column_plotter.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/safeds/data/tabular/plotting/_experimental_column_plotter.py b/src/safeds/data/tabular/plotting/_experimental_column_plotter.py index df5ce13ea..04bc8f6a8 100644 --- a/src/safeds/data/tabular/plotting/_experimental_column_plotter.py +++ b/src/safeds/data/tabular/plotting/_experimental_column_plotter.py @@ -100,17 +100,21 @@ def lag_plot(self, lag: int) -> Image: >>> table = ExperimentalColumn("values", [1,2,3,4,3,2]) >>> image = table.plot.lag_plot(2) """ - import matplotlib.pyplot as plt - if not self._column.is_numeric: raise NonNumericColumnError("This time series target contains non-numerical columns.") - x_column_name = "y(t)" - y_column_name = f"y(t + {lag})" + import matplotlib.pyplot as plt fig, ax = plt.subplots() - ax.scatter(x=self._column[:-lag], y=self._column[lag:]) - ax.set(xlabel=x_column_name, ylabel=y_column_name) + series = self._column._series + ax.scatter( + x=series.slice(0, len(self._column) - lag), + y=series.slice(lag), + ) + ax.set( + xlabel="y(t)", + ylabel=f"y(t + {lag})", + ) buffer = io.BytesIO() fig.savefig(buffer, format="png") From 7a7037cce53479904a5105a6602e14722d2128d0 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 22:38:32 +0200 Subject: [PATCH 22/40] refactor: extract conversion of figure to image --- src/safeds/_utils/__init__.py | 6 ++++ src/safeds/_utils/_plotting.py | 32 +++++++++++++++++++ .../plotting/_experimental_column_plotter.py | 7 ++-- 3 files changed, 40 insertions(+), 5 deletions(-) create mode 100644 src/safeds/_utils/_plotting.py diff --git a/src/safeds/_utils/__init__.py b/src/safeds/_utils/__init__.py index 3d445c97c..f50d26ed5 100644 --- 
a/src/safeds/_utils/__init__.py +++ b/src/safeds/_utils/__init__.py @@ -7,16 +7,22 @@ if TYPE_CHECKING: from ._file_io import _check_and_normalize_file_path from ._hashing import _structural_hash + from ._plotting import _figure_to_image + from ._random import _get_random_seed apipkg.initpkg( __name__, { "_check_and_normalize_file_path": "._file_io:_check_and_normalize_file_path", "_structural_hash": "._hashing:_structural_hash", + "_figure_to_image": "._plotting:_figure_to_image", + "_get_random_seed": "._random:_get_random_seed", }, ) __all__ = [ "_check_and_normalize_file_path", "_structural_hash", + "_figure_to_image", + "_get_random_seed", ] diff --git a/src/safeds/_utils/_plotting.py b/src/safeds/_utils/_plotting.py new file mode 100644 index 000000000..6b80c74f0 --- /dev/null +++ b/src/safeds/_utils/_plotting.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +import io +from typing import TYPE_CHECKING + +from safeds.data.image.containers import Image + +if TYPE_CHECKING: + import matplotlib.pyplot as plt + + +def _figure_to_image(figure: plt.Figure) -> Image: + """ + Store the figure as an image and closes it. + + Parameters + ---------- + figure: + The figure to store. + + Returns + ------- + image: + The figure as an image. 
+ """ + import matplotlib.pyplot as plt + + buffer = io.BytesIO() + figure.savefig(buffer, format="png") + plt.close(figure) # Prevents the figure from being displayed directly + buffer.seek(0) + return Image.from_bytes(buffer.read()) diff --git a/src/safeds/data/tabular/plotting/_experimental_column_plotter.py b/src/safeds/data/tabular/plotting/_experimental_column_plotter.py index 04bc8f6a8..014d40709 100644 --- a/src/safeds/data/tabular/plotting/_experimental_column_plotter.py +++ b/src/safeds/data/tabular/plotting/_experimental_column_plotter.py @@ -3,6 +3,7 @@ import io from typing import TYPE_CHECKING +from safeds._utils import _figure_to_image from safeds.data.image.containers import Image from safeds.exceptions import NonNumericColumnError @@ -116,8 +117,4 @@ def lag_plot(self, lag: int) -> Image: ylabel=f"y(t + {lag})", ) - buffer = io.BytesIO() - fig.savefig(buffer, format="png") - plt.close() # Prevents the figure from being displayed directly - buffer.seek(0) - return Image.from_bytes(buffer.read()) + return _figure_to_image(fig) From 4c79e6e3e4e3e7a0c045810f2ae4336694ab3961 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Wed, 8 May 2024 23:33:07 +0200 Subject: [PATCH 23/40] feat: finish column plotter --- .../plotting/_experimental_column_plotter.py | 44 +- .../plotting/_experimental_table_plotter.py | 622 +++++++++--------- 2 files changed, 341 insertions(+), 325 deletions(-) diff --git a/src/safeds/data/tabular/plotting/_experimental_column_plotter.py b/src/safeds/data/tabular/plotting/_experimental_column_plotter.py index 014d40709..34cb2569e 100644 --- a/src/safeds/data/tabular/plotting/_experimental_column_plotter.py +++ b/src/safeds/data/tabular/plotting/_experimental_column_plotter.py @@ -1,17 +1,31 @@ from __future__ import annotations -import io from typing import TYPE_CHECKING from safeds._utils import _figure_to_image -from safeds.data.image.containers import Image from safeds.exceptions import NonNumericColumnError if TYPE_CHECKING: + 
from safeds.data.image.containers import Image from safeds.data.tabular.containers import ExperimentalColumn class ExperimentalColumnPlotter: + """ + A class that contains plotting methods for a column. + + Parameters + ---------- + column: + The column to plot. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("test", [1, 2, 3]) + >>> plotter = column.plot + """ + def __init__(self, column: ExperimentalColumn): self._column: ExperimentalColumn = column @@ -35,24 +49,25 @@ def box_plot(self) -> Image: >>> column = ExperimentalColumn("test", [1, 2, 3]) >>> boxplot = column.plot.box_plot() """ - import matplotlib.pyplot as plt - import seaborn as sns - if not self._column.is_numeric: raise NonNumericColumnError(f"{self._column.name} is of type {self._column.type}.") - fig = plt.figure() - ax = sns.boxplot(data=self._column) + import matplotlib.pyplot as plt + + fig, ax = plt.subplots() + plot = ax.boxplot( + self._column._series, + patch_artist=True, + ) + plt.setp(plot["boxes"], facecolor="lightsteelblue") + plt.setp(plot["medians"], color="red") + ax.set(title=self._column.name) ax.set_xticks([]) - ax.set_ylabel("") - plt.tight_layout() + ax.yaxis.grid(visible=True) + fig.tight_layout() - buffer = io.BytesIO() - fig.savefig(buffer, format="png") - plt.close() # Prevents the figure from being displayed directly - buffer.seek(0) - return Image.from_bytes(buffer.read()) + return _figure_to_image(fig) def histogram(self, *, number_of_bins: int = 10) -> Image: """ @@ -116,5 +131,6 @@ def lag_plot(self, lag: int) -> Image: xlabel="y(t)", ylabel=f"y(t + {lag})", ) + fig.tight_layout() return _figure_to_image(fig) diff --git a/src/safeds/data/tabular/plotting/_experimental_table_plotter.py b/src/safeds/data/tabular/plotting/_experimental_table_plotter.py index 26e2c6afb..e81f8c070 100644 --- a/src/safeds/data/tabular/plotting/_experimental_table_plotter.py +++ 
b/src/safeds/data/tabular/plotting/_experimental_table_plotter.py @@ -28,314 +28,314 @@ def scatter_plot(self, x_name: str, y_name: str) -> Image: # TODO: equivalent to Column.plot_compare_columns that takes a list of column names (index_plot)? - def plot_correlation_heatmap(self) -> Image: - """ - Plot a correlation heatmap for all numerical columns of this `Table`. - - Returns - ------- - plot: - The plot as an image. - - Examples - -------- - >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) - >>> image = table.plot_correlation_heatmap() - """ - import matplotlib.pyplot as plt - import seaborn as sns - - only_numerical = self.remove_columns_with_non_numerical_values() - - if self.number_of_rows == 0: - warnings.warn( - "An empty table has been used. A correlation heatmap on an empty table will show nothing.", - stacklevel=2, - ) - - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - message=( - "Attempting to set identical low and high (xlims|ylims) makes transformation singular;" - " automatically expanding." - ), - ) - fig = plt.figure() - sns.heatmap( - data=only_numerical._data.corr(), - vmin=-1, - vmax=1, - xticklabels=only_numerical.column_names, - yticklabels=only_numerical.column_names, - cmap="vlag", - ) - plt.tight_layout() - else: - fig = plt.figure() - sns.heatmap( - data=only_numerical._data.corr(), - vmin=-1, - vmax=1, - xticklabels=only_numerical.column_names, - yticklabels=only_numerical.column_names, - cmap="vlag", - ) - plt.tight_layout() - - buffer = io.BytesIO() - fig.savefig(buffer, format="png") - plt.close() # Prevents the figure from being displayed directly - buffer.seek(0) - return Image.from_bytes(buffer.read()) - - def plot_lineplot(self, x_column_name: str, y_column_name: str) -> Image: - """ - Plot two columns against each other in a lineplot. 
- - If there are multiple x-values for a y-value, the resulting plot will consist of a line representing the mean - and the lower-transparency area around the line representing the 95% confidence interval. - - Parameters - ---------- - x_column_name: - The column name of the column to be plotted on the x-Axis. - y_column_name: - The column name of the column to be plotted on the y-Axis. - - Returns - ------- - plot: - The plot as an image. - - Raises - ------ - UnknownColumnNameError - If either of the columns do not exist. - - Examples - -------- - >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) - >>> image = table.plot_lineplot("temperature", "sales") - """ - import matplotlib.pyplot as plt - import seaborn as sns - - if not self.has_column(x_column_name) or not self.has_column(y_column_name): - similar_columns_x = self._get_similar_columns(x_column_name) - similar_columns_y = self._get_similar_columns(y_column_name) - raise UnknownColumnNameError( - ([x_column_name] if not self.has_column(x_column_name) else []) - + ([y_column_name] if not self.has_column(y_column_name) else []), - (similar_columns_x if not self.has_column(x_column_name) else []) - + (similar_columns_y if not self.has_column(y_column_name) else []), - ) - - fig = plt.figure() - ax = sns.lineplot( - data=self._data, - x=x_column_name, - y=y_column_name, - ) - ax.set(xlabel=x_column_name, ylabel=y_column_name) - ax.set_xticks(ax.get_xticks()) - ax.set_xticklabels( - ax.get_xticklabels(), - rotation=45, - horizontalalignment="right", - ) # rotate the labels of the x Axis to prevent the chance of overlapping of the labels - plt.tight_layout() - - buffer = io.BytesIO() - fig.savefig(buffer, format="png") - plt.close() # Prevents the figure from being displayed directly - buffer.seek(0) - return Image.from_bytes(buffer.read()) - - def plot_scatterplot(self, x_column_name: str, y_column_name: str) 
-> Image: - """ - Plot two columns against each other in a scatterplot. - - Parameters - ---------- - x_column_name: - The column name of the column to be plotted on the x-Axis. - y_column_name: - The column name of the column to be plotted on the y-Axis. - - Returns - ------- - plot: - The plot as an image. - - Raises - ------ - UnknownColumnNameError - If either of the columns do not exist. - - Examples - -------- - >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) - >>> image = table.plot_scatterplot("temperature", "sales") - """ - import matplotlib.pyplot as plt - import seaborn as sns - - if not self.has_column(x_column_name) or not self.has_column(y_column_name): - similar_columns_x = self._get_similar_columns(x_column_name) - similar_columns_y = self._get_similar_columns(y_column_name) - raise UnknownColumnNameError( - ([x_column_name] if not self.has_column(x_column_name) else []) - + ([y_column_name] if not self.has_column(y_column_name) else []), - (similar_columns_x if not self.has_column(x_column_name) else []) - + (similar_columns_y if not self.has_column(y_column_name) else []), - ) - - fig = plt.figure() - ax = sns.scatterplot( - data=self._data, - x=x_column_name, - y=y_column_name, - ) - ax.set(xlabel=x_column_name, ylabel=y_column_name) - ax.set_xticks(ax.get_xticks()) - ax.set_xticklabels( - ax.get_xticklabels(), - rotation=45, - horizontalalignment="right", - ) # rotate the labels of the x Axis to prevent the chance of overlapping of the labels - plt.tight_layout() - - buffer = io.BytesIO() - fig.savefig(buffer, format="png") - plt.close() # Prevents the figure from being displayed directly - buffer.seek(0) - return Image.from_bytes(buffer.read()) - - def plot_boxplots(self) -> Image: - """ - Plot a boxplot for every numerical column. - - Returns - ------- - plot: - The plot as an image. 
- - Raises - ------ - NonNumericColumnError - If the table contains only non-numerical columns. - - Examples - -------- - >>> from safeds.data.tabular.containers import Table - >>> table = Table({"a":[1, 2], "b": [3, 42]}) - >>> image = table.plot_boxplots() - """ - import matplotlib.pyplot as plt - import pandas as pd - import seaborn as sns - - numerical_table = self.remove_columns_with_non_numerical_values() - if numerical_table.number_of_columns == 0: - raise NonNumericColumnError("This table contains only non-numerical columns.") - col_wrap = min(numerical_table.number_of_columns, 3) - - data = pd.melt(numerical_table._data, value_vars=numerical_table.column_names) - grid = sns.FacetGrid(data, col="variable", col_wrap=col_wrap, sharex=False, sharey=False) - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - message="Using the boxplot function without specifying `order` is likely to produce an incorrect plot.", - ) - grid.map(sns.boxplot, "variable", "value") - grid.set_xlabels("") - grid.set_ylabels("") - grid.set_titles("{col_name}") - for axes in grid.axes.flat: - axes.set_xticks([]) - plt.tight_layout() - fig = grid.fig - - buffer = io.BytesIO() - fig.savefig(buffer, format="png") - plt.close() # Prevents the figure from being displayed directly - buffer.seek(0) - return Image.from_bytes(buffer.read()) - - def plot_histograms(self, *, number_of_bins: int = 10) -> Image: - """ - Plot a histogram for every column. - - Parameters - ---------- - number_of_bins: - The number of bins to use in the histogram. Default is 10. - - Returns - ------- - plot: - The plot as an image. 
- - Examples - -------- - >>> from safeds.data.tabular.containers import Table - >>> table = Table({"a": [2, 3, 5, 1], "b": [54, 74, 90, 2014]}) - >>> image = table.plot_histograms() - """ - import matplotlib.pyplot as plt - import numpy as np - import pandas as pd - - n_cols = min(3, self.number_of_columns) - n_rows = 1 + (self.number_of_columns - 1) // n_cols - - if n_cols == 1 and n_rows == 1: - fig, axs = plt.subplots(1, 1, tight_layout=True) - one_col = True - else: - fig, axs = plt.subplots(n_rows, n_cols, tight_layout=True, figsize=(n_cols * 3, n_rows * 3)) - one_col = False - - col_names = self.column_names - for col_name, ax in zip(col_names, axs.flatten() if not one_col else [axs], strict=False): - np_col = np.array(self.get_column(col_name)) - bins = min(number_of_bins, len(pd.unique(np_col))) - - ax.set_title(col_name) - ax.set_xlabel("") - ax.set_ylabel("") - - if self.get_column(col_name).type.is_numeric(): - np_col = np_col[~np.isnan(np_col)] - - if bins < len(pd.unique(np_col)): - min_val = np.min(np_col) - max_val = np.max(np_col) - hist, bin_edges = np.histogram(self.get_column(col_name), bins, range=(min_val, max_val)) - - bars = np.array([]) - for i in range(len(hist)): - bars = np.append(bars, f"{round(bin_edges[i], 2)}-{round(bin_edges[i + 1], 2)}") - - ax.bar(bars, hist, edgecolor="black") - ax.set_xticks(np.arange(len(hist)), bars, rotation=45, horizontalalignment="right") - continue - - np_col = np_col.astype(str) - unique_values = np.unique(np_col) - hist = np.array([np.sum(np_col == value) for value in unique_values]) - ax.bar(unique_values, hist, edgecolor="black") - ax.set_xticks(np.arange(len(unique_values)), unique_values, rotation=45, horizontalalignment="right") - - for i in range(len(col_names), n_rows * n_cols): - fig.delaxes(axs.flatten()[i]) # Remove empty subplots - - buffer = io.BytesIO() - fig.savefig(buffer, format="png") - plt.close() # Prevents the figure from being displayed directly - buffer.seek(0) - return 
Image.from_bytes(buffer.read()) + # def plot_correlation_heatmap(self) -> Image: + # """ + # Plot a correlation heatmap for all numerical columns of this `Table`. + # + # Returns + # ------- + # plot: + # The plot as an image. + # + # Examples + # -------- + # >>> from safeds.data.tabular.containers import Table + # >>> table = Table.from_dict({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) + # >>> image = table.plot_correlation_heatmap() + # """ + # import matplotlib.pyplot as plt + # import seaborn as sns + # + # only_numerical = self.remove_columns_with_non_numerical_values() + # + # if self.number_of_rows == 0: + # warnings.warn( + # "An empty table has been used. A correlation heatmap on an empty table will show nothing.", + # stacklevel=2, + # ) + # + # with warnings.catch_warnings(): + # warnings.filterwarnings( + # "ignore", + # message=( + # "Attempting to set identical low and high (xlims|ylims) makes transformation singular;" + # " automatically expanding." + # ), + # ) + # fig = plt.figure() + # sns.heatmap( + # data=only_numerical._data.corr(), + # vmin=-1, + # vmax=1, + # xticklabels=only_numerical.column_names, + # yticklabels=only_numerical.column_names, + # cmap="vlag", + # ) + # plt.tight_layout() + # else: + # fig = plt.figure() + # sns.heatmap( + # data=only_numerical._data.corr(), + # vmin=-1, + # vmax=1, + # xticklabels=only_numerical.column_names, + # yticklabels=only_numerical.column_names, + # cmap="vlag", + # ) + # plt.tight_layout() + # + # buffer = io.BytesIO() + # fig.savefig(buffer, format="png") + # plt.close() # Prevents the figure from being displayed directly + # buffer.seek(0) + # return Image.from_bytes(buffer.read()) + # + # def plot_lineplot(self, x_column_name: str, y_column_name: str) -> Image: + # """ + # Plot two columns against each other in a lineplot. 
+ # + # If there are multiple x-values for a y-value, the resulting plot will consist of a line representing the mean + # and the lower-transparency area around the line representing the 95% confidence interval. + # + # Parameters + # ---------- + # x_column_name: + # The column name of the column to be plotted on the x-Axis. + # y_column_name: + # The column name of the column to be plotted on the y-Axis. + # + # Returns + # ------- + # plot: + # The plot as an image. + # + # Raises + # ------ + # UnknownColumnNameError + # If either of the columns do not exist. + # + # Examples + # -------- + # >>> from safeds.data.tabular.containers import Table + # >>> table = Table.from_dict({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) + # >>> image = table.plot_lineplot("temperature", "sales") + # """ + # import matplotlib.pyplot as plt + # import seaborn as sns + # + # if not self.has_column(x_column_name) or not self.has_column(y_column_name): + # similar_columns_x = self._get_similar_columns(x_column_name) + # similar_columns_y = self._get_similar_columns(y_column_name) + # raise UnknownColumnNameError( + # ([x_column_name] if not self.has_column(x_column_name) else []) + # + ([y_column_name] if not self.has_column(y_column_name) else []), + # (similar_columns_x if not self.has_column(x_column_name) else []) + # + (similar_columns_y if not self.has_column(y_column_name) else []), + # ) + # + # fig = plt.figure() + # ax = sns.lineplot( + # data=self._data, + # x=x_column_name, + # y=y_column_name, + # ) + # ax.set(xlabel=x_column_name, ylabel=y_column_name) + # ax.set_xticks(ax.get_xticks()) + # ax.set_xticklabels( + # ax.get_xticklabels(), + # rotation=45, + # horizontalalignment="right", + # ) # rotate the labels of the x Axis to prevent the chance of overlapping of the labels + # plt.tight_layout() + # + # buffer = io.BytesIO() + # fig.savefig(buffer, format="png") + # plt.close() # Prevents the figure from being displayed directly + # 
buffer.seek(0) + # return Image.from_bytes(buffer.read()) + # + # def plot_scatterplot(self, x_column_name: str, y_column_name: str) -> Image: + # """ + # Plot two columns against each other in a scatterplot. + # + # Parameters + # ---------- + # x_column_name: + # The column name of the column to be plotted on the x-Axis. + # y_column_name: + # The column name of the column to be plotted on the y-Axis. + # + # Returns + # ------- + # plot: + # The plot as an image. + # + # Raises + # ------ + # UnknownColumnNameError + # If either of the columns do not exist. + # + # Examples + # -------- + # >>> from safeds.data.tabular.containers import Table + # >>> table = Table.from_dict({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) + # >>> image = table.plot_scatterplot("temperature", "sales") + # """ + # import matplotlib.pyplot as plt + # import seaborn as sns + # + # if not self.has_column(x_column_name) or not self.has_column(y_column_name): + # similar_columns_x = self._get_similar_columns(x_column_name) + # similar_columns_y = self._get_similar_columns(y_column_name) + # raise UnknownColumnNameError( + # ([x_column_name] if not self.has_column(x_column_name) else []) + # + ([y_column_name] if not self.has_column(y_column_name) else []), + # (similar_columns_x if not self.has_column(x_column_name) else []) + # + (similar_columns_y if not self.has_column(y_column_name) else []), + # ) + # + # fig = plt.figure() + # ax = sns.scatterplot( + # data=self._data, + # x=x_column_name, + # y=y_column_name, + # ) + # ax.set(xlabel=x_column_name, ylabel=y_column_name) + # ax.set_xticks(ax.get_xticks()) + # ax.set_xticklabels( + # ax.get_xticklabels(), + # rotation=45, + # horizontalalignment="right", + # ) # rotate the labels of the x Axis to prevent the chance of overlapping of the labels + # plt.tight_layout() + # + # buffer = io.BytesIO() + # fig.savefig(buffer, format="png") + # plt.close() # Prevents the figure from being displayed directly + # 
buffer.seek(0) + # return Image.from_bytes(buffer.read()) + # + # def plot_boxplots(self) -> Image: + # """ + # Plot a boxplot for every numerical column. + # + # Returns + # ------- + # plot: + # The plot as an image. + # + # Raises + # ------ + # NonNumericColumnError + # If the table contains only non-numerical columns. + # + # Examples + # -------- + # >>> from safeds.data.tabular.containers import Table + # >>> table = Table({"a":[1, 2], "b": [3, 42]}) + # >>> image = table.plot_boxplots() + # """ + # import matplotlib.pyplot as plt + # import pandas as pd + # import seaborn as sns + # + # numerical_table = self.remove_columns_with_non_numerical_values() + # if numerical_table.number_of_columns == 0: + # raise NonNumericColumnError("This table contains only non-numerical columns.") + # col_wrap = min(numerical_table.number_of_columns, 3) + # + # data = pd.melt(numerical_table._data, value_vars=numerical_table.column_names) + # grid = sns.FacetGrid(data, col="variable", col_wrap=col_wrap, sharex=False, sharey=False) + # with warnings.catch_warnings(): + # warnings.filterwarnings( + # "ignore", + # message="Using the boxplot function without specifying `order` is likely to produce an incorrect plot.", + # ) + # grid.map(sns.boxplot, "variable", "value") + # grid.set_xlabels("") + # grid.set_ylabels("") + # grid.set_titles("{col_name}") + # for axes in grid.axes.flat: + # axes.set_xticks([]) + # plt.tight_layout() + # fig = grid.fig + # + # buffer = io.BytesIO() + # fig.savefig(buffer, format="png") + # plt.close() # Prevents the figure from being displayed directly + # buffer.seek(0) + # return Image.from_bytes(buffer.read()) + # + # def plot_histograms(self, *, number_of_bins: int = 10) -> Image: + # """ + # Plot a histogram for every column. + # + # Parameters + # ---------- + # number_of_bins: + # The number of bins to use in the histogram. Default is 10. + # + # Returns + # ------- + # plot: + # The plot as an image. 
+ # + # Examples + # -------- + # >>> from safeds.data.tabular.containers import Table + # >>> table = Table({"a": [2, 3, 5, 1], "b": [54, 74, 90, 2014]}) + # >>> image = table.plot_histograms() + # """ + # import matplotlib.pyplot as plt + # import numpy as np + # import pandas as pd + # + # n_cols = min(3, self.number_of_columns) + # n_rows = 1 + (self.number_of_columns - 1) // n_cols + # + # if n_cols == 1 and n_rows == 1: + # fig, axs = plt.subplots(1, 1, tight_layout=True) + # one_col = True + # else: + # fig, axs = plt.subplots(n_rows, n_cols, tight_layout=True, figsize=(n_cols * 3, n_rows * 3)) + # one_col = False + # + # col_names = self.column_names + # for col_name, ax in zip(col_names, axs.flatten() if not one_col else [axs], strict=False): + # np_col = np.array(self.get_column(col_name)) + # bins = min(number_of_bins, len(pd.unique(np_col))) + # + # ax.set_title(col_name) + # ax.set_xlabel("") + # ax.set_ylabel("") + # + # if self.get_column(col_name).type.is_numeric(): + # np_col = np_col[~np.isnan(np_col)] + # + # if bins < len(pd.unique(np_col)): + # min_val = np.min(np_col) + # max_val = np.max(np_col) + # hist, bin_edges = np.histogram(self.get_column(col_name), bins, range=(min_val, max_val)) + # + # bars = np.array([]) + # for i in range(len(hist)): + # bars = np.append(bars, f"{round(bin_edges[i], 2)}-{round(bin_edges[i + 1], 2)}") + # + # ax.bar(bars, hist, edgecolor="black") + # ax.set_xticks(np.arange(len(hist)), bars, rotation=45, horizontalalignment="right") + # continue + # + # np_col = np_col.astype(str) + # unique_values = np.unique(np_col) + # hist = np.array([np.sum(np_col == value) for value in unique_values]) + # ax.bar(unique_values, hist, edgecolor="black") + # ax.set_xticks(np.arange(len(unique_values)), unique_values, rotation=45, horizontalalignment="right") + # + # for i in range(len(col_names), n_rows * n_cols): + # fig.delaxes(axs.flatten()[i]) # Remove empty subplots + # + # buffer = io.BytesIO() + # fig.savefig(buffer, 
format="png") + # plt.close() # Prevents the figure from being displayed directly + # buffer.seek(0) + # return Image.from_bytes(buffer.read()) From 8b65aead12591ba91ac6e6e8409e7a480c911df2 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Thu, 9 May 2024 09:40:57 +0200 Subject: [PATCH 24/40] feat: named cell operations --- .../tabular/containers/_experimental_cell.py | 502 ++++++++++++++++++ 1 file changed, 502 insertions(+) diff --git a/src/safeds/data/tabular/containers/_experimental_cell.py b/src/safeds/data/tabular/containers/_experimental_cell.py index bae73d679..479cc4f00 100644 --- a/src/safeds/data/tabular/containers/_experimental_cell.py +++ b/src/safeds/data/tabular/containers/_experimental_cell.py @@ -128,6 +128,508 @@ def __hash__(self) -> int: ... @abstractmethod def __sizeof__(self) -> int: ... + # ------------------------------------------------------------------------------------------------------------------ + # Boolean operations + # ------------------------------------------------------------------------------------------------------------------ + + def not_(self) -> ExperimentalCell[bool]: + """ + Negate a boolean. This is equivalent to the `~` operator. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [True, False]) + >>> column.transform(lambda cell: cell.not_()) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | true | + +---------+ + + >>> column.transform(lambda cell: ~cell) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | true | + +---------+ + """ + return self.__invert__() + + def and_(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + """ + Perform a boolean AND operation. This is equivalent to the `&` operator. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [True, False]) + >>> column.transform(lambda cell: cell.and_(False)) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | false | + +---------+ + + >>> column.transform(lambda cell: cell & False) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | false | + +---------+ + """ + return self.__and__(other) + + def or_(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + """ + Perform a boolean OR operation. This is equivalent to the `|` operator. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [True, False]) + >>> column.transform(lambda cell: cell.or_(True)) + +---------+ + | example | + | --- | + | bool | + +=========+ + | true | + | true | + +---------+ + + >>> column.transform(lambda cell: cell | True) + +---------+ + | example | + | --- | + | bool | + +=========+ + | true | + | true | + +---------+ + """ + return self.__or__(other) + + def xor(self, other: bool | ExperimentalCell[bool]) -> ExperimentalCell[bool]: + """ + Perform a boolean XOR operation. This is equivalent to the `^` operator. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [True, False]) + >>> column.transform(lambda cell: cell.xor(True)) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | true | + +---------+ + + >>> column.transform(lambda cell: cell ^ True) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | true | + +---------+ + """ + return self.__xor__(other) + + # ------------------------------------------------------------------------------------------------------------------ + # Numeric operations + # ------------------------------------------------------------------------------------------------------------------ + + def abs(self) -> ExperimentalCell[R]: + """ + Get the absolute value. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1, -2]) + >>> column.transform(lambda cell: cell.abs()) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 1 | + | 2 | + +---------+ + """ + return self.__abs__() + + def neg(self) -> ExperimentalCell[R]: + """ + Negate the value. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1, -2]) + >>> column.transform(lambda cell: cell.neg()) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | -1 | + | 2 | + +---------+ + """ + return self.__neg__() + + def add(self, other: Any) -> ExperimentalCell[R]: + """ + Add a value. This is equivalent to the `+` operator. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1, 2]) + >>> column.transform(lambda cell: cell.add(3)) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 4 | + | 5 | + +---------+ + + >>> column.transform(lambda cell: cell + 3) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 4 | + | 5 | + +---------+ + """ + return self.__add__(other) + + def mod(self, other: Any) -> ExperimentalCell[R]: + """ + Perform a modulo operation. This is equivalent to the `%` operator. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [5, 6]) + >>> column.transform(lambda cell: cell.mod(3)) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 2 | + | 0 | + +---------+ + + >>> column.transform(lambda cell: cell % 3) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 2 | + | 0 | + +---------+ + """ + return self.__mod__(other) + + def mul(self, other: Any) -> ExperimentalCell[R]: + """ + Multiply by a value. This is equivalent to the `*` operator. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [2, 3]) + >>> column.transform(lambda cell: cell.mul(4)) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 8 | + | 12 | + +---------+ + + >>> column.transform(lambda cell: cell * 4) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 8 | + | 12 | + +---------+ + """ + return self.__mul__(other) + + def pow(self, other: float | ExperimentalCell[P]) -> ExperimentalCell[R]: + """ + Raise to a power. This is equivalent to the `**` operator. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [2, 3]) + >>> column.transform(lambda cell: cell.pow(3)) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 8 | + | 27 | + +---------+ + + >>> column.transform(lambda cell: cell ** 3) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 8 | + | 27 | + +---------+ + """ + return self.__pow__(other) + + def sub(self, other: Any) -> ExperimentalCell[R]: + """ + Subtract a value. This is equivalent to the `-` operator. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [5, 6]) + >>> column.transform(lambda cell: cell.sub(3)) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 2 | + | 3 | + +---------+ + + >>> column.transform(lambda cell: cell - 3) + +---------+ + | example | + | --- | + | i64 | + +=========+ + | 2 | + | 3 | + +---------+ + """ + return self.__sub__(other) + + def div(self, other: Any) -> ExperimentalCell[R]: + """ + Divide by a value. This is equivalent to the `/` operator. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [6, 8]) + >>> column.transform(lambda cell: cell.div(2)) + +---------+ + | example | + | --- | + | f64 | + +=========+ + | 3.00000 | + | 4.00000 | + +---------+ + + >>> column.transform(lambda cell: cell / 2) + +---------+ + | example | + | --- | + | f64 | + +=========+ + | 3.00000 | + | 4.00000 | + +---------+ + """ + return self.__truediv__(other) + + # ------------------------------------------------------------------------------------------------------------------ + # Comparison operations + # ------------------------------------------------------------------------------------------------------------------ + + def eq(self, other: Any) -> ExperimentalCell[bool]: + """ + Check if equal to a value. This is equivalent to the `==` operator. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1, 2]) + >>> column.transform(lambda cell: cell.eq(2)) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | true | + +---------+ + + >>> column.transform(lambda cell: cell == 2) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | true | + +---------+ + """ + return self.__eq__(other) + + def ge(self, other: Any) -> ExperimentalCell[bool]: + """ + Check if greater than or equal to a value. This is equivalent to the `>=` operator. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1, 2]) + >>> column.transform(lambda cell: cell.ge(2)) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | true | + +---------+ + + >>> column.transform(lambda cell: cell >= 2) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | true | + +---------+ + """ + return self.__ge__(other) + + def gt(self, other: Any) -> ExperimentalCell[bool]: + """ + Check if greater than a value. This is equivalent to the `>` operator. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1, 2]) + >>> column.transform(lambda cell: cell.gt(2)) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | false | + +---------+ + + >>> column.transform(lambda cell: cell > 2) + +---------+ + | example | + | --- | + | bool | + +=========+ + | false | + | false | + +---------+ + """ + return self.__gt__(other) + + def le(self, other: Any) -> ExperimentalCell[bool]: + """ + Check if less than or equal to a value. This is equivalent to the `<=` operator. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1, 2]) + >>> column.transform(lambda cell: cell.le(2)) + +---------+ + | example | + | --- | + | bool | + +=========+ + | true | + | true | + +---------+ + + >>> column.transform(lambda cell: cell <= 2) + +---------+ + | example | + | --- | + | bool | + +=========+ + | true | + | true | + +---------+ + """ + return self.__le__(other) + + def lt(self, other: Any) -> ExperimentalCell[bool]: + """ + Check if less than a value. This is equivalent to the `<` operator. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1, 2]) + >>> column.transform(lambda cell: cell.lt(2)) + +---------+ + | example | + | --- | + | bool | + +=========+ + | true | + | false | + +---------+ + + >>> column.transform(lambda cell: cell < 2) + +---------+ + | example | + | --- | + | bool | + +=========+ + | true | + | false | + +---------+ + """ + return self.__lt__(other) + # ------------------------------------------------------------------------------------------------------------------ # Internal # ------------------------------------------------------------------------------------------------------------------ From 31f9c11b8585cd450fc840c35bac37ecba68ef3c Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Thu, 9 May 2024 09:54:27 +0200 Subject: [PATCH 25/40] feat: floor & ceil --- .../tabular/containers/_experimental_cell.py | 46 +++++++++++++++++++ .../containers/_experimental_lazy_cell.py | 6 +++ .../_experimental_vectorized_cell.py | 6 +++ 3 files changed, 58 insertions(+) diff --git a/src/safeds/data/tabular/containers/_experimental_cell.py b/src/safeds/data/tabular/containers/_experimental_cell.py index 479cc4f00..7c77046e5 100644 --- a/src/safeds/data/tabular/containers/_experimental_cell.py +++ b/src/safeds/data/tabular/containers/_experimental_cell.py @@ -72,6 +72,12 @@ def __ne__(self, other: object) -> ExperimentalCell[bool]: # type: ignore[overr @abstractmethod def __abs__(self) -> ExperimentalCell[R]: ... + @abstractmethod + def __ceil__(self) -> ExperimentalCell[R]: ... + + @abstractmethod + def __floor__(self) -> ExperimentalCell[R]: ... + @abstractmethod def __neg__(self) -> ExperimentalCell[R]: ... @@ -276,6 +282,46 @@ def abs(self) -> ExperimentalCell[R]: """ return self.__abs__() + def ceil(self) -> ExperimentalCell[R]: + """ + Round up to the nearest integer. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1.1, 2.9]) + >>> column.transform(lambda cell: cell.ceil()) + +---------+ + | example | + | --- | + | f64 | + +=========+ + | 2.00000 | + | 3.00000 | + +---------+ + """ + return self.__ceil__() + + def floor(self) -> ExperimentalCell[R]: + """ + Round down to the nearest integer. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> column = ExperimentalColumn("example", [1.1, 2.9]) + >>> column.transform(lambda cell: cell.floor()) + +---------+ + | example | + | --- | + | f64 | + +=========+ + | 1.00000 | + | 2.00000 | + +---------+ + """ + return self.__floor__() + def neg(self) -> ExperimentalCell[R]: """ Negate the value. diff --git a/src/safeds/data/tabular/containers/_experimental_lazy_cell.py b/src/safeds/data/tabular/containers/_experimental_lazy_cell.py index 2ece9095d..d3b5e56cf 100644 --- a/src/safeds/data/tabular/containers/_experimental_lazy_cell.py +++ b/src/safeds/data/tabular/containers/_experimental_lazy_cell.py @@ -82,6 +82,12 @@ def __ne__(self, other: object) -> ExperimentalCell[bool]: # type: ignore[overr def __abs__(self) -> ExperimentalCell[R]: return _wrap(self._expression.__abs__()) + def __ceil__(self) -> ExperimentalCell[R]: + return _wrap(self._expression.ceil()) + + def __floor__(self) -> ExperimentalCell[R]: + return _wrap(self._expression.floor()) + def __neg__(self) -> ExperimentalCell[R]: return _wrap(self._expression.__neg__()) diff --git a/src/safeds/data/tabular/containers/_experimental_vectorized_cell.py b/src/safeds/data/tabular/containers/_experimental_vectorized_cell.py index 24b3049e2..b648cc2ab 100644 --- a/src/safeds/data/tabular/containers/_experimental_vectorized_cell.py +++ b/src/safeds/data/tabular/containers/_experimental_vectorized_cell.py @@ -126,6 +126,12 @@ def __ne__(self, other: object) -> ExperimentalCell[bool]: # type: 
ignore[overr def __abs__(self) -> ExperimentalCell[R]: return _wrap(self._series.__abs__()) + def __ceil__(self) -> ExperimentalCell[R]: + return _wrap(self._series.ceil()) + + def __floor__(self) -> ExperimentalCell[R]: + return _wrap(self._series.floor()) + def __neg__(self) -> ExperimentalCell[R]: return _wrap(self._series.__neg__()) From dcd4c61edb6cca61287f5fcd52d3da993bf91715 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Thu, 9 May 2024 10:48:03 +0200 Subject: [PATCH 26/40] docs: document table operations --- .../tabular/containers/_experimental_table.py | 426 +++++++++++++++++- src/safeds/exceptions/_data.py | 4 +- 2 files changed, 410 insertions(+), 20 deletions(-) diff --git a/src/safeds/data/tabular/containers/_experimental_table.py b/src/safeds/data/tabular/containers/_experimental_table.py index f618ce6cd..5c3bb5d7c 100644 --- a/src/safeds/data/tabular/containers/_experimental_table.py +++ b/src/safeds/data/tabular/containers/_experimental_table.py @@ -60,7 +60,7 @@ class ExperimentalTable: Raises ------ - ColumnLengthMismatchError + ValueError If columns have different lengths. Examples @@ -75,8 +75,39 @@ class ExperimentalTable: @staticmethod def from_columns(columns: ExperimentalColumn | list[ExperimentalColumn]) -> ExperimentalTable: + """ + Create a table from a list of columns. + + Parameters + ---------- + columns: + The columns. + + Returns + ------- + table: + The created table. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn, ExperimentalTable + >>> a = ExperimentalColumn("a", [1, 2, 3]) + >>> b = ExperimentalColumn("b", [4, 5, 6]) + >>> ExperimentalTable.from_columns([a, b]) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ + """ import polars as pl + # TODO: raises + if isinstance(columns, ExperimentalColumn): columns = [columns] @@ -141,7 +172,7 @@ def from_dict(data: dict[str, list[Any]]) -> ExperimentalTable: Raises ------ - ColumnLengthMismatchError + ValueError If columns have different lengths. Examples @@ -347,7 +378,7 @@ def number_of_rows(self) -> int: """ The number of rows in the table. - Note that this operation must fully load the data into memory, which can be expensive. + **Note:** This operation must fully load the data into memory, which can be expensive. Examples -------- @@ -360,10 +391,12 @@ def number_of_rows(self) -> int: @property def plot(self) -> ExperimentalTablePlotter: + """The plotter for the table.""" return ExperimentalTablePlotter(self) @property def schema(self) -> ExperimentalSchema: + """The schema of the table.""" return _PolarsSchema(self._lazy_frame.schema) # ------------------------------------------------------------------------------------------------------------------ @@ -374,6 +407,40 @@ def add_columns( self, columns: ExperimentalColumn | list[ExperimentalColumn], ) -> ExperimentalTable: + """ + Return a new table with additional columns. + + **Notes:** + + * The original table is not modified. + * This operation must fully load the data into memory, which can be expensive. + + Parameters + ---------- + columns: + The columns to add. + + Returns + ------- + new_table: + The table with the additional columns. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn, ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3]}) + >>> new_column = ExperimentalColumn("b", [4, 5, 6]) + >>> table.add_columns(new_column) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ + """ if isinstance(columns, ExperimentalColumn): columns = [columns] @@ -384,27 +451,148 @@ def add_columns( self._data_frame.hstack([column._series for column in columns]), ) - def compute_column( + def add_computed_column( self, name: str, computer: Callable[[ExperimentalRow], ExperimentalCell], ) -> ExperimentalTable: + """ + Return a new table with an additional computed column. + + **Note:** The original table is not modified. + + Parameters + ---------- + name: + The name of the new column. + computer: + The function that computes the values of the new column. + + Returns + ------- + new_table: + The table with the computed column. + + Raises + ------ + ValueError + If the column name already exists. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.add_computed_column("c", lambda row: row.get_value("a") + row.get_value("b")) + +-----+-----+-----+ + | a | b | c | + | --- | --- | --- | + | i64 | i64 | i64 | + +=================+ + | 1 | 4 | 5 | + | 2 | 5 | 7 | + | 3 | 6 | 9 | + +-----+-----+-----+ + """ if self.has_column(name): raise DuplicateColumnNameError(name) computed_column = computer(_LazyVectorizedRow(self)) return self._from_polars_lazy_frame( - self._lazy_frame.with_columns(name, computed_column._polars_expression), + self._lazy_frame.with_columns(computed_column._polars_expression.alias(name)), ) def get_column(self, name: str) -> ExperimentalColumn: + """ + Get a column from the table. + + Parameters + ---------- + name: + The name of the column. 
+ + Returns + ------- + column: + The column. + + Raises + ------ + KeyError + If the column does not exist. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.get_column("a") + +-----+ + | a | + | --- | + | i64 | + +=====+ + | 1 | + | 2 | + | 3 | + +-----+ + """ + if not self.has_column(name): + raise UnknownColumnNameError([name]) + return ExperimentalColumn._from_polars_series(self._data_frame.get_column(name)) def get_column_type(self, name: str) -> ExperimentalDataType: + """ + Get the data type of a column. + + Parameters + ---------- + name: + The name of the column. + + Returns + ------- + type: + The data type of the column. + + Raises + ------ + KeyError + If the column does not exist. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.get_column_type("a") + Int64 + """ + if not self.has_column(name): + raise UnknownColumnNameError([name]) + return _PolarsDataType(self._lazy_frame.schema[name]) def has_column(self, name: str) -> bool: + """ + Check if the table has a column with a specific name. + + Parameters + ---------- + name: + The name of the column. + + Returns + ------- + has_column: + Whether the table has a column with the specified name. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.has_column("a") + True + """ return name in self.column_names def remove_columns( @@ -412,9 +600,46 @@ def remove_columns( names: str | list[str], /, ) -> ExperimentalTable: + """ + Return a new table without the specified columns. + + **Note:** The original table is not modified. + + Parameters + ---------- + names: + The names of the columns to remove. 
+ + Returns + ------- + new_table: + The table with the columns removed. + + Raises + ------ + KeyError + If a column does not exist. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.remove_columns("a") + +-----+ + | b | + | --- | + | i64 | + +=====+ + | 4 | + | 5 | + | 6 | + +-----+ + """ if isinstance(names, str): names = [names] + # TODO: raises? + return ExperimentalTable._from_polars_lazy_frame( self._lazy_frame.drop(names), ) @@ -424,14 +649,77 @@ def remove_columns_except( names: str | list[str], /, ) -> ExperimentalTable: + """ + Return a new table with only the specified columns. + + Parameters + ---------- + names: + The names of the columns to keep. + + Returns + ------- + new_table: + The table with only the specified columns. + + Raises + ------ + KeyError + If a column does not exist. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.remove_columns_except("a") + +-----+ + | a | + | --- | + | i64 | + +=====+ + | 1 | + | 2 | + | 3 | + +-----+ + """ if isinstance(names, str): names = [names] + # TODO: raises? + return ExperimentalTable._from_polars_lazy_frame( self._lazy_frame.select(names), ) def remove_columns_with_missing_values(self) -> ExperimentalTable: + """ + Return a new table without columns that contain missing values. + + **Notes:** + + * The original table is not modified. + * This operation must fully load the data into memory, which can be expensive. + + Returns + ------- + new_table: + The table without columns containing missing values. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, None]}) + >>> table.remove_columns_with_missing_values() + +-----+ + | a | + | --- | + | i64 | + +=====+ + | 1 | + | 2 | + | 3 | + +-----+ + """ import polars as pl return ExperimentalTable._from_polars_lazy_frame( @@ -441,6 +729,31 @@ def remove_columns_with_missing_values(self) -> ExperimentalTable: ) def remove_non_numeric_columns(self) -> ExperimentalTable: + """ + Return a new table without non-numeric columns. + + **Note:** The original table is not modified. + + Returns + ------- + new_table: + The table without non-numeric columns. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": ["4", "5", "6"]}) + >>> table.remove_non_numeric_columns() + +-----+ + | a | + | --- | + | i64 | + +=====+ + | 1 | + | 2 | + | 3 | + +-----+ + """ import polars.selectors as cs return ExperimentalTable._from_polars_lazy_frame( @@ -451,7 +764,7 @@ def rename_column(self, old_name: str, new_name: str) -> ExperimentalTable: """ Return a new table with a column renamed. - Note that the original table is not modified. + **Note:** The original table is not modified. Parameters ---------- @@ -465,13 +778,18 @@ def rename_column(self, old_name: str, new_name: str) -> ExperimentalTable: new_table: The table with the column renamed. + Raises + ------ + KeyError + If no column with the old name exists. + Examples -------- >>> from safeds.data.tabular.containers import ExperimentalTable >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> table.rename_column("a", "A") + >>> table.rename_column("a", "c") +-----+-----+ - | A | b | + | c | b | | --- | --- | | i64 | i64 | +===========+ @@ -480,7 +798,9 @@ def rename_column(self, old_name: str, new_name: str) -> ExperimentalTable: | 3 | 6 | +-----+-----+ """ - # TODO: raises? 
+ if not self.has_column(old_name): + raise UnknownColumnNameError([old_name]) + return ExperimentalTable._from_polars_lazy_frame( self._lazy_frame.rename({old_name: new_name}), ) @@ -490,20 +810,90 @@ def replace_column( old_name: str, new_columns: ExperimentalColumn | list[ExperimentalColumn], ) -> ExperimentalTable: + """ + Return a new table with a column replaced by zero or more columns. + + **Note:** + + * The original table is not modified. + * If a column is replaced by multiple columns, this operation must fully load the data into memory, which can be + expensive. + + Parameters + ---------- + old_name: + The name of the column to replace. + new_columns: + The new column or columns. + + Returns + ------- + new_table: + The table with the column replaced. + + Raises + ------ + KeyError + If no column with the old name exists. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn, ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.replace_column("a", []) + +-----+ + | b | + | --- | + | i64 | + +=====+ + | 4 | + | 5 | + | 6 | + +-----+ + + >>> column1 = ExperimentalColumn("c", [7, 8, 9]) + >>> table.replace_column("a", column1) + +-----+-----+ + | c | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 7 | 4 | + | 8 | 5 | + | 9 | 6 | + +-----+-----+ + + >>> column2 = ExperimentalColumn("d", [10, 11, 12]) + >>> table.replace_column("a", [column1, column2]) + +-----+-----+-----+ + | c | d | b | + | --- | --- | --- | + | i64 | i64 | i64 | + +=================+ + | 7 | 10 | 4 | + | 8 | 11 | 5 | + | 9 | 12 | 6 | + +-----+-----+-----+ + """ + if not self.has_column(old_name): + raise UnknownColumnNameError([old_name]) + if isinstance(new_columns, ExperimentalColumn): new_columns = [new_columns] if len(new_columns) == 0: return self.remove_columns(old_name) - new_frame = self._data_frame - index = new_frame.get_column_index(old_name) - if len(new_columns) == 1: - return 
ExperimentalTable._from_polars_data_frame( - new_frame.replace_column(index, new_columns[0]._series), + new_column = new_columns[0] + return ExperimentalTable._from_polars_lazy_frame( + self._lazy_frame + .with_columns(new_column._series.alias(old_name)) + .rename({old_name: new_column.name}), ) + new_frame = self._data_frame + index = new_frame.get_column_index(old_name) prefix = new_frame.select(self.column_names[:index]) suffix = new_frame.select(self.column_names[index + 1:]) @@ -593,7 +983,7 @@ def remove_rows_with_missing_values( """ Remove rows with missing values from the table. - Note that the original table is not modified. + **Note:** The original table is not modified. Parameters ---------- @@ -813,7 +1203,7 @@ def to_json_file( If the file and/or the parent directories do not exist, they will be created. If the file exists already, it will be overwritten. - Note that this operation must fully load the data into memory, which can be expensive. + **Note:** This operation must fully load the data into memory, which can be expensive. Parameters ---------- @@ -948,7 +1338,7 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): # The specification of the dataframe interchange protocol can be found [here](https://data-apis.org/dataframe-protocol/latest/index.html). - Note that this operation must fully load the data into memory, which can be expensive. + **Note:** This operation must fully load the data into memory, which can be expensive. Parameters ---------- @@ -973,7 +1363,7 @@ def _repr_html_(self) -> str: """ Return a compact HTML representation of the table for IPython. - Note that this operation must fully load the data into memory, which can be expensive. + **Note:** This operation must fully load the data into memory, which can be expensive. 
Returns ------- diff --git a/src/safeds/exceptions/_data.py b/src/safeds/exceptions/_data.py index ec37ae31a..a13542537 100644 --- a/src/safeds/exceptions/_data.py +++ b/src/safeds/exceptions/_data.py @@ -53,7 +53,7 @@ def __init__(self, column_info: str, help_msg: str | None = None) -> None: ) -class DuplicateColumnNameError(Exception): +class DuplicateColumnNameError(ValueError): """ Exception raised for trying to modify a table resulting in a duplicate column name. @@ -120,7 +120,7 @@ def __init__(self, expected_size: str, actual_size: str): super().__init__(f"Expected a column of size {expected_size} but got column of size {actual_size}.") -class ColumnLengthMismatchError(Exception): +class ColumnLengthMismatchError(ValueError): """Exception raised when the lengths of two or more columns do not match.""" def __init__(self, column_info: str): From e7315c2c632dc4553fd2fdfe205c339bb6de0a45 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Thu, 9 May 2024 11:04:08 +0200 Subject: [PATCH 27/40] perf: lazy `replace_column` --- .../tabular/containers/_experimental_table.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/safeds/data/tabular/containers/_experimental_table.py b/src/safeds/data/tabular/containers/_experimental_table.py index 5c3bb5d7c..9ec12d421 100644 --- a/src/safeds/data/tabular/containers/_experimental_table.py +++ b/src/safeds/data/tabular/containers/_experimental_table.py @@ -816,8 +816,6 @@ def replace_column( **Note:** * The original table is not modified. - * If a column is replaced by multiple columns, this operation must fully load the data into memory, which can be - expensive. 
Parameters ---------- @@ -892,13 +890,16 @@ def replace_column( .rename({old_name: new_column.name}), ) - new_frame = self._data_frame - index = new_frame.get_column_index(old_name) - prefix = new_frame.select(self.column_names[:index]) - suffix = new_frame.select(self.column_names[index + 1:]) + import polars as pl - return ExperimentalTable._from_polars_data_frame( - prefix.hstack([column._series for column in new_columns]).hstack(suffix), + index = self.column_names.index(old_name) + + return ExperimentalTable._from_polars_lazy_frame( + self._lazy_frame.select( + *[pl.col(name) for name in self.column_names[:index]], + *[column._series for column in new_columns], + *[pl.col(name) for name in self.column_names[index + 1:]], + ), ) def transform_column( From 1eaec6a2a723605097669afb2aacc78a655d8851 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Thu, 9 May 2024 11:35:45 +0200 Subject: [PATCH 28/40] test: benchmark for remove_rows_with_outliers --- benchmarks/table/row_operations_polars.py | 33 +++++++++++++---------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/benchmarks/table/row_operations_polars.py b/benchmarks/table/row_operations_polars.py index e6fc14aeb..403bfb80d 100644 --- a/benchmarks/table/row_operations_polars.py +++ b/benchmarks/table/row_operations_polars.py @@ -1,5 +1,7 @@ from timeit import timeit +import polars as pl + from safeds.data.tabular.containers import ExperimentalTable from benchmarks.table.utils import create_synthetic_table_polars @@ -15,8 +17,8 @@ def _run_remove_rows_with_missing_values() -> None: table.remove_rows_with_missing_values()._lazy_frame.collect() -# def _run_remove_rows_with_outliers() -> None: -# table.remove_rows_with_outliers() +def _run_remove_rows_with_outliers() -> None: + table.remove_rows_with_outliers() def _run_remove_rows() -> None: @@ -55,7 +57,7 @@ def _run_transform_column() -> None: if __name__ == "__main__": # Create a synthetic Table - table = create_synthetic_table_polars(100000, 50) 
+ table = create_synthetic_table_polars(1000, 50) # Run the benchmarks timings: dict[str, float] = { @@ -67,10 +69,10 @@ def _run_transform_column() -> None: _run_remove_rows_with_missing_values, number=REPETITIONS, ), - # "remove_rows_with_outliers": timeit( - # _run_remove_rows_with_outliers, - # number=REPETITIONS, - # ), + "remove_rows_with_outliers": timeit( + _run_remove_rows_with_outliers, + number=REPETITIONS, + ), "remove_rows": timeit( _run_remove_rows, number=REPETITIONS, @@ -106,11 +108,14 @@ def _run_transform_column() -> None: } # Print the timings - print( - ExperimentalTable( - { - "method": list(timings.keys()), - "timing": list(timings.values()), - } + with pl.Config( + tbl_rows=-1, + ): + print( + ExperimentalTable( + { + "method": list(timings.keys()), + "timing": list(timings.values()), + } + ) ) - ) From 69cda9d6329b7968a7dc527b1864c21cc35fee37 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Thu, 9 May 2024 11:42:08 +0200 Subject: [PATCH 29/40] docs: more documentation for table operations --- .../tabular/containers/_experimental_table.py | 171 +++++++++++++++++- 1 file changed, 162 insertions(+), 9 deletions(-) diff --git a/src/safeds/data/tabular/containers/_experimental_table.py b/src/safeds/data/tabular/containers/_experimental_table.py index 9ec12d421..6922db640 100644 --- a/src/safeds/data/tabular/containers/_experimental_table.py +++ b/src/safeds/data/tabular/containers/_experimental_table.py @@ -506,6 +506,8 @@ def get_column(self, name: str) -> ExperimentalColumn: """ Get a column from the table. + **Note:** This operation must fully load the data into memory, which can be expensive. + Parameters ---------- name: @@ -813,9 +815,7 @@ def replace_column( """ Return a new table with a column replaced by zero or more columns. - **Note:** - - * The original table is not modified. + **Note:** The original table is not modified. 
Parameters ---------- @@ -907,6 +907,44 @@ def transform_column( name: str, transformer: Callable[[ExperimentalCell], ExperimentalCell], ) -> ExperimentalTable: + """ + Return a new table with a column transformed. + + **Note:** The original table is not modified. + + Parameters + ---------- + name: + The name of the column to transform. + + transformer: + The function that transforms the column. + + Returns + ------- + new_table: + The table with the transformed column. + + Raises + ------ + KeyError + If no column with the specified name exists. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.transform_column("a", lambda cell: cell + 1) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 2 | 4 | + | 3 | 5 | + | 4 | 6 | + +-----+-----+ + """ if not self.has_column(name): raise UnknownColumnNameError([name]) # TODO: in the error, compute similar column names @@ -930,7 +968,7 @@ def remove_duplicate_rows(self) -> ExperimentalTable: Returns ------- - filtered_table: + new_table: The table without duplicate rows. Examples @@ -955,10 +993,39 @@ def remove_rows( self, query: Callable[[ExperimentalRow], ExperimentalCell[bool]], ) -> ExperimentalTable: + """ + Remove rows from the table that satisfy a condition. + + **Note:** The original table is not modified. + + Parameters + ---------- + query: + The function that determines which rows to remove. + + Returns + ------- + new_table: + The table without the specified rows. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.remove_rows(lambda row: row.get_value("a") == 2) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 3 | 6 | + +-----+-----+ + """ mask = query(_LazyVectorizedRow(self)) return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.filter(mask._polars_expression), + self._lazy_frame.filter(~mask._polars_expression), ) def remove_rows_by_column( @@ -966,6 +1033,42 @@ def remove_rows_by_column( name: str, query: Callable[[ExperimentalCell], ExperimentalCell[bool]], ) -> ExperimentalTable: + """ + Remove rows from the table that satisfy a condition on a specific column. + + **Note:** The original table is not modified. + + Parameters + ---------- + name: + The name of the column. + query: + The function that determines which rows to remove. + + Returns + ------- + new_table: + The table without the specified rows. + + Raises + ------ + KeyError + If the column does not exist. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.remove_rows_by_column("a", lambda cell: cell == 2) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 3 | 6 | + +-----+-----+ + """ import polars as pl if not self.has_column(name): @@ -974,7 +1077,7 @@ def remove_rows_by_column( mask = query(_LazyCell(pl.col(name))) return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.filter(mask._polars_expression), + self._lazy_frame.filter(~mask._polars_expression), ) def remove_rows_with_missing_values( @@ -982,7 +1085,7 @@ def remove_rows_with_missing_values( subset_names: list[str] | None = None, ) -> ExperimentalTable: """ - Remove rows with missing values from the table. 
+ Remove rows with missing values in at least one the specified columns from the table. **Note:** The original table is not modified. @@ -993,7 +1096,7 @@ def remove_rows_with_missing_values( Returns ------- - filtered_table: + new_table: The table without rows containing missing values in the specified columns. Examples @@ -1016,7 +1119,57 @@ def remove_rows_with_missing_values( def remove_rows_with_outliers( self, subset_names: list[str] | None = None, + *, + z_score_threshold: float = 3, ) -> ExperimentalTable: + """ + Remove rows with outliers in at least one of the specified columns from the table. + + Whether a data point is an outlier in a column is determined by its z-score. The z-score the distance of the + data point from the mean of the column divided by the standard deviation of the column. If the z-score is + greater than the given threshold, the data point is considered an outlier. Missing values are ignored during the + calculation of the z-score. + + + + **Note:** The original table is not modified. + + Parameters + ---------- + subset_names: + Names of the columns to consider. If None, all numeric columns are considered. + z_score_threshold: + The z-score threshold for detecting outliers. + + Returns + ------- + new_table: + The table without rows containing outliers in the specified columns. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable( + ... { + ... "a": [1, 2, 3, 4, 5, 6, 1000, None], + ... "b": [1, 2, 3, 4, 5, 6, 7, 8], + ... } + ... 
) + >>> table.remove_rows_with_outliers(z_score_threshold=2) + +------+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +============+ + | 1 | 1 | + | 2 | 2 | + | 3 | 3 | + | 4 | 4 | + | 5 | 5 | + | 6 | 6 | + | null | 8 | + +------+-----+ + """ if subset_names is None: subset_names = self.column_names @@ -1027,7 +1180,7 @@ def remove_rows_with_outliers( self._data_frame .select(cs.numeric() & cs.by_name(subset_names)) .select( - ((pl.all() - pl.all().mean()) / pl.all().std()) <= 3, + pl.all().is_null() | (((pl.all() - pl.all().mean()) / pl.all().std()).abs() <= z_score_threshold), ), ) From caf17454230f58da962bce782f3ef3cece1da51e Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Thu, 9 May 2024 12:12:39 +0200 Subject: [PATCH 30/40] docs: more documentation for table operations --- .../tabular/containers/_experimental_table.py | 298 +++++++++++++++++- 1 file changed, 284 insertions(+), 14 deletions(-) diff --git a/src/safeds/data/tabular/containers/_experimental_table.py b/src/safeds/data/tabular/containers/_experimental_table.py index 6922db640..448d44ac2 100644 --- a/src/safeds/data/tabular/containers/_experimental_table.py +++ b/src/safeds/data/tabular/containers/_experimental_table.py @@ -964,7 +964,9 @@ def transform_column( def remove_duplicate_rows(self) -> ExperimentalTable: """ - Remove duplicate rows from the table. + Return a new table without duplicate rows. + + **Note:** The original table is not modified. Returns ------- @@ -994,7 +996,7 @@ def remove_rows( query: Callable[[ExperimentalRow], ExperimentalCell[bool]], ) -> ExperimentalTable: """ - Remove rows from the table that satisfy a condition. + Return a new table without rows that satisfy a condition. **Note:** The original table is not modified. @@ -1034,7 +1036,7 @@ def remove_rows_by_column( query: Callable[[ExperimentalCell], ExperimentalCell[bool]], ) -> ExperimentalTable: """ - Remove rows from the table that satisfy a condition on a specific column. 
+ Return a new table without rows that satisfy a condition on a specific column. **Note:** The original table is not modified. @@ -1082,16 +1084,16 @@ def remove_rows_by_column( def remove_rows_with_missing_values( self, - subset_names: list[str] | None = None, + column_names: list[str] | None = None, ) -> ExperimentalTable: """ - Remove rows with missing values in at least one the specified columns from the table. + Return a new table without rows containing missing values in the specified columns. **Note:** The original table is not modified. Parameters ---------- - subset_names: + column_names: Names of the columns to consider. If None, all columns are considered. Returns @@ -1113,30 +1115,34 @@ def remove_rows_with_missing_values( +-----+-----+ """ return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame.drop_nulls(subset=subset_names), + self._lazy_frame.drop_nulls(subset=column_names), ) def remove_rows_with_outliers( self, - subset_names: list[str] | None = None, + column_names: list[str] | None = None, *, z_score_threshold: float = 3, ) -> ExperimentalTable: """ - Remove rows with outliers in at least one of the specified columns from the table. + Return a new table without rows containing outliers in the specified columns. Whether a data point is an outlier in a column is determined by its z-score. The z-score the distance of the data point from the mean of the column divided by the standard deviation of the column. If the z-score is greater than the given threshold, the data point is considered an outlier. Missing values are ignored during the calculation of the z-score. + The z-score is only defined for numeric columns. Non-numeric columns are ignored, even if they are specified in + `column_names`. + **Notes:** - **Note:** The original table is not modified. + * The original table is not modified. + * This operation must fully load the data into memory, which can be expensive. 
Parameters ---------- - subset_names: + column_names: Names of the columns to consider. If None, all numeric columns are considered. z_score_threshold: The z-score threshold for detecting outliers. @@ -1170,15 +1176,15 @@ def remove_rows_with_outliers( | null | 8 | +------+-----+ """ - if subset_names is None: - subset_names = self.column_names + if column_names is None: + column_names = self.column_names import polars as pl import polars.selectors as cs non_outlier_mask = pl.all_horizontal( self._data_frame - .select(cs.numeric() & cs.by_name(subset_names)) + .select(cs.numeric() & cs.by_name(column_names)) .select( pl.all().is_null() | (((pl.all() - pl.all().mean()) / pl.all().std()).abs() <= z_score_threshold), ), @@ -1189,6 +1195,31 @@ def remove_rows_with_outliers( ) def shuffle_rows(self) -> ExperimentalTable: + """ + Return a new table with the rows shuffled. + + **Note:** The original table is not modified. + + Returns + ------- + new_table: + The table with the rows shuffled. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.shuffle_rows() + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 3 | 6 | + | 2 | 5 | + | 1 | 4 | + +-----+-----+ + """ return ExperimentalTable._from_polars_data_frame( self._data_frame.sample( fraction=1, @@ -1198,6 +1229,46 @@ def shuffle_rows(self) -> ExperimentalTable: ) def slice_rows(self, start: int = 0, length: int | None = None) -> ExperimentalTable: + """ + Return a new table with a slice of rows. + + **Note:** The original table is not modified. + + Parameters + ---------- + start: + The start index of the slice. + length: + The length of the slice. If None, the slice contains all rows starting from `start`. + + Returns + ------- + new_table: + The table with the slice of rows. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> table.slice_rows(start=1) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 2 | 5 | + | 3 | 6 | + +-----+-----+ + + >>> table.slice_rows(start=1, length=1) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 2 | 5 | + +-----+-----+ + """ return ExperimentalTable._from_polars_lazy_frame( self._lazy_frame.slice(start, length), ) @@ -1208,6 +1279,38 @@ def sort_rows( *, descending: bool = False, ) -> ExperimentalTable: + """ + Return a new table with the rows sorted. + + **Note:** The original table is not modified. + + Parameters + ---------- + key_selector: + The function that selects the key to sort by. + descending: + Whether to sort in descending order. + + Returns + ------- + new_table: + The table with the rows sorted. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [2, 1, 3], "b": [1, 1, 2]}) + >>> table.sort_rows(lambda row: row.get_value("a") - row.get_value("b")) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 1 | + | 2 | 1 | + | 3 | 2 | + +-----+-----+ + """ key = key_selector(_LazyVectorizedRow(self)) return ExperimentalTable._from_polars_lazy_frame( @@ -1224,6 +1327,46 @@ def sort_rows_by_column( *, descending: bool = False, ) -> ExperimentalTable: + """ + Return a new table with the rows sorted by a specific column. + + **Note:** The original table is not modified. + + Parameters + ---------- + name: + The name of the column to sort by. + descending: + Whether to sort in descending order. + + Returns + ------- + new_table: + The table with the rows sorted by the specified column. + + Raises + ------ + KeyError + If the column does not exist. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [2, 1, 3], "b": [1, 1, 2]}) + >>> table.sort_rows_by_column("a") + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 1 | + | 2 | 1 | + | 3 | 2 | + +-----+-----+ + """ + if not self.has_column(name): + raise UnknownColumnNameError([name]) + return ExperimentalTable._from_polars_lazy_frame( self._lazy_frame.sort( name, @@ -1238,6 +1381,58 @@ def split_rows( *, shuffle: bool = True, ) -> tuple[ExperimentalTable, ExperimentalTable]: + """ + Create two tables by splitting the rows of the current table. + + The first table contains a percentage of the rows specified by `percentage_in_first`, and the second table + contains the remaining rows. + + **Note:** The original table is not modified. + + Parameters + ---------- + percentage_in_first: + The percentage of rows to include in the first table. Must be between 0 and 1. + shuffle: + Whether to shuffle the rows before splitting. + + Returns + ------- + first_table: + The first table. + second_table: + The second table. + + Raises + ------ + ValueError + If `percentage_in_first` is not between 0 and 1. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3, 4, 5], "b": [6, 7, 8, 9, 10]}) + >>> first_table, second_table = table.split_rows(0.6) + >>> first_table + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 6 | + | 4 | 9 | + | 3 | 8 | + +-----+-----+ + >>> second_table + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 5 | 10 | + | 2 | 7 | + +-----+-----+ + """ if percentage_in_first < 0 or percentage_in_first > 1: raise OutOfBoundsError( actual=percentage_in_first, @@ -1259,11 +1454,86 @@ def split_rows( # ------------------------------------------------------------------------------------------------------------------ def add_table_as_columns(self, other: ExperimentalTable) -> ExperimentalTable: + """ + Return a new table with the columns of another table added. + + **Notes:** + + * The original tables are not modified. + * This operation must fully load the data into memory, which can be expensive. + + Parameters + ---------- + other: + The table to add as columns. + + Returns + ------- + new_table: + The table with the columns added. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table1 = ExperimentalTable({"a": [1, 2, 3]}) + >>> table2 = ExperimentalTable({"b": [4, 5, 6]}) + >>> table1.add_table_as_columns(table2) + +-----+-----+ + | a | b | + | --- | --- | + | i64 | i64 | + +===========+ + | 1 | 4 | + | 2 | 5 | + | 3 | 6 | + +-----+-----+ + """ + # TODO: raises? + return ExperimentalTable._from_polars_data_frame( self._data_frame.hstack(other._data_frame), ) def add_table_as_rows(self, other: ExperimentalTable) -> ExperimentalTable: + """ + Return a new table with the rows of another table added. + + **Notes:** + + * The original tables are not modified. + * This operation must fully load the data into memory, which can be expensive. 
+ + Parameters + ---------- + other: + The table to add as rows. + + Returns + ------- + new_table: + The table with the rows added. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table1 = ExperimentalTable({"a": [1, 2, 3]}) + >>> table2 = ExperimentalTable({"a": [4, 5, 6]}) + >>> table1.add_table_as_rows(table2) + +-----+ + | a | + | --- | + | i64 | + +=====+ + | 1 | + | 2 | + | 3 | + | 4 | + | 5 | + | 6 | + +-----+ + """ + # TODO: raises? + return ExperimentalTable._from_polars_data_frame( self._data_frame.vstack(other._data_frame), ) From 973db7f0f6611c4c26fbab23468fc259e03f1a5f Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Thu, 9 May 2024 13:55:17 +0200 Subject: [PATCH 31/40] docs: remainder of table --- .../tabular/containers/_experimental_table.py | 114 ++++++++++++++++++ .../_experimental_discretizer.py | 2 +- .../_experimental_label_encoder.py | 4 +- .../_experimental_one_hot_encoder.py | 8 +- .../_experimental_range_scaler.py | 32 +++-- .../_experimental_simple_imputer.py | 2 +- .../_experimental_standard_scaler.py | 6 +- 7 files changed, 143 insertions(+), 25 deletions(-) diff --git a/src/safeds/data/tabular/containers/_experimental_table.py b/src/safeds/data/tabular/containers/_experimental_table.py index 448d44ac2..04729cbf3 100644 --- a/src/safeds/data/tabular/containers/_experimental_table.py +++ b/src/safeds/data/tabular/containers/_experimental_table.py @@ -1539,9 +1539,80 @@ def add_table_as_rows(self, other: ExperimentalTable) -> ExperimentalTable: ) def inverse_transform_table(self, fitted_transformer: ExperimentalInvertibleTableTransformer) -> ExperimentalTable: + """ + Return a new table inverse-transformed by a **fitted, invertible** transformer. + + **Notes:** + + * The original table is not modified. + * Depending on the transformer, this operation might fully load the data into memory, which can be expensive. 
+ + Parameters + ---------- + fitted_transformer: + The fitted, invertible transformer to apply. + + Returns + ------- + new_table: + The inverse-transformed table. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> from safeds.data.tabular.transformation import ExperimentalRangeScaler + >>> table = ExperimentalTable({"a": [1, 2, 3]}) + >>> transformer, transformed_table = ExperimentalRangeScaler(min_=0, max_=1).fit_and_transform(table, ["a"]) + >>> transformed_table.inverse_transform_table(transformer) + +---------+ + | a | + | --- | + | f64 | + +=========+ + | 1.00000 | + | 2.00000 | + | 3.00000 | + +---------+ + """ return fitted_transformer.inverse_transform(self) def transform_table(self, fitted_transformer: ExperimentalTableTransformer) -> ExperimentalTable: + """ + Return a new table transformed by a **fitted** transformer. + + **Notes:** + + * The original table is not modified. + * Depending on the transformer, this operation might fully load the data into memory, which can be expensive. + + + Parameters + ---------- + fitted_transformer: + The fitted transformer to apply. + + Returns + ------- + new_table: + The transformed table. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> from safeds.data.tabular.transformation import ExperimentalRangeScaler + >>> table = ExperimentalTable({"a": [1, 2, 3]}) + >>> transformer = ExperimentalRangeScaler(min_=0, max_=1).fit(table, ["a"]) + >>> table.transform_table(transformer) + +---------+ + | a | + | --- | + | f64 | + +=========+ + | 0.00000 | + | 0.50000 | + | 1.00000 | + +---------+ + """ return fitted_transformer.transform(self) # ------------------------------------------------------------------------------------------------------------------ @@ -1549,6 +1620,35 @@ def transform_table(self, fitted_transformer: ExperimentalTableTransformer) -> E # ------------------------------------------------------------------------------------------------------------------ def summarize_statistics(self) -> ExperimentalTable: + """ + Return a table with important statistics about this table. + + Returns + ------- + statistics: + The table with statistics. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 3]}) + >>> table.summarize_statistics() + +----------------------+--------------------+ + | metric | a | + | --- | --- | + | str | str | + +===========================================+ + | min | 1 | + | max | 3 | + | mean | 2.0 | + | median | 2.0 | + | standard deviation | 1.4142135623730951 | + | distinct value count | 2 | + | idness | 1.0 | + | missing value ratio | 0.0 | + | stability | 0.5 | + +----------------------+--------------------+ + """ if self.number_of_columns == 0: return ExperimentalTable() @@ -1567,6 +1667,20 @@ def summarize_statistics(self) -> ExperimentalTable: # ------------------------------------------------------------------------------------------------------------------ def to_columns(self) -> list[ExperimentalColumn]: + """ + Return the data of the table as a list of columns. 
+ + Returns + ------- + columns: + List of columns. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> columns = table.to_columns() + """ return [ExperimentalColumn._from_polars_series(column) for column in self._data_frame.get_columns()] def to_csv_file(self, path: str | Path) -> None: diff --git a/src/safeds/data/tabular/transformation/_experimental_discretizer.py b/src/safeds/data/tabular/transformation/_experimental_discretizer.py index 74ee3052f..ea3485831 100644 --- a/src/safeds/data/tabular/transformation/_experimental_discretizer.py +++ b/src/safeds/data/tabular/transformation/_experimental_discretizer.py @@ -152,7 +152,7 @@ def transform(self, table: ExperimentalTable) -> ExperimentalTable: table.remove_columns_except(self._column_names)._data_frame, ) return ExperimentalTable._from_polars_lazy_frame( - table._lazy_frame.update(new_data), + table._lazy_frame.update(new_data.lazy()), ) @property diff --git a/src/safeds/data/tabular/transformation/_experimental_label_encoder.py b/src/safeds/data/tabular/transformation/_experimental_label_encoder.py index 3156edd96..a556260aa 100644 --- a/src/safeds/data/tabular/transformation/_experimental_label_encoder.py +++ b/src/safeds/data/tabular/transformation/_experimental_label_encoder.py @@ -121,7 +121,7 @@ def transform(self, table: ExperimentalTable) -> ExperimentalTable: table.remove_columns_except(self._column_names)._data_frame, ) return ExperimentalTable._from_polars_lazy_frame( - table._lazy_frame.update(new_data), + table._lazy_frame.update(new_data.lazy()), ) def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: @@ -182,7 +182,7 @@ def inverse_transform(self, transformed_table: ExperimentalTable) -> Experimenta transformed_table.remove_columns_except(self._column_names)._data_frame, ) return ExperimentalTable._from_polars_lazy_frame( - 
transformed_table._lazy_frame.update(new_data), + transformed_table._lazy_frame.update(new_data.lazy()), ) @property diff --git a/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py b/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py index 0e8bc7b85..047219c34 100644 --- a/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py +++ b/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py @@ -4,7 +4,7 @@ from collections import Counter from typing import Any -from safeds.data.tabular.containers import Column, ExperimentalColumn, ExperimentalTable +from safeds.data.tabular.containers import ExperimentalColumn, ExperimentalTable from safeds.exceptions import ( NonNumericColumnError, TransformerNotFittedError, @@ -268,8 +268,6 @@ def inverse_transform(self, transformed_table: ExperimentalTable) -> Experimenta ValueError If the table contains 0 rows. """ - import numpy as np - # Transformer has not been fitted yet if self._column_names is None or self._value_to_column is None or self._value_to_column_nans is None: raise TransformerNotFittedError @@ -313,12 +311,12 @@ def inverse_transform(self, transformed_table: ExperimentalTable) -> Experimenta constructed_column = self._value_to_column_nans[original_column_name] for i in range(transformed_table.number_of_rows): if transformed_table.get_column(constructed_column)[i] == 1.0: - original_columns[original_column_name][i] = np.nan + original_columns[original_column_name][i] = None table = transformed_table for column_name, encoded_column in original_columns.items(): - table = table.add_columns(Column(column_name, encoded_column)) + table = table.add_columns(ExperimentalColumn(column_name, encoded_column)) # Drop old column names: table = table.remove_columns(list(self._value_to_column.values())) diff --git a/src/safeds/data/tabular/transformation/_experimental_range_scaler.py b/src/safeds/data/tabular/transformation/_experimental_range_scaler.py 
index f1a6b0be4..35478446e 100644 --- a/src/safeds/data/tabular/transformation/_experimental_range_scaler.py +++ b/src/safeds/data/tabular/transformation/_experimental_range_scaler.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING +from safeds.data.tabular.containers import ExperimentalTable from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError from ._experimental_invertible_table_transformer import ExperimentalInvertibleTableTransformer @@ -9,8 +10,6 @@ if TYPE_CHECKING: from sklearn.preprocessing import MinMaxScaler as sk_MinMaxScaler - from safeds.data.tabular.containers import ExperimentalTable - class ExperimentalRangeScaler(ExperimentalInvertibleTableTransformer): """ @@ -18,9 +17,9 @@ class ExperimentalRangeScaler(ExperimentalInvertibleTableTransformer): Parameters ---------- - minimum: + min_: The minimum of the new range after the transformation - maximum: + max_: The maximum of the new range after the transformation Raises @@ -29,13 +28,13 @@ class ExperimentalRangeScaler(ExperimentalInvertibleTableTransformer): If the given minimum is greater or equal to the given maximum """ - def __init__(self, minimum: float = 0.0, maximum: float = 1.0): + def __init__(self, min_: float = 0.0, max_: float = 1.0): self._column_names: list[str] | None = None self._wrapped_transformer: sk_MinMaxScaler | None = None - if minimum >= maximum: + if min_ >= max_: raise ValueError('Parameter "maximum" must be higher than parameter "minimum".') - self._minimum = minimum - self._maximum = maximum + self._minimum = min_ + self._maximum = max_ def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> ExperimentalRangeScaler: """ @@ -165,7 +164,7 @@ def transform(self, table: ExperimentalTable) -> ExperimentalTable: table.remove_columns_except(self._column_names)._data_frame, ) return ExperimentalTable._from_polars_lazy_frame( - table._lazy_frame.update(new_data), + table._lazy_frame.update(new_data.lazy()), ) def 
inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: @@ -225,11 +224,18 @@ def inverse_transform(self, transformed_table: ExperimentalTable) -> Experimenta ), ) - new_data = self._wrapped_transformer.inverse_transform( + import polars as pl + + new_data = pl.DataFrame(self._wrapped_transformer.inverse_transform( transformed_table.remove_columns_except(self._column_names)._data_frame, - ) - return ExperimentalTable._from_polars_lazy_frame( - transformed_table._lazy_frame.update(new_data), + )) + + name_mapping = dict(zip(new_data.columns, self._column_names, strict=True)) + + new_data = new_data.rename(name_mapping) + + return ExperimentalTable._from_polars_data_frame( + transformed_table._data_frame.update(new_data), ) @property diff --git a/src/safeds/data/tabular/transformation/_experimental_simple_imputer.py b/src/safeds/data/tabular/transformation/_experimental_simple_imputer.py index 4df15e978..0663403fc 100644 --- a/src/safeds/data/tabular/transformation/_experimental_simple_imputer.py +++ b/src/safeds/data/tabular/transformation/_experimental_simple_imputer.py @@ -240,7 +240,7 @@ def transform(self, table: ExperimentalTable) -> ExperimentalTable: new_data = self._wrapped_transformer.transform(table.remove_columns_except(self._column_names)._data_frame) return ExperimentalTable._from_polars_lazy_frame( - table._lazy_frame.update(new_data), + table._lazy_frame.update(new_data.lazy()), ) def get_names_of_added_columns(self) -> list[str]: diff --git a/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py b/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py index ba1fc8c34..0b93c6f4c 100644 --- a/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py +++ b/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py @@ -146,7 +146,7 @@ def transform(self, table: ExperimentalTable) -> ExperimentalTable: table.remove_columns_except(self._column_names)._data_frame, ) 
return ExperimentalTable._from_polars_lazy_frame( - table._lazy_frame.update(new_data), + table._lazy_frame.update(new_data.lazy()), ) def inverse_transform(self, transformed_table: ExperimentalTable) -> ExperimentalTable: @@ -209,8 +209,8 @@ def inverse_transform(self, transformed_table: ExperimentalTable) -> Experimenta new_data = self._wrapped_transformer.inverse_transform( transformed_table.remove_columns_except(self._column_names)._data_frame, ) - return ExperimentalTable._from_polars_lazy_frame( - transformed_table._lazy_frame.update(new_data), + return ExperimentalTable._from_polars_data_frame( + transformed_table._data_frame.update(new_data), ) @property From 8971dd33c6e4d4c7070dc1bb6b302369517023cc Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Thu, 9 May 2024 14:14:58 +0200 Subject: [PATCH 32/40] feat: scatter and line plot --- .../plotting/_experimental_table_plotter.py | 228 ++++++++---------- 1 file changed, 101 insertions(+), 127 deletions(-) diff --git a/src/safeds/data/tabular/plotting/_experimental_table_plotter.py b/src/safeds/data/tabular/plotting/_experimental_table_plotter.py index e81f8c070..0c25539ca 100644 --- a/src/safeds/data/tabular/plotting/_experimental_table_plotter.py +++ b/src/safeds/data/tabular/plotting/_experimental_table_plotter.py @@ -2,6 +2,9 @@ from typing import TYPE_CHECKING +from safeds._utils import _figure_to_image +from safeds.exceptions import NonNumericColumnError, UnknownColumnNameError + if TYPE_CHECKING: from safeds.data.image.containers import Image from safeds.data.tabular.containers import ExperimentalTable @@ -21,10 +24,106 @@ def histograms(self, *, number_of_bins: int = 10) -> Image: raise NotImplementedError def line_plot(self, x_name: str, y_name: str) -> Image: - raise NotImplementedError + # TODO: extract validation + missing_columns = [] + if not self._table.has_column(x_name): + missing_columns.append(x_name) + if not self._table.has_column(y_name): + missing_columns.append(y_name) + if 
missing_columns: + raise UnknownColumnNameError(missing_columns) + + # TODO: pass list of columns names + if not self._table.get_column(x_name).is_numeric: + raise NonNumericColumnError(x_name) + if not self._table.get_column(y_name).is_numeric: + raise NonNumericColumnError(y_name) + + import matplotlib.pyplot as plt + + fig, ax = plt.subplots() + ax.plot( + self._table.get_column(x_name)._series, + self._table.get_column(y_name)._series, + ) + ax.set( + xlabel=x_name, + ylabel=y_name, + ) + ax.set_xticks(ax.get_xticks()) + ax.set_xticklabels( + ax.get_xticklabels(), + rotation=45, + horizontalalignment="right", + ) # rotate the labels of the x Axis to prevent the chance of overlapping of the labels + fig.tight_layout() + + return _figure_to_image(fig) def scatter_plot(self, x_name: str, y_name: str) -> Image: - raise NotImplementedError + """ + Create a scatter plot for two columns in the table. + + Parameters + ---------- + x_name: + The name of the column to be plotted on the x-axis. + y_name: + The name of the column to be plotted on the y-axis. + + Returns + ------- + scatter_plot: + The plot as an image. + + Raises + ------ + KeyError + If a column does not exist. + TypeError + If a column is not numeric. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalColumn + >>> table = ExperimentalColumn("values", [1,2,3,4,3,2]) + >>> image = table.plot.lag_plot(2) + """ + # TODO: extract validation + missing_columns = [] + if not self._table.has_column(x_name): + missing_columns.append(x_name) + if not self._table.has_column(y_name): + missing_columns.append(y_name) + if missing_columns: + raise UnknownColumnNameError(missing_columns) + + # TODO: pass list of columns names + if not self._table.get_column(x_name).is_numeric: + raise NonNumericColumnError(x_name) + if not self._table.get_column(y_name).is_numeric: + raise NonNumericColumnError(y_name) + + import matplotlib.pyplot as plt + + fig, ax = plt.subplots() + ax.scatter( + x=self._table.get_column(x_name)._series, + y=self._table.get_column(y_name)._series, + ) + ax.set( + xlabel=x_name, + ylabel=y_name, + ) + ax.set_xticks(ax.get_xticks()) + ax.set_xticklabels( + ax.get_xticklabels(), + rotation=45, + horizontalalignment="right", + ) # rotate the labels of the x Axis to prevent the chance of overlapping of the labels + fig.tight_layout() + + return _figure_to_image(fig) # TODO: equivalent to Column.plot_compare_columns that takes a list of column names (index_plot)? @@ -90,131 +189,6 @@ def scatter_plot(self, x_name: str, y_name: str) -> Image: # buffer.seek(0) # return Image.from_bytes(buffer.read()) # - # def plot_lineplot(self, x_column_name: str, y_column_name: str) -> Image: - # """ - # Plot two columns against each other in a lineplot. - # - # If there are multiple x-values for a y-value, the resulting plot will consist of a line representing the mean - # and the lower-transparency area around the line representing the 95% confidence interval. - # - # Parameters - # ---------- - # x_column_name: - # The column name of the column to be plotted on the x-Axis. - # y_column_name: - # The column name of the column to be plotted on the y-Axis. 
- # - # Returns - # ------- - # plot: - # The plot as an image. - # - # Raises - # ------ - # UnknownColumnNameError - # If either of the columns do not exist. - # - # Examples - # -------- - # >>> from safeds.data.tabular.containers import Table - # >>> table = Table.from_dict({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) - # >>> image = table.plot_lineplot("temperature", "sales") - # """ - # import matplotlib.pyplot as plt - # import seaborn as sns - # - # if not self.has_column(x_column_name) or not self.has_column(y_column_name): - # similar_columns_x = self._get_similar_columns(x_column_name) - # similar_columns_y = self._get_similar_columns(y_column_name) - # raise UnknownColumnNameError( - # ([x_column_name] if not self.has_column(x_column_name) else []) - # + ([y_column_name] if not self.has_column(y_column_name) else []), - # (similar_columns_x if not self.has_column(x_column_name) else []) - # + (similar_columns_y if not self.has_column(y_column_name) else []), - # ) - # - # fig = plt.figure() - # ax = sns.lineplot( - # data=self._data, - # x=x_column_name, - # y=y_column_name, - # ) - # ax.set(xlabel=x_column_name, ylabel=y_column_name) - # ax.set_xticks(ax.get_xticks()) - # ax.set_xticklabels( - # ax.get_xticklabels(), - # rotation=45, - # horizontalalignment="right", - # ) # rotate the labels of the x Axis to prevent the chance of overlapping of the labels - # plt.tight_layout() - # - # buffer = io.BytesIO() - # fig.savefig(buffer, format="png") - # plt.close() # Prevents the figure from being displayed directly - # buffer.seek(0) - # return Image.from_bytes(buffer.read()) - # - # def plot_scatterplot(self, x_column_name: str, y_column_name: str) -> Image: - # """ - # Plot two columns against each other in a scatterplot. - # - # Parameters - # ---------- - # x_column_name: - # The column name of the column to be plotted on the x-Axis. - # y_column_name: - # The column name of the column to be plotted on the y-Axis. 
- # - # Returns - # ------- - # plot: - # The plot as an image. - # - # Raises - # ------ - # UnknownColumnNameError - # If either of the columns do not exist. - # - # Examples - # -------- - # >>> from safeds.data.tabular.containers import Table - # >>> table = Table.from_dict({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) - # >>> image = table.plot_scatterplot("temperature", "sales") - # """ - # import matplotlib.pyplot as plt - # import seaborn as sns - # - # if not self.has_column(x_column_name) or not self.has_column(y_column_name): - # similar_columns_x = self._get_similar_columns(x_column_name) - # similar_columns_y = self._get_similar_columns(y_column_name) - # raise UnknownColumnNameError( - # ([x_column_name] if not self.has_column(x_column_name) else []) - # + ([y_column_name] if not self.has_column(y_column_name) else []), - # (similar_columns_x if not self.has_column(x_column_name) else []) - # + (similar_columns_y if not self.has_column(y_column_name) else []), - # ) - # - # fig = plt.figure() - # ax = sns.scatterplot( - # data=self._data, - # x=x_column_name, - # y=y_column_name, - # ) - # ax.set(xlabel=x_column_name, ylabel=y_column_name) - # ax.set_xticks(ax.get_xticks()) - # ax.set_xticklabels( - # ax.get_xticklabels(), - # rotation=45, - # horizontalalignment="right", - # ) # rotate the labels of the x Axis to prevent the chance of overlapping of the labels - # plt.tight_layout() - # - # buffer = io.BytesIO() - # fig.savefig(buffer, format="png") - # plt.close() # Prevents the figure from being displayed directly - # buffer.seek(0) - # return Image.from_bytes(buffer.read()) - # # def plot_boxplots(self) -> Image: # """ # Plot a boxplot for every numerical column. 
From 5a151861933f504380eda674738cd02eccbb518c Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Thu, 9 May 2024 14:27:36 +0200 Subject: [PATCH 33/40] feat: histograms --- .../containers/_experimental_column.py | 1 + .../plotting/_experimental_column_plotter.py | 4 +- .../plotting/_experimental_table_plotter.py | 185 +++++++++++------- 3 files changed, 112 insertions(+), 78 deletions(-) diff --git a/src/safeds/data/tabular/containers/_experimental_column.py b/src/safeds/data/tabular/containers/_experimental_column.py index 95628e0fc..65a85ac20 100644 --- a/src/safeds/data/tabular/containers/_experimental_column.py +++ b/src/safeds/data/tabular/containers/_experimental_column.py @@ -465,6 +465,7 @@ def summarize_statistics(self) -> ExperimentalTable: """ from ._experimental_table import ExperimentalTable + # TODO: turn this around (call table method, implement in table; allows parallelization) mean = self.mean() or "-" median = self.median() or "-" standard_deviation = self.standard_deviation() or "-" diff --git a/src/safeds/data/tabular/plotting/_experimental_column_plotter.py b/src/safeds/data/tabular/plotting/_experimental_column_plotter.py index 34cb2569e..3402cd6d4 100644 --- a/src/safeds/data/tabular/plotting/_experimental_column_plotter.py +++ b/src/safeds/data/tabular/plotting/_experimental_column_plotter.py @@ -113,8 +113,8 @@ def lag_plot(self, lag: int) -> Image: Examples -------- >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> table = ExperimentalColumn("values", [1,2,3,4,3,2]) - >>> image = table.plot.lag_plot(2) + >>> column = ExperimentalColumn("values", [1, 2, 3, 4]) + >>> image = column.plot.lag_plot(2) """ if not self._column.is_numeric: raise NonNumericColumnError("This time series target contains non-numerical columns.") diff --git a/src/safeds/data/tabular/plotting/_experimental_table_plotter.py b/src/safeds/data/tabular/plotting/_experimental_table_plotter.py index 0c25539ca..5644e4c52 100644 --- 
a/src/safeds/data/tabular/plotting/_experimental_table_plotter.py +++ b/src/safeds/data/tabular/plotting/_experimental_table_plotter.py @@ -21,9 +21,109 @@ def correlation_heatmap(self) -> Image: raise NotImplementedError def histograms(self, *, number_of_bins: int = 10) -> Image: - raise NotImplementedError + """ + Plot a histogram for every column. + + Parameters + ---------- + number_of_bins: + The number of bins to use in the histogram. Default is 10. + + Returns + ------- + plot: + The plot as an image. + + Examples + -------- + >>> from safeds.data.tabular.containers import Table + >>> table = Table({"a": [2, 3, 5, 1], "b": [54, 74, 90, 2014]}) + >>> image = table.plot_histograms() + """ + import matplotlib.pyplot as plt + import numpy as np + import pandas as pd + + n_cols = min(3, self._table.number_of_columns) + n_rows = 1 + (self._table.number_of_columns - 1) // n_cols + + if n_cols == 1 and n_rows == 1: + fig, axs = plt.subplots(1, 1, tight_layout=True) + one_col = True + else: + fig, axs = plt.subplots(n_rows, n_cols, tight_layout=True, figsize=(n_cols * 3, n_rows * 3)) + one_col = False + + col_names = self._table.column_names + for col_name, ax in zip(col_names, axs.flatten() if not one_col else [axs], strict=False): + np_col = np.array(self._table.get_column(col_name)) + bins = min(number_of_bins, len(pd.unique(np_col))) + + ax.set_title(col_name) + ax.set_xlabel("") + ax.set_ylabel("") + + if self._table.get_column(col_name).type.is_numeric: + np_col = np_col[~np.isnan(np_col)] + + if bins < len(pd.unique(np_col)): + min_val = np.min(np_col) + max_val = np.max(np_col) + hist, bin_edges = np.histogram(self._table.get_column(col_name), bins, range=(min_val, max_val)) + + bars = np.array([]) + for i in range(len(hist)): + bars = np.append(bars, f"{round(bin_edges[i], 2)}-{round(bin_edges[i + 1], 2)}") + + ax.bar(bars, hist, edgecolor="black") + ax.set_xticks(np.arange(len(hist)), bars, rotation=45, horizontalalignment="right") + continue + + np_col = 
np_col.astype(str) + unique_values = np.unique(np_col) + hist = np.array([np.sum(np_col == value) for value in unique_values]) + ax.bar(unique_values, hist, edgecolor="black") + ax.set_xticks(np.arange(len(unique_values)), unique_values, rotation=45, horizontalalignment="right") + + for i in range(len(col_names), n_rows * n_cols): + fig.delaxes(axs.flatten()[i]) # Remove empty subplots + + return _figure_to_image(fig) def line_plot(self, x_name: str, y_name: str) -> Image: + """ + Create a line plot for two columns in the table. + + Parameters + ---------- + x_name: + The name of the column to be plotted on the x-axis. + y_name: + The name of the column to be plotted on the y-axis. + + Returns + ------- + line_plot: + The plot as an image. + + Raises + ------ + KeyError + If a column does not exist. + TypeError + If a column is not numeric. + + Examples + -------- + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": [2, 3, 4, 5, 6], + ... } + ... ) + >>> image = table.plot.line_plot("a", "b") + """ # TODO: extract validation missing_columns = [] if not self._table.has_column(x_name): @@ -85,9 +185,14 @@ def scatter_plot(self, x_name: str, y_name: str) -> Image: Examples -------- - >>> from safeds.data.tabular.containers import ExperimentalColumn - >>> table = ExperimentalColumn("values", [1,2,3,4,3,2]) - >>> image = table.plot.lag_plot(2) + >>> from safeds.data.tabular.containers import ExperimentalTable + >>> table = ExperimentalTable( + ... { + ... "a": [1, 2, 3, 4, 5], + ... "b": [2, 3, 4, 5, 6], + ... } + ... ) + >>> image = table.plot.scatter_plot("a", "b") """ # TODO: extract validation missing_columns = [] @@ -241,75 +346,3 @@ def scatter_plot(self, x_name: str, y_name: str) -> Image: # return Image.from_bytes(buffer.read()) # # def plot_histograms(self, *, number_of_bins: int = 10) -> Image: - # """ - # Plot a histogram for every column. 
- # - # Parameters - # ---------- - # number_of_bins: - # The number of bins to use in the histogram. Default is 10. - # - # Returns - # ------- - # plot: - # The plot as an image. - # - # Examples - # -------- - # >>> from safeds.data.tabular.containers import Table - # >>> table = Table({"a": [2, 3, 5, 1], "b": [54, 74, 90, 2014]}) - # >>> image = table.plot_histograms() - # """ - # import matplotlib.pyplot as plt - # import numpy as np - # import pandas as pd - # - # n_cols = min(3, self.number_of_columns) - # n_rows = 1 + (self.number_of_columns - 1) // n_cols - # - # if n_cols == 1 and n_rows == 1: - # fig, axs = plt.subplots(1, 1, tight_layout=True) - # one_col = True - # else: - # fig, axs = plt.subplots(n_rows, n_cols, tight_layout=True, figsize=(n_cols * 3, n_rows * 3)) - # one_col = False - # - # col_names = self.column_names - # for col_name, ax in zip(col_names, axs.flatten() if not one_col else [axs], strict=False): - # np_col = np.array(self.get_column(col_name)) - # bins = min(number_of_bins, len(pd.unique(np_col))) - # - # ax.set_title(col_name) - # ax.set_xlabel("") - # ax.set_ylabel("") - # - # if self.get_column(col_name).type.is_numeric(): - # np_col = np_col[~np.isnan(np_col)] - # - # if bins < len(pd.unique(np_col)): - # min_val = np.min(np_col) - # max_val = np.max(np_col) - # hist, bin_edges = np.histogram(self.get_column(col_name), bins, range=(min_val, max_val)) - # - # bars = np.array([]) - # for i in range(len(hist)): - # bars = np.append(bars, f"{round(bin_edges[i], 2)}-{round(bin_edges[i + 1], 2)}") - # - # ax.bar(bars, hist, edgecolor="black") - # ax.set_xticks(np.arange(len(hist)), bars, rotation=45, horizontalalignment="right") - # continue - # - # np_col = np_col.astype(str) - # unique_values = np.unique(np_col) - # hist = np.array([np.sum(np_col == value) for value in unique_values]) - # ax.bar(unique_values, hist, edgecolor="black") - # ax.set_xticks(np.arange(len(unique_values)), unique_values, rotation=45, 
horizontalalignment="right") - # - # for i in range(len(col_names), n_rows * n_cols): - # fig.delaxes(axs.flatten()[i]) # Remove empty subplots - # - # buffer = io.BytesIO() - # fig.savefig(buffer, format="png") - # plt.close() # Prevents the figure from being displayed directly - # buffer.seek(0) - # return Image.from_bytes(buffer.read()) From 565778dd604b513856740c1d3be4503d6b936559 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Thu, 9 May 2024 14:39:38 +0200 Subject: [PATCH 34/40] feat: remaining table plots --- .../plotting/_experimental_table_plotter.py | 223 +++++++++--------- 1 file changed, 106 insertions(+), 117 deletions(-) diff --git a/src/safeds/data/tabular/plotting/_experimental_table_plotter.py b/src/safeds/data/tabular/plotting/_experimental_table_plotter.py index 5644e4c52..16c8e5916 100644 --- a/src/safeds/data/tabular/plotting/_experimental_table_plotter.py +++ b/src/safeds/data/tabular/plotting/_experimental_table_plotter.py @@ -1,5 +1,6 @@ from __future__ import annotations +import warnings from typing import TYPE_CHECKING from safeds._utils import _figure_to_image @@ -15,10 +16,111 @@ def __init__(self, table: ExperimentalTable): self._table: ExperimentalTable = table def box_plots(self) -> Image: - raise NotImplementedError + """ + Plot a boxplot for every numerical column. + + Returns + ------- + plot: + The plot as an image. + + Raises + ------ + NonNumericColumnError + If the table contains only non-numerical columns. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import Table + >>> table = Table({"a":[1, 2], "b": [3, 42]}) + >>> image = table.plot_boxplots() + """ + # TODO: implement using matplotlib and polars + import matplotlib.pyplot as plt + import seaborn as sns + + numerical_table = self._table.remove_non_numeric_columns() + if numerical_table.number_of_columns == 0: + raise NonNumericColumnError("This table contains only non-numerical columns.") + col_wrap = min(numerical_table.number_of_columns, 3) + + data = numerical_table._lazy_frame.melt(value_vars=numerical_table.column_names).collect() + grid = sns.FacetGrid(data, col="variable", col_wrap=col_wrap, sharex=False, sharey=False) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message="Using the boxplot function without specifying `order` is likely to produce an incorrect plot.", + ) + grid.map(sns.boxplot, "variable", "value") + grid.set_xlabels("") + grid.set_ylabels("") + grid.set_titles("{col_name}") + for axes in grid.axes.flat: + axes.set_xticks([]) + plt.tight_layout() + fig = grid.fig + + return _figure_to_image(fig) def correlation_heatmap(self) -> Image: - raise NotImplementedError + """ + Plot a correlation heatmap for all numerical columns of this `Table`. + + Returns + ------- + plot: + The plot as an image. + + Examples + -------- + >>> from safeds.data.tabular.containers import Table + >>> table = Table.from_dict({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) + >>> image = table.plot_correlation_heatmap() + """ + # TODO: implement using matplotlib and polars + # https://stackoverflow.com/questions/33282368/plotting-a-2d-heatmap + import matplotlib.pyplot as plt + import seaborn as sns + + only_numerical = self._table.remove_non_numeric_columns() + + if self._table.number_of_rows == 0: + warnings.warn( + "An empty table has been used.
A correlation heatmap on an empty table will show nothing.", + stacklevel=2, + ) + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=( + "Attempting to set identical low and high (xlims|ylims) makes transformation singular;" + " automatically expanding." + ), + ) + fig = plt.figure() + sns.heatmap( + data=only_numerical._data_frame.corr(), + vmin=-1, + vmax=1, + xticklabels=only_numerical.column_names, + yticklabels=only_numerical.column_names, + cmap="vlag", + ) + plt.tight_layout() + else: + fig = plt.figure() + sns.heatmap( + data=only_numerical._data_frame.corr(), + vmin=-1, + vmax=1, + xticklabels=only_numerical.column_names, + yticklabels=only_numerical.column_names, + cmap="vlag", + ) + plt.tight_layout() + + return _figure_to_image(fig) def histograms(self, *, number_of_bins: int = 10) -> Image: """ @@ -40,6 +142,7 @@ def histograms(self, *, number_of_bins: int = 10) -> Image: >>> table = Table({"a": [2, 3, 5, 1], "b": [54, 74, 90, 2014]}) >>> image = table.plot_histograms() """ + # TODO: implement using polars import matplotlib.pyplot as plt import numpy as np import pandas as pd @@ -194,6 +297,7 @@ def scatter_plot(self, x_name: str, y_name: str) -> Image: ... ) >>> image = table.plot.scatter_plot("a", "b") """ + # TODO: merge with line_plot? # TODO: extract validation missing_columns = [] if not self._table.has_column(x_name): @@ -231,118 +335,3 @@ def scatter_plot(self, x_name: str, y_name: str) -> Image: return _figure_to_image(fig) # TODO: equivalent to Column.plot_compare_columns that takes a list of column names (index_plot)? - - # def plot_correlation_heatmap(self) -> Image: - # """ - # Plot a correlation heatmap for all numerical columns of this `Table`. - # - # Returns - # ------- - # plot: - # The plot as an image. 
- # - # Examples - # -------- - # >>> from safeds.data.tabular.containers import Table - # >>> table = Table.from_dict({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) - # >>> image = table.plot_correlation_heatmap() - # """ - # import matplotlib.pyplot as plt - # import seaborn as sns - # - # only_numerical = self.remove_columns_with_non_numerical_values() - # - # if self.number_of_rows == 0: - # warnings.warn( - # "An empty table has been used. A correlation heatmap on an empty table will show nothing.", - # stacklevel=2, - # ) - # - # with warnings.catch_warnings(): - # warnings.filterwarnings( - # "ignore", - # message=( - # "Attempting to set identical low and high (xlims|ylims) makes transformation singular;" - # " automatically expanding." - # ), - # ) - # fig = plt.figure() - # sns.heatmap( - # data=only_numerical._data.corr(), - # vmin=-1, - # vmax=1, - # xticklabels=only_numerical.column_names, - # yticklabels=only_numerical.column_names, - # cmap="vlag", - # ) - # plt.tight_layout() - # else: - # fig = plt.figure() - # sns.heatmap( - # data=only_numerical._data.corr(), - # vmin=-1, - # vmax=1, - # xticklabels=only_numerical.column_names, - # yticklabels=only_numerical.column_names, - # cmap="vlag", - # ) - # plt.tight_layout() - # - # buffer = io.BytesIO() - # fig.savefig(buffer, format="png") - # plt.close() # Prevents the figure from being displayed directly - # buffer.seek(0) - # return Image.from_bytes(buffer.read()) - # - # def plot_boxplots(self) -> Image: - # """ - # Plot a boxplot for every numerical column. - # - # Returns - # ------- - # plot: - # The plot as an image. - # - # Raises - # ------ - # NonNumericColumnError - # If the table contains only non-numerical columns. 
- # - # Examples - # -------- - # >>> from safeds.data.tabular.containers import Table - # >>> table = Table({"a":[1, 2], "b": [3, 42]}) - # >>> image = table.plot_boxplots() - # """ - # import matplotlib.pyplot as plt - # import pandas as pd - # import seaborn as sns - # - # numerical_table = self.remove_columns_with_non_numerical_values() - # if numerical_table.number_of_columns == 0: - # raise NonNumericColumnError("This table contains only non-numerical columns.") - # col_wrap = min(numerical_table.number_of_columns, 3) - # - # data = pd.melt(numerical_table._data, value_vars=numerical_table.column_names) - # grid = sns.FacetGrid(data, col="variable", col_wrap=col_wrap, sharex=False, sharey=False) - # with warnings.catch_warnings(): - # warnings.filterwarnings( - # "ignore", - # message="Using the boxplot function without specifying `order` is likely to produce an incorrect plot.", - # ) - # grid.map(sns.boxplot, "variable", "value") - # grid.set_xlabels("") - # grid.set_ylabels("") - # grid.set_titles("{col_name}") - # for axes in grid.axes.flat: - # axes.set_xticks([]) - # plt.tight_layout() - # fig = grid.fig - # - # buffer = io.BytesIO() - # fig.savefig(buffer, format="png") - # plt.close() # Prevents the figure from being displayed directly - # buffer.seek(0) - # return Image.from_bytes(buffer.read()) - # - # def plot_histograms(self, *, number_of_bins: int = 10) -> Image: From 76136f58048ec355a31388aad1aad378d4b9a339 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Thu, 9 May 2024 14:41:04 +0200 Subject: [PATCH 35/40] fix: import errors --- src/safeds/_config/__init__.py | 4 +++- tests/helpers/_devices.py | 4 +--- tests/safeds/_config/{test_device.py => test_torch.py} | 5 ++--- tests/safeds/data/image/containers/test_image.py | 5 ++--- 4 files changed, 8 insertions(+), 10 deletions(-) rename tests/safeds/_config/{test_device.py => test_torch.py} (82%) diff --git a/src/safeds/_config/__init__.py b/src/safeds/_config/__init__.py index 3ebf733ba..a4db1a36f 
100644 --- a/src/safeds/_config/__init__.py +++ b/src/safeds/_config/__init__.py @@ -5,17 +5,19 @@ import apipkg if TYPE_CHECKING: - from ._torch import _get_device, _init_default_device + from ._torch import _get_device, _init_default_device, _set_default_device apipkg.initpkg( __name__, { "_get_device": "._torch:_get_device", "_init_default_device": "._torch:_init_default_device", + "_set_default_device": "._torch:_set_default_device", }, ) __all__ = [ "_get_device", "_init_default_device", + "_set_default_device", ] diff --git a/tests/helpers/_devices.py b/tests/helpers/_devices.py index b371a85f7..54b043e0e 100644 --- a/tests/helpers/_devices.py +++ b/tests/helpers/_devices.py @@ -1,10 +1,8 @@ import pytest import torch +from safeds._config import _init_default_device, _set_default_device from torch.types import Device -from safeds._config import _init_default_device -from safeds._config._device import _set_default_device - _init_default_device() device_cpu = torch.device("cpu") diff --git a/tests/safeds/_config/test_device.py b/tests/safeds/_config/test_torch.py similarity index 82% rename from tests/safeds/_config/test_device.py rename to tests/safeds/_config/test_torch.py index d99997757..9a2d7e008 100644 --- a/tests/safeds/_config/test_device.py +++ b/tests/safeds/_config/test_torch.py @@ -1,10 +1,9 @@ import pytest import torch +from safeds._config import _get_device, _init_default_device, _set_default_device from torch.types import Device -from safeds._config import _get_device, _init_default_device -from safeds._config._device import _set_default_device -from tests.helpers import get_devices, get_devices_ids, configure_test_with_device, device_cuda, device_cpu +from tests.helpers import configure_test_with_device, device_cpu, device_cuda, get_devices, get_devices_ids from tests.helpers._devices import _skip_if_device_not_available diff --git a/tests/safeds/data/image/containers/test_image.py b/tests/safeds/data/image/containers/test_image.py index 
34d5bb37d..5cecde78a 100644 --- a/tests/safeds/data/image/containers/test_image.py +++ b/tests/safeds/data/image/containers/test_image.py @@ -7,7 +7,6 @@ import PIL.Image import pytest import torch - from safeds._config import _get_device from safeds.data.image.containers import Image from safeds.data.image.typing import ImageSize @@ -18,6 +17,8 @@ from tests.helpers import ( configure_test_with_device, + device_cpu, + device_cuda, get_devices, get_devices_ids, grayscale_jpg_id, @@ -41,8 +42,6 @@ white_square_jpg_path, white_square_png_id, white_square_png_path, - device_cpu, - device_cuda, ) From 3640b3b1db0efcc21d7a907a3dbde3c7027ceffc Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Thu, 9 May 2024 14:42:00 +0200 Subject: [PATCH 36/40] build: update deps --- poetry.lock | 16 ++++++++-------- pyproject.toml | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/poetry.lock b/poetry.lock index e20acf8c6..86d2ff99c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2268,17 +2268,17 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "polars" -version = "0.20.24" +version = "0.20.25" description = "Blazingly fast DataFrame library" optional = false python-versions = ">=3.8" files = [ - {file = "polars-0.20.24-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:a2c7282e0c81f038c9800ec4e1d97fe53dcacbba9632baf31a633e8bf12caab3"}, - {file = "polars-0.20.24-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:02587e12435e583693351c4757cf571b90165ceb53b031e891aadf2c816cc59d"}, - {file = "polars-0.20.24-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dfbb129941dd0cfa05f0fb5ef1cde341fed336b4dfcb81c3bef6f3f6b899cb17"}, - {file = "polars-0.20.24-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:9921df98cee040903d35aef2c7237182240451e1ad413116a82e1e166d8fe943"}, - {file = "polars-0.20.24-cp38-abi3-win_amd64.whl", hash = "sha256:dc0fb1169d3d0b286793421a6919c6a9f06235b9f93c1e00f01f199e038d3681"}, - {file = "polars-0.20.24.tar.gz", 
hash = "sha256:a0c11f3b5e756bab7ba164ed73104c96fa9c861efce157fe8991b3eafeb4b0b8"}, + {file = "polars-0.20.25-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:126e3b7d9394e4b23b4cc48919b7188203feeeb35d861ad808f281eaa06d76e2"}, + {file = "polars-0.20.25-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:3bda62b681726538714a1159638ab7c9eeca6b8633fd778d84810c3e13b9c7e3"}, + {file = "polars-0.20.25-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:62c8826e81c759f07bf5c0ae00f57a537644ae05fe68737185666b8ad8430664"}, + {file = "polars-0.20.25-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:0fb5e7a4a9831fba742f1c706e01656607089b6362a5e6f8d579b134a99795ce"}, + {file = "polars-0.20.25-cp38-abi3-win_amd64.whl", hash = "sha256:9eaeb9080c853e11b207d191025e0ba8fd59ea06a36c22d410a48f2f124e18cd"}, + {file = "polars-0.20.25.tar.gz", hash = "sha256:4308d63f956874bac9ae040bdd6d62b2992d0b1e1349301bc7a3b59458189108"}, ] [package.dependencies] @@ -3789,4 +3789,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.11,<3.13" -content-hash = "e8918188c27818e6491dbdfdbf92304c2d3bce38f3f1a6e01ebeebd09418d809" +content-hash = "84af08810fc4597a0076fb879faf198ec8935a359927460982b02da7bc4a71c9" diff --git a/pyproject.toml b/pyproject.toml index ad6eb4148..706aee510 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ matplotlib = "^3.6.3" openpyxl = "^3.1.2" pandas = "^2.0.0" pillow = ">=9.5,<11.0" -polars = {extras = ["numpy", "pyarrow"], version = "^0.20.24"} +polars = {extras = ["numpy", "pyarrow"], version = "^0.20.25"} scikit-learn = "^1.2.0" seaborn = "^0.13.0" statsmodels = "^0.14.1" From cc2e09449bcc02c0edfbb6902e3b3cfc67073eb2 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Thu, 9 May 2024 14:45:10 +0200 Subject: [PATCH 37/40] ci: undo setting ruff config --- .mega-linter.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.mega-linter.yml b/.mega-linter.yml index df7608fd1..87d022827 100644 --- a/.mega-linter.yml +++ 
b/.mega-linter.yml @@ -15,8 +15,6 @@ JSON_PRETTIER_FILE_EXTENSIONS: - .html # - .md -PYTHON_RUFF_CONFIG_FILE: pyproject.toml - # Commands PRE_COMMANDS: - command: npm i @lars-reimann/prettier-config From f5a62388dbc1b15342f1a6249d9e8ff028d17f52 Mon Sep 17 00:00:00 2001 From: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Date: Thu, 9 May 2024 12:46:50 +0000 Subject: [PATCH 38/40] style: apply automated linter fixes --- src/resources/from_json_file_2.json | 7 ++++++- src/resources/to_json_file.json | 2 +- src/resources/to_json_file_2.json | 7 ++++++- .../labeled/containers/_tabular_dataset.py | 3 ++- .../tabular/containers/_experimental_table.py | 19 ++++++------------ .../_experimental_one_hot_encoder.py | 8 +------- .../_experimental_range_scaler.py | 20 ++++++++----------- .../_experimental_simple_imputer.py | 4 +--- .../_experimental_standard_scaler.py | 12 +++-------- .../_experimental_table_transformer.py | 3 +-- src/safeds/ml/classical/_util_sklearn.py | 3 ++- 11 files changed, 37 insertions(+), 51 deletions(-) diff --git a/src/resources/from_json_file_2.json b/src/resources/from_json_file_2.json index 8be2e957d..5cd814134 100644 --- a/src/resources/from_json_file_2.json +++ b/src/resources/from_json_file_2.json @@ -1 +1,6 @@ -{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"b","datatype":"Int64","bit_settings":"","values":[4,5,6]}]} +{ + "columns": [ + { "name": "a", "datatype": "Int64", "bit_settings": "", "values": [1, 2, 3] }, + { "name": "b", "datatype": "Int64", "bit_settings": "", "values": [4, 5, 6] } + ] +} diff --git a/src/resources/to_json_file.json b/src/resources/to_json_file.json index b45718124..5965ef149 100644 --- a/src/resources/to_json_file.json +++ b/src/resources/to_json_file.json @@ -1 +1 @@ -{"a":{"0":1,"1":2,"2":3},"b":{"0":4,"1":5,"2":6}} \ No newline at end of file +{ "a": { "0": 1, "1": 2, "2": 3 }, "b": { "0": 4, "1": 5, "2": 6 } } diff --git 
a/src/resources/to_json_file_2.json b/src/resources/to_json_file_2.json index 29d04896f..5cd814134 100644 --- a/src/resources/to_json_file_2.json +++ b/src/resources/to_json_file_2.json @@ -1 +1,6 @@ -{"columns":[{"name":"a","datatype":"Int64","bit_settings":"","values":[1,2,3]},{"name":"b","datatype":"Int64","bit_settings":"","values":[4,5,6]}]} \ No newline at end of file +{ + "columns": [ + { "name": "a", "datatype": "Int64", "bit_settings": "", "values": [1, 2, 3] }, + { "name": "b", "datatype": "Int64", "bit_settings": "", "values": [4, 5, 6] } + ] +} diff --git a/src/safeds/data/labeled/containers/_tabular_dataset.py b/src/safeds/data/labeled/containers/_tabular_dataset.py index 50bb892f7..938e62469 100644 --- a/src/safeds/data/labeled/containers/_tabular_dataset.py +++ b/src/safeds/data/labeled/containers/_tabular_dataset.py @@ -196,7 +196,8 @@ def _into_dataloader_with_classes(self, batch_size: int, num_of_classes: int) -> dataset=_create_dataset( torch.Tensor(self.features._data.values).to(_get_device()), torch.nn.functional.one_hot( - torch.LongTensor(self.target._data).to(_get_device()), num_classes=num_of_classes, + torch.LongTensor(self.target._data).to(_get_device()), + num_classes=num_of_classes, ), ), batch_size=batch_size, diff --git a/src/safeds/data/tabular/containers/_experimental_table.py b/src/safeds/data/tabular/containers/_experimental_table.py index 04729cbf3..7fd7d700f 100644 --- a/src/safeds/data/tabular/containers/_experimental_table.py +++ b/src/safeds/data/tabular/containers/_experimental_table.py @@ -885,9 +885,7 @@ def replace_column( if len(new_columns) == 1: new_column = new_columns[0] return ExperimentalTable._from_polars_lazy_frame( - self._lazy_frame - .with_columns(new_column._series.alias(old_name)) - .rename({old_name: new_column.name}), + self._lazy_frame.with_columns(new_column._series.alias(old_name)).rename({old_name: new_column.name}), ) import polars as pl @@ -898,7 +896,7 @@ def replace_column( self._lazy_frame.select( 
*[pl.col(name) for name in self.column_names[:index]], *[column._series for column in new_columns], - *[pl.col(name) for name in self.column_names[index + 1:]], + *[pl.col(name) for name in self.column_names[index + 1 :]], ), ) @@ -1183,11 +1181,9 @@ def remove_rows_with_outliers( import polars.selectors as cs non_outlier_mask = pl.all_horizontal( - self._data_frame - .select(cs.numeric() & cs.by_name(column_names)) - .select( - pl.all().is_null() | (((pl.all() - pl.all().mean()) / pl.all().std()).abs() <= z_score_threshold), - ), + self._data_frame.select(cs.numeric() & cs.by_name(column_names)).select( + pl.all().is_null() | (((pl.all() - pl.all().mean()) / pl.all().std()).abs() <= z_score_threshold), + ), ) return ExperimentalTable._from_polars_lazy_frame( @@ -1653,10 +1649,7 @@ def summarize_statistics(self) -> ExperimentalTable: return ExperimentalTable() head = self.get_column(self.column_names[0]).summarize_statistics() - tail = [ - self.get_column(name).summarize_statistics().get_column(name)._series - for name in self.column_names[1:] - ] + tail = [self.get_column(name).summarize_statistics().get_column(name)._series for name in self.column_names[1:]] return ExperimentalTable._from_polars_data_frame( head._lazy_frame.collect().hstack(tail, in_place=True), diff --git a/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py b/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py index 047219c34..a11cf5798 100644 --- a/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py +++ b/src/safeds/data/tabular/transformation/_experimental_one_hot_encoder.py @@ -113,13 +113,7 @@ def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Exper if table.number_of_rows == 0: raise ValueError("The OneHotEncoder cannot be fitted because the table contains 0 rows") - if ( - table - .remove_columns_except(column_names) - .remove_non_numeric_columns() - .number_of_columns - > 0 - ): + if 
table.remove_columns_except(column_names).remove_non_numeric_columns().number_of_columns > 0: warnings.warn( "The columns" f" {table.remove_columns_except(column_names).remove_non_numeric_columns().column_names} contain" diff --git a/src/safeds/data/tabular/transformation/_experimental_range_scaler.py b/src/safeds/data/tabular/transformation/_experimental_range_scaler.py index 35478446e..7d708b721 100644 --- a/src/safeds/data/tabular/transformation/_experimental_range_scaler.py +++ b/src/safeds/data/tabular/transformation/_experimental_range_scaler.py @@ -84,9 +84,7 @@ def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Exper sorted( set(table.remove_columns_except(column_names).column_names) - set( - table.remove_columns_except(column_names) - .remove_non_numeric_columns() - .column_names, + table.remove_columns_except(column_names).remove_non_numeric_columns().column_names, ), ), ), @@ -152,9 +150,7 @@ def transform(self, table: ExperimentalTable) -> ExperimentalTable: sorted( set(table.remove_columns_except(self._column_names).column_names) - set( - table.remove_columns_except(self._column_names) - .remove_non_numeric_columns() - .column_names, + table.remove_columns_except(self._column_names).remove_non_numeric_columns().column_names, ), ), ), @@ -206,9 +202,7 @@ def inverse_transform(self, transformed_table: ExperimentalTable) -> Experimenta raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows") if ( - transformed_table.remove_columns_except(self._column_names) - .remove_non_numeric_columns() - .number_of_columns + transformed_table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns < transformed_table.remove_columns_except(self._column_names).number_of_columns ): raise NonNumericColumnError( @@ -226,9 +220,11 @@ def inverse_transform(self, transformed_table: ExperimentalTable) -> Experimenta import polars as pl - new_data = 
pl.DataFrame(self._wrapped_transformer.inverse_transform( - transformed_table.remove_columns_except(self._column_names)._data_frame, - )) + new_data = pl.DataFrame( + self._wrapped_transformer.inverse_transform( + transformed_table.remove_columns_except(self._column_names)._data_frame, + ) + ) name_mapping = dict(zip(new_data.columns, self._column_names, strict=True)) diff --git a/src/safeds/data/tabular/transformation/_experimental_simple_imputer.py b/src/safeds/data/tabular/transformation/_experimental_simple_imputer.py index 0663403fc..d75c87acf 100644 --- a/src/safeds/data/tabular/transformation/_experimental_simple_imputer.py +++ b/src/safeds/data/tabular/transformation/_experimental_simple_imputer.py @@ -166,9 +166,7 @@ def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Exper sorted( set(table.remove_columns_except(column_names).column_names) - set( - table.remove_columns_except(column_names) - .remove_non_numeric_columns() - .column_names, + table.remove_columns_except(column_names).remove_non_numeric_columns().column_names, ), ), ), diff --git a/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py b/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py index 0b93c6f4c..c3176eb81 100644 --- a/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py +++ b/src/safeds/data/tabular/transformation/_experimental_standard_scaler.py @@ -66,9 +66,7 @@ def fit(self, table: ExperimentalTable, column_names: list[str] | None) -> Exper sorted( set(table.remove_columns_except(column_names).column_names) - set( - table.remove_columns_except(column_names) - .remove_non_numeric_columns() - .column_names, + table.remove_columns_except(column_names).remove_non_numeric_columns().column_names, ), ), ), @@ -134,9 +132,7 @@ def transform(self, table: ExperimentalTable) -> ExperimentalTable: sorted( set(table.remove_columns_except(self._column_names).column_names) - set( - 
table.remove_columns_except(self._column_names) - .remove_non_numeric_columns() - .column_names, + table.remove_columns_except(self._column_names).remove_non_numeric_columns().column_names, ), ), ), @@ -188,9 +184,7 @@ def inverse_transform(self, transformed_table: ExperimentalTable) -> Experimenta raise ValueError("The StandardScaler cannot transform the table because it contains 0 rows") if ( - transformed_table.remove_columns_except(self._column_names) - .remove_non_numeric_columns() - .number_of_columns + transformed_table.remove_columns_except(self._column_names).remove_non_numeric_columns().number_of_columns < transformed_table.remove_columns_except(self._column_names).number_of_columns ): raise NonNumericColumnError( diff --git a/src/safeds/data/tabular/transformation/_experimental_table_transformer.py b/src/safeds/data/tabular/transformation/_experimental_table_transformer.py index a853b56d6..ed30ae728 100644 --- a/src/safeds/data/tabular/transformation/_experimental_table_transformer.py +++ b/src/safeds/data/tabular/transformation/_experimental_table_transformer.py @@ -135,8 +135,7 @@ def get_names_of_removed_columns(self) -> list[str]: """ def fit_and_transform( - self, - table: ExperimentalTable, column_names: list[str] | None = None + self, table: ExperimentalTable, column_names: list[str] | None = None ) -> tuple[Self, ExperimentalTable]: """ Learn a transformation for a set of columns in a table and apply the learned transformation to the same table. 
diff --git a/src/safeds/ml/classical/_util_sklearn.py b/src/safeds/ml/classical/_util_sklearn.py index 65be0f14d..bfdbbde0f 100644 --- a/src/safeds/ml/classical/_util_sklearn.py +++ b/src/safeds/ml/classical/_util_sklearn.py @@ -90,7 +90,8 @@ def fit(model: Any, tabular_dataset: TabularDataset | ExperimentalTabularDataset def predict( model: Any, dataset: Table | ExperimentalTable | ExperimentalTabularDataset, - feature_names: list[str] | None, target_name: str | None, + feature_names: list[str] | None, + target_name: str | None, ) -> TabularDataset: """ Predict a target vector using a dataset containing feature vectors. The model has to be trained first. From fb7ed012c631d2fcd17309a4e21a81fc0e1c0332 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Thu, 9 May 2024 15:09:45 +0200 Subject: [PATCH 39/40] fix: models not working with new containers --- .../_experimental_tabular_dataset.py | 6 ++++++ src/safeds/ml/classical/_util_sklearn.py | 18 ++++++++++++------ .../classical/regression/_linear_regression.py | 2 +- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py b/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py index 8fc523e5a..b6711045a 100644 --- a/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py +++ b/src/safeds/data/labeled/containers/_experimental_tabular_dataset.py @@ -92,9 +92,15 @@ def __eq__(self, other: object) -> bool: def __hash__(self) -> int: return _structural_hash(self.target, self.features, self._extras) + def __repr__(self) -> str: + return self._table.__repr__() + def __sizeof__(self) -> int: return sys.getsizeof(self._target) + sys.getsizeof(self._features) + sys.getsizeof(self._extras) + def __str__(self) -> str: + return self._table.__str__() + # ------------------------------------------------------------------------------------------------------------------ # Properties # 
------------------------------------------------------------------------------------------------------------------ diff --git a/src/safeds/ml/classical/_util_sklearn.py b/src/safeds/ml/classical/_util_sklearn.py index bfdbbde0f..df8ec362a 100644 --- a/src/safeds/ml/classical/_util_sklearn.py +++ b/src/safeds/ml/classical/_util_sklearn.py @@ -79,7 +79,7 @@ def fit(model: Any, tabular_dataset: TabularDataset | ExperimentalTabularDataset ) else: # pragma: no cover model.fit( - tabular_dataset.features.__dataframe__(), + tabular_dataset.features._data_frame, tabular_dataset.target._series, ) except ValueError as exception: @@ -194,7 +194,9 @@ def predict( if dataset.number_of_rows == 0: raise DatasetMissesDataError - non_numerical_column_names_2 = dataset.remove_non_numeric_columns().column_names + non_numerical_column_names_2 = set(dataset.column_names) - set( + dataset.remove_non_numeric_columns().column_names, + ) if len(non_numerical_column_names_2) != 0: raise NonNumericColumnError( str(non_numerical_column_names_2), @@ -203,7 +205,9 @@ def predict( " different values\nor is ordinal, you should use the LabelEncoder.\n", ) - null_containing_column_names_2 = dataset.remove_columns_with_missing_values().column_names + null_containing_column_names_2 = set(dataset.column_names) - set( + dataset.remove_columns_with_missing_values().column_names, + ) if len(null_containing_column_names_2) != 0: raise MissingValuesColumnError( str(null_containing_column_names_2), @@ -216,8 +220,10 @@ def predict( try: with warnings.catch_warnings(): warnings.filterwarnings("ignore", message="X does not have valid feature names") - predicted_target_vector = model.predict(dataset_df.__dataframe__()) - dataset_df.add_columns(ExperimentalColumn(target_name, predicted_target_vector)) + predicted_target_vector = model.predict(dataset_df._data_frame) + output = dataset.remove_columns(target_name).add_columns( + ExperimentalColumn(target_name, predicted_target_vector), + ) extra_names = [ 
column_name @@ -226,7 +232,7 @@ def predict( ] return TabularDataset( - dataset_df.to_dict(), + output.to_dict(), target_name=target_name, extra_names=extra_names, ) diff --git a/src/safeds/ml/classical/regression/_linear_regression.py b/src/safeds/ml/classical/regression/_linear_regression.py index 513f44ad7..4dbf22ab9 100644 --- a/src/safeds/ml/classical/regression/_linear_regression.py +++ b/src/safeds/ml/classical/regression/_linear_regression.py @@ -14,7 +14,7 @@ from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset from safeds.data.tabular.containers import ExperimentalTable, Table - +# TODO: rename to linear regressor class LinearRegressionRegressor(Regressor): """Linear regression.""" From bf09f2c7bb6d1fb56f54c72c2fe2355d026d8c4d Mon Sep 17 00:00:00 2001 From: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> Date: Thu, 9 May 2024 13:11:42 +0000 Subject: [PATCH 40/40] style: apply automated linter fixes --- src/safeds/ml/classical/regression/_linear_regression.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/safeds/ml/classical/regression/_linear_regression.py b/src/safeds/ml/classical/regression/_linear_regression.py index 4dbf22ab9..8c9d5db4d 100644 --- a/src/safeds/ml/classical/regression/_linear_regression.py +++ b/src/safeds/ml/classical/regression/_linear_regression.py @@ -14,6 +14,7 @@ from safeds.data.labeled.containers import ExperimentalTabularDataset, TabularDataset from safeds.data.tabular.containers import ExperimentalTable, Table + # TODO: rename to linear regressor class LinearRegressionRegressor(Regressor): """Linear regression."""