Safe-DS · alex-senger · Jul 12, 2023 · May 26, 2023 · Jun 2, 2023 · Jun 2, 2023
@@ -76,7 +76,7 @@ def _from_pandas_series(data: pd.Series, type_: ColumnType | None = None) -> Col
         result._name = data.name
         result._data = data
         # noinspection PyProtectedMember
-        result._type = type_ if type_ is not None else ColumnType._from_numpy_data_type(data.dtype)
+        result._type = type_ if type_ is not None else ColumnType._data_type(data)
 
         return result
 
@@ -106,7 +106,7 @@ def __init__(self, name: str, data: Sequence[T] | None = None) -> None:
         self._name: str = name
         self._data: pd.Series = data.rename(name) if isinstance(data, pd.Series) else pd.Series(data, name=name)
         # noinspection PyProtectedMember
-        self._type: ColumnType = ColumnType._from_numpy_data_type(self._data.dtype)
+        self._type: ColumnType = ColumnType._data_type(data)
 
     def __contains__(self, item: Any) -> bool:
         return item in self._data

@@ -1,7 +1,8 @@
 from __future__ import annotations
 
 import copy
-from collections.abc import Mapping
+import functools
+from collections.abc import Callable, Mapping
 from typing import TYPE_CHECKING, Any
 
 import pandas as pd
@@ -441,6 +442,39 @@ def get_column_type(self, column_name: str) -> ColumnType:
         """
         return self._schema.get_column_type(column_name)
 
+    # ------------------------------------------------------------------------------------------------------------------
+    # Transformations
+    # ------------------------------------------------------------------------------------------------------------------
+
+    def sort_columns(
+        self,
+        comparator: Callable[[tuple, tuple], int] = lambda col1, col2: (col1[0] > col2[0]) - (col1[0] < col2[0]),
+    ) -> Row:
+        """
+        Return a new `Row` whose columns are ordered according to the given comparator.
+
+        The original row is not modified. The comparator receives two `(column_name, value)` tuples, `col1` and `col2`,
+        and returns an integer:
+
+        * If `col1` should be ordered before `col2`, the function should return a negative number.
+        * If `col1` should be ordered after `col2`, the function should return a positive number.
+        * If the original order of `col1` and `col2` should be kept, the function should return 0.
+
+        If no comparator is given, the columns are sorted in ascending order of their names, compared with `<`/`>`.
+
+        Parameters
+        ----------
+        comparator : Callable[[tuple, tuple], int]
+            The function used to compare two `(column_name, value)` tuples; defaults to comparing the column names.
+
+        Returns
+        -------
+        new_row : Row
+            A new row, built with `Row.from_dict`, that contains the same entries in sorted order.
+        """
+        sorted_row_dict = dict(sorted(self.to_dict().items(), key=functools.cmp_to_key(comparator)))
+        return Row.from_dict(sorted_row_dict)
+
     # ------------------------------------------------------------------------------------------------------------------
     # Conversion
     # ------------------------------------------------------------------------------------------------------------------

@@ -24,7 +24,6 @@
     DuplicateColumnNameError,
     IndexOutOfBoundsError,
     NonNumericColumnError,
-    SchemaMismatchError,
     UnknownColumnNameError,
     WrongFileExtensionError,
 )
@@ -302,8 +301,8 @@ def from_rows(rows: list[Row]) -> Table:
 
         Raises
         ------
-        SchemaMismatchError
-            If any of the row schemas does not match with the others.
+        UnknownColumnNameError
+            If the column names of any row do not match those of the first row.
 
         Examples
         --------
@@ -318,17 +317,22 @@ def from_rows(rows: list[Row]) -> Table:
         if len(rows) == 0:
             return Table._from_pandas_dataframe(pd.DataFrame())
 
-        schema_compare: Schema = rows[0]._schema
+        column_names_compare: list = list(rows[0].column_names)
+        unknown_column_names = set()
         row_array: list[pd.DataFrame] = []
 
         for row in rows:
-            if schema_compare != row._schema:
-                raise SchemaMismatchError
+            unknown_column_names.update(set(column_names_compare) - set(row.column_names))
             row_array.append(row._data)
+        if len(unknown_column_names) > 0:
+            raise UnknownColumnNameError(list(unknown_column_names))
 
         dataframe: DataFrame = pd.concat(row_array, ignore_index=True)
-        dataframe.columns = schema_compare.column_names
-        return Table._from_pandas_dataframe(dataframe)
+        dataframe.columns = column_names_compare
+
+        schema = Schema.merge_multiple_schemas([row.schema for row in rows])
+
+        return Table._from_pandas_dataframe(dataframe, schema)
 
     @staticmethod
     def _from_pandas_dataframe(data: pd.DataFrame, schema: Schema | None = None) -> Table:
@@ -906,6 +910,9 @@ def add_row(self, row: Row) -> Table:
 
         If the table happens to be empty beforehand, respective columns will be added automatically.
 
+        The order of columns of the new row will be adjusted to the order of columns in the table.
+        The new table will contain the merged schema.
+
         This table is not modified.
 
         Parameters
@@ -920,8 +927,8 @@ def add_row(self, row: Row) -> Table:
 
         Raises
         ------
-        SchemaMismatchError
-            If the schema of the row does not match the table schema.
+        UnknownColumnNameError
+            If the row has different column names than the table.
 
         Examples
         --------
@@ -935,20 +942,18 @@ def add_row(self, row: Row) -> Table:
         """
         int_columns = []
         result = self._copy()
+        if self.number_of_columns == 0:
+            return Table.from_rows([row])
+        if len(set(self.column_names) - set(row.column_names)) > 0:
+            raise UnknownColumnNameError(list(set(self.column_names) - set(row.column_names)))
+
         if result.number_of_rows == 0:
-            int_columns = list(filter(lambda name: isinstance(row[name], int | np.int64), row.column_names))
-            if result.number_of_columns == 0:
-                for column in row.column_names:
-                    result._data[column] = Column(column, [])
-                result._schema = Schema._from_pandas_dataframe(result._data)
-            elif result.column_names != row.column_names:
-                raise SchemaMismatchError
-        elif result._schema != row.schema:
-            raise SchemaMismatchError
+            int_columns = list(filter(lambda name: isinstance(row[name], int | np.int64 | np.int32), row.column_names))
 
         new_df = pd.concat([result._data, row._data]).infer_objects()
         new_df.columns = result.column_names
-        result = Table._from_pandas_dataframe(new_df)
+        schema = Schema.merge_multiple_schemas([result.schema, row.schema])
+        result = Table._from_pandas_dataframe(new_df, schema)
 
         for column in int_columns:
             result = result.replace_column(column, [result.get_column(column).transform(lambda it: int(it))])
@@ -959,6 +964,9 @@ def add_rows(self, rows: list[Row] | Table) -> Table:
         """
         Add multiple rows to a table.
 
+        The order of columns of the new rows will be adjusted to the order of columns in the table.
+        The new table will contain the merged schema.
+
         This table is not modified.
 
         Parameters
@@ -973,8 +981,8 @@ def add_rows(self, rows: list[Row] | Table) -> Table:
 
         Raises
         ------
-        SchemaMismatchError
-            If the schema of one of the rows does not match the table schema.
+        UnknownColumnNameError
+            If at least one of the rows has different column names than the table.
 
         Examples
         --------
@@ -990,28 +998,21 @@ def add_rows(self, rows: list[Row] | Table) -> Table:
         """
         if isinstance(rows, Table):
             rows = rows.to_rows()
-        int_columns = []
         result = self._copy()
+
+        if len(rows) == 0:
+            return self._copy()
+
+        different_column_names = set()
         for row in rows:
-            if result.number_of_rows == 0:
-                int_columns = list(filter(lambda name: isinstance(row[name], int | np.int64), row.column_names))
-                if result.number_of_columns == 0:
-                    for column in row.column_names:
-                        result._data[column] = Column(column, [])
-                    result._schema = Schema._from_pandas_dataframe(result._data)
-                elif result.column_names != row.column_names:
-                    raise SchemaMismatchError
-            elif result._schema != row.schema:
-                raise SchemaMismatchError
-
-        row_frames = (row._data for row in rows)
-
-        new_df = pd.concat([result._data, *row_frames]).infer_objects()
-        new_df.columns = result.column_names
-        result = Table._from_pandas_dataframe(new_df)
+            different_column_names.update(set(rows[0].column_names) - set(row.column_names))
+        if len(different_column_names) > 0:
+            raise UnknownColumnNameError(list(different_column_names))
 
-        for column in int_columns:
-            result = result.replace_column(column, [result.get_column(column).transform(lambda it: int(it))])
+        result = self._copy()
+
+        for row in rows:
+            result = result.add_row(row)
 
         return result
 
@@ -1269,7 +1270,7 @@ def remove_rows_with_missing_values(self) -> Table:
         """
         result = self._data.copy(deep=True)
         result = result.dropna(axis="index")
-        return Table._from_pandas_dataframe(result, self._schema)
+        return Table._from_pandas_dataframe(result)
 
     def remove_rows_with_outliers(self) -> Table:
         """

@@ -152,6 +152,9 @@ def inverse_transform(self, transformed_table: Table) -> Table:
         if len(missing_columns) > 0:
             raise UnknownColumnNameError(missing_columns)
 
+        if transformed_table.number_of_rows == 0:
+            raise ValueError("The LabelEncoder cannot inverse transform the table because it contains 0 rows")
+
         if transformed_table.keep_only_columns(
             self._column_names,
         ).remove_columns_with_non_numerical_values().number_of_columns < len(self._column_names):
@@ -168,9 +171,6 @@ def inverse_transform(self, transformed_table: Table) -> Table:
                 ),
             )
 
-        if transformed_table.number_of_rows == 0:
-            raise ValueError("The LabelEncoder cannot inverse transform the table because it contains 0 rows")
-
         data = transformed_table._data.copy()
         data.columns = transformed_table.column_names
         data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names])

@@ -277,6 +277,9 @@ def inverse_transform(self, transformed_table: Table) -> Table:
         if len(missing_columns) > 0:
             raise UnknownColumnNameError(missing_columns)
 
+        if transformed_table.number_of_rows == 0:
+            raise ValueError("The OneHotEncoder cannot inverse transform the table because it contains 0 rows")
+
         if transformed_table._as_table().keep_only_columns(
             _transformed_column_names,
         ).remove_columns_with_non_numerical_values().number_of_columns < len(_transformed_column_names):
@@ -293,9 +296,6 @@ def inverse_transform(self, transformed_table: Table) -> Table:
                 ),
             )
 
-        if transformed_table.number_of_rows == 0:
-            raise ValueError("The OneHotEncoder cannot inverse transform the table because it contains 0 rows")
-
         original_columns = {}
         for original_column_name in self._column_names:
             original_columns[original_column_name] = [None for _ in range(transformed_table.number_of_rows)]
@@ -306,6 +306,12 @@ def inverse_transform(self, transformed_table: Table) -> Table:
                 if transformed_table.get_column(constructed_column)[i] == 1.0:
                     original_columns[original_column_name][i] = value
 
+        for original_column_name in self._value_to_column_nans:
+            constructed_column = self._value_to_column_nans[original_column_name]
+            for i in range(transformed_table.number_of_rows):
+                if transformed_table.get_column(constructed_column)[i] == 1.0:
+                    original_columns[original_column_name][i] = np.nan
+
         table = transformed_table
 
         for column_name, encoded_column in original_columns.items():

@@ -66,6 +66,9 @@ def fit(self, table: Table, column_names: list[str] | None) -> RangeScaler:
             if len(missing_columns) > 0:
                 raise UnknownColumnNameError(missing_columns)
 
+        if table.number_of_rows == 0:
+            raise ValueError("The RangeScaler cannot be fitted because the table contains 0 rows")
+
         if (
             table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().number_of_columns
             < table.keep_only_columns(column_names).number_of_columns
@@ -83,9 +86,6 @@ def fit(self, table: Table, column_names: list[str] | None) -> RangeScaler:
                 ),
             )
 
-        if table.number_of_rows == 0:
-            raise ValueError("The RangeScaler cannot be fitted because the table contains 0 rows")
-
         wrapped_transformer = sk_MinMaxScaler((self._minimum, self._maximum))
         wrapped_transformer.fit(table._data[column_names])
 
@@ -131,6 +131,9 @@ def transform(self, table: Table) -> Table:
         if len(missing_columns) > 0:
             raise UnknownColumnNameError(missing_columns)
 
+        if table.number_of_rows == 0:
+            raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows")
+
         if (
             table.keep_only_columns(self._column_names).remove_columns_with_non_numerical_values().number_of_columns
             < table.keep_only_columns(self._column_names).number_of_columns
@@ -148,9 +151,6 @@ def transform(self, table: Table) -> Table:
                 ),
             )
 
-        if table.number_of_rows == 0:
-            raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows")
-
         data = table._data.copy()
         data.columns = table.column_names
         data[self._column_names] = self._wrapped_transformer.transform(data[self._column_names])
@@ -191,6 +191,9 @@ def inverse_transform(self, transformed_table: Table) -> Table:
         if len(missing_columns) > 0:
             raise UnknownColumnNameError(missing_columns)
 
+        if transformed_table.number_of_rows == 0:
+            raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows")
+
         if (
             transformed_table.keep_only_columns(self._column_names)
             .remove_columns_with_non_numerical_values()
@@ -210,9 +213,6 @@ def inverse_transform(self, transformed_table: Table) -> Table:
                 ),
             )
 
-        if transformed_table.number_of_rows == 0:
-            raise ValueError("The RangeScaler cannot transform the table because it contains 0 rows")
-
         data = transformed_table._data.copy()
         data.columns = transformed_table.column_names
         data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names])