feat: specify partial order in label encoder #763

Merged 21 commits on May 14, 2024
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -78,7 +78,7 @@ omit = [
]

[tool.pytest.ini_options]
addopts = "--snapshot-warn-unused"
addopts = "--snapshot-warn-unused --tb=short"
filterwarnings = [
"ignore:Deprecated call to `pkg_resources.declare_namespace",
"ignore:Jupyter is migrating its paths to use standard platformdirs"
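
(Side note on the config change above: `--tb=short` is a standard pytest option that switches failures to the compact traceback format, and because it is set via `addopts` it applies to every test run, not just a single invocation.)
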
61 changes: 61 additions & 0 deletions src/safeds/_validation/_check_columns_are_numeric.py
@@ -0,0 +1,61 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from safeds.exceptions import ColumnTypeError

if TYPE_CHECKING:
from collections.abc import Container

from safeds.data.tabular.containers import Table
from safeds.data.tabular.typing import Schema


def _check_columns_are_numeric(
table_or_schema: Table | Schema,
column_names: str | list[str],
*,
operation: str = "do a numeric operation",
) -> None:
"""
Check if the columns with the specified names are numeric and raise an error if they are not.

Missing columns are ignored. Use `_check_columns_exist` to check for missing columns.

Parameters
----------
table_or_schema:
The table or schema to check.
column_names:
The column names to check.
operation:
The operation that is performed on the columns. This is used in the error message.

Raises
------
ColumnTypeError
If a column exists but is not numeric.
"""
from safeds.data.tabular.containers import Table # circular import

if isinstance(table_or_schema, Table):
table_or_schema = table_or_schema.schema
if isinstance(column_names, str):
column_names = [column_names]

if len(column_names) > 1:
# Create a set for faster containment checks
known_names: Container = set(table_or_schema.column_names)
else:
known_names = table_or_schema.column_names

non_numeric_names = [
name for name in column_names if name in known_names and not table_or_schema.get_column_type(name).is_numeric
]
if non_numeric_names:
message = _build_error_message(non_numeric_names, operation)
raise ColumnTypeError(message)


def _build_error_message(non_numeric_names: list[str], operation: str) -> str:
return f"Tried to {operation} on non-numeric columns {non_numeric_names}."
6 changes: 4 additions & 2 deletions src/safeds/data/image/containers/_image.py
@@ -1,7 +1,6 @@
from __future__ import annotations

import io
import os.path
import sys
import warnings
from pathlib import Path
@@ -79,9 +78,12 @@ def from_file(path: str | Path) -> Image:
"""
from torchvision.io import read_image

if isinstance(path, str):
path = Path(path)

_init_default_device()

if not os.path.isfile(path):
if not path.is_file():
raise FileNotFoundError(f"No such file or directory: '{path}'")

return Image(image_tensor=read_image(str(path)).to(_get_device()))
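
With this change `from_file` normalizes `str` inputs to `Path` and uses `Path.is_file` for the existence check. A small usage sketch (the file path is made up; assuming `Image` is re-exported from `safeds.data.image.containers`):

```python
# Sketch only: demonstrates that str and Path inputs are now handled identically.
from pathlib import Path
from safeds.data.image.containers import Image

img_a = Image.from_file("docs/example.png")        # hypothetical path, str input
img_b = Image.from_file(Path("docs/example.png"))  # same path as a pathlib.Path

# A missing file raises FileNotFoundError before torchvision tries to read it:
# Image.from_file("does/not/exist.png")
```
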
2 changes: 0 additions & 2 deletions src/safeds/data/image/containers/_image_list.py
@@ -283,7 +283,6 @@ def from_files(
return image_list

class _FromFileThreadPackage:

def __init__(
self,
im_files: list[str],
@@ -323,7 +322,6 @@ def __len__(self) -> int:
return len(self._im_files)

class _FromImageThread(Thread):

def __init__(self, packages: list[ImageList._FromFileThreadPackage]) -> None:
super().__init__()
self._packages = packages
4 changes: 2 additions & 2 deletions src/safeds/data/image/containers/_multi_size_image_list.py
@@ -66,7 +66,7 @@ def _create_from_single_sized_image_lists(single_size_image_lists: list[_SingleS
single_size_image_list._indices_to_tensor_positions.keys(),
[image_size] * len(single_size_image_list),
strict=False,
)
),
)
if max_channel is None:
max_channel = single_size_image_list.channel
@@ -80,7 +80,7 @@ def _create_from_single_sized_image_lists(single_size_image_lists: list[_SingleS
for size in image_list._image_list_dict:
if max_channel is not None and image_list._image_list_dict[size].channel != max_channel:
image_list._image_list_dict[size] = image_list._image_list_dict[size].change_channel(
int(max_channel)
int(max_channel),
)
return image_list

9 changes: 7 additions & 2 deletions src/safeds/data/image/containers/_single_size_image_list.py
@@ -6,7 +6,7 @@
from pathlib import Path
from typing import TYPE_CHECKING

from safeds._config import _init_default_device, _get_device
from safeds._config import _get_device, _init_default_device
from safeds._utils import _structural_hash
from safeds.data.image._utils._image_transformation_error_and_warning_checks import (
_check_add_noise_errors,
@@ -82,7 +82,12 @@ def _create_image_list_from_files(
image_list = _SingleSizeImageList()

images_tensor = torch.empty(
number_of_images, max_channel, height, width, dtype=torch.uint8, device=_get_device()
number_of_images,
max_channel,
height,
width,
dtype=torch.uint8,
device=_get_device(),
)

thread_packages: list[ImageList._FromFileThreadPackage] = []
12 changes: 6 additions & 6 deletions src/safeds/data/labeled/containers/_image_dataset.py
@@ -89,7 +89,7 @@ def __init__(self, input_data: ImageList, output_data: T, batch_size: int = 1, s
_output_size: int | ImageSize = output_data.number_of_columns
elif isinstance(output_data, Column):
_column_as_tensor = _ColumnAsTensor(output_data)
_output_size = len(_column_as_tensor._one_hot_encoder.get_names_of_added_columns())
_output_size = len(_column_as_tensor._one_hot_encoder._get_names_of_added_columns())
_output = _column_as_tensor
elif isinstance(output_data, _SingleSizeImageList):
_output = output_data._clone()._as_single_size_image_list()
@@ -289,7 +289,6 @@ def shuffle(self) -> ImageDataset[T]:


class _TableAsTensor:

def __init__(self, table: Table) -> None:
import torch

@@ -345,7 +344,6 @@ def _to_table(self) -> Table:


class _ColumnAsTensor:

def __init__(self, column: Column) -> None:
import torch

@@ -359,6 +357,8 @@ def __init__(self, column: Column) -> None:
message=rf"The columns \['{self._column_name}'\] contain numerical data. The OneHotEncoder is designed to encode non-numerical values into numerical values",
category=UserWarning,
)
# TODO: should not one-hot-encode the target. label encoding without order is sufficient. should also not
# be done automatically?
self._one_hot_encoder = OneHotEncoder().fit(column_as_table, [self._column_name])
self._tensor = torch.Tensor(self._one_hot_encoder.transform(column_as_table)._data_frame.to_torch()).to(
_get_device(),
@@ -394,9 +394,9 @@ def _from_tensor(tensor: Tensor, column_name: str, one_hot_encoder: OneHotEncode
raise ValueError(f"Tensor has an invalid amount of dimensions. Needed 2 dimensions but got {tensor.dim()}.")
if not one_hot_encoder.is_fitted:
raise TransformerNotFittedError
if tensor.size(dim=1) != len(one_hot_encoder.get_names_of_added_columns()):
if tensor.size(dim=1) != len(one_hot_encoder._get_names_of_added_columns()):
raise ValueError(
f"Tensor and one_hot_encoder have different amounts of classes ({tensor.size(dim=1)}!={len(one_hot_encoder.get_names_of_added_columns())}).",
f"Tensor and one_hot_encoder have different amounts of classes ({tensor.size(dim=1)}!={len(one_hot_encoder._get_names_of_added_columns())}).",
)
table_as_tensor = _ColumnAsTensor.__new__(_ColumnAsTensor)
table_as_tensor._tensor = tensor
@@ -406,6 +406,6 @@ def _from_tensor(tensor: Tensor, column_name: str, one_hot_encoder: OneHotEncode

def _to_column(self) -> Column:
table = Table(
dict(zip(self._one_hot_encoder.get_names_of_added_columns(), self._tensor.T.tolist(), strict=False)),
dict(zip(self._one_hot_encoder._get_names_of_added_columns(), self._tensor.T.tolist(), strict=False)),
)
return self._one_hot_encoder.inverse_transform(table).get_column(self._column_name)
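
The TODO above argues that one-hot-encoding the target is unnecessary and that label encoding without an order would be enough. The `LabelEncoder` changes that give this PR its title are not part of the excerpt shown here; the following is only a hedged sketch of the idea, with the `partial_order` parameter name assumed rather than taken from the diff:

```python
# Hedged sketch: the actual LabelEncoder signature added by this PR is not
# visible above, so the `partial_order` parameter name is an assumption.
from safeds.data.tabular.containers import Table
from safeds.data.tabular.transformation import LabelEncoder

table = Table({"grade": ["low", "high", "medium", "low"]})

# With an explicit (partial) order, "low" < "medium" < "high" is preserved in
# the encoded values instead of depending on the order of appearance.
encoder = LabelEncoder(partial_order=["low", "medium", "high"]).fit(table, ["grade"])
encoded = encoder.transform(table)
```
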
12 changes: 10 additions & 2 deletions src/safeds/data/tabular/containers/_table.py
@@ -323,7 +323,7 @@ def __init__(self, data: Mapping[str, Sequence[Any]] | None = None) -> None:

# Implementation
self._lazy_frame: pl.LazyFrame = pl.LazyFrame(data)
self.__data_frame_cache: pl.DataFrame | None = None
self.__data_frame_cache: pl.DataFrame | None = None # Scramble the name to prevent access from outside

def __eq__(self, other: object) -> bool:
if not isinstance(other, Table):
@@ -1033,6 +1033,9 @@ def remove_duplicate_rows(self) -> Table:
| 2 | 5 |
+-----+-----+
"""
if self.number_of_columns == 0:
return self # Workaround for https://github.com/pola-rs/polars/issues/16207

return Table._from_polars_lazy_frame(
self._lazy_frame.unique(maintain_order=True),
)
@@ -1221,6 +1224,8 @@ def remove_rows_with_outliers(
| null | 8 |
+------+-----+
"""
if self.number_of_rows == 0:
return self # polars raises a ComputeError for tables without rows
if column_names is None:
column_names = self.column_names

@@ -1440,7 +1445,10 @@ def split_rows(
The first table contains a percentage of the rows specified by `percentage_in_first`, and the second table
contains the remaining rows.

**Note:** The original table is not modified.
**Notes:**

- The original table is not modified.
- By default, the rows are shuffled before splitting. You can disable this by setting `shuffle` to False.

Parameters
----------
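
A minimal usage sketch of `split_rows` with the newly documented shuffle behaviour (the rest of the signature and docstring is cut off above, so treating `shuffle` as a regular keyword parameter is an assumption):

```python
# Sketch of the documented behaviour; signature details beyond `shuffle` are
# assumed, since the diff above cuts off before the parameter list.
from safeds.data.tabular.containers import Table

table = Table({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})

first, second = table.split_rows(0.75)              # rows shuffled by default
head, tail = table.split_rows(0.75, shuffle=False)  # keep the original row order
```
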
87 changes: 26 additions & 61 deletions src/safeds/data/tabular/transformation/_discretizer.py
@@ -2,6 +2,7 @@

from typing import TYPE_CHECKING

from safeds._utils import _structural_hash
from safeds._validation import _check_bounds, _check_columns_exist, _ClosedBound
from safeds.data.tabular.containers import Table
from safeds.exceptions import (
@@ -30,13 +31,36 @@ class Discretizer(TableTransformer):
If the given number_of_bins is less than 2.
"""

def __init__(self, number_of_bins: int = 5):
# ------------------------------------------------------------------------------------------------------------------
# Dunder methods
# ------------------------------------------------------------------------------------------------------------------

def __init__(self, number_of_bins: int = 5) -> None:
TableTransformer.__init__(self)

_check_bounds("number_of_bins", number_of_bins, lower_bound=_ClosedBound(2))

self._column_names: list[str] | None = None
self._wrapped_transformer: sk_KBinsDiscretizer | None = None
self._number_of_bins = number_of_bins

def __hash__(self) -> int:
return _structural_hash(
TableTransformer.__hash__(self),
self._number_of_bins,
)

# ------------------------------------------------------------------------------------------------------------------
# Properties
# ------------------------------------------------------------------------------------------------------------------

@property
def number_of_bins(self) -> int:
return self._number_of_bins

# ------------------------------------------------------------------------------------------------------------------
# Learning and transformation
# ------------------------------------------------------------------------------------------------------------------

def fit(self, table: Table, column_names: list[str] | None) -> Discretizer:
"""
Learn a transformation for a set of columns in a table.
@@ -137,62 +161,3 @@ def transform(self, table: Table) -> Table:
return Table._from_polars_lazy_frame(
table._lazy_frame.update(new_data.lazy()),
)

@property
def is_fitted(self) -> bool:
"""Whether the transformer is fitted."""
return self._wrapped_transformer is not None

def get_names_of_added_columns(self) -> list[str]:
"""
Get the names of all new columns that have been added by the Discretizer.

Returns
-------
added_columns:
A list of names of the added columns, ordered as they will appear in the table.

Raises
------
TransformerNotFittedError
If the transformer has not been fitted yet.
"""
if not self.is_fitted:
raise TransformerNotFittedError
return []

def get_names_of_changed_columns(self) -> list[str]:
"""
Get the names of all columns that may have been changed by the Discretizer.

Returns
-------
changed_columns:
The list of (potentially) changed column names, as passed to fit.

Raises
------
TransformerNotFittedError
If the transformer has not been fitted yet.
"""
if self._column_names is None:
raise TransformerNotFittedError
return self._column_names

def get_names_of_removed_columns(self) -> list[str]:
"""
Get the names of all columns that have been removed by the Discretizer.

Returns
-------
removed_columns:
A list of names of the removed columns, ordered as they appear in the table the Discretizer was fitted on.

Raises
------
TransformerNotFittedError
If the transformer has not been fitted yet.
"""
if not self.is_fitted:
raise TransformerNotFittedError
return []
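
The `is_fitted` property and `get_names_of_*` methods removed above presumably move to, or are inherited from, the `TableTransformer` base class, whose diff is not part of this excerpt; the rename to `_get_names_of_added_columns` in `_image_dataset.py` points the same way. A short usage sketch of the `Discretizer` surface that remains visible here:

```python
# Usage sketch based on the Discretizer code shown above; the data is made up.
from safeds.data.tabular.containers import Table
from safeds.data.tabular.transformation import Discretizer

table = Table({"temperature": [1.0, 3.5, 7.2, 12.8, 20.1, 23.4]})

discretizer = Discretizer(number_of_bins=3)
print(discretizer.number_of_bins)  # 3, via the new read-only property

fitted = discretizer.fit(table, ["temperature"])
binned = fitted.transform(table)   # "temperature" is replaced by binned values
```
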