Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add deterministic hash methods to all types #573

Merged
merged 5 commits into from
Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 119 additions & 3 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ scikit-learn = "^1.2.0"
seaborn = "^0.13.0"
torch = {version = "^2.2.0", source = "torch_cuda121"}
torchvision = {version = "^0.17.0", source = "torch_cuda121"}
xxhash = "^3.4.1"

[tool.poetry.group.dev.dependencies]
pytest = ">=7.2.1,<9.0.0"
Expand Down
12 changes: 12 additions & 0 deletions src/safeds/data/image/containers/_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import torch
import torch.nn.functional as func
import xxhash
from PIL.Image import open as pil_image_open
from torch import Tensor

Expand Down Expand Up @@ -109,6 +110,17 @@ def __eq__(self, other: object) -> bool:
and torch.all(torch.eq(self._image_tensor, other._set_device(self.device)._image_tensor)).item()
)

def __hash__(self) -> int:
    """
    Return a deterministic hash value for this image.

    The hash is built from the width, height, and channel count only (each serialized as an 8-byte
    integer); pixel values do not contribute to it.

    Returns
    -------
    hash : int
        The hash value.
    """
    dimensions = (self.width, self.height, self.channel)
    payload = b"".join(dimension.to_bytes(8) for dimension in dimensions)
    return xxhash.xxh3_64(payload).intdigest()

def __sizeof__(self) -> int:
"""
Return the complete size of this object.
Expand Down
12 changes: 12 additions & 0 deletions src/safeds/data/tabular/containers/_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import numpy as np
import pandas as pd
import seaborn as sns
import xxhash

from safeds.data.image.containers import Image
from safeds.data.tabular.typing import ColumnType
Expand Down Expand Up @@ -191,6 +192,17 @@ def __getitem__(self, index: int | slice) -> T | Column[T]:
data = self._data[index].reset_index(drop=True).rename(self.name)
return Column._from_pandas_series(data, self._type)

def __hash__(self) -> int:
    """
    Return a deterministic hash value for this column.

    The hash is built from the column name, the `repr` of the column type, and the number of rows.
    The stored values do not contribute, so columns that differ only in their values share a hash.

    Returns
    -------
    hash : int
        The hash value.
    """
    payload = b"".join(
        (
            self.name.encode("utf-8"),
            # Use the builtin instead of calling the dunder method directly.
            repr(self.type).encode("utf-8"),
            self.number_of_rows.to_bytes(8),
        ),
    )
    return xxhash.xxh3_64(payload).intdigest()

def __iter__(self) -> Iterator[T]:
r"""
Create an iterator for the data of this column. This way e.g. for-each loops can be used on it.
Expand Down
13 changes: 13 additions & 0 deletions src/safeds/data/tabular/containers/_row.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@

import sys
import functools
import operator
from collections.abc import Callable, Mapping
from typing import TYPE_CHECKING, Any

import pandas as pd
import xxhash

from safeds.data.tabular.typing import ColumnType, Schema
from safeds.exceptions import UnknownColumnNameError
Expand Down Expand Up @@ -216,6 +218,17 @@ def __getitem__(self, column_name: str) -> Any:
"""
return self.get_value(column_name)

def __hash__(self) -> int:
    """
    Return a deterministic hash value for this row.

    The hash combines the hash of the row's schema with one 64-bit digest per value, where each value
    is hashed through its string representation.

    Returns
    -------
    hash : int
        The hash value.
    """
    schema_bytes = hash(self._schema).to_bytes(8)
    # Iterating a row yields its column names, so each value is looked up by name.
    # Joining once avoids the repeated bytes copies of a pairwise `reduce(operator.add, ...)`.
    value_digests = b"".join(
        xxhash.xxh3_64(str(self.get_value(column_name))).intdigest().to_bytes(8) for column_name in self
    )
    # A single NUL byte precedes the value digests; keep it so hash values stay stable.
    return xxhash.xxh3_64(schema_bytes + b"\0" + value_digests).intdigest()

def __iter__(self) -> Iterator[Any]:
"""
Create an iterator for the column names of this row.
Expand Down
12 changes: 12 additions & 0 deletions src/safeds/data/tabular/containers/_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import openpyxl
import pandas as pd
import seaborn as sns
import xxhash
from pandas import DataFrame
from scipy import stats

Expand Down Expand Up @@ -457,6 +458,17 @@ def __eq__(self, other: object) -> bool:
return table1.column_names == table2.column_names
return table1._schema == table2._schema and table1._data.equals(table2._data)

def __hash__(self) -> int:
    """
    Return a deterministic hash value for this table.

    The hash is built from the hash of the schema and the number of rows; cell values do not
    contribute to it.

    Returns
    -------
    hash : int
        The hash value.
    """
    schema_bytes = hash(self._schema).to_bytes(8)
    row_count_bytes = self.number_of_rows.to_bytes(8)
    return xxhash.xxh3_64(schema_bytes + row_count_bytes).intdigest()

def __repr__(self) -> str:
r"""
Display the table in only one line.
Expand Down
27 changes: 27 additions & 0 deletions src/safeds/data/tabular/containers/_tagged_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import sys
from typing import TYPE_CHECKING

import xxhash

from safeds.data.tabular.containers import Column, Row, Table
from safeds.exceptions import (
ColumnIsTargetError,
Expand Down Expand Up @@ -165,6 +167,31 @@ def __init__(
self._features: Table = _data.keep_only_columns(feature_names)
self._target: Column = _data.get_column(target_name)

def __eq__(self, other: object) -> bool:
    """
    Check whether another object is a tagged table equal to this one.

    Parameters
    ----------
    other : object
        The object to compare with.

    Returns
    -------
    equals : bool
        'True' if contents and tags are equal, 'False' otherwise. `NotImplemented` is returned if
        `other` is not a `TaggedTable`.
    """
    # Identity implies equality and skips the column/table comparisons.
    if self is other:
        return True
    if not isinstance(other, TaggedTable):
        return NotImplemented
    return (
        self.target == other.target
        and self.features == other.features
        and Table.__eq__(self, other)
    )

def __hash__(self) -> int:
    """
    Return a deterministic hash value for this tagged table.

    The hash combines the hashes of the target column, the feature table, and the underlying table.

    Returns
    -------
    hash : int
        The hash value.
    """
    target_bytes = hash(self.target).to_bytes(8)
    features_bytes = hash(self.features).to_bytes(8)
    table_bytes = Table.__hash__(self).to_bytes(8)
    return xxhash.xxh3_64(target_bytes + features_bytes + table_bytes).intdigest()

def __sizeof__(self) -> int:
"""
Return the complete size of this object.
Expand Down
26 changes: 26 additions & 0 deletions src/safeds/data/tabular/containers/_time_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xxhash

from safeds.data.image.containers import Image
from safeds.data.tabular.containers import Column, Row, Table, TaggedTable
Expand Down Expand Up @@ -194,6 +195,31 @@ def __init__(
raise UnknownColumnNameError([time_name])
self._time: Column = _data.get_column(time_name)

def __eq__(self, other: object) -> bool:
    """
    Check whether another object is a time series equal to this one.

    Parameters
    ----------
    other : object
        The object to compare with.

    Returns
    -------
    equals : bool
        'True' if contents are equal, 'False' otherwise. `NotImplemented` is returned if `other` is
        not a `TimeSeries`.
    """
    # Identity implies equality and skips the column/table comparisons.
    if self is other:
        return True
    if not isinstance(other, TimeSeries):
        return NotImplemented
    return (
        self.time == other.time
        and TaggedTable.__eq__(self, other)
    )

def __hash__(self) -> int:
    """
    Return a deterministic hash value for this time series.

    The hash combines the hash of the time column with the hash of the underlying tagged table.

    Returns
    -------
    hash : int
        The hash value.
    """
    time_bytes = hash(self.time).to_bytes(8)
    tagged_table_bytes = TaggedTable.__hash__(self).to_bytes(8)
    return xxhash.xxh3_64(time_bytes + tagged_table_bytes).intdigest()

def __sizeof__(self) -> int:
"""
Return the complete size of this object.
Expand Down
13 changes: 13 additions & 0 deletions src/safeds/data/tabular/transformation/_table_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,26 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

import xxhash

if TYPE_CHECKING:
from safeds.data.tabular.containers import Table


class TableTransformer(ABC):
"""Learn a transformation for a set of columns in a `Table` and transform another `Table` with the same columns."""

def __hash__(self) -> int:
    """
    Return a deterministic hash value for a table transformer.

    The hash is built from the qualified class name and a single byte that records whether the
    transformer is fitted.

    Returns
    -------
    hash : int
        The hash value.
    """
    class_name_bytes = self.__class__.__qualname__.encode("utf-8")
    fitted_flag = b"\x01" if self.is_fitted() else b"\x00"
    return xxhash.xxh3_64(class_name_bytes + fitted_flag).intdigest()

@abstractmethod
def fit(self, table: Table, column_names: list[str] | None) -> TableTransformer:
"""
Expand Down
6 changes: 4 additions & 2 deletions src/safeds/data/tabular/typing/_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING

import xxhash

from safeds.data.tabular.typing import Anything, Integer, Nothing, RealNumber
from safeds.data.tabular.typing._column_type import ColumnType
from safeds.exceptions import UnknownColumnNameError
Expand Down Expand Up @@ -66,7 +68,7 @@ def __init__(self, schema: dict[str, ColumnType]):

def __hash__(self) -> int:
"""
Return a hash value for the schema.
Return a deterministic hash value for the schema.

Returns
-------
Expand All @@ -81,7 +83,7 @@ def __hash__(self) -> int:
"""
column_names = self._schema.keys()
column_types = map(repr, self._schema.values())
return hash(tuple(zip(column_names, column_types, strict=True)))
return xxhash.xxh3_64(str(tuple(zip(column_names, column_types, strict=True)))).intdigest()

def __repr__(self) -> str:
"""
Expand Down
12 changes: 12 additions & 0 deletions src/safeds/ml/classical/classification/_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

import xxhash
from sklearn.metrics import accuracy_score as sk_accuracy_score

from safeds.data.tabular.containers import Table, TaggedTable
Expand All @@ -17,6 +18,17 @@
class Classifier(ABC):
"""Abstract base class for all classifiers."""

def __hash__(self) -> int:
    """
    Return a deterministic hash value for a classifier.

    The hash is built from the qualified class name and a single byte that records whether the
    classifier is fitted.

    Returns
    -------
    hash : int
        The hash value.
    """
    class_name_bytes = self.__class__.__qualname__.encode("utf-8")
    fitted_flag = b"\x01" if self.is_fitted() else b"\x00"
    return xxhash.xxh3_64(class_name_bytes + fitted_flag).intdigest()

@abstractmethod
def fit(self, training_set: TaggedTable) -> Classifier:
"""
Expand Down
12 changes: 12 additions & 0 deletions src/safeds/ml/classical/regression/_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

import xxhash
from sklearn.metrics import mean_absolute_error as sk_mean_absolute_error
from sklearn.metrics import mean_squared_error as sk_mean_squared_error

Expand All @@ -16,6 +17,17 @@
class Regressor(ABC):
"""Abstract base class for all regressors."""

def __hash__(self) -> int:
    """
    Return a deterministic hash value for a regressor.

    The hash is built from the qualified class name and a single byte that records whether the
    regressor is fitted.

    Returns
    -------
    hash : int
        The hash value.
    """
    class_name_bytes = self.__class__.__qualname__.encode("utf-8")
    fitted_flag = b"\x01" if self.is_fitted() else b"\x00"
    return xxhash.xxh3_64(class_name_bytes + fitted_flag).intdigest()

@abstractmethod
def fit(self, training_set: TaggedTable) -> Regressor:
"""
Expand Down
38 changes: 38 additions & 0 deletions tests/safeds/data/image/containers/test_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,44 @@ def test_should_raise(self, resource_path: str, device: Device) -> None:
assert (image.__eq__(other)) is NotImplemented


class TestHash:
    """Tests for `Image.__hash__`."""

    @pytest.mark.parametrize("device", _test_devices(), ids=_test_devices_ids())
    @pytest.mark.parametrize(
        "resource_path",
        _test_images_all(),
        ids=_test_images_all_ids(),
    )
    def test_should_hash_be_equal(self, resource_path: str, device: Device) -> None:
        """Two images loaded from the same file on the same device must hash equally."""
        _skip_if_device_not_available(device)
        first = Image.from_file(resolve_resource_path(resource_path), device)
        second = Image.from_file(resolve_resource_path(resource_path), device)
        assert hash(first) == hash(second)

    @pytest.mark.parametrize("device", _test_devices(), ids=_test_devices_ids())
    def test_should_hash_not_be_equal(self, device: Device) -> None:
        """Images loaded from the plane and white-square files must hash differently."""
        _skip_if_device_not_available(device)
        # NOTE(review): presumably these two resources differ in width, height, or channel count -
        # confirm, since the test can only pass if the hash inputs differ.
        plane = Image.from_file(resolve_resource_path(_plane_png_path), device)
        white_square = Image.from_file(resolve_resource_path(_white_square_png_path), device)
        assert hash(plane) != hash(white_square)

    @pytest.mark.parametrize(
        "resource_path",
        _test_images_all(),
        ids=_test_images_all_ids(),
    )
    def test_should_hash_be_equal_different_devices(self, resource_path: str) -> None:
        """The same file loaded on CPU and on CUDA must hash equally."""
        _skip_if_device_not_available(_device_cuda)
        on_cpu = Image.from_file(resolve_resource_path(resource_path), torch.device("cpu"))
        on_cuda = Image.from_file(resolve_resource_path(resource_path), torch.device("cuda"))
        assert hash(on_cpu) == hash(on_cuda)

    def test_should_hash_not_be_equal_different_devices(self) -> None:
        """Different files must hash differently even when loaded on different devices."""
        _skip_if_device_not_available(_device_cuda)
        plane_on_cpu = Image.from_file(resolve_resource_path(_plane_png_path), torch.device("cpu"))
        white_square_on_cuda = Image.from_file(resolve_resource_path(_white_square_png_path), torch.device("cuda"))
        assert hash(plane_on_cpu) != hash(white_square_on_cuda)


@pytest.mark.parametrize("device", _test_devices(), ids=_test_devices_ids())
class TestResize:
@pytest.mark.parametrize(
Expand Down
36 changes: 36 additions & 0 deletions tests/safeds/data/tabular/containers/_column/test_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from typing import Any

import pytest
from safeds.data.tabular.containers import Column, Row


@pytest.mark.parametrize(
    ("column1", "column2"),
    [
        (Column("a"), Column("a")),
        (Column("a", [1, 2, 3]), Column("a", [1, 2, 3])),
        (Column("a", [1, 2, 3]), Column("a", [1, 2, 4])),
    ],
    ids=[
        "empty columns",
        "equal columns",
        "different values",
    ],
)
def test_should_return_same_hash_for_equal_columns(column1: Column, column2: Column) -> None:
    """
    Check that both columns of each pair produce the same hash value.

    The "different values" pair is expected to collide as well: the two columns share the name "a"
    and the row count 3 and hold only integers, and the stored values are not part of the column hash.
    """
    first_hash, second_hash = hash(column1), hash(column2)
    assert first_hash == second_hash


@pytest.mark.parametrize(
    ("column1", "column2"),
    [
        (Column("a"), Column("b")),
        (Column("a", [1, 2, 3]), Column("a", ["1", "2", "3"])),
    ],
    ids=[
        "different names",
        "different types",
    ],
)
def test_should_return_different_hash_for_unequal_columns(column1: Column, column2: Column) -> None:
    """Check that columns differing in name or in element type produce different hash values."""
    first_hash, second_hash = hash(column1), hash(column2)
    assert first_hash != second_hash
Loading