Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: polars implementation of table #744

Merged
merged 40 commits into from
May 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
04629ab
refactor: make fields internal
lars-reimann May 7, 2024
2835bd4
test: store polars benchmark in polars table
lars-reimann May 7, 2024
5cd1229
style: add comma
lars-reimann May 7, 2024
9844d65
feat: implement some table operations
lars-reimann May 8, 2024
d4023ff
docs: minor changes
lars-reimann May 8, 2024
61e361f
feat: summarize_statistics
lars-reimann May 8, 2024
e7eeb86
feat: tabular dataset backed by polars
lars-reimann May 8, 2024
7b6cd5d
refactor: get polars `IntoExpr` for any cell implementation
lars-reimann May 8, 2024
f9c6264
feat: lazy `transform_column`
lars-reimann May 8, 2024
f9bcba0
feat: handle new data structures in models
lars-reimann May 8, 2024
a612198
feat: experimental table transformers (just buggy copies)
lars-reimann May 8, 2024
d53fd09
feat: `ExperimentalTable.transform` and `ExperimentalTable.inverse_tr…
lars-reimann May 8, 2024
e4dcd9d
refactor: move invertible transformer to own file
lars-reimann May 8, 2024
3ea43f2
refactor: data frame field as lazy property
lars-reimann May 8, 2024
f8b1f84
refactor: bring back `ExperimentalTable.remove_columns_except`
lars-reimann May 8, 2024
b8b975b
fix: errors in transformers
lars-reimann May 8, 2024
6e3843a
ci: ruff config in MegaLinter?
lars-reimann May 8, 2024
27f8495
feat: use ASCII to format tables
lars-reimann May 8, 2024
3eefe18
feat: `remove_rows_with_outliers`
lars-reimann May 8, 2024
ae3ac76
feat: column plots
lars-reimann May 8, 2024
1ba620e
perf: faster lag plot
lars-reimann May 8, 2024
7a7037c
refactor: extract conversion of figure to image
lars-reimann May 8, 2024
4c79e6e
feat: finish column plotter
lars-reimann May 8, 2024
8b65aea
feat: named cell operations
lars-reimann May 9, 2024
31f9c11
feat: floor & ceil
lars-reimann May 9, 2024
dcd4c61
docs: document table operations
lars-reimann May 9, 2024
e7315c2
perf: lazy `replace_column`
lars-reimann May 9, 2024
1eaec6a
test: benchmark for remove_rows_with_outliers
lars-reimann May 9, 2024
69cda9d
docs: more documentation for table operations
lars-reimann May 9, 2024
caf1745
docs: more documentation for table operations
lars-reimann May 9, 2024
973db7f
docs: remainder of table
lars-reimann May 9, 2024
8971dd3
feat: scatter and line plot
lars-reimann May 9, 2024
5a15186
feat: histograms
lars-reimann May 9, 2024
565778d
feat: remaining table plots
lars-reimann May 9, 2024
76136f5
fix: import errors
lars-reimann May 9, 2024
3640b3b
build: update deps
lars-reimann May 9, 2024
cc2e094
ci: undo setting ruff config
lars-reimann May 9, 2024
f5a6238
style: apply automated linter fixes
megalinter-bot May 9, 2024
fb7ed01
fix: models not working with new containers
lars-reimann May 9, 2024
bf09f2c
style: apply automated linter fixes
megalinter-bot May 9, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions benchmarks/table/column_operations_polars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from timeit import timeit

from safeds.data.tabular.containers import ExperimentalTable

from benchmarks.table.utils import create_synthetic_table_polars

REPETITIONS = 10


def _run_remove_columns_with_missing_values() -> None:
    """
    Benchmark body for `remove_columns_with_missing_values`.

    Reads the module-level `table` (assigned in the `__main__` section) and collects the
    resulting lazy frame so the timed call includes materializing the result.
    """
    result = table.remove_columns_with_missing_values()
    result._lazy_frame.collect()


def _run_remove_non_numeric_columns() -> None:
    """
    Benchmark body for `remove_non_numeric_columns`.

    Reads the module-level `table` (assigned in the `__main__` section) and collects the
    resulting lazy frame so the timed call includes materializing the result.
    """
    result = table.remove_non_numeric_columns()
    result._lazy_frame.collect()


def _run_summarize_statistics() -> None:
    """
    Benchmark body for `summarize_statistics`.

    Reads the module-level `table` (assigned in the `__main__` section) and collects the
    resulting lazy frame so the timed call includes materializing the result.
    """
    statistics = table.summarize_statistics()
    statistics._lazy_frame.collect()


if __name__ == "__main__":
    # Module-level table that the benchmark functions above read as a global
    table = create_synthetic_table_polars(100, 5000)

    # Benchmark name -> zero-argument callable; dict order is the execution order
    benchmarks = {
        "remove_columns_with_missing_values": _run_remove_columns_with_missing_values,
        "remove_non_numeric_columns": _run_remove_non_numeric_columns,
        "summarize_statistics": _run_summarize_statistics,
    }

    # Time each benchmark REPETITIONS times
    timings: dict[str, float] = {
        name: timeit(benchmark, number=REPETITIONS) for name, benchmark in benchmarks.items()
    }

    # Print the timings as a table with one row per benchmark
    report = ExperimentalTable(
        {
            "method": list(timings),
            "timing": list(timings.values()),
        },
    )
    print(report)
41 changes: 27 additions & 14 deletions benchmarks/table/row_operations_polars.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from timeit import timeit

from safeds.data.tabular.containers import Table
import polars as pl

from safeds.data.tabular.containers import ExperimentalTable

from benchmarks.table.utils import create_synthetic_table_polars

Expand All @@ -15,14 +17,18 @@ def _run_remove_rows_with_missing_values() -> None:
table.remove_rows_with_missing_values()._lazy_frame.collect()


# def _run_remove_rows_with_outliers() -> None:
# table.remove_rows_with_outliers()
def _run_remove_rows_with_outliers() -> None:
    """
    Benchmark body for `remove_rows_with_outliers`.

    Reads the module-level `table` and collects the resulting lazy frame, like the other
    benchmark bodies in this file, so the timing covers materializing the result and is
    comparable with the neighbouring benchmarks.
    """
    table.remove_rows_with_outliers()._lazy_frame.collect()


def _run_remove_rows() -> None:
    """
    Benchmark body for `remove_rows`.

    Passes a row predicate that tests whether the value of "column_0" is even, then
    collects the resulting lazy frame of the module-level `table`.
    """

    def _column_0_is_even(row):
        return row.get_value("column_0") % 2 == 0

    filtered = table.remove_rows(_column_0_is_even)
    filtered._lazy_frame.collect()


def _run_remove_rows_by_column() -> None:
    """
    Benchmark body for `remove_rows_by_column`.

    Passes a cell predicate for "column_0" that tests whether the cell is even, then
    collects the resulting lazy frame of the module-level `table`.
    """

    def _is_even(cell):
        return cell % 2 == 0

    filtered = table.remove_rows_by_column("column_0", _is_even)
    filtered._lazy_frame.collect()


def _run_shuffle_rows() -> None:
    """
    Benchmark body for `shuffle_rows`.

    Reads the module-level `table` and collects the resulting lazy frame so the timed
    call includes materializing the result.
    """
    shuffled = table.shuffle_rows()
    shuffled._lazy_frame.collect()

Expand Down Expand Up @@ -63,14 +69,18 @@ def _run_transform_column() -> None:
_run_remove_rows_with_missing_values,
number=REPETITIONS,
),
# "remove_rows_with_outliers": timeit(
# _run_remove_rows_with_outliers,
# number=REPETITIONS,
# ),
"remove_rows_with_outliers": timeit(
_run_remove_rows_with_outliers,
number=REPETITIONS,
),
"remove_rows": timeit(
_run_remove_rows,
number=REPETITIONS,
),
"remove_rows_by_column": timeit(
_run_remove_rows_by_column,
number=REPETITIONS,
),
"shuffle_rows": timeit(
_run_shuffle_rows,
number=REPETITIONS,
Expand Down Expand Up @@ -98,11 +108,14 @@ def _run_transform_column() -> None:
}

# Print the timings
print(
Table(
{
"method": list(timings.keys()),
"timing": list(timings.values()),
}
with pl.Config(
tbl_rows=-1,
):
print(
ExperimentalTable(
{
"method": list(timings.keys()),
"timing": list(timings.values()),
}
)
)
)
3 changes: 2 additions & 1 deletion benchmarks/table/utils/create_synthetic_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ def create_synthetic_table(
min_value: int = 0,
max_value: int = 1000,
) -> Table:
"""Create a synthetic Table with random numerical data.
"""
Create a synthetic Table with random numerical data.

Parameters
----------
Expand Down
3 changes: 2 additions & 1 deletion benchmarks/table/utils/create_synthetic_table_polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ def create_synthetic_table_polars(
min_value: int = 0,
max_value: int = 1000,
) -> ExperimentalTable:
"""Create a synthetic Table with random numerical data.
"""
Create a synthetic Table with random numerical data.

Parameters
----------
Expand Down
16 changes: 8 additions & 8 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ matplotlib = "^3.6.3"
openpyxl = "^3.1.2"
pandas = "^2.0.0"
pillow = ">=9.5,<11.0"
polars = {extras = ["numpy", "pyarrow"], version = "^0.20.24"}
polars = {extras = ["numpy", "pyarrow"], version = "^0.20.25"}
scikit-learn = "^1.2.0"
seaborn = "^0.13.0"
statsmodels = "^0.14.1"
Expand Down
6 changes: 6 additions & 0 deletions src/resources/from_json_file_2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"columns": [
{ "name": "a", "datatype": "Int64", "bit_settings": "", "values": [1, 2, 3] },
{ "name": "b", "datatype": "Int64", "bit_settings": "", "values": [4, 5, 6] }
]
}
Binary file added src/resources/from_parquet_file.parquet
Binary file not shown.
Binary file modified src/resources/to_excel_file.xlsx
Binary file not shown.
6 changes: 6 additions & 0 deletions src/resources/to_json_file_2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"columns": [
{ "name": "a", "datatype": "Int64", "bit_settings": "", "values": [1, 2, 3] },
{ "name": "b", "datatype": "Int64", "bit_settings": "", "values": [4, 5, 6] }
]
}
Binary file added src/resources/to_parquet_file.parquet
Binary file not shown.
8 changes: 5 additions & 3 deletions src/safeds/_config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,19 @@
import apipkg

if TYPE_CHECKING:
from ._device import _get_device, _init_default_device
from ._torch import _get_device, _init_default_device, _set_default_device

apipkg.initpkg(
__name__,
{
"_get_device": "._device:_get_device",
"_init_default_device": "._device:_init_default_device",
"_get_device": "._torch:_get_device",
"_init_default_device": "._torch:_init_default_device",
"_set_default_device": "._torch:_set_default_device",
},
)

__all__ = [
"_get_device",
"_init_default_device",
"_set_default_device",
]
17 changes: 17 additions & 0 deletions src/safeds/_config/_polars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
import polars as pl


def _get_polars_config() -> pl.Config:
    """
    Create the polars configuration object used by this package.

    Returns
    -------
    config:
        A `pl.Config` constructed with the float precision, numeric cell alignment, table
        formatting and shape-hiding options listed below.
    """
    # Imported lazily; the module-level import is only for type checking
    import polars as pl

    options = {
        "float_precision": 5,
        "tbl_cell_numeric_alignment": "RIGHT",
        "tbl_formatting": "ASCII_FULL_CONDENSED",
        "tbl_hide_dataframe_shape": True,
    }
    return pl.Config(**options)
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def _get_device() -> Device:
def _init_default_device() -> None:
import torch

global _default_device
global _default_device # noqa: PLW0603

if _default_device is None:
_default_device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
Expand All @@ -28,7 +28,7 @@ def _init_default_device() -> None:

def _set_default_device(device: Device) -> None:
# This changes all future tensors, but not any tensor that already exists
global _default_device
global _default_device # noqa: PLW0603

_default_device = device
_init_default_device()
6 changes: 6 additions & 0 deletions src/safeds/_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,22 @@
if TYPE_CHECKING:
from ._file_io import _check_and_normalize_file_path
from ._hashing import _structural_hash
from ._plotting import _figure_to_image
from ._random import _get_random_seed

apipkg.initpkg(
__name__,
{
"_check_and_normalize_file_path": "._file_io:_check_and_normalize_file_path",
"_structural_hash": "._hashing:_structural_hash",
"_figure_to_image": "._plotting:_figure_to_image",
"_get_random_seed": "._random:_get_random_seed",
},
)

__all__ = [
"_check_and_normalize_file_path",
"_structural_hash",
"_figure_to_image",
"_get_random_seed",
]
32 changes: 32 additions & 0 deletions src/safeds/_utils/_plotting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from __future__ import annotations

import io
from typing import TYPE_CHECKING

from safeds.data.image.containers import Image

if TYPE_CHECKING:
import matplotlib.pyplot as plt


def _figure_to_image(figure: plt.Figure) -> Image:
    """
    Store the figure as an image and close it.

    The figure is closed even if rendering fails, so a failed plot does not leave an open
    figure behind.

    Parameters
    ----------
    figure:
        The figure to store.

    Returns
    -------
    image:
        The figure as an image.
    """
    # Imported lazily; the module-level import is only for type checking
    import matplotlib.pyplot as plt

    buffer = io.BytesIO()
    try:
        figure.savefig(buffer, format="png")
    finally:
        plt.close(figure)  # Prevents the figure from being displayed directly

    # getvalue() returns the whole buffer regardless of the current stream position
    return Image.from_bytes(buffer.getvalue())
3 changes: 3 additions & 0 deletions src/safeds/data/labeled/containers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,23 @@
import apipkg

if TYPE_CHECKING:
from ._experimental_tabular_dataset import ExperimentalTabularDataset
from ._image_dataset import ImageDataset
from ._tabular_dataset import TabularDataset
from ._time_series_dataset import TimeSeriesDataset

apipkg.initpkg(
__name__,
{
"ExperimentalTabularDataset": "._experimental_tabular_dataset:ExperimentalTabularDataset",
"ImageDataset": "._image_dataset:ImageDataset",
"TabularDataset": "._tabular_dataset:TabularDataset",
"TimeSeriesDataset": "._time_series_dataset:TimeSeriesDataset",
},
)

__all__ = [
"ExperimentalTabularDataset",
"ImageDataset",
"TabularDataset",
"TimeSeriesDataset",
Expand Down
Loading