Skip to content

Commit

Permalink
fix: handling of boolean columns in column statistics (#778)
Browse files Browse the repository at this point in the history
### Summary of Changes

* `False` (or any falsy non-numeric value) was replaced by `-` in
`summarize_statistics` for min/max. It's now displayed properly.
* `stability` could not be computed for boolean columns (polars
ComputeError). This is fixed.

---------

Co-authored-by: Saman Hushalsadat <[email protected]>
  • Loading branch information
lars-reimann and SamanHushi committed May 17, 2024
1 parent 0b5aa60 commit f61cceb
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 14 deletions.
10 changes: 7 additions & 3 deletions src/safeds/data/tabular/containers/_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -688,9 +688,12 @@ def summarize_statistics(self) -> Table:
self.stability(),
]
else:
min_ = self.min()
max_ = self.max()

values = [
str(self.min() or "-"),
str(self.max() or "-"),
str("-" if min_ is None else min_),
str("-" if max_ is None else max_),
"-",
"-",
"-",
Expand Down Expand Up @@ -1055,7 +1058,8 @@ def stability(self) -> float:
if non_missing.len() == 0:
return 1.0 # All non-null values are the same (since there is are none)

mode_count = non_missing.unique_counts().max()
# `unique_counts` crashes in polars for boolean columns
mode_count = non_missing.value_counts().get_column("count").max()

return mode_count / non_missing.len()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,15 @@
([None], 1),
([1, 2, 3, 4], 1 / 4),
(["b", "a", "abc", "abc", "abc"], 3 / 5),
([True, False, True, True, None], 3 / 4),
],
ids=[
"empty",
"some missing values",
"only missing values",
"numeric",
"non-numeric",
"boolean", # caused a crash in previous implementation
],
)
def test_should_return_stability_of_column(values: list[Any], expected: float) -> None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,36 @@
@pytest.mark.parametrize(
("column", "expected"),
[
(
( # boolean
Column("col", [True, False, True]),
Table(
{
"metric": [
"min",
"max",
"mean",
"median",
"standard deviation",
"distinct value count",
"idness",
"missing value ratio",
"stability",
],
"col": [
"False",
"True",
"-",
"-",
"-",
"2",
str(2 / 3),
"0.0",
str(2 / 3),
],
},
),
),
( # ints
Column("col", [1, 2, 1]),
Table(
{
Expand All @@ -25,18 +54,18 @@
"col": [
1,
2,
4.0 / 3,
1.0,
4 / 3,
1,
stdev([1, 2, 1]),
2,
2.0 / 3,
0.0,
2.0 / 3,
2 / 3,
0,
2 / 3,
],
},
),
),
(
( # strings
Column("col", ["a", "b", "c"]),
Table(
{
Expand Down Expand Up @@ -65,7 +94,7 @@
},
),
),
(
( # only missing
Column("col", [None, None]),
Table(
{
Expand Down Expand Up @@ -96,9 +125,10 @@
),
],
ids=[
"Column of ints",
"Column of strings",
"Column of None",
"boolean",
"ints",
"strings",
"only missing",
],
)
def test_should_summarize_statistics(column: Column, expected: Table) -> None:
Expand Down

0 comments on commit f61cceb

Please sign in to comment.