Skip to content

Commit

Permalink
fix: handling of missing values when dropping rows with outliers (#101)
Browse files Browse the repository at this point in the history
Closes #7.

### Summary of Changes

Previously, calling `drop_rows_with_outliers` on a `Table` that had at
least one missing value in a numerical column caused the resulting table
to be completely empty. This PR introduces two changes:

1. Missing values are never considered outliers.
2. Missing values are ignored when computing the standard deviation.

---------

Co-authored-by: lars-reimann <[email protected]>
  • Loading branch information
lars-reimann and lars-reimann authored Mar 27, 2023
1 parent a0c56ad commit 0a5e853
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 63 deletions.
42 changes: 14 additions & 28 deletions src/safeds/data/tabular/containers/_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -596,7 +596,9 @@ def drop_columns_with_non_numerical_values(self) -> Table:
A table without the columns that contain non-numerical values.
"""
return Table.from_columns(self._list_columns_with_numerical_values())
return Table.from_columns(
[column for column in self.to_columns() if column.type.is_numeric()]
)

def drop_duplicate_rows(self) -> Table:
"""
Expand Down Expand Up @@ -626,27 +628,26 @@ def drop_rows_with_missing_values(self) -> Table:

def drop_rows_with_outliers(self) -> Table:
"""
Remove all rows from the table that contain at least one outlier defined as having a value that has a distance
of more than 3 standard deviations from the column average.
Remove all rows from the table that contain at least one outlier.
We define an outlier as a value that has a distance of more than 3 standard deviations from the column mean.
Missing values are not considered outliers. They are also ignored during the calculation of the standard
deviation.
Returns
-------
new_table : Table
A new table without rows containing outliers.
"""
result = self._data.copy(deep=True)
copy = self._data.copy(deep=True)

table_without_nonnumericals = Table.from_columns(
self._list_columns_with_numerical_values()
table_without_nonnumericals = self.drop_columns_with_non_numerical_values()
z_scores = np.absolute(
stats.zscore(table_without_nonnumericals._data, nan_policy="omit")
)
filter_ = ((z_scores < 3) | np.isnan(z_scores)).all(axis=1)

result = result[
(np.absolute(stats.zscore(table_without_nonnumericals._data)) < 3).all(
axis=1
)
]

return Table(result, self._schema)
return Table(copy[filter_], self._schema)

def filter_rows(self, query: Callable[[Row], bool]) -> Table:
"""
Expand Down Expand Up @@ -1098,18 +1099,3 @@ def _ipython_display_(self) -> DisplayHandle:
"display.max_rows", tmp.shape[0], "display.max_columns", tmp.shape[1]
):
return display(tmp)

def _list_columns_with_numerical_values(self) -> list[Column]:
"""
Return a list of columns only containing numerical values.
Returns
-------
cols : list[Column]
The list with only numerical columns.
"""
cols = []
for column_name, data_type in self._schema._schema.items():
if data_type.is_numeric():
cols.append(self.get_column(column_name))
return cols
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def test_drop_rows_with_outliers_no_outliers() -> None:


def test_drop_rows_with_outliers_with_outliers() -> None:
table = Table(
input_ = Table(
pd.DataFrame(
data={
"col1": [
Expand All @@ -39,14 +39,24 @@ def test_drop_rows_with_outliers_with_outliers() -> None:
"a",
"a",
],
"col2": [1.0, 2.0, 3.0, 4.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
"col2": [1.0, 2.0, 3.0, 4.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, None],
"col3": [2, 3, 1, 1_000_000_000, 1, 1, 1, 1, 1, 1, 1, 1],
}
)
)
result = table.drop_rows_with_outliers()
assert result.count_rows() == 11
assert result.count_columns() == 3
result = input_.drop_rows_with_outliers()

expected = Table(
pd.DataFrame(
data={
"col1": ["A", "B", "C", "a", "a", "a", "a", "a", "a", "a", "a"],
"col2": [1.0, 2.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, None],
"col3": [2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1],
}
)
)

assert result == expected


def test_drop_rows_with_outliers_no_rows() -> None:
Expand Down

This file was deleted.

0 comments on commit 0a5e853

Please sign in to comment.