Skip to content

Commit

Permalink
feat: rename drop_XY methods of Table to remove_XY (#122)
Browse files Browse the repository at this point in the history
### Summary of Changes

Rename the `drop_XY` methods of `Table` to `remove_XY`. When users want
to get rid of columns, they will probably look for "remove", particularly
since the names of methods to include new columns and rows start with
"add".

---------

Co-authored-by: lars-reimann <[email protected]>
  • Loading branch information
lars-reimann and lars-reimann authored Mar 30, 2023
1 parent 76a7112 commit 98d76a4
Show file tree
Hide file tree
Showing 24 changed files with 121 additions and 129 deletions.
2 changes: 1 addition & 1 deletion docs/tutorials/data_processing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@
"execution_count": null,
"outputs": [],
"source": [
"titanic_slice.drop_columns([\n",
"titanic_slice.remove_columns([\n",
" \"id\",\n",
" \"name\",\n",
" \"ticket\",\n",
Expand Down
2 changes: 1 addition & 1 deletion docs/tutorials/data_visualization.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@
"execution_count": null,
"outputs": [],
"source": [
"titanic_numerical = titanic.drop_columns(\n",
"titanic_numerical = titanic.remove_columns(\n",
" [\"id\", \"name\", \"sex\", \"ticket\", \"cabin\", \"port_embarked\"]\n",
")"
],
Expand Down
122 changes: 61 additions & 61 deletions src/safeds/data/tabular/containers/_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,7 +532,60 @@ def add_rows(self, rows: Union[list[Row], Table]) -> Table:
result.columns = self._schema.get_column_names()
return Table(result)

def drop_columns(self, column_names: list[str]) -> Table:
def filter_rows(self, query: Callable[[Row], bool]) -> Table:
    """
    Return a table with rows filtered by Callable (e.g. lambda function).

    Parameters
    ----------
    query : lambda function
        A Callable that is applied to all rows.

    Returns
    -------
    table : Table
        A table containing only the rows filtered by the query.
    """
    # Keep only the rows for which the predicate is truthy.
    matching: list[Row] = [candidate for candidate in self.to_rows() if query(candidate)]
    if not matching:
        # No row matched: return an empty table that still carries the schema.
        return Table([], self._schema)
    return self.from_rows(matching)

def keep_only_columns(self, column_names: list[str]) -> Table:
    """
    Return a table with only the given column(s).

    Parameters
    ----------
    column_names : list[str]
        A list containing only the columns to be kept.

    Returns
    -------
    table : Table
        A table containing only the given column(s).

    Raises
    ------
    UnknownColumnNameError
        If any of the given columns do not exist.
    """
    # Collect all unknown names before raising so the error reports every
    # invalid column at once instead of only the first one.
    invalid_columns = []
    column_indices = []
    for name in column_names:
        if not self._schema.has_column(name):
            invalid_columns.append(name)
        else:
            column_indices.append(self._schema._get_column_index_by_name(name))
    if len(invalid_columns) != 0:
        raise UnknownColumnNameError(invalid_columns)
    # NOTE(review): the data is selected in column_names order, but the labels
    # below are assigned in schema order — if column_names is not given in
    # schema order the labels may not match their data; confirm intended behavior.
    transformed_data = self._data[column_indices]
    transformed_data.columns = list(name for name in self._schema.get_column_names() if name in column_names)
    return Table(transformed_data)

def remove_columns(self, column_names: list[str]) -> Table:
"""
Return a table without the given column(s).
Expand Down Expand Up @@ -564,7 +617,7 @@ def drop_columns(self, column_names: list[str]) -> Table:
transformed_data.columns = list(name for name in self._schema.get_column_names() if name not in column_names)
return Table(transformed_data)

def drop_columns_with_missing_values(self) -> Table:
def remove_columns_with_missing_values(self) -> Table:
"""
Return a table without the columns that contain missing values.
Expand All @@ -575,7 +628,7 @@ def drop_columns_with_missing_values(self) -> Table:
"""
return Table.from_columns([column for column in self.to_columns() if not column.has_missing_values()])

def drop_columns_with_non_numerical_values(self) -> Table:
def remove_columns_with_non_numerical_values(self) -> Table:
"""
Return a table without the columns that contain non-numerical values.
Expand All @@ -587,7 +640,7 @@ def drop_columns_with_non_numerical_values(self) -> Table:
"""
return Table.from_columns([column for column in self.to_columns() if column.type.is_numeric()])

def drop_duplicate_rows(self) -> Table:
def remove_duplicate_rows(self) -> Table:
"""
Return a copy of the table with every duplicate row removed.
Expand All @@ -600,7 +653,7 @@ def drop_duplicate_rows(self) -> Table:
df.columns = self._schema.get_column_names()
return Table(df)

def drop_rows_with_missing_values(self) -> Table:
def remove_rows_with_missing_values(self) -> Table:
"""
Return a table without the rows that contain missing values.
Expand All @@ -613,7 +666,7 @@ def drop_rows_with_missing_values(self) -> Table:
result = result.dropna(axis="index")
return Table(result, self._schema)

def drop_rows_with_outliers(self) -> Table:
def remove_rows_with_outliers(self) -> Table:
"""
Remove all rows from the table that contain at least one outlier.
Expand All @@ -628,65 +681,12 @@ def drop_rows_with_outliers(self) -> Table:
"""
copy = self._data.copy(deep=True)

table_without_nonnumericals = self.drop_columns_with_non_numerical_values()
table_without_nonnumericals = self.remove_columns_with_non_numerical_values()
z_scores = np.absolute(stats.zscore(table_without_nonnumericals._data, nan_policy="omit"))
filter_ = ((z_scores < 3) | np.isnan(z_scores)).all(axis=1)

return Table(copy[filter_], self._schema)

def filter_rows(self, query: Callable[[Row], bool]) -> Table:
    """
    Return a table with rows filtered by Callable (e.g. lambda function).

    Parameters
    ----------
    query : lambda function
        A Callable that is applied to all rows.

    Returns
    -------
    table : Table
        A table containing only the rows filtered by the query.
    """
    # Materialize the matching rows once so the empty case can be detected.
    rows: list[Row] = [row for row in self.to_rows() if query(row)]
    if len(rows) == 0:
        # Presumably from_rows cannot build a table from zero rows, so the
        # schema is passed along explicitly here — TODO confirm.
        result_table = Table([], self._schema)
    else:
        result_table = self.from_rows(rows)
    return result_table

def keep_only_columns(self, column_names: list[str]) -> Table:
    """
    Return a table with only the given column(s).

    Parameters
    ----------
    column_names : list[str]
        A list containing only the columns to be kept.

    Returns
    -------
    table : Table
        A table containing only the given column(s).

    Raises
    ------
    UnknownColumnNameError
        If any of the given columns do not exist.
    """
    # Gather every unknown name first so a single error can list them all.
    invalid_columns = []
    column_indices = []
    for name in column_names:
        if not self._schema.has_column(name):
            invalid_columns.append(name)
        else:
            column_indices.append(self._schema._get_column_index_by_name(name))
    if len(invalid_columns) != 0:
        raise UnknownColumnNameError(invalid_columns)
    # NOTE(review): columns are selected in column_names order while labels are
    # reassigned in schema order; these can disagree when column_names is not in
    # schema order — verify against callers.
    transformed_data = self._data[column_indices]
    transformed_data.columns = list(name for name in self._schema.get_column_names() if name in column_names)
    return Table(transformed_data)

def rename_column(self, old_name: str, new_name: str) -> Table:
"""
Rename a single column.
Expand Down Expand Up @@ -955,7 +955,7 @@ def correlation_heatmap(self) -> None:
"""
Plot a correlation heatmap for all numerical columns of this `Table`.
"""
only_numerical = self.drop_columns_with_non_numerical_values()
only_numerical = self.remove_columns_with_non_numerical_values()

sns.heatmap(
data=only_numerical._data.corr(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
from tests.helpers import resolve_resource_path


def test_table_column_drop() -> None:
def test_table_remove_columns() -> None:
table = Table.from_csv_file(resolve_resource_path("test_table_from_csv_file.csv"))
transformed_table = table.drop_columns(["A"])
transformed_table = table.remove_columns(["A"])
assert transformed_table.schema.has_column("B") and not transformed_table.schema.has_column("A")


def test_table_column_drop_warning() -> None:
def test_table_remove_columns_warning() -> None:
table = Table.from_csv_file(resolve_resource_path("test_table_from_csv_file.csv"))
with pytest.raises(UnknownColumnNameError):
table.drop_columns(["C"])
table.remove_columns(["C"])
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from safeds.data.tabular.typing import ColumnType, TableSchema


def test_drop_columns_with_missing_values_valid() -> None:
def test_remove_columns_with_missing_values_valid() -> None:
table = Table(
pd.DataFrame(
data={
Expand All @@ -15,13 +15,11 @@ def test_drop_columns_with_missing_values_valid() -> None:
}
)
)
updated_table = table.drop_columns_with_missing_values()
updated_table = table.remove_columns_with_missing_values()
assert updated_table.get_column_names() == ["col3", "col4"]


def test_drop_columns_with_missing_values_empty() -> None:
table = Table(
[], TableSchema({"col1": ColumnType.from_numpy_dtype(np.dtype(float))})
)
updated_table = table.drop_columns_with_missing_values()
def test_remove_columns_with_missing_values_empty() -> None:
table = Table([], TableSchema({"col1": ColumnType.from_numpy_dtype(np.dtype(float))}))
updated_table = table.remove_columns_with_missing_values()
assert updated_table.get_column_names() == ["col1"]
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from safeds.data.tabular.typing import ColumnType, TableSchema


def test_drop_columns_with_non_numerical_values_valid() -> None:
def test_remove_columns_with_non_numerical_values_valid() -> None:
table = Table(
pd.DataFrame(
data={
Expand All @@ -15,13 +15,11 @@ def test_drop_columns_with_non_numerical_values_valid() -> None:
}
)
)
updated_table = table.drop_columns_with_non_numerical_values()
updated_table = table.remove_columns_with_non_numerical_values()
assert updated_table.get_column_names() == ["col3", "col4"]


def test_drop_columns_with_non_numerical_values_empty() -> None:
table = Table(
[], TableSchema({"col1": ColumnType.from_numpy_dtype(np.dtype(float))})
)
updated_table = table.drop_columns_with_non_numerical_values()
def test_remove_columns_with_non_numerical_values_empty() -> None:
table = Table([], TableSchema({"col1": ColumnType.from_numpy_dtype(np.dtype(float))}))
updated_table = table.remove_columns_with_non_numerical_values()
assert updated_table.get_column_names() == ["col1"]
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
"test_table_duplicate_rows_no_duplicates.csv",
],
)
def test_drop_duplicate_rows(path: str) -> None:
def test_remove_duplicate_rows(path: str) -> None:
expected_table = Table(pd.DataFrame(data={"A": [1, 4], "B": [2, 5]}))
table = Table.from_csv_file(resolve_resource_path(path))
result_table = table.drop_duplicate_rows()
result_table = table.remove_duplicate_rows()
assert expected_table == result_table
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from safeds.data.tabular.typing import ColumnType, TableSchema


def test_drop_rows_with_missing_values_valid() -> None:
def test_remove_rows_with_missing_values_valid() -> None:
table = Table(
pd.DataFrame(
data={
Expand All @@ -15,13 +15,11 @@ def test_drop_rows_with_missing_values_valid() -> None:
}
)
)
updated_table = table.drop_rows_with_missing_values()
updated_table = table.remove_rows_with_missing_values()
assert updated_table.count_rows() == 2


def test_drop_rows_with_missing_values_empty() -> None:
table = Table(
[], TableSchema({"col1": ColumnType.from_numpy_dtype(np.dtype(float))})
)
updated_table = table.drop_rows_with_missing_values()
def test_remove_rows_with_missing_values_empty() -> None:
table = Table([], TableSchema({"col1": ColumnType.from_numpy_dtype(np.dtype(float))}))
updated_table = table.remove_rows_with_missing_values()
assert updated_table.get_column_names() == ["col1"]
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from safeds.data.tabular.typing import ColumnType, TableSchema


def test_drop_rows_with_outliers_no_outliers() -> None:
def test_remove_rows_with_outliers_no_outliers() -> None:
table = Table(
pd.DataFrame(
data={
Expand All @@ -15,13 +15,13 @@ def test_drop_rows_with_outliers_no_outliers() -> None:
)
)
names = table.get_column_names()
result = table.drop_rows_with_outliers()
result = table.remove_rows_with_outliers()
assert result.count_rows() == 3
assert result.count_columns() == 3
assert names == table.get_column_names()


def test_drop_rows_with_outliers_with_outliers() -> None:
def test_remove_rows_with_outliers_with_outliers() -> None:
input_ = Table(
pd.DataFrame(
data={
Expand All @@ -44,7 +44,7 @@ def test_drop_rows_with_outliers_with_outliers() -> None:
}
)
)
result = input_.drop_rows_with_outliers()
result = input_.remove_rows_with_outliers()

expected = Table(
pd.DataFrame(
Expand All @@ -59,10 +59,8 @@ def test_drop_rows_with_outliers_with_outliers() -> None:
assert result == expected


def test_drop_rows_with_outliers_no_rows() -> None:
table = Table(
[], TableSchema({"col1": ColumnType.from_numpy_dtype(np.dtype(float))})
)
result = table.drop_rows_with_outliers()
def test_remove_rows_with_outliers_no_rows() -> None:
table = Table([], TableSchema({"col1": ColumnType.from_numpy_dtype(np.dtype(float))}))
result = table.remove_rows_with_outliers()
assert result.count_rows() == 0
assert result.count_columns() == 1
4 changes: 2 additions & 2 deletions tests/safeds/ml/classification/test_ada_boost.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ def test_should_include_features_of_prediction_input(self, classifier: Classifie

def test_should_include_complete_prediction_input(self, classifier: Classifier, valid_data: TaggedTable) -> None:
fitted_regressor = classifier.fit(valid_data)
prediction = fitted_regressor.predict(valid_data.drop_columns(["target"]))
assert prediction.drop_columns(["target"]) == valid_data.drop_columns(["target"])
prediction = fitted_regressor.predict(valid_data.remove_columns(["target"]))
assert prediction.remove_columns(["target"]) == valid_data.remove_columns(["target"])

def test_should_set_correct_target_name(self, classifier: Classifier, valid_data: TaggedTable) -> None:
fitted_classifier = classifier.fit(valid_data)
Expand Down
4 changes: 2 additions & 2 deletions tests/safeds/ml/classification/test_decision_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ def test_should_include_features_of_prediction_input(self, classifier: Classifie

def test_should_include_complete_prediction_input(self, classifier: Classifier, valid_data: TaggedTable) -> None:
fitted_regressor = classifier.fit(valid_data)
prediction = fitted_regressor.predict(valid_data.drop_columns(["target"]))
assert prediction.drop_columns(["target"]) == valid_data.drop_columns(["target"])
prediction = fitted_regressor.predict(valid_data.remove_columns(["target"]))
assert prediction.remove_columns(["target"]) == valid_data.remove_columns(["target"])

def test_should_set_correct_target_name(self, classifier: Classifier, valid_data: TaggedTable) -> None:
fitted_classifier = classifier.fit(valid_data)
Expand Down
4 changes: 2 additions & 2 deletions tests/safeds/ml/classification/test_gradient_boosting.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ def test_should_include_features_of_prediction_input(self, classifier: Classifie

def test_should_include_complete_prediction_input(self, classifier: Classifier, valid_data: TaggedTable) -> None:
fitted_regressor = classifier.fit(valid_data)
prediction = fitted_regressor.predict(valid_data.drop_columns(["target"]))
assert prediction.drop_columns(["target"]) == valid_data.drop_columns(["target"])
prediction = fitted_regressor.predict(valid_data.remove_columns(["target"]))
assert prediction.remove_columns(["target"]) == valid_data.remove_columns(["target"])

def test_should_set_correct_target_name(self, classifier: Classifier, valid_data: TaggedTable) -> None:
fitted_classifier = classifier.fit(valid_data)
Expand Down
Loading

0 comments on commit 98d76a4

Please sign in to comment.