Skip to content

Make categorical encoders multisegment #554

Merged
merged 5 commits into from
Feb 22, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
-
- Rename `_SARIMAXModel` and `_ProphetModel`, make `SARIMAXModel` and `ProphetModel` inherit from `PerSegmentPredictionIntervalModel` ([#549](https://github.com/tinkoff-ai/etna/pull/549))
-
-
-
- Make `LabelEncoderTransform` and `OneHotEncoderTransform` multi-segment ([#554](https://github.com/tinkoff-ai/etna/pull/554))
### Fixed
- Fix `TSDataset._update_regressors` logic removing the regressors ([#489](https://github.com/tinkoff-ai/etna/pull/489))
- Fix `TSDataset.info`, `TSDataset.describe` methods ([#519](https://github.com/tinkoff-ai/etna/pull/519))
Expand Down
153 changes: 54 additions & 99 deletions etna/transforms/encoders/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from sklearn.utils._encode import _check_unknown
from sklearn.utils._encode import _encode

from etna.transforms.base import PerSegmentWrapper
from etna.datasets import TSDataset
from etna.transforms.base import Transform


Expand Down Expand Up @@ -40,43 +40,45 @@ def transform(self, y: pd.Series, strategy: str):
return encoded


class _OneSegmentLabelEncoderTransform(Transform):
"""Replace the values in the column with the Label encoding."""
class LabelEncoderTransform(Transform):
"""Encode categorical feature with value between 0 and n_classes-1."""

def __init__(self, in_column: str, out_column: str, strategy: str):
def __init__(self, in_column: str, out_column: Optional[str] = None, strategy: str = ImputerMode.mean):
"""
Create instance of _OneSegmentLabelEncoderTransform.
Init LabelEncoderTransform.

Parameters
----------
in_column:
name of column to apply transform to
Name of column to be transformed
out_column:
name of added column.
Name of added column. If not given, use `self.__repr__()`
strategy:
filling encoding in not fitted values:
- If "new_value", then replace missing dates with '-1'
- If "mean", then replace missing dates using the mean in encoded column
- If "none", then replace missing dates with None
Filling encoding in not fitted values:
- If "new_value", then replace missing values with '-1'
- If "mean", then replace missing values using the mean in encoded column
- If "none", then replace missing values with None
"""
self.in_column = in_column
self.out_column = out_column
self.strategy = strategy
self.le = _LabelEncoder()

def fit(self, df: pd.DataFrame) -> "_OneSegmentLabelEncoderTransform":
def fit(self, df: pd.DataFrame) -> "LabelEncoderTransform":
"""
Fit Label encoder.

Parameters
----------
df:
dataframe with data to fit the transform.
Dataframe with data to fit the transform
Returns
-------
self
self:
Fitted transform
"""
self.le.fit(df[self.in_column])
y = TSDataset.to_flatten(df)[self.in_column]
self.le.fit(y=y)
return self

def transform(self, df: pd.DataFrame) -> pd.DataFrame:
Expand All @@ -86,85 +88,64 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
Parameters
----------
df
dataframe with data to transform.
Dataframe with data to transform
Returns
-------
result dataframe
"""
result_df = df.copy()
result_df[self.out_column] = self.le.transform(df[self.in_column], self.strategy)
result_df[self.out_column] = result_df[self.out_column].astype("category")
result:
Dataframe with column with encoded values
"""
out_column = self._get_column_name()
result_df = TSDataset.to_flatten(df)
result_df[out_column] = self.le.transform(result_df[self.in_column], self.strategy)
result_df[out_column] = result_df[out_column].astype("category")
result_df = TSDataset.to_dataset(result_df)
return result_df


class LabelEncoderTransform(PerSegmentWrapper):
"""Encode categorical feature with value between 0 and n_classes-1."""

def __init__(self, in_column: str, out_column: Optional[str] = None, strategy: str = ImputerMode.mean):
"""
Init LabelEncoderTransform.

Parameters
----------
in_column:
name of column to be transformed
out_column:
name of added column. If not given, use `self.__repr__()` or `regressor_{self.__repr__()}` if it is a regressor
strategy:
filling encoding in not fitted values:
- If "new_value", then replace missing values with '-1'
- If "mean", then replace missing values using the mean in encoded column
- If "none", then replace missing values with None
"""
self.in_column = in_column
self.strategy = strategy
self.out_column = out_column
super().__init__(
transform=_OneSegmentLabelEncoderTransform(
in_column=self.in_column, out_column=self._get_column_name(), strategy=self.strategy
)
)

def _get_column_name(self) -> str:
    """Return the name of the column that receives the encoded values.

    An explicitly given ``out_column`` always wins. When it is not given
    (``None`` or an empty string) the name falls back to
    ``self.__repr__()``. No ``regressor_`` prefix is added, whatever
    ``in_column`` is called, so the result matches the documented
    ``out_column`` contract.

    Returns
    -------
    :
        ``self.out_column`` if it is truthy, otherwise ``self.__repr__()``
    """
    if self.out_column:
        return self.out_column
    # The repr encodes the transform's parameters, which makes the
    # generated column name reproducible for the same configuration.
    return self.__repr__()


class _OneSegmentOneHotEncoderTransform(Transform):
"""Create one-hot encoding columns."""
class OneHotEncoderTransform(Transform):
"""Encode a categorical feature as one-hot numeric features.

If unknown category is encountered during transform, the resulting one-hot
encoded columns for this feature will be all zeros.

def __init__(self, in_column: str, out_column: str):
"""

def __init__(self, in_column: str, out_column: Optional[str] = None):
"""
Create instance of _OneSegmentOneHotEncoderTransform.
Init OneHotEncoderTransform.

Parameters
----------
in_column:
name of column to apply transform to
Name of column to be encoded
out_column:
name of added column
Prefix of names of added columns. If not given, use `self.__repr__()`
"""
self.in_column = in_column
self.out_column = out_column
self.ohe = preprocessing.OneHotEncoder(handle_unknown="ignore", sparse=False)

def fit(self, df: pd.DataFrame) -> "_OneSegmentOneHotEncoderTransform":
def fit(self, df: pd.DataFrame) -> "OneHotEncoderTransform":
"""
Fit One Hot encoder.

Parameters
----------
df:
dataframe with data to fit the transform.
Dataframe with data to fit the transform
Returns
-------
self
self:
Fitted transform
"""
self.ohe.fit(np.array(df[self.in_column]).reshape(-1, 1))
x = TSDataset.to_flatten(df)[self.in_column].values.reshape(-1, 1)
self.ohe.fit(X=x)
return self

def transform(self, df: pd.DataFrame) -> pd.DataFrame:
Expand All @@ -174,49 +155,23 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
Parameters
----------
df
dataframe with data to transform.
Dataframe with data to transform
Returns
-------
result dataframe
"""
result_df = df.copy()
result_df[[self.out_column + "_" + str(i) for i in range(len(self.ohe.categories_[0]))]] = self.ohe.transform(
np.array(df[self.in_column]).reshape(-1, 1)
)
result_df[[self.out_column + "_" + str(i) for i in range(len(self.ohe.categories_[0]))]] = result_df[
[self.out_column + "_" + str(i) for i in range(len(self.ohe.categories_[0]))]
].astype("category")
result:
Dataframe with column with encoded values
"""
out_column = self._get_column_name()
out_columns = [out_column + "_" + str(i) for i in range(len(self.ohe.categories_[0]))]
result_df = TSDataset.to_flatten(df)
x = result_df[self.in_column].values.reshape(-1, 1)
result_df[out_columns] = self.ohe.transform(X=x)
result_df[out_columns] = result_df[out_columns].astype("category")
result_df = TSDataset.to_dataset(result_df)
return result_df


class OneHotEncoderTransform(PerSegmentWrapper):
"""Encode categorical feature as a one-hot numeric features.

If unknown category is encountered during transform, the resulting one-hot encoded columns for this feature will be all zeros.

"""

def __init__(self, in_column: str, out_column: Optional[str] = None):
"""
Init OneHotEncoderTransform.

Parameters
----------
in_column:
name of column to be encoded
out_column:
prefix of names of added columns. If not given, use `self.__repr__()` or `regressor_{self.__repr__()}` if it is a regressor
"""
self.in_column = in_column
self.out_column = out_column
super().__init__(
transform=_OneSegmentOneHotEncoderTransform(in_column=self.in_column, out_column=self._get_column_name())
)

def _get_column_name(self) -> str:
    """Return the prefix used for the names of the one-hot encoded columns.

    An explicitly given ``out_column`` always wins. When it is not given
    (``None`` or an empty string) the prefix falls back to
    ``self.__repr__()``. No ``regressor_`` prefix is added, whatever
    ``in_column`` is called, so the result matches the documented
    ``out_column`` contract.

    Returns
    -------
    :
        ``self.out_column`` if it is truthy, otherwise ``self.__repr__()``
    """
    if self.out_column:
        return self.out_column
    # The repr encodes the transform's parameters, which makes the
    # generated column prefix reproducible for the same configuration.
    return self.__repr__()
64 changes: 33 additions & 31 deletions tests/test_transforms/test_encoders/test_categorical_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,15 @@
@pytest.fixture
def two_df_with_new_values():
d = {
"timestamp": list(pd.date_range(start="2021-01-01", end="2021-01-03"))
+ list(pd.date_range(start="2021-01-01", end="2021-01-03")),
"segment": ["segment_0", "segment_0", "segment_0", "segment_1", "segment_1", "segment_1"],
"timestamp": list(pd.date_range(start="2021-01-01", end="2021-01-03")) * 2,
"segment": ["segment_0"] * 3 + ["segment_1"] * 3,
"regressor_0": [5, 8, 5, 9, 5, 9],
"target": [1, 2, 3, 4, 5, 6],
}
df1 = TSDataset.to_dataset(pd.DataFrame(d))
d = {
"timestamp": list(pd.date_range(start="2021-01-01", end="2021-01-03"))
+ list(pd.date_range(start="2021-01-01", end="2021-01-03")),
"segment": ["segment_0", "segment_0", "segment_0", "segment_1", "segment_1", "segment_1"],
"timestamp": list(pd.date_range(start="2021-01-01", end="2021-01-03")) * 2,
"segment": ["segment_0"] * 3 + ["segment_1"] * 3,
"regressor_0": [5, 8, 9, 5, 0, 0],
"target": [1, 2, 3, 4, 5, 6],
}
Expand Down Expand Up @@ -145,32 +143,38 @@ def test_value_error_label_encoder(df_for_label_encoding):
@pytest.mark.parametrize(
"strategy, expected_values",
[
("new_value", np.array([[5, 0, 1, 5, 0, 4], [8, 1, 2, 0, -1, 5], [9, -1, 3, 0, -1, 6]])),
("none", np.array([[5, 0, 1, 5, 0, 4], [8, 1, 2, 0, np.nan, 5], [9, np.nan, 3, 0, np.nan, 6]])),
("mean", np.array([[5, 0, 1, 5, 0, 4], [8, 1, 2, 0, 0, 5], [9, 0.5, 3, 0, 0, 6]])),
("new_value", {"segment_0": [0, 1, 2], "segment_1": [0, -1, -1]}),
("none", {"segment_0": [0, 1, 2], "segment_1": [0, np.nan, np.nan]}),
("mean", {"segment_0": [0, 1, 2], "segment_1": [0, 3 / 4, 3 / 4]}),
],
)
def test_new_value_label_encoder(two_df_with_new_values, strategy, expected_values):
"""Test that LabelEncoderTransform works correctly with unknown values."""
df1, df2 = two_df_with_new_values
le = LabelEncoderTransform(in_column="regressor_0", strategy=strategy)
segments = df1.columns.get_level_values("segment").unique().tolist()
le = LabelEncoderTransform(in_column="regressor_0", strategy=strategy, out_column="encoded_regressor_0")
le.fit(df1)
np.testing.assert_array_almost_equal(le.transform(df2).values, expected_values)
df2_transformed = le.transform(df2)
for segment in segments:
values = df2_transformed.loc[:, pd.IndexSlice[segment, "encoded_regressor_0"]].values
np.testing.assert_array_almost_equal(values, expected_values[segment])


def test_new_value_ohe_encoder(two_df_with_new_values):
@pytest.mark.parametrize(
"expected_values",
[{"segment_0": [[1, 0, 0], [0, 1, 0], [0, 0, 1]], "segment_1": [[1, 0, 0], [0, 0, 0], [0, 0, 0]]}],
)
def test_new_value_ohe_encoder(two_df_with_new_values, expected_values):
"""Test that OneHotEncoderTransform works correctly with unknown values."""
expected_values = np.array(
[
[5.0, 1.0, 1.0, 0.0, 5.0, 4.0, 1.0, 0.0],
[8.0, 2.0, 0.0, 1.0, 0.0, 5.0, 0.0, 0.0],
[9.0, 3.0, 0.0, 0.0, 0.0, 6.0, 0.0, 0.0],
]
)
df1, df2 = two_df_with_new_values
segments = df1.columns.get_level_values("segment").unique().tolist()
out_columns = ["targets_0", "targets_1", "targets_2"]
ohe = OneHotEncoderTransform(in_column="regressor_0", out_column="targets")
ohe.fit(df1)
np.testing.assert_array_almost_equal(ohe.transform(df2).values, expected_values)
df2_transformed = ohe.transform(df2)
for segment in segments:
values = df2_transformed.loc[:, pd.IndexSlice[segment, out_columns]].values
np.testing.assert_array_almost_equal(values, expected_values[segment])


def test_naming_ohe_encoder(two_df_with_new_values):
Expand All @@ -179,35 +183,33 @@ def test_naming_ohe_encoder(two_df_with_new_values):
ohe = OneHotEncoderTransform(in_column="regressor_0", out_column="targets")
ohe.fit(df1)
segments = ["segment_0", "segment_1"]
target = ["target", "targets_0", "targets_1", "regressor_0"]
target = ["target", "targets_0", "targets_1", "targets_2", "regressor_0"]
assert set([(i, j) for i in segments for j in target]) == set(ohe.transform(df2).columns.values)


@pytest.mark.parametrize(
"in_column, prefix",
[("2", ""), ("regressor_1", "regressor_")],
"in_column",
[("2"), ("regressor_1")],
)
def test_naming_ohe_encoder_no_out_column(df_for_naming, in_column, prefix):
def test_naming_ohe_encoder_no_out_column(df_for_naming, in_column):
"""Test OneHotEncoderTransform gives the correct columns with no out_column."""
df = df_for_naming
ohe = OneHotEncoderTransform(in_column=in_column)
ohe.fit(df)
answer = set(
list(df["segment_0"].columns) + [prefix + str(ohe.__repr__()) + "_0", prefix + str(ohe.__repr__()) + "_1"]
)
answer = set(list(df["segment_0"].columns) + [str(ohe.__repr__()) + "_0", str(ohe.__repr__()) + "_1"])
assert answer == set(ohe.transform(df)["segment_0"].columns.values)


@pytest.mark.parametrize(
"in_column, prefix",
[("2", ""), ("regressor_1", "regressor_")],
"in_column",
[("2"), ("regressor_1")],
)
def test_naming_label_encoder_no_out_column(df_for_naming, in_column, prefix):
def test_naming_label_encoder_no_out_column(df_for_naming, in_column):
"""Test LabelEncoderTransform gives the correct columns with no out_column."""
df = df_for_naming
le = LabelEncoderTransform(in_column=in_column)
le.fit(df)
answer = set(list(df["segment_0"].columns) + [prefix + str(le.__repr__())])
answer = set(list(df["segment_0"].columns) + [str(le.__repr__())])
assert answer == set(le.transform(df)["segment_0"].columns.values)


Expand Down