diff --git a/CHANGELOG.md b/CHANGELOG.md index 653a458df..93cedfd0c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Method TSDataset.info ([#409](https://github.com/tinkoff-ai/etna/pull/409)) - DifferencingTransform ([#414](https://github.com/tinkoff-ai/etna/pull/414)) +- OneHotEncoderTransform and LabelEncoderTransform ([#431](https://github.com/tinkoff-ai/etna/pull/431)) - MADTransform ([#441](https://github.com/tinkoff-ai/etna/pull/441)) - `MRMRFeatureSelectionTransform` ([#439](https://github.com/tinkoff-ai/etna/pull/439)) - Possibility to change metric representation in backtest using Metric.name ([#454](https://github.com/tinkoff-ai/etna/pull/454)) diff --git a/etna/transforms/__init__.py b/etna/transforms/__init__.py index 523d3d7c7..2f35e912e 100644 --- a/etna/transforms/__init__.py +++ b/etna/transforms/__init__.py @@ -6,7 +6,9 @@ from etna.transforms.decomposition import STLTransform from etna.transforms.decomposition import TheilSenTrendTransform from etna.transforms.decomposition import TrendTransform +from etna.transforms.encoders import LabelEncoderTransform from etna.transforms.encoders import MeanSegmentEncoderTransform +from etna.transforms.encoders import OneHotEncoderTransform from etna.transforms.encoders import SegmentEncoderTransform from etna.transforms.feature_selection import FilterFeaturesTransform from etna.transforms.feature_selection import GaleShapleyFeatureSelectionTransform diff --git a/etna/transforms/encoders/__init__.py b/etna/transforms/encoders/__init__.py index ccc9d46bd..2b23a01ce 100644 --- a/etna/transforms/encoders/__init__.py +++ b/etna/transforms/encoders/__init__.py @@ -1,2 +1,4 @@ +from etna.transforms.encoders.categorical import LabelEncoderTransform +from etna.transforms.encoders.categorical import OneHotEncoderTransform from etna.transforms.encoders.mean_segment_encoder import MeanSegmentEncoderTransform from 
from enum import Enum
from typing import List
from typing import Optional

import numpy as np
import pandas as pd
from sklearn import preprocessing

from etna.transforms.base import PerSegmentWrapper
from etna.transforms.base import Transform


class ImputerMode(str, Enum):
    """Enum of strategies for encoding values that were not seen during fit."""

    new_value = "new_value"
    mean = "mean"
    none = "none"


def _resolve_out_column(transform: object, in_column: str, out_column: Optional[str]) -> str:
    """Return `out_column` if it is set, otherwise a name derived from `repr(transform)`.

    The derived name gets the `regressor_` prefix when `in_column` starts with "regressor".
    """
    if out_column:
        return out_column
    if in_column.startswith("regressor"):
        return f"regressor_{transform!r}"
    return repr(transform)


class _LabelEncoder(preprocessing.LabelEncoder):
    """Label encoder that imputes a code for values unseen during fit instead of raising."""

    def transform(self, y: pd.Series, strategy: str) -> np.ndarray:
        """
        Encode `y` with the fitted classes, imputing codes of unseen values.

        Parameters
        ----------
        y:
            values to encode
        strategy:
            filling encoding in not fitted values:
            - If "new_value", then replace unseen values with -1
            - If "mean", then replace unseen values with the mean code of the seen values
            - If "none", then replace unseen values with NaN

        Returns
        -------
        float array with one code per element of `y`

        Raises
        ------
        ValueError:
            if `strategy` is not one of the `ImputerMode` values
        """
        # Validate first so that a wrong strategy fails before any encoding work is done.
        try:
            mode = ImputerMode(strategy)
        except ValueError:
            raise ValueError(f"The strategy '{strategy}' doesn't exist") from None

        # `get_indexer` gives the position of each value in `classes_` and -1 for unseen
        # values. It works for numeric and object/string data alike and avoids the private
        # `sklearn.utils._encode` helpers.
        codes = pd.Index(self.classes_).get_indexer(np.asarray(y))
        is_known = codes != -1
        encoded = codes.astype(float)

        if mode is ImputerMode.new_value:
            filling_value = -1.0
        elif mode is ImputerMode.mean:
            # No seen values means there is nothing to average: fall back to NaN.
            filling_value = encoded[is_known].mean() if is_known.any() else np.nan
        else:
            filling_value = np.nan

        encoded[~is_known] = filling_value
        return encoded


class _OneSegmentLabelEncoderTransform(Transform):
    """Replace the values in the column with the Label encoding."""

    def __init__(self, in_column: str, out_column: str, strategy: str):
        """
        Create instance of _OneSegmentLabelEncoderTransform.

        Parameters
        ----------
        in_column:
            name of column to apply transform to
        out_column:
            name of added column.
        strategy:
            filling encoding in not fitted values:
            - If "new_value", then replace unseen values with '-1'
            - If "mean", then replace unseen values using the mean in encoded column
            - If "none", then replace unseen values with None
        """
        self.in_column = in_column
        self.out_column = out_column
        self.strategy = strategy
        self.le = _LabelEncoder()

    def fit(self, df: pd.DataFrame) -> "_OneSegmentLabelEncoderTransform":
        """
        Fit Label encoder.

        Parameters
        ----------
        df:
            dataframe with data to fit the transform.
        Returns
        -------
        self
        """
        self.le.fit(df[self.in_column])
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Encode the `in_column` by fitted Label encoder.

        Parameters
        ----------
        df
            dataframe with data to transform.
        Returns
        -------
        copy of `df` with the encoded values added as categorical column `out_column`
        """
        result_df = df.copy()
        encoded = self.le.transform(df[self.in_column], self.strategy)
        result_df[self.out_column] = encoded
        result_df[self.out_column] = result_df[self.out_column].astype("category")
        return result_df


class LabelEncoderTransform(PerSegmentWrapper):
    """Encode categorical feature with value between 0 and n_classes-1."""

    def __init__(self, in_column: str, out_column: Optional[str] = None, strategy: str = ImputerMode.mean):
        """
        Init LabelEncoderTransform.

        Parameters
        ----------
        in_column:
            name of column to be transformed
        out_column:
            name of added column. If not given, use `self.__repr__()` or `regressor_{self.__repr__()}` if it is a regressor
        strategy:
            filling encoding in not fitted values:
            - If "new_value", then replace unseen values with '-1'
            - If "mean", then replace unseen values using the mean in encoded column
            - If "none", then replace unseen values with None
        """
        # The attributes are assigned before `_get_column_name()` is called, exactly as before,
        # because the default column name is built from `repr(self)`.
        self.in_column = in_column
        self.strategy = strategy
        self.out_column = out_column
        super().__init__(
            transform=_OneSegmentLabelEncoderTransform(
                in_column=self.in_column, out_column=self._get_column_name(), strategy=self.strategy
            )
        )

    def _get_column_name(self) -> str:
        """Get the `out_column` depending on the transform's parameters."""
        return _resolve_out_column(self, self.in_column, self.out_column)


class _OneSegmentOneHotEncoderTransform(Transform):
    """Create one-hot encoding columns."""

    def __init__(self, in_column: str, out_column: str):
        """
        Create instance of _OneSegmentOneHotEncoderTransform.

        Parameters
        ----------
        in_column:
            name of column to apply transform to
        out_column:
            prefix of names of added columns
        """
        self.in_column = in_column
        self.out_column = out_column
        # The encoder keeps its default sparse output and `transform` densifies it. The flag
        # selecting dense output was renamed from `sparse` to `sparse_output` in
        # scikit-learn 1.2, so not passing it works on both sides of the rename.
        self.ohe = preprocessing.OneHotEncoder(handle_unknown="ignore")

    def _column_as_2d(self, df: pd.DataFrame) -> np.ndarray:
        """Return `in_column` of `df` as a single-feature 2D array."""
        return np.array(df[self.in_column]).reshape(-1, 1)

    def _encoded_column_names(self) -> List[str]:
        """Return one `<out_column>_<i>` name per fitted category."""
        return [f"{self.out_column}_{i}" for i in range(len(self.ohe.categories_[0]))]

    def fit(self, df: pd.DataFrame) -> "_OneSegmentOneHotEncoderTransform":
        """
        Fit One Hot encoder.

        Parameters
        ----------
        df:
            dataframe with data to fit the transform.
        Returns
        -------
        self
        """
        self.ohe.fit(self._column_as_2d(df))
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Encode the `in_column` by fitted One Hot encoder.

        Parameters
        ----------
        df
            dataframe with data to transform.
        Returns
        -------
        copy of `df` with one categorical column `<out_column>_<i>` added per fitted category
        """
        column_names = self._encoded_column_names()
        encoded = self.ohe.transform(self._column_as_2d(df)).toarray()
        result_df = df.copy()
        result_df[column_names] = encoded
        result_df[column_names] = result_df[column_names].astype("category")
        return result_df


class OneHotEncoderTransform(PerSegmentWrapper):
    """Encode categorical feature as a one-hot numeric features.

    If unknown category is encountered during transform, the resulting one-hot encoded columns for this feature will be all zeros.

    """

    def __init__(self, in_column: str, out_column: Optional[str] = None):
        """
        Init OneHotEncoderTransform.

        Parameters
        ----------
        in_column:
            name of column to be encoded
        out_column:
            prefix of names of added columns. If not given, use `self.__repr__()` or `regressor_{self.__repr__()}` if it is a regressor
        """
        self.in_column = in_column
        self.out_column = out_column
        super().__init__(
            transform=_OneSegmentOneHotEncoderTransform(in_column=self.in_column, out_column=self._get_column_name())
        )

    def _get_column_name(self) -> str:
        """Get the `out_column` depending on the transform's parameters."""
        return _resolve_out_column(self, self.in_column, self.out_column)
import numpy as np
import pandas as pd
import pytest

from etna.datasets import TSDataset
from etna.datasets import generate_ar_df
from etna.datasets import generate_const_df
from etna.datasets import generate_periodic_df
from etna.metrics import R2
from etna.models import LinearPerSegmentModel
from etna.transforms import FilterFeaturesTransform
from etna.transforms.encoders.categorical import LabelEncoderTransform
from etna.transforms.encoders.categorical import OneHotEncoderTransform


def _two_segment_dataset(regressor_values):
    """Build a wide two-segment, three-day dataframe whose `regressor_0` holds `regressor_values`."""
    timestamps = list(pd.date_range(start="2021-01-01", end="2021-01-03"))
    long_df = pd.DataFrame(
        {
            "timestamp": timestamps + timestamps,
            "segment": ["segment_0"] * 3 + ["segment_1"] * 3,
            "regressor_0": regressor_values,
            "target": [1, 2, 3, 4, 5, 6],
        }
    )
    return TSDataset.to_dataset(long_df)


def _one_segment_regressors_df():
    """Build the wide dataframe of a one-segment dataset with four regressor columns."""
    df_to_forecast = generate_ar_df(10, start_time="2021-01-01", n_segments=1)
    df_regressors = pd.DataFrame(
        {
            "timestamp": pd.date_range(start="2021-01-01", end="2021-01-12"),
            "regressor_0": [5, 8, 5, 8, 5, 8, 5, 8, 5, 8, 5, 8],
            "regressor_1": [9, 5, 9, 5, 9, 5, 9, 5, 9, 5, 9, 5],
            "regressor_2": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            "regressor_3": [1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7],
        }
    )
    df_regressors["segment"] = "segment_0"
    tsdataset = TSDataset(
        df=TSDataset.to_dataset(df_to_forecast), freq="D", df_exog=TSDataset.to_dataset(df_regressors)
    )
    return tsdataset.df


def _with_indicator_columns(df, in_column, indicators):
    """Return `segment_0` of `df` plus one categorical 0.0/1.0 column per item of `indicators`.

    `indicators` maps a new column name to the value of `in_column` that is marked with 1.0.
    """
    answer = df.copy()["segment_0"]
    for column_name, marked_value in indicators.items():
        # The lambda is consumed immediately by `apply`, so binding the loop variable is safe.
        answer[column_name] = answer[in_column].apply(lambda x: float(x == marked_value))
    for column_name in indicators:
        answer[column_name] = answer[column_name].astype("category")
    return answer


@pytest.fixture
def two_df_with_new_values():
    """Two datasets: the second one has `regressor_0` values absent from the first one."""
    df1 = _two_segment_dataset([5, 8, 5, 9, 5, 9])
    df2 = _two_segment_dataset([5, 8, 9, 5, 0, 0])
    return df1, df2


@pytest.fixture
def df_for_ohe_encoding():
    """Dataframe and the expected one-hot encodings of `regressor_0`, `regressor_1`, `regressor_2`."""
    df = _one_segment_regressors_df()
    answers = (
        _with_indicator_columns(df, "regressor_0", {"test_0": 5, "test_1": 8}),
        _with_indicator_columns(df, "regressor_1", {"test_0": 5, "test_1": 9}),
        _with_indicator_columns(df, "regressor_2", {"test_0": 0}),
    )
    return df, answers


@pytest.fixture
def df_for_label_encoding():
    """Dataframe and the expected label encodings of `regressor_0`, `regressor_1`, `regressor_2`."""
    df = _one_segment_regressors_df()
    answers = (
        _with_indicator_columns(df, "regressor_0", {"test": 8}),
        _with_indicator_columns(df, "regressor_1", {"test": 9}),
        # `regressor_2` is constant 0, so no row equals 1 and every expected code is 0.0.
        _with_indicator_columns(df, "regressor_2", {"test": 1}),
    )
    return df, answers


@pytest.fixture
def df_for_naming():
    """Dataframe with one exogenous column named `regressor_1` and one named `2`."""
    df_to_forecast = generate_ar_df(10, start_time="2021-01-01", n_segments=1)
    df_regressors = generate_periodic_df(12, start_time="2021-01-01", scale=10, period=2, n_segments=2)
    df_regressors = df_regressors.pivot(index="timestamp", columns="segment").reset_index()
    df_regressors.columns = ["timestamp"] + ["regressor_1", "2"]
    df_regressors["segment"] = "segment_0"
    df_to_forecast = TSDataset.to_dataset(df_to_forecast)
    df_regressors = TSDataset.to_dataset(df_regressors)
    tsdataset = TSDataset(df=df_to_forecast, freq="D", df_exog=df_regressors)
    return tsdataset.df


@pytest.mark.parametrize("regressor_idx", [0, 1, 2])
def test_label_encoder_simple(df_for_label_encoding, regressor_idx):
    """Test that LabelEncoderTransform works correctly in simple cases."""
    df, answers = df_for_label_encoding
    le = LabelEncoderTransform(in_column=f"regressor_{regressor_idx}", out_column="test")
    le.fit(df)
    # Transform once and reuse the result for both the column list and the comparison.
    transformed = le.transform(df)["segment_0"]
    assert transformed.equals(answers[regressor_idx][transformed.columns])


@pytest.mark.parametrize("regressor_idx", [0, 1, 2])
def test_ohe_encoder_simple(df_for_ohe_encoding, regressor_idx):
    """Test that OneHotEncoderTransform works correctly in simple cases."""
    df, answers = df_for_ohe_encoding
    ohe = OneHotEncoderTransform(in_column=f"regressor_{regressor_idx}", out_column="test")
    ohe.fit(df)
    # Transform once and reuse the result for both the column list and the comparison.
    transformed = ohe.transform(df)["segment_0"]
    assert transformed.equals(answers[regressor_idx][transformed.columns])
def test_value_error_label_encoder(df_for_label_encoding):
    """Check that an unknown strategy name makes LabelEncoderTransform raise ValueError."""
    frame, _ = df_for_label_encoding
    with pytest.raises(ValueError, match="The strategy"):
        encoder = LabelEncoderTransform(in_column="target", strategy="new_vlue")
        encoder.fit(frame)
        encoder.transform(frame)


@pytest.mark.parametrize(
    "strategy, expected_values",
    [
        ("new_value", np.array([[5, 0, 1, 5, 0, 4], [8, 1, 2, 0, -1, 5], [9, -1, 3, 0, -1, 6]])),
        ("none", np.array([[5, 0, 1, 5, 0, 4], [8, 1, 2, 0, np.nan, 5], [9, np.nan, 3, 0, np.nan, 6]])),
        ("mean", np.array([[5, 0, 1, 5, 0, 4], [8, 1, 2, 0, 0, 5], [9, 0.5, 3, 0, 0, 6]])),
    ],
)
def test_new_value_label_encoder(two_df_with_new_values, strategy, expected_values):
    """Check how LabelEncoderTransform encodes values that were absent during fit."""
    fit_df, transform_df = two_df_with_new_values
    encoder = LabelEncoderTransform(in_column="regressor_0", strategy=strategy)
    encoder.fit(fit_df)
    actual_values = encoder.transform(transform_df).values
    np.testing.assert_array_almost_equal(actual_values, expected_values)


def test_new_value_ohe_encoder(two_df_with_new_values):
    """Check how OneHotEncoderTransform encodes values that were absent during fit."""
    fit_df, transform_df = two_df_with_new_values
    encoder = OneHotEncoderTransform(in_column="regressor_0", out_column="targets")
    encoder.fit(fit_df)
    expected_values = np.array(
        [
            [5.0, 1.0, 1.0, 0.0, 5.0, 4.0, 1.0, 0.0],
            [8.0, 2.0, 0.0, 1.0, 0.0, 5.0, 0.0, 0.0],
            [9.0, 3.0, 0.0, 0.0, 0.0, 6.0, 0.0, 0.0],
        ]
    )
    actual_values = encoder.transform(transform_df).values
    np.testing.assert_array_almost_equal(actual_values, expected_values)


def test_naming_ohe_encoder(two_df_with_new_values):
    """Check the names of the columns produced by OneHotEncoderTransform with `out_column` set."""
    fit_df, transform_df = two_df_with_new_values
    encoder = OneHotEncoderTransform(in_column="regressor_0", out_column="targets")
    encoder.fit(fit_df)
    expected_columns = {
        (segment, feature)
        for segment in ("segment_0", "segment_1")
        for feature in ("target", "targets_0", "targets_1", "regressor_0")
    }
    actual_columns = set(encoder.transform(transform_df).columns.values)
    assert expected_columns == actual_columns
@pytest.mark.parametrize(
    "in_column, prefix",
    [("2", ""), ("regressor_1", "regressor_")],
)
def test_naming_ohe_encoder_no_out_column(df_for_naming, in_column, prefix):
    """Check the default column names of OneHotEncoderTransform when `out_column` is omitted."""
    encoder = OneHotEncoderTransform(in_column=in_column)
    encoder.fit(df_for_naming)
    base_name = prefix + repr(encoder)
    expected_columns = {*df_for_naming["segment_0"].columns, base_name + "_0", base_name + "_1"}
    actual_columns = set(encoder.transform(df_for_naming)["segment_0"].columns.values)
    assert expected_columns == actual_columns


@pytest.mark.parametrize(
    "in_column, prefix",
    [("2", ""), ("regressor_1", "regressor_")],
)
def test_naming_label_encoder_no_out_column(df_for_naming, in_column, prefix):
    """Check the default column name of LabelEncoderTransform when `out_column` is omitted."""
    encoder = LabelEncoderTransform(in_column=in_column)
    encoder.fit(df_for_naming)
    expected_columns = {*df_for_naming["segment_0"].columns, prefix + repr(encoder)}
    actual_columns = set(encoder.transform(df_for_naming)["segment_0"].columns.values)
    assert expected_columns == actual_columns


@pytest.fixture
def ts_for_ohe_sanity():
    """Dataset whose target is the noisy square of a periodic `regressor_0`."""
    target_df = generate_const_df(periods=100, start_time="2021-01-01", scale=0, n_segments=1)
    regressors_df = generate_periodic_df(periods=120, start_time="2021-01-01", scale=10, period=4, n_segments=1)
    regressors_df = regressors_df.pivot(index="timestamp", columns="segment").reset_index()
    regressors_df.columns = ["timestamp", "regressor_0"]
    regressors_df["segment"] = "segment_0"
    target_df = TSDataset.to_dataset(target_df)
    regressors_df = TSDataset.to_dataset(regressors_df)
    rng = np.random.default_rng(12345)

    def noisy_square(value):
        # One draw per element, in element order, so the seeded sequence is reproducible.
        return value ** 2 + rng.normal(0, 0.01)

    regressor_head = regressors_df["segment_0"]["regressor_0"][:100]
    target_df["segment_0", "target"] = regressor_head.apply(noisy_square)
    return TSDataset(df=target_df, freq="D", df_exog=regressors_df)


def test_ohe_sanity(ts_for_ohe_sanity):
    """Check OneHotEncoderTransform inside a full fit / forecast pipeline."""
    horizon = 10
    train_ts, test_ts = ts_for_ohe_sanity.train_test_split(test_size=horizon)
    transforms = [
        OneHotEncoderTransform(in_column="regressor_0"),
        FilterFeaturesTransform(exclude=["regressor_0"]),
    ]
    train_ts.fit_transform(transforms)
    model = LinearPerSegmentModel()
    model.fit(train_ts)
    forecast_ts = model.forecast(train_ts.make_future(horizon))
    score = R2()(test_ts, forecast_ts)["segment_0"]
    assert 1 - score < 1e-5