Skip to content

Commit

Permalink
Merge pull request #398 from tinkoff-ai/issue-377
Browse files Browse the repository at this point in the history
Fix bug with column names in `SklearnTransform`
  • Loading branch information
alex-hse-repository authored and martins0n committed Dec 23, 2021
1 parent e52dec3 commit 89b3d50
Show file tree
Hide file tree
Showing 9 changed files with 359 additions and 169 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Catboost version too high ([#394](https://github.com/tinkoff-ai/etna/pull/394))
- Add sorting of classes in left bar in docs ([#397](https://github.com/tinkoff-ai/etna/pull/397))
- nn notebook in docs ([#396](https://github.com/tinkoff-ai/etna/pull/396))
- SklearnTransform column name generation ([#398](https://github.com/tinkoff-ai/etna/pull/398))

## [1.4.2] - 2021-12-09
### Fixed
Expand Down
1 change: 1 addition & 0 deletions etna/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from etna.core.mixins import BaseMixin
from etna.core.mixins import StringEnumWithRepr
9 changes: 9 additions & 0 deletions etna/core/mixins.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import inspect
import warnings
from enum import Enum


class BaseMixin:
Expand All @@ -24,3 +25,11 @@ def __repr__(self):
warnings.warn(f"You haven't set all parameters inside class __init__ method: {e}")
args_str_representation += f"{arg} = {value.__repr__()}, "
return f"{self.__class__.__name__}({args_str_representation})"


class StringEnumWithRepr(str, Enum):
    """Base class for str enums whose ``repr`` is the repr of the member's value.

    The default ``Enum.__repr__`` produces ``<ClassName.member: 'value'>``. Here
    ``repr(member)`` is the quoted string value instead (e.g. ``'macro'``), so the
    text can be passed back to the enum constructor to recreate the member.
    """

    def __repr__(self) -> str:
        """Get string representation for enum string so that enum can be created from it."""
        # Use the builtin rather than calling the dunder method directly.
        return repr(self.value)
35 changes: 16 additions & 19 deletions etna/transforms/power.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import warnings
from typing import List
from typing import Optional
from typing import Union
Expand Down Expand Up @@ -26,27 +25,26 @@ def __init__(
Parameters
----------
in_column:
name of processed column
columns to be transformed, if None - all columns will be transformed.
inplace:
if True, apply transformation inplace to in_column,
if False, add column to dataset.
out_column:
name of added column. Use self.__repr__() if not given
base for the names of generated columns, uses self.__repr__() if not given.
standardize:
Set to True to apply zero-mean, unit-variance normalization to the
transformed output.
Raises
------
ValueError:
if incorrect mode given
"""
if inplace and (out_column is not None):
warnings.warn("Transformation will be applied inplace, out_column param will be ignored")
self.standardize = standardize
self.inplace = inplace
self.out_column = out_column
self.mode = TransformMode(mode)
self.in_column = [in_column] if isinstance(in_column, str) else in_column
super().__init__(
in_column=in_column,
inplace=inplace,
out_column=self.out_column if self.out_column is not None else self.__repr__(),
out_column=out_column,
transformer=PowerTransformer(method="yeo-johnson", standardize=self.standardize),
mode=mode,
)
Expand All @@ -69,27 +67,26 @@ def __init__(
Parameters
----------
in_column:
name of processed column
columns to be transformed, if None - all columns will be transformed.
inplace:
if True, apply transformation inplace to in_column,
if False, add column to dataset.
out_column:
name of added column. Use self.__repr__() if not given.
base for the names of generated columns, uses self.__repr__() if not given.
standardize:
Set to True to apply zero-mean, unit-variance normalization to the
transformed output.
Raises
------
ValueError:
if incorrect mode given
"""
if inplace and (out_column is not None):
warnings.warn("Transformation will be applied inplace, out_column param will be ignored")
self.standardize = standardize
self.in_column = [in_column] if isinstance(in_column, str) else in_column
self.inplace = inplace
self.out_column = out_column
self.mode = TransformMode(mode)
super().__init__(
in_column=in_column,
inplace=inplace,
out_column=self.out_column if self.out_column is not None else self.__repr__(),
out_column=out_column,
transformer=PowerTransformer(method="box-cox", standardize=self.standardize),
mode=mode,
)
Expand Down
53 changes: 14 additions & 39 deletions etna/transforms/scalers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import warnings
from typing import List
from typing import Optional
from typing import Tuple
Expand Down Expand Up @@ -38,7 +37,7 @@ def __init__(
inplace:
if True, features are replaced by their scaled values; if False, scaled columns are added to the dataset.
out_column:
name of added column. Use self.__repr__() if not given.
base for the names of generated columns, uses self.__repr__() if not given.
with_mean:
if True, center the data before scaling.
with_std:
Expand All @@ -53,18 +52,12 @@ def __init__(
ValueError:
if incorrect mode given
"""
if inplace and (out_column is not None):
warnings.warn("Transformation will be applied inplace, out_column param will be ignored")
self.in_column = [in_column] if isinstance(in_column, str) else in_column
self.inplace = inplace
self.mode = TransformMode(mode)
self.with_mean = with_mean
self.with_std = with_std
self.out_column = out_column
super().__init__(
in_column=in_column,
transformer=StandardScaler(with_mean=with_mean, with_std=with_std, copy=True),
out_column=self.out_column if self.out_column is not None else self.__repr__(),
transformer=StandardScaler(with_mean=self.with_mean, with_std=self.with_std, copy=True),
out_column=out_column,
inplace=inplace,
mode=mode,
)
Expand Down Expand Up @@ -97,7 +90,7 @@ def __init__(
inplace:
features are changed by scaled.
out_column:
name of added column. Use self.__repr__() if not given.
base for the names of generated columns, uses self.__repr__() if not given.
with_centering:
if True, center the data before scaling.
with_scaling:
Expand All @@ -119,25 +112,19 @@ def __init__(
ValueError:
if incorrect mode given
"""
if inplace and (out_column is not None):
warnings.warn("Transformation will be applied inplace, out_column param will be ignored")
self.in_column = [in_column] if isinstance(in_column, str) else in_column
self.out_column = out_column
self.inplace = inplace
self.mode = TransformMode(mode)
self.with_centering = with_centering
self.with_scaling = with_scaling
self.quantile_range = quantile_range
self.unit_variance = unit_variance
super().__init__(
in_column=in_column,
inplace=inplace,
out_column=self.out_column if self.out_column is not None else self.__repr__(),
out_column=out_column,
transformer=RobustScaler(
with_centering=with_centering,
with_scaling=with_scaling,
quantile_range=quantile_range,
unit_variance=unit_variance,
with_centering=self.with_centering,
with_scaling=self.with_scaling,
quantile_range=self.quantile_range,
unit_variance=self.unit_variance,
copy=True,
),
mode=mode,
Expand Down Expand Up @@ -169,7 +156,7 @@ def __init__(
inplace:
features are changed by scaled.
out_column:
name of added column. Use self.__repr__() if not given.
base for the names of generated columns, uses self.__repr__() if not given.
feature_range:
desired range of transformed data.
clip:
Expand All @@ -184,19 +171,13 @@ def __init__(
ValueError:
if incorrect mode given
"""
if inplace and (out_column is not None):
warnings.warn("Transformation will be applied inplace, out_column param will be ignored")
self.in_column = [in_column] if isinstance(in_column, str) else in_column
self.out_column = out_column
self.inplace = inplace
self.mode = TransformMode(mode)
self.feature_range = feature_range
self.clip = clip
super().__init__(
in_column=in_column,
inplace=inplace,
out_column=self.out_column if self.out_column is not None else self.__repr__(),
transformer=MinMaxScaler(feature_range=feature_range, clip=clip, copy=True),
out_column=out_column,
transformer=MinMaxScaler(feature_range=self.feature_range, clip=self.clip, copy=True),
mode=mode,
)

Expand All @@ -223,7 +204,7 @@ def __init__(
inplace:
features are changed by scaled.
out_column:
name of added column. Use self.__repr__() if not given.
base for the names of generated columns, uses self.__repr__() if not given.
mode:
"macro" or "per-segment", way to transform features over segments.
If "macro", transforms features globally, gluing the corresponding ones for all segments.
Expand All @@ -234,16 +215,10 @@ def __init__(
ValueError:
if incorrect mode given
"""
if inplace and (out_column is not None):
warnings.warn("Transformation will be applied inplace, out_column param will be ignored")
self.in_column = [in_column] if isinstance(in_column, str) else in_column
self.inplace = inplace
self.mode = TransformMode(mode)
self.out_column = out_column
super().__init__(
in_column=in_column,
inplace=inplace,
out_column=self.out_column if self.out_column is not None else self.__repr__(),
out_column=out_column,
transformer=MaxAbsScaler(copy=True),
mode=mode,
)
Expand Down
41 changes: 32 additions & 9 deletions etna/transforms/sklearn.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from enum import Enum
import warnings
from copy import deepcopy
from typing import List
from typing import Optional
from typing import Union
Expand All @@ -7,10 +8,11 @@
import pandas as pd
from sklearn.base import TransformerMixin

from etna.core import StringEnumWithRepr
from etna.transforms.base import Transform


class TransformMode(str, Enum):
class TransformMode(StringEnumWithRepr):
"""Enum for different modes of applying a transform over segments."""

macro = "macro"
Expand All @@ -23,7 +25,7 @@ class SklearnTransform(Transform):
def __init__(
self,
in_column: Optional[Union[str, List[str]]],
out_column: str,
out_column: Optional[str],
transformer: TransformerMixin,
inplace: bool = True,
mode: Union[TransformMode, str] = "per-segment",
Expand All @@ -34,13 +36,13 @@ def __init__(
Parameters
----------
in_column:
columns to be transformed, if None - all columns will be scaled.
columns to be transformed, if None - all columns will be transformed.
transformer:
sklearn.base.TransformerMixin instance.
inplace:
if True, features are replaced by their transformed values; if False, transformed columns are added to the dataset.
out_column:
name of result column
base for the names of generated columns, uses self.__repr__() if not given.
mode:
"macro" or "per-segment", way to transform features over segments.
If "macro", transforms features globally, gluing the corresponding ones for all segments.
Expand All @@ -51,13 +53,28 @@ def __init__(
ValueError:
if incorrect mode given
"""
if inplace and (out_column is not None):
warnings.warn("Transformation will be applied inplace, out_column param will be ignored")

self.transformer = transformer

if isinstance(in_column, str):
in_column = [in_column]
self.in_column = in_column if in_column is None else sorted(in_column)

self.inplace = inplace
self.mode = TransformMode(mode)
self.out_column_name = out_column
self.out_column = out_column

self.out_columns: Optional[List[str]] = None

def _get_column_name(self, in_column: str) -> str:
if self.out_column is None:
new_transform = deepcopy(self)
new_transform.in_column = [in_column]
return f"{new_transform.__repr__()}"
else:
return f"{self.out_column}_{in_column}"

def fit(self, df: pd.DataFrame) -> "SklearnTransform":
"""
Expand All @@ -73,14 +90,22 @@ def fit(self, df: pd.DataFrame) -> "SklearnTransform":
self
"""
segments = sorted(set(df.columns.get_level_values("segment")))

if self.in_column is None:
self.in_column = sorted(set(df.columns.get_level_values("feature")))

if self.inplace:
self.out_columns = self.in_column
else:
self.out_columns = [self._get_column_name(column) for column in self.in_column]

if self.mode == TransformMode.per_segment:
x = df.loc[:, (segments, self.in_column)].values
elif self.mode == TransformMode.macro:
x = self._reshape(df)
else:
raise ValueError(f"'{self.mode}' is not a valid TransformMode.")

self.transformer.fit(X=x)
return self

Expand Down Expand Up @@ -114,9 +139,7 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
transformed_features = pd.DataFrame(
transformed, columns=df.loc[:, (segments, self.in_column)].columns, index=df.index
)
transformed_features.columns = pd.MultiIndex.from_tuples(
[(segment_name, self.out_column_name) for segment_name, feature_name in transformed_features.columns]
)
transformed_features.columns = pd.MultiIndex.from_product([segments, self.out_columns])
df = pd.concat((df, transformed_features), axis=1)
df = df.sort_index(axis=1)

Expand Down
Loading

0 comments on commit 89b3d50

Please sign in to comment.