Skip to content

Commit

Permalink
Merge pull request #398 from tinkoff-ai/issue-377
Browse files Browse the repository at this point in the history
Fix bug with column names in `SklearnTransform`
  • Loading branch information
alex-hse-repository committed Dec 24, 2021
1 parent dda8906 commit 09d4f4a
Show file tree
Hide file tree
Showing 8 changed files with 359 additions and 170 deletions.
1 change: 1 addition & 0 deletions etna/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from etna.core.mixins import BaseMixin
from etna.core.mixins import StringEnumWithRepr
9 changes: 9 additions & 0 deletions etna/core/mixins.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import inspect
import warnings
from enum import Enum


class BaseMixin:
Expand All @@ -24,3 +25,11 @@ def __repr__(self):
warnings.warn(f"You haven't set all parameters inside class __init__ method: {e}")
args_str_representation += f"{arg} = {value.__repr__()}, "
return f"{self.__class__.__name__}({args_str_representation})"


class StringEnumWithRepr(str, Enum):
    """Base class for str enums, that has alternative __repr__ method.

    Members are ``str`` instances (via the ``str`` mixin), and their ``repr``
    is the repr of the underlying string value rather than the default
    ``<ClassName.member: 'value'>`` form produced by ``Enum``.
    """

    def __repr__(self) -> str:
        """Get string representation for enum string so that enum can be created from it.

        Returns
        -------
        str
            The repr of the member's value, e.g. ``'macro'`` for a member whose
            value is ``"macro"``. Passing that value back to the enum class
            yields the same member.
        """
        # Use the builtin instead of calling the dunder directly.
        return repr(self.value)
35 changes: 16 additions & 19 deletions etna/transforms/power.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import warnings
from typing import List
from typing import Optional
from typing import Union
Expand Down Expand Up @@ -26,27 +25,26 @@ def __init__(
Parameters
----------
in_column:
name of processed column
columns to be transformed, if None - all columns will be transformed.
inplace:
if True, apply transformation inplace to in_column,
if False, add column to dataset.
out_column:
name of added column. Use self.__repr__() if not given
base for the names of generated columns, uses self.__repr__() if not given.
standardize:
Set to True to apply zero-mean, unit-variance normalization to the
transformed output.
Raises
------
ValueError:
if incorrect mode given
"""
if inplace and (out_column is not None):
warnings.warn("Transformation will be applied inplace, out_column param will be ignored")
self.standardize = standardize
self.inplace = inplace
self.out_column = out_column
self.mode = TransformMode(mode)
self.in_column = [in_column] if isinstance(in_column, str) else in_column
super().__init__(
in_column=in_column,
inplace=inplace,
out_column=self.out_column if self.out_column is not None else self.__repr__(),
out_column=out_column,
transformer=PowerTransformer(method="yeo-johnson", standardize=self.standardize),
mode=mode,
)
Expand All @@ -69,27 +67,26 @@ def __init__(
Parameters
----------
in_column:
name of processed column
columns to be transformed, if None - all columns will be transformed.
inplace:
if True, apply transformation inplace to in_column,
if False, add column to dataset.
out_column:
name of added column. Use self.__repr__() if not given.
base for the names of generated columns, uses self.__repr__() if not given.
standardize:
Set to True to apply zero-mean, unit-variance normalization to the
transformed output.
Raises
------
ValueError:
if incorrect mode given
"""
if inplace and (out_column is not None):
warnings.warn("Transformation will be applied inplace, out_column param will be ignored")
self.standardize = standardize
self.in_column = [in_column] if isinstance(in_column, str) else in_column
self.inplace = inplace
self.out_column = out_column
self.mode = TransformMode(mode)
super().__init__(
in_column=in_column,
inplace=inplace,
out_column=self.out_column if self.out_column is not None else self.__repr__(),
out_column=out_column,
transformer=PowerTransformer(method="box-cox", standardize=self.standardize),
mode=mode,
)
Expand Down
53 changes: 14 additions & 39 deletions etna/transforms/scalers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import warnings
from typing import List
from typing import Optional
from typing import Tuple
Expand Down Expand Up @@ -38,7 +37,7 @@ def __init__(
inplace:
features are changed by scaled.
out_column:
name of added column. Use self.__repr__() if not given.
base for the names of generated columns, uses self.__repr__() if not given.
with_mean:
if True, center the data before scaling.
with_std:
Expand All @@ -53,18 +52,12 @@ def __init__(
ValueError:
if incorrect mode given
"""
if inplace and (out_column is not None):
warnings.warn("Transformation will be applied inplace, out_column param will be ignored")
self.in_column = [in_column] if isinstance(in_column, str) else in_column
self.inplace = inplace
self.mode = TransformMode(mode)
self.with_mean = with_mean
self.with_std = with_std
self.out_column = out_column
super().__init__(
in_column=in_column,
transformer=StandardScaler(with_mean=with_mean, with_std=with_std, copy=True),
out_column=self.out_column if self.out_column is not None else self.__repr__(),
transformer=StandardScaler(with_mean=self.with_mean, with_std=self.with_std, copy=True),
out_column=out_column,
inplace=inplace,
mode=mode,
)
Expand Down Expand Up @@ -97,7 +90,7 @@ def __init__(
inplace:
features are changed by scaled.
out_column:
name of added column. Use self.__repr__() if not given.
base for the names of generated columns, uses self.__repr__() if not given.
with_centering:
if True, center the data before scaling.
with_scaling:
Expand All @@ -119,25 +112,19 @@ def __init__(
ValueError:
if incorrect mode given
"""
if inplace and (out_column is not None):
warnings.warn("Transformation will be applied inplace, out_column param will be ignored")
self.in_column = [in_column] if isinstance(in_column, str) else in_column
self.out_column = out_column
self.inplace = inplace
self.mode = TransformMode(mode)
self.with_centering = with_centering
self.with_scaling = with_scaling
self.quantile_range = quantile_range
self.unit_variance = unit_variance
super().__init__(
in_column=in_column,
inplace=inplace,
out_column=self.out_column if self.out_column is not None else self.__repr__(),
out_column=out_column,
transformer=RobustScaler(
with_centering=with_centering,
with_scaling=with_scaling,
quantile_range=quantile_range,
unit_variance=unit_variance,
with_centering=self.with_centering,
with_scaling=self.with_scaling,
quantile_range=self.quantile_range,
unit_variance=self.unit_variance,
copy=True,
),
mode=mode,
Expand Down Expand Up @@ -169,7 +156,7 @@ def __init__(
inplace:
features are changed by scaled.
out_column:
name of added column. Use self.__repr__() if not given.
base for the names of generated columns, uses self.__repr__() if not given.
feature_range:
desired range of transformed data.
clip:
Expand All @@ -184,19 +171,13 @@ def __init__(
ValueError:
if incorrect mode given
"""
if inplace and (out_column is not None):
warnings.warn("Transformation will be applied inplace, out_column param will be ignored")
self.in_column = [in_column] if isinstance(in_column, str) else in_column
self.out_column = out_column
self.inplace = inplace
self.mode = TransformMode(mode)
self.feature_range = feature_range
self.clip = clip
super().__init__(
in_column=in_column,
inplace=inplace,
out_column=self.out_column if self.out_column is not None else self.__repr__(),
transformer=MinMaxScaler(feature_range=feature_range, clip=clip, copy=True),
out_column=out_column,
transformer=MinMaxScaler(feature_range=self.feature_range, clip=self.clip, copy=True),
mode=mode,
)

Expand All @@ -223,7 +204,7 @@ def __init__(
inplace:
features are changed by scaled.
out_column:
name of added column. Use self.__repr__() if not given.
base for the names of generated columns, uses self.__repr__() if not given.
mode:
"macro" or "per-segment", way to transform features over segments.
If "macro", transforms features globally, gluing the corresponding ones for all segments.
Expand All @@ -234,16 +215,10 @@ def __init__(
ValueError:
if incorrect mode given
"""
if inplace and (out_column is not None):
warnings.warn("Transformation will be applied inplace, out_column param will be ignored")
self.in_column = [in_column] if isinstance(in_column, str) else in_column
self.inplace = inplace
self.mode = TransformMode(mode)
self.out_column = out_column
super().__init__(
in_column=in_column,
inplace=inplace,
out_column=self.out_column if self.out_column is not None else self.__repr__(),
out_column=out_column,
transformer=MaxAbsScaler(copy=True),
mode=mode,
)
Expand Down
43 changes: 33 additions & 10 deletions etna/transforms/sklearn.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
from enum import Enum
import warnings
from copy import deepcopy
from typing import List
from typing import Union

import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin

from etna.core import StringEnumWithRepr
from etna.transforms.base import Transform


class TransformMode(str, Enum):
class TransformMode(StringEnumWithRepr):
"""Enum for different metric aggregation modes."""

macro = "macro"
Expand All @@ -21,8 +23,8 @@ class SklearnTransform(Transform):

def __init__(
self,
in_column: Union[str, List[str]],
out_column: str,
in_column: Optional[Union[str, List[str]]],
out_column: Optional[str],
transformer: TransformerMixin,
inplace: bool = True,
mode: Union[TransformMode, str] = "per-segment",
Expand All @@ -33,13 +35,13 @@ def __init__(
Parameters
----------
in_column:
columns to be transformed, if None - all columns will be scaled.
columns to be transformed, if None - all columns will be transformed.
transformer:
sklearn.base.TransformerMixin instance.
inplace:
features are changed by transformed.
out_column:
name of result column
base for the names of generated columns, uses self.__repr__() if not given.
mode:
"macro" or "per-segment", way to transform features over segments.
If "macro", transforms features globally, gluing the corresponding ones for all segments.
Expand All @@ -50,13 +52,28 @@ def __init__(
ValueError:
if incorrect mode given
"""
if inplace and (out_column is not None):
warnings.warn("Transformation will be applied inplace, out_column param will be ignored")

self.transformer = transformer

if isinstance(in_column, str):
in_column = [in_column]
self.in_column = in_column if in_column is None else sorted(in_column)

self.inplace = inplace
self.mode = TransformMode(mode)
self.out_column_name = out_column
self.out_column = out_column

self.out_columns: Optional[List[str]] = None

def _get_column_name(self, in_column: str) -> str:
    """Build the name of the output column generated for one input column.

    Parameters
    ----------
    in_column:
        name of the input column the output column is derived from.

    Returns
    -------
    str
        ``"{out_column}_{in_column}"`` when ``self.out_column`` is set;
        otherwise the ``repr`` of a copy of this transform whose ``in_column``
        is narrowed to ``[in_column]``.
    """
    if self.out_column is not None:
        return f"{self.out_column}_{in_column}"

    # Work on a deep copy so that narrowing ``in_column`` for the purpose of
    # building the name never mutates the transform itself.
    single_column_transform = deepcopy(self)
    single_column_transform.in_column = [in_column]
    return repr(single_column_transform)

def fit(self, df: pd.DataFrame) -> "SklearnTransform":
"""
Expand All @@ -72,14 +89,22 @@ def fit(self, df: pd.DataFrame) -> "SklearnTransform":
self
"""
segments = sorted(set(df.columns.get_level_values("segment")))

if self.in_column is None:
self.in_column = sorted(set(df.columns.get_level_values("feature")))

if self.inplace:
self.out_columns = self.in_column
else:
self.out_columns = [self._get_column_name(column) for column in self.in_column]

if self.mode == TransformMode.per_segment:
x = df.loc[:, (segments, self.in_column)].values
elif self.mode == TransformMode.macro:
x = self._reshape(df)
else:
raise ValueError(f"'{self.mode}' is not a valid TransformMode.")

self.transformer.fit(X=x)
return self

Expand Down Expand Up @@ -113,9 +138,7 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
transformed_features = pd.DataFrame(
transformed, columns=df.loc[:, (segments, self.in_column)].columns, index=df.index
)
transformed_features.columns = pd.MultiIndex.from_tuples(
[(segment_name, self.out_column_name) for segment_name, feature_name in transformed_features.columns]
)
transformed_features.columns = pd.MultiIndex.from_product([segments, self.out_columns])
df = pd.concat((df, transformed_features), axis=1)
df = df.sort_index(axis=1)

Expand Down
Loading

0 comments on commit 09d4f4a

Please sign in to comment.