From 571013efec602f91529015e38ff638742d75684f Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Tue, 21 Dec 2021 10:26:44 +0300 Subject: [PATCH 1/9] Fix bug with column names in SklearnTransform, refactor code, add new tests on transforms --- etna/core/__init__.py | 1 + etna/core/mixins.py | 9 + etna/transforms/power.py | 19 +- etna/transforms/scalers.py | 29 +- etna/transforms/sklearn.py | 38 ++- .../test_interface_transform.py | 272 ++++++++++++++++++ .../test_power_transform.py | 20 -- .../test_scalers_transform.py | 84 +----- 8 files changed, 331 insertions(+), 141 deletions(-) create mode 100644 tests/test_transforms/test_sklearn_transform/test_interface_transform.py rename tests/test_transforms/{ => test_sklearn_transform}/test_power_transform.py (82%) rename tests/test_transforms/{ => test_sklearn_transform}/test_scalers_transform.py (61%) diff --git a/etna/core/__init__.py b/etna/core/__init__.py index 5dd1c90cc..d2f2da86d 100644 --- a/etna/core/__init__.py +++ b/etna/core/__init__.py @@ -1 +1,2 @@ from etna.core.mixins import BaseMixin +from etna.core.mixins import StringEnumWithRepr diff --git a/etna/core/mixins.py b/etna/core/mixins.py index cd2314870..8d63dde35 100644 --- a/etna/core/mixins.py +++ b/etna/core/mixins.py @@ -1,5 +1,6 @@ import inspect import warnings +from enum import Enum class BaseMixin: @@ -24,3 +25,11 @@ def __repr__(self): warnings.warn(f"You haven't set all parameters inside class __init__ method: {e}") args_str_representation += f"{arg} = {value.__repr__()}, " return f"{self.__class__.__name__}({args_str_representation})" + + +class StringEnumWithRepr(str, Enum): + """Base class for str enum objects.""" + + def __repr__(self): + """Get string representation for enum strings.""" + return self.value.__repr__() diff --git a/etna/transforms/power.py b/etna/transforms/power.py index dd6a2fb48..ac4522864 100644 --- a/etna/transforms/power.py +++ b/etna/transforms/power.py @@ -1,4 +1,3 @@ -import warnings from typing import List from typing import Optional from typing import Union @@ -26,27 +25,24 @@ def __init__( Parameters ---------- in_column: - name of processed column + columns to be transformed, if None - all columns will be transformed. inplace: if True, apply transformation inplace to in_column, if False, add column to dataset. out_column: - name of added column. Use self.__repr__() if not given + base for the names of generated columns, uses self.__repr__() if not given. standardize: Set to True to apply zero-mean, unit-variance normalization to the transformed output. """ - if inplace and (out_column is not None): - warnings.warn("Transformation will be applied inplace, out_column param will be ignored") self.standardize = standardize self.inplace = inplace self.out_column = out_column self.mode = TransformMode(mode) - self.in_column = [in_column] if isinstance(in_column, str) else in_column super().__init__( in_column=in_column, inplace=inplace, - out_column=self.out_column if self.out_column is not None else self.__repr__(), + out_column=self.out_column, transformer=PowerTransformer(method="yeo-johnson", standardize=self.standardize), mode=mode, ) @@ -69,27 +65,24 @@ def __init__( Parameters ---------- in_column: - name of processed column + columns to be transformed, if None - all columns will be transformed. inplace: if True, apply transformation inplace to in_column, if False, add column to dataset. out_column: - name of added column. Use self.__repr__() if not given. + base for the names of generated columns, uses self.__repr__() if not given. 
standardize: Set to True to apply zero-mean, unit-variance normalization to the transformed output. """ - if inplace and (out_column is not None): - warnings.warn("Transformation will be applied inplace, out_column param will be ignored") self.standardize = standardize - self.in_column = [in_column] if isinstance(in_column, str) else in_column self.inplace = inplace self.out_column = out_column self.mode = TransformMode(mode) super().__init__( in_column=in_column, inplace=inplace, - out_column=self.out_column if self.out_column is not None else self.__repr__(), + out_column=self.out_column, transformer=PowerTransformer(method="box-cox", standardize=self.standardize), mode=mode, ) diff --git a/etna/transforms/scalers.py b/etna/transforms/scalers.py index fc58cf157..90f132e9c 100644 --- a/etna/transforms/scalers.py +++ b/etna/transforms/scalers.py @@ -1,4 +1,3 @@ -import warnings from typing import List from typing import Optional from typing import Tuple @@ -38,7 +37,7 @@ def __init__( inplace: features are changed by scaled. out_column: - name of added column. Use self.__repr__() if not given. + base for the names of generated columns, uses self.__repr__() if not given. with_mean: if True, center the data before scaling. with_std: @@ -53,9 +52,6 @@ def __init__( ValueError: if incorrect mode given """ - if inplace and (out_column is not None): - warnings.warn("Transformation will be applied inplace, out_column param will be ignored") - self.in_column = [in_column] if isinstance(in_column, str) else in_column self.inplace = inplace self.mode = TransformMode(mode) self.with_mean = with_mean @@ -64,7 +60,7 @@ def __init__( super().__init__( in_column=in_column, transformer=StandardScaler(with_mean=with_mean, with_std=with_std, copy=True), - out_column=self.out_column if self.out_column is not None else self.__repr__(), + out_column=self.out_column, inplace=inplace, mode=mode, ) @@ -97,7 +93,7 @@ def __init__( inplace: features are changed by scaled. out_column: - name of added column. Use self.__repr__() if not given. + base for the names of generated columns, uses self.__repr__() if not given. with_centering: if True, center the data before scaling. with_scaling: @@ -119,9 +115,6 @@ def __init__( ValueError: if incorrect mode given """ - if inplace and (out_column is not None): - warnings.warn("Transformation will be applied inplace, out_column param will be ignored") - self.in_column = [in_column] if isinstance(in_column, str) else in_column self.out_column = out_column self.inplace = inplace self.mode = TransformMode(mode) @@ -132,7 +125,7 @@ def __init__( super().__init__( in_column=in_column, inplace=inplace, - out_column=self.out_column if self.out_column is not None else self.__repr__(), + out_column=self.out_column, transformer=RobustScaler( with_centering=with_centering, with_scaling=with_scaling, @@ -169,7 +162,7 @@ def __init__( inplace: features are changed by scaled. out_column: - name of added column. Use self.__repr__() if not given. + base for the names of generated columns, uses self.__repr__() if not given. feature_range: desired range of transformed data. 
clip: @@ -184,9 +177,6 @@ def __init__( ValueError: if incorrect mode given """ - if inplace and (out_column is not None): - warnings.warn("Transformation will be applied inplace, out_column param will be ignored") - self.in_column = [in_column] if isinstance(in_column, str) else in_column self.out_column = out_column self.inplace = inplace self.mode = TransformMode(mode) @@ -195,7 +185,7 @@ def __init__( super().__init__( in_column=in_column, inplace=inplace, - out_column=self.out_column if self.out_column is not None else self.__repr__(), + out_column=self.out_column, transformer=MinMaxScaler(feature_range=feature_range, clip=clip, copy=True), mode=mode, ) @@ -223,7 +213,7 @@ def __init__( inplace: features are changed by scaled. out_column: - name of added column. Use self.__repr__() if not given. + base for the names of generated columns, uses self.__repr__() if not given. mode: "macro" or "per-segment", way to transform features over segments. If "macro", transforms features globally, gluing the corresponding ones for all segments. @@ -234,16 +224,13 @@ def __init__( ValueError: if incorrect mode given """ - if inplace and (out_column is not None): - warnings.warn("Transformation will be applied inplace, out_column param will be ignored") - self.in_column = [in_column] if isinstance(in_column, str) else in_column self.inplace = inplace self.mode = TransformMode(mode) self.out_column = out_column super().__init__( in_column=in_column, inplace=inplace, - out_column=self.out_column if self.out_column is not None else self.__repr__(), + out_column=self.out_column, transformer=MaxAbsScaler(copy=True), mode=mode, ) diff --git a/etna/transforms/sklearn.py b/etna/transforms/sklearn.py index f9cbecca4..9136b53a3 100644 --- a/etna/transforms/sklearn.py +++ b/etna/transforms/sklearn.py @@ -1,4 +1,5 @@ -from enum import Enum +import warnings +from copy import deepcopy from typing import List from typing import Optional from typing import Union @@ -7,10 +8,11 @@ import pandas as pd from sklearn.base import TransformerMixin +from etna.core import StringEnumWithRepr from etna.transforms.base import Transform -class TransformMode(str, Enum): +class TransformMode(StringEnumWithRepr): """Enum for different metric aggregation modes.""" macro = "macro" @@ -23,7 +25,7 @@ class SklearnTransform(Transform): def __init__( self, in_column: Optional[Union[str, List[str]]], - out_column: str, + out_column: Optional[str], transformer: TransformerMixin, inplace: bool = True, mode: Union[TransformMode, str] = "per-segment", @@ -34,13 +36,13 @@ def __init__( Parameters ---------- in_column: - columns to be transformed, if None - all columns will be scaled. + columns to be transformed, if None - all columns will be transformed. transformer: sklearn.base.TransformerMixin instance. inplace: features are changed by transformed. out_column: - name of result column + base for the names of generated columns, uses self.__repr__() if not given. mode: "macro" or "per-segment", way to transform features over segments. If "macro", transforms features globally, gluing the corresponding ones for all segments. 
@@ -51,13 +53,28 @@ def __init__( ValueError: if incorrect mode given """ + if inplace and (out_column is not None): + warnings.warn("Transformation will be applied inplace, out_column param will be ignored") + self.transformer = transformer + if isinstance(in_column, str): in_column = [in_column] self.in_column = in_column if in_column is None else sorted(in_column) + self.inplace = inplace self.mode = TransformMode(mode) - self.out_column_name = out_column + self.out_column = out_column + + self.out_columns: Optional[List[str]] = None + + def _get_column_name(self, in_column: str) -> str: + if self.out_column is None: + new_transform = deepcopy(self) + new_transform.in_column = [in_column] + return f"regressor_{new_transform.__repr__()}" + else: + return f"{self.out_column}_{in_column}" def fit(self, df: pd.DataFrame) -> "SklearnTransform": """ @@ -73,14 +90,19 @@ def fit(self, df: pd.DataFrame) -> "SklearnTransform": self """ segments = sorted(set(df.columns.get_level_values("segment"))) + if self.in_column is None: self.in_column = sorted(set(df.columns.get_level_values("feature"))) + + self.out_columns = [self._get_column_name(column) for column in self.in_column] + if self.mode == TransformMode.per_segment: x = df.loc[:, (segments, self.in_column)].values elif self.mode == TransformMode.macro: x = self._reshape(df) else: raise ValueError(f"'{self.mode}' is not a valid TransformMode.") + self.transformer.fit(X=x) return self @@ -114,9 +136,7 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: transformed_features = pd.DataFrame( transformed, columns=df.loc[:, (segments, self.in_column)].columns, index=df.index ) - transformed_features.columns = pd.MultiIndex.from_tuples( - [(segment_name, self.out_column_name) for segment_name, feature_name in transformed_features.columns] - ) + transformed_features.columns = pd.MultiIndex.from_product([segments, self.out_columns]) df = pd.concat((df, transformed_features), axis=1) df = df.sort_index(axis=1) diff --git a/tests/test_transforms/test_sklearn_transform/test_interface_transform.py b/tests/test_transforms/test_sklearn_transform/test_interface_transform.py new file mode 100644 index 000000000..77b3c9213 --- /dev/null +++ b/tests/test_transforms/test_sklearn_transform/test_interface_transform.py @@ -0,0 +1,272 @@ +import numpy as np +import pandas as pd +import pytest + +from etna.datasets import TSDataset +from etna.datasets import generate_const_df +from etna.transforms import BoxCoxTransform +from etna.transforms import MaxAbsScalerTransform +from etna.transforms import MinMaxScalerTransform +from etna.transforms import RobustScalerTransform +from etna.transforms import StandardScalerTransform +from etna.transforms import YeoJohnsonTransform + + +@pytest.fixture +def multicolumn_ts(random_seed): + df = generate_const_df(start_time="2020-01-01", periods=20, freq="D", scale=1.0, n_segments=3) + df["target"] += np.random.uniform(0, 0.1, size=df.shape[0]) + df_exog = df.copy().rename(columns={"target": "exog_1"}) + for i in range(2, 6): + df_exog[f"exog_{i}"] = float(i) + np.random.uniform(0, 0.1, size=df.shape[0]) + + df_formatted = TSDataset.to_dataset(df) + df_exog_formatted = TSDataset.to_dataset(df_exog) + + return TSDataset(df=df_formatted, df_exog=df_exog_formatted, freq="D") + + +@pytest.mark.parametrize( + "transform_constructor", + ( + BoxCoxTransform, + YeoJohnsonTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + MaxAbsScalerTransform, + StandardScalerTransform, + RobustScalerTransform, + 
MinMaxScalerTransform, + ), +) +def test_fail_invalid_mode(transform_constructor): + """Test that transform raises error in invalid mode.""" + with pytest.raises(ValueError): + _ = transform_constructor(mode="non_existent") + + +@pytest.mark.parametrize( + "transform_constructor", + ( + BoxCoxTransform, + YeoJohnsonTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + MaxAbsScalerTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + ), +) +def test_warning_not_inplace(transform_constructor): + """Test that transform raises warning if inplace is set to True, but out_column is also given.""" + with pytest.warns(UserWarning, match="Transformation will be applied inplace"): + _ = transform_constructor(inplace=True, out_column="new_exog") + + +@pytest.mark.parametrize( + "transform_constructor", + [ + BoxCoxTransform, + YeoJohnsonTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + MaxAbsScalerTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + ], +) +@pytest.mark.parametrize( + "in_column", + [ + "exog_1", + ["exog_1", "exog_2"], + ], +) +def test_inplace_no_new_columns(transform_constructor, in_column, multicolumn_ts): + """Test that transform in inplace mode doesn't generate new columns.""" + transform = transform_constructor(in_column=in_column, inplace=True) + initial_df = multicolumn_ts.to_pandas() + transformed_df = transform.fit_transform(multicolumn_ts.to_pandas()) + + new_columns = set( + transformed_df.columns.get_level_values("feature") + .difference(initial_df.columns.get_level_values("feature")) + .tolist() + ) + assert len(new_columns) == 0 + + +@pytest.mark.parametrize( + "transform_constructor", + [ + BoxCoxTransform, + YeoJohnsonTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + MaxAbsScalerTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + ], +) +@pytest.mark.parametrize( + "in_column", + [ + "exog_1", + ["exog_1", "exog_2"], + ], +) +def test_creating_columns(transform_constructor, in_column, multicolumn_ts): + """Test that transform creates new columns according to out_column parameter.""" + transform = transform_constructor(in_column=in_column, out_column="new_exog", inplace=False) + initial_df = multicolumn_ts.to_pandas() + transformed_df = transform.fit_transform(multicolumn_ts.to_pandas()) + + new_columns = set( + transformed_df.columns.get_level_values("feature") + .difference(initial_df.columns.get_level_values("feature")) + .tolist() + ) + in_column = [in_column] if isinstance(in_column, str) else in_column + expected_columns = {f"new_exog_{column}" for column in in_column} + assert new_columns == expected_columns + + +@pytest.mark.parametrize( + "transform_constructor", + [ + BoxCoxTransform, + YeoJohnsonTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + MaxAbsScalerTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + ], +) +@pytest.mark.parametrize( + "in_column", + [ + "exog_1", + ["exog_1", "exog_2"], + ], +) +def test_generated_column_names(transform_constructor, in_column, multicolumn_ts): + """Test that transform generates names for the columns correctly.""" + transform = transform_constructor(in_column=in_column, out_column=None, inplace=False) + initial_df = multicolumn_ts.to_pandas() + transformed_df = transform.fit_transform(multicolumn_ts.to_pandas()) + + 
columns = ( + transformed_df.columns.get_level_values("feature") + .difference(initial_df.columns.get_level_values("feature")) + .unique() + .tolist() + ) + + for column in columns: + transform_temp = eval(column[len("regressor_") :]) + df_temp = transform_temp.fit_transform(multicolumn_ts.to_pandas()) + columns_temp = ( + df_temp.columns.get_level_values("feature") + .difference(initial_df.columns.get_level_values("feature")) + .unique() + .tolist() + ) + assert len(columns_temp) == 1 + assert columns_temp[0] == column + + +@pytest.mark.parametrize( + "transform_constructor", + [ + BoxCoxTransform, + YeoJohnsonTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + MaxAbsScalerTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + ], +) +def test_all_columns(transform_constructor, multicolumn_ts): + """Test that transform can process all columns using None value for in_column.""" + transform = transform_constructor(in_column=None, out_column=None, inplace=False) + initial_df = multicolumn_ts.df.copy() + transformed_df = transform.fit_transform(multicolumn_ts.df) + + new_features = set( + transformed_df.columns.get_level_values("feature") + .difference(initial_df.columns.get_level_values("feature")) + .tolist() + ) + assert len(new_features) == initial_df.columns.get_level_values("feature").nunique() + + +@pytest.mark.parametrize( + "transform_constructor", + [ + BoxCoxTransform, + YeoJohnsonTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + MaxAbsScalerTransform, + StandardScalerTransform, + RobustScalerTransform, + MinMaxScalerTransform, + ], +) +@pytest.mark.parametrize( + "in_column", [["exog_1", "exog_2", "exog_3"], ["exog_2", "exog_1", "exog_3"], ["exog_3", "exog_2", "exog_1"]] +) +@pytest.mark.parametrize( + "mode", + [ + "macro", + "per-segment", + ], +) +def test_ordering(transform_constructor, in_column, mode, multicolumn_ts): + """Test that transform don't mix columns between each other.""" + transform = transform_constructor(in_column=in_column, out_column=None, mode=mode, inplace=False) + transforms_one_column = [ + transform_constructor(in_column=column, out_column=None, mode=mode, inplace=False) for column in in_column + ] + + segments = sorted(multicolumn_ts.segments) + initial_df = multicolumn_ts.to_pandas() + transformed_df = transform.fit_transform(multicolumn_ts.to_pandas()) + + transformed_dfs_one_column = [] + for transform_one_column in transforms_one_column: + transformed_dfs_one_column.append(transform_one_column.fit_transform(multicolumn_ts.to_pandas())) + + new_columns = ( + transformed_df.columns.get_level_values("feature") + .difference(initial_df.columns.get_level_values("feature")) + .tolist() + ) + + for i, column in enumerate(in_column): + column_multi = [x for x in new_columns if column in x][0] + column_single = ( + transformed_dfs_one_column[i] + .columns.get_level_values("feature") + .difference(initial_df.columns.get_level_values("feature")) + .tolist()[0] + ) + + df_multi = transformed_df.loc[:, pd.IndexSlice[segments, column_multi]] + df_single = transformed_dfs_one_column[i].loc[:, pd.IndexSlice[segments, column_single]] + assert np.all(df_multi == df_single) diff --git a/tests/test_transforms/test_power_transform.py b/tests/test_transforms/test_sklearn_transform/test_power_transform.py similarity index 82% rename from tests/test_transforms/test_power_transform.py rename to tests/test_transforms/test_sklearn_transform/test_power_transform.py 
index f8fc204c2..21bfd83eb 100644 --- a/tests/test_transforms/test_power_transform.py +++ b/tests/test_transforms/test_sklearn_transform/test_power_transform.py @@ -99,23 +99,3 @@ def test_inverse_transform_one_column(positive_df: pd.DataFrame, preprocessing_c transformed_target = preprocess.fit_transform(df=positive_df.copy()) inversed_target = preprocess.inverse_transform(df=transformed_target) np.testing.assert_array_almost_equal(inversed_target.values, positive_df.values) - - -@pytest.mark.parametrize("preprocessing_class", (BoxCoxTransform, YeoJohnsonTransform)) -@pytest.mark.parametrize("mode", ("macro", "per-segment")) -def test_interface_repr(positive_df: pd.DataFrame, preprocessing_class: Any, mode: str): - preprocess = preprocessing_class(in_column="target", mode=mode, inplace=False) - excepted_column = f"{preprocess.__repr__()}" - result = preprocess.fit_transform(df=positive_df) - for segment in result.columns.get_level_values("segment").unique(): - assert excepted_column in result[segment].columns - - -@pytest.mark.parametrize("preprocessing_class", (BoxCoxTransform, YeoJohnsonTransform)) -@pytest.mark.parametrize("mode", ("macro", "per-segment")) -def test_interface_out_column(positive_df: pd.DataFrame, preprocessing_class: Any, mode: str): - out_column = "test_name" - preprocess = preprocessing_class(in_column="target", mode=mode, inplace=False, out_column=out_column) - result = preprocess.fit_transform(df=positive_df) - for segment in result.columns.get_level_values("segment").unique(): - assert out_column in result[segment].columns diff --git a/tests/test_transforms/test_scalers_transform.py b/tests/test_transforms/test_sklearn_transform/test_scalers_transform.py similarity index 61% rename from tests/test_transforms/test_scalers_transform.py rename to tests/test_transforms/test_sklearn_transform/test_scalers_transform.py index 1621e7a65..b07872096 100644 --- a/tests/test_transforms/test_scalers_transform.py +++ b/tests/test_transforms/test_sklearn_transform/test_scalers_transform.py @@ -1,4 +1,3 @@ -from typing import Any from typing import List from typing import Optional from typing import Union @@ -33,7 +32,7 @@ def __init__( self, in_column: Optional[Union[str, List[str]]] = None, inplace: bool = True, - out_column: str = None, + out_column: Optional[Union[str, List[str]]] = None, mode: Union[TransformMode, str] = "per-segment", ): self.in_column = in_column @@ -43,7 +42,7 @@ def __init__( super().__init__( in_column=in_column, inplace=inplace, - out_column=out_column if out_column is not None else self.__repr__(), + out_column=out_column, transformer=DummySkTransform(), mode=mode, ) @@ -64,24 +63,6 @@ def normal_distributed_df() -> pd.DataFrame: return TSDataset.to_dataset(classic_df) -@pytest.mark.parametrize( - "scaler", - ( - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - MaxAbsScalerTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - ), -) -def test_transform_invalid_mode(scaler): - """Check scaler behavior in case of invalid transform mode""" - with pytest.raises(ValueError): - _ = scaler(mode="a") - - @pytest.mark.parametrize( "scaler", ( @@ -142,61 +123,8 @@ def test_dummy_inverse_transform_one_column(normal_distributed_df, scaler, mode) @pytest.mark.parametrize("mode", ("macro", "per-segment")) def test_inverse_transform_not_inplace(normal_distributed_df, scaler, mode): """Check that inversed values the same for not inplace version.""" - inplace_scaler = scaler(mode=mode) not_inplace_scaler = 
scaler(inplace=False, mode=mode) - columns_to_compare = pd.MultiIndex.from_tuples( - [(segment_name, not_inplace_scaler.__repr__()) for segment_name, _ in normal_distributed_df.columns] - ) - inplace_feature_df = inplace_scaler.fit_transform(df=normal_distributed_df.copy()) - not_inplace_feature_df = not_inplace_scaler.fit_transform(df=normal_distributed_df.copy()) - - inplace_feature_df.columns = columns_to_compare - npt.assert_array_almost_equal( - inplace_feature_df.loc[:, columns_to_compare].values, not_inplace_feature_df.loc[:, columns_to_compare] - ) - - -@pytest.mark.parametrize( - "scaler", - ( - DummyTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - MaxAbsScalerTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - ), -) -@pytest.mark.parametrize("mode", ("macro", "per-segment")) -def test_interface_out_column(normal_distributed_df: pd.DataFrame, scaler: Any, mode: str): - """Check transform interface in non inplace mode with given out_column param.""" - out_column = "result" - transform = scaler(inplace=False, mode=mode, out_column=out_column) - result = transform.fit_transform(df=normal_distributed_df) - for segment in result.columns.get_level_values("segment").unique(): - assert out_column in result[segment].columns - - -@pytest.mark.parametrize( - "scaler", - ( - DummyTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - MaxAbsScalerTransform, - StandardScalerTransform, - RobustScalerTransform, - MinMaxScalerTransform, - ), -) -@pytest.mark.parametrize("mode", ("macro", "per-segment")) -def test_interface_repr(normal_distributed_df: pd.DataFrame, scaler: Any, mode: str): - """Check transform interface in non inplace mode without given out_column param.""" - transform = scaler(inplace=False, mode=mode) - excepted_column = transform.__repr__() - result = transform.fit_transform(df=normal_distributed_df) - for segment in result.columns.get_level_values("segment").unique(): - assert excepted_column in result[segment].columns + columns_to_compare = normal_distributed_df.columns + transformed_df = not_inplace_scaler.fit_transform(df=normal_distributed_df.copy()) + inverse_transformed_df = not_inplace_scaler.inverse_transform(transformed_df) + assert np.all(inverse_transformed_df[columns_to_compare] == normal_distributed_df) From 40ec4a25f9d488376b681204e55a188bf212c5e6 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Tue, 21 Dec 2021 10:30:12 +0300 Subject: [PATCH 2/9] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 97cc2e1e5..7df775351 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Sarimax bug in future prediction with quantiles ([#391](https://github.com/tinkoff-ai/etna/pull/391)) - Catboost version too high ([#394](https://github.com/tinkoff-ai/etna/pull/394)) +- SklearnTransform column name generation ([#398](https://github.com/tinkoff-ai/etna/pull/398)) ## [1.4.2] - 2021-12-09 ### Fixed From 4d19308c908a5d863ea644260794faa541f63f25 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Tue, 21 Dec 2021 11:05:15 +0300 Subject: [PATCH 3/9] Add comments to the tests, add test on values in test_generated_column_names --- ...st_interface_transform.py => test_interface.py} | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) rename 
tests/test_transforms/test_sklearn_transform/{test_interface_transform.py => test_interface.py} (94%) diff --git a/tests/test_transforms/test_sklearn_transform/test_interface_transform.py b/tests/test_transforms/test_sklearn_transform/test_interface.py similarity index 94% rename from tests/test_transforms/test_sklearn_transform/test_interface_transform.py rename to tests/test_transforms/test_sklearn_transform/test_interface.py index 77b3c9213..4ef9f976c 100644 --- a/tests/test_transforms/test_sklearn_transform/test_interface_transform.py +++ b/tests/test_transforms/test_sklearn_transform/test_interface.py @@ -164,6 +164,7 @@ def test_generated_column_names(transform_constructor, in_column, multicolumn_ts transform = transform_constructor(in_column=in_column, out_column=None, inplace=False) initial_df = multicolumn_ts.to_pandas() transformed_df = transform.fit_transform(multicolumn_ts.to_pandas()) + segments = sorted(multicolumn_ts.segments) columns = ( transformed_df.columns.get_level_values("feature") @@ -173,6 +174,7 @@ def test_generated_column_names(transform_constructor, in_column, multicolumn_ts ) for column in columns: + # create transform from column transform_temp = eval(column[len("regressor_") :]) df_temp = transform_temp.fit_transform(multicolumn_ts.to_pandas()) columns_temp = ( @@ -181,8 +183,15 @@ def test_generated_column_names(transform_constructor, in_column, multicolumn_ts .unique() .tolist() ) + + # compare column names and column values assert len(columns_temp) == 1 - assert columns_temp[0] == column + column_temp = columns_temp[0] + assert column_temp == column + assert np.all( + df_temp.loc[:, pd.IndexSlice[segments, column_temp]] + == transformed_df.loc[:, pd.IndexSlice[segments, column]] + ) @pytest.mark.parametrize( @@ -259,7 +268,10 @@ def test_ordering(transform_constructor, in_column, mode, multicolumn_ts): ) for i, column in enumerate(in_column): + # find relevant column name in transformed_df column_multi = [x for x in new_columns if column in x][0] + + # find relevant column name in transformed_dfs_one_column[i] column_single = ( transformed_dfs_one_column[i] .columns.get_level_values("feature") From 3b5305ca1d542a12804634fbda470bc0e111dc05 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Tue, 21 Dec 2021 11:47:25 +0300 Subject: [PATCH 4/9] Remove unnecessary parameters assignments in SklearnTransform children, remove regressor prefix in name generation --- etna/transforms/power.py | 20 +++++++----- etna/transforms/scalers.py | 32 ++++++------------- etna/transforms/sklearn.py | 2 +- .../test_sklearn_transform/test_interface.py | 2 +- .../test_scalers_transform.py | 6 +--- 5 files changed, 25 insertions(+), 37 deletions(-) diff --git a/etna/transforms/power.py b/etna/transforms/power.py index ac4522864..367a04764 100644 --- a/etna/transforms/power.py +++ b/etna/transforms/power.py @@ -34,15 +34,17 @@ def __init__( standardize: Set to True to apply zero-mean, unit-variance normalization to the transformed output. + + Raises + ------ + ValueError: + if incorrect mode given """ self.standardize = standardize - self.inplace = inplace - self.out_column = out_column - self.mode = TransformMode(mode) super().__init__( in_column=in_column, inplace=inplace, - out_column=self.out_column, + out_column=out_column, transformer=PowerTransformer(method="yeo-johnson", standardize=self.standardize), mode=mode, ) @@ -74,15 +76,17 @@ def __init__( standardize: Set to True to apply zero-mean, unit-variance normalization to the transformed output. 
+ + Raises + ------ + ValueError: + if incorrect mode given """ self.standardize = standardize - self.inplace = inplace - self.out_column = out_column - self.mode = TransformMode(mode) super().__init__( in_column=in_column, inplace=inplace, - out_column=self.out_column, + out_column=out_column, transformer=PowerTransformer(method="box-cox", standardize=self.standardize), mode=mode, ) diff --git a/etna/transforms/scalers.py b/etna/transforms/scalers.py index 90f132e9c..6248ac464 100644 --- a/etna/transforms/scalers.py +++ b/etna/transforms/scalers.py @@ -52,15 +52,12 @@ def __init__( ValueError: if incorrect mode given """ - self.inplace = inplace - self.mode = TransformMode(mode) self.with_mean = with_mean self.with_std = with_std - self.out_column = out_column super().__init__( in_column=in_column, - transformer=StandardScaler(with_mean=with_mean, with_std=with_std, copy=True), - out_column=self.out_column, + transformer=StandardScaler(with_mean=self.with_mean, with_std=self.with_std, copy=True), + out_column=out_column, inplace=inplace, mode=mode, ) @@ -115,9 +112,6 @@ def __init__( ValueError: if incorrect mode given """ - self.out_column = out_column - self.inplace = inplace - self.mode = TransformMode(mode) self.with_centering = with_centering self.with_scaling = with_scaling self.quantile_range = quantile_range @@ -125,12 +119,12 @@ def __init__( super().__init__( in_column=in_column, inplace=inplace, - out_column=self.out_column, + out_column=out_column, transformer=RobustScaler( - with_centering=with_centering, - with_scaling=with_scaling, - quantile_range=quantile_range, - unit_variance=unit_variance, + with_centering=self.with_centering, + with_scaling=self.with_scaling, + quantile_range=self.quantile_range, + unit_variance=self.unit_variance, copy=True, ), mode=mode, @@ -177,16 +171,13 @@ def __init__( ValueError: if incorrect mode given """ - self.out_column = out_column - self.inplace = inplace - self.mode = TransformMode(mode) self.feature_range = feature_range self.clip = clip super().__init__( in_column=in_column, inplace=inplace, - out_column=self.out_column, - transformer=MinMaxScaler(feature_range=feature_range, clip=clip, copy=True), + out_column=out_column, + transformer=MinMaxScaler(feature_range=self.feature_range, clip=self.clip, copy=True), mode=mode, ) @@ -224,13 +215,10 @@ def __init__( ValueError: if incorrect mode given """ - self.inplace = inplace - self.mode = TransformMode(mode) - self.out_column = out_column super().__init__( in_column=in_column, inplace=inplace, - out_column=self.out_column, + out_column=out_column, transformer=MaxAbsScaler(copy=True), mode=mode, ) diff --git a/etna/transforms/sklearn.py b/etna/transforms/sklearn.py index 9136b53a3..db9a54d83 100644 --- a/etna/transforms/sklearn.py +++ b/etna/transforms/sklearn.py @@ -72,7 +72,7 @@ def _get_column_name(self, in_column: str) -> str: if self.out_column is None: new_transform = deepcopy(self) new_transform.in_column = [in_column] - return f"regressor_{new_transform.__repr__()}" + return f"{new_transform.__repr__()}" else: return f"{self.out_column}_{in_column}" diff --git a/tests/test_transforms/test_sklearn_transform/test_interface.py b/tests/test_transforms/test_sklearn_transform/test_interface.py index 4ef9f976c..8b2a851d9 100644 --- a/tests/test_transforms/test_sklearn_transform/test_interface.py +++ b/tests/test_transforms/test_sklearn_transform/test_interface.py @@ -175,7 +175,7 @@ def test_generated_column_names(transform_constructor, in_column, multicolumn_ts for column in columns: # 
create transform from column - transform_temp = eval(column[len("regressor_") :]) + transform_temp = eval(column) df_temp = transform_temp.fit_transform(multicolumn_ts.to_pandas()) columns_temp = ( df_temp.columns.get_level_values("feature") diff --git a/tests/test_transforms/test_sklearn_transform/test_scalers_transform.py b/tests/test_transforms/test_sklearn_transform/test_scalers_transform.py index b07872096..89cf48522 100644 --- a/tests/test_transforms/test_sklearn_transform/test_scalers_transform.py +++ b/tests/test_transforms/test_sklearn_transform/test_scalers_transform.py @@ -32,13 +32,9 @@ def __init__( self, in_column: Optional[Union[str, List[str]]] = None, inplace: bool = True, - out_column: Optional[Union[str, List[str]]] = None, + out_column: Optional[str] = None, mode: Union[TransformMode, str] = "per-segment", ): - self.in_column = in_column - self.inplace = inplace - self.out_column = out_column - self.mode = TransformMode(mode) super().__init__( in_column=in_column, inplace=inplace, From f061810f35bcf73caa082f535a1914719886882f Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Tue, 21 Dec 2021 11:50:42 +0300 Subject: [PATCH 5/9] Add more precise description to the StringEnumWithRepr class --- etna/core/mixins.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etna/core/mixins.py b/etna/core/mixins.py index 8d63dde35..f8ac4e8e9 100644 --- a/etna/core/mixins.py +++ b/etna/core/mixins.py @@ -28,8 +28,8 @@ def __repr__(self): class StringEnumWithRepr(str, Enum): - """Base class for str enum objects.""" + """Base class for str enums, that has alternative __repr__ method.""" def __repr__(self): - """Get string representation for enum strings.""" + """Get string representation for enum string so that enum can be created from it.""" return self.value.__repr__() From 5e6e846d9e3b6c36a4c52507b4234b4055af66ba Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Tue, 21 Dec 2021 11:55:08 +0300 Subject: [PATCH 6/9] Rename tests for SklearnTransform --- .../{test_sklearn_transform => sklearn}/test_interface.py | 0 .../{test_sklearn_transform => sklearn}/test_power_transform.py | 0 .../{test_sklearn_transform => sklearn}/test_scalers_transform.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename tests/test_transforms/{test_sklearn_transform => sklearn}/test_interface.py (100%) rename tests/test_transforms/{test_sklearn_transform => sklearn}/test_power_transform.py (100%) rename tests/test_transforms/{test_sklearn_transform => sklearn}/test_scalers_transform.py (100%) diff --git a/tests/test_transforms/test_sklearn_transform/test_interface.py b/tests/test_transforms/sklearn/test_interface.py similarity index 100% rename from tests/test_transforms/test_sklearn_transform/test_interface.py rename to tests/test_transforms/sklearn/test_interface.py diff --git a/tests/test_transforms/test_sklearn_transform/test_power_transform.py b/tests/test_transforms/sklearn/test_power_transform.py similarity index 100% rename from tests/test_transforms/test_sklearn_transform/test_power_transform.py rename to tests/test_transforms/sklearn/test_power_transform.py diff --git a/tests/test_transforms/test_sklearn_transform/test_scalers_transform.py b/tests/test_transforms/sklearn/test_scalers_transform.py similarity index 100% rename from tests/test_transforms/test_sklearn_transform/test_scalers_transform.py rename to tests/test_transforms/sklearn/test_scalers_transform.py From 54bcbaf0e0d13cd7f224c81643deb872b5d3f51e Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Tue, 21 Dec 2021 
16:04:04 +0300 Subject: [PATCH 7/9] Simplify test by extracting common logic into extract_new_features_columns function --- .../test_transforms/sklearn/test_interface.py | 54 +++++++------------ 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/tests/test_transforms/sklearn/test_interface.py b/tests/test_transforms/sklearn/test_interface.py index 8b2a851d9..45b6b8a30 100644 --- a/tests/test_transforms/sklearn/test_interface.py +++ b/tests/test_transforms/sklearn/test_interface.py @@ -1,3 +1,5 @@ +from typing import List + import numpy as np import pandas as pd import pytest @@ -26,6 +28,16 @@ def multicolumn_ts(random_seed): return TSDataset(df=df_formatted, df_exog=df_exog_formatted, freq="D") +def extract_new_features_columns(transformed_df: pd.DataFrame, initial_df: pd.DataFrame) -> List[str]: + """Extract columns from feature level that are present in transformed_df but not present in initial_df.""" + return ( + transformed_df.columns.get_level_values("feature") + .difference(initial_df.columns.get_level_values("feature")) + .unique() + .tolist() + ) + + @pytest.mark.parametrize( "transform_constructor", ( @@ -93,11 +105,7 @@ def test_inplace_no_new_columns(transform_constructor, in_column, multicolumn_ts initial_df = multicolumn_ts.to_pandas() transformed_df = transform.fit_transform(multicolumn_ts.to_pandas()) - new_columns = set( - transformed_df.columns.get_level_values("feature") - .difference(initial_df.columns.get_level_values("feature")) - .tolist() - ) + new_columns = extract_new_features_columns(transformed_df, initial_df) assert len(new_columns) == 0 @@ -128,11 +136,7 @@ def test_creating_columns(transform_constructor, in_column, multicolumn_ts): initial_df = multicolumn_ts.to_pandas() transformed_df = transform.fit_transform(multicolumn_ts.to_pandas()) - new_columns = set( - transformed_df.columns.get_level_values("feature") - .difference(initial_df.columns.get_level_values("feature")) - .tolist() - ) + new_columns = set(extract_new_features_columns(transformed_df, initial_df)) in_column = [in_column] if isinstance(in_column, str) else in_column expected_columns = {f"new_exog_{column}" for column in in_column} assert new_columns == expected_columns @@ -166,14 +170,9 @@ def test_generated_column_names(transform_constructor, in_column, multicolumn_ts transformed_df = transform.fit_transform(multicolumn_ts.to_pandas()) segments = sorted(multicolumn_ts.segments) - columns = ( - transformed_df.columns.get_level_values("feature") - .difference(initial_df.columns.get_level_values("feature")) - .unique() - .tolist() - ) + new_columns = extract_new_features_columns(transformed_df, initial_df) - for column in columns: + for column in new_columns: # create transform from column transform_temp = eval(column) df_temp = transform_temp.fit_transform(multicolumn_ts.to_pandas()) @@ -214,12 +213,8 @@ def test_all_columns(transform_constructor, multicolumn_ts): initial_df = multicolumn_ts.df.copy() transformed_df = transform.fit_transform(multicolumn_ts.df) - new_features = set( - transformed_df.columns.get_level_values("feature") - .difference(initial_df.columns.get_level_values("feature")) - .tolist() - ) - assert len(new_features) == initial_df.columns.get_level_values("feature").nunique() + new_columns = extract_new_features_columns(transformed_df, initial_df) + assert len(new_columns) == initial_df.columns.get_level_values("feature").nunique() @pytest.mark.parametrize( @@ -261,23 +256,14 @@ def test_ordering(transform_constructor, in_column, mode, multicolumn_ts): for 
transform_one_column in transforms_one_column: transformed_dfs_one_column.append(transform_one_column.fit_transform(multicolumn_ts.to_pandas())) - new_columns = ( - transformed_df.columns.get_level_values("feature") - .difference(initial_df.columns.get_level_values("feature")) - .tolist() - ) + new_columns = extract_new_features_columns(transformed_df, initial_df) for i, column in enumerate(in_column): # find relevant column name in transformed_df column_multi = [x for x in new_columns if column in x][0] # find relevant column name in transformed_dfs_one_column[i] - column_single = ( - transformed_dfs_one_column[i] - .columns.get_level_values("feature") - .difference(initial_df.columns.get_level_values("feature")) - .tolist()[0] - ) + column_single = extract_new_features_columns(transformed_dfs_one_column[i], initial_df)[0] df_multi = transformed_df.loc[:, pd.IndexSlice[segments, column_multi]] df_single = transformed_dfs_one_column[i].loc[:, pd.IndexSlice[segments, column_single]] From adfba878aecc8903c5068d3187aabf83c31e0845 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Wed, 22 Dec 2021 11:13:27 +0300 Subject: [PATCH 8/9] Add checkings on out_columns attribute --- etna/transforms/sklearn.py | 5 ++- .../test_transforms/sklearn/test_interface.py | 31 ++++++++++++------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/etna/transforms/sklearn.py b/etna/transforms/sklearn.py index db9a54d83..9caf7adad 100644 --- a/etna/transforms/sklearn.py +++ b/etna/transforms/sklearn.py @@ -94,7 +94,10 @@ def fit(self, df: pd.DataFrame) -> "SklearnTransform": if self.in_column is None: self.in_column = sorted(set(df.columns.get_level_values("feature"))) - self.out_columns = [self._get_column_name(column) for column in self.in_column] + if self.inplace: + self.out_columns = self.in_column + else: + self.out_columns = [self._get_column_name(column) for column in self.in_column] if self.mode == TransformMode.per_segment: x = df.loc[:, (segments, self.in_column)].values diff --git a/tests/test_transforms/sklearn/test_interface.py b/tests/test_transforms/sklearn/test_interface.py index 45b6b8a30..5976b4daa 100644 --- a/tests/test_transforms/sklearn/test_interface.py +++ b/tests/test_transforms/sklearn/test_interface.py @@ -105,9 +105,13 @@ def test_inplace_no_new_columns(transform_constructor, in_column, multicolumn_ts initial_df = multicolumn_ts.to_pandas() transformed_df = transform.fit_transform(multicolumn_ts.to_pandas()) + # check new columns new_columns = extract_new_features_columns(transformed_df, initial_df) assert len(new_columns) == 0 + # check that output columns are input columns + assert transform.out_columns == transform.in_column + @pytest.mark.parametrize( "transform_constructor", @@ -136,11 +140,18 @@ def test_creating_columns(transform_constructor, in_column, multicolumn_ts): initial_df = multicolumn_ts.to_pandas() transformed_df = transform.fit_transform(multicolumn_ts.to_pandas()) + # check new columns new_columns = set(extract_new_features_columns(transformed_df, initial_df)) in_column = [in_column] if isinstance(in_column, str) else in_column expected_columns = {f"new_exog_{column}" for column in in_column} assert new_columns == expected_columns + # check that output columns are matching input columns + assert len(transform.in_column) == len(transform.out_columns) + assert all( + [f"new_exog_{column}" == new_column for column, new_column in zip(transform.in_column, transform.out_columns)] + ) + @pytest.mark.parametrize( "transform_constructor", @@ -172,16 +183,12 @@ def 
test_generated_column_names(transform_constructor, in_column, multicolumn_ts new_columns = extract_new_features_columns(transformed_df, initial_df) + # check new columns for column in new_columns: # create transform from column transform_temp = eval(column) df_temp = transform_temp.fit_transform(multicolumn_ts.to_pandas()) - columns_temp = ( - df_temp.columns.get_level_values("feature") - .difference(initial_df.columns.get_level_values("feature")) - .unique() - .tolist() - ) + columns_temp = extract_new_features_columns(df_temp, initial_df) # compare column names and column values assert len(columns_temp) == 1 @@ -192,6 +199,11 @@ def test_generated_column_names(transform_constructor, in_column, multicolumn_ts == transformed_df.loc[:, pd.IndexSlice[segments, column]] ) + # check that output columns are matching input columns + assert len(transform.in_column) == len(transform.out_columns) + # check that name if this input column is present inside name of this output column + assert all([(column in new_column) for column, new_column in zip(transform.in_column, transform.out_columns)]) + @pytest.mark.parametrize( "transform_constructor", @@ -249,21 +261,18 @@ def test_ordering(transform_constructor, in_column, mode, multicolumn_ts): ] segments = sorted(multicolumn_ts.segments) - initial_df = multicolumn_ts.to_pandas() transformed_df = transform.fit_transform(multicolumn_ts.to_pandas()) transformed_dfs_one_column = [] for transform_one_column in transforms_one_column: transformed_dfs_one_column.append(transform_one_column.fit_transform(multicolumn_ts.to_pandas())) - new_columns = extract_new_features_columns(transformed_df, initial_df) - for i, column in enumerate(in_column): # find relevant column name in transformed_df - column_multi = [x for x in new_columns if column in x][0] + column_multi = {key: value for key, value in zip(transform.in_column, transform.out_columns)}[column] # find relevant column name in transformed_dfs_one_column[i] - column_single = extract_new_features_columns(transformed_dfs_one_column[i], initial_df)[0] + column_single = transforms_one_column[i].out_columns[0] df_multi = transformed_df.loc[:, pd.IndexSlice[segments, column_multi]] df_single = transformed_dfs_one_column[i].loc[:, pd.IndexSlice[segments, column_single]] From fff42b4c4a2b595309a445c3af8679a3eafd2a71 Mon Sep 17 00:00:00 2001 From: "d.a.bunin" Date: Wed, 22 Dec 2021 11:37:33 +0300 Subject: [PATCH 9/9] Simplify cycle in test_ordering --- tests/test_transforms/sklearn/test_interface.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_transforms/sklearn/test_interface.py b/tests/test_transforms/sklearn/test_interface.py index 5976b4daa..f38d6f098 100644 --- a/tests/test_transforms/sklearn/test_interface.py +++ b/tests/test_transforms/sklearn/test_interface.py @@ -267,9 +267,10 @@ def test_ordering(transform_constructor, in_column, mode, multicolumn_ts): for transform_one_column in transforms_one_column: transformed_dfs_one_column.append(transform_one_column.fit_transform(multicolumn_ts.to_pandas())) + in_to_out_columns = {key: value for key, value in zip(transform.in_column, transform.out_columns)} for i, column in enumerate(in_column): # find relevant column name in transformed_df - column_multi = {key: value for key, value in zip(transform.in_column, transform.out_columns)}[column] + column_multi = in_to_out_columns[column] # find relevant column name in transformed_dfs_one_column[i] column_single = transforms_one_column[i].out_columns[0]
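
Note on the resulting column-naming contract: after this series, SklearnTransform and its subclasses derive generated column names from the out_column base ("<out_column>_<in_column>") instead of writing every result under a single repr-derived name. The snippet below is a minimal sketch of that behaviour on a toy dataset, not part of the patches; it assumes the etna package at this revision is importable and reuses only helpers that the new tests themselves use (generate_const_df, TSDataset.to_dataset).

from etna.datasets import TSDataset
from etna.datasets import generate_const_df
from etna.transforms import StandardScalerTransform

# Toy wide-format dataset: two segments, a single "target" feature.
df = generate_const_df(start_time="2020-01-01", periods=20, freq="D", scale=1.0, n_segments=2)
df_wide = TSDataset.to_dataset(df)

# With an explicit out_column, each generated column is named "<out_column>_<in_column>",
# matching the expectation in test_creating_columns.
scaler = StandardScalerTransform(in_column="target", out_column="scaled", inplace=False)
transformed = scaler.fit_transform(df_wide.copy())
print(sorted(set(transformed.columns.get_level_values("feature"))))
# expected: ['scaled_target', 'target']

# With out_column=None, the generated name is the transform's own repr, so the column
# name can be eval()-ed back into an equivalent transform; that is what
# test_generated_column_names verifies.
auto_scaler = StandardScalerTransform(in_column="target", out_column=None, inplace=False)
auto_transformed = auto_scaler.fit_transform(df_wide.copy())
new_columns = set(auto_transformed.columns.get_level_values("feature")) - {"target"}
print(new_columns)  # one repr-style name per in_column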
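
The repr-based names above stay eval()-able because TransformMode now inherits from the new StringEnumWithRepr, whose __repr__ returns the quoted string value rather than the default "<TransformMode.per_segment: 'per-segment'>" form, so "mode = 'per-segment'" inside a transform repr is valid Python. A tiny sketch of that behaviour with a throwaway enum (the name DemoMode is illustrative, not part of the library):

from etna.core import StringEnumWithRepr

class DemoMode(StringEnumWithRepr):
    macro = "macro"
    per_segment = "per-segment"

# The default Enum repr would be "<DemoMode.macro: 'macro'>"; the override returns
# the value's own repr, i.e. the quoted string.
print(repr(DemoMode.macro))        # prints: 'macro'
print(repr(DemoMode.per_segment))  # prints: 'per-segment'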