From a2dd7be6d9bf8b96323e23c221f4784bb1dbf017 Mon Sep 17 00:00:00 2001 From: Artyom Makhin Date: Wed, 10 Nov 2021 15:57:13 +0300 Subject: [PATCH 1/8] add ranks --- etna/analysis/feature_relevance/relevance.py | 20 ++++++++++++++++--- .../test_feature_relevance/test_relevance.py | 14 +++++++++++-- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/etna/analysis/feature_relevance/relevance.py b/etna/analysis/feature_relevance/relevance.py index eceaa9421..f5dabac62 100644 --- a/etna/analysis/feature_relevance/relevance.py +++ b/etna/analysis/feature_relevance/relevance.py @@ -2,6 +2,7 @@ from abc import abstractmethod import pandas as pd +import scipy.stats from etna.analysis.feature_relevance.relevance_table import get_model_relevance_table from etna.analysis.feature_relevance.relevance_table import get_statistics_relevance_table @@ -21,8 +22,15 @@ def __init__(self, greater_is_better: bool): """ self.greater_is_better = greater_is_better + def _get_ranks(self, table): + """Compute rank relevance table from relevance table.""" + rank_table = table.apply(lambda x: pd.Series(scipy.stats.rankdata(x.values), index=x.index), axis=1) + if self.greater_is_better: + rank_table = -1 * (rank_table - rank_table.shape[1] - 1) + return rank_table.astype(int) + @abstractmethod - def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, **kwargs) -> pd.DataFrame: + def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, return_ranks: bool, **kwargs) -> pd.DataFrame: """Compute relevance table. For each series in df compute relevance of corresponding series in df_exog. @@ -32,6 +40,8 @@ def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, **kwargs) -> pd.Data dataframe with series that will be used as target df_exog: dataframe with series to compute relevance for df + return_ranks: + if False return relevance values else return ranks of relevance values Returns ------- @@ -47,9 +57,11 @@ class StatisticsRelevanceTable(RelevanceTable): def __init__(self): super().__init__(greater_is_better=False) - def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, **kwargs) -> pd.DataFrame: + def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, return_ranks: bool, **kwargs) -> pd.DataFrame: """Compute feature relevance table with etna.analysis.get_statistics_relevance_table method.""" table = get_statistics_relevance_table(df=df, df_exog=df_exog) + if return_ranks: + return self._get_ranks(table) return table @@ -59,7 +71,9 @@ class ModelRelevanceTable(RelevanceTable): def __init__(self): super().__init__(greater_is_better=True) - def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, **kwargs) -> pd.DataFrame: + def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, return_ranks: bool, **kwargs) -> pd.DataFrame: """Compute feature relevance table with etna.analysis.get_model_relevance_table method.""" table = get_model_relevance_table(df=df, df_exog=df_exog, **kwargs) + if return_ranks: + return self._get_ranks(table) return table diff --git a/tests/test_analysis/test_feature_relevance/test_relevance.py b/tests/test_analysis/test_feature_relevance/test_relevance.py index 0fa11b9cc..e2e8ccde3 100644 --- a/tests/test_analysis/test_feature_relevance/test_relevance.py +++ b/tests/test_analysis/test_feature_relevance/test_relevance.py @@ -8,11 +8,21 @@ def test_statistics_relevance_table(simple_df_relevance): rt = StatisticsRelevanceTable() assert not rt.greater_is_better df, df_exog = simple_df_relevance - assert rt(df=df, df_exog=df_exog).shape == (2, 2) + assert rt(df=df, df_exog=df_exog, return_ranks=False).shape == (2, 2) def test_model_relevance_table(simple_df_relevance): rt = ModelRelevanceTable() assert rt.greater_is_better df, df_exog = simple_df_relevance - assert rt(df=df, df_exog=df_exog, model=DecisionTreeRegressor()).shape == (2, 2) + assert rt(df=df, df_exog=df_exog, return_ranks=False, model=DecisionTreeRegressor()).shape == (2, 2) + + +def test_relevance_table_ranks(simple_df_relevance): + rt = ModelRelevanceTable() + df, df_exog = simple_df_relevance + table = rt(df=df, df_exog=df_exog, return_ranks=True, model=DecisionTreeRegressor()) + assert table["regressor_1"]["1"] == 1 + assert table["regressor_2"]["1"] == 2 + assert table["regressor_1"]["2"] == 2 + assert table["regressor_2"]["2"] == 1 From 25d883345be0461aaf3be7a03f8ecd215846fa75 Mon Sep 17 00:00:00 2001 From: Artyom Makhin Date: Wed, 10 Nov 2021 16:31:23 +0300 Subject: [PATCH 2/8] fix MRMR transform --- etna/analysis/feature_relevance/relevance.py | 2 +- etna/transforms/feature_importance.py | 8 +++++++- .../test_feature_importance_transform.py | 10 +++++----- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/etna/analysis/feature_relevance/relevance.py b/etna/analysis/feature_relevance/relevance.py index f5dabac62..ebee8a187 100644 --- a/etna/analysis/feature_relevance/relevance.py +++ b/etna/analysis/feature_relevance/relevance.py @@ -22,7 +22,7 @@ def __init__(self, greater_is_better: bool): """ self.greater_is_better = greater_is_better - def _get_ranks(self, table): + def _get_ranks(self, table) -> pd.DataFrame: """Compute rank relevance table from relevance table.""" rank_table = table.apply(lambda x: pd.Series(scipy.stats.rankdata(x.values), index=x.index), axis=1) if self.greater_is_better: diff --git a/etna/transforms/feature_importance.py b/etna/transforms/feature_importance.py index bf1b55173..d00d8511d 100644 --- a/etna/transforms/feature_importance.py +++ b/etna/transforms/feature_importance.py @@ -142,6 +142,7 @@ class MRMRFeatureSelectionTransform(Transform): def __init__( self, relevance_method: RelevanceTable, + return_ranks: bool, top_k: int, clustering_method: HierarchicalClustering = EuclideanClustering(), n_clusters: int = 10, @@ -155,6 +156,8 @@ def __init__( ---------- relevance_method: method to calculate relevance table + return_ranks: + if False use relevance table else use ranks of relevance table top_k: num of regressors to select; if there are not enough regressors, then all will be selected clustering_method: @@ -171,6 +174,7 @@ def __init__( raise ValueError("Parameter n_clusters should be integer and greater than 1") self.relevance_method = relevance_method + self.return_ranks = return_ranks self.clustering = clustering_method self.n_clusters = n_clusters self.linkage = linkage @@ -208,7 +212,9 @@ def fit(self, df: pd.DataFrame) -> "MRMRFeatureSelectionTransform": self.clustering.build_distance_matrix(ts=ts) self.clustering.build_clustering_algo(n_clusters=self.n_clusters, linkage=self.linkage) s2c = self.clustering.fit_predict() - relevance_table = self.relevance_method(ts[:, :, "target"], ts[:, :, ts.regressors], **self.relevance_params) + relevance_table = self.relevance_method( + ts[:, :, "target"], ts[:, :, ts.regressors], self.return_ranks, **self.relevance_params + ) y = np.empty(len(relevance_table)) for k, cluster in enumerate(relevance_table.index): y[k] = s2c[cluster] diff --git a/tests/test_transforms/test_feature_importance_transform.py b/tests/test_transforms/test_feature_importance_transform.py index 9f8d08003..6a87523ae 100644 --- a/tests/test_transforms/test_feature_importance_transform.py +++ b/tests/test_transforms/test_feature_importance_transform.py @@ -73,7 +73,7 @@ def ts_with_regressors(): def test_mrmr_right_len(relevance_method, clustering_method, top_k, ts_with_regressors): """Check that transform selects exactly top_k regressors.""" df = ts_with_regressors.to_pandas() - mrmr = MRMRFeatureSelectionTransform(relevance_method, top_k, clustering_method, n_clusters=2) + mrmr = MRMRFeatureSelectionTransform(relevance_method, False, top_k, clustering_method, n_clusters=2) df_selected = mrmr.fit_transform(df) all_regressors = ts_with_regressors.regressors selected_regressors = set() @@ -93,7 +93,7 @@ def test_mrmr_right_len(relevance_method, clustering_method, top_k, ts_with_regr def test_mrmr_right_regressors(relevance_method, clustering_method, ts_with_regressors): """Check that transform selects right top_k regressors.""" df = ts_with_regressors.to_pandas() - mrmr = MRMRFeatureSelectionTransform(relevance_method, 3, clustering_method, n_clusters=2) + mrmr = MRMRFeatureSelectionTransform(relevance_method, False, 3, clustering_method, n_clusters=2) df_selected = mrmr.fit_transform(df) selected_regressors = set() for column in df_selected.columns.get_level_values("feature"): @@ -105,14 +105,14 @@ def test_mrmr_right_regressors(relevance_method, clustering_method, ts_with_regr def test_mrmr_fails_negative_parameters(): """Check that transform doesn't allow you to set top_k to negative values and n_clusters >= 2.""" with pytest.raises(ValueError, match="positive integer"): - MRMRFeatureSelectionTransform(StatisticsRelevanceTable(), top_k=-1) + MRMRFeatureSelectionTransform(StatisticsRelevanceTable(), False, top_k=-1) with pytest.raises(ValueError, match="greater than"): - MRMRFeatureSelectionTransform(StatisticsRelevanceTable(), top_k=1, n_clusters=1) + MRMRFeatureSelectionTransform(StatisticsRelevanceTable(), False, top_k=1, n_clusters=1) def test_mrmr_fails(ts_with_regressors): """Check that transform doesn't allow you to set n_clusters greater than number of regressors.""" - mrmr = MRMRFeatureSelectionTransform(StatisticsRelevanceTable(), top_k=4, freq="D", n_clusters=25) + mrmr = MRMRFeatureSelectionTransform(StatisticsRelevanceTable(), False, top_k=4, freq="D", n_clusters=25) with pytest.raises(ValueError, match="strictly less than"): mrmr.fit_transform(ts_with_regressors.to_pandas()) From 522f42eae8cce842ded15c1e58ad12308c04e347 Mon Sep 17 00:00:00 2001 From: Artyom Makhin Date: Thu, 11 Nov 2021 11:22:01 +0300 Subject: [PATCH 3/8] fix --- etna/analysis/feature_relevance/relevance.py | 6 +++--- etna/transforms/feature_importance.py | 2 +- .../test_feature_relevance/test_relevance.py | 17 +++++++++++------ 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/etna/analysis/feature_relevance/relevance.py b/etna/analysis/feature_relevance/relevance.py index ebee8a187..75503285e 100644 --- a/etna/analysis/feature_relevance/relevance.py +++ b/etna/analysis/feature_relevance/relevance.py @@ -22,11 +22,11 @@ def __init__(self, greater_is_better: bool): """ self.greater_is_better = greater_is_better - def _get_ranks(self, table) -> pd.DataFrame: + def _get_ranks(self, table: pd.DataFrame) -> pd.DataFrame: """Compute rank relevance table from relevance table.""" - rank_table = table.apply(lambda x: pd.Series(scipy.stats.rankdata(x.values), index=x.index), axis=1) if self.greater_is_better: - rank_table = -1 * (rank_table - rank_table.shape[1] - 1) + table *= -1 + rank_table = pd.DataFrame(scipy.stats.rankdata(table, axis=1), columns=table.columns, index=table.index) return rank_table.astype(int) @abstractmethod diff --git a/etna/transforms/feature_importance.py b/etna/transforms/feature_importance.py index d00d8511d..a1c83adf6 100644 --- a/etna/transforms/feature_importance.py +++ b/etna/transforms/feature_importance.py @@ -213,7 +213,7 @@ def fit(self, df: pd.DataFrame) -> "MRMRFeatureSelectionTransform": self.clustering.build_clustering_algo(n_clusters=self.n_clusters, linkage=self.linkage) s2c = self.clustering.fit_predict() relevance_table = self.relevance_method( - ts[:, :, "target"], ts[:, :, ts.regressors], self.return_ranks, **self.relevance_params + ts[:, :, "target"], ts[:, :, ts.regressors], return_ranks=self.return_ranks, **self.relevance_params ) y = np.empty(len(relevance_table)) for k, cluster in enumerate(relevance_table.index): diff --git a/tests/test_analysis/test_feature_relevance/test_relevance.py b/tests/test_analysis/test_feature_relevance/test_relevance.py index e2e8ccde3..54d8cd5c4 100644 --- a/tests/test_analysis/test_feature_relevance/test_relevance.py +++ b/tests/test_analysis/test_feature_relevance/test_relevance.py @@ -1,4 +1,5 @@ from sklearn.tree import DecisionTreeRegressor +import pytest from etna.analysis.feature_relevance import ModelRelevanceTable from etna.analysis.feature_relevance import StatisticsRelevanceTable @@ -17,12 +18,16 @@ def test_model_relevance_table(simple_df_relevance): df, df_exog = simple_df_relevance assert rt(df=df, df_exog=df_exog, return_ranks=False, model=DecisionTreeRegressor()).shape == (2, 2) - -def test_relevance_table_ranks(simple_df_relevance): +@pytest.mark.parametrize( + "greater_is_better,answer", + ((True, [1, 2, 2, 1]), (False, [2, 1, 1, 2])), +) +def test_relevance_table_ranks(greater_is_better, answer, simple_df_relevance): rt = ModelRelevanceTable() + rt.greater_is_better = greater_is_better df, df_exog = simple_df_relevance table = rt(df=df, df_exog=df_exog, return_ranks=True, model=DecisionTreeRegressor()) - assert table["regressor_1"]["1"] == 1 - assert table["regressor_2"]["1"] == 2 - assert table["regressor_1"]["2"] == 2 - assert table["regressor_2"]["2"] == 1 + assert table["regressor_1"]["1"] == answer[0] + assert table["regressor_2"]["1"] == answer[1] + assert table["regressor_1"]["2"] == answer[2] + assert table["regressor_2"]["2"] == answer[3] From 88b726c3a682c5fbcaaff7bcccea2cb4ba7b49a4 Mon Sep 17 00:00:00 2001 From: Artyom Makhin Date: Thu, 11 Nov 2021 12:43:39 +0300 Subject: [PATCH 4/8] fix lint --- etna/analysis/feature_relevance/relevance.py | 2 +- tests/test_analysis/test_feature_relevance/test_relevance.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/etna/analysis/feature_relevance/relevance.py b/etna/analysis/feature_relevance/relevance.py index 75503285e..8afc10f5a 100644 --- a/etna/analysis/feature_relevance/relevance.py +++ b/etna/analysis/feature_relevance/relevance.py @@ -26,7 +26,7 @@ def _get_ranks(self, table: pd.DataFrame) -> pd.DataFrame: """Compute rank relevance table from relevance table.""" if self.greater_is_better: table *= -1 - rank_table = pd.DataFrame(scipy.stats.rankdata(table, axis=1), columns=table.columns, index=table.index) + rank_table = pd.DataFrame(scipy.stats.rankdata(table, axis=1), columns=table.columns, index=table.index) return rank_table.astype(int) @abstractmethod diff --git a/tests/test_analysis/test_feature_relevance/test_relevance.py b/tests/test_analysis/test_feature_relevance/test_relevance.py index 54d8cd5c4..100be660d 100644 --- a/tests/test_analysis/test_feature_relevance/test_relevance.py +++ b/tests/test_analysis/test_feature_relevance/test_relevance.py @@ -1,5 +1,5 @@ -from sklearn.tree import DecisionTreeRegressor import pytest +from sklearn.tree import DecisionTreeRegressor from etna.analysis.feature_relevance import ModelRelevanceTable from etna.analysis.feature_relevance import StatisticsRelevanceTable @@ -18,6 +18,7 @@ def test_model_relevance_table(simple_df_relevance): df, df_exog = simple_df_relevance assert rt(df=df, df_exog=df_exog, return_ranks=False, model=DecisionTreeRegressor()).shape == (2, 2) + @pytest.mark.parametrize( "greater_is_better,answer", ((True, [1, 2, 2, 1]), (False, [2, 1, 1, 2])), From 588d1f4004bc3d8e0bc0e916fa71282bfc7d1f36 Mon Sep 17 00:00:00 2001 From: Artyom Makhin Date: Mon, 15 Nov 2021 09:02:51 +0300 Subject: [PATCH 5/8] final --- etna/analysis/feature_relevance/relevance.py | 6 ++--- .../test_feature_importance_transform.py | 22 ++++++++++++++----- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/etna/analysis/feature_relevance/relevance.py b/etna/analysis/feature_relevance/relevance.py index 8afc10f5a..61844a39a 100644 --- a/etna/analysis/feature_relevance/relevance.py +++ b/etna/analysis/feature_relevance/relevance.py @@ -30,7 +30,7 @@ def _get_ranks(self, table: pd.DataFrame) -> pd.DataFrame: return rank_table.astype(int) @abstractmethod - def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, return_ranks: bool, **kwargs) -> pd.DataFrame: + def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, return_ranks: bool = False, **kwargs) -> pd.DataFrame: """Compute relevance table. For each series in df compute relevance of corresponding series in df_exog. @@ -57,7 +57,7 @@ class StatisticsRelevanceTable(RelevanceTable): def __init__(self): super().__init__(greater_is_better=False) - def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, return_ranks: bool, **kwargs) -> pd.DataFrame: + def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, return_ranks: bool = False, **kwargs) -> pd.DataFrame: """Compute feature relevance table with etna.analysis.get_statistics_relevance_table method.""" table = get_statistics_relevance_table(df=df, df_exog=df_exog) if return_ranks: @@ -71,7 +71,7 @@ class ModelRelevanceTable(RelevanceTable): def __init__(self): super().__init__(greater_is_better=True) - def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, return_ranks: bool, **kwargs) -> pd.DataFrame: + def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, return_ranks: bool = False, **kwargs) -> pd.DataFrame: """Compute feature relevance table with etna.analysis.get_model_relevance_table method.""" table = get_model_relevance_table(df=df, df_exog=df_exog, **kwargs) if return_ranks: diff --git a/tests/test_transforms/test_feature_importance_transform.py b/tests/test_transforms/test_feature_importance_transform.py index 6a87523ae..092f420c4 100644 --- a/tests/test_transforms/test_feature_importance_transform.py +++ b/tests/test_transforms/test_feature_importance_transform.py @@ -73,7 +73,13 @@ def ts_with_regressors(): def test_mrmr_right_len(relevance_method, clustering_method, top_k, ts_with_regressors): """Check that transform selects exactly top_k regressors.""" df = ts_with_regressors.to_pandas() - mrmr = MRMRFeatureSelectionTransform(relevance_method, False, top_k, clustering_method, n_clusters=2) + mrmr = MRMRFeatureSelectionTransform( + relevance_method=relevance_method, + return_ranks=False, + top_k=top_k, + clustering_method=clustering_method, + n_clusters=2, + ) df_selected = mrmr.fit_transform(df) all_regressors = ts_with_regressors.regressors selected_regressors = set() @@ -93,7 +99,13 @@ def test_mrmr_right_len(relevance_method, clustering_method, top_k, ts_with_regr def test_mrmr_right_regressors(relevance_method, clustering_method, ts_with_regressors): """Check that transform selects right top_k regressors.""" df = ts_with_regressors.to_pandas() - mrmr = MRMRFeatureSelectionTransform(relevance_method, False, 3, clustering_method, n_clusters=2) + mrmr = MRMRFeatureSelectionTransform( + relevance_method=relevance_method, + return_ranks=False, + top_k=3, + clustering_method=clustering_method, + n_clusters=2, + ) df_selected = mrmr.fit_transform(df) selected_regressors = set() for column in df_selected.columns.get_level_values("feature"): @@ -105,14 +117,14 @@ def test_mrmr_right_regressors(relevance_method, clustering_method, ts_with_regr def test_mrmr_fails_negative_parameters(): """Check that transform doesn't allow you to set top_k to negative values and n_clusters >= 2.""" with pytest.raises(ValueError, match="positive integer"): - MRMRFeatureSelectionTransform(StatisticsRelevanceTable(), False, top_k=-1) + MRMRFeatureSelectionTransform(StatisticsRelevanceTable(), return_ranks=False, top_k=-1) with pytest.raises(ValueError, match="greater than"): - MRMRFeatureSelectionTransform(StatisticsRelevanceTable(), False, top_k=1, n_clusters=1) + MRMRFeatureSelectionTransform(StatisticsRelevanceTable(), return_ranks=False, top_k=1, n_clusters=1) def test_mrmr_fails(ts_with_regressors): """Check that transform doesn't allow you to set n_clusters greater than number of regressors.""" - mrmr = MRMRFeatureSelectionTransform(StatisticsRelevanceTable(), False, top_k=4, freq="D", n_clusters=25) + mrmr = MRMRFeatureSelectionTransform(StatisticsRelevanceTable(), return_ranks=False, top_k=4, n_clusters=25) with pytest.raises(ValueError, match="strictly less than"): mrmr.fit_transform(ts_with_regressors.to_pandas()) From d635cb46360f5d7e58ff7ace77b9d6da2e6a0412 Mon Sep 17 00:00:00 2001 From: Artyom Makhin Date: Mon, 15 Nov 2021 09:16:02 +0300 Subject: [PATCH 6/8] final --- .../test_feature_importance_transform.py | 70 ------------------- 1 file changed, 70 deletions(-) diff --git a/tests/test_transforms/test_feature_importance_transform.py b/tests/test_transforms/test_feature_importance_transform.py index 6bdc87089..f79b1e5f2 100644 --- a/tests/test_transforms/test_feature_importance_transform.py +++ b/tests/test_transforms/test_feature_importance_transform.py @@ -59,76 +59,6 @@ def ts_with_regressors(): @pytest.mark.parametrize( -<<<<<<< HEAD - "relevance_method, clustering_method", - [ - [StatisticsRelevanceTable(), EuclideanClustering()], - [StatisticsRelevanceTable(), DTWClustering()], - ], -) -@pytest.mark.parametrize("top_k", [0, 1, 5, 15, 50]) -def test_mrmr_right_len(relevance_method, clustering_method, top_k, ts_with_regressors): - """Check that transform selects exactly top_k regressors.""" - df = ts_with_regressors.to_pandas() - mrmr = MRMRFeatureSelectionTransform( - relevance_method=relevance_method, - return_ranks=False, - top_k=top_k, - clustering_method=clustering_method, - n_clusters=2, - ) - df_selected = mrmr.fit_transform(df) - all_regressors = ts_with_regressors.regressors - selected_regressors = set() - for column in df_selected.columns.get_level_values("feature"): - if column.startswith("regressor"): - selected_regressors.add(column) - - assert len(selected_regressors) == min(len(all_regressors), top_k) - - -@pytest.mark.parametrize( - "relevance_method, clustering_method", - [ - [StatisticsRelevanceTable(), EuclideanClustering()], - ], -) -def test_mrmr_right_regressors(relevance_method, clustering_method, ts_with_regressors): - """Check that transform selects right top_k regressors.""" - df = ts_with_regressors.to_pandas() - mrmr = MRMRFeatureSelectionTransform( - relevance_method=relevance_method, - return_ranks=False, - top_k=3, - clustering_method=clustering_method, - n_clusters=2, - ) - df_selected = mrmr.fit_transform(df) - selected_regressors = set() - for column in df_selected.columns.get_level_values("feature"): - if column.startswith("regressor"): - selected_regressors.add(column) - assert set(selected_regressors) == set(["regressor_useful_1", "regressor_useful_2", "regressor_useless_9"]) - - -def test_mrmr_fails_negative_parameters(): - """Check that transform doesn't allow you to set top_k to negative values and n_clusters >= 2.""" - with pytest.raises(ValueError, match="positive integer"): - MRMRFeatureSelectionTransform(StatisticsRelevanceTable(), return_ranks=False, top_k=-1) - with pytest.raises(ValueError, match="greater than"): - MRMRFeatureSelectionTransform(StatisticsRelevanceTable(), return_ranks=False, top_k=1, n_clusters=1) - - -def test_mrmr_fails(ts_with_regressors): - """Check that transform doesn't allow you to set n_clusters greater than number of regressors.""" - mrmr = MRMRFeatureSelectionTransform(StatisticsRelevanceTable(), return_ranks=False, top_k=4, n_clusters=25) - with pytest.raises(ValueError, match="strictly less than"): - mrmr.fit_transform(ts_with_regressors.to_pandas()) - - -@pytest.mark.parametrize( -======= ->>>>>>> 505ed5922a6a8fd9555698f152a8b2563a39f951 "model", [ DecisionTreeRegressor(random_state=42), From d0975156119c9b41969274c6096f687da0cf4ffa Mon Sep 17 00:00:00 2001 From: Artyom Makhin Date: Mon, 15 Nov 2021 09:20:37 +0300 Subject: [PATCH 7/8] fix conflicts --- etna/transforms/feature_importance.py | 114 -------------------------- 1 file changed, 114 deletions(-) diff --git a/etna/transforms/feature_importance.py b/etna/transforms/feature_importance.py index fbf1af96f..e05e164d4 100644 --- a/etna/transforms/feature_importance.py +++ b/etna/transforms/feature_importance.py @@ -130,117 +130,3 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: ) result = result.loc[:, pd.IndexSlice[:, selected_columns]] return result -<<<<<<< HEAD - - -class MRMRFeatureSelectionTransform(Transform): - """Transform that selects regressors according to mRMR variable selection method.""" - - def __init__( - self, - relevance_method: RelevanceTable, - return_ranks: bool, - top_k: int, - clustering_method: HierarchicalClustering = EuclideanClustering(), - n_clusters: int = 10, - linkage: str = "average", - **relevance_params, - ): - """ - Init MRMRFeatureSelectionTransform. - - Parameters - ---------- - relevance_method: - method to calculate relevance table - return_ranks: - if False use relevance table else use ranks of relevance table - top_k: - num of regressors to select; if there are not enough regressors, then all will be selected - clustering_method: - method of time series clustering - n_clusters: - number of clusters - linkage: - rule for distance computation for new clusters, allowed "ward", "single", "average", "maximum", "complete" - """ - if not isinstance(top_k, int) or top_k < 0: - raise ValueError("Parameter top_k should be positive integer") - - if not isinstance(n_clusters, int) or n_clusters < 2: - raise ValueError("Parameter n_clusters should be integer and greater than 1") - - self.relevance_method = relevance_method - self.return_ranks = return_ranks - self.clustering = clustering_method - self.n_clusters = n_clusters - self.linkage = linkage - self.top_k = top_k - self.relevance_params = relevance_params - self.selected_regressors: Optional[List[str]] = None - - @staticmethod - def _get_regressors(df: pd.DataFrame) -> List[str]: - """Get list of regressors in the dataframe.""" - result = set() - for column in df.columns.get_level_values("feature"): - if column.startswith("regressor_"): - result.add(column) - return sorted(list(result)) - - def fit(self, df: pd.DataFrame) -> "MRMRFeatureSelectionTransform": - """ - Fit the method and remember features to select. - - Parameters - ---------- - df: - dataframe with all segments data - - Returns - ------- - result: MRMRFeatureSelectionTransform - instance after fitting - """ - if len(self._get_regressors(df)) <= self.n_clusters: - raise ValueError("The number of clusters must be strictly less than the number of regressors") - - ts = TSDataset(df=df, freq=pd.infer_freq(df.index)) - self.clustering.build_distance_matrix(ts=ts) - self.clustering.build_clustering_algo(n_clusters=self.n_clusters, linkage=self.linkage) - s2c = self.clustering.fit_predict() - relevance_table = self.relevance_method( - ts[:, :, "target"], ts[:, :, ts.regressors], return_ranks=self.return_ranks, **self.relevance_params - ) - y = np.empty(len(relevance_table)) - for k, cluster in enumerate(relevance_table.index): - y[k] = s2c[cluster] - self.selected_regressors = mrmr_classif(relevance_table, y, K=self.top_k) - return self - - def transform(self, df: pd.DataFrame) -> pd.DataFrame: - """ - Select top_k regressors. - - Parameters - ---------- - df: - dataframe with all segments data - - Returns - ------- - result: pd.DataFrame - Dataframe with with only selected regressors - """ - result = df.copy() - selected_columns = sorted( - [ - column - for column in df.columns.get_level_values("feature").unique() - if not column.startswith("regressor_") or column in self.selected_regressors - ] - ) - result = result.loc[:, pd.IndexSlice[:, selected_columns]] - return result -======= ->>>>>>> 505ed5922a6a8fd9555698f152a8b2563a39f951 From 42ef92734efe37ec75597900f457ad7537fbc3c2 Mon Sep 17 00:00:00 2001 From: Julia Shenshina Date: Mon, 15 Nov 2021 12:31:13 +0300 Subject: [PATCH 8/8] Upd CHANGELOG --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 57c22e2ee..7b25ede12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +- RelevanceTable returns rank ([#268](https://github.com/tinkoff-ai/etna-ts/pull/268/)) + +### Changed + +### Fixed ## [1.3.1] - 2021-11-12 ### Changed