Skip to content

add ranks #268

Merged
merged 10 commits into from
Nov 15, 2021
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 17 additions & 3 deletions etna/analysis/feature_relevance/relevance.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from abc import abstractmethod

import pandas as pd
import scipy.stats

from etna.analysis.feature_relevance.relevance_table import get_model_relevance_table
from etna.analysis.feature_relevance.relevance_table import get_statistics_relevance_table
Expand All @@ -21,8 +22,15 @@ def __init__(self, greater_is_better: bool):
"""
self.greater_is_better = greater_is_better

def _get_ranks(self, table: pd.DataFrame) -> pd.DataFrame:
"""Compute rank relevance table from relevance table."""
if self.greater_is_better:
table *= -1
rank_table = pd.DataFrame(scipy.stats.rankdata(table, axis=1), columns=table.columns, index=table.index)
return rank_table.astype(int)

@abstractmethod
def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, **kwargs) -> pd.DataFrame:
def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, return_ranks: bool, **kwargs) -> pd.DataFrame:
"""Compute relevance table.
For each series in df compute relevance of corresponding series in df_exog.

Expand All @@ -32,6 +40,8 @@ def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, **kwargs) -> pd.Data
dataframe with series that will be used as target
df_exog:
dataframe with series to compute relevance for df
return_ranks:
if False return relevance values else return ranks of relevance values

Returns
-------
Expand All @@ -47,9 +57,11 @@ class StatisticsRelevanceTable(RelevanceTable):
def __init__(self):
super().__init__(greater_is_better=False)

def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, return_ranks: bool = False, **kwargs) -> pd.DataFrame:
    """Compute feature relevance table with etna.analysis.get_statistics_relevance_table method.

    Parameters
    ----------
    df:
        dataframe with series that will be used as target
    df_exog:
        dataframe with series to compute relevance for df
    return_ranks:
        if False return relevance values else return ranks of relevance values;
        defaults to False so calls written before this parameter existed keep working
    **kwargs:
        accepted for interface compatibility; not forwarded to the statistics computation

    Returns
    -------
    pd.DataFrame
        relevance table, or the rank table built from it when ``return_ranks`` is True
    """
    table = get_statistics_relevance_table(df=df, df_exog=df_exog)
    return self._get_ranks(table) if return_ranks else table


Expand All @@ -59,7 +71,9 @@ class ModelRelevanceTable(RelevanceTable):
def __init__(self):
super().__init__(greater_is_better=True)

def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, **kwargs) -> pd.DataFrame:
def __call__(self, df: pd.DataFrame, df_exog: pd.DataFrame, return_ranks: bool, **kwargs) -> pd.DataFrame:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't we add `False` as the default value for `return_ranks`?

"""Compute feature relevance table with etna.analysis.get_model_relevance_table method."""
table = get_model_relevance_table(df=df, df_exog=df_exog, **kwargs)
if return_ranks:
return self._get_ranks(table)
return table
8 changes: 7 additions & 1 deletion etna/transforms/feature_importance.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ class MRMRFeatureSelectionTransform(Transform):
def __init__(
self,
relevance_method: RelevanceTable,
return_ranks: bool,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`return_ranks` sounds like you are going to return something like {regressor: rank} for the chosen top_k, doesn't it? How about `use_rank`?
Also, maybe let's make it the last argument (or at least place it after the `top_k` argument)?

top_k: int,
clustering_method: HierarchicalClustering = EuclideanClustering(),
n_clusters: int = 10,
Expand All @@ -155,6 +156,8 @@ def __init__(
----------
relevance_method:
method to calculate relevance table
return_ranks:
if False use relevance table else use ranks of relevance table
top_k:
num of regressors to select; if there are not enough regressors, then all will be selected
clustering_method:
Expand All @@ -171,6 +174,7 @@ def __init__(
raise ValueError("Parameter n_clusters should be integer and greater than 1")

self.relevance_method = relevance_method
self.return_ranks = return_ranks
self.clustering = clustering_method
self.n_clusters = n_clusters
self.linkage = linkage
Expand Down Expand Up @@ -208,7 +212,9 @@ def fit(self, df: pd.DataFrame) -> "MRMRFeatureSelectionTransform":
self.clustering.build_distance_matrix(ts=ts)
self.clustering.build_clustering_algo(n_clusters=self.n_clusters, linkage=self.linkage)
s2c = self.clustering.fit_predict()
relevance_table = self.relevance_method(ts[:, :, "target"], ts[:, :, ts.regressors], **self.relevance_params)
relevance_table = self.relevance_method(
alex-hse-repository marked this conversation as resolved.
Show resolved Hide resolved
ts[:, :, "target"], ts[:, :, ts.regressors], return_ranks=self.return_ranks, **self.relevance_params
)
y = np.empty(len(relevance_table))
for k, cluster in enumerate(relevance_table.index):
y[k] = s2c[cluster]
Expand Down
20 changes: 18 additions & 2 deletions tests/test_analysis/test_feature_relevance/test_relevance.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pytest
from sklearn.tree import DecisionTreeRegressor

from etna.analysis.feature_relevance import ModelRelevanceTable
Expand All @@ -8,11 +9,26 @@ def test_statistics_relevance_table(simple_df_relevance):
rt = StatisticsRelevanceTable()
assert not rt.greater_is_better
df, df_exog = simple_df_relevance
assert rt(df=df, df_exog=df_exog).shape == (2, 2)
assert rt(df=df, df_exog=df_exog, return_ranks=False).shape == (2, 2)


def test_model_relevance_table(simple_df_relevance):
    """Check that ModelRelevanceTable treats greater values as better and returns a (2, 2) table."""
    rt = ModelRelevanceTable()
    assert rt.greater_is_better
    df, df_exog = simple_df_relevance
    # Only the call that passes return_ranks is kept; the older form does not match the new signature.
    table = rt(df=df, df_exog=df_exog, return_ranks=False, model=DecisionTreeRegressor())
    assert table.shape == (2, 2)


@pytest.mark.parametrize(
    "greater_is_better,answer",
    ((True, [1, 2, 2, 1]), (False, [2, 1, 1, 2])),
)
def test_relevance_table_ranks(greater_is_better, answer, simple_df_relevance):
    """Check the ranks returned by ModelRelevanceTable for both values of greater_is_better."""
    target_df, exog_df = simple_df_relevance
    relevance = ModelRelevanceTable()
    relevance.greater_is_better = greater_is_better
    ranks = relevance(df=target_df, df_exog=exog_df, return_ranks=True, model=DecisionTreeRegressor())
    # (column, row) cells listed in the same order as the values in ``answer``.
    cells = (
        ("regressor_1", "1"),
        ("regressor_2", "1"),
        ("regressor_1", "2"),
        ("regressor_2", "2"),
    )
    for (column, row), expected in zip(cells, answer):
        assert ranks[column][row] == expected
10 changes: 5 additions & 5 deletions tests/test_transforms/test_feature_importance_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def ts_with_regressors():
def test_mrmr_right_len(relevance_method, clustering_method, top_k, ts_with_regressors):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is better to use keyword arguments; please fix this throughout this file.

"""Check that transform selects exactly top_k regressors."""
df = ts_with_regressors.to_pandas()
mrmr = MRMRFeatureSelectionTransform(relevance_method, top_k, clustering_method, n_clusters=2)
mrmr = MRMRFeatureSelectionTransform(relevance_method, False, top_k, clustering_method, n_clusters=2)
df_selected = mrmr.fit_transform(df)
all_regressors = ts_with_regressors.regressors
selected_regressors = set()
Expand All @@ -93,7 +93,7 @@ def test_mrmr_right_len(relevance_method, clustering_method, top_k, ts_with_regr
def test_mrmr_right_regressors(relevance_method, clustering_method, ts_with_regressors):
"""Check that transform selects right top_k regressors."""
df = ts_with_regressors.to_pandas()
mrmr = MRMRFeatureSelectionTransform(relevance_method, 3, clustering_method, n_clusters=2)
mrmr = MRMRFeatureSelectionTransform(relevance_method, False, 3, clustering_method, n_clusters=2)
df_selected = mrmr.fit_transform(df)
selected_regressors = set()
for column in df_selected.columns.get_level_values("feature"):
Expand All @@ -105,14 +105,14 @@ def test_mrmr_right_regressors(relevance_method, clustering_method, ts_with_regr
def test_mrmr_fails_negative_parameters():
    """Check that transform rejects a negative top_k and an n_clusters value that is not greater than 1."""
    # Keyword arguments keep the calls readable and independent of the constructor's parameter order.
    with pytest.raises(ValueError, match="positive integer"):
        MRMRFeatureSelectionTransform(relevance_method=StatisticsRelevanceTable(), return_ranks=False, top_k=-1)
    with pytest.raises(ValueError, match="greater than"):
        MRMRFeatureSelectionTransform(
            relevance_method=StatisticsRelevanceTable(), return_ranks=False, top_k=1, n_clusters=1
        )


def test_mrmr_fails(ts_with_regressors):
    """Check that transform doesn't allow you to set n_clusters greater than number of regressors."""
    # Keyword arguments keep the call readable and independent of the constructor's parameter order.
    mrmr = MRMRFeatureSelectionTransform(
        relevance_method=StatisticsRelevanceTable(), return_ranks=False, top_k=4, freq="D", n_clusters=25
    )
    with pytest.raises(ValueError, match="strictly less than"):
        mrmr.fit_transform(ts_with_regressors.to_pandas())

Expand Down