Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ADD] variance thresholding #373

Merged
merged 3 commits into from
Feb 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from typing import Any, Dict, Optional, Union

import numpy as np

from sklearn.feature_selection import VarianceThreshold as SklearnVarianceThreshold

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \
autoPyTorchTabularPreprocessingComponent


class VarianceThreshold(autoPyTorchTabularPreprocessingComponent):
    """
    Drops constant features, i.e. columns whose value never varies across
    the training data (zero variance).
    """

    def __init__(self, random_state: Optional[np.random.RandomState] = None):
        # `random_state` is accepted only to keep the component signature
        # uniform across the pipeline; thresholding is deterministic.
        super().__init__()

    def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> 'VarianceThreshold':
        """Validate the fit dictionary and build the sklearn selector.

        Args:
            X: fit dictionary carrying the training data and dataset properties.
            y: unused, present for scikit-learn API compatibility.

        Returns:
            self, with ``self.preprocessor['numerical']`` set to an (unfitted)
            sklearn ``VarianceThreshold`` with threshold 0.0.
        """
        self.check_requirements(X, y)

        selector = SklearnVarianceThreshold(threshold=0.0)
        self.preprocessor['numerical'] = selector
        return self

    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
        """Register this component's preprocessor in the fit dictionary.

        Raises:
            ValueError: if called before ``fit``.
        """
        if self.preprocessor['numerical'] is None:
            message = "cannot call transform on {} without fitting first."
            raise ValueError(message.format(self.__class__.__name__))
        X['variance_threshold'] = self.preprocessor
        return X

    @staticmethod
    def get_properties(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
    ) -> Dict[str, Union[str, bool]]:
        """Describe this component for the pipeline's component registry."""
        properties: Dict[str, Union[str, bool]] = {
            'shortname': 'Variance Threshold',
            'name': 'Variance Threshold (constant feature removal)',
            'handles_sparse': True,
        }
        return properties
3 changes: 3 additions & 0 deletions autoPyTorch/pipeline/tabular_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
)
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
VarianceThreshold import VarianceThreshold
from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing
from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice
from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent
Expand Down Expand Up @@ -307,6 +309,7 @@ def _get_pipeline_steps(

steps.extend([
("imputer", SimpleImputer(random_state=self.random_state)),
("variance_threshold", VarianceThreshold(random_state=self.random_state)),
("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,
Expand Down
3 changes: 3 additions & 0 deletions autoPyTorch/pipeline/tabular_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
)
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
VarianceThreshold import VarianceThreshold
from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing
from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice
from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent
Expand Down Expand Up @@ -257,6 +259,7 @@ def _get_pipeline_steps(

steps.extend([
("imputer", SimpleImputer(random_state=self.random_state)),
("variance_threshold", VarianceThreshold(random_state=self.random_state)),
("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,
Expand Down
3 changes: 3 additions & 0 deletions test/test_pipeline/components/preprocessing/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import EncoderChoice
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
VarianceThreshold import VarianceThreshold
from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline


Expand All @@ -28,6 +30,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]],

steps.extend([
("imputer", SimpleImputer()),
("variance_threshold", VarianceThreshold()),
("encoder", EncoderChoice(default_dataset_properties)),
("scaler", ScalerChoice(default_dataset_properties)),
("tabular_transformer", TabularColumnTransformer()),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import numpy as np
from numpy.testing import assert_array_equal


from sklearn.base import BaseEstimator
from sklearn.compose import make_column_transformer

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
VarianceThreshold import VarianceThreshold


def test_variance_threshold():
    """Fitting on a split where column 2 is constant must drop that column."""
    full_data = np.array([[1, 2, 1],
                          [7, 8, 9],
                          [4, 5, 1],
                          [11, 12, 1],
                          [17, 18, 19],
                          [14, 15, 16]])
    numerical_columns = [0, 1, 2]
    train_indices = np.array([0, 2, 3])
    test_indices = np.array([1, 4, 5])
    dataset_properties = {
        'categorical_columns': [],
        'numerical_columns': numerical_columns,
    }
    fit_dict = {
        'X_train': full_data[train_indices],
        'dataset_properties': dataset_properties,
    }

    component = VarianceThreshold().fit(fit_dict)
    fit_dict = component.transform(fit_dict)
    variance_threshold = fit_dict['variance_threshold']['numerical']

    # the fit dictionary must now expose the preprocessor under its key
    assert isinstance(fit_dict['variance_threshold'], dict)
    assert isinstance(variance_threshold, BaseEstimator)

    # wrap the returned selector in a column transformer and fit on train data
    transformer = make_column_transformer(
        (variance_threshold, fit_dict['dataset_properties']['numerical_columns']),
        remainder='passthrough',
    ).fit(fit_dict['X_train'])
    transformed = transformer.transform(full_data[test_indices])

    # column 2 is constant on the training rows, so only columns 0 and 1 survive
    expected = np.array([[7, 8],
                         [17, 18],
                         [14, 15]])
    assert_array_equal(transformed, expected)