
[ADD] scalers from autosklearn (#372)
* Add new scalers

* fix flake and mypy

* Apply suggestions from code review

Co-authored-by: nabenabe0928 <[email protected]>

* add robust scaler

* fix documentation

* remove power transformer from feature preprocessing

* fix tests

* check for default in include and exclude

* Apply suggestions from code review

Co-authored-by: nabenabe0928 <[email protected]>

Co-authored-by: nabenabe0928 <[email protected]>
ravinkohli and nabenabe0928 authored Feb 9, 2022
1 parent 466bc18 commit 2601421
Showing 8 changed files with 363 additions and 52 deletions.

This file was deleted (per the commit message, the PowerTransformer feature-preprocessing component; its contents are not shown here).

Expand Up @@ -72,7 +72,6 @@ def get_hyperparameter_search_space(self,
'RandomKitchenSinks',
'Nystroem',
'PolynomialFeatures',
'PowerTransformer',
'TruncatedSVD',
]
for default_ in defaults:
@@ -0,0 +1,38 @@
from typing import Any, Dict, Optional, Union

import numpy as np

from sklearn.preprocessing import PowerTransformer as SklearnPowerTransformer

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler


class PowerTransformer(BaseScaler):
    """
    Map data to as close to a Gaussian distribution as possible
    in order to reduce variance and minimize skewness.
    Uses `yeo-johnson` power transform method. Also, data is normalised
    to zero mean and unit variance.
    """
    def __init__(self,
                 random_state: Optional[np.random.RandomState] = None):
        super().__init__()
        self.random_state = random_state

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:

        self.check_requirements(X, y)

        self.preprocessor['numerical'] = SklearnPowerTransformer(method='yeo-johnson', copy=False)
        return self

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        return {
            'shortname': 'PowerTransformer',
            'name': 'PowerTransformer',
            'handles_sparse': False
        }
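
To see what fit() wires up, here is a minimal standalone sketch (not part of the commit): scikit-learn's yeo-johnson transform on made-up right-skewed data, which also standardises the output to zero mean and unit variance as the docstring describes.

import numpy as np
from sklearn.preprocessing import PowerTransformer as SklearnPowerTransformer

rng = np.random.RandomState(0)
skewed = rng.exponential(scale=2.0, size=(100, 3))  # right-skewed toy features

# Same configuration the component builds in fit(), except copy=True
# so the toy input is left untouched.
transformer = SklearnPowerTransformer(method='yeo-johnson', copy=True)
transformed = transformer.fit_transform(skewed)

print(transformed.mean(axis=0).round(6))  # approximately 0 per feature
print(transformed.std(axis=0).round(6))   # approximately 1 per feature
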
@@ -0,0 +1,73 @@
from typing import Any, Dict, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    CategoricalHyperparameter,
    UniformIntegerHyperparameter
)

import numpy as np

from sklearn.preprocessing import QuantileTransformer as SklearnQuantileTransformer

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter


class QuantileTransformer(BaseScaler):
    """
    Transform the features to follow a uniform or a normal distribution
    using quantiles information.
    For more details of each attribute, see:
    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html
    """
    def __init__(
        self,
        n_quantiles: int = 1000,
        output_distribution: str = "normal",  # Literal["normal", "uniform"]
        random_state: Optional[np.random.RandomState] = None
    ):
        super().__init__()
        self.random_state = random_state
        self.n_quantiles = n_quantiles
        self.output_distribution = output_distribution

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:

        self.check_requirements(X, y)

        self.preprocessor['numerical'] = SklearnQuantileTransformer(n_quantiles=self.n_quantiles,
                                                                    output_distribution=self.output_distribution,
                                                                    copy=False)
        return self

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        n_quantiles: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="n_quantiles",
                                                                           value_range=(10, 2000),
                                                                           default_value=1000,
                                                                           ),
        output_distribution: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="output_distribution",
                                                                                   value_range=("uniform", "normal"),
                                                                                   default_value="normal",
                                                                                   )
    ) -> ConfigurationSpace:
        cs = ConfigurationSpace()

        # TODO parametrize like the Random Forest as n_quantiles = n_features^param
        add_hyperparameter(cs, n_quantiles, UniformIntegerHyperparameter)
        add_hyperparameter(cs, output_distribution, CategoricalHyperparameter)

        return cs

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        return {
            'shortname': 'QuantileTransformer',
            'name': 'QuantileTransformer',
            'handles_sparse': False
        }
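
As a quick illustration of the two output_distribution choices exposed above, this standalone sketch (scikit-learn only, toy data made up) maps heavy-tailed features to a uniform and to a normal target. One caveat worth knowing: scikit-learn clips n_quantiles to the number of samples with a warning if it is larger.

import numpy as np
from sklearn.preprocessing import QuantileTransformer as SklearnQuantileTransformer

rng = np.random.RandomState(0)
X = rng.lognormal(size=(500, 2))  # heavy-tailed toy features

uniform = SklearnQuantileTransformer(n_quantiles=500,
                                     output_distribution='uniform').fit_transform(X)
normal = SklearnQuantileTransformer(n_quantiles=500,
                                    output_distribution='normal').fit_transform(X)

print(uniform.min().round(3), uniform.max().round(3))  # mapped into [0, 1]
print(normal.mean(axis=0).round(2))                    # roughly standard normal
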
@@ -0,0 +1,73 @@
from typing import Any, Dict, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    UniformFloatHyperparameter,
)

import numpy as np

from sklearn.preprocessing import RobustScaler as SklearnRobustScaler

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler
from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter


class RobustScaler(BaseScaler):
    """
    Remove the median and scale features according to the quantile_range to make
    the features robust to outliers.
    For more details of the preprocessor, see:
    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
    """
    def __init__(
        self,
        q_min: float = 0.25,
        q_max: float = 0.75,
        random_state: Optional[np.random.RandomState] = None
    ):
        super().__init__()
        self.add_fit_requirements([
            FitRequirement('issparse', (bool,), user_defined=True, dataset_property=True)])
        self.random_state = random_state
        self.q_min = q_min
        self.q_max = q_max

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:

        self.check_requirements(X, y)
        with_centering = bool(not X['dataset_properties']['issparse'])

        self.preprocessor['numerical'] = SklearnRobustScaler(quantile_range=(self.q_min, self.q_max),
                                                             with_centering=with_centering,
                                                             copy=False)

        return self

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        q_min: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="q_min",
                                                                     value_range=(0.001, 0.3),
                                                                     default_value=0.25),
        q_max: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="q_max",
                                                                     value_range=(0.7, 0.999),
                                                                     default_value=0.75)
    ) -> ConfigurationSpace:
        cs = ConfigurationSpace()

        add_hyperparameter(cs, q_min, UniformFloatHyperparameter)
        add_hyperparameter(cs, q_max, UniformFloatHyperparameter)

        return cs

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        return {
            'shortname': 'RobustScaler',
            'name': 'RobustScaler',
            'handles_sparse': True
        }
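
The with_centering switch in fit() exists because centering would densify sparse input, which scikit-learn's RobustScaler refuses to do. A small sketch of both paths on made-up data (scipy assumed available); note that scikit-learn documents quantile_range in percent, e.g. (25.0, 75.0) for the interquartile range.

import numpy as np
from scipy import sparse
from sklearn.preprocessing import RobustScaler as SklearnRobustScaler

X_dense = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 1000.0]])
X_sparse = sparse.csr_matrix(X_dense)

# Dense input: median removal plus quantile-range scaling.
dense_scaled = SklearnRobustScaler(quantile_range=(25.0, 75.0),
                                   with_centering=True).fit_transform(X_dense)

# Sparse input: centering disabled, scaling only, sparsity preserved.
sparse_scaled = SklearnRobustScaler(quantile_range=(25.0, 75.0),
                                    with_centering=False).fit_transform(X_sparse)
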
@@ -66,9 +66,21 @@ def get_hyperparameter_search_space(self,
             raise ValueError("no scalers found, please add a scaler")
 
         if default is None:
-            defaults = ['StandardScaler', 'Normalizer', 'MinMaxScaler', 'NoScaler']
+            defaults = [
+                'StandardScaler',
+                'Normalizer',
+                'MinMaxScaler',
+                'PowerTransformer',
+                'QuantileTransformer',
+                'RobustScaler',
+                'NoScaler'
+            ]
             for default_ in defaults:
                 if default_ in available_scalers:
+                    if include is not None and default_ not in include:
+                        continue
+                    if exclude is not None and default_ in exclude:
+                        continue
                     default = default_
                     break
 
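The include/exclude guard added above makes the fallback default respect the user's component filters. Here is a standalone sketch of the rule with illustrative names (pick_default is hypothetical, not from the repository): the first candidate that is available, included (if a whitelist is given), and not excluded wins.

from typing import List, Optional

def pick_default(candidates: List[str],
                 available: List[str],
                 include: Optional[List[str]] = None,
                 exclude: Optional[List[str]] = None) -> Optional[str]:
    # Walk the preference-ordered candidates; skip any that are
    # unavailable, filtered out by include, or listed in exclude.
    for candidate in candidates:
        if candidate not in available:
            continue
        if include is not None and candidate not in include:
            continue
        if exclude is not None and candidate in exclude:
            continue
        return candidate
    return None

print(pick_default(['StandardScaler', 'Normalizer'],
                   ['StandardScaler', 'Normalizer'],
                   exclude=['StandardScaler']))  # -> Normalizer
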
@@ -20,7 +20,7 @@ def random_state():
     return 11
 
 
-@pytest.fixture(params=['TruncatedSVD', 'PolynomialFeatures', 'PowerTransformer',
+@pytest.fixture(params=['TruncatedSVD', 'PolynomialFeatures',
                         'Nystroem', 'KernelPCA', 'RandomKitchenSinks'])
 def preprocessor(request):
     return request.param