-
Notifications
You must be signed in to change notification settings - Fork 289
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ADD] scalers from autosklearn (#372)
* Add new scalers * fix flake and mypy * Apply suggestions from code review Co-authored-by: nabenabe0928 <[email protected]> * add robust scaler * fix documentation * remove power transformer from feature preprocessing * fix tests * check for default in include and exclude * Apply suggestions from code review Co-authored-by: nabenabe0928 <[email protected]> Co-authored-by: nabenabe0928 <[email protected]>
- Loading branch information
1 parent
466bc18
commit 2601421
Showing
8 changed files
with
363 additions
and
52 deletions.
There are no files selected for viewing
49 changes: 0 additions & 49 deletions
49
.../components/preprocessing/tabular_preprocessing/feature_preprocessing/PowerTransformer.py
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
38 changes: 38 additions & 0 deletions
38
...Torch/pipeline/components/preprocessing/tabular_preprocessing/scaling/PowerTransformer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
from typing import Any, Dict, Optional, Union | ||
|
||
import numpy as np | ||
|
||
from sklearn.preprocessing import PowerTransformer as SklearnPowerTransformer | ||
|
||
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType | ||
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler | ||
|
||
|
||
class PowerTransformer(BaseScaler):
    """
    Scaler applying the `yeo-johnson` power transform so that numerical
    features become as Gaussian-like as possible, reducing variance and
    skewness. The transform also standardises the data to zero mean and
    unit variance.
    """

    def __init__(self, random_state: Optional[np.random.RandomState] = None):
        super().__init__()
        # Kept for API uniformity with the other scalers; the underlying
        # sklearn transformer is deterministic.
        self.random_state = random_state

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:
        """Validate the fit dictionary and build the sklearn preprocessor.

        Args:
            X: fit dictionary carrying the dataset properties and fit requirements.
            y: unused, present for sklearn-style API compatibility.

        Returns:
            self, with ``self.preprocessor['numerical']`` populated.
        """
        self.check_requirements(X, y)

        # copy=False lets sklearn transform in place and avoid duplicating data.
        self.preprocessor['numerical'] = SklearnPowerTransformer(
            method='yeo-johnson',
            copy=False,
        )
        return self

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        """Return component metadata used by the pipeline for component selection."""
        properties: Dict[str, Union[str, bool]] = {
            'shortname': 'PowerTransformer',
            'name': 'PowerTransformer',
            'handles_sparse': False,
        }
        return properties
73 changes: 73 additions & 0 deletions
73
...ch/pipeline/components/preprocessing/tabular_preprocessing/scaling/QuantileTransformer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
from typing import Any, Dict, Optional, Union | ||
|
||
from ConfigSpace.configuration_space import ConfigurationSpace | ||
from ConfigSpace.hyperparameters import ( | ||
CategoricalHyperparameter, | ||
UniformIntegerHyperparameter | ||
) | ||
|
||
import numpy as np | ||
|
||
from sklearn.preprocessing import QuantileTransformer as SklearnQuantileTransformer | ||
|
||
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType | ||
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler | ||
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter | ||
|
||
|
||
class QuantileTransformer(BaseScaler):
    """
    Scaler mapping the features onto a uniform or a normal distribution
    using quantile information.
    For the meaning of each attribute, see:
    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html
    """

    def __init__(
        self,
        n_quantiles: int = 1000,
        output_distribution: str = "normal",  # Literal["normal", "uniform"]
        random_state: Optional[np.random.RandomState] = None
    ):
        super().__init__()
        self.random_state = random_state
        # Number of quantile landmarks used to estimate the CDF.
        self.n_quantiles = n_quantiles
        # Target marginal distribution of the transformed features.
        self.output_distribution = output_distribution

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:
        """Validate the fit dictionary and build the sklearn preprocessor.

        Args:
            X: fit dictionary carrying the dataset properties and fit requirements.
            y: unused, present for sklearn-style API compatibility.

        Returns:
            self, with ``self.preprocessor['numerical']`` populated.
        """
        self.check_requirements(X, y)

        # copy=False lets sklearn transform in place and avoid duplicating data.
        self.preprocessor['numerical'] = SklearnQuantileTransformer(
            n_quantiles=self.n_quantiles,
            output_distribution=self.output_distribution,
            copy=False,
        )
        return self

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        n_quantiles: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="n_quantiles",
                                                                           value_range=(10, 2000),
                                                                           default_value=1000,
                                                                           ),
        output_distribution: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="output_distribution",
                                                                                   value_range=("uniform", "normal"),
                                                                                   default_value="normal",
                                                                                   )
    ) -> ConfigurationSpace:
        """Build the configuration space for this component's hyperparameters."""
        cs = ConfigurationSpace()

        # TODO parametrize like the Random Forest as n_quantiles = n_features^param
        add_hyperparameter(cs, n_quantiles, UniformIntegerHyperparameter)
        add_hyperparameter(cs, output_distribution, CategoricalHyperparameter)

        return cs

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        """Return component metadata used by the pipeline for component selection."""
        properties: Dict[str, Union[str, bool]] = {
            'shortname': 'QuantileTransformer',
            'name': 'QuantileTransformer',
            'handles_sparse': False,
        }
        return properties
73 changes: 73 additions & 0 deletions
73
autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/RobustScaler.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
from typing import Any, Dict, Optional, Union | ||
|
||
from ConfigSpace.configuration_space import ConfigurationSpace | ||
from ConfigSpace.hyperparameters import ( | ||
UniformFloatHyperparameter, | ||
) | ||
|
||
import numpy as np | ||
|
||
from sklearn.preprocessing import RobustScaler as SklearnRobustScaler | ||
|
||
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType | ||
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler | ||
from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter | ||
|
||
|
||
class RobustScaler(BaseScaler):
    """
    Scaler that removes the median and scales features by the given
    quantile range, making the result robust to outliers.
    For the meaning of each attribute, see:
    https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
    """

    def __init__(
        self,
        q_min: float = 0.25,
        q_max: float = 0.75,
        random_state: Optional[np.random.RandomState] = None
    ):
        super().__init__()
        # Sparsity of the dataset decides whether centering is allowed at fit time.
        self.add_fit_requirements([
            FitRequirement('issparse', (bool,), user_defined=True, dataset_property=True)])
        self.random_state = random_state
        # Lower and upper quantiles defining the scaling range.
        self.q_min = q_min
        self.q_max = q_max

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:
        """Validate the fit dictionary and build the sklearn preprocessor.

        Centering would densify sparse input, so it is disabled whenever the
        dataset is sparse.

        Args:
            X: fit dictionary carrying the dataset properties and fit requirements.
            y: unused, present for sklearn-style API compatibility.

        Returns:
            self, with ``self.preprocessor['numerical']`` populated.
        """
        self.check_requirements(X, y)
        is_sparse = X['dataset_properties']['issparse']
        with_centering = bool(not is_sparse)

        # copy=False lets sklearn transform in place and avoid duplicating data.
        self.preprocessor['numerical'] = SklearnRobustScaler(
            quantile_range=(self.q_min, self.q_max),
            with_centering=with_centering,
            copy=False,
        )

        return self

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        q_min: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="q_min",
                                                                     value_range=(0.001, 0.3),
                                                                     default_value=0.25),
        q_max: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="q_max",
                                                                     value_range=(0.7, 0.999),
                                                                     default_value=0.75)
    ) -> ConfigurationSpace:
        """Build the configuration space for this component's hyperparameters."""
        cs = ConfigurationSpace()

        add_hyperparameter(cs, q_min, UniformFloatHyperparameter)
        add_hyperparameter(cs, q_max, UniformFloatHyperparameter)

        return cs

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        """Return component metadata used by the pipeline for component selection."""
        properties: Dict[str, Union[str, bool]] = {
            'shortname': 'RobustScaler',
            'name': 'RobustScaler',
            'handles_sparse': True,
        }
        return properties
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.