Skip to content

Commit

Permalink
[feat] Add coalescer (automl#376)
Browse files Browse the repository at this point in the history
* [fix] Add the dataset check to transform as well, for the test dataset, which does not require fit
* [test] Migrate tests from Francisco's PR without modifications
* [fix] Modify so that tests pass
* [test] Increase the coverage
  • Loading branch information
nabenabe0928 authored and ravinkohli committed Apr 12, 2022
1 parent d9fa7b2 commit 6c2e54d
Show file tree
Hide file tree
Showing 13 changed files with 730 additions and 1 deletion.
16 changes: 16 additions & 0 deletions autoPyTorch/configs/greedy_portfolio.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
[{"data_loader:batch_size": 60,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
Expand Down Expand Up @@ -30,6 +31,7 @@
"network_backbone:ShapedMLPBackbone:max_dropout": 0.023271935735825866},
{"data_loader:batch_size": 255,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
Expand Down Expand Up @@ -63,6 +65,7 @@
"network_backbone:ShapedResNetBackbone:max_dropout": 0.7662454727603789},
{"data_loader:batch_size": 165,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
Expand Down Expand Up @@ -93,6 +96,7 @@
"network_head:fully_connected:units_layer_1": 128},
{"data_loader:batch_size": 299,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
Expand Down Expand Up @@ -124,6 +128,7 @@
"network_head:fully_connected:units_layer_1": 128},
{"data_loader:batch_size": 183,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
Expand Down Expand Up @@ -157,6 +162,7 @@
"network_backbone:ShapedResNetBackbone:max_dropout": 0.27204101593048097},
{"data_loader:batch_size": 21,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
Expand Down Expand Up @@ -185,6 +191,7 @@
"network_head:fully_connected:units_layer_1": 128},
{"data_loader:batch_size": 159,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
Expand Down Expand Up @@ -214,6 +221,7 @@
"network_head:fully_connected:units_layer_1": 128},
{"data_loader:batch_size": 442,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
Expand Down Expand Up @@ -246,6 +254,7 @@
"network_head:fully_connected:units_layer_1": 128},
{"data_loader:batch_size": 140,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
Expand Down Expand Up @@ -278,6 +287,7 @@
"network_head:fully_connected:units_layer_1": 128},
{"data_loader:batch_size": 48,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
Expand Down Expand Up @@ -305,6 +315,7 @@
"network_head:fully_connected:units_layer_1": 128},
{"data_loader:batch_size": 168,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
Expand Down Expand Up @@ -337,6 +348,7 @@
"network_backbone:ShapedResNetBackbone:max_dropout": 0.8992826006547855},
{"data_loader:batch_size": 21,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
Expand Down Expand Up @@ -365,6 +377,7 @@
"network_head:fully_connected:units_layer_1": 128},
{"data_loader:batch_size": 163,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
Expand Down Expand Up @@ -397,6 +410,7 @@
"network_backbone:ShapedResNetBackbone:max_dropout": 0.6341848343636569},
{"data_loader:batch_size": 150,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
Expand Down Expand Up @@ -430,6 +444,7 @@
"network_backbone:ShapedResNetBackbone:max_dropout": 0.7133813761319248},
{"data_loader:batch_size": 151,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
Expand Down Expand Up @@ -459,6 +474,7 @@
"network_head:fully_connected:units_layer_1": 128},
{"data_loader:batch_size": 42,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from typing import Any, Dict, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter

import numpy as np

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter
from autoPyTorch.utils.implementations import MinorityCoalesceTransformer


class MinorityCoalescer(BaseCoalescer):
    """Group together categories whose occurrence is less than ``min_frac``.

    The grouping itself is delegated to a ``MinorityCoalesceTransformer``
    that is created in ``fit`` and registered as the categorical
    preprocessor of this component.

    Args:
        min_frac (float):
            Threshold forwarded to ``MinorityCoalesceTransformer``.
        random_state (np.random.RandomState):
            Random state stored on the instance.
    """

    def __init__(self, min_frac: float, random_state: np.random.RandomState):
        super().__init__()
        self.min_frac = min_frac
        self.random_state = random_state

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer:
        """
        Check the requirements and set up the categorical transformer.

        Args:
            X (Dict[str, Any]):
                fit dictionary
            y (Any):
                Only forwarded to ``check_requirements``.

        Returns:
            instance of self
        """
        self.check_requirements(X, y)

        # Only the categorical slot is populated by this coalescer.
        coalesce_transformer = MinorityCoalesceTransformer(min_frac=self.min_frac)
        self.preprocessor['categorical'] = coalesce_transformer
        return self

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, Any]] = None,
        min_frac: HyperparameterSearchSpace = HyperparameterSearchSpace(
            hyperparameter='min_frac',
            value_range=(1e-4, 0.5),
            default_value=1e-2,
        ),
    ) -> ConfigurationSpace:
        """
        Build the configuration space of this component.

        Args:
            dataset_properties (Optional[Dict[str, Any]]):
                Not used by this component.
            min_frac (HyperparameterSearchSpace):
                Search space of ``min_frac``; by default the range
                (1e-4, 0.5) with a default value of 1e-2.

        Returns:
            ConfigurationSpace holding ``min_frac``, added as a
            ``UniformFloatHyperparameter``.
        """
        search_space = ConfigurationSpace()
        add_hyperparameter(search_space, min_frac, UniformFloatHyperparameter)
        return search_space

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
        """Return the static properties (names and sparse support) of this component."""
        properties: Dict[str, Union[str, bool]] = dict(
            shortname='MinorityCoalescer',
            name='MinorityCoalescer',
            handles_sparse=False,
        )
        return properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from typing import Any, Dict, Optional, Union

import numpy as np

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer


class NoCoalescer(BaseCoalescer):
    """Coalescer choice that performs no coalescing.

    Args:
        random_state (np.random.RandomState):
            Random state stored on the instance.
    """

    def __init__(self, random_state: np.random.RandomState):
        super().__init__()
        self.random_state = random_state
        # NOTE(review): presumably tells the base class that this component
        # does no processing -- confirm against BaseCoalescer.
        self._processing = False

    def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseCoalescer:
        """
        Only check the requirements, since no coalescing takes place.

        Args:
            X (Dict[str, Any]):
                fit dictionary
            y (Optional[Any]):
                Parameter to comply with scikit-learn API. Not used.

        Returns:
            instance of self
        """
        self.check_requirements(X, y)
        return self

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
        """Return the static properties (names and sparse support) of this component."""
        properties: Dict[str, Union[str, bool]] = dict(
            shortname='NoCoalescer',
            name='NoCoalescer',
            handles_sparse=True,
        )
        return properties
Loading

0 comments on commit 6c2e54d

Please sign in to comment.