Skip to content

Commit

Permalink
[feat] Add coalescer
Browse files Browse the repository at this point in the history
* [fix] Add check dataset in transform as well for test dataset, which does not require fit
* [test] Migrate tests from Francisco's PR without modifications
* [fix] Modify so that tests pass
  • Loading branch information
nabenabe0928 committed Feb 9, 2022
1 parent 2601421 commit 3efadc3
Show file tree
Hide file tree
Showing 12 changed files with 666 additions and 1 deletion.
16 changes: 16 additions & 0 deletions autoPyTorch/configs/greedy_portfolio.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
[{"data_loader:batch_size": 60,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
Expand Down Expand Up @@ -31,6 +32,7 @@
"network_backbone:ShapedMLPBackbone:max_dropout": 0.023271935735825866},
{"data_loader:batch_size": 255,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
Expand Down Expand Up @@ -65,6 +67,7 @@
"network_backbone:ShapedResNetBackbone:max_dropout": 0.7662454727603789},
{"data_loader:batch_size": 165,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
Expand Down Expand Up @@ -96,6 +99,7 @@
"network_head:fully_connected:units_layer_1": 128},
{"data_loader:batch_size": 299,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
Expand Down Expand Up @@ -128,6 +132,7 @@
"network_head:fully_connected:units_layer_1": 128},
{"data_loader:batch_size": 183,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
Expand Down Expand Up @@ -162,6 +167,7 @@
"network_backbone:ShapedResNetBackbone:max_dropout": 0.27204101593048097},
{"data_loader:batch_size": 21,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
Expand Down Expand Up @@ -191,6 +197,7 @@
"network_head:fully_connected:units_layer_1": 128},
{"data_loader:batch_size": 159,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
Expand Down Expand Up @@ -221,6 +228,7 @@
"network_head:fully_connected:units_layer_1": 128},
{"data_loader:batch_size": 442,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
Expand Down Expand Up @@ -254,6 +262,7 @@
"network_head:fully_connected:units_layer_1": 128},
{"data_loader:batch_size": 140,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
Expand Down Expand Up @@ -287,6 +296,7 @@
"network_head:fully_connected:units_layer_1": 128},
{"data_loader:batch_size": 48,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
Expand Down Expand Up @@ -315,6 +325,7 @@
"network_head:fully_connected:units_layer_1": 128},
{"data_loader:batch_size": 168,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
Expand Down Expand Up @@ -348,6 +359,7 @@
"network_backbone:ShapedResNetBackbone:max_dropout": 0.8992826006547855},
{"data_loader:batch_size": 21,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
Expand Down Expand Up @@ -377,6 +389,7 @@
"network_head:fully_connected:units_layer_1": 128},
{"data_loader:batch_size": 163,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
Expand Down Expand Up @@ -410,6 +423,7 @@
"network_backbone:ShapedResNetBackbone:max_dropout": 0.6341848343636569},
{"data_loader:batch_size": 150,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
Expand Down Expand Up @@ -444,6 +458,7 @@
"network_backbone:ShapedResNetBackbone:max_dropout": 0.7133813761319248},
{"data_loader:batch_size": 151,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
Expand Down Expand Up @@ -474,6 +489,7 @@
"network_head:fully_connected:units_layer_1": 128},
{"data_loader:batch_size": 42,
"encoder:__choice__": "OneHotEncoder",
"coalescer:__choice__": "NoCoalescer",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from typing import Any, Dict, Optional, Union

import numpy as np

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer
from autoPyTorch.utils.implementations import MinorityCoalesceTransformer


class MinorityCoalescer(BaseCoalescer):
    """
    Group together categories whose occurrence is less than a specified min_fraction.

    Args:
        min_fraction (float):
            Occurrence threshold. It is forwarded unchanged to
            MinorityCoalesceTransformer when fit is called.
        random_state (np.random.RandomState):
            Stored on the instance; not used by the code in this class.
    """

    def __init__(self, min_fraction: float, random_state: np.random.RandomState):
        # Run the base initialiser first so the attributes below are set
        # on top of whatever state it prepares.
        super().__init__()
        self.random_state = random_state
        self.min_fraction = min_fraction

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer:
        """
        Check the requirements and register the coalescing transformer.

        Args:
            X (Dict[str, Any]):
                fit dictionary
            y (Any):
                Only forwarded to check_requirements.

        Returns:
            instance of self
        """
        # Validate before building anything, so an invalid fit dictionary
        # leaves self.preprocessor untouched.
        self.check_requirements(X, y)

        categorical_transformer = MinorityCoalesceTransformer(min_fraction=self.min_fraction)
        self.preprocessor['categorical'] = categorical_transformer
        return self

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
        """
        Describe this component.

        Args:
            dataset_properties (Optional[Dict[str, Any]]):
                Accepted but not read by this method.

        Returns:
            Dict[str, Union[str, bool]]:
                The short name, the name and the sparse-support flag of the component.
        """
        properties: Dict[str, Union[str, bool]] = {
            'shortname': 'MinorityCoalescer',
            'name': 'MinorityCoalescer',
            'handles_sparse': False,
        }
        return properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from typing import Any, Dict, Optional, Union

import numpy as np

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer


class NoCoalescer(BaseCoalescer):
    """
    Coalescer choice that performs no coalescing.

    Args:
        random_state (np.random.RandomState):
            Stored on the instance; not used by the code in this class.
    """

    def __init__(self, random_state: np.random.RandomState):
        super().__init__()
        # Assigned after the base initialiser so this value is the one that sticks.
        # NOTE(review): presumably this flag tells the pipeline to skip the step --
        # confirm against BaseCoalescer.
        self._processing = False
        self.random_state = random_state

    def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseCoalescer:
        """
        Only check the requirements, since there is nothing to coalesce.

        Args:
            X (Dict[str, Any]):
                fit dictionary
            y (Optional[Any]):
                Parameter to comply with scikit-learn API. Not used.

        Returns:
            instance of self
        """
        self.check_requirements(X, y)
        return self

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
        """
        Describe this component.

        Args:
            dataset_properties (Optional[Dict[str, Any]]):
                Accepted but not read by this method.

        Returns:
            Dict[str, Union[str, bool]]:
                The short name, the name and the sparse-support flag of the component.
        """
        properties: Dict[str, Union[str, bool]] = {
            'shortname': 'NoCoalescer',
            'name': 'NoCoalescer',
            'handles_sparse': True,
        }
        return properties
Loading

0 comments on commit 3efadc3

Please sign in to comment.