
[ADD] Minority Coalescer #242

Closed
wants to merge 10 commits
42 changes: 21 additions & 21 deletions autoPyTorch/api/base_task.py
@@ -1072,11 +1072,17 @@ def refit(
         self
         """

+        self.dataset_name = dataset.dataset_name
+
         if self._logger is None:
             self._logger = self._get_logger(str(self.dataset_name))

+        if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None:
+            self._load_models()
+
+        # Refit is not applicable when ensemble_size is set to zero.
+        if self.ensemble_ is None:
+            raise ValueError("Refit can only be called if 'ensemble_size != 0'")
+
-        self.dataset_name = dataset.dataset_name
         dataset_requirements = get_dataset_requirements(
             info=dataset.get_required_dataset_info(),
             include=self.include_components,
@@ -1085,26 +1091,20 @@
         dataset_properties = dataset.get_dataset_properties(dataset_requirements)
         self._backend.save_datamanager(dataset)

-        X: Dict[str, Any] = dict({'dataset_properties': dataset_properties,
-                                  'backend': self._backend,
-                                  'X_train': dataset.train_tensors[0],
-                                  'y_train': dataset.train_tensors[1],
-                                  'X_test': dataset.test_tensors[0] if dataset.test_tensors is not None else None,
-                                  'y_test': dataset.test_tensors[1] if dataset.test_tensors is not None else None,
-                                  'train_indices': dataset.splits[split_id][0],
-                                  'val_indices': dataset.splits[split_id][1],
-                                  'split_id': split_id,
-                                  'num_run': self._backend.get_next_num_run(),
-                                  })
-        X.update({**self.pipeline_options, **budget_config})
-        if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None:
-            self._load_models()
-
-        # Refit is not applicable when ensemble_size is set to zero.
-        if self.ensemble_ is None:
-            raise ValueError("Refit can only be called if 'ensemble_size != 0'")
-
         for identifier in self.models_:
+            X: Dict[str, Any] = dict({'dataset_properties': dataset_properties,
+                                      'backend': self._backend,
+                                      'X_train': dataset.train_tensors[0].copy(),
+                                      'y_train': dataset.train_tensors[1].copy(),
+                                      'X_test': dataset.test_tensors[0] if dataset.test_tensors is not None else None,
+                                      'y_test': dataset.test_tensors[1] if dataset.test_tensors is not None else None,
+                                      'train_indices': dataset.splits[split_id][0],
+                                      'val_indices': dataset.splits[split_id][1],
+                                      'split_id': split_id,
+                                      'num_run': self._backend.get_next_num_run(),
+                                      })
+            X.update({**self.pipeline_options, **budget_config})

             model = self.models_[identifier]
             # this updates the model inplace, it can then later be used in
             # predict method
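For readers skimming this hunk: the behavioral change is that the fit dictionary X is now rebuilt inside the loop, with copied train tensors. Below is a minimal, self-contained sketch (hypothetical Model class, not the autoPyTorch API) of why the copies matter when a model mutates its inputs in place.

# Sketch with a hypothetical Model class; not the autoPyTorch API.
from typing import Dict

import numpy as np


class Model:
    def fit(self, X: Dict[str, np.ndarray]) -> None:
        # A model whose preprocessing mutates its input in place,
        # e.g. an imputer overwriting NaNs.
        X['X_train'][np.isnan(X['X_train'])] = 0.0


def refit_all(models: Dict[str, Model], X_train: np.ndarray) -> None:
    for name, model in models.items():
        # Rebuilding X with a fresh copy per model means one model's
        # in-place edits cannot leak into the data the next model sees.
        model.fit({'X_train': X_train.copy()})


data = np.array([1.0, np.nan, 3.0])
refit_all({'model_a': Model(), 'model_b': Model()}, data)
assert np.isnan(data[1])  # the original tensors are left untouched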
3 changes: 2 additions & 1 deletion autoPyTorch/configs/greedy_portfolio.json
@@ -1,5 +1,6 @@
 [{"data_loader:batch_size": 60,
   "encoder:__choice__": "OneHotEncoder",
+  "coalescer:__choice__": "NoCoalescer",
   "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
   "imputer:categorical_strategy": "most_frequent",
   "imputer:numerical_strategy": "mean",
@@ -506,4 +507,4 @@
   "network_backbone:ShapedResNetBackbone:max_shake_drop_probability": 0.034431265307095615,
   "network_head:fully_connected:activation": "relu",
   "network_head:fully_connected:units_layer_1": 128,
-  "network_backbone:ShapedResNetBackbone:max_dropout": 0.6296079567189131}]
\ No newline at end of file
+  "network_backbone:ShapedResNetBackbone:max_dropout": 0.6296079567189131}]
New file (52 additions): autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py
@@ -0,0 +1,52 @@
from typing import Any, Dict, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
UniformFloatHyperparameter,
)

import numpy as np

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter
from autoPyTorch.utils.implementations import MinorityCoalescing


class MinorityCoalescer(BaseCoalescer):
"""
    Groups together categories of a categorical feature whose frequency of
    occurrence is less than minimum_fraction
"""
def __init__(self, minimum_fraction: float, random_state: Optional[Union[np.random.RandomState, int]] = None):
super().__init__()
self.minimum_fraction = minimum_fraction
Review thread (resolved):
Contributor: minimum_fraction -> min_fraction (convention)
Contributor Author: I don't think we have a convention for this, and in this case having the complete word is clearer. If possible I would like to preserve it.
Contributor: PyTorch uses min rather than minimum; it uses minimum only for the element-wise minimum. But if you would like to stick with it, that is also fine.

self.random_state = random_state

def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer:

self.check_requirements(X, y)

self.preprocessor['categorical'] = MinorityCoalescing(minimum_fraction=self.minimum_fraction)
return self

@staticmethod
def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
return {
'shortname': 'MinorityCoalescer',
'name': 'Minority Feature-class coalescer',
'handles_sparse': False
}

@staticmethod
def get_hyperparameter_search_space(
dataset_properties: Optional[Dict] = None,
minimum_fraction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="minimum_fraction",
value_range=(0.0001, 0.5),
default_value=0.01,
log=True),
) -> ConfigurationSpace:
cs = ConfigurationSpace()

add_hyperparameter(cs, minimum_fraction, UniformFloatHyperparameter)

return cs
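The MinorityCoalescing transformer imported above lives in autoPyTorch.utils.implementations and is not part of this diff. As a rough, self-contained sketch of the idea only (not the actual implementation):

# Illustrative only; the real MinorityCoalescing also handles sparse
# matrices and keeps per-column state between fit() and transform().
import numpy as np


def coalesce_column(col: np.ndarray, minimum_fraction: float, rare_token: int = -2) -> np.ndarray:
    """Map every category rarer than minimum_fraction to one shared token."""
    values, counts = np.unique(col, return_counts=True)
    rare = values[counts / col.size < minimum_fraction]
    return np.where(np.isin(col, rare), rare_token, col)


col = np.array([3, 3, 3, 3, 3, 3, 3, 3, 7, 9])
print(coalesce_column(col, minimum_fraction=0.15))
# [ 3  3  3  3  3  3  3  3 -2 -2]: categories 7 and 9 each cover 10% of
# the rows, below the 15% threshold, so both collapse into one bucket.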
New file (51 additions): autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py
@@ -0,0 +1,51 @@
from typing import Any, Dict, Optional, Union

import numpy as np

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer


class NoCoalescer(BaseCoalescer):
"""
    Does not coalesce; the categorical features pass through unchanged.
    """

Review thread on this docstring (resolved):
Contributor: Why?
Contributor Author: This is a NoCoalescer class; it allows the BO model to enable or disable coalescing. The choice object selects between MinorityCoalescer and NoCoalescer depending on which gives better performance.
Contributor: I mean, I did not get whether you mean "do not perform NoCoalescer" or "do not perform coalescing".
def __init__(self,
random_state: Optional[Union[np.random.RandomState, int]] = None
):
super().__init__()
self.random_state = random_state

def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer:
"""
        Checks that the requirements to fit are met; no coalescing model is
        fitted, and the component returns itself unchanged.

        Args:
            X (Dict[str, Any]): fit dictionary
            y (Any): not used, kept for API consistency

Returns:
instance of self
"""
self.check_requirements(X, y)

return self

def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
"""
        Adds this fitted component into the 'X' (fit) dictionary and returns it.

Args:
X (Dict[str, Any]): 'X' dictionary
Review thread on the 'X' dictionary (resolved):
Contributor: What is the 'X' dictionary? The fit_dictionary? (Ravin says the fit_dictionary will be deprecated soon, but do you have any idea when?)
Contributor Author: Scikit-learn supports passing a dictionary alongside the data (see scikit-learn's fit_params). It makes a lot of sense to use that instead of overloading X as a fit_dictionary. Of all the refactoring changes, this is to me the most important one. When it happens depends on when a contributor wants to make the change :)
Contributor: Sorry, I did not get you; could you please add your ideas to the doc-string as well? In particular, I do not get the meaning behind 'X' in that sentence, and it is still a bit confusing to me. Do you know why sklearn uses X for both the fit_dictionary and the feature data?

Returns:
(Dict[str, Any]): the updated 'X' dictionary
"""
X.update({'coalescer': self.preprocessor})
return X

@staticmethod
def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
return {
'shortname': 'NoCoalescer',
'name': 'No Coalescer',
'handles_sparse': True
}
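To make the review discussion about the 'X' dictionary concrete, here is a toy, self-contained sketch of the contract. ToyNoCoalescer is a stand-in for illustration (not the class above), and check_requirements is reduced to an assert:

from typing import Any, Dict


class ToyNoCoalescer:
    """Toy stand-in showing how components ride along in the fit dictionary."""

    def __init__(self) -> None:
        self.preprocessor: Dict[str, Any] = {'numerical': None, 'categorical': None}

    def fit(self, X: Dict[str, Any], y: Any = None) -> 'ToyNoCoalescer':
        assert 'dataset_properties' in X  # simplified check_requirements(X, y)
        return self

    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
        # The fitted component is stored in X itself so that later pipeline
        # steps (e.g. the column transformer) can pick it up.
        X.update({'coalescer': self.preprocessor})
        return X


X = {'dataset_properties': {'categorical_columns': [0, 2], 'categories': [[0, 1], [0, 1, 2]]}}
X = ToyNoCoalescer().fit(X).transform(X)
assert 'coalescer' in X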
New file (141 additions): autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
@@ -0,0 +1,141 @@
import os
from collections import OrderedDict
from typing import Any, Dict, List, Optional

import ConfigSpace.hyperparameters as CSH
from ConfigSpace.configuration_space import ConfigurationSpace

from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice
from autoPyTorch.pipeline.components.base_component import (
ThirdPartyComponents,
autoPyTorchComponent,
find_components,
)
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer


coalescer_directory = os.path.split(__file__)[0]
_coalescer = find_components(__package__,
coalescer_directory,
BaseCoalescer)
_addons = ThirdPartyComponents(BaseCoalescer)


def add_coalescer(coalescer: BaseCoalescer) -> None:
_addons.add_component(coalescer)


class CoalescerChoice(autoPyTorchChoice):
"""
    Allows for dynamically choosing a coalescer component at runtime
"""

def get_components(self) -> Dict[str, autoPyTorchComponent]:
"""Returns the available coalescer components

Args:
None

Returns:
Dict[str, autoPyTorchComponent]: all BaseCoalescer components available
                as choices for coalescing the categorical columns
"""
components = OrderedDict()
components.update(_coalescer)
components.update(_addons.components)
return components

def get_hyperparameter_search_space(self,
dataset_properties: Optional[Dict[str, Any]] = None,
default: Optional[str] = None,
include: Optional[List[str]] = None,
exclude: Optional[List[str]] = None) -> ConfigurationSpace:
cs = ConfigurationSpace()

if dataset_properties is None:
dataset_properties = dict()

dataset_properties = {**self.dataset_properties, **dataset_properties}

available_preprocessors = self.get_available_components(dataset_properties=dataset_properties,
include=include,
exclude=exclude)

if len(available_preprocessors) == 0:
            raise ValueError("No coalescer found, please add a coalescer")

if default is None:
defaults = ['NoCoalescer', 'MinorityCoalescer']
for default_ in defaults:
if default_ in available_preprocessors:
if include is not None and default_ not in include:
continue
if exclude is not None and default_ in exclude:
continue
default = default_
break
franchuterivera marked this conversation as resolved.
Show resolved Hide resolved

updates = self._get_search_space_updates()
if '__choice__' in updates.keys():
choice_hyperparameter = updates['__choice__']
if not set(choice_hyperparameter.value_range).issubset(available_preprocessors):
                raise ValueError("Expected given update for {} to have "
                                 "choices in {}, got {}".format(self.__class__.__name__,
                                                                available_preprocessors,
                                                                choice_hyperparameter.value_range))
if len(dataset_properties['categorical_columns']) == 0:
assert len(choice_hyperparameter.value_range) == 1
                assert 'NoCoalescer' in choice_hyperparameter.value_range, \
"Provided {} in choices, however, the dataset " \
"is incompatible with it".format(choice_hyperparameter.value_range)

preprocessor = CSH.CategoricalHyperparameter('__choice__',
choice_hyperparameter.value_range,
default_value=choice_hyperparameter.default_value)
else:
# add only no coalescer to choice hyperparameters in case the dataset is only numerical
if len(dataset_properties['categorical_columns']) == 0:
default = 'NoCoalescer'
if include is not None and default not in include:
raise ValueError("Provided {} in include, however, the dataset "
"is incompatible with it".format(include))
preprocessor = CSH.CategoricalHyperparameter('__choice__',
['NoCoalescer'],
default_value=default)
else:
preprocessor = CSH.CategoricalHyperparameter('__choice__',
list(available_preprocessors.keys()),
default_value=default)

cs.add_hyperparameter(preprocessor)

        # add only child hyperparameters of the coalescer choices
for name in preprocessor.choices:
updates = self._get_search_space_updates(prefix=name)
# Call arg is ignored on mypy as the search space dynamically
# provides different args
preprocessor_configuration_space = available_preprocessors[ # type:ignore[call-arg]
name # type:ignore[call-arg]
].get_hyperparameter_search_space(dataset_properties, **updates) # type:ignore[call-arg]
parent_hyperparameter = {'parent': preprocessor, 'value': name}
cs.add_configuration_space(name, preprocessor_configuration_space,
parent_hyperparameter=parent_hyperparameter)

self.configuration_space = cs
self.dataset_properties = dataset_properties
return cs

def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None:
"""
        Ensures the correctness of the dataset properties passed to this
        choice component: it recursively makes sure that child- and
        parent-level requirements are honored before fit.

        Args:
            dataset_properties (Dict[str, Any]): characteristics of the dataset

"""
super()._check_dataset_properties(dataset_properties)
assert 'numerical_columns' in dataset_properties.keys(), \
"Dataset properties must contain information about numerical columns"
assert 'categorical_columns' in dataset_properties.keys(), \
"Dataset properties must contain information about categorical columns"
New file (32 additions): autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py
@@ -0,0 +1,32 @@
from typing import Any, Dict, List

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import (
autoPyTorchTabularPreprocessingComponent
)
from autoPyTorch.utils.common import FitRequirement


class BaseCoalescer(autoPyTorchTabularPreprocessingComponent):
"""
Base class for coalescing
"""
def __init__(self) -> None:
super().__init__()
self.add_fit_requirements([
FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True),
FitRequirement('categories', (List,), user_defined=True, dataset_property=True)])

def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
"""
        Adds this fitted component into the 'X' (fit) dictionary and returns it.

        Args:
            X (Dict[str, Any]): fit dictionary

Returns:
(Dict[str, Any]): the updated 'X' dictionary
"""
        if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
            raise ValueError("Cannot call transform on {} without fitting it first."
                             .format(self.__class__.__name__))
X.update({'coalescer': self.preprocessor})
return X
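Because the package exposes add_coalescer via ThirdPartyComponents, a custom coalescer can be registered from outside the library. A sketch, with TopKCoalescer as a hypothetical example (the static methods mirror MinorityCoalescer above):

from typing import Any, Dict, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace

from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import add_coalescer
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import (
    BaseCoalescer
)


class TopKCoalescer(BaseCoalescer):
    """Hypothetical third-party component: keep only the k most frequent categories."""

    def __init__(self, k: int = 10, random_state: Any = None) -> None:
        super().__init__()
        self.k = k
        self.random_state = random_state

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer:
        self.check_requirements(X, y)
        # A fitted sklearn-compatible transformer would be assigned here:
        # self.preprocessor['categorical'] = ...
        return self

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
        return {'shortname': 'TopKCoalescer', 'name': 'Top-K Coalescer', 'handles_sparse': False}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties: Optional[Dict] = None) -> ConfigurationSpace:
        return ConfigurationSpace()


add_coalescer(TopKCoalescer)  # now returned by CoalescerChoice.get_components()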
4 changes: 4 additions & 0 deletions autoPyTorch/pipeline/tabular_classification.py
@@ -20,6 +20,9 @@
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import (
     TabularColumnTransformer
 )
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import (
+    CoalescerChoice
+)
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import (
     EncoderChoice
 )
@@ -277,6 +280,7 @@ def _get_pipeline_steps(

         steps.extend([
             ("imputer", SimpleImputer(random_state=self.random_state)),
+            ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)),
             ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
             ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
             ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,
4 changes: 4 additions & 0 deletions autoPyTorch/pipeline/tabular_regression.py
@@ -19,6 +19,9 @@
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import (
     TabularColumnTransformer
 )
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import (
+    CoalescerChoice
+)
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import (
     EncoderChoice
 )
@@ -219,6 +222,7 @@ def _get_pipeline_steps(

         steps.extend([
             ("imputer", SimpleImputer(random_state=self.random_state)),
+            ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)),
             ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
             ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
             ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,
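With both pipelines updated, the new step can be confirmed by listing the assembled steps. A hedged sketch; the exact set of dataset_properties keys the constructor needs can vary by version:

# Sketch only; construction may require additional task-specific properties.
from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline

pipeline = TabularClassificationPipeline(dataset_properties={
    'numerical_columns': [0],
    'categorical_columns': [1],
    'categories': [[0, 1]],
})
print([name for name, _ in pipeline.steps])
# Expected to include, in order: 'imputer', 'coalescer', 'encoder',
# 'scaler', 'feature_preprocessor', ...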