From d85c16e5cb25009f39740b09feee306cc4ff3e58 Mon Sep 17 00:00:00 2001 From: chico Date: Tue, 25 May 2021 23:04:40 +0200 Subject: [PATCH 1/9] [ADD] first push of coalescer --- .../coalescer/MinorityCoalescer.py | 52 +++++++ .../coalescer/NoCoalescer.py | 51 +++++++ .../coalescer/__init__.py | 137 ++++++++++++++++++ .../coalescer/base_coalescer.py | 32 ++++ autoPyTorch/utils/implementations.py | 79 +++++++++- .../components/preprocessing/test_coalesce.py | 53 +++++++ test/test_utils/test_coalescer.py | 90 ++++++++++++ 7 files changed, 493 insertions(+), 1 deletion(-) create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py create mode 100644 test/test_pipeline/components/preprocessing/test_coalesce.py create mode 100644 test/test_utils/test_coalescer.py diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py new file mode 100644 index 000000000..a770a58a9 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py @@ -0,0 +1,52 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + UniformFloatHyperparameter, +) + +import numpy as np + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter +from autoPyTorch.utils.implementations import MinorityCoalescing + + +class MinorityCoalescer(BaseCoalescer): + """ + Groups together classes in a categorical feature if the frequency + of occurrence is less than minimum_fraction + """ + def __init__(self, minimum_fraction: float, random_state: Optional[Union[np.random.RandomState, int]] = None): + super().__init__() + self.minimum_fraction = minimum_fraction + self.random_state = random_state + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer: + + self.check_requirements(X, y) + + self.preprocessor['categorical'] = MinorityCoalescing(minimum_fraction=self.minimum_fraction) + return self + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'MinorityCoalescer', + 'name': 'Minority Feature-class coalescer', + 'handles_sparse': False + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict] = None, + minimum_fraction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="minimum_fraction", + value_range=(0.0001, 0.5), + default_value=0.01, + log=True), + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + + add_hyperparameter(cs, minimum_fraction, UniformFloatHyperparameter) + + return cs diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py new file mode 100644 index 000000000..da93bbe4b --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py @@ -0,0 +1,51 @@ +from typing import Any, Dict, Optional, Union + +import numpy as np + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer + + +class NoCoalescer(BaseCoalescer): + """ + Don't perform NoCoalescer on categorical features + """ + def __init__(self, + random_state: Optional[Union[np.random.RandomState, int]] = None + ): + super().__init__() + self.random_state = random_state + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer: + """ + The fit function calls the fit function of the underlying model + and returns the transformed array. + Args: + X (np.ndarray): input features + y (Optional[np.ndarray]): input labels + + Returns: + instance of self + """ + self.check_requirements(X, y) + + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds the self into the 'X' dictionary and returns it. + Args: + X (Dict[str, Any]): 'X' dictionary + + Returns: + (Dict[str, Any]): the updated 'X' dictionary + """ + X.update({'coalescer': self.preprocessor}) + return X + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'NoCoalescer', + 'name': 'No Coalescer', + 'handles_sparse': True + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py new file mode 100644 index 000000000..b39e87a3e --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py @@ -0,0 +1,137 @@ +import os +from collections import OrderedDict +from typing import Any, Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer + + +coalescer_directory = os.path.split(__file__)[0] +_coalescer = find_components(__package__, + coalescer_directory, + BaseCoalescer) +_addons = ThirdPartyComponents(BaseCoalescer) + + +def add_coalescer(coalescer: BaseCoalescer) -> None: + _addons.add_component(coalescer) + + +class CoalescerChoice(autoPyTorchChoice): + """ + Allows for dynamically choosing coalescer component at runtime + """ + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available coalescer components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all BaseCoalescer components available + as choices for coalescer the categorical columns + """ + components = OrderedDict() + components.update(_coalescer) + components.update(_addons.components) + return components + + def get_hyperparameter_search_space(self, + dataset_properties: Optional[Dict[str, Any]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None) -> ConfigurationSpace: + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = dict() + + dataset_properties = {**self.dataset_properties, **dataset_properties} + + available_preprocessors = self.get_available_components(dataset_properties=dataset_properties, + include=include, + exclude=exclude) + + if len(available_preprocessors) == 0: + raise ValueError("no coalescer found, please add a coalescer") + + if default is None: + defaults = ['MinorityCoalescer', 'NoCoalescer'] + for default_ in defaults: + if default_ in available_preprocessors: + if include is not None and default_ not in include: + continue + if exclude is not None and default_ in exclude: + continue + default = default_ + break + + updates = self._get_search_space_updates() + if '__choice__' in updates.keys(): + choice_hyperparameter = updates['__choice__'] + if not set(choice_hyperparameter.value_range).issubset(available_preprocessors): + raise ValueError("Expected given update for {} to have " + "choices in {} got {}".format(self.__class__.__name__, + available_preprocessors, + choice_hyperparameter.value_range)) + if len(dataset_properties['categorical_columns']) == 0: + assert len(choice_hyperparameter.value_range) == 1 + assert 'MinorityCoalescer' in choice_hyperparameter.value_range, \ + "Provided {} in choices, however, the dataset " \ + "is incompatible with it".format(choice_hyperparameter.value_range) + + preprocessor = CSH.CategoricalHyperparameter('__choice__', + choice_hyperparameter.value_range, + default_value=choice_hyperparameter.default_value) + else: + # add only no coalescer to choice hyperparameters in case the dataset is only numerical + if len(dataset_properties['categorical_columns']) == 0: + default = 'NoCoalescer' + if include is not None and default not in include: + raise ValueError("Provided {} in include, however, the dataset " + "is incompatible with it".format(include)) + preprocessor = CSH.CategoricalHyperparameter('__choice__', + ['NoCoalescer'], + default_value=default) + else: + preprocessor = CSH.CategoricalHyperparameter('__choice__', + list(available_preprocessors.keys()), + default_value=default) + + cs.add_hyperparameter(preprocessor) + + # add only child hyperparameters of early_preprocessor choices + for name in preprocessor.choices: + preprocessor_configuration_space = available_preprocessors[name].\ + get_hyperparameter_search_space(dataset_properties) + parent_hyperparameter = {'parent': preprocessor, 'value': name} + cs.add_configuration_space(name, preprocessor_configuration_space, + parent_hyperparameter=parent_hyperparameter) + + self.configuration_space = cs + self.dataset_properties = dataset_properties + return cs + + def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None: + """ + A mechanism in code to ensure the correctness of the fit dictionary + It recursively makes sure that the children and parent level requirements + are honored before fit. + Args: + dataset_properties: + + """ + super()._check_dataset_properties(dataset_properties) + assert 'numerical_columns' in dataset_properties.keys(), \ + "Dataset properties must contain information about numerical columns" + assert 'categorical_columns' in dataset_properties.keys(), \ + "Dataset properties must contain information about categorical columns" diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py new file mode 100644 index 000000000..280ddd979 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py @@ -0,0 +1,32 @@ +from typing import Any, Dict, List + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import ( + autoPyTorchTabularPreprocessingComponent +) +from autoPyTorch.utils.common import FitRequirement + + +class BaseCoalescer(autoPyTorchTabularPreprocessingComponent): + """ + Base class for coalescing + """ + def __init__(self) -> None: + super().__init__() + self.add_fit_requirements([ + FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), + FitRequirement('categories', (List,), user_defined=True, dataset_property=True)]) + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Adds the self into the 'X' dictionary and returns it. + Args: + X (Dict[str, Any]): 'X' dictionary + + Returns: + (Dict[str, Any]): the updated 'X' dictionary + """ + if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: + raise ValueError("cant call transform on {} without fitting first." + .format(self.__class__.__name__)) + X.update({'coalescer': self.preprocessor}) + return X diff --git a/autoPyTorch/utils/implementations.py b/autoPyTorch/utils/implementations.py index 2130cfd6b..e78baa5e4 100644 --- a/autoPyTorch/utils/implementations.py +++ b/autoPyTorch/utils/implementations.py @@ -1,7 +1,11 @@ -from typing import Any, Callable, Dict, Type, Union +from typing import Any, Callable, Dict, List, Optional, Set, Type, Union import numpy as np +from scipy import sparse + +from sklearn.base import BaseEstimator, TransformerMixin + import torch @@ -61,3 +65,76 @@ def __call__(self, y: Union[np.ndarray, torch.Tensor]) -> np.ndarray: @staticmethod def get_properties() -> Dict[str, Any]: return {'supported_losses': ['BCEWithLogitsLoss']} + + +class MinorityCoalescing(BaseEstimator, TransformerMixin): + """ Group together categories which occurence is less than a specified + minimum fraction. Coalesced categories get index of one. + """ + + def __init__(self, minimum_fraction: Optional[float] = None): + self.minimum_fraction = minimum_fraction + + def check_X(self, X: np.array) -> None: + X_data = X.data if sparse.issparse(X) else X + if np.nanmin(X_data) <= -2: + raise ValueError("X needs to contain only integers greater than -2.") + + def fit(self, X: np.array, y: Optional[np.ndarray] = None) -> 'MinorityCoalescing': + self.check_X(X) + + if self.minimum_fraction is None: + return self + + # Remember which values should not be coalesced + do_not_coalesce: List[Set[int]] = list() + for column in range(X.shape[1]): + do_not_coalesce.append(set()) + + if sparse.issparse(X): + indptr_start = X.indptr[column] + indptr_end = X.indptr[column + 1] + unique, counts = np.unique( + X.data[indptr_start:indptr_end], return_counts=True) + colsize = indptr_end - indptr_start + else: + unique, counts = np.unique(X[:, column], return_counts=True) + colsize = X.shape[0] + + for unique_value, count in zip(unique, counts): + fraction = float(count) / colsize + if fraction >= self.minimum_fraction: + do_not_coalesce[-1].add(unique_value) + + self.do_not_coalesce_ = do_not_coalesce + return self + + def transform(self, X: np.ndarray) -> np.ndarray: + self.check_X(X) + + if self.minimum_fraction is None: + return X + + for column in range(X.shape[1]): + if sparse.issparse(X): + indptr_start = X.indptr[column] + indptr_end = X.indptr[column + 1] + unique = np.unique(X.data[indptr_start:indptr_end]) + for unique_value in unique: + if unique_value not in self.do_not_coalesce_[column]: + indptr_start = X.indptr[column] + indptr_end = X.indptr[column + 1] + X.data[indptr_start:indptr_end][ + X.data[indptr_start:indptr_end] == unique_value] = -2 + else: + unique = np.unique(X[:, column]) + unique_values = [unique_value for unique_value in unique + if unique_value not in self.do_not_coalesce_[column]] + mask = np.isin(X[:, column], unique_values) + # The imputer uses -1 for unknown categories + # Then -2 means coalesced categories + X[mask, column] = -2 + return X + + def fit_transform(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray: + return self.fit(X, y).transform(X) diff --git a/test/test_pipeline/components/preprocessing/test_coalesce.py b/test/test_pipeline/components/preprocessing/test_coalesce.py new file mode 100644 index 000000000..9aa73880f --- /dev/null +++ b/test/test_pipeline/components/preprocessing/test_coalesce.py @@ -0,0 +1,53 @@ +import copy +import unittest + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import ( + CoalescerChoice +) + + +class TestCoalescerChoice(unittest.TestCase): + def test_get_set_config_space(self): + """Make sure that we can setup a valid choice in the Coalescer + choice""" + dataset_properties = {'numerical_columns': list(range(4)), 'categorical_columns': [5]} + coalescer_choice = CoalescerChoice(dataset_properties) + cs = coalescer_choice.get_hyperparameter_search_space() + + # Make sure that all hyperparameters are part of the search space + self.assertListEqual( + sorted(cs.get_hyperparameter('__choice__').choices), + sorted(list(coalescer_choice.get_components().keys())) + ) + + # Make sure we can properly set some random configs + # Whereas just one iteration will make sure the algorithm works, + # doing five iterations increase the confidence. We will be able to + # catch component specific crashes + for i in range(5): + config = cs.sample_configuration() + config_dict = copy.deepcopy(config.get_dictionary()) + coalescer_choice.set_hyperparameters(config) + + self.assertEqual(coalescer_choice.choice.__class__, + coalescer_choice.get_components()[config_dict['__choice__']]) + + # Then check the choice configuration + selected_choice = config_dict.pop('__choice__', None) + for key, value in config_dict.items(): + # Remove the selected_choice string from the parameter + # so we can query in the object for it + key = key.replace(selected_choice + ':', '') + self.assertIn(key, vars(coalescer_choice.choice)) + self.assertEqual(value, coalescer_choice.choice.__dict__[key]) + + def test_only_numerical(self): + dataset_properties = {'numerical_columns': list(range(4)), 'categorical_columns': []} + + chooser = CoalescerChoice(dataset_properties) + configspace = chooser.get_hyperparameter_search_space().sample_configuration().get_dictionary() + self.assertEqual(configspace['__choice__'], 'NoCoalescer') + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_utils/test_coalescer.py b/test/test_utils/test_coalescer.py new file mode 100644 index 000000000..482ae74de --- /dev/null +++ b/test/test_utils/test_coalescer.py @@ -0,0 +1,90 @@ +import numpy as np + +import pytest + +import scipy.sparse + +from autoPyTorch.utils.implementations import MinorityCoalescing + + +@pytest.fixture +def X1(): + # Generates an array with categories 3, 4, 5, 6, 7 and occurences of 30%, + # 30%, 30%, 5% and 5% respectively + X = np.vstack(( + np.ones((30, 10)) * 3, + np.ones((30, 10)) * 4, + np.ones((30, 10)) * 5, + np.ones((5, 10)) * 6, + np.ones((5, 10)) * 7, + )) + for col in range(X.shape[1]): + np.random.shuffle(X[:, col]) + return X + + +@pytest.fixture +def X2(): + # Generates an array with categories 3, 4, 5, 6, 7 and occurences of 5%, + # 5%, 5%, 35% and 50% respectively + X = np.vstack(( + np.ones((5, 10)) * 3, + np.ones((5, 10)) * 4, + np.ones((5, 10)) * 5, + np.ones((35, 10)) * 6, + np.ones((50, 10)) * 7, + )) + for col in range(X.shape[1]): + np.random.shuffle(X[:, col]) + return X + + +def test_default(X1): + X = X1 + X_copy = np.copy(X) + Y = MinorityCoalescing().fit_transform(X) + np.testing.assert_array_almost_equal(Y, X_copy) + # Assert no copies were made + assert id(X) == id(Y) + + +def test_coalesce_10_percent(X1): + X = X1 + Y = MinorityCoalescing(minimum_fraction=.1).fit_transform(X) + for col in range(Y.shape[1]): + hist = np.histogram(Y[:, col], bins=np.arange(-2, 7)) + np.testing.assert_array_almost_equal(hist[0], [10, 0, 0, 0, 0, 30, 30, 30]) + # Assert no copies were made + assert id(X) == id(Y) + + +def test_coalesce_10_percent_sparse(X1): + X = scipy.sparse.csc_matrix(X1) + Y = MinorityCoalescing(minimum_fraction=.1).fit_transform(X) + # Assert no copies were made + assert id(X) == id(Y) + Y = Y.todense() + for col in range(Y.shape[1]): + hist = np.histogram(Y[:, col], bins=np.arange(-2, 7)) + np.testing.assert_array_almost_equal(hist[0], [10, 0, 0, 0, 0, 30, 30, 30]) + + +def test_invalid_X(X1): + X = X1 - 5 + with pytest.raises(ValueError): + MinorityCoalescing().fit_transform(X) + + +def test_transform_after_fit(X1, X2): + # On both X_fit and X_transf, the categories 3, 4, 5, 6, 7 are present. + X_fit = X1 # Here categories 3, 4, 5 have ocurrence above 10% + X_transf = X2 # Here it is the opposite, just categs 6 and 7 are above 10% + + mc = MinorityCoalescing(minimum_fraction=.1).fit(X_fit) + + # transform() should coalesce categories as learned during fit. + # Category distribution in X_transf should be irrelevant. + Y = mc.transform(X_transf) + for col in range(Y.shape[1]): + hist = np.histogram(Y[:, col], bins=np.arange(-2, 7)) + np.testing.assert_array_almost_equal(hist[0], [85, 0, 0, 0, 0, 5, 5, 5]) From ae0652e623f755489c67ff128fe89a4f377aeb45 Mon Sep 17 00:00:00 2001 From: chico Date: Tue, 25 May 2021 23:13:57 +0200 Subject: [PATCH 2/9] Add coalescer --- autoPyTorch/pipeline/tabular_classification.py | 4 ++++ autoPyTorch/pipeline/tabular_regression.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py index 65abcf3c2..a0216a79a 100644 --- a/autoPyTorch/pipeline/tabular_classification.py +++ b/autoPyTorch/pipeline/tabular_classification.py @@ -20,6 +20,9 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import ( TabularColumnTransformer ) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import ( + CoalescerChoice +) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import ( EncoderChoice ) @@ -277,6 +280,7 @@ def _get_pipeline_steps( steps.extend([ ("imputer", SimpleImputer(random_state=self.random_state)), + ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)), ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)), ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties, diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py index 1aa17a593..e613b11f0 100644 --- a/autoPyTorch/pipeline/tabular_regression.py +++ b/autoPyTorch/pipeline/tabular_regression.py @@ -19,6 +19,9 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import ( TabularColumnTransformer ) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import ( + CoalescerChoice +) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import ( EncoderChoice ) @@ -219,6 +222,7 @@ def _get_pipeline_steps( steps.extend([ ("imputer", SimpleImputer(random_state=self.random_state)), + ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)), ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)), ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties, From 90c71eb2264c2687f7536a3ff4dccfb4ae2a9b10 Mon Sep 17 00:00:00 2001 From: chico Date: Tue, 1 Jun 2021 14:12:37 +0200 Subject: [PATCH 3/9] Fix Unit test --- .../tabular_preprocessing/coalescer/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py index b39e87a3e..67ad64f5d 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py @@ -65,7 +65,7 @@ def get_hyperparameter_search_space(self, raise ValueError("no coalescer found, please add a coalescer") if default is None: - defaults = ['MinorityCoalescer', 'NoCoalescer'] + defaults = ['NoCoalescer', 'MinorityCoalescer'] for default_ in defaults: if default_ in available_preprocessors: if include is not None and default_ not in include: @@ -111,8 +111,10 @@ def get_hyperparameter_search_space(self, # add only child hyperparameters of early_preprocessor choices for name in preprocessor.choices: + updates = self._get_search_space_updates(prefix=name) preprocessor_configuration_space = available_preprocessors[name].\ - get_hyperparameter_search_space(dataset_properties) + get_hyperparameter_search_space(dataset_properties, + **updates) parent_hyperparameter = {'parent': preprocessor, 'value': name} cs.add_configuration_space(name, preprocessor_configuration_space, parent_hyperparameter=parent_hyperparameter) From 43213bb91333a4000a8722fa8197e345a005d65f Mon Sep 17 00:00:00 2001 From: chico Date: Tue, 1 Jun 2021 14:44:01 +0200 Subject: [PATCH 4/9] Further fixes --- autoPyTorch/configs/greedy_portfolio.json | 3 ++- .../tabular_preprocessing/coalescer/__init__.py | 4 +++- test/test_api/.tmp_api/runhistory.json | 11 ++++++++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/configs/greedy_portfolio.json b/autoPyTorch/configs/greedy_portfolio.json index a8e640a4e..341a034eb 100644 --- a/autoPyTorch/configs/greedy_portfolio.json +++ b/autoPyTorch/configs/greedy_portfolio.json @@ -1,5 +1,6 @@ [{"data_loader:batch_size": 60, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", @@ -506,4 +507,4 @@ "network_backbone:ShapedResNetBackbone:max_shake_drop_probability": 0.034431265307095615, "network_head:fully_connected:activation": "relu", "network_head:fully_connected:units_layer_1": 128, - "network_backbone:ShapedResNetBackbone:max_dropout": 0.6296079567189131}] \ No newline at end of file + "network_backbone:ShapedResNetBackbone:max_dropout": 0.6296079567189131}] diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py index 67ad64f5d..13fc53dd2 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py @@ -112,8 +112,10 @@ def get_hyperparameter_search_space(self, # add only child hyperparameters of early_preprocessor choices for name in preprocessor.choices: updates = self._get_search_space_updates(prefix=name) + # Call arg is ignored on mypy as the search space dynamically + # provides different args preprocessor_configuration_space = available_preprocessors[name].\ - get_hyperparameter_search_space(dataset_properties, + get_hyperparameter_search_space(dataset_properties, # type:ignore[call-arg] **updates) parent_hyperparameter = {'parent': preprocessor, 'value': name} cs.add_configuration_space(name, preprocessor_configuration_space, diff --git a/test/test_api/.tmp_api/runhistory.json b/test/test_api/.tmp_api/runhistory.json index 6f61e1395..d065c431f 100644 --- a/test/test_api/.tmp_api/runhistory.json +++ b/test/test_api/.tmp_api/runhistory.json @@ -705,6 +705,7 @@ "1": { "data_loader:batch_size": 64, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "ReduceLROnPlateau", @@ -737,6 +738,7 @@ "2": { "data_loader:batch_size": 101, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "PowerTransformer", "imputer:numerical_strategy": "most_frequent", "lr_scheduler:__choice__": "CyclicLR", @@ -801,6 +803,7 @@ "3": { "data_loader:batch_size": 242, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "RandomKitchenSinks", "imputer:numerical_strategy": "median", "lr_scheduler:__choice__": "NoScheduler", @@ -831,6 +834,7 @@ "4": { "data_loader:batch_size": 115, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "Nystroem", "imputer:numerical_strategy": "median", "lr_scheduler:__choice__": "CosineAnnealingLR", @@ -864,6 +868,7 @@ "5": { "data_loader:batch_size": 185, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "RandomKitchenSinks", "imputer:numerical_strategy": "median", "lr_scheduler:__choice__": "ReduceLROnPlateau", @@ -904,6 +909,7 @@ "6": { "data_loader:batch_size": 95, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "RandomKitchenSinks", "imputer:numerical_strategy": "most_frequent", "lr_scheduler:__choice__": "ExponentialLR", @@ -937,6 +943,7 @@ "7": { "data_loader:batch_size": 119, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "Nystroem", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "StepLR", @@ -979,6 +986,7 @@ "8": { "data_loader:batch_size": 130, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "PolynomialFeatures", "imputer:numerical_strategy": "median", "lr_scheduler:__choice__": "CyclicLR", @@ -1032,6 +1040,7 @@ "9": { "data_loader:batch_size": 137, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "Nystroem", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", @@ -1077,4 +1086,4 @@ "network_head:fully_connected:units_layer_1": 294 } } -} \ No newline at end of file +} From e107e930cf647e33772219601cc5e1174a9ddaf5 Mon Sep 17 00:00:00 2001 From: chico Date: Tue, 1 Jun 2021 14:51:08 +0200 Subject: [PATCH 5/9] Ignore as in preprocessing --- .../preprocessing/tabular_preprocessing/coalescer/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py index 13fc53dd2..6455ee4d9 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py @@ -115,8 +115,7 @@ def get_hyperparameter_search_space(self, # Call arg is ignored on mypy as the search space dynamically # provides different args preprocessor_configuration_space = available_preprocessors[name].\ - get_hyperparameter_search_space(dataset_properties, # type:ignore[call-arg] - **updates) + get_hyperparameter_search_space(dataset_properties, **updates) # type:ignore parent_hyperparameter = {'parent': preprocessor, 'value': name} cs.add_configuration_space(name, preprocessor_configuration_space, parent_hyperparameter=parent_hyperparameter) From e40ed308eeadcf66da7dfac92aef06ac48ad2995 Mon Sep 17 00:00:00 2001 From: chico Date: Tue, 1 Jun 2021 14:57:54 +0200 Subject: [PATCH 6/9] call args still failing --- .../tabular_preprocessing/coalescer/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py index 6455ee4d9..305fab99c 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py @@ -114,8 +114,9 @@ def get_hyperparameter_search_space(self, updates = self._get_search_space_updates(prefix=name) # Call arg is ignored on mypy as the search space dynamically # provides different args - preprocessor_configuration_space = available_preprocessors[name].\ - get_hyperparameter_search_space(dataset_properties, **updates) # type:ignore + preprocessor_configuration_space = available_preprocessors[ # type:ignore[call-arg] + name # type:ignore[call-arg] + ].get_hyperparameter_search_space(dataset_properties, **updates) # type:ignore[call-arg] parent_hyperparameter = {'parent': preprocessor, 'value': name} cs.add_configuration_space(name, preprocessor_configuration_space, parent_hyperparameter=parent_hyperparameter) From 3d25f37aed2e64066a47caff3e3f6d453ce8790f Mon Sep 17 00:00:00 2001 From: chico Date: Fri, 18 Jun 2021 16:17:00 +0200 Subject: [PATCH 7/9] FIX_refit --- autoPyTorch/api/base_task.py | 42 ++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 9e59601a8..6ff99bde3 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1072,11 +1072,17 @@ def refit( self """ - self.dataset_name = dataset.dataset_name - if self._logger is None: self._logger = self._get_logger(str(self.dataset_name)) + if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None: + self._load_models() + + # Refit is not applicable when ensemble_size is set to zero. + if self.ensemble_ is None: + raise ValueError("Refit can only be called if 'ensemble_size != 0'") + + self.dataset_name = dataset.dataset_name dataset_requirements = get_dataset_requirements( info=dataset.get_required_dataset_info(), include=self.include_components, @@ -1085,26 +1091,20 @@ def refit( dataset_properties = dataset.get_dataset_properties(dataset_requirements) self._backend.save_datamanager(dataset) - X: Dict[str, Any] = dict({'dataset_properties': dataset_properties, - 'backend': self._backend, - 'X_train': dataset.train_tensors[0], - 'y_train': dataset.train_tensors[1], - 'X_test': dataset.test_tensors[0] if dataset.test_tensors is not None else None, - 'y_test': dataset.test_tensors[1] if dataset.test_tensors is not None else None, - 'train_indices': dataset.splits[split_id][0], - 'val_indices': dataset.splits[split_id][1], - 'split_id': split_id, - 'num_run': self._backend.get_next_num_run(), - }) - X.update({**self.pipeline_options, **budget_config}) - if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None: - self._load_models() - - # Refit is not applicable when ensemble_size is set to zero. - if self.ensemble_ is None: - raise ValueError("Refit can only be called if 'ensemble_size != 0'") - for identifier in self.models_: + X: Dict[str, Any] = dict({'dataset_properties': dataset_properties, + 'backend': self._backend, + 'X_train': dataset.train_tensors[0].copy(), + 'y_train': dataset.train_tensors[1].copy(), + 'X_test': dataset.test_tensors[0] if dataset.test_tensors is not None else None, + 'y_test': dataset.test_tensors[1] if dataset.test_tensors is not None else None, + 'train_indices': dataset.splits[split_id][0], + 'val_indices': dataset.splits[split_id][1], + 'split_id': split_id, + 'num_run': self._backend.get_next_num_run(), + }) + X.update({**self.pipeline_options, **budget_config}) + model = self.models_[identifier] # this updates the model inplace, it can then later be used in # predict method From 2e08e99b58ec6320f19a295dd02037f1e98491e6 Mon Sep 17 00:00:00 2001 From: Francisco Rivera Valverde <44504424+franchuterivera@users.noreply.github.com> Date: Mon, 21 Jun 2021 21:58:03 +0200 Subject: [PATCH 8/9] Update autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- .../preprocessing/tabular_preprocessing/coalescer/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py index 305fab99c..469c753d3 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py @@ -97,8 +97,8 @@ def get_hyperparameter_search_space(self, if len(dataset_properties['categorical_columns']) == 0: default = 'NoCoalescer' if include is not None and default not in include: - raise ValueError("Provided {} in include, however, the dataset " - "is incompatible with it".format(include)) + raise ValueError("Provided coalescer {} are incompatible with " + "the dataset without categorical columns.".format(include)) preprocessor = CSH.CategoricalHyperparameter('__choice__', ['NoCoalescer'], default_value=default) From c8aa62632a76adf00d0677c0a250ca0b90c9ef9e Mon Sep 17 00:00:00 2001 From: chico Date: Mon, 21 Jun 2021 22:36:06 +0200 Subject: [PATCH 9/9] Feedback from Shuhei --- .../coalescer/MinorityCoalescer.py | 2 +- .../coalescer/NoCoalescer.py | 16 +++--- .../coalescer/__init__.py | 12 +++-- .../coalescer/base_coalescer.py | 8 ++- autoPyTorch/utils/implementations.py | 53 +++++++++++++++---- .../components/preprocessing/test_coalesce.py | 2 +- 6 files changed, 68 insertions(+), 25 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py index a770a58a9..5d2e13bf3 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py @@ -17,7 +17,7 @@ class MinorityCoalescer(BaseCoalescer): Groups together classes in a categorical feature if the frequency of occurrence is less than minimum_fraction """ - def __init__(self, minimum_fraction: float, random_state: Optional[Union[np.random.RandomState, int]] = None): + def __init__(self, minimum_fraction: float, random_state: np.random.RandomState): super().__init__() self.minimum_fraction = minimum_fraction self.random_state = random_state diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py index da93bbe4b..5d4448249 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py @@ -10,18 +10,20 @@ class NoCoalescer(BaseCoalescer): Don't perform NoCoalescer on categorical features """ def __init__(self, - random_state: Optional[Union[np.random.RandomState, int]] = None + random_state: np.random.RandomState, ): super().__init__() self.random_state = random_state - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer: + def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseCoalescer: """ - The fit function calls the fit function of the underlying model - and returns the transformed array. + As no coalescing happens, the input fit dictionary is unchanged. + Args: - X (np.ndarray): input features - y (Optional[np.ndarray]): input labels + X (Dict[str, Any]): + input fit dictionary + y (Optional[Any]): + Parameter to comply with scikit-learn API. Not used. Returns: instance of self @@ -32,7 +34,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer: def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ - Adds the self into the 'X' dictionary and returns it. + Add self into the 'X' dictionary and return the modified dict. Args: X (Dict[str, Any]): 'X' dictionary diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py index 305fab99c..66528eb67 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py @@ -62,7 +62,9 @@ def get_hyperparameter_search_space(self, exclude=exclude) if len(available_preprocessors) == 0: - raise ValueError("no coalescer found, please add a coalescer") + raise ValueError("No coalescer found, please add a coalescer via the include " + "argument of the pipeline. Additionally, coalescer as a step " + "can be removed as a pipeline step. ") if default is None: defaults = ['NoCoalescer', 'MinorityCoalescer'] @@ -79,10 +81,10 @@ def get_hyperparameter_search_space(self, if '__choice__' in updates.keys(): choice_hyperparameter = updates['__choice__'] if not set(choice_hyperparameter.value_range).issubset(available_preprocessors): - raise ValueError("Expected given update for {} to have " - "choices in {} got {}".format(self.__class__.__name__, - available_preprocessors, - choice_hyperparameter.value_range)) + raise ValueError("The update for {} was expected to be a subset of {} " + "but was {}".format(self.__class__.__name__, + available_preprocessors, + choice_hyperparameter.value_range)) if len(dataset_properties['categorical_columns']) == 0: assert len(choice_hyperparameter.value_range) == 1 assert 'MinorityCoalescer' in choice_hyperparameter.value_range, \ diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py index 280ddd979..7eae84025 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py @@ -18,7 +18,11 @@ def __init__(self) -> None: def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ - Adds the self into the 'X' dictionary and returns it. + The input X is the fit dictionary, that contains both the train data as + well as fit directives. For example, it indicates whether or not to use the gpu + or perform a cpu only run. + + This method add the self into the 'X' dictionary and return it. Args: X (Dict[str, Any]): 'X' dictionary @@ -26,7 +30,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: (Dict[str, Any]): the updated 'X' dictionary """ if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: - raise ValueError("cant call transform on {} without fitting first." + raise ValueError("Cannot call transform() on {} without calling fit() first." .format(self.__class__.__name__)) X.update({'coalescer': self.preprocessor}) return X diff --git a/autoPyTorch/utils/implementations.py b/autoPyTorch/utils/implementations.py index e78baa5e4..aabdee91e 100644 --- a/autoPyTorch/utils/implementations.py +++ b/autoPyTorch/utils/implementations.py @@ -68,19 +68,46 @@ def get_properties() -> Dict[str, Any]: class MinorityCoalescing(BaseEstimator, TransformerMixin): - """ Group together categories which occurence is less than a specified + """ Group together categories which occurrence is less than a specified minimum fraction. Coalesced categories get index of one. """ def __init__(self, minimum_fraction: Optional[float] = None): self.minimum_fraction = minimum_fraction - def check_X(self, X: np.array) -> None: + def check_X(self, X: Union[np.ndarray, sparse.csr_matrix]) -> None: + """ + This estimator takes as input a set of features coming in a tabular fashion. + The classes in the columns, i.e.features, are coalesced together, if the ratio of + occurrence of each class is less than self.minimum_fraction. + + Those classes with low occurrence frequency are coalesced into a new class: -2. + The classification pipeline shifts the classes to be positive through encoding. + The imputation tags missing elements as -1. Then the coalescer can use -2 as a + 'coalesced' indicator. + + Failure can only happen if the pipeline failed to fit encoding/imputation. + + Args: + X (np.ndarray): + The input features from the user, likely transformed by an encoder and imputator. + """ X_data = X.data if sparse.issparse(X) else X if np.nanmin(X_data) <= -2: - raise ValueError("X needs to contain only integers greater than -2.") - - def fit(self, X: np.array, y: Optional[np.ndarray] = None) -> 'MinorityCoalescing': + raise ValueError("The input features to the MinorityCoalescing " + "need to contain only integers greater than -2.") + + def fit(self, X: Union[np.ndarray, sparse.csr_matrix], + y: Optional[np.ndarray] = None) -> 'MinorityCoalescing': + """ + Trains the estimator to identify low frequency classes on the input train data. + + Args: + X (Union[np.ndarray, sparse.csr_matrix]): + The input features from the user, likely transformed by an encoder and imputator. + y (Optional[np.ndarray]): + Optional labels for the given task, not used by this estimator. + """ self.check_X(X) if self.minimum_fraction is None: @@ -106,10 +133,18 @@ def fit(self, X: np.array, y: Optional[np.ndarray] = None) -> 'MinorityCoalescin if fraction >= self.minimum_fraction: do_not_coalesce[-1].add(unique_value) - self.do_not_coalesce_ = do_not_coalesce + self._do_not_coalesce = do_not_coalesce return self - def transform(self, X: np.ndarray) -> np.ndarray: + def transform(self, X: Union[np.ndarray, sparse.csr_matrix]) -> Union[np.ndarray, + sparse.csr_matrix]: + """ + Coalesces categories with low frequency on the input array X. + + Args: + X (Union[np.ndarray, sparse.csr_matrix]): + The input features from the user, likely transformed by an encoder and imputator. + """ self.check_X(X) if self.minimum_fraction is None: @@ -121,7 +156,7 @@ def transform(self, X: np.ndarray) -> np.ndarray: indptr_end = X.indptr[column + 1] unique = np.unique(X.data[indptr_start:indptr_end]) for unique_value in unique: - if unique_value not in self.do_not_coalesce_[column]: + if unique_value not in self._do_not_coalesce[column]: indptr_start = X.indptr[column] indptr_end = X.indptr[column + 1] X.data[indptr_start:indptr_end][ @@ -129,7 +164,7 @@ def transform(self, X: np.ndarray) -> np.ndarray: else: unique = np.unique(X[:, column]) unique_values = [unique_value for unique_value in unique - if unique_value not in self.do_not_coalesce_[column]] + if unique_value not in self._do_not_coalesce[column]] mask = np.isin(X[:, column], unique_values) # The imputer uses -1 for unknown categories # Then -2 means coalesced categories diff --git a/test/test_pipeline/components/preprocessing/test_coalesce.py b/test/test_pipeline/components/preprocessing/test_coalesce.py index 9aa73880f..812256073 100644 --- a/test/test_pipeline/components/preprocessing/test_coalesce.py +++ b/test/test_pipeline/components/preprocessing/test_coalesce.py @@ -24,7 +24,7 @@ def test_get_set_config_space(self): # Whereas just one iteration will make sure the algorithm works, # doing five iterations increase the confidence. We will be able to # catch component specific crashes - for i in range(5): + for _ in range(5): config = cs.sample_configuration() config_dict = copy.deepcopy(config.get_dictionary()) coalescer_choice.set_hyperparameters(config)