From 9ca5889fda4002e3e7e3b1f32df7b714832c6a75 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 3 Feb 2022 14:41:17 +0100 Subject: [PATCH 1/5] remove categorical strategy from simple imputer --- .../imputation/SimpleImputer.py | 61 ++-------- .../imputation/base_imputer.py | 5 +- .../components/preprocessing/test_imputers.py | 114 ++++++++---------- 3 files changed, 64 insertions(+), 116 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py index 3d7ca22b1..608ee8ec5 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py @@ -13,13 +13,8 @@ class SimpleImputer(BaseImputer): - """An imputer for categorical and numerical columns - - Impute missing values for categorical columns with 'constant_!missing!' - - Note: - In case of numpy data, the constant value is set to -1, under the assumption - that categorical data is fit with an Ordinal Scaler. + """ + An imputer for numerical columns Attributes: random_state (Optional[np.random.RandomState]): @@ -27,56 +22,33 @@ class SimpleImputer(BaseImputer): numerical_strategy (str: default='mean'): The strategy to use for imputing numerical columns. Can be one of ['most_frequent', 'constant_!missing!'] - categorical_strategy (str: default='most_frequent') - The strategy to use for imputing categorical columns. - Can be one of ['mean', 'median', 'most_frequent', 'constant_zero'] """ def __init__( self, random_state: Optional[np.random.RandomState] = None, numerical_strategy: str = 'mean', - categorical_strategy: str = 'most_frequent' ): - """ - Note: - 'constant' as numerical_strategy uses 0 as the default fill_value while - 'constant_!missing!' uses a fill_value of -1. - This behaviour should probably be fixed. - """ super().__init__() self.random_state = random_state self.numerical_strategy = numerical_strategy - self.categorical_strategy = categorical_strategy def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer: - """ Fits the underlying model and returns the transformed array. + """ + Builds the preprocessor based on the given fit dictionary 'X'. Args: - X (np.ndarray): - The input features to fit on - y (Optional[np.ndarray]): - The labels for the input features `X` + X (Dict[str, Any]): + The fit dictionary + y (Optional[Any]): + Not Used -- to comply with API Returns: - SimpleImputer: - returns self + self: + returns an instance of self. """ self.check_requirements(X, y) - # Choose an imputer for any categorical columns - categorical_columns = X['dataset_properties']['categorical_columns'] - - if isinstance(categorical_columns, List) and len(categorical_columns) != 0: - if self.categorical_strategy == 'constant_!missing!': - # Train data is numpy as of this point, where an Ordinal Encoding is used - # for categoricals. 
Only Numbers are allowed for `fill_value` - imputer = SklearnSimpleImputer(strategy='constant', fill_value=-1, copy=False) - self.preprocessor['categorical'] = imputer - else: - imputer = SklearnSimpleImputer(strategy=self.categorical_strategy, copy=False) - self.preprocessor['categorical'] = imputer - # Choose an imputer for any numerical columns numerical_columns = X['dataset_properties']['numerical_columns'] @@ -98,11 +70,6 @@ def get_hyperparameter_search_space( value_range=("mean", "median", "most_frequent", "constant_zero"), default_value="mean", ), - categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter='categorical_strategy', - value_range=("most_frequent", "constant_!missing!"), - default_value="most_frequent" - ) ) -> ConfigurationSpace: """Get the hyperparameter search space for the SimpleImputer @@ -112,8 +79,6 @@ def get_hyperparameter_search_space( Note: Not actually Optional, just adhering to its supertype numerical_strategy (HyperparameterSearchSpace: default = ...) The strategy to use for numerical imputation - caterogical_strategy (HyperparameterSearchSpace: default = ...) - The strategy to use for categorical imputation Returns: ConfigurationSpace @@ -132,12 +97,6 @@ def get_hyperparameter_search_space( ): add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter) - if ( - isinstance(dataset_properties['categorical_columns'], List) - and len(dataset_properties['categorical_columns']) - ): - add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter) - return cs @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py index b65f3c229..1f33a765a 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py @@ -14,8 +14,7 @@ class BaseImputer(autoPyTorchTabularPreprocessingComponent): def __init__(self) -> None: super().__init__() self.add_fit_requirements([ - FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True), - FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)]) + FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)]) def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ @@ -26,7 +25,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: Returns: (Dict[str, Any]): the updated 'X' dictionary """ - if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: + if self.preprocessor['numerical'] is None and len(X["dataset_properties"]["numerical_columns"]) != 0: raise ValueError("cant call transform on {} without fitting first." 
.format(self.__class__.__name__)) X.update({'imputer': self.preprocessor}) diff --git a/test/test_pipeline/components/preprocessing/test_imputers.py b/test/test_pipeline/components/preprocessing/test_imputers.py index 18b43bfa6..58377def7 100644 --- a/test/test_pipeline/components/preprocessing/test_imputers.py +++ b/test/test_pipeline/components/preprocessing/test_imputers.py @@ -39,14 +39,14 @@ def test_get_config_space(self): self.assertEqual(param1, param2) def test_mean_imputation(self): - data = np.array([['1.0', np.nan, 3], + data = np.array([[1.0, np.nan, 3], [np.nan, 8, 9], - ['4.0', 5, np.nan], + [4.0, 5, np.nan], [np.nan, 2, 3], - ['7.0', np.nan, 9], - ['4.0', np.nan, np.nan]], dtype=object) - numerical_columns = [1, 2] - categorical_columns = [0] + [7.0, np.nan, 9], + [4.0, np.nan, np.nan]]) + numerical_columns = [0, 1, 2] + categorical_columns = [] train_indices = np.array([0, 2, 3]) test_indices = np.array([1, 4, 5]) dataset_properties = { @@ -66,31 +66,29 @@ def test_mean_imputation(self): # check if the fit dictionary X is modified as expected self.assertIsInstance(X['imputer'], dict) - self.assertIsInstance(categorical_imputer, BaseEstimator) + self.assertIsNone(categorical_imputer) self.assertIsInstance(numerical_imputer, BaseEstimator) # make column transformer with returned encoder to fit on data - column_transformer = make_column_transformer((categorical_imputer, - X['dataset_properties']['categorical_columns']), - (numerical_imputer, + column_transformer = make_column_transformer((numerical_imputer, X['dataset_properties']['numerical_columns']), remainder='passthrough') column_transformer = column_transformer.fit(X['X_train']) transformed = column_transformer.transform(data[test_indices]) - assert_array_equal(transformed.astype(str), np.array([[1.0, 8.0, 9.0], - [7.0, 3.5, 9.0], - [4.0, 3.5, 3.0]], dtype=str)) + assert_array_equal(transformed, np.array([[2.5, 8, 9], + [7, 3.5, 9], + [4, 3.5, 3]])) def test_median_imputation(self): - data = np.array([['1.0', np.nan, 3], + data = np.array([[1.0, np.nan, 3], [np.nan, 8, 9], - ['4.0', 5, np.nan], + [4.0, 5, np.nan], [np.nan, 2, 3], - ['7.0', np.nan, 9], - ['4.0', np.nan, np.nan]], dtype=object) - numerical_columns = [1, 2] - categorical_columns = [0] + [7.0, np.nan, 9], + [4.0, np.nan, np.nan]]) + numerical_columns = [0, 1, 2] + categorical_columns = [] train_indices = np.array([0, 2, 3]) test_indices = np.array([1, 4, 5]) dataset_properties = { @@ -110,31 +108,29 @@ def test_median_imputation(self): # check if the fit dictionary X is modified as expected self.assertIsInstance(X['imputer'], dict) - self.assertIsInstance(categorical_imputer, BaseEstimator) + self.assertIsNone(categorical_imputer) self.assertIsInstance(numerical_imputer, BaseEstimator) # make column transformer with returned encoder to fit on data - column_transformer = make_column_transformer( - (categorical_imputer, X['dataset_properties']['categorical_columns']), - (numerical_imputer, X['dataset_properties']['numerical_columns']), - remainder='passthrough' - ) + column_transformer = make_column_transformer((numerical_imputer, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') column_transformer = column_transformer.fit(X['X_train']) transformed = column_transformer.transform(data[test_indices]) - assert_array_equal(transformed.astype(str), np.array([[1.0, 8.0, 9.0], - [7.0, 3.5, 9.0], - [4.0, 3.5, 3.0]], dtype=str)) + assert_array_equal(transformed, np.array([[2.5, 8, 9], + [7, 3.5, 9], + [4, 3.5, 3]])) def 
test_frequent_imputation(self): - data = np.array([['1.0', np.nan, 3], + data = np.array([[1.0, np.nan, 3], [np.nan, 8, 9], - ['4.0', 5, np.nan], + [4.0, 5, np.nan], [np.nan, 2, 3], - ['7.0', np.nan, 9], - ['4.0', np.nan, np.nan]], dtype=object) - numerical_columns = [1, 2] - categorical_columns = [0] + [7.0, np.nan, 9], + [4.0, np.nan, np.nan]]) + numerical_columns = [0, 1, 2] + categorical_columns = [] train_indices = np.array([0, 2, 3]) test_indices = np.array([1, 4, 5]) dataset_properties = { @@ -145,8 +141,7 @@ def test_frequent_imputation(self): 'X_train': data[train_indices], 'dataset_properties': dataset_properties } - imputer_component = SimpleImputer(numerical_strategy='most_frequent', - categorical_strategy='most_frequent') + imputer_component = SimpleImputer(numerical_strategy='most_frequent') imputer_component = imputer_component.fit(X) X = imputer_component.transform(X) @@ -155,31 +150,29 @@ def test_frequent_imputation(self): # check if the fit dictionary X is modified as expected self.assertIsInstance(X['imputer'], dict) - self.assertIsInstance(categorical_imputer, BaseEstimator) + self.assertIsNone(categorical_imputer) self.assertIsInstance(numerical_imputer, BaseEstimator) # make column transformer with returned encoder to fit on data - column_transformer = make_column_transformer( - (categorical_imputer, X['dataset_properties']['categorical_columns']), - (numerical_imputer, X['dataset_properties']['numerical_columns']), - remainder='passthrough' - ) + column_transformer = make_column_transformer((numerical_imputer, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') column_transformer = column_transformer.fit(X['X_train']) transformed = column_transformer.transform(data[test_indices]) - assert_array_equal(transformed.astype(str), np.array([[1.0, 8, 9], - [7.0, 2, 9], - [4.0, 2, 3]], dtype=str)) + assert_array_equal(transformed, np.array([[1, 8, 9], + [7, 2, 9], + [4, 2, 3]])) def test_constant_imputation(self): - data = np.array([['1.0', np.nan, 3], + data = np.array([[1.0, np.nan, 3], [np.nan, 8, 9], - ['4.0', 5, np.nan], + [4.0, 5, np.nan], [np.nan, 2, 3], - ['7.0', np.nan, 9], - ['4.0', np.nan, np.nan]], dtype=object) - numerical_columns = [1, 2] - categorical_columns = [0] + [7.0, np.nan, 9], + [4.0, np.nan, np.nan]]) + numerical_columns = [0, 1, 2] + categorical_columns = [] train_indices = np.array([0, 2, 3]) test_indices = np.array([1, 4, 5]) dataset_properties = { @@ -190,8 +183,7 @@ def test_constant_imputation(self): 'X_train': data[train_indices], 'dataset_properties': dataset_properties } - imputer_component = SimpleImputer(numerical_strategy='constant_zero', - categorical_strategy='constant_!missing!') + imputer_component = SimpleImputer(numerical_strategy='constant_zero') imputer_component = imputer_component.fit(X) X = imputer_component.transform(X) @@ -200,20 +192,18 @@ def test_constant_imputation(self): # check if the fit dictionary X is modified as expected self.assertIsInstance(X['imputer'], dict) - self.assertIsInstance(categorical_imputer, BaseEstimator) + self.assertIsNone(categorical_imputer) self.assertIsInstance(numerical_imputer, BaseEstimator) # make column transformer with returned encoder to fit on data - column_transformer = make_column_transformer( - (categorical_imputer, X['dataset_properties']['categorical_columns']), - (numerical_imputer, X['dataset_properties']['numerical_columns']), - remainder='passthrough' - ) + column_transformer = make_column_transformer((numerical_imputer, + 
X['dataset_properties']['numerical_columns']), + remainder='passthrough') column_transformer = column_transformer.fit(X['X_train']) transformed = column_transformer.transform(data[test_indices]) - assert_array_equal(transformed.astype(str), np.array([['-1', 8, 9], - [7.0, '0', 9], - [4.0, '0', '0']], dtype=str)) + assert_array_equal(transformed, np.array([[0, 8, 9], + [7, 0, 9], + [4, 0, 0]])) def test_imputation_without_dataset_properties_raises_error(self): """Tests SimpleImputer checks for dataset properties when querying for From b388947edc6a4d142d420da795185057a5040ee6 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 3 Feb 2022 18:36:33 +0100 Subject: [PATCH 2/5] fix tests --- autoPyTorch/configs/greedy_portfolio.json | 16 ---------------- autoPyTorch/optimizer/smbo.py | 8 +++++--- .../TabularColumnTransformer.py | 11 +++++++---- 3 files changed, 12 insertions(+), 23 deletions(-) diff --git a/autoPyTorch/configs/greedy_portfolio.json b/autoPyTorch/configs/greedy_portfolio.json index a8e640a4e..ffc5d98f5 100644 --- a/autoPyTorch/configs/greedy_portfolio.json +++ b/autoPyTorch/configs/greedy_portfolio.json @@ -1,7 +1,6 @@ [{"data_loader:batch_size": 60, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -32,7 +31,6 @@ {"data_loader:batch_size": 255, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -66,7 +64,6 @@ {"data_loader:batch_size": 165, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -97,7 +94,6 @@ {"data_loader:batch_size": 299, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -129,7 +125,6 @@ {"data_loader:batch_size": 183, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -163,7 +158,6 @@ {"data_loader:batch_size": 21, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -192,7 +186,6 @@ {"data_loader:batch_size": 159, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -222,7 +215,6 @@ {"data_loader:batch_size": 
442, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -255,7 +247,6 @@ {"data_loader:batch_size": 140, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -288,7 +279,6 @@ {"data_loader:batch_size": 48, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -316,7 +306,6 @@ {"data_loader:batch_size": 168, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -349,7 +338,6 @@ {"data_loader:batch_size": 21, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -378,7 +366,6 @@ {"data_loader:batch_size": 163, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -411,7 +398,6 @@ {"data_loader:batch_size": 150, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -445,7 +431,6 @@ {"data_loader:batch_size": 151, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -475,7 +460,6 @@ {"data_loader:batch_size": 42, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index d0bb4056c..f7fa927be 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -244,10 +244,12 @@ def __init__(self, port=self.logger_port) self.logger.info("initialised {}".format(self.__class__.__name__)) - self.initial_configurations: Optional[List[Configuration]] = None if portfolio_selection is not None: - self.initial_configurations = read_return_initial_configurations(config_space=config_space, - portfolio_selection=portfolio_selection) + 
initial_configurations = read_return_initial_configurations(config_space=config_space,
+                                                                         portfolio_selection=portfolio_selection)
+            # in case we don't have any valid configuration from the portfolio
+            self.initial_configurations: Optional[List[Configuration]] = initial_configurations \
+                if len(initial_configurations) > 0 else None
 
     def reset_data_manager(self) -> None:
         if self.datamanager is not None:
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
index ea47e33b9..5902532e9 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
@@ -48,13 +48,16 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
             "TabularColumnTransformer": an instance of self
         """
         self.check_requirements(X, y)
-        numerical_pipeline = 'drop'
-        categorical_pipeline = 'drop'
+        # in case the preprocessing steps are disabled
+        # i.e., NoEncoder for categorical, we want to
+        # let the data in categorical columns pass through
+        numerical_pipeline = 'passthrough'
+        categorical_pipeline = 'passthrough'
 
         preprocessors = get_tabular_preprocessers(X)
-        if len(X['dataset_properties']['numerical_columns']):
+        if len(X['dataset_properties']['numerical_columns']) and len(preprocessors['numerical']):
             numerical_pipeline = make_pipeline(*preprocessors['numerical'])
-        if len(X['dataset_properties']['categorical_columns']):
+        if len(X['dataset_properties']['categorical_columns']) and len(preprocessors['categorical']):
             categorical_pipeline = make_pipeline(*preprocessors['categorical'])
 
         self.preprocessor = ColumnTransformer([
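The behavioural core of the change above (refined further in patches 3 and 4 below) is that a column group only receives a pipeline in the ColumnTransformer when it actually has fitted preprocessors, and remainder='passthrough' then forwards any untouched group, such as categorical columns under NoEncoder, instead of dropping it. A minimal sketch of that semantics in plain scikit-learn follows; the toy data, the column indices, and the StandardScaler stand-in are illustrative assumptions, not autoPyTorch code:

    import numpy as np
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    # Toy frame: columns 0/1 are numerical, column 2 is "categorical"
    # with no encoder configured (the NoEncoder case).
    X = np.array([[1.0, 10.0, 7.0],
                  [2.0, 20.0, 8.0],
                  [3.0, 30.0, 9.0]])

    numerical_preprocessors = [StandardScaler()]   # stand-in for preprocessors['numerical']
    categorical_preprocessors = []                 # NoEncoder leaves this empty

    # Mirror of the fit() logic above: only register a pipeline for a
    # group that actually has preprocessors.
    column_transformers = []
    if len(numerical_preprocessors) > 0:
        column_transformers.append(
            ('numerical_pipeline', make_pipeline(*numerical_preprocessors), [0, 1])
        )
    if len(categorical_preprocessors) > 0:
        column_transformers.append(
            ('categorical_pipeline', make_pipeline(*categorical_preprocessors), [2])
        )

    ct = ColumnTransformer(column_transformers, remainder='passthrough')
    Xt = ct.fit_transform(X)
    # Column 2 is appended unchanged after the scaled numerical columns
    # instead of being lost, as it was under the old 'drop' default.
    print(Xt[:, -1])  # [7. 8. 9.]

Under the pre-patch default of numerical_pipeline = categorical_pipeline = 'drop', the same input would lose column 2 entirely.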
From 6cdae18a85b54ea949b37d03964e987863045995 Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Fri, 4 Feb 2022 12:57:46 +0100
Subject: [PATCH 3/5] address comments from Eddie

---
 autoPyTorch/optimizer/smbo.py                 |  3 ++-
 .../TabularColumnTransformer.py               | 23 ++++++++++---------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py
index f7fa927be..7407f6ba5 100644
--- a/autoPyTorch/optimizer/smbo.py
+++ b/autoPyTorch/optimizer/smbo.py
@@ -244,11 +244,12 @@ def __init__(self,
                                     port=self.logger_port)
         self.logger.info("initialised {}".format(self.__class__.__name__))
 
+        self.initial_configurations: Optional[List[Configuration]] = None
         if portfolio_selection is not None:
             initial_configurations = read_return_initial_configurations(config_space=config_space,
                                                                         portfolio_selection=portfolio_selection)
             # in case we don't have any valid configuration from the portfolio
-            self.initial_configurations: Optional[List[Configuration]] = initial_configurations \
+            self.initial_configurations = initial_configurations \
                 if len(initial_configurations) > 0 else None
 
     def reset_data_manager(self) -> None:
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
index 5902532e9..935b740b6 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
@@ -1,7 +1,8 @@
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 
+from sklearn.base import BaseEstimator
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import make_pipeline
 
@@ -48,21 +49,21 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
             "TabularColumnTransformer": an instance of self
         """
         self.check_requirements(X, y)
-        # in case the preprocessing steps are disabled
-        # i.e., NoEncoder for categorical, we want to
-        # let the data in categorical columns pass through
-        numerical_pipeline = 'passthrough'
-        categorical_pipeline = 'passthrough'
 
         preprocessors = get_tabular_preprocessers(X)
-        if len(X['dataset_properties']['numerical_columns']) and len(preprocessors['numerical']):
+        column_transformers: List[Tuple(str, BaseEstimator, List[int])] = []
+        if len(preprocessors['numerical']) > 0:
             numerical_pipeline = make_pipeline(*preprocessors['numerical'])
-        if len(X['dataset_properties']['categorical_columns']) and len(preprocessors['categorical']):
+            column_transformers.append(('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']))
+        if len(preprocessors['categorical']) > 0:
             categorical_pipeline = make_pipeline(*preprocessors['categorical'])
+            column_transformers.append(('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns']))
 
-        self.preprocessor = ColumnTransformer([
-            ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']),
-            ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])],
+        # in case the preprocessing steps are disabled
+        # i.e., NoEncoder for categorical, we want to
+        # let the data in categorical columns pass through
+        self.preprocessor = ColumnTransformer(
+            column_transformers,
             remainder='passthrough'
         )

From 6e462a680ace078b48209deadde88c39902aedd8 Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Fri, 4 Feb 2022 13:02:18 +0100
Subject: [PATCH 4/5] fix flake and mypy error

---
 .../tabular_preprocessing/TabularColumnTransformer.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
index 935b740b6..bac12db4e 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
@@ -51,13 +51,17 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
         self.check_requirements(X, y)
 
         preprocessors = get_tabular_preprocessers(X)
-        column_transformers: List[Tuple(str, BaseEstimator, List[int])] = []
+        column_transformers: List[Tuple[str, BaseEstimator, List[int]]] = []
         if len(preprocessors['numerical']) > 0:
             numerical_pipeline = make_pipeline(*preprocessors['numerical'])
-            column_transformers.append(('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']))
+            column_transformers.append(
+                ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns'])
+            )
         if len(preprocessors['categorical']) > 0:
             categorical_pipeline = make_pipeline(*preprocessors['categorical'])
-            column_transformers.append(('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns']))
+            column_transformers.append(
+                ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])
+            )
 
         # in case the preprocessing steps are disabled
         # i.e., NoEncoder for categorical, we want to

From 5f7b538302738092159509679ee6461ac3284a84 Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Wed, 9 Feb 2022 11:39:52 +0100
Subject: [PATCH 5/5] fix test cases for imputation

---
 .../components/preprocessing/test_imputers.py | 48 ++++++++++---------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/test/test_pipeline/components/preprocessing/test_imputers.py b/test/test_pipeline/components/preprocessing/test_imputers.py
index 58377def7..0db460b77 100644
--- a/test/test_pipeline/components/preprocessing/test_imputers.py
+++ b/test/test_pipeline/components/preprocessing/test_imputers.py
@@ -81,16 +81,18 @@ def test_mean_imputation(self):
                                                   [4, 3.5, 3]]))
 
     def test_median_imputation(self):
-        data = np.array([[1.0, np.nan, 3],
-                         [np.nan, 8, 9],
-                         [4.0, 5, np.nan],
-                         [np.nan, 2, 3],
-                         [7.0, np.nan, 9],
-                         [4.0, np.nan, np.nan]])
+        data = np.array([[1.0, np.nan, 7],
+                         [np.nan, 9, 10],
+                         [10.0, 7, 7],
+                         [9.0, np.nan, 11],
+                         [9.0, 9, np.nan],
+                         [np.nan, 5, 6],
+                         [12.0, np.nan, 8],
+                         [9.0, np.nan, np.nan]])
         numerical_columns = [0, 1, 2]
         categorical_columns = []
-        train_indices = np.array([0, 2, 3])
-        test_indices = np.array([1, 4, 5])
+        train_indices = np.array([0, 2, 3, 4, 7])
+        test_indices = np.array([1, 5, 6])
         dataset_properties = {
             'categorical_columns': categorical_columns,
             'numerical_columns': numerical_columns,
@@ -118,21 +120,23 @@ def test_median_imputation(self):
         column_transformer = column_transformer.fit(X['X_train'])
         transformed = column_transformer.transform(data[test_indices])
 
-        assert_array_equal(transformed, np.array([[2.5, 8, 9],
-                                                  [7, 3.5, 9],
-                                                  [4, 3.5, 3]]))
+        assert_array_equal(transformed, np.array([[9, 9, 10],
+                                                  [9, 5, 6],
+                                                  [12, 8, 8]]))
 
     def test_frequent_imputation(self):
-        data = np.array([[1.0, np.nan, 3],
-                         [np.nan, 8, 9],
-                         [4.0, 5, np.nan],
-                         [np.nan, 2, 3],
-                         [7.0, np.nan, 9],
-                         [4.0, np.nan, np.nan]])
+        data = np.array([[1.0, np.nan, 7],
+                         [np.nan, 9, 10],
+                         [10.0, 7, 7],
+                         [9.0, np.nan, 11],
+                         [9.0, 9, np.nan],
+                         [np.nan, 5, 6],
+                         [12.0, np.nan, 8],
+                         [9.0, np.nan, np.nan]])
         numerical_columns = [0, 1, 2]
         categorical_columns = []
-        train_indices = np.array([0, 2, 3])
-        test_indices = np.array([1, 4, 5])
+        train_indices = np.array([0, 2, 4, 5, 7])
+        test_indices = np.array([1, 3, 6])
         dataset_properties = {
             'categorical_columns': categorical_columns,
             'numerical_columns': numerical_columns,
@@ -160,9 +164,9 @@ def test_frequent_imputation(self):
         column_transformer = column_transformer.fit(X['X_train'])
         transformed = column_transformer.transform(data[test_indices])
 
-        assert_array_equal(transformed, np.array([[1, 8, 9],
-                                                  [7, 2, 9],
-                                                  [4, 2, 3]]))
+        assert_array_equal(transformed, np.array([[9, 9, 10],
+                                                  [9, 5, 11],
+                                                  [12, 5, 8]]))
 
     def test_constant_imputation(self):
         data = np.array([[1.0, np.nan, 3],