From 9ca5889fda4002e3e7e3b1f32df7b714832c6a75 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 3 Feb 2022 14:41:17 +0100 Subject: [PATCH 1/5] remove categorical strategy from simple imputer --- .../imputation/SimpleImputer.py | 61 ++-------- .../imputation/base_imputer.py | 5 +- .../components/preprocessing/test_imputers.py | 114 ++++++++---------- 3 files changed, 64 insertions(+), 116 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py index 3d7ca22b1..608ee8ec5 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py @@ -13,13 +13,8 @@ class SimpleImputer(BaseImputer): - """An imputer for categorical and numerical columns - - Impute missing values for categorical columns with 'constant_!missing!' - - Note: - In case of numpy data, the constant value is set to -1, under the assumption - that categorical data is fit with an Ordinal Scaler. + """ + An imputer for numerical columns Attributes: random_state (Optional[np.random.RandomState]): @@ -27,56 +22,33 @@ class SimpleImputer(BaseImputer): numerical_strategy (str: default='mean'): The strategy to use for imputing numerical columns. Can be one of ['most_frequent', 'constant_!missing!'] - categorical_strategy (str: default='most_frequent') - The strategy to use for imputing categorical columns. - Can be one of ['mean', 'median', 'most_frequent', 'constant_zero'] """ def __init__( self, random_state: Optional[np.random.RandomState] = None, numerical_strategy: str = 'mean', - categorical_strategy: str = 'most_frequent' ): - """ - Note: - 'constant' as numerical_strategy uses 0 as the default fill_value while - 'constant_!missing!' uses a fill_value of -1. - This behaviour should probably be fixed. - """ super().__init__() self.random_state = random_state self.numerical_strategy = numerical_strategy - self.categorical_strategy = categorical_strategy def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer: - """ Fits the underlying model and returns the transformed array. + """ + Builds the preprocessor based on the given fit dictionary 'X'. Args: - X (np.ndarray): - The input features to fit on - y (Optional[np.ndarray]): - The labels for the input features `X` + X (Dict[str, Any]): + The fit dictionary + y (Optional[Any]): + Not Used -- to comply with API Returns: - SimpleImputer: - returns self + self: + returns an instance of self. """ self.check_requirements(X, y) - # Choose an imputer for any categorical columns - categorical_columns = X['dataset_properties']['categorical_columns'] - - if isinstance(categorical_columns, List) and len(categorical_columns) != 0: - if self.categorical_strategy == 'constant_!missing!': - # Train data is numpy as of this point, where an Ordinal Encoding is used - # for categoricals. 
Only Numbers are allowed for `fill_value` - imputer = SklearnSimpleImputer(strategy='constant', fill_value=-1, copy=False) - self.preprocessor['categorical'] = imputer - else: - imputer = SklearnSimpleImputer(strategy=self.categorical_strategy, copy=False) - self.preprocessor['categorical'] = imputer - # Choose an imputer for any numerical columns numerical_columns = X['dataset_properties']['numerical_columns'] @@ -98,11 +70,6 @@ def get_hyperparameter_search_space( value_range=("mean", "median", "most_frequent", "constant_zero"), default_value="mean", ), - categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter='categorical_strategy', - value_range=("most_frequent", "constant_!missing!"), - default_value="most_frequent" - ) ) -> ConfigurationSpace: """Get the hyperparameter search space for the SimpleImputer @@ -112,8 +79,6 @@ def get_hyperparameter_search_space( Note: Not actually Optional, just adhering to its supertype numerical_strategy (HyperparameterSearchSpace: default = ...) The strategy to use for numerical imputation - caterogical_strategy (HyperparameterSearchSpace: default = ...) - The strategy to use for categorical imputation Returns: ConfigurationSpace @@ -132,12 +97,6 @@ def get_hyperparameter_search_space( ): add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter) - if ( - isinstance(dataset_properties['categorical_columns'], List) - and len(dataset_properties['categorical_columns']) - ): - add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter) - return cs @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py index b65f3c229..1f33a765a 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py @@ -14,8 +14,7 @@ class BaseImputer(autoPyTorchTabularPreprocessingComponent): def __init__(self) -> None: super().__init__() self.add_fit_requirements([ - FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True), - FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)]) + FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)]) def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ @@ -26,7 +25,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: Returns: (Dict[str, Any]): the updated 'X' dictionary """ - if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: + if self.preprocessor['numerical'] is None and len(X["dataset_properties"]["numerical_columns"]) != 0: raise ValueError("cant call transform on {} without fitting first." 
.format(self.__class__.__name__)) X.update({'imputer': self.preprocessor}) diff --git a/test/test_pipeline/components/preprocessing/test_imputers.py b/test/test_pipeline/components/preprocessing/test_imputers.py index 18b43bfa6..58377def7 100644 --- a/test/test_pipeline/components/preprocessing/test_imputers.py +++ b/test/test_pipeline/components/preprocessing/test_imputers.py @@ -39,14 +39,14 @@ def test_get_config_space(self): self.assertEqual(param1, param2) def test_mean_imputation(self): - data = np.array([['1.0', np.nan, 3], + data = np.array([[1.0, np.nan, 3], [np.nan, 8, 9], - ['4.0', 5, np.nan], + [4.0, 5, np.nan], [np.nan, 2, 3], - ['7.0', np.nan, 9], - ['4.0', np.nan, np.nan]], dtype=object) - numerical_columns = [1, 2] - categorical_columns = [0] + [7.0, np.nan, 9], + [4.0, np.nan, np.nan]]) + numerical_columns = [0, 1, 2] + categorical_columns = [] train_indices = np.array([0, 2, 3]) test_indices = np.array([1, 4, 5]) dataset_properties = { @@ -66,31 +66,29 @@ def test_mean_imputation(self): # check if the fit dictionary X is modified as expected self.assertIsInstance(X['imputer'], dict) - self.assertIsInstance(categorical_imputer, BaseEstimator) + self.assertIsNone(categorical_imputer) self.assertIsInstance(numerical_imputer, BaseEstimator) # make column transformer with returned encoder to fit on data - column_transformer = make_column_transformer((categorical_imputer, - X['dataset_properties']['categorical_columns']), - (numerical_imputer, + column_transformer = make_column_transformer((numerical_imputer, X['dataset_properties']['numerical_columns']), remainder='passthrough') column_transformer = column_transformer.fit(X['X_train']) transformed = column_transformer.transform(data[test_indices]) - assert_array_equal(transformed.astype(str), np.array([[1.0, 8.0, 9.0], - [7.0, 3.5, 9.0], - [4.0, 3.5, 3.0]], dtype=str)) + assert_array_equal(transformed, np.array([[2.5, 8, 9], + [7, 3.5, 9], + [4, 3.5, 3]])) def test_median_imputation(self): - data = np.array([['1.0', np.nan, 3], + data = np.array([[1.0, np.nan, 3], [np.nan, 8, 9], - ['4.0', 5, np.nan], + [4.0, 5, np.nan], [np.nan, 2, 3], - ['7.0', np.nan, 9], - ['4.0', np.nan, np.nan]], dtype=object) - numerical_columns = [1, 2] - categorical_columns = [0] + [7.0, np.nan, 9], + [4.0, np.nan, np.nan]]) + numerical_columns = [0, 1, 2] + categorical_columns = [] train_indices = np.array([0, 2, 3]) test_indices = np.array([1, 4, 5]) dataset_properties = { @@ -110,31 +108,29 @@ def test_median_imputation(self): # check if the fit dictionary X is modified as expected self.assertIsInstance(X['imputer'], dict) - self.assertIsInstance(categorical_imputer, BaseEstimator) + self.assertIsNone(categorical_imputer) self.assertIsInstance(numerical_imputer, BaseEstimator) # make column transformer with returned encoder to fit on data - column_transformer = make_column_transformer( - (categorical_imputer, X['dataset_properties']['categorical_columns']), - (numerical_imputer, X['dataset_properties']['numerical_columns']), - remainder='passthrough' - ) + column_transformer = make_column_transformer((numerical_imputer, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') column_transformer = column_transformer.fit(X['X_train']) transformed = column_transformer.transform(data[test_indices]) - assert_array_equal(transformed.astype(str), np.array([[1.0, 8.0, 9.0], - [7.0, 3.5, 9.0], - [4.0, 3.5, 3.0]], dtype=str)) + assert_array_equal(transformed, np.array([[2.5, 8, 9], + [7, 3.5, 9], + [4, 3.5, 3]])) def 
test_frequent_imputation(self): - data = np.array([['1.0', np.nan, 3], + data = np.array([[1.0, np.nan, 3], [np.nan, 8, 9], - ['4.0', 5, np.nan], + [4.0, 5, np.nan], [np.nan, 2, 3], - ['7.0', np.nan, 9], - ['4.0', np.nan, np.nan]], dtype=object) - numerical_columns = [1, 2] - categorical_columns = [0] + [7.0, np.nan, 9], + [4.0, np.nan, np.nan]]) + numerical_columns = [0, 1, 2] + categorical_columns = [] train_indices = np.array([0, 2, 3]) test_indices = np.array([1, 4, 5]) dataset_properties = { @@ -145,8 +141,7 @@ def test_frequent_imputation(self): 'X_train': data[train_indices], 'dataset_properties': dataset_properties } - imputer_component = SimpleImputer(numerical_strategy='most_frequent', - categorical_strategy='most_frequent') + imputer_component = SimpleImputer(numerical_strategy='most_frequent') imputer_component = imputer_component.fit(X) X = imputer_component.transform(X) @@ -155,31 +150,29 @@ def test_frequent_imputation(self): # check if the fit dictionary X is modified as expected self.assertIsInstance(X['imputer'], dict) - self.assertIsInstance(categorical_imputer, BaseEstimator) + self.assertIsNone(categorical_imputer) self.assertIsInstance(numerical_imputer, BaseEstimator) # make column transformer with returned encoder to fit on data - column_transformer = make_column_transformer( - (categorical_imputer, X['dataset_properties']['categorical_columns']), - (numerical_imputer, X['dataset_properties']['numerical_columns']), - remainder='passthrough' - ) + column_transformer = make_column_transformer((numerical_imputer, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') column_transformer = column_transformer.fit(X['X_train']) transformed = column_transformer.transform(data[test_indices]) - assert_array_equal(transformed.astype(str), np.array([[1.0, 8, 9], - [7.0, 2, 9], - [4.0, 2, 3]], dtype=str)) + assert_array_equal(transformed, np.array([[1, 8, 9], + [7, 2, 9], + [4, 2, 3]])) def test_constant_imputation(self): - data = np.array([['1.0', np.nan, 3], + data = np.array([[1.0, np.nan, 3], [np.nan, 8, 9], - ['4.0', 5, np.nan], + [4.0, 5, np.nan], [np.nan, 2, 3], - ['7.0', np.nan, 9], - ['4.0', np.nan, np.nan]], dtype=object) - numerical_columns = [1, 2] - categorical_columns = [0] + [7.0, np.nan, 9], + [4.0, np.nan, np.nan]]) + numerical_columns = [0, 1, 2] + categorical_columns = [] train_indices = np.array([0, 2, 3]) test_indices = np.array([1, 4, 5]) dataset_properties = { @@ -190,8 +183,7 @@ def test_constant_imputation(self): 'X_train': data[train_indices], 'dataset_properties': dataset_properties } - imputer_component = SimpleImputer(numerical_strategy='constant_zero', - categorical_strategy='constant_!missing!') + imputer_component = SimpleImputer(numerical_strategy='constant_zero') imputer_component = imputer_component.fit(X) X = imputer_component.transform(X) @@ -200,20 +192,18 @@ def test_constant_imputation(self): # check if the fit dictionary X is modified as expected self.assertIsInstance(X['imputer'], dict) - self.assertIsInstance(categorical_imputer, BaseEstimator) + self.assertIsNone(categorical_imputer) self.assertIsInstance(numerical_imputer, BaseEstimator) # make column transformer with returned encoder to fit on data - column_transformer = make_column_transformer( - (categorical_imputer, X['dataset_properties']['categorical_columns']), - (numerical_imputer, X['dataset_properties']['numerical_columns']), - remainder='passthrough' - ) + column_transformer = make_column_transformer((numerical_imputer, + 
X['dataset_properties']['numerical_columns']), + remainder='passthrough') column_transformer = column_transformer.fit(X['X_train']) transformed = column_transformer.transform(data[test_indices]) - assert_array_equal(transformed.astype(str), np.array([['-1', 8, 9], - [7.0, '0', 9], - [4.0, '0', '0']], dtype=str)) + assert_array_equal(transformed, np.array([[0, 8, 9], + [7, 0, 9], + [4, 0, 0]])) def test_imputation_without_dataset_properties_raises_error(self): """Tests SimpleImputer checks for dataset properties when querying for From b388947edc6a4d142d420da795185057a5040ee6 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 3 Feb 2022 18:36:33 +0100 Subject: [PATCH 2/5] fix tests --- autoPyTorch/configs/greedy_portfolio.json | 16 ---------------- autoPyTorch/optimizer/smbo.py | 8 +++++--- .../TabularColumnTransformer.py | 11 +++++++---- 3 files changed, 12 insertions(+), 23 deletions(-) diff --git a/autoPyTorch/configs/greedy_portfolio.json b/autoPyTorch/configs/greedy_portfolio.json index a8e640a4e..ffc5d98f5 100644 --- a/autoPyTorch/configs/greedy_portfolio.json +++ b/autoPyTorch/configs/greedy_portfolio.json @@ -1,7 +1,6 @@ [{"data_loader:batch_size": 60, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -32,7 +31,6 @@ {"data_loader:batch_size": 255, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -66,7 +64,6 @@ {"data_loader:batch_size": 165, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -97,7 +94,6 @@ {"data_loader:batch_size": 299, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -129,7 +125,6 @@ {"data_loader:batch_size": 183, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -163,7 +158,6 @@ {"data_loader:batch_size": 21, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -192,7 +186,6 @@ {"data_loader:batch_size": 159, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -222,7 +215,6 @@ {"data_loader:batch_size": 
442, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -255,7 +247,6 @@ {"data_loader:batch_size": 140, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -288,7 +279,6 @@ {"data_loader:batch_size": 48, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -316,7 +306,6 @@ {"data_loader:batch_size": 168, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -349,7 +338,6 @@ {"data_loader:batch_size": 21, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -378,7 +366,6 @@ {"data_loader:batch_size": 163, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -411,7 +398,6 @@ {"data_loader:batch_size": 150, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -445,7 +431,6 @@ {"data_loader:batch_size": 151, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -475,7 +460,6 @@ {"data_loader:batch_size": 42, "encoder:__choice__": "OneHotEncoder", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index d0bb4056c..f7fa927be 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -244,10 +244,12 @@ def __init__(self, port=self.logger_port) self.logger.info("initialised {}".format(self.__class__.__name__)) - self.initial_configurations: Optional[List[Configuration]] = None if portfolio_selection is not None: - self.initial_configurations = read_return_initial_configurations(config_space=config_space, - portfolio_selection=portfolio_selection) + 
initial_configurations = read_return_initial_configurations(config_space=config_space,
+                                                                         portfolio_selection=portfolio_selection)
+            # in case we don't have any valid configuration from the portfolio
+            self.initial_configurations: Optional[List[Configuration]] = initial_configurations \
+                if len(initial_configurations) > 0 else None
 
     def reset_data_manager(self) -> None:
         if self.datamanager is not None:
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
index ea47e33b9..5902532e9 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
@@ -48,13 +48,16 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
             "TabularColumnTransformer": an instance of self
         """
         self.check_requirements(X, y)
-        numerical_pipeline = 'drop'
-        categorical_pipeline = 'drop'
+        # in case the preprocessing steps are disabled
+        # i.e., NoEncoder for categorical, we want to
+        # let the data in categorical columns pass through
+        numerical_pipeline = 'passthrough'
+        categorical_pipeline = 'passthrough'
 
         preprocessors = get_tabular_preprocessers(X)
-        if len(X['dataset_properties']['numerical_columns']):
+        if len(X['dataset_properties']['numerical_columns']) and len(preprocessors['numerical']):
             numerical_pipeline = make_pipeline(*preprocessors['numerical'])
-        if len(X['dataset_properties']['categorical_columns']):
+        if len(X['dataset_properties']['categorical_columns']) and len(preprocessors['categorical']):
             categorical_pipeline = make_pipeline(*preprocessors['categorical'])
 
         self.preprocessor = ColumnTransformer([
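The behavioural core of the change above (refined further in patches 3 and 4 below) is that a column group only receives a pipeline in the ColumnTransformer when it actually has fitted preprocessors, and remainder='passthrough' then forwards any untouched group, such as categorical columns under NoEncoder, instead of dropping it. A minimal sketch of that semantics in plain scikit-learn follows; the toy data, the column indices, and the StandardScaler stand-in are illustrative assumptions, not autoPyTorch code:

    import numpy as np
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    # Toy frame: columns 0/1 are numerical, column 2 is "categorical"
    # with no encoder configured (the NoEncoder case).
    X = np.array([[1.0, 10.0, 7.0],
                  [2.0, 20.0, 8.0],
                  [3.0, 30.0, 9.0]])

    numerical_preprocessors = [StandardScaler()]   # stand-in for preprocessors['numerical']
    categorical_preprocessors = []                 # NoEncoder leaves this empty

    # Mirror of the fit() logic above: only register a pipeline for a
    # group that actually has preprocessors.
    column_transformers = []
    if len(numerical_preprocessors) > 0:
        column_transformers.append(
            ('numerical_pipeline', make_pipeline(*numerical_preprocessors), [0, 1])
        )
    if len(categorical_preprocessors) > 0:
        column_transformers.append(
            ('categorical_pipeline', make_pipeline(*categorical_preprocessors), [2])
        )

    ct = ColumnTransformer(column_transformers, remainder='passthrough')
    Xt = ct.fit_transform(X)
    # Column 2 is appended unchanged after the scaled numerical columns
    # instead of being lost, as it was under the old 'drop' default.
    print(Xt[:, -1])  # [7. 8. 9.]

Under the pre-patch default of numerical_pipeline = categorical_pipeline = 'drop', the same input would lose column 2 entirely.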
From 6cdae18a85b54ea949b37d03964e987863045995 Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Fri, 4 Feb 2022 12:57:46 +0100
Subject: [PATCH 3/5] address comments from Eddie

---
 autoPyTorch/optimizer/smbo.py                 |  3 ++-
 .../TabularColumnTransformer.py               | 23 ++++++++++---------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py
index f7fa927be..7407f6ba5 100644
--- a/autoPyTorch/optimizer/smbo.py
+++ b/autoPyTorch/optimizer/smbo.py
@@ -244,11 +244,12 @@ def __init__(self,
                                     port=self.logger_port)
         self.logger.info("initialised {}".format(self.__class__.__name__))
 
+        self.initial_configurations: Optional[List[Configuration]] = None
         if portfolio_selection is not None:
             initial_configurations = read_return_initial_configurations(config_space=config_space,
                                                                         portfolio_selection=portfolio_selection)
             # in case we don't have any valid configuration from the portfolio
-            self.initial_configurations: Optional[List[Configuration]] = initial_configurations \
+            self.initial_configurations = initial_configurations \
                 if len(initial_configurations) > 0 else None
 
     def reset_data_manager(self) -> None:
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
index 5902532e9..935b740b6 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
@@ -1,7 +1,8 @@
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 
+from sklearn.base import BaseEstimator
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import make_pipeline
 
@@ -48,21 +49,21 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
             "TabularColumnTransformer": an instance of self
         """
         self.check_requirements(X, y)
-        # in case the preprocessing steps are disabled
-        # i.e., NoEncoder for categorical, we want to
-        # let the data in categorical columns pass through
-        numerical_pipeline = 'passthrough'
-        categorical_pipeline = 'passthrough'
 
         preprocessors = get_tabular_preprocessers(X)
-        if len(X['dataset_properties']['numerical_columns']) and len(preprocessors['numerical']):
+        column_transformers: List[Tuple(str, BaseEstimator, List[int])] = []
+        if len(preprocessors['numerical']) > 0:
             numerical_pipeline = make_pipeline(*preprocessors['numerical'])
-        if len(X['dataset_properties']['categorical_columns']) and len(preprocessors['categorical']):
+            column_transformers.append(('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']))
+        if len(preprocessors['categorical']) > 0:
             categorical_pipeline = make_pipeline(*preprocessors['categorical'])
+            column_transformers.append(('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns']))
 
-        self.preprocessor = ColumnTransformer([
-            ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']),
-            ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])],
+        # in case the preprocessing steps are disabled
+        # i.e., NoEncoder for categorical, we want to
+        # let the data in categorical columns pass through
+        self.preprocessor = ColumnTransformer(
+            column_transformers,
             remainder='passthrough'
         )

From 6e462a680ace078b48209deadde88c39902aedd8 Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Fri, 4 Feb 2022 13:02:18 +0100
Subject: [PATCH 4/5] fix flake and mypy error

---
 .../tabular_preprocessing/TabularColumnTransformer.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
index 935b740b6..bac12db4e 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
@@ -51,13 +51,17 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
         self.check_requirements(X, y)
 
         preprocessors = get_tabular_preprocessers(X)
-        column_transformers: List[Tuple(str, BaseEstimator, List[int])] = []
+        column_transformers: List[Tuple[str, BaseEstimator, List[int]]] = []
         if len(preprocessors['numerical']) > 0:
             numerical_pipeline = make_pipeline(*preprocessors['numerical'])
-            column_transformers.append(('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']))
+            column_transformers.append(
+                ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns'])
+            )
         if len(preprocessors['categorical']) > 0:
             categorical_pipeline = make_pipeline(*preprocessors['categorical'])
-            column_transformers.append(('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns']))
+            column_transformers.append(
+                ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])
+            )
 
         # in case the preprocessing steps are disabled
         # i.e., NoEncoder for categorical, we want to

From 5f7b538302738092159509679ee6461ac3284a84 Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Wed, 9 Feb 2022 11:39:52 +0100
Subject: [PATCH 5/5] fix test cases for imputation

---
 .../components/preprocessing/test_imputers.py | 48 ++++++++++---------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/test/test_pipeline/components/preprocessing/test_imputers.py b/test/test_pipeline/components/preprocessing/test_imputers.py
index 58377def7..0db460b77 100644
--- a/test/test_pipeline/components/preprocessing/test_imputers.py
+++ b/test/test_pipeline/components/preprocessing/test_imputers.py
@@ -81,16 +81,18 @@ def test_mean_imputation(self):
                                                   [4, 3.5, 3]]))
 
     def test_median_imputation(self):
-        data = np.array([[1.0, np.nan, 3],
-                         [np.nan, 8, 9],
-                         [4.0, 5, np.nan],
-                         [np.nan, 2, 3],
-                         [7.0, np.nan, 9],
-                         [4.0, np.nan, np.nan]])
+        data = np.array([[1.0, np.nan, 7],
+                         [np.nan, 9, 10],
+                         [10.0, 7, 7],
+                         [9.0, np.nan, 11],
+                         [9.0, 9, np.nan],
+                         [np.nan, 5, 6],
+                         [12.0, np.nan, 8],
+                         [9.0, np.nan, np.nan]])
         numerical_columns = [0, 1, 2]
         categorical_columns = []
-        train_indices = np.array([0, 2, 3])
-        test_indices = np.array([1, 4, 5])
+        train_indices = np.array([0, 2, 3, 4, 7])
+        test_indices = np.array([1, 5, 6])
         dataset_properties = {
             'categorical_columns': categorical_columns,
             'numerical_columns': numerical_columns,
@@ -118,21 +120,23 @@ def test_median_imputation(self):
         column_transformer = column_transformer.fit(X['X_train'])
         transformed = column_transformer.transform(data[test_indices])
 
-        assert_array_equal(transformed, np.array([[2.5, 8, 9],
-                                                  [7, 3.5, 9],
-                                                  [4, 3.5, 3]]))
+        assert_array_equal(transformed, np.array([[9, 9, 10],
+                                                  [9, 5, 6],
+                                                  [12, 8, 8]]))
 
     def test_frequent_imputation(self):
-        data = np.array([[1.0, np.nan, 3],
-                         [np.nan, 8, 9],
-                         [4.0, 5, np.nan],
-                         [np.nan, 2, 3],
-                         [7.0, np.nan, 9],
-                         [4.0, np.nan, np.nan]])
+        data = np.array([[1.0, np.nan, 7],
+                         [np.nan, 9, 10],
+                         [10.0, 7, 7],
+                         [9.0, np.nan, 11],
+                         [9.0, 9, np.nan],
+                         [np.nan, 5, 6],
+                         [12.0, np.nan, 8],
+                         [9.0, np.nan, np.nan]])
         numerical_columns = [0, 1, 2]
         categorical_columns = []
-        train_indices = np.array([0, 2, 3])
-        test_indices = np.array([1, 4, 5])
+        train_indices = np.array([0, 2, 4, 5, 7])
+        test_indices = np.array([1, 3, 6])
         dataset_properties = {
             'categorical_columns': categorical_columns,
             'numerical_columns': numerical_columns,
@@ -160,9 +164,9 @@ def test_frequent_imputation(self):
         column_transformer = column_transformer.fit(X['X_train'])
         transformed = column_transformer.transform(data[test_indices])
 
-        assert_array_equal(transformed, np.array([[1, 8, 9],
-                                                  [7, 2, 9],
-                                                  [4, 2, 3]]))
+        assert_array_equal(transformed, np.array([[9, 9, 10],
+                                                  [9, 5, 11],
+                                                  [12, 5, 8]]))
 
     def test_constant_imputation(self):
         data = np.array([[1.0, np.nan, 3],