automl · nabenabe0928 · Feb 9, 2022 · Feb 3, 2022 · Feb 3, 2022 · Feb 4, 2022
diff --git a/...Torch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/...Torch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py
@@ -13,70 +13,42 @@
 
 
 class SimpleImputer(BaseImputer):
-    """An imputer for categorical and numerical columns
-
-    Impute missing values for categorical columns with 'constant_!missing!'
-
-    Note:
-        In case of numpy data, the constant value is set to -1, under the assumption
-        that categorical data is fit with an Ordinal Scaler.
+    """
+    An imputer for numerical columns
 
     Attributes:
         random_state (Optional[np.random.RandomState]):
             The random state to use for the imputer.
         numerical_strategy (str: default='mean'):
             The strategy to use for imputing numerical columns.
             Can be one of ['most_frequent', 'constant_!missing!']
-        categorical_strategy (str: default='most_frequent')
-            The strategy to use for imputing categorical columns.
-            Can be one of ['mean', 'median', 'most_frequent', 'constant_zero']
     """
 
     def __init__(
         self,
         random_state: Optional[np.random.RandomState] = None,
         numerical_strategy: str = 'mean',
-        categorical_strategy: str = 'most_frequent'
     ):
-        """
-        Note:
-            'constant' as numerical_strategy uses 0 as the default fill_value while
-            'constant_!missing!' uses a fill_value of -1.
-            This behaviour should probably be fixed.
-        """
         super().__init__()
         self.random_state = random_state
         self.numerical_strategy = numerical_strategy
-        self.categorical_strategy = categorical_strategy
 
     def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer:
-        """ Fits the underlying model and returns the transformed array.
+        """
+        Builds the preprocessor based on the given fit dictionary 'X'.
 
         Args:
-            X (np.ndarray):
-                The input features to fit on
-            y (Optional[np.ndarray]):
-                The labels for the input features `X`
+            X (Dict[str, Any]):
+                The fit dictionary
+            y (Optional[Any]):
+                Not Used -- to comply with API
 
         Returns:
-            SimpleImputer:
-                returns self
+            self:
+                returns an instance of self.
         """
         self.check_requirements(X, y)
 
-        # Choose an imputer for any categorical columns
-        categorical_columns = X['dataset_properties']['categorical_columns']
-
-        if isinstance(categorical_columns, List) and len(categorical_columns) != 0:
-            if self.categorical_strategy == 'constant_!missing!':
-                # Train data is numpy as of this point, where an Ordinal Encoding is used
-                # for categoricals. Only Numbers are allowed for `fill_value`
-                imputer = SklearnSimpleImputer(strategy='constant', fill_value=-1, copy=False)
-                self.preprocessor['categorical'] = imputer
-            else:
-                imputer = SklearnSimpleImputer(strategy=self.categorical_strategy, copy=False)
-                self.preprocessor['categorical'] = imputer
-
         # Choose an imputer for any numerical columns
         numerical_columns = X['dataset_properties']['numerical_columns']
 
@@ -98,11 +70,6 @@ def get_hyperparameter_search_space(
             value_range=("mean", "median", "most_frequent", "constant_zero"),
             default_value="mean",
         ),
-        categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
-            hyperparameter='categorical_strategy',
-            value_range=("most_frequent", "constant_!missing!"),
-            default_value="most_frequent"
-        )
     ) -> ConfigurationSpace:
         """Get the hyperparameter search space for the SimpleImputer
 
@@ -112,8 +79,6 @@ def get_hyperparameter_search_space(
                 Note: Not actually Optional, just adhering to its supertype
             numerical_strategy (HyperparameterSearchSpace: default = ...)
                 The strategy to use for numerical imputation
-            caterogical_strategy (HyperparameterSearchSpace: default = ...)
-                The strategy to use for categorical imputation
 
         Returns:
             ConfigurationSpace
@@ -132,12 +97,6 @@ def get_hyperparameter_search_space(
         ):
             add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter)
 
-        if (
-            isinstance(dataset_properties['categorical_columns'], List)
-            and len(dataset_properties['categorical_columns'])
-        ):
-            add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter)
-
         return cs
 
     @staticmethod

diff --git a/...yTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py b/...yTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py
@@ -14,8 +14,7 @@ class BaseImputer(autoPyTorchTabularPreprocessingComponent):
     def __init__(self) -> None:
         super().__init__()
         self.add_fit_requirements([
-            FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True),
-            FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)])
+            FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)])
 
     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         """
@@ -26,7 +25,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         Returns:
             (Dict[str, Any]): the updated 'X' dictionary
         """
-        if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
+        if self.preprocessor['numerical'] is None and len(X["dataset_properties"]["numerical_columns"]) != 0:
             raise ValueError("cant call transform on {} without fitting first."
-            raise ValueError("cant call transform on {} without fitting first."
+            raise ValueError("cannot call transform on {} without fitting first."
-            raise ValueError("cant call transform on {} without fitting first."
+            raise ValueError("cannot call transform on {} without fitting first."
                              .format(self.__class__.__name__))
         X.update({'imputer': self.preprocessor})

diff --git a/test/test_pipeline/components/preprocessing/test_imputers.py b/test/test_pipeline/components/preprocessing/test_imputers.py
@@ -39,14 +39,14 @@ def test_get_config_space(self):
             self.assertEqual(param1, param2)
 
     def test_mean_imputation(self):
-        data = np.array([['1.0', np.nan, 3],
+        data = np.array([[1.0, np.nan, 3],
                          [np.nan, 8, 9],
-                         ['4.0', 5, np.nan],
+                         [4.0, 5, np.nan],
                          [np.nan, 2, 3],
-                         ['7.0', np.nan, 9],
-                         ['4.0', np.nan, np.nan]], dtype=object)
-        numerical_columns = [1, 2]
-        categorical_columns = [0]
+                         [7.0, np.nan, 9],
+                         [4.0, np.nan, np.nan]])
+        numerical_columns = [0, 1, 2]
+        categorical_columns = []
         train_indices = np.array([0, 2, 3])
         test_indices = np.array([1, 4, 5])
         dataset_properties = {
@@ -66,31 +66,29 @@ def test_mean_imputation(self):
 
         # check if the fit dictionary X is modified as expected
         self.assertIsInstance(X['imputer'], dict)
-        self.assertIsInstance(categorical_imputer, BaseEstimator)
+        self.assertIsNone(categorical_imputer)
         self.assertIsInstance(numerical_imputer, BaseEstimator)
 
         # make column transformer with returned encoder to fit on data
-        column_transformer = make_column_transformer((categorical_imputer,
-                                                      X['dataset_properties']['categorical_columns']),
-                                                     (numerical_imputer,
+        column_transformer = make_column_transformer((numerical_imputer,
                                                       X['dataset_properties']['numerical_columns']),
                                                      remainder='passthrough')
         column_transformer = column_transformer.fit(X['X_train'])
         transformed = column_transformer.transform(data[test_indices])
 
-        assert_array_equal(transformed.astype(str), np.array([[1.0, 8.0, 9.0],
-                                                             [7.0, 3.5, 9.0],
-                                                             [4.0, 3.5, 3.0]], dtype=str))
+        assert_array_equal(transformed, np.array([[2.5, 8, 9],
+                                                  [7, 3.5, 9],
+                                                  [4, 3.5, 3]]))
 
     def test_median_imputation(self):
-        data = np.array([['1.0', np.nan, 3],
+        data = np.array([[1.0, np.nan, 3],
                          [np.nan, 8, 9],
-                         ['4.0', 5, np.nan],
+                         [4.0, 5, np.nan],
                          [np.nan, 2, 3],
-                         ['7.0', np.nan, 9],
-                         ['4.0', np.nan, np.nan]], dtype=object)
-        numerical_columns = [1, 2]
-        categorical_columns = [0]
+                         [7.0, np.nan, 9],
+                         [4.0, np.nan, np.nan]])
+        numerical_columns = [0, 1, 2]
+        categorical_columns = []
         train_indices = np.array([0, 2, 3])
         test_indices = np.array([1, 4, 5])
         dataset_properties = {
@@ -110,31 +108,29 @@ def test_median_imputation(self):
 
         # check if the fit dictionary X is modified as expected
         self.assertIsInstance(X['imputer'], dict)
-        self.assertIsInstance(categorical_imputer, BaseEstimator)
+        self.assertIsNone(categorical_imputer)
         self.assertIsInstance(numerical_imputer, BaseEstimator)
 
         # make column transformer with returned encoder to fit on data
-        column_transformer = make_column_transformer(
-            (categorical_imputer, X['dataset_properties']['categorical_columns']),
-            (numerical_imputer, X['dataset_properties']['numerical_columns']),
-            remainder='passthrough'
-        )
+        column_transformer = make_column_transformer((numerical_imputer,
+                                                      X['dataset_properties']['numerical_columns']),
+                                                     remainder='passthrough')
         column_transformer = column_transformer.fit(X['X_train'])
         transformed = column_transformer.transform(data[test_indices])
 
-        assert_array_equal(transformed.astype(str), np.array([[1.0, 8.0, 9.0],
-                                                             [7.0, 3.5, 9.0],
-                                                             [4.0, 3.5, 3.0]], dtype=str))
+        assert_array_equal(transformed, np.array([[2.5, 8, 9],
+                                                  [7, 3.5, 9],
+                                                  [4, 3.5, 3]]))
 
     def test_frequent_imputation(self):
-        data = np.array([['1.0', np.nan, 3],
+        data = np.array([[1.0, np.nan, 3],
                          [np.nan, 8, 9],
-                         ['4.0', 5, np.nan],
+                         [4.0, 5, np.nan],
                          [np.nan, 2, 3],
-                         ['7.0', np.nan, 9],
-                         ['4.0', np.nan, np.nan]], dtype=object)
-        numerical_columns = [1, 2]
-        categorical_columns = [0]
+                         [7.0, np.nan, 9],
+                         [4.0, np.nan, np.nan]])
+        numerical_columns = [0, 1, 2]
+        categorical_columns = []
         train_indices = np.array([0, 2, 3])
         test_indices = np.array([1, 4, 5])
         dataset_properties = {
@@ -145,8 +141,7 @@ def test_frequent_imputation(self):
             'X_train': data[train_indices],
             'dataset_properties': dataset_properties
         }
-        imputer_component = SimpleImputer(numerical_strategy='most_frequent',
-                                          categorical_strategy='most_frequent')
+        imputer_component = SimpleImputer(numerical_strategy='most_frequent')
 
         imputer_component = imputer_component.fit(X)
         X = imputer_component.transform(X)
@@ -155,31 +150,29 @@ def test_frequent_imputation(self):
 
         # check if the fit dictionary X is modified as expected
         self.assertIsInstance(X['imputer'], dict)
-        self.assertIsInstance(categorical_imputer, BaseEstimator)
+        self.assertIsNone(categorical_imputer)
         self.assertIsInstance(numerical_imputer, BaseEstimator)
 
         # make column transformer with returned encoder to fit on data
-        column_transformer = make_column_transformer(
-            (categorical_imputer, X['dataset_properties']['categorical_columns']),
-            (numerical_imputer, X['dataset_properties']['numerical_columns']),
-            remainder='passthrough'
-        )
+        column_transformer = make_column_transformer((numerical_imputer,
+                                                      X['dataset_properties']['numerical_columns']),
+                                                     remainder='passthrough')
         column_transformer = column_transformer.fit(X['X_train'])
         transformed = column_transformer.transform(data[test_indices])
 
-        assert_array_equal(transformed.astype(str), np.array([[1.0, 8, 9],
-                                                             [7.0, 2, 9],
-                                                             [4.0, 2, 3]], dtype=str))
+        assert_array_equal(transformed, np.array([[1, 8, 9],
+                                                  [7, 2, 9],
+                                                  [4, 2, 3]]))
 
     def test_constant_imputation(self):
-        data = np.array([['1.0', np.nan, 3],
+        data = np.array([[1.0, np.nan, 3],
                          [np.nan, 8, 9],
-                         ['4.0', 5, np.nan],
+                         [4.0, 5, np.nan],
                          [np.nan, 2, 3],
-                         ['7.0', np.nan, 9],
-                         ['4.0', np.nan, np.nan]], dtype=object)
-        numerical_columns = [1, 2]
-        categorical_columns = [0]
+                         [7.0, np.nan, 9],
+                         [4.0, np.nan, np.nan]])
+        numerical_columns = [0, 1, 2]
+        categorical_columns = []
         train_indices = np.array([0, 2, 3])
         test_indices = np.array([1, 4, 5])
         dataset_properties = {
@@ -190,8 +183,7 @@ def test_constant_imputation(self):
             'X_train': data[train_indices],
             'dataset_properties': dataset_properties
         }
-        imputer_component = SimpleImputer(numerical_strategy='constant_zero',
-                                          categorical_strategy='constant_!missing!')
+        imputer_component = SimpleImputer(numerical_strategy='constant_zero')
 
         imputer_component = imputer_component.fit(X)
         X = imputer_component.transform(X)
@@ -200,20 +192,18 @@ def test_constant_imputation(self):
 
         # check if the fit dictionary X is modified as expected
         self.assertIsInstance(X['imputer'], dict)
-        self.assertIsInstance(categorical_imputer, BaseEstimator)
+        self.assertIsNone(categorical_imputer)
         self.assertIsInstance(numerical_imputer, BaseEstimator)
 
         # make column transformer with returned encoder to fit on data
-        column_transformer = make_column_transformer(
-            (categorical_imputer, X['dataset_properties']['categorical_columns']),
-            (numerical_imputer, X['dataset_properties']['numerical_columns']),
-            remainder='passthrough'
-        )
+        column_transformer = make_column_transformer((numerical_imputer,
+                                                      X['dataset_properties']['numerical_columns']),
+                                                     remainder='passthrough')
         column_transformer = column_transformer.fit(X['X_train'])
         transformed = column_transformer.transform(data[test_indices])
-        assert_array_equal(transformed.astype(str), np.array([['-1', 8, 9],
-                                                             [7.0, '0', 9],
-                                                             [4.0, '0', '0']], dtype=str))
+        assert_array_equal(transformed, np.array([[0, 8, 9],
+                                                  [7, 0, 9],
+                                                  [4, 0, 0]]))
 
     def test_imputation_without_dataset_properties_raises_error(self):
         """Tests SimpleImputer checks for dataset properties when querying for