Cleanup of simple_imputer #346

Merged 7 commits on Dec 1, 2021

@@ -1,9 +1,7 @@
from typing import Any, Dict, List, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
CategoricalHyperparameter
)
from ConfigSpace.hyperparameters import CategoricalHyperparameter

import numpy as np

@@ -15,92 +13,143 @@


class SimpleImputer(BaseImputer):
"""
Impute missing values for categorical columns with '!missing!'
(In case of numpy data, the constant value is set to -1, under
the assumption that categorical data is fit with an Ordinal Scaler)
"""An imputer for categorical and numerical columns

Impute missing values for categorical columns with 'constant_!missing!'

Note:
In case of numpy data, the constant value is set to -1, under the assumption
that categorical data is fit with an Ordinal Scaler.

Attributes:
random_state (Optional[np.random.RandomState]):
The random state to use for the imputer.
numerical_strategy (str: default='mean'):
The strategy to use for imputing numerical columns.
Can be one of ['mean', 'median', 'most_frequent', 'constant_zero']
categorical_strategy (str: default='most_frequent'):
The strategy to use for imputing categorical columns.
Can be one of ['most_frequent', 'constant_!missing!']
"""

def __init__(self,
random_state: Optional[Union[np.random.RandomState, int]] = None,
numerical_strategy: str = 'mean',
categorical_strategy: str = 'most_frequent'):
def __init__(
self,
random_state: Optional[np.random.RandomState] = None,
numerical_strategy: str = 'mean',
categorical_strategy: str = 'most_frequent'
):
"""
Note:
'constant' as numerical_strategy uses 0 as the default fill_value while
'constant_!missing!' uses a fill_value of -1.
This behaviour should probably be fixed.
"""
super().__init__()
self.random_state = random_state
self.numerical_strategy = numerical_strategy
self.categorical_strategy = categorical_strategy

def fit(self, X: Dict[str, Any], y: Any = None) -> BaseImputer:
"""
The fit function calls the fit function of the underlying model
and returns the transformed array.
def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer:
""" Fits the underlying model and returns the transformed array.

Args:
X (np.ndarray): input features
y (Optional[np.ndarray]): input labels
X (np.ndarray):
The input features to fit on
y (Optional[np.ndarray]):
The labels for the input features `X`

Returns:
instance of self
SimpleImputer:
returns self
"""
self.check_requirements(X, y)
categorical_columns = X['dataset_properties']['categorical_columns'] \
if isinstance(X['dataset_properties']['categorical_columns'], List) else []
if len(categorical_columns) != 0:

# Choose an imputer for any categorical columns
categorical_columns = X['dataset_properties']['categorical_columns']

if isinstance(categorical_columns, List) and len(categorical_columns) != 0:
if self.categorical_strategy == 'constant_!missing!':
self.preprocessor['categorical'] = SklearnSimpleImputer(strategy='constant',
# Train data is numpy
# as of this point, where
# Ordinal Encoding is using
# for categorical. Only
# Numbers are allowed
# fill_value='!missing!',
fill_value=-1,
copy=False)
# Train data is numpy as of this point, where an Ordinal Encoding is used
# for categoricals. Only Numbers are allowed for `fill_value`
imputer = SklearnSimpleImputer(strategy='constant', fill_value=-1, copy=False)
self.preprocessor['categorical'] = imputer
else:
self.preprocessor['categorical'] = SklearnSimpleImputer(strategy=self.categorical_strategy,
copy=False)
numerical_columns = X['dataset_properties']['numerical_columns'] \
if isinstance(X['dataset_properties']['numerical_columns'], List) else []
if len(numerical_columns) != 0:
imputer = SklearnSimpleImputer(strategy=self.categorical_strategy, copy=False)
self.preprocessor['categorical'] = imputer

# Choose an imputer for any numerical columns
numerical_columns = X['dataset_properties']['numerical_columns']

if isinstance(numerical_columns, List) and len(numerical_columns) > 0:
if self.numerical_strategy == 'constant_zero':
self.preprocessor['numerical'] = SklearnSimpleImputer(strategy='constant',
fill_value=0,
copy=False)
imputer = SklearnSimpleImputer(strategy='constant', fill_value=0, copy=False)
self.preprocessor['numerical'] = imputer
else:
self.preprocessor['numerical'] = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False)
imputer = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False)
self.preprocessor['numerical'] = imputer

return self

@staticmethod
def get_hyperparameter_search_space(
dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='numerical_strategy',
value_range=("mean", "median",
"most_frequent",
"constant_zero"),
default_value="mean",
),
numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
hyperparameter='numerical_strategy',
value_range=("mean", "median", "most_frequent", "constant_zero"),
Collaborator

It would be better if we could use an Enum.
We did not have them mostly because I did not join the refactoring phase.
At least, I would like to reduce the dependence on hard-coded strings as much as possible,
because this file also uses many hard-coded strings, which we could avoid with an enum.

from enum import Enum
from typing import List


class NumericalImputerChoices(Enum):
    mean = "mean"
    median = "median"
    most_frequent = "most_frequent"
    constant_zero = "constant_zero"

    @classmethod
    def get_choices(cls) -> List[str]:
        return [c.value for c in cls]


class CategoricalImputerChoices(Enum):
    most_frequent = "most_frequent"
    constant_missing = "constant_!missing!"

    @classmethod
    def get_choices(cls) -> List[str]:
        return [c.value for c in cls]

Contributor Author

I can do that; you would, however, need to update these whenever sklearn updates them, as that is where the values are forwarded to.

Contributor Author
@eddiebergman (Dec 1, 2021)

I can see why you want to do that, but it makes the code that uses it a little less pretty. For example, it's hard to see where the error in the following code is:

numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
    hyperparameter='numerical_strategy',
    value_range=NumericalImputerChoice.get_choices(),
    default_value=NumericalImputerChoice.mean,
),

It should be default_value=NumericalImputerChoice.mean.value. The problem here is that Enums are namespaced, i.e.

  • str(NumericalImputerChoice.mean) == "NumericalImputerChoice.mean"

... whereas we would like

  • NumericalImputerChoice.mean == "mean".

This is more readily achievable with a NamedTuple, meaning no enum machinery is required. This is similar to my Java days: in general, enum values should never be used directly, such as for their string value, but rather more as flags.
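
As a quick standalone sketch of the namespacing issue (not from the diff, purely illustrative):

from enum import Enum


class NumericalImputerChoice(Enum):
    mean = "mean"


# The member is a namespaced object, not a plain string
assert NumericalImputerChoice.mean != "mean"
assert str(NumericalImputerChoice.mean) == "NumericalImputerChoice.mean"

# The underlying string is only reachable via `.value`
assert NumericalImputerChoice.mean.value == "mean"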

I will implement the Enum version and you can change it if you like or leave it as is.

Contributor Author

Another clean-ish solution is to just ditch the enum part.

class Choices:
    x: str = "x"
    y: str = "y"


assert Choices.x == "x"

The type is still a plain string, meaning it's easy to use; people don't need to know about the existence of this class to use it, they can still pass a string. It also allows the internal code to use the class, where we do know about it.
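
For example, a caller could then pass either form; a minimal sketch with hypothetical names (NumericalStrategy and make_imputer are not from the PR):

class NumericalStrategy:
    mean: str = "mean"
    constant_zero: str = "constant_zero"


def make_imputer(numerical_strategy: str = NumericalStrategy.mean) -> None:
    # The argument is a plain string either way
    print(f"numerical_strategy={numerical_strategy!r}")


make_imputer(NumericalStrategy.constant_zero)  # internal code can use the class attributes
make_imputer("constant_zero")                  # users can still pass a raw string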

Contributor Author

As another note, it also makes the parameters extremely long; it's 101 characters, which is over the character limit of the checker:

    def __init__(
        self,
        random_state: Optional[np.random.RandomState] = None,
        numerical_strategy: NumericalImputerChoice = NumericalImputerChoice.mean.value,
        categorical_strategy: CategoricalImputerChoice = CategoricalImputerChoice.most_frequent.value
    ):

Contributor Author

I've not changed it for now; there are too many decisions to make, which I think are best addressed based on how you would like it done yourself. I've changed it back to strings.

Contributor
@ravinkohli (Dec 1, 2021)

Hey, thanks for the insights. Yeah, I think such changes should be part of a separate PR, as this PR is meant to clean up the messy statements in this file. Also, it would be better if the hyperparameter strings in all the components were consistent, so it would require changing a lot of files, which I think is beyond the scope of this PR.

Contributor Author

We have the same thing: there is Literal at the moment, but the type checking for it doesn't work as expected.

from typing import Literal


def f(s: Literal['yes', 'no']) -> None:
    ...


f("yes")  # Okay: a string literal matches

param = "yes"  # inferred as `str`, not Literal['yes']
f(param)  # Type error: "str" is incompatible with Literal['yes', 'no']

Collaborator
@nabenabe0928 (Dec 1, 2021)

All the problems come from the fact that we still need to support python3.7.
Once we switch to python3.8, we can use Literal...

> in general, enum values should never be directly used, such as their string value

Yeah, I agree with you. I am just uncomfortable using software that assumes users google the string choices (typically sklearn).
And I found it really useful to be able to use an enum or a class, as in numpy or Facebook Ax.
But I got your point, let's stay with strings for now.
I did not know some of the points you mentioned, thanks for raising them, @eddiebergman.

(Just a question, but) would a better solution be something like this:

from typing import NamedTuple


class NumericalImputerChoices(NamedTuple):
    mean: str = "mean"
    median: str = "median"


num_imputer_choices = NumericalImputerChoices()

and use str for all the type annotations.

Btw, you do not need to put .value (FYI).
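
Continuing the sketch above, the attributes come back as plain strings, so nothing extra is needed when forwarding them:

# num_imputer_choices is the NamedTuple instance defined above
assert num_imputer_choices.mean == "mean"
assert isinstance(num_imputer_choices.median, str)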

Contributor Author

also, you can add python after the backticks ``` of a code block to get python highlighting ;)

default_value="mean",
),
categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
hyperparameter='categorical_strategy',
value_range=("most_frequent",
"constant_!missing!"),
default_value="most_frequent")
value_range=("most_frequent", "constant_!missing!"),
default_value="most_frequent"
)
) -> ConfigurationSpace:
"""Get the hyperparameter search space for the SimpleImputer

Args:
dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]])
Properties that describe the dataset
Note: Not actually Optional, just adhering to its supertype
numerical_strategy (HyperparameterSearchSpace: default = ...)
The strategy to use for numerical imputation
categorical_strategy (HyperparameterSearchSpace: default = ...)
The strategy to use for categorical imputation

Returns:
ConfigurationSpace
The space of possible configurations for a SimpleImputer with the given
`dataset_properties`
"""
cs = ConfigurationSpace()
assert dataset_properties is not None, "To create hyperparameter search space" \
", dataset_properties should not be None"
if len(dataset_properties['numerical_columns']) \
if isinstance(dataset_properties['numerical_columns'], List) else 0 != 0:

if dataset_properties is None:
raise ValueError("SimpleImputer requires `dataset_properties` for generating"
" a search space.")

if (
isinstance(dataset_properties['numerical_columns'], List)
and len(dataset_properties['numerical_columns']) != 0
):
add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter)

if len(dataset_properties['categorical_columns']) \
if isinstance(dataset_properties['categorical_columns'], List) else 0 != 0:
if (
isinstance(dataset_properties['categorical_columns'], List)
and len(dataset_properties['categorical_columns'])
):
add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter)

return cs

@staticmethod
def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
) -> Dict[str, Union[str, bool]]:
def get_properties(
dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
) -> Dict[str, Union[str, bool]]:
"""Get the properties of the SimpleImputer class and what it can handle

Returns:
Dict[str, Union[str, bool]]:
A dict from property names to values
"""
return {
'shortname': 'SimpleImputer',
'name': 'Simple Imputer',
test/test_pipeline/components/preprocessing/test_imputers.py (12 additions, 0 deletions)
@@ -3,6 +3,8 @@
import numpy as np
from numpy.testing import assert_array_equal

import pytest

from sklearn.base import BaseEstimator, clone
from sklearn.compose import make_column_transformer

@@ -213,6 +215,16 @@ def test_constant_imputation(self):
[7.0, '0', 9],
[4.0, '0', '0']], dtype=str))

def test_imputation_without_dataset_properties_raises_error(self):
"""Tests SimpleImputer checks for dataset properties when querying for
HyperparameterSearchSpace, even though the arg is marked `Optional`.

Expects:
* Should raise a ValueError that no dataset_properties were passed
"""
with pytest.raises(ValueError):
SimpleImputer.get_hyperparameter_search_space()


if __name__ == '__main__':
unittest.main()