From 359b4c9239cbbb327c76808a8b7bd7d0749730f8 Mon Sep 17 00:00:00 2001
From: Arlind Kadra <arlindkadra@gmail.com>
Date: Fri, 1 Oct 2021 16:06:36 +0200
Subject: [PATCH 01/24] Initial try at an enhancement for the tabular validator

---
 autoPyTorch/data/base_feature_validator.py    |  28 ++++
 autoPyTorch/data/tabular_feature_validator.py | 129 ++++++++++++------
 2 files changed, 116 insertions(+), 41 deletions(-)

diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py
index 0106a3aa8..ae2b60196 100644
--- a/autoPyTorch/data/base_feature_validator.py
+++ b/autoPyTorch/data/base_feature_validator.py
@@ -122,6 +122,7 @@ def _fit(
             self:
                 The fitted base estimator
         """
+
         raise NotImplementedError()
 
     def _check_data(
@@ -136,6 +137,7 @@ def _check_data(
                 A set of features that are going to be validated (type and dimensionality
                 checks) and a encoder fitted in the case the data needs encoding
         """
+
         raise NotImplementedError()
 
     def transform(
@@ -152,4 +154,30 @@ def transform(
             np.ndarray:
                 The transformed array
         """
+
+        raise NotImplementedError()
+
+    def list_to_dataframe(
+        self,
+        X_train: SUPPORTED_FEAT_TYPES,
+        X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None,
+    ) -> typing.Tuple[pd.DataFrame, typing.Optional[pd.DataFrame]]:
+        """
+        Converts a list to a pandas DataFrame. In this process, column types are inferred.
+
+        If test data is provided, we proactively match it to train data
+
+        Arguments:
+            X_train (SUPPORTED_FEAT_TYPES):
+                The training features, given as a list, that are converted to a
+                pandas DataFrame with inferred column types
+            X_test (typing.Optional[SUPPORTED_FEAT_TYPES]):
+                A hold out set of data used for checking
+        Returns:
+            pd.DataFrame:
+                transformed train data from list to pandas DataFrame
+            typing.Optional[pd.DataFrame]:
+                transformed test data from list to pandas DataFrame, if test data was given
+        """
+
         raise NotImplementedError()
diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index 698e92438..b46ba12ae 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -1,6 +1,7 @@
 import functools
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
+
 import numpy as np
 
 import pandas as pd
@@ -38,6 +39,7 @@ def _create_column_transformer(
     Returns:
         ColumnTransformer
     """
+
     numerical_pipeline = 'drop'
     categorical_pipeline = 'drop'
     if len(numerical_columns) > 0:
@@ -63,18 +65,25 @@ def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]:
     preprocessors['numerical'] = list()
     preprocessors['categorical'] = list()
 
-    preprocessors['categorical'].append(OneHotEncoder(
-        categories='auto',
-        sparse=False,
-        handle_unknown='ignore'))
-    preprocessors['numerical'].append(SimpleImputer(strategy='median',
-                                                    copy=False))
-    preprocessors['numerical'].append(StandardScaler(with_mean=True, with_std=True, copy=False))
+    preprocessors['categorical'].append(
+        OneHotEncoder(
+            categories='auto',
+            sparse=False,
+            handle_unknown='ignore',
+        )
+    )
+    preprocessors['numerical'].append(
+        SimpleImputer(
+            strategy='median',
+            copy=False,
+        )
+    )
 
     return preprocessors
 
 
 class TabularFeatureValidator(BaseFeatureValidator):
+
     def _fit(
         self,
         X: SUPPORTED_FEAT_TYPES,
@@ -96,24 +105,27 @@ def _fit(
         # The final output of a validator is a numpy array. But pandas
         # gives us information about the column dtype
         if isinstance(X, np.ndarray):
+
             X = self.numpy_array_to_pandas(X)
+            # Replace the data type from the previously saved type.
+            self.data_type = type(X)
+            # save all the information about the column order and data types
+            self._check_data(X)
 
         if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
-            X = cast(pd.DataFrame, X)
-
-            if not X.select_dtypes(include='object').empty:
-                X = self.infer_objects(X)
 
-            self._check_data(X)
+            X = cast(pd.DataFrame, X)
             categorical_columns, numerical_columns, feat_type = self._get_columns_info(X)
 
             self.enc_columns = categorical_columns
             if len(categorical_columns) >= 0:
                 X = self.impute_nan_in_categories(X)
             preprocessors = get_tabular_preprocessors()
-            self.column_transformer = _create_column_transformer(preprocessors=preprocessors,
-                                                                 numerical_columns=numerical_columns,
-                                                                 categorical_columns=categorical_columns)
+            self.column_transformer = _create_column_transformer(
+                preprocessors=preprocessors,
+                numerical_columns=numerical_columns,
+                categorical_columns=categorical_columns,
+            )
 
             # Mypy redefinition
             assert self.column_transformer is not None
@@ -142,13 +154,15 @@ def comparator(cmp1: str, cmp2: str) -> int:
 
             if len(categorical_columns) > 0:
                 self.categories = [
-                    # We fit an ordinal encoder, where all categorical
+                    # We fit an one-hot encoder, where all categorical
                     # columns are shifted to the left
                     list(range(len(cat)))
                     for cat in self.column_transformer.named_transformers_[
                         'categorical_pipeline'].named_steps['onehotencoder'].categories_
                 ]
 
+            # Unlike the local categorical_columns and numerical_columns (column names),
+            # these attributes store the positional index of each column.
             for i, type_ in enumerate(self.feat_type):
                 if 'numerical' in type_:
                     self.numerical_columns.append(i)
@@ -156,7 +170,8 @@ def comparator(cmp1: str, cmp2: str) -> int:
                     self.categorical_columns.append(i)
 
         # Lastly, store the number of features
-        self.num_features = np.shape(X)[1]
+        self.num_features = len(X.columns)
+
         return self
 
     def transform(
@@ -189,10 +204,6 @@ def transform(
         if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
             X = cast(pd.DataFrame, X)
 
-            # Also remove the object dtype for new data
-            if not X.select_dtypes(include='object').empty:
-                X = self.infer_objects(X)
-
         # Check the data here so we catch problems on new test data
         self._check_data(X)
         # We also need to fillna on the transformation
@@ -268,13 +279,13 @@ def _check_data(
             X = cast(pd.DataFrame, X)
 
             # Handle objects if possible
-            if not X.select_dtypes(include='object').empty:
+            object_columns_indicator = has_object_columns(X.dtypes)
+            if object_columns_indicator:
                 X = self.infer_objects(X)
 
             # Define the column to be encoded here as the feature validator is fitted once
             # per estimator
             # enc_columns, _ = self._get_columns_to_encode(X)
-
             column_order = [column for column in X.columns]
             if len(self.column_order) > 0:
                 if self.column_order != column_order:
@@ -310,8 +321,10 @@ def _get_columns_info(
                 A set of features that are going to be validated (type and dimensionality
                 checks) and a encoder fitted in the case the data needs encoding
         Returns:
-            enc_columns (List[str]):
-                Columns to encode, if any
+            categorical_columns: (List[str])
+                Categorical columns.
+            numerical_columns: (List[str])
+                Numerical columns.
             feat_type:
                 Type of each column numerical/categorical
         """
@@ -323,14 +336,15 @@ def _get_columns_info(
 
         # Make sure each column is a valid type
         for i, column in enumerate(X.columns):
-            if X[column].dtype.name in ['category', 'bool']:
-
+            column_dtype = self.dtypes[i]
+            if column_dtype.name in ['category', 'bool']:
                 categorical_columns.append(column)
                 feat_type.append('categorical')
             # Move away from np.issubdtype as it causes
             # TypeError: data type not understood in certain pandas types
-            elif not is_numeric_dtype(X[column]):
-                if X[column].dtype.name == 'object':
+            elif not is_numeric_dtype(column_dtype):
+                # TODO verify how would this happen when we always convert the object dtypes to category
+                if column_dtype.name == 'object':
                     raise ValueError(
                         "Input Column {} has invalid type object. "
                         "Cast it to a valid dtype before using it in AutoPyTorch. "
@@ -345,7 +359,7 @@ def _get_columns_info(
                         )
                     )
                 elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype(
-                    X[column].dtype
+                    column_dtype
                 ):
                     raise ValueError(
                         "AutoPyTorch does not support time and/or date datatype as given "
@@ -362,7 +376,7 @@ def _get_columns_info(
                         "Make sure your data is formatted in a correct way, "
                         "before feeding it to AutoPyTorch.".format(
                             column,
-                            X[column].dtype.name,
+                            column_dtype.name,
                         )
                     )
             else:
@@ -394,7 +408,7 @@ def list_to_dataframe(
         """
 
         # If a list was provided, it will be converted to pandas
-        X_train = pd.DataFrame(data=X_train).infer_objects()
+        X_train = pd.DataFrame(data=X_train).convert_dtypes()
         self.logger.warning("The provided feature types to AutoPyTorch are of type list."
                             "Features have been interpreted as: {}".format([(col, t) for col, t in
                                                                             zip(X_train.columns, X_train.dtypes)]))
@@ -403,7 +417,8 @@ def list_to_dataframe(
                 self.logger.warning("Train features are a list while the provided test data"
                                     "is {}. X_test will be casted as DataFrame.".format(type(X_test))
                                     )
-            X_test = pd.DataFrame(data=X_test).infer_objects()
+            X_test = pd.DataFrame(data=X_test).convert_dtypes()
+
         return X_train, X_test
 
     @staticmethod
@@ -446,17 +461,21 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
                     self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}")
                     pass
         else:
+            # Calling for the first time to infer the categories
             X = X.infer_objects()
-            for column in X.columns:
-                if not is_numeric_dtype(X[column]):
+            # initial data types
+            data_types = X.dtypes
+            for index, column in enumerate(X.columns):
+                if not is_numeric_dtype(data_types[index]):
                     X[column] = X[column].astype('category')
-            self.object_dtype_mapping = {column: X[column].dtype for column in X.columns}
+            # only numerical attributes and categories
+            data_types = X.dtypes
+            self.object_dtype_mapping = {column: data_types[index] for index, column in enumerate(X.columns)}
         self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}")
+
         return X
 
-    def impute_nan_in_categories(self,
-                                 X: pd.DataFrame
-                                 ) -> pd.DataFrame:
+    def impute_nan_in_categories(self, X: pd.DataFrame) -> pd.DataFrame:
         """
         impute missing values before encoding,
         remove once sklearn natively supports
@@ -481,14 +500,16 @@ def impute_nan_in_categories(self,
             if X[column].isna().any():
                 if column not in self.dict_missing_value_per_col:
                     try:
-                        float(X[column].dropna().values[0])
+                        first_value = X[column].dropna().values[0]
+                        float(first_value)
                         can_cast_as_number = True
                     except Exception:
                         can_cast_as_number = False
                     if can_cast_as_number:
                         # In this case, we expect to have a number as category
                         # it might be string, but its value represent a number
-                        missing_value: Union[str, int] = '-1' if isinstance(X[column].dropna().values[0], str) else -1
+
+                        missing_value: Union[str, int] = '-1' if isinstance(first_value, str) else -1
                     else:
                         missing_value = 'Missing!'
 
@@ -507,4 +528,30 @@ def impute_nan_in_categories(self,
                 X[column].cat.add_categories([self.dict_missing_value_per_col[column]],
                                              inplace=True)
                 X.fillna({column: self.dict_missing_value_per_col[column]}, inplace=True)
+
         return X
+
+def has_object_columns(
+    feature_types: pd.Series,
+) -> bool:
+    """
+    Indicate whether on a Series of dtypes for a Pandas DataFrame
+    there exists one or more object columns.
+
+    Arguments:
+    ----------
+    feature_types: pd.Series
+        The feature types for a DataFrame.
+    Returns:
+    --------
+    bool
+        True if the DataFrame dtypes contain an object column, False
+        otherwise.
+    """
+    object_columns_indicator = [True if pd.api.types.is_object_dtype(feature_type) else False
+                                for feature_type in feature_types]
+
+    if True in object_columns_indicator:
+        return True
+    else:
+        return False

From 65e8ffb97520599a3060db594c7735affee688b4 Mon Sep 17 00:00:00 2001
From: Arlind Kadra <arlindkadra@gmail.com>
Date: Fri, 1 Oct 2021 16:10:33 +0200
Subject: [PATCH 02/24] Adding a few type annotations

---
 autoPyTorch/data/tabular_feature_validator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index b46ba12ae..06979533d 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -23,7 +23,7 @@
 def _create_column_transformer(
     preprocessors: Dict[str, List[BaseEstimator]],
     numerical_columns: List[str],
-    categorical_columns: List[str]
+    categorical_columns: List[str],
 ) -> ColumnTransformer:
     """
     Given a dictionary of preprocessors, this function

From 217c38dd43b012d3839f757cce7877299aac74cc Mon Sep 17 00:00:00 2001
From: ArlindKadra <arlindkadra@gmail.com>
Date: Fri, 1 Oct 2021 16:56:26 +0200
Subject: [PATCH 03/24] Fixing bugs in implementation

---
 autoPyTorch/data/tabular_feature_validator.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index 06979533d..31d702301 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -337,14 +337,14 @@ def _get_columns_info(
         # Make sure each column is a valid type
         for i, column in enumerate(X.columns):
             column_dtype = self.dtypes[i]
-            if column_dtype.name in ['category', 'bool']:
+            if column_dtype in ['category', 'bool']:
                 categorical_columns.append(column)
                 feat_type.append('categorical')
             # Move away from np.issubdtype as it causes
             # TypeError: data type not understood in certain pandas types
             elif not is_numeric_dtype(column_dtype):
                 # TODO verify how would this happen when we always convert the object dtypes to category
-                if column_dtype.name == 'object':
+                if column_dtype == 'object':
                     raise ValueError(
                         "Input Column {} has invalid type object. "
                         "Cast it to a valid dtype before using it in AutoPyTorch. "
@@ -376,7 +376,7 @@ def _get_columns_info(
                         "Make sure your data is formatted in a correct way, "
                         "before feeding it to AutoPyTorch.".format(
                             column,
-                            column_dtype.name,
+                            column_dtype,
                         )
                     )
             else:

From f7dd8fe9cb2a3cd8dcbe7a9dc435c3ffc16bddf8 Mon Sep 17 00:00:00 2001
From: Arlind Kadra <arlindkadra@gmail.com>
Date: Fri, 1 Oct 2021 18:14:11 +0200
Subject: [PATCH 04/24] Adding wrongly deleted code part during rebase

---
 autoPyTorch/data/tabular_feature_validator.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index 31d702301..700a8d6d1 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -78,6 +78,13 @@ def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]:
             copy=False,
         )
     )
+    preprocessors['numerical'].append(
+        StandardScaler(
+            with_mean=True,
+            with_std=True,
+            copy=False,
+        )
+    )
 
     return preprocessors
 

From 92bd535b73e5afef01ea7cbb74b2b7ea2a179c2c Mon Sep 17 00:00:00 2001
From: Ravin Kohli <kohliravin7@gmail.com>
Date: Sat, 2 Oct 2021 13:33:07 +0200
Subject: [PATCH 05/24] Fix bug in _get_args

---
 .../setup/network_embedding/base_network_embedding.py       | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py
index 18028cddd..14bf00653 100644
--- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py
+++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py
@@ -30,8 +30,8 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
 
     def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module:
         raise NotImplementedError
-    #
-    # def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]:
+    
+    def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]:
     #     # Feature preprocessors can alter numerical columns
     #     # if len(X['dataset_properties']['numerical_columns']) == 0:
     #     #     num_numerical_columns = 0
@@ -49,4 +49,4 @@ def build_embedding(self, num_input_features: np.ndarray, num_numerical_features
     #     # for i, category in enumerate(categories):
     #     #     num_input_features[num_numerical_columns + i, ] = len(category)
     #     # return num_numerical_columns, num_input_features
-    #     return None, None
+        return None, None

From 5f672b550d2fdc25c2a3865c20a78e44f04e796a Mon Sep 17 00:00:00 2001
From: Ravin Kohli <kohliravin7@gmail.com>
Date: Sat, 2 Oct 2021 13:46:27 +0200
Subject: [PATCH 06/24] Fix bug in _get_args

---
 .../setup/network_embedding/base_network_embedding.py           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py
index 14bf00653..7d9b1df3f 100644
--- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py
+++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py
@@ -1,5 +1,5 @@
 # import copy
-from typing import Any, Dict, Optional  # , Tuple
+from typing import Any, Dict, Optional, Tuple
 
 import numpy as np
 

From 223c09e1d43fc7715739bf09be79c09e52b84960 Mon Sep 17 00:00:00 2001
From: Arlind Kadra <arlindkadra@gmail.com>
Date: Sun, 3 Oct 2021 18:07:39 +0200
Subject: [PATCH 07/24] Addressing Shuhei's comments

---
 autoPyTorch/data/tabular_feature_validator.py | 54 +++++++------------
 1 file changed, 20 insertions(+), 34 deletions(-)

diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index 700a8d6d1..e64cefb9a 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -62,29 +62,13 @@ def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]:
         Dict[str, List[BaseEstimator]]
     """
     preprocessors: Dict[str, List[BaseEstimator]] = dict()
-    preprocessors['numerical'] = list()
-    preprocessors['categorical'] = list()
-
-    preprocessors['categorical'].append(
-        OneHotEncoder(
-            categories='auto',
-            sparse=False,
-            handle_unknown='ignore',
-        )
-    )
-    preprocessors['numerical'].append(
-        SimpleImputer(
-            strategy='median',
-            copy=False,
-        )
-    )
-    preprocessors['numerical'].append(
-        StandardScaler(
-            with_mean=True,
-            with_std=True,
-            copy=False,
-        )
-    )
+
+    onehot_encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore')
+    imputer = SimpleImputer(strategy='median', copy=False)
+    standard_scaler = StandardScaler(with_mean=True, with_std=True, copy=False)
+
+    preprocessors['categorical'] = [onehot_encoder]
+    preprocessors['numerical'] = [imputer, standard_scaler]
 
     return preprocessors
 
@@ -161,7 +145,7 @@ def comparator(cmp1: str, cmp2: str) -> int:
 
             if len(categorical_columns) > 0:
                 self.categories = [
-                    # We fit an one-hot encoder, where all categorical
+                    # We fit a one-hot encoder, where all categorical
                     # columns are shifted to the left
                     list(range(len(cat)))
                     for cat in self.column_transformer.named_transformers_[
@@ -477,7 +461,8 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
                     X[column] = X[column].astype('category')
             # only numerical attributes and categories
             data_types = X.dtypes
-            self.object_dtype_mapping = {column: data_types[index] for index, column in enumerate(X.columns)}
+            self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)}
+
         self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}")
 
         return X
@@ -504,13 +489,16 @@ def impute_nan_in_categories(self, X: pd.DataFrame) -> pd.DataFrame:
         # TypeError: '<' not supported between instances of 'int' and 'str'
         # in the encoding
         for column in self.enc_columns:
-            if X[column].isna().any():
+            # no missing values for categorical column
+            if not X[column].isna().any():
+                continue
+            else:
                 if column not in self.dict_missing_value_per_col:
                     try:
                         first_value = X[column].dropna().values[0]
                         float(first_value)
                         can_cast_as_number = True
-                    except Exception:
+                    except ValueError:
                         can_cast_as_number = False
                     if can_cast_as_number:
                         # In this case, we expect to have a number as category
@@ -555,10 +543,8 @@ def has_object_columns(
         True if the DataFrame dtypes contain an object column, False
         otherwise.
     """
-    object_columns_indicator = [True if pd.api.types.is_object_dtype(feature_type) else False
-                                for feature_type in feature_types]
-
-    if True in object_columns_indicator:
-        return True
-    else:
-        return False
+    for feature_type in feature_types:
+        if pd.api.types.is_object_dtype(feature_type):
+            return True
+        else:
+            return False

From a1ed8830783c13f8a1fa899057257df949740f21 Mon Sep 17 00:00:00 2001
From: Arlind Kadra <arlindkadra@gmail.com>
Date: Mon, 4 Oct 2021 00:12:28 +0200
Subject: [PATCH 08/24] Address Shuhei's comments

---
 autoPyTorch/data/tabular_feature_validator.py | 27 +++++++++----------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index e64cefb9a..5ae2cd22c 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -270,7 +270,7 @@ def _check_data(
             X = cast(pd.DataFrame, X)
 
             # Handle objects if possible
-            object_columns_indicator = has_object_columns(X.dtypes)
+            object_columns_indicator = has_object_columns(X.dtypes.values)
             if object_columns_indicator:
                 X = self.infer_objects(X)
 
@@ -480,6 +480,13 @@ def impute_nan_in_categories(self, X: pd.DataFrame) -> pd.DataFrame:
             pd.DataFrame
         """
 
+        def can_cast_as_number(value: Union[int, float, str]) -> bool:
+            try:
+                float(value)  # probe the argument itself, not the enclosing scope's first_value
+                return True
+            except (TypeError, ValueError):  # unparsable string or non-numeric object
+                return False
+
         # To be on the safe side, map always to the same missing
         # value per column
         if not hasattr(self, 'dict_nancol_to_missing'):
@@ -494,16 +501,12 @@ def impute_nan_in_categories(self, X: pd.DataFrame) -> pd.DataFrame:
                 continue
             else:
                 if column not in self.dict_missing_value_per_col:
-                    try:
-                        first_value = X[column].dropna().values[0]
-                        float(first_value)
-                        can_cast_as_number = True
-                    except ValueError:
-                        can_cast_as_number = False
-                    if can_cast_as_number:
+
+                    first_value = X[column].dropna().values[0]
+
+                    if can_cast_as_number(first_value):
                         # In this case, we expect to have a number as category
                         # it might be string, but its value represent a number
-
                         missing_value: Union[str, int] = '-1' if isinstance(first_value, str) else -1
                     else:
                         missing_value = 'Missing!'
@@ -543,8 +546,4 @@ def has_object_columns(
         True if the DataFrame dtypes contain an object column, False
         otherwise.
     """
-    for feature_type in feature_types:
-        if pd.api.types.is_object_dtype(feature_type):
-            return True
-        else:
-            return False
+    return np.dtype('O') in feature_types

From f5853101fd5c2d6def81a1486b38ff769bc54449 Mon Sep 17 00:00:00 2001
From: ArlindKadra <arlindkadra@gmail.com>
Date: Wed, 6 Oct 2021 19:29:13 +0200
Subject: [PATCH 09/24] Refactoring code

---
 autoPyTorch/data/tabular_feature_validator.py | 59 ++++++++++++++-----
 1 file changed, 44 insertions(+), 15 deletions(-)

diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index 5ae2cd22c..866353e6f 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -1,7 +1,6 @@
 import functools
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
-
 import numpy as np
 
 import pandas as pd
@@ -512,14 +511,9 @@ def can_cast_as_number(value: Union[int, float, str]) -> bool:
                         missing_value = 'Missing!'
 
                     # Make sure this missing value is not seen before
-                    # Do this check for categorical columns
-                    # else modify the value
                     if hasattr(X[column], 'cat'):
-                        while missing_value in X[column].cat.categories:
-                            if isinstance(missing_value, str):
-                                missing_value += '0'
-                            else:
-                                missing_value += missing_value
+                        missing_value = get_unused_category_symbol(X[column], missing_value)
+
                     self.dict_missing_value_per_col[column] = missing_value
 
                 # Convert the frame in place
@@ -529,6 +523,7 @@ def can_cast_as_number(value: Union[int, float, str]) -> bool:
 
         return X
 
+
 def has_object_columns(
     feature_types: pd.Series,
 ) -> bool:
@@ -537,13 +532,47 @@ def has_object_columns(
     there exists one or more object columns.
 
     Arguments:
-    ----------
-    feature_types: pd.Series
-        The feature types for a DataFrame.
+        feature_types (pd.Series):
+            The feature types for a DataFrame.
     Returns:
-    --------
-    bool
-        True if the DataFrame dtypes contain an object column, False
-        otherwise.
+        bool:
+            True if the DataFrame dtypes contain an object column, False
+            otherwise.
     """
     return np.dtype('O') in feature_types
+
+
+def get_unused_category_symbol(
+    frame_column: pd.Series,
+    missing_value_symbol: Union[int, str],
+) -> Union[int, str]:
+    """
+    Select the appropriate missing value symbol for a column.
+
+    Given a column from a DataFrame and an initial missing value symbol,
+    check if the symbol is already one of the column's categories; if it is,
+    modify it so that it no longer collides with an existing category.
+
+    Arguments:
+        frame_column (pd.Series):
+            The DataFrame column.
+        missing_value_symbol (Union[int, str]):
+            The initial symbol for the missing value.
+
+    Returns:
+        missing_value_symbol (Union[int, str]):
+            The unique missing value symbol.
+    """
+
+    if missing_value_symbol not in frame_column.cat.categories:
+        pass
+    elif isinstance(missing_value_symbol, str):
+        max_length = max(len(c) for c in frame_column.cat.categories)
+        missing_value_symbol += '0' * max_length
+    else:
+        # min_value is guaranteed to be negative since there exists -1 in categories
+        # and min_value must be smaller than -1. So the symbol is always negative.
+        min_value = min(c for c in frame_column.cat.categories)
+        missing_value_symbol = missing_value_symbol + min_value
+
+    return missing_value_symbol

From f298c46d0130698a8c635e2e3ed36fc40c2a50dd Mon Sep 17 00:00:00 2001
From: ArlindKadra <arlindkadra@gmail.com>
Date: Wed, 6 Oct 2021 19:31:54 +0200
Subject: [PATCH 10/24] Refactoring code

---
 autoPyTorch/data/tabular_feature_validator.py | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index 866353e6f..8e5ed452d 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -498,23 +498,23 @@ def can_cast_as_number(value: Union[int, float, str]) -> bool:
             # no missing values for categorical column
             if not X[column].isna().any():
                 continue
-            else:
-                if column not in self.dict_missing_value_per_col:
 
-                    first_value = X[column].dropna().values[0]
+            if column not in self.dict_missing_value_per_col:
+
+                first_value = X[column].dropna().values[0]
 
-                    if can_cast_as_number(first_value):
-                        # In this case, we expect to have a number as category
-                        # it might be string, but its value represent a number
-                        missing_value: Union[str, int] = '-1' if isinstance(first_value, str) else -1
-                    else:
-                        missing_value = 'Missing!'
+                if can_cast_as_number(first_value):
+                    # In this case, we expect to have a number as category
+                    # it might be string, but its value represent a number
+                    missing_value: Union[str, int] = '-1' if isinstance(first_value, str) else -1
+                else:
+                    missing_value = 'Missing!'
 
-                    # Make sure this missing value is not seen before
-                    if hasattr(X[column], 'cat'):
-                        missing_value = get_unused_category_symbol(X[column], missing_value)
+                # Make sure this missing value is not seen before
+                if hasattr(X[column], 'cat'):
+                    missing_value = get_unused_category_symbol(X[column], missing_value)
 
-                    self.dict_missing_value_per_col[column] = missing_value
+                self.dict_missing_value_per_col[column] = missing_value
 
                 # Convert the frame in place
                 X[column].cat.add_categories([self.dict_missing_value_per_col[column]],

From 03bef163443b42f88fe577d5420c4055ac7330da Mon Sep 17 00:00:00 2001
From: ArlindKadra <arlindkadra@gmail.com>
Date: Wed, 6 Oct 2021 20:24:01 +0200
Subject: [PATCH 11/24] Typos fix and additional comments

---
 autoPyTorch/data/tabular_feature_validator.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index 8e5ed452d..27be45eca 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -549,8 +549,8 @@ def get_unused_category_symbol(
     """
     Select the appropriate missing value symbol for a column.
 
-    Giving a column from a DataFrame and an initial missing value symbol,
-    check if the missing_value is contained in the column, f it is, make
+    Given a column from a DataFrame and an initial missing value symbol,
+    check if the missing_value is contained in the column. If it is, make
     the necessary changes for a unique missing value symbol.
 
     Arguments:
@@ -558,6 +558,7 @@ def get_unused_category_symbol(
             The DataFrame column.
         missing_value_symbol (Union[int, str]):
             The initial symbol for the missing value.
+            -1 for int and '-1' for str. 
 
     Returns:
         missing_value_symbol (Union[int, str]):
@@ -568,11 +569,13 @@ def get_unused_category_symbol(
         pass
     elif isinstance(missing_value_symbol, str):
         max_length = max(len(c) for c in frame_column.cat.categories)
+        # There are no categories that are longer than `max_length`
         missing_value_symbol += '0' * max_length
     else:
         # min_value is guaranteed to be negative since there exists -1 in categories
         # and min_value must be smaller than -1. So the symbol is always negative.
         min_value = min(c for c in frame_column.cat.categories)
+        # always missing_value_symbol + min_value < min_value < 0
         missing_value_symbol = missing_value_symbol + min_value
 
     return missing_value_symbol

From a7d01f16ac2fb4d27cea2fa886a108524d32955a Mon Sep 17 00:00:00 2001
From: Ravin Kohli <kohliravin7@gmail.com>
Date: Thu, 7 Oct 2021 16:08:31 +0200
Subject: [PATCH 12/24] Replace nan in categoricals with simple imputer

---
 autoPyTorch/data/base_feature_validator.py    |  2 -
 autoPyTorch/data/tabular_feature_validator.py | 86 +++++--------------
 2 files changed, 20 insertions(+), 68 deletions(-)

diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py
index ae2b60196..a6181c771 100644
--- a/autoPyTorch/data/base_feature_validator.py
+++ b/autoPyTorch/data/base_feature_validator.py
@@ -63,8 +63,6 @@ def __init__(self,
         self.categories = []  # type: typing.List[typing.List[int]]
         self.categorical_columns: typing.List[int] = []
         self.numerical_columns: typing.List[int] = []
-        # column identifiers may be integers or strings
-        self.null_columns: typing.Set[str] = set()
 
         self._is_fitted = False
 
diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index 27be45eca..22448f00e 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -62,12 +62,16 @@ def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]:
     """
     preprocessors: Dict[str, List[BaseEstimator]] = dict()
 
+    # Categorical Preprocessors
     onehot_encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore')
-    imputer = SimpleImputer(strategy='median', copy=False)
+    categorical_imputer = SimpleImputer(strategy='constant', copy=False)
+
+    # Numerical Preprocessors
+    numerical_imputer = SimpleImputer(strategy='median', copy=False)
     standard_scaler = StandardScaler(with_mean=True, with_std=True, copy=False)
 
-    preprocessors['categorical'] = [onehot_encoder]
-    preprocessors['numerical'] = [imputer, standard_scaler]
+    preprocessors['categorical'] = [categorical_imputer, onehot_encoder]
+    preprocessors['numerical'] = [numerical_imputer, standard_scaler]
 
     return preprocessors
 
@@ -106,10 +110,11 @@ def _fit(
 
             X = cast(pd.DataFrame, X)
             categorical_columns, numerical_columns, feat_type = self._get_columns_info(X)
+            print("enc_columns", categorical_columns)
+            print("all_nan_columns", self.all_nan_columns)
 
             self.enc_columns = categorical_columns
-            if len(categorical_columns) >= 0:
-                X = self.impute_nan_in_categories(X)
+
             preprocessors = get_tabular_preprocessors()
             self.column_transformer = _create_column_transformer(
                 preprocessors=preprocessors,
@@ -196,10 +201,7 @@ def transform(
 
         # Check the data here so we catch problems on new test data
         self._check_data(X)
-        # We also need to fillna on the transformation
-        # in case test data is provided
-        if len(self.categorical_columns) >= 0:
-            X = self.impute_nan_in_categories(X)
+    
         X = self.column_transformer.transform(X)
 
         # Sparse related transformations
@@ -267,6 +269,15 @@ def _check_data(
         if hasattr(X, "iloc"):
             # If entered here, we have a pandas dataframe
             X = cast(pd.DataFrame, X)
+    
+            if hasattr(self, 'all_nan_columns') and set(self.all_nan_columns).issubset(X.columns):
+                X.drop(labels=self.all_nan_columns, axis=1, inplace=True)
+            else:
+                self.all_nan_columns: List[Union[int, str]] = list()
+                for column in X.columns:
+                    if X[column].isna().all():
+                        self.all_nan_columns.append(column)
+                X.drop(labels=self.all_nan_columns, axis=1, inplace=True)
 
             # Handle objects if possible
             object_columns_indicator = has_object_columns(X.dtypes.values)
@@ -466,63 +477,6 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
 
         return X
 
-    def impute_nan_in_categories(self, X: pd.DataFrame) -> pd.DataFrame:
-        """
-        impute missing values before encoding,
-        remove once sklearn natively supports
-        it in ordinal encoding. Sklearn issue:
-        "https://github.com/scikit-learn/scikit-learn/issues/17123)"
-        Arguments:
-            X (pd.DataFrame):
-                data to be interpreted.
-        Returns:
-            pd.DataFrame
-        """
-
-        def can_cast_as_number(value: Union[int, float, str]) -> bool:
-            try:
-                float(first_value)
-                return True
-            except ValueError:
-                return False
-
-        # To be on the safe side, map always to the same missing
-        # value per column
-        if not hasattr(self, 'dict_nancol_to_missing'):
-            self.dict_missing_value_per_col: Dict[str, Any] = {}
-
-        # First make sure that we do not alter the type of the column which cause:
-        # TypeError: '<' not supported between instances of 'int' and 'str'
-        # in the encoding
-        for column in self.enc_columns:
-            # no missing values for categorical column
-            if not X[column].isna().any():
-                continue
-
-            if column not in self.dict_missing_value_per_col:
-
-                first_value = X[column].dropna().values[0]
-
-                if can_cast_as_number(first_value):
-                    # In this case, we expect to have a number as category
-                    # it might be string, but its value represent a number
-                    missing_value: Union[str, int] = '-1' if isinstance(first_value, str) else -1
-                else:
-                    missing_value = 'Missing!'
-
-                # Make sure this missing value is not seen before
-                if hasattr(X[column], 'cat'):
-                    missing_value = get_unused_category_symbol(X[column], missing_value)
-
-                self.dict_missing_value_per_col[column] = missing_value
-
-                # Convert the frame in place
-                X[column].cat.add_categories([self.dict_missing_value_per_col[column]],
-                                             inplace=True)
-                X.fillna({column: self.dict_missing_value_per_col[column]}, inplace=True)
-
-        return X
-
 
 def has_object_columns(
     feature_types: pd.Series,

From 38fe9e8dfbd8dd63c2030085147c56e9831f7d93 Mon Sep 17 00:00:00 2001
From: Ravin Kohli <kohliravin7@gmail.com>
Date: Thu, 7 Oct 2021 17:10:24 +0200
Subject: [PATCH 13/24] Remove unused function

---
 autoPyTorch/data/tabular_feature_validator.py | 38 -------------------
 1 file changed, 38 deletions(-)

diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index 22448f00e..62de46f56 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -495,41 +495,3 @@ def has_object_columns(
     """
     return np.dtype('O') in feature_types
 
-
-def get_unused_category_symbol(
-    frame_column: pd.Series,
-    missing_value_symbol: Union[int, str],
-) -> Union[int, str]:
-    """
-    Select the appropriate missing value symbol for a column.
-
-    Given a column from a DataFrame and an initial missing value symbol,
-    check if the missing_value is contained in the column. If it is, make
-    the necessary changes for a unique missing value symbol.
-
-    Arguments:
-        frame_column (pd.Series):
-            The DataFrame column.
-        missing_value_symbol (Union[int, str]):
-            The initial symbol for the missing value.
-            -1 for int and '-1' for str. 
-
-    Returns:
-        missing_value_symbol (Union[int, str]):
-            The unique missing value symbol.
-    """
-
-    if missing_value_symbol not in frame_column.cat.categories:
-        pass
-    elif isinstance(missing_value_symbol, str):
-        max_length = max(len(c) for c in frame_column.cat.categories)
-        # There are no categories that are longer than `max_length`
-        missing_value_symbol += '0' * max_length
-    else:
-        # min_value is guaranteed to be negative since there exists -1 in categories
-        # and min_value must be smaller than -1. So the symbol is always negative.
-        min_value = min(c for c in frame_column.cat.categories)
-        # always missing_value_symbol + min_value < min_value < 0
-        missing_value_symbol = missing_value_symbol + min_value
-
-    return missing_value_symbol

From 7693753c5f54df3619a22e9a3e1f5365bc6a5bde Mon Sep 17 00:00:00 2001
From: Ravin Kohli <kohliravin7@gmail.com>
Date: Thu, 7 Oct 2021 17:18:30 +0200
Subject: [PATCH 14/24] add comment

---
 autoPyTorch/data/tabular_feature_validator.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index 62de46f56..800932fa0 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -269,7 +269,8 @@ def _check_data(
         if hasattr(X, "iloc"):
             # If entered here, we have a pandas dataframe
             X = cast(pd.DataFrame, X)
-    
+
+            # we should remove columns with all nans in the training set.
             if hasattr(self, 'all_nan_columns') and set(self.all_nan_columns).issubset(X.columns):
                 X.drop(labels=self.all_nan_columns, axis=1, inplace=True)
             else:

From 497c546c34a410e8cf7a16f2192b71f3d768ab0c Mon Sep 17 00:00:00 2001
From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com>
Date: Thu, 7 Oct 2021 17:24:11 +0200
Subject: [PATCH 15/24] Update autoPyTorch/data/tabular_feature_validator.py

Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com>
---
 autoPyTorch/data/tabular_feature_validator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index 800932fa0..979f5cac3 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -324,9 +324,9 @@ def _get_columns_info(
                 checks) and a encoder fitted in the case the data needs encoding
         Returns:
             categorical_columns: (List[str])
-                Categorical columns.
+                List of the names of categorical columns.
             numerical_columns: (List[str])
-                Numerical columns.
+                List of the names of numerical columns.
             feat_type:
                 Type of each column numerical/categorical
         """

From 9254eb24bab1893e8e1d8be185733c077f81f7ea Mon Sep 17 00:00:00 2001
From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com>
Date: Thu, 7 Oct 2021 17:25:14 +0200
Subject: [PATCH 16/24] Update autoPyTorch/data/tabular_feature_validator.py

Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com>
---
 autoPyTorch/data/tabular_feature_validator.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index 979f5cac3..8735286bb 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -465,10 +465,8 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
         else:
             # Calling for the first time to infer the categories
             X = X.infer_objects()
-            # initial data types
-            data_types = X.dtypes
-            for index, column in enumerate(X.columns):
-                if not is_numeric_dtype(data_types[index]):
+            for column, data_type in zip(X.columns, X.dtypes):
+                if not is_numeric_dtype(data_type):
                     X[column] = X[column].astype('category')
             # only numerical attributes and categories
             data_types = X.dtypes

From b63ff3c74fe6023e6698835df3a913d20bf91647 Mon Sep 17 00:00:00 2001
From: ArlindKadra <arlindkadra@gmail.com>
Date: Fri, 8 Oct 2021 11:25:30 +0200
Subject: [PATCH 17/24] Adding unit test for only null columns in the tabular
 feature categorical evaluator

---
 autoPyTorch/data/tabular_feature_validator.py |  2 -
 test/test_data/test_feature_validator.py      | 87 +++++++++++++++++++
 2 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index 8735286bb..a940584dc 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -110,8 +110,6 @@ def _fit(
 
             X = cast(pd.DataFrame, X)
             categorical_columns, numerical_columns, feat_type = self._get_columns_info(X)
-            print("enc_columns", categorical_columns)
-            print("all_nan_columns", self.all_nan_columns)
 
             self.enc_columns = categorical_columns
 
diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py
index f9ba2855e..54101a4b9 100644
--- a/test/test_data/test_feature_validator.py
+++ b/test/test_data/test_feature_validator.py
@@ -317,6 +317,93 @@ def test_featurevalidator_get_columns_to_encode():
     assert feature_types == ['numerical', 'numerical', 'categorical', 'categorical']
 
 
+def test_featurevalidator_remove_nan__catcolumns():
+    """
+    Make sure categorical columns that have only nan values are removed.
+    """
+    # First case, there exist null columns in the train set
+    # and the same columns are not all null for the test set.
+    validator = TabularFeatureValidator()
+
+    df_train = pd.DataFrame(
+        [
+            {'A': 1, 'B': np.nan, 'C': np.nan},
+            {'A': np.nan, 'C': np.nan},
+            {'A': 1}
+        ],
+        dtype='category',
+    )
+    df_test = pd.DataFrame(
+        [
+            {'A': np.nan, 'B': np.nan, 'C': 5},
+            {'A': np.nan, 'C': np.nan},
+            {'A': 1}
+        ],
+        dtype='category',
+    )
+
+    validator.fit(df_train)
+    transformed_df_train = validator.transform(df_train)
+    transformed_df_test = validator.transform(df_test)
+
+    assert np.array_equal(transformed_df_train, np.array([[0, 1], [1, 0], [0, 1]]))
+    assert np.array_equal(transformed_df_test, np.array([[1, 0], [1, 0], [0, 1]]))
+
+    # Second case, there exist null columns in the training set and the same
+    # are null in the test set.
+    validator = TabularFeatureValidator()
+
+    df_train = pd.DataFrame(
+        [
+            {'A': 1, 'B': np.nan, 'C': np.nan},
+            {'A': np.nan, 'C': np.nan},
+            {'A': 1}
+        ],
+        dtype='category',
+    )
+    df_test = pd.DataFrame(
+        [
+            {'A': np.nan, 'B': np.nan, 'C': np.nan},
+            {'A': np.nan, 'C': np.nan},
+            {'A': 1}
+        ],
+        dtype='category',
+    )
+
+    validator.fit(df_train)
+    transformed_df_train = validator.transform(df_train)
+    transformed_df_test = validator.transform(df_test)
+
+    assert np.array_equal(transformed_df_train, np.array([[0, 1], [1, 0], [0, 1]]))
+    assert np.array_equal(transformed_df_test, np.array([[1, 0], [1, 0], [0, 1]]))
+
+    # Third case, there exist no null columns in the training set and a
+    # few null columns exist in the test set.
+    validator = TabularFeatureValidator()
+
+    df_train = pd.DataFrame(
+        [
+            {'A': 1, 'B': 1},
+            {'A': 2, 'B': 2}
+        ],
+        dtype='category',
+    )
+    df_test = pd.DataFrame(
+        [
+            {'A': np.nan, 'B': np.nan},
+            {'A': np.nan, 'B': np.nan}
+        ],
+        dtype='category',
+    )
+
+    validator.fit(df_train)
+    transformed_df_train = validator.transform(df_train)
+    transformed_df_test = validator.transform(df_test)
+
+    assert np.array_equal(transformed_df_train, np.array([[0, 1, 0, 1], [1, 0, 1, 0]]))
+    assert np.array_equal(transformed_df_test, np.array([[0, 0, 0, 0], [0, 0, 0, 0]]))
+
+
 def test_features_unsupported_calls_are_raised():
     """
     Makes sure we raise a proper message to the user,

From d5bbdbe1ee45defb8c4a919c694828366634d205 Mon Sep 17 00:00:00 2001
From: Ravin Kohli <kohliravin7@gmail.com>
Date: Fri, 8 Oct 2021 11:28:18 +0200
Subject: [PATCH 18/24] fix bug in remove all nan columns

---
 autoPyTorch/data/base_feature_validator.py    | 46 ++++++++++---------
 autoPyTorch/data/tabular_feature_validator.py | 30 +++++++-----
 2 files changed, 43 insertions(+), 33 deletions(-)

diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py
index a6181c771..eae832128 100644
--- a/autoPyTorch/data/base_feature_validator.py
+++ b/autoPyTorch/data/base_feature_validator.py
@@ -1,5 +1,5 @@
 import logging
-import typing
+from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
 
@@ -12,8 +12,8 @@
 from autoPyTorch.utils.logging_ import PicklableClientLogger
 
 
-SUPPORTED_FEAT_TYPES = typing.Union[
-    typing.List,
+SUPPORTED_FEAT_TYPES = Union[
+    List,
     pd.DataFrame,
     np.ndarray,
     scipy.sparse.bsr_matrix,
@@ -35,41 +35,43 @@ class BaseFeatureValidator(BaseEstimator):
             List of the column types found by this estimator during fit.
         data_type (str):
             Class name of the data type provided during fit.
-        encoder (typing.Optional[BaseEstimator])
+        encoder (Optional[BaseEstimator])
             Host a encoder object if the data requires transformation (for example,
             if provided a categorical column in a pandas DataFrame)
-        enc_columns (typing.List[str])
+        enc_columns (List[str])
             List of columns that were encoded.
     """
     def __init__(self,
-                 logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger
+                 logger: Optional[Union[PicklableClientLogger, logging.Logger
                                                       ]] = None,
                  ) -> None:
         # Register types to detect unsupported data format changes
-        self.feat_type = None  # type: typing.Optional[typing.List[str]]
-        self.data_type = None  # type: typing.Optional[type]
-        self.dtypes = []  # type: typing.List[str]
-        self.column_order = []  # type: typing.List[str]
+        self.feat_type = None  # type: Optional[List[str]]
+        self.data_type = None  # type: Optional[type]
+        self.dtypes = []  # type: List[str]
+        self.column_order = []  # type: List[str]
 
-        self.encoder = None  # type: typing.Optional[BaseEstimator]
-        self.enc_columns = []  # type: typing.List[str]
+        self.encoder = None  # type: Optional[BaseEstimator]
+        self.enc_columns = []  # type: List[str]
 
-        self.logger: typing.Union[
+        self.logger: Union[
             PicklableClientLogger, logging.Logger
         ] = logger if logger is not None else logging.getLogger(__name__)
 
         # Required for dataset properties
-        self.num_features = None  # type: typing.Optional[int]
-        self.categories = []  # type: typing.List[typing.List[int]]
-        self.categorical_columns: typing.List[int] = []
-        self.numerical_columns: typing.List[int] = []
+        self.num_features = None  # type: Optional[int]
+        self.categories = []  # type: List[List[int]]
+        self.categorical_columns: List[int] = []
+        self.numerical_columns: List[int] = []
+
+        self.all_nan_columns: Optional[List[Union[int, str]]] = None
 
         self._is_fitted = False
 
     def fit(
         self,
         X_train: SUPPORTED_FEAT_TYPES,
-        X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None,
+        X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
     ) -> BaseEstimator:
         """
         Validates and fit a categorical encoder (if needed) to the features.
@@ -80,7 +82,7 @@ def fit(
             X_train (SUPPORTED_FEAT_TYPES):
                 A set of features that are going to be validated (type and dimensionality
                 checks) and a encoder fitted in the case the data needs encoding
-            X_test (typing.Optional[SUPPORTED_FEAT_TYPES]):
+            X_test (Optional[SUPPORTED_FEAT_TYPES]):
                 A hold out set of data used for checking
         """
 
@@ -158,8 +160,8 @@ def transform(
     def list_to_dataframe(
         self,
         X_train: SUPPORTED_FEAT_TYPES,
-        X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None,
-    ) -> typing.Tuple[pd.DataFrame, typing.Optional[pd.DataFrame]]:
+        X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
+    ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
         """
         Converts a list to a pandas DataFrame. In this process, column types are inferred.
 
@@ -169,7 +171,7 @@ def list_to_dataframe(
             X_train (SUPPORTED_FEAT_TYPES):
                 A set of features that are going to be validated (type and dimensionality
                 checks) and a encoder fitted in the case the data needs encoding
-            X_test (typing.Optional[SUPPORTED_FEAT_TYPES]):
+            X_test (Optional[SUPPORTED_FEAT_TYPES]):
                 A hold out set of data used for checking
         Returns:
             pd.DataFrame:
diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index a940584dc..071e9c6fe 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -110,6 +110,12 @@ def _fit(
 
             X = cast(pd.DataFrame, X)
             categorical_columns, numerical_columns, feat_type = self._get_columns_info(X)
+            
+            self.all_nan_columns = list()
+            for column in X.columns:
+                if X[column].isna().all():
+                    self.all_nan_columns.append(column)
+
 
             self.enc_columns = categorical_columns
 
@@ -199,7 +205,19 @@ def transform(
 
         # Check the data here so we catch problems on new test data
         self._check_data(X)
-    
+
+        if self.all_nan_columns is None:
+            raise NotFittedError("Expected all_nan_columns to be"
+                                 " initialised during fit, got {}".format(self.all_nan_columns))
+        if set(self.all_nan_columns).issubset(X.columns):
+            raise ValueError("Expected all nan columns {} to be a" 
+                             "subset of the columns of the dataset {}".format(
+                                                                              self.all_nan_columns,
+                                                                              X.columns
+                                                                              )
+                            )
+        X.drop(labels=self.all_nan_columns, axis=1, inplace=True)
+
         X = self.column_transformer.transform(X)
 
         # Sparse related transformations
@@ -268,16 +286,6 @@ def _check_data(
             # If entered here, we have a pandas dataframe
             X = cast(pd.DataFrame, X)
 
-            # we should remove columns with all nans in the training set.
-            if hasattr(self, 'all_nan_columns') and set(self.all_nan_columns).issubset(X.columns):
-                X.drop(labels=self.all_nan_columns, axis=1, inplace=True)
-            else:
-                self.all_nan_columns: List[Union[int, str]] = list()
-                for column in X.columns:
-                    if X[column].isna().all():
-                        self.all_nan_columns.append(column)
-                X.drop(labels=self.all_nan_columns, axis=1, inplace=True)
-
             # Handle objects if possible
             object_columns_indicator = has_object_columns(X.dtypes.values)
             if object_columns_indicator:

From bfe489941ea31322960928145cb3f921ec27070c Mon Sep 17 00:00:00 2001
From: Ravin Kohli <kohliravin7@gmail.com>
Date: Fri, 8 Oct 2021 13:49:35 +0200
Subject: [PATCH 19/24] Bug fix for making tests run by arlind

---
 autoPyTorch/data/base_feature_validator.py    |  4 +-
 autoPyTorch/data/tabular_feature_validator.py | 53 +++++++++++--------
 test/test_data/test_feature_validator.py      | 35 ++++++------
 3 files changed, 49 insertions(+), 43 deletions(-)

diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py
index eae832128..2dc97f3a9 100644
--- a/autoPyTorch/data/base_feature_validator.py
+++ b/autoPyTorch/data/base_feature_validator.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Dict, List, Optional, Union
+from typing import List, Optional, Set, Tuple, Union
 
 import numpy as np
 
@@ -64,7 +64,7 @@ def __init__(self,
         self.categorical_columns: List[int] = []
         self.numerical_columns: List[int] = []
 
-        self.all_nan_columns: Optional[List[Union[int, str]]] = None
+        self.all_nan_columns: Optional[Set[Union[int, str]]] = None
 
         self._is_fitted = False
 
diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index 071e9c6fe..611a8060f 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -49,7 +49,7 @@ def _create_column_transformer(
     return ColumnTransformer([
         ('categorical_pipeline', categorical_pipeline, categorical_columns),
         ('numerical_pipeline', numerical_pipeline, numerical_columns)],
-        remainder='passthrough'
+        remainder='drop'
     )
 
 
@@ -109,13 +109,12 @@ def _fit(
         if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
 
             X = cast(pd.DataFrame, X)
-            categorical_columns, numerical_columns, feat_type = self._get_columns_info(X)
-            
-            self.all_nan_columns = list()
+            self.all_nan_columns = set()
             for column in X.columns:
                 if X[column].isna().all():
-                    self.all_nan_columns.append(column)
+                    self.all_nan_columns.add(column)
 
+            categorical_columns, numerical_columns, feat_type = self._get_columns_info(X)            
 
             self.enc_columns = categorical_columns
 
@@ -206,17 +205,15 @@ def transform(
         # Check the data here so we catch problems on new test data
         self._check_data(X)
 
-        if self.all_nan_columns is None:
-            raise NotFittedError("Expected all_nan_columns to be"
-                                 " initialised during fit, got {}".format(self.all_nan_columns))
-        if set(self.all_nan_columns).issubset(X.columns):
-            raise ValueError("Expected all nan columns {} to be a" 
-                             "subset of the columns of the dataset {}".format(
-                                                                              self.all_nan_columns,
-                                                                              X.columns
-                                                                              )
-                            )
-        X.drop(labels=self.all_nan_columns, axis=1, inplace=True)
+        # in case of test data being all none and train data
+        # having a value for a categorical column.
+        # We need to convert the column in test data to 
+        # object otherwise the test column is interpreted as float
+        if len(self.categorical_columns) > 0:
+            categorical_columns = self.column_transformer.transformers_[0][-1]
+            for column in categorical_columns:
+                if X[column].isna().all():
+                    X[column] = X[column].astype('object')
 
         X = self.column_transformer.transform(X)
 
@@ -307,13 +304,20 @@ def _check_data(
 
             dtypes = [dtype.name for dtype in X.dtypes]
             if len(self.dtypes) > 0:
-                if self.dtypes != dtypes:
-                    raise ValueError("Changing the dtype of the features after fit() is "
-                                     "not supported. Fit() method was called with "
-                                     "{} whereas the new features have {} as type".format(self.dtypes,
-                                                                                          dtypes,
-                                                                                          )
-                                     )
+                dtypes_diff = [s_dtype != dtype for s_dtype, dtype in zip(self.dtypes, dtypes)]
+                if any(dtypes_diff):
+                    if self.all_nan_columns is not None and len(self.all_nan_columns) > 0:
+                        if len(set(X.columns[dtypes_diff]).difference(self.all_nan_columns)) != 0:
+                            # we expect the dtypes to only be different if the column belongs 
+                            # to all_nan_columns as these columns would be imputed. if there is 
+                            # a value in the test set for a column in all_nan_columns, pandas 
+                            # does not recognise the dtype of the test column properly
+                            raise ValueError("Changing the dtype of the features after fit() is "
+                                            "not supported. Fit() method was called with "
+                                            "{} whereas the new features have {} as type".format(self.dtypes,
+                                                                                                dtypes,
+                                                                                                )
+                                            )
             else:
                 self.dtypes = dtypes
 
@@ -344,6 +348,8 @@ def _get_columns_info(
 
         # Make sure each column is a valid type
         for i, column in enumerate(X.columns):
+            if self.all_nan_columns is not None and column in self.all_nan_columns:
+                continue
             column_dtype = self.dtypes[i]
             if column_dtype in ['category', 'bool']:
                 categorical_columns.append(column)
@@ -474,6 +480,7 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
             for column, data_type in zip(X.columns, X.dtypes):
                 if not is_numeric_dtype(data_type):
                     X[column] = X[column].astype('category')
+
             # only numerical attributes and categories
             data_types = X.dtypes
             self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)}
diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py
index 54101a4b9..a166bf97c 100644
--- a/test/test_data/test_feature_validator.py
+++ b/test/test_data/test_feature_validator.py
@@ -346,8 +346,8 @@ def test_featurevalidator_remove_nan__catcolumns():
     transformed_df_train = validator.transform(df_train)
     transformed_df_test = validator.transform(df_test)
 
-    assert np.array_equal(transformed_df_train, np.array([[0, 1], [1, 0], [0, 1]]))
-    assert np.array_equal(transformed_df_test, np.array([[1, 0], [1, 0], [0, 1]]))
+    assert np.array_equal(transformed_df_train, np.array([[0, 1], [1, 0], [0, 1]], dtype=float))
+    assert np.array_equal(transformed_df_test, np.array([[1, 0], [1, 0], [0, 1]], dtype=float))
 
     # Second case, there exist null columns in the training set and the same
     # are null in the test set.
@@ -374,8 +374,8 @@ def test_featurevalidator_remove_nan__catcolumns():
     transformed_df_train = validator.transform(df_train)
     transformed_df_test = validator.transform(df_test)
 
-    assert np.array_equal(transformed_df_train, np.array([[0, 1], [1, 0], [0, 1]]))
-    assert np.array_equal(transformed_df_test, np.array([[1, 0], [1, 0], [0, 1]]))
+    assert np.array_equal(transformed_df_train, np.array([[0, 1], [1, 0], [0, 1]], dtype=float))
+    assert np.array_equal(transformed_df_test, np.array([[1, 0], [1, 0], [0, 1]], dtype=float))
 
     # Third case, there exist no null columns in the training set and a
     # few null columns exist in the test set.
@@ -400,8 +400,8 @@ def test_featurevalidator_remove_nan__catcolumns():
     transformed_df_train = validator.transform(df_train)
     transformed_df_test = validator.transform(df_test)
 
-    assert np.array_equal(transformed_df_train, np.array([[0, 1, 0, 1], [1, 0, 1, 0]]))
-    assert np.array_equal(transformed_df_test, np.array([[0, 0, 0, 0], [0, 0, 0, 0]]))
+    assert np.array_equal(transformed_df_train, np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=float))
+    assert np.array_equal(transformed_df_test, np.array([[0, 0, 0, 0], [0, 0, 0, 0]], dtype=float))
 
 
 def test_features_unsupported_calls_are_raised():
@@ -636,16 +636,19 @@ def test_feature_validator_imbalanced_data():
     validator = TabularFeatureValidator()
     validator.fit(X_train)
 
+    transformed_X_train = validator.transform(X_train)
+
     train_feature_types = copy.deepcopy(validator.feat_type)
-    assert train_feature_types == ['numerical', 'numerical', 'numerical', 'numerical']
+    assert train_feature_types == ['numerical']
     # validator will throw an error if the column types are not the same
     transformed_X_test = validator.transform(X_test)
     transformed_X_test = pd.DataFrame(transformed_X_test)
-    null_columns = []
-    for column in transformed_X_test.columns:
-        if transformed_X_test[column].isna().all():
-            null_columns.append(column)
-    assert null_columns == [0, 2, 3]
+    assert sorted(validator.all_nan_columns) == sorted(['A', 'C', 'D'])
+    # as there are no categorical columns, we can make such an 
+    # assertion. We only expect to drop the all nan columns
+    total_all_nan_columns = len(validator.all_nan_columns)
+    total_columns = len(validator.column_order)
+    assert total_columns - total_all_nan_columns == len(transformed_X_test.columns)
 
     # Columns with not all null values in the train split and
     # completely null on the test split.
@@ -664,14 +667,10 @@ def test_feature_validator_imbalanced_data():
     X_test = pd.DataFrame.from_dict(test_features)
     validator = TabularFeatureValidator()
     validator.fit(X_train)
+
     train_feature_types = copy.deepcopy(validator.feat_type)
     assert train_feature_types == ['categorical', 'numerical', 'numerical']
 
     transformed_X_test = validator.transform(X_test)
     transformed_X_test = pd.DataFrame(transformed_X_test)
-    null_columns = []
-    for column in transformed_X_test.columns:
-        if transformed_X_test[column].isna().all():
-            null_columns.append(column)
-
-    assert null_columns == [1]
+    assert not len(validator.all_nan_columns)

From 369edad26d37186d48c1bd02aff738929bf32b48 Mon Sep 17 00:00:00 2001
From: Ravin Kohli <kohliravin7@gmail.com>
Date: Fri, 8 Oct 2021 14:39:24 +0200
Subject: [PATCH 20/24] fix flake errors in feature validator

---
 autoPyTorch/data/base_feature_validator.py    |  3 ++-
 autoPyTorch/data/tabular_feature_validator.py | 24 +++++++++----------
 test/test_data/test_feature_validator.py      |  4 +---
 3 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py
index 2dc97f3a9..757a19b46 100644
--- a/autoPyTorch/data/base_feature_validator.py
+++ b/autoPyTorch/data/base_feature_validator.py
@@ -43,7 +43,8 @@ class BaseFeatureValidator(BaseEstimator):
     """
     def __init__(self,
                  logger: Optional[Union[PicklableClientLogger, logging.Logger
-                                                      ]] = None,
+                                        ]
+                                  ] = None,
                  ) -> None:
         # Register types to detect unsupported data format changes
         self.feat_type = None  # type: Optional[List[str]]
diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index 611a8060f..7f17d918e 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -1,5 +1,5 @@
 import functools
-from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from typing import Dict, List, Optional, Tuple, cast
 
 import numpy as np
 
@@ -114,7 +114,7 @@ def _fit(
                 if X[column].isna().all():
                     self.all_nan_columns.add(column)
 
-            categorical_columns, numerical_columns, feat_type = self._get_columns_info(X)            
+            categorical_columns, numerical_columns, feat_type = self._get_columns_info(X)
 
             self.enc_columns = categorical_columns
 
@@ -207,7 +207,7 @@ def transform(
 
         # in case of test data being all none and train data
         # having a value for a categorical column.
-        # We need to convert the column in test data to 
+        # We need to convert the column in test data to
         # object otherwise the test column is interpreted as float
         if len(self.categorical_columns) > 0:
             categorical_columns = self.column_transformer.transformers_[0][-1]
@@ -308,16 +308,16 @@ def _check_data(
                 if any(dtypes_diff):
                     if self.all_nan_columns is not None and len(self.all_nan_columns) > 0:
                         if len(set(X.columns[dtypes_diff]).difference(self.all_nan_columns)) != 0:
-                            # we expect the dtypes to only be different if the column belongs 
-                            # to all_nan_columns as these columns would be imputed. if there is 
-                            # a value in the test set for a column in all_nan_columns, pandas 
+                            # we expect the dtypes to only be different if the column belongs
+                            # to all_nan_columns as these columns would be imputed. if there is
+                            # a value in the test set for a column in all_nan_columns, pandas
                             # does not recognise the dtype of the test column properly
                             raise ValueError("Changing the dtype of the features after fit() is "
-                                            "not supported. Fit() method was called with "
-                                            "{} whereas the new features have {} as type".format(self.dtypes,
-                                                                                                dtypes,
-                                                                                                )
-                                            )
+                                             "not supported. Fit() method was called with "
+                                             "{} whereas the new features have {} as type".format(self.dtypes,
+                                                                                                  dtypes,
+                                                                                                  )
+                                             )
             else:
                 self.dtypes = dtypes
 
@@ -482,7 +482,6 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
                     X[column] = X[column].astype('category')
 
             # only numerical attributes and categories
-            data_types = X.dtypes
             self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)}
 
         self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}")
@@ -506,4 +505,3 @@ def has_object_columns(
             otherwise.
     """
     return np.dtype('O') in feature_types
-
diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py
index a166bf97c..c2d516162 100644
--- a/test/test_data/test_feature_validator.py
+++ b/test/test_data/test_feature_validator.py
@@ -636,15 +636,13 @@ def test_feature_validator_imbalanced_data():
     validator = TabularFeatureValidator()
     validator.fit(X_train)
 
-    transformed_X_train = validator.transform(X_train)
-
     train_feature_types = copy.deepcopy(validator.feat_type)
     assert train_feature_types == ['numerical']
     # validator will throw an error if the column types are not the same
     transformed_X_test = validator.transform(X_test)
     transformed_X_test = pd.DataFrame(transformed_X_test)
     assert sorted(validator.all_nan_columns) == sorted(['A', 'C', 'D'])
-    # as there are no categorical columns, we can make such an 
+    # as there are no categorical columns, we can make such an
     # assertion. We only expect to drop the all nan columns
     total_all_nan_columns = len(validator.all_nan_columns)
     total_columns = len(validator.column_order)

From a4fb0cb4af571e1ba81384b4cb3e4e0f1f94beb9 Mon Sep 17 00:00:00 2001
From: Ravin Kohli <kohliravin7@gmail.com>
Date: Fri, 8 Oct 2021 14:57:07 +0200
Subject: [PATCH 21/24] made typing code uniform

---
 autoPyTorch/data/base_feature_validator.py | 16 ++++-----
 autoPyTorch/data/base_target_validator.py  | 40 +++++++++++-----------
 2 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py
index 757a19b46..ed109e380 100644
--- a/autoPyTorch/data/base_feature_validator.py
+++ b/autoPyTorch/data/base_feature_validator.py
@@ -47,21 +47,21 @@ def __init__(self,
                                   ] = None,
                  ) -> None:
         # Register types to detect unsupported data format changes
-        self.feat_type = None  # type: Optional[List[str]]
-        self.data_type = None  # type: Optional[type]
-        self.dtypes = []  # type: List[str]
-        self.column_order = []  # type: List[str]
+        self.feat_type: Optional[List[str]] = None 
+        self.data_type: Optional[type] = None 
+        self.dtypes: List[str] = []
+        self.column_order: List[str] = []
 
-        self.encoder = None  # type: Optional[BaseEstimator]
-        self.enc_columns = []  # type: List[str]
+        self.encoder: Optional[BaseEstimator] = None
+        self.enc_columns: List[str] = []
 
         self.logger: Union[
             PicklableClientLogger, logging.Logger
         ] = logger if logger is not None else logging.getLogger(__name__)
 
         # Required for dataset properties
-        self.num_features = None  # type: Optional[int]
-        self.categories = []  # type: List[List[int]]
+        self.num_features: Optional[int] = None
+        self.categories: List[List[int]] = []
         self.categorical_columns: List[int] = []
         self.numerical_columns: List[int] = []
 
diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py
index dba9c19e3..0fb318476 100644
--- a/autoPyTorch/data/base_target_validator.py
+++ b/autoPyTorch/data/base_target_validator.py
@@ -1,5 +1,5 @@
 import logging
-import typing
+from typing import List, Optional, Union, cast
 
 import numpy as np
 
@@ -12,8 +12,8 @@
 from autoPyTorch.utils.logging_ import PicklableClientLogger
 
 
-SUPPORTED_TARGET_TYPES = typing.Union[
-    typing.List,
+SUPPORTED_TARGET_TYPES = Union[
+    List,
     pd.Series,
     pd.DataFrame,
     np.ndarray,
@@ -35,39 +35,39 @@ class BaseTargetValidator(BaseEstimator):
         is_classification (bool):
             A bool that indicates if the validator should operate in classification mode.
             During classification, the targets are encoded.
-        encoder (typing.Optional[BaseEstimator]):
+        encoder (Optional[BaseEstimator]):
             Host a encoder object if the data requires transformation (for example,
             if provided a categorical column in a pandas DataFrame)
-        enc_columns (typing.List[str])
+        enc_columns (List[str])
             List of columns that where encoded
     """
     def __init__(self,
                  is_classification: bool = False,
-                 logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger
+                 logger: Optional[Union[PicklableClientLogger, logging.Logger
                                                       ]] = None,
                  ) -> None:
         self.is_classification = is_classification
 
-        self.data_type = None  # type: typing.Optional[type]
+        self.data_type: Optional[type] = None
 
-        self.encoder = None  # type: typing.Optional[BaseEstimator]
+        self.encoder: Optional[BaseEstimator] = None
 
-        self.out_dimensionality = None  # type: typing.Optional[int]
-        self.type_of_target = None  # type: typing.Optional[str]
+        self.out_dimensionality: Optional[int] = None
+        self.type_of_target: Optional[str] = None
 
-        self.logger: typing.Union[
+        self.logger: Union[
             PicklableClientLogger, logging.Logger
         ] = logger if logger is not None else logging.getLogger(__name__)
 
         # Store the dtype for remapping to correct type
-        self.dtype = None  # type: typing.Optional[type]
+        self.dtype: Optional[type] = None
 
         self._is_fitted = False
 
     def fit(
         self,
         y_train: SUPPORTED_TARGET_TYPES,
-        y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
+        y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
     ) -> BaseEstimator:
         """
         Validates and fit a categorical encoder (if needed) to the targets
@@ -76,7 +76,7 @@ def fit(
         Arguments:
             y_train (SUPPORTED_TARGET_TYPES)
                 A set of targets set aside for training
-            y_test (typing.Union[SUPPORTED_TARGET_TYPES])
+            y_test (Optional[SUPPORTED_TARGET_TYPES])
                 A hold out set of data used of the targets. It is also used to fit the
                 categories of the encoder.
         """
@@ -95,8 +95,8 @@ def fit(
                                      np.shape(y_test)
                                  ))
             if isinstance(y_train, pd.DataFrame):
-                y_train = typing.cast(pd.DataFrame, y_train)
-                y_test = typing.cast(pd.DataFrame, y_test)
+                y_train = cast(pd.DataFrame, y_train)
+                y_test = cast(pd.DataFrame, y_test)
                 if y_train.columns.tolist() != y_test.columns.tolist():
                     raise ValueError(
                         "Train and test targets must both have the same columns, yet "
@@ -127,21 +127,21 @@ def fit(
     def _fit(
         self,
         y_train: SUPPORTED_TARGET_TYPES,
-        y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
+        y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
     ) -> BaseEstimator:
         """
         Arguments:
             y_train (SUPPORTED_TARGET_TYPES)
                 The labels of the current task. They are going to be encoded in case
                 of classification
-            y_test (typing.Optional[SUPPORTED_TARGET_TYPES])
+            y_test (Optional[SUPPORTED_TARGET_TYPES])
                 A holdout set of labels
         """
         raise NotImplementedError()
 
     def transform(
         self,
-        y: typing.Union[SUPPORTED_TARGET_TYPES],
+        y: Union[SUPPORTED_TARGET_TYPES],
     ) -> np.ndarray:
         """
         Arguments:
@@ -162,7 +162,7 @@ def inverse_transform(
         Revert any encoding transformation done on a target array
 
         Arguments:
-            y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]):
+            y (Union[np.ndarray, pd.DataFrame, pd.Series]):
                 Target array to be transformed back to original form before encoding
         Returns:
             np.ndarray:

From 44229a680e47bc0e3c0da0fd28ad9128ee589953 Mon Sep 17 00:00:00 2001
From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com>
Date: Fri, 8 Oct 2021 17:02:50 +0200
Subject: [PATCH 22/24] Apply suggestions from code review

Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com>
---
 autoPyTorch/data/tabular_feature_validator.py | 9 +++------
 test/test_data/test_feature_validator.py      | 2 +-
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index 7f17d918e..e51b2b387 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -109,10 +109,7 @@ def _fit(
         if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
 
             X = cast(pd.DataFrame, X)
-            self.all_nan_columns = set()
-            for column in X.columns:
-                if X[column].isna().all():
-                    self.all_nan_columns.add(column)
+             self.all_nan_columns = set([column for column in X.columns if X[column].isna().all()])
 
             categorical_columns, numerical_columns, feat_type = self._get_columns_info(X)
 
@@ -284,8 +281,8 @@ def _check_data(
             X = cast(pd.DataFrame, X)
 
             # Handle objects if possible
-            object_columns_indicator = has_object_columns(X.dtypes.values)
-            if object_columns_indicator:
+            exist_object_columns = has_object_columns(X.dtypes.values)
+            if exist_object_columns:
                 X = self.infer_objects(X)
 
             # Define the column to be encoded here as the feature validator is fitted once
diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py
index c2d516162..535023cd2 100644
--- a/test/test_data/test_feature_validator.py
+++ b/test/test_data/test_feature_validator.py
@@ -317,7 +317,7 @@ def test_featurevalidator_get_columns_to_encode():
     assert feature_types == ['numerical', 'numerical', 'categorical', 'categorical']
 
 
-def test_featurevalidator_remove_nan__catcolumns():
+def test_featurevalidator_remove_nan_catcolumns():
     """
     Make sure categorical columns that have only nan values are removed.
     """

From ba3c1e7852cbf2b814a5a353e31facdcf94feda9 Mon Sep 17 00:00:00 2001
From: Ravin Kohli <kohliravin7@gmail.com>
Date: Fri, 8 Oct 2021 17:02:27 +0200
Subject: [PATCH 23/24] address comments from shuhei

---
 autoPyTorch/data/base_feature_validator.py    |  4 +-
 autoPyTorch/data/tabular_feature_validator.py | 67 ++++++++-----------
 2 files changed, 31 insertions(+), 40 deletions(-)

diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py
index ed109e380..9ed46d6e6 100644
--- a/autoPyTorch/data/base_feature_validator.py
+++ b/autoPyTorch/data/base_feature_validator.py
@@ -47,8 +47,8 @@ def __init__(self,
                                   ] = None,
                  ) -> None:
         # Register types to detect unsupported data format changes
-        self.feat_type: Optional[List[str]] = None 
-        self.data_type: Optional[type] = None 
+        self.feat_type: Optional[List[str]] = None
+        self.data_type: Optional[type] = None
         self.dtypes: List[str] = []
         self.column_order: List[str] = []
 
diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index e51b2b387..9a84e63ec 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -310,7 +310,8 @@ def _check_data(
                             # a value in the test set for a column in all_nan_columns, pandas
                             # does not recognise the dtype of the test column properly
                             raise ValueError("Changing the dtype of the features after fit() is "
-                                             "not supported. Fit() method was called with "
+                                             "not supported. The dtype of some columns are different "
+                                             "between training and test datasets. Fit() method was called with "
                                              "{} whereas the new features have {} as type".format(self.dtypes,
                                                                                                   dtypes,
                                                                                                   )
@@ -348,51 +349,41 @@ def _get_columns_info(
             if self.all_nan_columns is not None and column in self.all_nan_columns:
                 continue
             column_dtype = self.dtypes[i]
+            err_msg = "Valid types are `numerical`, `categorical` or `boolean`, " \
+                      "but input Column {} has an invalid type `{}`.".format(column, column_dtype)
             if column_dtype in ['category', 'bool']:
                 categorical_columns.append(column)
                 feat_type.append('categorical')
             # Move away from np.issubdtype as it causes
             # TypeError: data type not understood in certain pandas types
-            elif not is_numeric_dtype(column_dtype):
+            elif is_numeric_dtype(column_dtype):
+                feat_type.append('numerical')
+                numerical_columns.append(column)
+            elif column_dtype == 'object':
                 # TODO verify how would this happen when we always convert the object dtypes to category
-                if column_dtype == 'object':
-                    raise ValueError(
-                        "Input Column {} has invalid type object. "
-                        "Cast it to a valid dtype before using it in AutoPyTorch. "
-                        "Valid types are numerical, categorical or boolean. "
-                        "You can cast it to a valid dtype using "
-                        "pandas.Series.astype ."
-                        "If working with string objects, the following "
-                        "tutorial illustrates how to work with text data: "
-                        "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format(
-                            # noqa: E501
-                            column,
-                        )
-                    )
-                elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype(
-                    column_dtype
-                ):
-                    raise ValueError(
-                        "AutoPyTorch does not support time and/or date datatype as given "
-                        "in column {}. Please convert the time information to a numerical value "
-                        "first. One example on how to do this can be found on "
-                        "https://stats.stackexchange.com/questions/311494/".format(
-                            column,
-                        )
-                    )
-                else:
-                    raise ValueError(
-                        "Input Column {} has unsupported dtype {}. "
-                        "Supported column types are categorical/bool/numerical dtypes. "
-                        "Make sure your data is formatted in a correct way, "
-                        "before feeding it to AutoPyTorch.".format(
-                            column,
-                            column_dtype,
-                        )
+                raise ValueError(
+                    "{} Cast it to a valid dtype before feeding it to AutoPyTorch. "
+                    "You can cast it to a valid dtype using pandas.Series.astype."
+                    "If you are working with string objects, the following "
+                    "tutorial illustrates how to work with text data: "
+                    "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format(
+                        # noqa: E501
+                        err_msg,
                     )
+                )
+            elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype(column_dtype):
+                raise ValueError(
+                    "{} Convert the time information to a numerical value"
+                    " before feeding it to AutoPyTorch. "
+                    "One example of the conversion can be found on "
+                    "https://stats.stackexchange.com/questions/311494/".format(err_msg)
+                )
             else:
-                feat_type.append('numerical')
-                numerical_columns.append(column)
+                raise ValueError(
+                    "{} Make sure your data is formatted in a correct way"
+                    "before feeding it to AutoPyTorch.".format(err_msg)
+                )
+
         return categorical_columns, numerical_columns, feat_type
 
     def list_to_dataframe(

From 10a8441c201eaedf92ba0af406eaa7a90b74ad8f Mon Sep 17 00:00:00 2001
From: Ravin Kohli <kohliravin7@gmail.com>
Date: Fri, 8 Oct 2021 17:05:34 +0200
Subject: [PATCH 24/24] address comments from shuhei (2)

---
 autoPyTorch/data/tabular_feature_validator.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index 9a84e63ec..3f939bc98 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -109,7 +109,8 @@ def _fit(
         if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
 
             X = cast(pd.DataFrame, X)
-             self.all_nan_columns = set([column for column in X.columns if X[column].isna().all()])
+
+            self.all_nan_columns = set([column for column in X.columns if X[column].isna().all()])
 
             categorical_columns, numerical_columns, feat_type = self._get_columns_info(X)
 
@@ -147,15 +148,6 @@ def comparator(cmp1: str, cmp2: str) -> int:
                 key=functools.cmp_to_key(comparator)
             )
 
-            if len(categorical_columns) > 0:
-                self.categories = [
-                    # We fit a one-hot encoder, where all categorical
-                    # columns are shifted to the left
-                    list(range(len(cat)))
-                    for cat in self.column_transformer.named_transformers_[
-                        'categorical_pipeline'].named_steps['onehotencoder'].categories_
-                ]
-
             # differently to categorical_columns and numerical_columns,
             # this saves the index of the column.
             for i, type_ in enumerate(self.feat_type):