From 359b4c9239cbbb327c76808a8b7bd7d0749730f8 Mon Sep 17 00:00:00 2001 From: Arlind Kadra Date: Fri, 1 Oct 2021 16:06:36 +0200 Subject: [PATCH 01/24] Initial try at an enhancement for the tabular validator --- autoPyTorch/data/base_feature_validator.py | 28 ++++ autoPyTorch/data/tabular_feature_validator.py | 129 ++++++++++++------ 2 files changed, 116 insertions(+), 41 deletions(-) diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index 0106a3aa8..ae2b60196 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -122,6 +122,7 @@ def _fit( self: The fitted base estimator """ + raise NotImplementedError() def _check_data( @@ -136,6 +137,7 @@ def _check_data( A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding """ + raise NotImplementedError() def transform( @@ -152,4 +154,30 @@ def transform( np.ndarray: The transformed array """ + + raise NotImplementedError() + + def list_to_dataframe( + self, + X_train: SUPPORTED_FEAT_TYPES, + X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None, + ) -> typing.Tuple[pd.DataFrame, typing.Optional[pd.DataFrame]]: + """ + Converts a list to a pandas DataFrame. In this process, column types are inferred. + + If test data is provided, we proactively match it to train data + + Arguments: + X_train (SUPPORTED_FEAT_TYPES): + A set of features that are going to be validated (type and dimensionality + checks) and a encoder fitted in the case the data needs encoding + X_test (typing.Optional[SUPPORTED_FEAT_TYPES]): + A hold out set of data used for checking + Returns: + pd.DataFrame: + transformed train data from list to pandas DataFrame + pd.DataFrame: + transformed test data from list to pandas DataFrame + """ + raise NotImplementedError() diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 698e92438..b46ba12ae 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -1,6 +1,7 @@ import functools from typing import Any, Dict, List, Optional, Tuple, Union, cast + import numpy as np import pandas as pd @@ -38,6 +39,7 @@ def _create_column_transformer( Returns: ColumnTransformer """ + numerical_pipeline = 'drop' categorical_pipeline = 'drop' if len(numerical_columns) > 0: @@ -63,18 +65,25 @@ def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]: preprocessors['numerical'] = list() preprocessors['categorical'] = list() - preprocessors['categorical'].append(OneHotEncoder( - categories='auto', - sparse=False, - handle_unknown='ignore')) - preprocessors['numerical'].append(SimpleImputer(strategy='median', - copy=False)) - preprocessors['numerical'].append(StandardScaler(with_mean=True, with_std=True, copy=False)) + preprocessors['categorical'].append( + OneHotEncoder( + categories='auto', + sparse=False, + handle_unknown='ignore', + ) + ) + preprocessors['numerical'].append( + SimpleImputer( + strategy='median', + copy=False, + ) + ) return preprocessors class TabularFeatureValidator(BaseFeatureValidator): + def _fit( self, X: SUPPORTED_FEAT_TYPES, @@ -96,24 +105,27 @@ def _fit( # The final output of a validator is a numpy array. But pandas # gives us information about the column dtype if isinstance(X, np.ndarray): + X = self.numpy_array_to_pandas(X) + # Replace the data type from the previously saved type. + self.data_type = type(X) + # save all the information about the column order and data types + self._check_data(X) if hasattr(X, "iloc") and not scipy.sparse.issparse(X): - X = cast(pd.DataFrame, X) - - if not X.select_dtypes(include='object').empty: - X = self.infer_objects(X) - self._check_data(X) + X = cast(pd.DataFrame, X) categorical_columns, numerical_columns, feat_type = self._get_columns_info(X) self.enc_columns = categorical_columns if len(categorical_columns) >= 0: X = self.impute_nan_in_categories(X) preprocessors = get_tabular_preprocessors() - self.column_transformer = _create_column_transformer(preprocessors=preprocessors, - numerical_columns=numerical_columns, - categorical_columns=categorical_columns) + self.column_transformer = _create_column_transformer( + preprocessors=preprocessors, + numerical_columns=numerical_columns, + categorical_columns=categorical_columns, + ) # Mypy redefinition assert self.column_transformer is not None @@ -142,13 +154,15 @@ def comparator(cmp1: str, cmp2: str) -> int: if len(categorical_columns) > 0: self.categories = [ - # We fit an ordinal encoder, where all categorical + # We fit an one-hot encoder, where all categorical # columns are shifted to the left list(range(len(cat))) for cat in self.column_transformer.named_transformers_[ 'categorical_pipeline'].named_steps['onehotencoder'].categories_ ] + # differently to categorical_columns and numerical_columns, + # this saves the index of the column. for i, type_ in enumerate(self.feat_type): if 'numerical' in type_: self.numerical_columns.append(i) @@ -156,7 +170,8 @@ def comparator(cmp1: str, cmp2: str) -> int: self.categorical_columns.append(i) # Lastly, store the number of features - self.num_features = np.shape(X)[1] + self.num_features = len(X.columns) + return self def transform( @@ -189,10 +204,6 @@ def transform( if hasattr(X, "iloc") and not scipy.sparse.issparse(X): X = cast(pd.DataFrame, X) - # Also remove the object dtype for new data - if not X.select_dtypes(include='object').empty: - X = self.infer_objects(X) - # Check the data here so we catch problems on new test data self._check_data(X) # We also need to fillna on the transformation @@ -268,13 +279,13 @@ def _check_data( X = cast(pd.DataFrame, X) # Handle objects if possible - if not X.select_dtypes(include='object').empty: + object_columns_indicator = has_object_columns(X.dtypes) + if object_columns_indicator: X = self.infer_objects(X) # Define the column to be encoded here as the feature validator is fitted once # per estimator # enc_columns, _ = self._get_columns_to_encode(X) - column_order = [column for column in X.columns] if len(self.column_order) > 0: if self.column_order != column_order: @@ -310,8 +321,10 @@ def _get_columns_info( A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding Returns: - enc_columns (List[str]): - Columns to encode, if any + categorical_columns: (List[str]) + Categorical columns. + numerical_columns: (List[str]) + Numerical columns. feat_type: Type of each column numerical/categorical """ @@ -323,14 +336,15 @@ def _get_columns_info( # Make sure each column is a valid type for i, column in enumerate(X.columns): - if X[column].dtype.name in ['category', 'bool']: - + column_dtype = self.dtypes[i] + if column_dtype.name in ['category', 'bool']: categorical_columns.append(column) feat_type.append('categorical') # Move away from np.issubdtype as it causes # TypeError: data type not understood in certain pandas types - elif not is_numeric_dtype(X[column]): - if X[column].dtype.name == 'object': + elif not is_numeric_dtype(column_dtype): + # TODO verify how would this happen when we always convert the object dtypes to category + if column_dtype.name == 'object': raise ValueError( "Input Column {} has invalid type object. " "Cast it to a valid dtype before using it in AutoPyTorch. " @@ -345,7 +359,7 @@ def _get_columns_info( ) ) elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype( - X[column].dtype + column_dtype ): raise ValueError( "AutoPyTorch does not support time and/or date datatype as given " @@ -362,7 +376,7 @@ def _get_columns_info( "Make sure your data is formatted in a correct way, " "before feeding it to AutoPyTorch.".format( column, - X[column].dtype.name, + column_dtype.name, ) ) else: @@ -394,7 +408,7 @@ def list_to_dataframe( """ # If a list was provided, it will be converted to pandas - X_train = pd.DataFrame(data=X_train).infer_objects() + X_train = pd.DataFrame(data=X_train).convert_dtypes() self.logger.warning("The provided feature types to AutoPyTorch are of type list." "Features have been interpreted as: {}".format([(col, t) for col, t in zip(X_train.columns, X_train.dtypes)])) @@ -403,7 +417,8 @@ def list_to_dataframe( self.logger.warning("Train features are a list while the provided test data" "is {}. X_test will be casted as DataFrame.".format(type(X_test)) ) - X_test = pd.DataFrame(data=X_test).infer_objects() + X_test = pd.DataFrame(data=X_test).convert_dtypes() + return X_train, X_test @staticmethod @@ -446,17 +461,21 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame: self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}") pass else: + # Calling for the first time to infer the categories X = X.infer_objects() - for column in X.columns: - if not is_numeric_dtype(X[column]): + # initial data types + data_types = X.dtypes + for index, column in enumerate(X.columns): + if not is_numeric_dtype(data_types[index]): X[column] = X[column].astype('category') - self.object_dtype_mapping = {column: X[column].dtype for column in X.columns} + # only numerical attributes and categories + data_types = X.dtypes + self.object_dtype_mapping = {column: data_types[index] for index, column in enumerate(X.columns)} self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}") + return X - def impute_nan_in_categories(self, - X: pd.DataFrame - ) -> pd.DataFrame: + def impute_nan_in_categories(self, X: pd.DataFrame) -> pd.DataFrame: """ impute missing values before encoding, remove once sklearn natively supports @@ -481,14 +500,16 @@ def impute_nan_in_categories(self, if X[column].isna().any(): if column not in self.dict_missing_value_per_col: try: - float(X[column].dropna().values[0]) + first_value = X[column].dropna().values[0] + float(first_value) can_cast_as_number = True except Exception: can_cast_as_number = False if can_cast_as_number: # In this case, we expect to have a number as category # it might be string, but its value represent a number - missing_value: Union[str, int] = '-1' if isinstance(X[column].dropna().values[0], str) else -1 + + missing_value: Union[str, int] = '-1' if isinstance(first_value, str) else -1 else: missing_value = 'Missing!' @@ -507,4 +528,30 @@ def impute_nan_in_categories(self, X[column].cat.add_categories([self.dict_missing_value_per_col[column]], inplace=True) X.fillna({column: self.dict_missing_value_per_col[column]}, inplace=True) + return X + +def has_object_columns( + feature_types: pd.Series, +) -> bool: + """ + Indicate whether on a Series of dtypes for a Pandas DataFrame + there exists one or more object columns. + + Arguments: + ---------- + feature_types: pd.Series + The feature types for a DataFrame. + Returns: + -------- + bool + True if the DataFrame dtypes contain an object column, False + otherwise. + """ + object_columns_indicator = [True if pd.api.types.is_object_dtype(feature_type) else False + for feature_type in feature_types] + + if True in object_columns_indicator: + return True + else: + return False From 65e8ffb97520599a3060db594c7735affee688b4 Mon Sep 17 00:00:00 2001 From: Arlind Kadra Date: Fri, 1 Oct 2021 16:10:33 +0200 Subject: [PATCH 02/24] Adding a few type annotations --- autoPyTorch/data/tabular_feature_validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index b46ba12ae..06979533d 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -23,7 +23,7 @@ def _create_column_transformer( preprocessors: Dict[str, List[BaseEstimator]], numerical_columns: List[str], - categorical_columns: List[str] + categorical_columns: List[str], ) -> ColumnTransformer: """ Given a dictionary of preprocessors, this function From 217c38dd43b012d3839f757cce7877299aac74cc Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Fri, 1 Oct 2021 16:56:26 +0200 Subject: [PATCH 03/24] Fixing bugs in implementation --- autoPyTorch/data/tabular_feature_validator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 06979533d..31d702301 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -337,14 +337,14 @@ def _get_columns_info( # Make sure each column is a valid type for i, column in enumerate(X.columns): column_dtype = self.dtypes[i] - if column_dtype.name in ['category', 'bool']: + if column_dtype in ['category', 'bool']: categorical_columns.append(column) feat_type.append('categorical') # Move away from np.issubdtype as it causes # TypeError: data type not understood in certain pandas types elif not is_numeric_dtype(column_dtype): # TODO verify how would this happen when we always convert the object dtypes to category - if column_dtype.name == 'object': + if column_dtype == 'object': raise ValueError( "Input Column {} has invalid type object. " "Cast it to a valid dtype before using it in AutoPyTorch. " @@ -376,7 +376,7 @@ def _get_columns_info( "Make sure your data is formatted in a correct way, " "before feeding it to AutoPyTorch.".format( column, - column_dtype.name, + column_dtype, ) ) else: From f7dd8fe9cb2a3cd8dcbe7a9dc435c3ffc16bddf8 Mon Sep 17 00:00:00 2001 From: Arlind Kadra Date: Fri, 1 Oct 2021 18:14:11 +0200 Subject: [PATCH 04/24] Adding wrongly deleted code part during rebase --- autoPyTorch/data/tabular_feature_validator.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 31d702301..700a8d6d1 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -78,6 +78,13 @@ def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]: copy=False, ) ) + preprocessors['numerical'].append( + StandardScaler( + with_mean=True, + with_std=True, + copy=False, + ) + ) return preprocessors From 92bd535b73e5afef01ea7cbb74b2b7ea2a179c2c Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Sat, 2 Oct 2021 13:33:07 +0200 Subject: [PATCH 05/24] Fix bug in _get_args --- .../setup/network_embedding/base_network_embedding.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 18028cddd..14bf00653 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -30,8 +30,8 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: raise NotImplementedError - # - # def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: + + def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: # # Feature preprocessors can alter numerical columns # # if len(X['dataset_properties']['numerical_columns']) == 0: # # num_numerical_columns = 0 @@ -49,4 +49,4 @@ def build_embedding(self, num_input_features: np.ndarray, num_numerical_features # # for i, category in enumerate(categories): # # num_input_features[num_numerical_columns + i, ] = len(category) # # return num_numerical_columns, num_input_features - # return None, None + return None, None From 5f672b550d2fdc25c2a3865c20a78e44f04e796a Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Sat, 2 Oct 2021 13:46:27 +0200 Subject: [PATCH 06/24] Fix bug in _get_args --- .../setup/network_embedding/base_network_embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 14bf00653..7d9b1df3f 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,5 +1,5 @@ # import copy -from typing import Any, Dict, Optional # , Tuple +from typing import Any, Dict, Optional, Tuple import numpy as np From 223c09e1d43fc7715739bf09be79c09e52b84960 Mon Sep 17 00:00:00 2001 From: Arlind Kadra Date: Sun, 3 Oct 2021 18:07:39 +0200 Subject: [PATCH 07/24] Addressing Shuhei's comments --- autoPyTorch/data/tabular_feature_validator.py | 54 +++++++------------ 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 700a8d6d1..e64cefb9a 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -62,29 +62,13 @@ def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]: Dict[str, List[BaseEstimator]] """ preprocessors: Dict[str, List[BaseEstimator]] = dict() - preprocessors['numerical'] = list() - preprocessors['categorical'] = list() - - preprocessors['categorical'].append( - OneHotEncoder( - categories='auto', - sparse=False, - handle_unknown='ignore', - ) - ) - preprocessors['numerical'].append( - SimpleImputer( - strategy='median', - copy=False, - ) - ) - preprocessors['numerical'].append( - StandardScaler( - with_mean=True, - with_std=True, - copy=False, - ) - ) + + onehot_encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore') + imputer = SimpleImputer(strategy='median', copy=False) + standard_scaler = StandardScaler(with_mean=True, with_std=True, copy=False) + + preprocessors['categorical'] = [onehot_encoder] + preprocessors['numerical'] = [imputer, standard_scaler] return preprocessors @@ -161,7 +145,7 @@ def comparator(cmp1: str, cmp2: str) -> int: if len(categorical_columns) > 0: self.categories = [ - # We fit an one-hot encoder, where all categorical + # We fit a one-hot encoder, where all categorical # columns are shifted to the left list(range(len(cat))) for cat in self.column_transformer.named_transformers_[ @@ -477,7 +461,8 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame: X[column] = X[column].astype('category') # only numerical attributes and categories data_types = X.dtypes - self.object_dtype_mapping = {column: data_types[index] for index, column in enumerate(X.columns)} + self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)} + self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}") return X @@ -504,13 +489,16 @@ def impute_nan_in_categories(self, X: pd.DataFrame) -> pd.DataFrame: # TypeError: '<' not supported between instances of 'int' and 'str' # in the encoding for column in self.enc_columns: - if X[column].isna().any(): + # no missing values for categorical column + if not X[column].isna().any(): + continue + else: if column not in self.dict_missing_value_per_col: try: first_value = X[column].dropna().values[0] float(first_value) can_cast_as_number = True - except Exception: + except ValueError: can_cast_as_number = False if can_cast_as_number: # In this case, we expect to have a number as category @@ -555,10 +543,8 @@ def has_object_columns( True if the DataFrame dtypes contain an object column, False otherwise. """ - object_columns_indicator = [True if pd.api.types.is_object_dtype(feature_type) else False - for feature_type in feature_types] - - if True in object_columns_indicator: - return True - else: - return False + for feature_type in feature_types: + if pd.api.types.is_object_dtype(feature_type): + return True + else: + return False From a1ed8830783c13f8a1fa899057257df949740f21 Mon Sep 17 00:00:00 2001 From: Arlind Kadra Date: Mon, 4 Oct 2021 00:12:28 +0200 Subject: [PATCH 08/24] Address Shuhei's comments --- autoPyTorch/data/tabular_feature_validator.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index e64cefb9a..5ae2cd22c 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -270,7 +270,7 @@ def _check_data( X = cast(pd.DataFrame, X) # Handle objects if possible - object_columns_indicator = has_object_columns(X.dtypes) + object_columns_indicator = has_object_columns(X.dtypes.values) if object_columns_indicator: X = self.infer_objects(X) @@ -480,6 +480,13 @@ def impute_nan_in_categories(self, X: pd.DataFrame) -> pd.DataFrame: pd.DataFrame """ + def can_cast_as_number(value: Union[int, float, str]) -> bool: + try: + float(first_value) + return True + except ValueError: + return False + # To be on the safe side, map always to the same missing # value per column if not hasattr(self, 'dict_nancol_to_missing'): @@ -494,16 +501,12 @@ def impute_nan_in_categories(self, X: pd.DataFrame) -> pd.DataFrame: continue else: if column not in self.dict_missing_value_per_col: - try: - first_value = X[column].dropna().values[0] - float(first_value) - can_cast_as_number = True - except ValueError: - can_cast_as_number = False - if can_cast_as_number: + + first_value = X[column].dropna().values[0] + + if can_cast_as_number(first_value): # In this case, we expect to have a number as category # it might be string, but its value represent a number - missing_value: Union[str, int] = '-1' if isinstance(first_value, str) else -1 else: missing_value = 'Missing!' @@ -543,8 +546,4 @@ def has_object_columns( True if the DataFrame dtypes contain an object column, False otherwise. """ - for feature_type in feature_types: - if pd.api.types.is_object_dtype(feature_type): - return True - else: - return False + return np.dtype('O') in feature_types From f5853101fd5c2d6def81a1486b38ff769bc54449 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Wed, 6 Oct 2021 19:29:13 +0200 Subject: [PATCH 09/24] Refactoring code --- autoPyTorch/data/tabular_feature_validator.py | 59 ++++++++++++++----- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 5ae2cd22c..866353e6f 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -1,7 +1,6 @@ import functools from typing import Any, Dict, List, Optional, Tuple, Union, cast - import numpy as np import pandas as pd @@ -512,14 +511,9 @@ def can_cast_as_number(value: Union[int, float, str]) -> bool: missing_value = 'Missing!' # Make sure this missing value is not seen before - # Do this check for categorical columns - # else modify the value if hasattr(X[column], 'cat'): - while missing_value in X[column].cat.categories: - if isinstance(missing_value, str): - missing_value += '0' - else: - missing_value += missing_value + missing_value = get_unused_category_symbol(X[column], missing_value) + self.dict_missing_value_per_col[column] = missing_value # Convert the frame in place @@ -529,6 +523,7 @@ def can_cast_as_number(value: Union[int, float, str]) -> bool: return X + def has_object_columns( feature_types: pd.Series, ) -> bool: @@ -537,13 +532,47 @@ def has_object_columns( there exists one or more object columns. Arguments: - ---------- - feature_types: pd.Series - The feature types for a DataFrame. + feature_types (pd.Series): + The feature types for a DataFrame. Returns: - -------- - bool - True if the DataFrame dtypes contain an object column, False - otherwise. + bool: + True if the DataFrame dtypes contain an object column, False + otherwise. """ return np.dtype('O') in feature_types + + +def get_unused_category_symbol( + frame_column: pd.Series, + missing_value_symbol: Union[int, str], +) -> Union[int, str]: + """ + Select the appropriate missing value symbol for a column. + + Giving a column from a DataFrame and an initial missing value symbol, + check if the missing_value is contained in the column, f it is, make + the necessary changes for a unique missing value symbol. + + Arguments: + frame_column (pd.Series): + The DataFrame column. + missing_value_symbol (Union[int, str]): + The initial symbol for the missing value. + + Returns: + missing_value_symbol (Union[int, str]): + The unique missing value symbol. + """ + + if missing_value_symbol not in frame_column.cat.categories: + pass + elif isinstance(missing_value_symbol, str): + max_length = max(len(c) for c in frame_column.cat.categories) + missing_value_symbol += '0' * max_length + else: + # min_value is guaranteed to be negative since there exists -1 in categories + # and min_value must be smaller than -1. So the symbol is always negative. + min_value = min(c for c in frame_column.cat.categories) + missing_value_symbol = missing_value_symbol + min_value + + return missing_value_symbol From f298c46d0130698a8c635e2e3ed36fc40c2a50dd Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Wed, 6 Oct 2021 19:31:54 +0200 Subject: [PATCH 10/24] Refactoring code --- autoPyTorch/data/tabular_feature_validator.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 866353e6f..8e5ed452d 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -498,23 +498,23 @@ def can_cast_as_number(value: Union[int, float, str]) -> bool: # no missing values for categorical column if not X[column].isna().any(): continue - else: - if column not in self.dict_missing_value_per_col: - first_value = X[column].dropna().values[0] + if column not in self.dict_missing_value_per_col: + + first_value = X[column].dropna().values[0] - if can_cast_as_number(first_value): - # In this case, we expect to have a number as category - # it might be string, but its value represent a number - missing_value: Union[str, int] = '-1' if isinstance(first_value, str) else -1 - else: - missing_value = 'Missing!' + if can_cast_as_number(first_value): + # In this case, we expect to have a number as category + # it might be string, but its value represent a number + missing_value: Union[str, int] = '-1' if isinstance(first_value, str) else -1 + else: + missing_value = 'Missing!' - # Make sure this missing value is not seen before - if hasattr(X[column], 'cat'): - missing_value = get_unused_category_symbol(X[column], missing_value) + # Make sure this missing value is not seen before + if hasattr(X[column], 'cat'): + missing_value = get_unused_category_symbol(X[column], missing_value) - self.dict_missing_value_per_col[column] = missing_value + self.dict_missing_value_per_col[column] = missing_value # Convert the frame in place X[column].cat.add_categories([self.dict_missing_value_per_col[column]], From 03bef163443b42f88fe577d5420c4055ac7330da Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Wed, 6 Oct 2021 20:24:01 +0200 Subject: [PATCH 11/24] Typos fix and additional comments --- autoPyTorch/data/tabular_feature_validator.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 8e5ed452d..27be45eca 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -549,8 +549,8 @@ def get_unused_category_symbol( """ Select the appropriate missing value symbol for a column. - Giving a column from a DataFrame and an initial missing value symbol, - check if the missing_value is contained in the column, f it is, make + Given a column from a DataFrame and an initial missing value symbol, + check if the missing_value is contained in the column. If it is, make the necessary changes for a unique missing value symbol. Arguments: @@ -558,6 +558,7 @@ def get_unused_category_symbol( The DataFrame column. missing_value_symbol (Union[int, str]): The initial symbol for the missing value. + -1 for int and '-1' for str. Returns: missing_value_symbol (Union[int, str]): @@ -568,11 +569,13 @@ def get_unused_category_symbol( pass elif isinstance(missing_value_symbol, str): max_length = max(len(c) for c in frame_column.cat.categories) + # There are no categories that are longer than `max_length` missing_value_symbol += '0' * max_length else: # min_value is guaranteed to be negative since there exists -1 in categories # and min_value must be smaller than -1. So the symbol is always negative. min_value = min(c for c in frame_column.cat.categories) + # always missing_value_symbol + min_value < min_value < 0 missing_value_symbol = missing_value_symbol + min_value return missing_value_symbol From a7d01f16ac2fb4d27cea2fa886a108524d32955a Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 7 Oct 2021 16:08:31 +0200 Subject: [PATCH 12/24] Replace nan in categoricals with simple imputer --- autoPyTorch/data/base_feature_validator.py | 2 - autoPyTorch/data/tabular_feature_validator.py | 86 +++++-------------- 2 files changed, 20 insertions(+), 68 deletions(-) diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index ae2b60196..a6181c771 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -63,8 +63,6 @@ def __init__(self, self.categories = [] # type: typing.List[typing.List[int]] self.categorical_columns: typing.List[int] = [] self.numerical_columns: typing.List[int] = [] - # column identifiers may be integers or strings - self.null_columns: typing.Set[str] = set() self._is_fitted = False diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 27be45eca..22448f00e 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -62,12 +62,16 @@ def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]: """ preprocessors: Dict[str, List[BaseEstimator]] = dict() + # Categorical Preprocessors onehot_encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore') - imputer = SimpleImputer(strategy='median', copy=False) + categorical_imputer = SimpleImputer(strategy='constant', copy=False) + + # Numerical Preprocessors + numerical_imputer = SimpleImputer(strategy='median', copy=False) standard_scaler = StandardScaler(with_mean=True, with_std=True, copy=False) - preprocessors['categorical'] = [onehot_encoder] - preprocessors['numerical'] = [imputer, standard_scaler] + preprocessors['categorical'] = [categorical_imputer, onehot_encoder] + preprocessors['numerical'] = [numerical_imputer, standard_scaler] return preprocessors @@ -106,10 +110,11 @@ def _fit( X = cast(pd.DataFrame, X) categorical_columns, numerical_columns, feat_type = self._get_columns_info(X) + print("enc_columns", categorical_columns) + print("all_nan_columns", self.all_nan_columns) self.enc_columns = categorical_columns - if len(categorical_columns) >= 0: - X = self.impute_nan_in_categories(X) + preprocessors = get_tabular_preprocessors() self.column_transformer = _create_column_transformer( preprocessors=preprocessors, @@ -196,10 +201,7 @@ def transform( # Check the data here so we catch problems on new test data self._check_data(X) - # We also need to fillna on the transformation - # in case test data is provided - if len(self.categorical_columns) >= 0: - X = self.impute_nan_in_categories(X) + X = self.column_transformer.transform(X) # Sparse related transformations @@ -267,6 +269,15 @@ def _check_data( if hasattr(X, "iloc"): # If entered here, we have a pandas dataframe X = cast(pd.DataFrame, X) + + if hasattr(self, 'all_nan_columns') and set(self.all_nan_columns).issubset(X.columns): + X.drop(labels=self.all_nan_columns, axis=1, inplace=True) + else: + self.all_nan_columns: List[Union[int, str]] = list() + for column in X.columns: + if X[column].isna().all(): + self.all_nan_columns.append(column) + X.drop(labels=self.all_nan_columns, axis=1, inplace=True) # Handle objects if possible object_columns_indicator = has_object_columns(X.dtypes.values) @@ -466,63 +477,6 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame: return X - def impute_nan_in_categories(self, X: pd.DataFrame) -> pd.DataFrame: - """ - impute missing values before encoding, - remove once sklearn natively supports - it in ordinal encoding. Sklearn issue: - "https://github.com/scikit-learn/scikit-learn/issues/17123)" - Arguments: - X (pd.DataFrame): - data to be interpreted. - Returns: - pd.DataFrame - """ - - def can_cast_as_number(value: Union[int, float, str]) -> bool: - try: - float(first_value) - return True - except ValueError: - return False - - # To be on the safe side, map always to the same missing - # value per column - if not hasattr(self, 'dict_nancol_to_missing'): - self.dict_missing_value_per_col: Dict[str, Any] = {} - - # First make sure that we do not alter the type of the column which cause: - # TypeError: '<' not supported between instances of 'int' and 'str' - # in the encoding - for column in self.enc_columns: - # no missing values for categorical column - if not X[column].isna().any(): - continue - - if column not in self.dict_missing_value_per_col: - - first_value = X[column].dropna().values[0] - - if can_cast_as_number(first_value): - # In this case, we expect to have a number as category - # it might be string, but its value represent a number - missing_value: Union[str, int] = '-1' if isinstance(first_value, str) else -1 - else: - missing_value = 'Missing!' - - # Make sure this missing value is not seen before - if hasattr(X[column], 'cat'): - missing_value = get_unused_category_symbol(X[column], missing_value) - - self.dict_missing_value_per_col[column] = missing_value - - # Convert the frame in place - X[column].cat.add_categories([self.dict_missing_value_per_col[column]], - inplace=True) - X.fillna({column: self.dict_missing_value_per_col[column]}, inplace=True) - - return X - def has_object_columns( feature_types: pd.Series, From 38fe9e8dfbd8dd63c2030085147c56e9831f7d93 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 7 Oct 2021 17:10:24 +0200 Subject: [PATCH 13/24] Remove unused function --- autoPyTorch/data/tabular_feature_validator.py | 38 ------------------- 1 file changed, 38 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 22448f00e..62de46f56 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -495,41 +495,3 @@ def has_object_columns( """ return np.dtype('O') in feature_types - -def get_unused_category_symbol( - frame_column: pd.Series, - missing_value_symbol: Union[int, str], -) -> Union[int, str]: - """ - Select the appropriate missing value symbol for a column. - - Given a column from a DataFrame and an initial missing value symbol, - check if the missing_value is contained in the column. If it is, make - the necessary changes for a unique missing value symbol. - - Arguments: - frame_column (pd.Series): - The DataFrame column. - missing_value_symbol (Union[int, str]): - The initial symbol for the missing value. - -1 for int and '-1' for str. - - Returns: - missing_value_symbol (Union[int, str]): - The unique missing value symbol. - """ - - if missing_value_symbol not in frame_column.cat.categories: - pass - elif isinstance(missing_value_symbol, str): - max_length = max(len(c) for c in frame_column.cat.categories) - # There are no categories that are longer than `max_length` - missing_value_symbol += '0' * max_length - else: - # min_value is guaranteed to be negative since there exists -1 in categories - # and min_value must be smaller than -1. So the symbol is always negative. - min_value = min(c for c in frame_column.cat.categories) - # always missing_value_symbol + min_value < min_value < 0 - missing_value_symbol = missing_value_symbol + min_value - - return missing_value_symbol From 7693753c5f54df3619a22e9a3e1f5365bc6a5bde Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 7 Oct 2021 17:18:30 +0200 Subject: [PATCH 14/24] add comment --- autoPyTorch/data/tabular_feature_validator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 62de46f56..800932fa0 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -269,7 +269,8 @@ def _check_data( if hasattr(X, "iloc"): # If entered here, we have a pandas dataframe X = cast(pd.DataFrame, X) - + + # we should remove columns with all nans in the training set. if hasattr(self, 'all_nan_columns') and set(self.all_nan_columns).issubset(X.columns): X.drop(labels=self.all_nan_columns, axis=1, inplace=True) else: From 497c546c34a410e8cf7a16f2192b71f3d768ab0c Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Thu, 7 Oct 2021 17:24:11 +0200 Subject: [PATCH 15/24] Update autoPyTorch/data/tabular_feature_validator.py Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- autoPyTorch/data/tabular_feature_validator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 800932fa0..979f5cac3 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -324,9 +324,9 @@ def _get_columns_info( checks) and a encoder fitted in the case the data needs encoding Returns: categorical_columns: (List[str]) - Categorical columns. + List of the names of categorical columns. numerical_columns: (List[str]) - Numerical columns. + List of the names of numerical columns. feat_type: Type of each column numerical/categorical """ From 9254eb24bab1893e8e1d8be185733c077f81f7ea Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Thu, 7 Oct 2021 17:25:14 +0200 Subject: [PATCH 16/24] Update autoPyTorch/data/tabular_feature_validator.py Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- autoPyTorch/data/tabular_feature_validator.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 979f5cac3..8735286bb 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -465,10 +465,8 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame: else: # Calling for the first time to infer the categories X = X.infer_objects() - # initial data types - data_types = X.dtypes - for index, column in enumerate(X.columns): - if not is_numeric_dtype(data_types[index]): + for column, data_type in zip(X.columns, X.dtypes): + if not is_numeric_dtype(data_type): X[column] = X[column].astype('category') # only numerical attributes and categories data_types = X.dtypes From b63ff3c74fe6023e6698835df3a913d20bf91647 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Fri, 8 Oct 2021 11:25:30 +0200 Subject: [PATCH 17/24] Adding unit test for only nall columns in the tabular feature categorical evaluator --- autoPyTorch/data/tabular_feature_validator.py | 2 - test/test_data/test_feature_validator.py | 87 +++++++++++++++++++ 2 files changed, 87 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 8735286bb..a940584dc 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -110,8 +110,6 @@ def _fit( X = cast(pd.DataFrame, X) categorical_columns, numerical_columns, feat_type = self._get_columns_info(X) - print("enc_columns", categorical_columns) - print("all_nan_columns", self.all_nan_columns) self.enc_columns = categorical_columns diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index f9ba2855e..54101a4b9 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -317,6 +317,93 @@ def test_featurevalidator_get_columns_to_encode(): assert feature_types == ['numerical', 'numerical', 'categorical', 'categorical'] +def test_featurevalidator_remove_nan__catcolumns(): + """ + Make sure categorical columns that have only nan values are removed. + """ + # First case, there exist null columns in the train set + # and the same columns are not all null for the test set. + validator = TabularFeatureValidator() + + df_train = pd.DataFrame( + [ + {'A': 1, 'B': np.nan, 'C': np.nan}, + {'A': np.nan, 'C': np.nan}, + {'A': 1} + ], + dtype='category', + ) + df_test = pd.DataFrame( + [ + {'A': np.nan, 'B': np.nan, 'C': 5}, + {'A': np.nan, 'C': np.nan}, + {'A': 1} + ], + dtype='category', + ) + + validator.fit(df_train) + transformed_df_train = validator.transform(df_train) + transformed_df_test = validator.transform(df_test) + + assert np.array_equal(transformed_df_train, np.array([[0, 1], [1, 0], [0, 1]])) + assert np.array_equal(transformed_df_test, np.array([[1, 0], [1, 0], [0, 1]])) + + # Second case, there exist null columns in the training set and the same + # are null in the test set. + validator = TabularFeatureValidator() + + df_train = pd.DataFrame( + [ + {'A': 1, 'B': np.nan, 'C': np.nan}, + {'A': np.nan, 'C': np.nan}, + {'A': 1} + ], + dtype='category', + ) + df_test = pd.DataFrame( + [ + {'A': np.nan, 'B': np.nan, 'C': np.nan}, + {'A': np.nan, 'C': np.nan}, + {'A': 1} + ], + dtype='category', + ) + + validator.fit(df_train) + transformed_df_train = validator.transform(df_train) + transformed_df_test = validator.transform(df_test) + + assert np.array_equal(transformed_df_train, np.array([[0, 1], [1, 0], [0, 1]])) + assert np.array_equal(transformed_df_test, np.array([[1, 0], [1, 0], [0, 1]])) + + # Third case, there exist no null columns in the training set and a + # few null columns exist in the test set. + validator = TabularFeatureValidator() + + df_train = pd.DataFrame( + [ + {'A': 1, 'B': 1}, + {'A': 2, 'B': 2} + ], + dtype='category', + ) + df_test = pd.DataFrame( + [ + {'A': np.nan, 'B': np.nan}, + {'A': np.nan, 'B': np.nan} + ], + dtype='category', + ) + + validator.fit(df_train) + transformed_df_train = validator.transform(df_train) + transformed_df_test = validator.transform(df_test) + + assert np.array_equal(transformed_df_train, np.array([[0, 1, 0, 1], [1, 0, 1, 0]])) + assert np.array_equal(transformed_df_test, np.array([[0, 0, 0, 0], [0, 0, 0, 0]])) + + def test_features_unsupported_calls_are_raised(): """ Makes sure we raise a proper message to the user, From d5bbdbe1ee45defb8c4a919c694828366634d205 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 8 Oct 2021 11:28:18 +0200 Subject: [PATCH 18/24] fix bug in remove all nan columns --- autoPyTorch/data/base_feature_validator.py | 46 ++++++++++--------- autoPyTorch/data/tabular_feature_validator.py | 30 +++++++----- 2 files changed, 43 insertions(+), 33 deletions(-) diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index a6181c771..eae832128 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -1,5 +1,5 @@ import logging -import typing +from typing import Any, Dict, List, Optional, Union import numpy as np @@ -12,8 +12,8 @@ from autoPyTorch.utils.logging_ import PicklableClientLogger -SUPPORTED_FEAT_TYPES = typing.Union[ - typing.List, +SUPPORTED_FEAT_TYPES = Union[ + List, pd.DataFrame, np.ndarray, scipy.sparse.bsr_matrix, @@ -35,41 +35,43 @@ class BaseFeatureValidator(BaseEstimator): List of the column types found by this estimator during fit. data_type (str): Class name of the data type provided during fit. - encoder (typing.Optional[BaseEstimator]) + encoder (Optional[BaseEstimator]) Host a encoder object if the data requires transformation (for example, if provided a categorical column in a pandas DataFrame) - enc_columns (typing.List[str]) + enc_columns (List[str]) List of columns that were encoded. """ def __init__(self, - logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger + logger: Optional[Union[PicklableClientLogger, logging.Logger ]] = None, ) -> None: # Register types to detect unsupported data format changes - self.feat_type = None # type: typing.Optional[typing.List[str]] - self.data_type = None # type: typing.Optional[type] - self.dtypes = [] # type: typing.List[str] - self.column_order = [] # type: typing.List[str] + self.feat_type = None # type: Optional[List[str]] + self.data_type = None # type: Optional[type] + self.dtypes = [] # type: List[str] + self.column_order = [] # type: List[str] - self.encoder = None # type: typing.Optional[BaseEstimator] - self.enc_columns = [] # type: typing.List[str] + self.encoder = None # type: Optional[BaseEstimator] + self.enc_columns = [] # type: List[str] - self.logger: typing.Union[ + self.logger: Union[ PicklableClientLogger, logging.Logger ] = logger if logger is not None else logging.getLogger(__name__) # Required for dataset properties - self.num_features = None # type: typing.Optional[int] - self.categories = [] # type: typing.List[typing.List[int]] - self.categorical_columns: typing.List[int] = [] - self.numerical_columns: typing.List[int] = [] + self.num_features = None # type: Optional[int] + self.categories = [] # type: List[List[int]] + self.categorical_columns: List[int] = [] + self.numerical_columns: List[int] = [] + + self.all_nan_columns: Optional[List[Union[int, str]]] = None self._is_fitted = False def fit( self, X_train: SUPPORTED_FEAT_TYPES, - X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None, + X_test: Optional[SUPPORTED_FEAT_TYPES] = None, ) -> BaseEstimator: """ Validates and fit a categorical encoder (if needed) to the features. @@ -80,7 +82,7 @@ def fit( X_train (SUPPORTED_FEAT_TYPES): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding - X_test (typing.Optional[SUPPORTED_FEAT_TYPES]): + X_test (Optional[SUPPORTED_FEAT_TYPES]): A hold out set of data used for checking """ @@ -158,8 +160,8 @@ def transform( def list_to_dataframe( self, X_train: SUPPORTED_FEAT_TYPES, - X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None, - ) -> typing.Tuple[pd.DataFrame, typing.Optional[pd.DataFrame]]: + X_test: Optional[SUPPORTED_FEAT_TYPES] = None, + ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]: """ Converts a list to a pandas DataFrame. In this process, column types are inferred. @@ -169,7 +171,7 @@ def list_to_dataframe( X_train (SUPPORTED_FEAT_TYPES): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding - X_test (typing.Optional[SUPPORTED_FEAT_TYPES]): + X_test (Optional[SUPPORTED_FEAT_TYPES]): A hold out set of data used for checking Returns: pd.DataFrame: diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index a940584dc..071e9c6fe 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -110,6 +110,12 @@ def _fit( X = cast(pd.DataFrame, X) categorical_columns, numerical_columns, feat_type = self._get_columns_info(X) + + self.all_nan_columns = list() + for column in X.columns: + if X[column].isna().all(): + self.all_nan_columns.append(column) + self.enc_columns = categorical_columns @@ -199,7 +205,19 @@ def transform( # Check the data here so we catch problems on new test data self._check_data(X) - + + if self.all_nan_columns is None: + raise NotFittedError("Expected all_nan_columns to be" + " initialised during fit, got {}".format(self.all_nan_columns)) + if set(self.all_nan_columns).issubset(X.columns): + raise ValueError("Expected all nan columns {} to be a" + "subset of the columns of the dataset {}".format( + self.all_nan_columns, + X.columns + ) + ) + X.drop(labels=self.all_nan_columns, axis=1, inplace=True) + X = self.column_transformer.transform(X) # Sparse related transformations @@ -268,16 +286,6 @@ def _check_data( # If entered here, we have a pandas dataframe X = cast(pd.DataFrame, X) - # we should remove columns with all nans in the training set. - if hasattr(self, 'all_nan_columns') and set(self.all_nan_columns).issubset(X.columns): - X.drop(labels=self.all_nan_columns, axis=1, inplace=True) - else: - self.all_nan_columns: List[Union[int, str]] = list() - for column in X.columns: - if X[column].isna().all(): - self.all_nan_columns.append(column) - X.drop(labels=self.all_nan_columns, axis=1, inplace=True) - # Handle objects if possible object_columns_indicator = has_object_columns(X.dtypes.values) if object_columns_indicator: From bfe489941ea31322960928145cb3f921ec27070c Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 8 Oct 2021 13:49:35 +0200 Subject: [PATCH 19/24] Bug fix for making tests run by arlind --- autoPyTorch/data/base_feature_validator.py | 4 +- autoPyTorch/data/tabular_feature_validator.py | 53 +++++++++++-------- test/test_data/test_feature_validator.py | 35 ++++++------ 3 files changed, 49 insertions(+), 43 deletions(-) diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index eae832128..2dc97f3a9 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -1,5 +1,5 @@ import logging -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional, Set, Tuple, Union import numpy as np @@ -64,7 +64,7 @@ def __init__(self, self.categorical_columns: List[int] = [] self.numerical_columns: List[int] = [] - self.all_nan_columns: Optional[List[Union[int, str]]] = None + self.all_nan_columns: Optional[Set[Union[int, str]]] = None self._is_fitted = False diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 071e9c6fe..611a8060f 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -49,7 +49,7 @@ def _create_column_transformer( return ColumnTransformer([ ('categorical_pipeline', categorical_pipeline, categorical_columns), ('numerical_pipeline', numerical_pipeline, numerical_columns)], - remainder='passthrough' + remainder='drop' ) @@ -109,13 +109,12 @@ def _fit( if hasattr(X, "iloc") and not scipy.sparse.issparse(X): X = cast(pd.DataFrame, X) - categorical_columns, numerical_columns, feat_type = self._get_columns_info(X) - - self.all_nan_columns = list() + self.all_nan_columns = set() for column in X.columns: if X[column].isna().all(): - self.all_nan_columns.append(column) + self.all_nan_columns.add(column) + categorical_columns, numerical_columns, feat_type = self._get_columns_info(X) self.enc_columns = categorical_columns @@ -206,17 +205,15 @@ def transform( # Check the data here so we catch problems on new test data self._check_data(X) - if self.all_nan_columns is None: - raise NotFittedError("Expected all_nan_columns to be" - " initialised during fit, got {}".format(self.all_nan_columns)) - if set(self.all_nan_columns).issubset(X.columns): - raise ValueError("Expected all nan columns {} to be a" - "subset of the columns of the dataset {}".format( - self.all_nan_columns, - X.columns - ) - ) - X.drop(labels=self.all_nan_columns, axis=1, inplace=True) + # in case of test data being all none and train data + # having a value for a categorical column. + # We need to convert the column in test data to + # object otherwise the test column is interpreted as float + if len(self.categorical_columns) > 0: + categorical_columns = self.column_transformer.transformers_[0][-1] + for column in categorical_columns: + if X[column].isna().all(): + X[column] = X[column].astype('object') X = self.column_transformer.transform(X) @@ -307,13 +304,20 @@ def _check_data( dtypes = [dtype.name for dtype in X.dtypes] if len(self.dtypes) > 0: - if self.dtypes != dtypes: - raise ValueError("Changing the dtype of the features after fit() is " - "not supported. Fit() method was called with " - "{} whereas the new features have {} as type".format(self.dtypes, - dtypes, - ) - ) + dtypes_diff = [s_dtype != dtype for s_dtype, dtype in zip(self.dtypes, dtypes)] + if any(dtypes_diff): + if self.all_nan_columns is not None and len(self.all_nan_columns) > 0: + if len(set(X.columns[dtypes_diff]).difference(self.all_nan_columns)) != 0: + # we expect the dtypes to only be different if the column belongs + # to all_nan_columns as these columns would be imputed. if there is + # a value in the test set for a column in all_nan_columns, pandas + # does not recognise the dtype of the test column properly + raise ValueError("Changing the dtype of the features after fit() is " + "not supported. Fit() method was called with " + "{} whereas the new features have {} as type".format(self.dtypes, + dtypes, + ) + ) else: self.dtypes = dtypes @@ -344,6 +348,8 @@ def _get_columns_info( # Make sure each column is a valid type for i, column in enumerate(X.columns): + if self.all_nan_columns is not None and column in self.all_nan_columns: + continue column_dtype = self.dtypes[i] if column_dtype in ['category', 'bool']: categorical_columns.append(column) @@ -474,6 +480,7 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame: for column, data_type in zip(X.columns, X.dtypes): if not is_numeric_dtype(data_type): X[column] = X[column].astype('category') + # only numerical attributes and categories data_types = X.dtypes self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)} diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index 54101a4b9..a166bf97c 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -346,8 +346,8 @@ def test_featurevalidator_remove_nan__catcolumns(): transformed_df_train = validator.transform(df_train) transformed_df_test = validator.transform(df_test) - assert np.array_equal(transformed_df_train, np.array([[0, 1], [1, 0], [0, 1]])) - assert np.array_equal(transformed_df_test, np.array([[1, 0], [1, 0], [0, 1]])) + assert np.array_equal(transformed_df_train, np.array([[0, 1], [1, 0], [0, 1]], dtype=float)) + assert np.array_equal(transformed_df_test, np.array([[1, 0], [1, 0], [0, 1]], dtype=float)) # Second case, there exist null columns in the training set and the same # are null in the test set. @@ -374,8 +374,8 @@ def test_featurevalidator_remove_nan__catcolumns(): transformed_df_train = validator.transform(df_train) transformed_df_test = validator.transform(df_test) - assert np.array_equal(transformed_df_train, np.array([[0, 1], [1, 0], [0, 1]])) - assert np.array_equal(transformed_df_test, np.array([[1, 0], [1, 0], [0, 1]])) + assert np.array_equal(transformed_df_train, np.array([[0, 1], [1, 0], [0, 1]], dtype=float)) + assert np.array_equal(transformed_df_test, np.array([[1, 0], [1, 0], [0, 1]], dtype=float)) # Third case, there exist no null columns in the training set and a # few null columns exist in the test set. @@ -400,8 +400,8 @@ def test_featurevalidator_remove_nan__catcolumns(): transformed_df_train = validator.transform(df_train) transformed_df_test = validator.transform(df_test) - assert np.array_equal(transformed_df_train, np.array([[0, 1, 0, 1], [1, 0, 1, 0]])) - assert np.array_equal(transformed_df_test, np.array([[0, 0, 0, 0], [0, 0, 0, 0]])) + assert np.array_equal(transformed_df_train, np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=float)) + assert np.array_equal(transformed_df_test, np.array([[0, 0, 0, 0], [0, 0, 0, 0]], dtype=float)) def test_features_unsupported_calls_are_raised(): @@ -636,16 +636,19 @@ def test_feature_validator_imbalanced_data(): validator = TabularFeatureValidator() validator.fit(X_train) + transformed_X_train = validator.transform(X_train) + train_feature_types = copy.deepcopy(validator.feat_type) - assert train_feature_types == ['numerical', 'numerical', 'numerical', 'numerical'] + assert train_feature_types == ['numerical'] # validator will throw an error if the column types are not the same transformed_X_test = validator.transform(X_test) transformed_X_test = pd.DataFrame(transformed_X_test) - null_columns = [] - for column in transformed_X_test.columns: - if transformed_X_test[column].isna().all(): - null_columns.append(column) - assert null_columns == [0, 2, 3] + assert sorted(validator.all_nan_columns) == sorted(['A', 'C', 'D']) + # as there are no categorical columns, we can make such an + # assertion. We only expect to drop the all nan columns + total_all_nan_columns = len(validator.all_nan_columns) + total_columns = len(validator.column_order) + assert total_columns - total_all_nan_columns == len(transformed_X_test.columns) # Columns with not all null values in the train split and # completely null on the test split. @@ -664,14 +667,10 @@ def test_feature_validator_imbalanced_data(): X_test = pd.DataFrame.from_dict(test_features) validator = TabularFeatureValidator() validator.fit(X_train) + train_feature_types = copy.deepcopy(validator.feat_type) assert train_feature_types == ['categorical', 'numerical', 'numerical'] transformed_X_test = validator.transform(X_test) transformed_X_test = pd.DataFrame(transformed_X_test) - null_columns = [] - for column in transformed_X_test.columns: - if transformed_X_test[column].isna().all(): - null_columns.append(column) - - assert null_columns == [1] + assert not len(validator.all_nan_columns) From 369edad26d37186d48c1bd02aff738929bf32b48 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 8 Oct 2021 14:39:24 +0200 Subject: [PATCH 20/24] fix flake errors in feature validator --- autoPyTorch/data/base_feature_validator.py | 3 ++- autoPyTorch/data/tabular_feature_validator.py | 24 +++++++++---------- test/test_data/test_feature_validator.py | 4 +--- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index 2dc97f3a9..757a19b46 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -43,7 +43,8 @@ class BaseFeatureValidator(BaseEstimator): """ def __init__(self, logger: Optional[Union[PicklableClientLogger, logging.Logger - ]] = None, + ] + ] = None, ) -> None: # Register types to detect unsupported data format changes self.feat_type = None # type: Optional[List[str]] diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 611a8060f..7f17d918e 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -1,5 +1,5 @@ import functools -from typing import Any, Dict, List, Optional, Tuple, Union, cast +from typing import Dict, List, Optional, Tuple, cast import numpy as np @@ -114,7 +114,7 @@ def _fit( if X[column].isna().all(): self.all_nan_columns.add(column) - categorical_columns, numerical_columns, feat_type = self._get_columns_info(X) + categorical_columns, numerical_columns, feat_type = self._get_columns_info(X) self.enc_columns = categorical_columns @@ -207,7 +207,7 @@ def transform( # in case of test data being all none and train data # having a value for a categorical column. - # We need to convert the column in test data to + # We need to convert the column in test data to # object otherwise the test column is interpreted as float if len(self.categorical_columns) > 0: categorical_columns = self.column_transformer.transformers_[0][-1] @@ -308,16 +308,16 @@ def _check_data( if any(dtypes_diff): if self.all_nan_columns is not None and len(self.all_nan_columns) > 0: if len(set(X.columns[dtypes_diff]).difference(self.all_nan_columns)) != 0: - # we expect the dtypes to only be different if the column belongs - # to all_nan_columns as these columns would be imputed. if there is - # a value in the test set for a column in all_nan_columns, pandas + # we expect the dtypes to only be different if the column belongs + # to all_nan_columns as these columns would be imputed. if there is + # a value in the test set for a column in all_nan_columns, pandas # does not recognise the dtype of the test column properly raise ValueError("Changing the dtype of the features after fit() is " - "not supported. Fit() method was called with " - "{} whereas the new features have {} as type".format(self.dtypes, - dtypes, - ) - ) + "not supported. Fit() method was called with " + "{} whereas the new features have {} as type".format(self.dtypes, + dtypes, + ) + ) else: self.dtypes = dtypes @@ -482,7 +482,6 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame: X[column] = X[column].astype('category') # only numerical attributes and categories - data_types = X.dtypes self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)} self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}") @@ -506,4 +505,3 @@ def has_object_columns( otherwise. """ return np.dtype('O') in feature_types - diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index a166bf97c..c2d516162 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -636,15 +636,13 @@ def test_feature_validator_imbalanced_data(): validator = TabularFeatureValidator() validator.fit(X_train) - transformed_X_train = validator.transform(X_train) - train_feature_types = copy.deepcopy(validator.feat_type) assert train_feature_types == ['numerical'] # validator will throw an error if the column types are not the same transformed_X_test = validator.transform(X_test) transformed_X_test = pd.DataFrame(transformed_X_test) assert sorted(validator.all_nan_columns) == sorted(['A', 'C', 'D']) - # as there are no categorical columns, we can make such an + # as there are no categorical columns, we can make such an # assertion. We only expect to drop the all nan columns total_all_nan_columns = len(validator.all_nan_columns) total_columns = len(validator.column_order) From a4fb0cb4af571e1ba81384b4cb3e4e0f1f94beb9 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 8 Oct 2021 14:57:07 +0200 Subject: [PATCH 21/24] made typing code uniform --- autoPyTorch/data/base_feature_validator.py | 16 ++++----- autoPyTorch/data/base_target_validator.py | 40 +++++++++++----------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index 757a19b46..ed109e380 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -47,21 +47,21 @@ def __init__(self, ] = None, ) -> None: # Register types to detect unsupported data format changes - self.feat_type = None # type: Optional[List[str]] - self.data_type = None # type: Optional[type] - self.dtypes = [] # type: List[str] - self.column_order = [] # type: List[str] + self.feat_type: Optional[List[str]] = None + self.data_type: Optional[type] = None + self.dtypes: List[str] = [] + self.column_order: List[str] = [] - self.encoder = None # type: Optional[BaseEstimator] - self.enc_columns = [] # type: List[str] + self.encoder: Optional[BaseEstimator] = None + self.enc_columns: List[str] = [] self.logger: Union[ PicklableClientLogger, logging.Logger ] = logger if logger is not None else logging.getLogger(__name__) # Required for dataset properties - self.num_features = None # type: Optional[int] - self.categories = [] # type: List[List[int]] + self.num_features: Optional[int] = None + self.categories: List[List[int]] = [] self.categorical_columns: List[int] = [] self.numerical_columns: List[int] = [] diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py index dba9c19e3..0fb318476 100644 --- a/autoPyTorch/data/base_target_validator.py +++ b/autoPyTorch/data/base_target_validator.py @@ -1,5 +1,5 @@ import logging -import typing +from typing import List, Optional, Union, cast import numpy as np @@ -12,8 +12,8 @@ from autoPyTorch.utils.logging_ import PicklableClientLogger -SUPPORTED_TARGET_TYPES = typing.Union[ - typing.List, +SUPPORTED_TARGET_TYPES = Union[ + List, pd.Series, pd.DataFrame, np.ndarray, @@ -35,39 +35,39 @@ class BaseTargetValidator(BaseEstimator): is_classification (bool): A bool that indicates if the validator should operate in classification mode. During classification, the targets are encoded. - encoder (typing.Optional[BaseEstimator]): + encoder (Optional[BaseEstimator]): Host a encoder object if the data requires transformation (for example, if provided a categorical column in a pandas DataFrame) - enc_columns (typing.List[str]) + enc_columns (List[str]) List of columns that where encoded """ def __init__(self, is_classification: bool = False, - logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger + logger: Optional[Union[PicklableClientLogger, logging.Logger ]] = None, ) -> None: self.is_classification = is_classification - self.data_type = None # type: typing.Optional[type] + self.data_type: Optional[type] = None - self.encoder = None # type: typing.Optional[BaseEstimator] + self.encoder: Optional[BaseEstimator] = None - self.out_dimensionality = None # type: typing.Optional[int] - self.type_of_target = None # type: typing.Optional[str] + self.out_dimensionality: Optional[int] = None + self.type_of_target: Optional[str] = None - self.logger: typing.Union[ + self.logger: Union[ PicklableClientLogger, logging.Logger ] = logger if logger is not None else logging.getLogger(__name__) # Store the dtype for remapping to correct type - self.dtype = None # type: typing.Optional[type] + self.dtype: Optional[type] = None self._is_fitted = False def fit( self, y_train: SUPPORTED_TARGET_TYPES, - y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None, + y_test: Optional[SUPPORTED_TARGET_TYPES] = None, ) -> BaseEstimator: """ Validates and fit a categorical encoder (if needed) to the targets @@ -76,7 +76,7 @@ def fit( Arguments: y_train (SUPPORTED_TARGET_TYPES) A set of targets set aside for training - y_test (typing.Union[SUPPORTED_TARGET_TYPES]) + y_test (Union[SUPPORTED_TARGET_TYPES]) A hold out set of data used of the targets. It is also used to fit the categories of the encoder. """ @@ -95,8 +95,8 @@ def fit( np.shape(y_test) )) if isinstance(y_train, pd.DataFrame): - y_train = typing.cast(pd.DataFrame, y_train) - y_test = typing.cast(pd.DataFrame, y_test) + y_train = cast(pd.DataFrame, y_train) + y_test = cast(pd.DataFrame, y_test) if y_train.columns.tolist() != y_test.columns.tolist(): raise ValueError( "Train and test targets must both have the same columns, yet " @@ -127,21 +127,21 @@ def fit( def _fit( self, y_train: SUPPORTED_TARGET_TYPES, - y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None, + y_test: Optional[SUPPORTED_TARGET_TYPES] = None, ) -> BaseEstimator: """ Arguments: y_train (SUPPORTED_TARGET_TYPES) The labels of the current task. They are going to be encoded in case of classification - y_test (typing.Optional[SUPPORTED_TARGET_TYPES]) + y_test (Optional[SUPPORTED_TARGET_TYPES]) A holdout set of labels """ raise NotImplementedError() def transform( self, - y: typing.Union[SUPPORTED_TARGET_TYPES], + y: Union[SUPPORTED_TARGET_TYPES], ) -> np.ndarray: """ Arguments: @@ -162,7 +162,7 @@ def inverse_transform( Revert any encoding transformation done on a target array Arguments: - y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]): + y (Union[np.ndarray, pd.DataFrame, pd.Series]): Target array to be transformed back to original form before encoding Returns: np.ndarray: From 44229a680e47bc0e3c0da0fd28ad9128ee589953 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Fri, 8 Oct 2021 17:02:50 +0200 Subject: [PATCH 22/24] Apply suggestions from code review Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- autoPyTorch/data/tabular_feature_validator.py | 9 +++------ test/test_data/test_feature_validator.py | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 7f17d918e..e51b2b387 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -109,10 +109,7 @@ def _fit( if hasattr(X, "iloc") and not scipy.sparse.issparse(X): X = cast(pd.DataFrame, X) - self.all_nan_columns = set() - for column in X.columns: - if X[column].isna().all(): - self.all_nan_columns.add(column) + self.all_nan_columns = set([column for column in X.columns if X[column].isna().all()]) categorical_columns, numerical_columns, feat_type = self._get_columns_info(X) @@ -284,8 +281,8 @@ def _check_data( X = cast(pd.DataFrame, X) # Handle objects if possible - object_columns_indicator = has_object_columns(X.dtypes.values) - if object_columns_indicator: + exist_object_columns = has_object_columns(X.dtypes.values) + if exist_object_columns: X = self.infer_objects(X) # Define the column to be encoded here as the feature validator is fitted once diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index c2d516162..535023cd2 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -317,7 +317,7 @@ def test_featurevalidator_get_columns_to_encode(): assert feature_types == ['numerical', 'numerical', 'categorical', 'categorical'] -def test_featurevalidator_remove_nan__catcolumns(): +def test_featurevalidator_remove_nan_catcolumns(): """ Make sure categorical columns that have only nan values are removed. """ From ba3c1e7852cbf2b814a5a353e31facdcf94feda9 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 8 Oct 2021 17:02:27 +0200 Subject: [PATCH 23/24] address comments from shuhei --- autoPyTorch/data/base_feature_validator.py | 4 +- autoPyTorch/data/tabular_feature_validator.py | 67 ++++++++----------- 2 files changed, 31 insertions(+), 40 deletions(-) diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index ed109e380..9ed46d6e6 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -47,8 +47,8 @@ def __init__(self, ] = None, ) -> None: # Register types to detect unsupported data format changes - self.feat_type: Optional[List[str]] = None - self.data_type: Optional[type] = None + self.feat_type: Optional[List[str]] = None + self.data_type: Optional[type] = None self.dtypes: List[str] = [] self.column_order: List[str] = [] diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index e51b2b387..9a84e63ec 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -310,7 +310,8 @@ def _check_data( # a value in the test set for a column in all_nan_columns, pandas # does not recognise the dtype of the test column properly raise ValueError("Changing the dtype of the features after fit() is " - "not supported. Fit() method was called with " + "not supported. The dtype of some columns are different " + "between training and test datasets. Fit() method was called with " "{} whereas the new features have {} as type".format(self.dtypes, dtypes, ) @@ -348,51 +349,41 @@ def _get_columns_info( if self.all_nan_columns is not None and column in self.all_nan_columns: continue column_dtype = self.dtypes[i] + err_msg = "Valid types are `numerical`, `categorical` or `boolean`, " \ + "but input Column {} has an invalid type `{}`.".format(column, column_dtype) if column_dtype in ['category', 'bool']: categorical_columns.append(column) feat_type.append('categorical') # Move away from np.issubdtype as it causes # TypeError: data type not understood in certain pandas types - elif not is_numeric_dtype(column_dtype): + elif is_numeric_dtype(column_dtype): + feat_type.append('numerical') + numerical_columns.append(column) + elif column_dtype == 'object': # TODO verify how would this happen when we always convert the object dtypes to category - if column_dtype == 'object': - raise ValueError( - "Input Column {} has invalid type object. " - "Cast it to a valid dtype before using it in AutoPyTorch. " - "Valid types are numerical, categorical or boolean. " - "You can cast it to a valid dtype using " - "pandas.Series.astype ." - "If working with string objects, the following " - "tutorial illustrates how to work with text data: " - "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format( - # noqa: E501 - column, - ) - ) - elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype( - column_dtype - ): - raise ValueError( - "AutoPyTorch does not support time and/or date datatype as given " - "in column {}. Please convert the time information to a numerical value " - "first. One example on how to do this can be found on " - "https://stats.stackexchange.com/questions/311494/".format( - column, - ) - ) - else: - raise ValueError( - "Input Column {} has unsupported dtype {}. " - "Supported column types are categorical/bool/numerical dtypes. " - "Make sure your data is formatted in a correct way, " - "before feeding it to AutoPyTorch.".format( - column, - column_dtype, - ) + raise ValueError( + "{} Cast it to a valid dtype before feeding it to AutoPyTorch. " + "You can cast it to a valid dtype using pandas.Series.astype." + "If you are working with string objects, the following " + "tutorial illustrates how to work with text data: " + "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format( + # noqa: E501 + err_msg, ) + ) + elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype(column_dtype): + raise ValueError( + "{} Convert the time information to a numerical value" + " before feeding it to AutoPyTorch. " + "One example of the conversion can be found on " + "https://stats.stackexchange.com/questions/311494/".format(err_msg) + ) else: - feat_type.append('numerical') - numerical_columns.append(column) + raise ValueError( + "{} Make sure your data is formatted in a correct way" + "before feeding it to AutoPyTorch.".format(err_msg) + ) + return categorical_columns, numerical_columns, feat_type def list_to_dataframe( From 10a8441c201eaedf92ba0af406eaa7a90b74ad8f Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 8 Oct 2021 17:05:34 +0200 Subject: [PATCH 24/24] address comments from shuhei (2) --- autoPyTorch/data/tabular_feature_validator.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 9a84e63ec..3f939bc98 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -109,7 +109,8 @@ def _fit( if hasattr(X, "iloc") and not scipy.sparse.issparse(X): X = cast(pd.DataFrame, X) - self.all_nan_columns = set([column for column in X.columns if X[column].isna().all()]) + + self.all_nan_columns = set([column for column in X.columns if X[column].isna().all()]) categorical_columns, numerical_columns, feat_type = self._get_columns_info(X) @@ -147,15 +148,6 @@ def comparator(cmp1: str, cmp2: str) -> int: key=functools.cmp_to_key(comparator) ) - if len(categorical_columns) > 0: - self.categories = [ - # We fit a one-hot encoder, where all categorical - # columns are shifted to the left - list(range(len(cat))) - for cat in self.column_transformer.named_transformers_[ - 'categorical_pipeline'].named_steps['onehotencoder'].categories_ - ] - # differently to categorical_columns and numerical_columns, # this saves the index of the column. for i, type_ in enumerate(self.feat_type):