diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py
index 0106a3aa8..9ed46d6e6 100644
--- a/autoPyTorch/data/base_feature_validator.py
+++ b/autoPyTorch/data/base_feature_validator.py
@@ -1,5 +1,5 @@
 import logging
-import typing
+from typing import List, Optional, Set, Tuple, Union

 import numpy as np
@@ -12,8 +12,8 @@
 from autoPyTorch.utils.logging_ import PicklableClientLogger


-SUPPORTED_FEAT_TYPES = typing.Union[
-    typing.List,
+SUPPORTED_FEAT_TYPES = Union[
+    List,
     pd.DataFrame,
     np.ndarray,
     scipy.sparse.bsr_matrix,
@@ -35,43 +35,44 @@ class BaseFeatureValidator(BaseEstimator):
         List of the column types found by this estimator during fit.
     data_type (str):
         Class name of the data type provided during fit.
-    encoder (typing.Optional[BaseEstimator])
+    encoder (Optional[BaseEstimator]):
         Host a encoder object if the data requires transformation (for example,
         if provided a categorical column in a pandas DataFrame)
-    enc_columns (typing.List[str])
+    enc_columns (List[str]):
        List of columns that were encoded.
    """
    def __init__(self,
-                 logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger
-                                                      ]] = None,
+                 logger: Optional[Union[PicklableClientLogger, logging.Logger
+                                        ]
+                 ] = None,
                 ) -> None:
        # Register types to detect unsupported data format changes
-        self.feat_type = None  # type: typing.Optional[typing.List[str]]
-        self.data_type = None  # type: typing.Optional[type]
-        self.dtypes = []  # type: typing.List[str]
-        self.column_order = []  # type: typing.List[str]
+        self.feat_type: Optional[List[str]] = None
+        self.data_type: Optional[type] = None
+        self.dtypes: List[str] = []
+        self.column_order: List[str] = []

-        self.encoder = None  # type: typing.Optional[BaseEstimator]
-        self.enc_columns = []  # type: typing.List[str]
+        self.encoder: Optional[BaseEstimator] = None
+        self.enc_columns: List[str] = []

-        self.logger: typing.Union[
+        self.logger: Union[
             PicklableClientLogger, logging.Logger
         ] = logger if logger is not None else logging.getLogger(__name__)

         # Required for dataset properties
-        self.num_features = None  # type: typing.Optional[int]
-        self.categories = []  # type: typing.List[typing.List[int]]
-        self.categorical_columns: typing.List[int] = []
-        self.numerical_columns: typing.List[int] = []
-        # column identifiers may be integers or strings
-        self.null_columns: typing.Set[str] = set()
+        self.num_features: Optional[int] = None
+        self.categories: List[List[int]] = []
+        self.categorical_columns: List[int] = []
+        self.numerical_columns: List[int] = []
+
+        self.all_nan_columns: Optional[Set[Union[int, str]]] = None

         self._is_fitted = False

     def fit(
         self,
         X_train: SUPPORTED_FEAT_TYPES,
-        X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None,
+        X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
     ) -> BaseEstimator:
         """
         Validates and fit a categorical encoder (if needed) to the features.
@@ -82,7 +83,7 @@ def fit(
             X_train (SUPPORTED_FEAT_TYPES):
                 A set of features that are going to be validated (type and dimensionality
                 checks) and a encoder fitted in the case the data needs encoding
-            X_test (typing.Optional[SUPPORTED_FEAT_TYPES]):
+            X_test (Optional[SUPPORTED_FEAT_TYPES]):
                 A hold out set of data used for checking
         """
@@ -122,6 +123,7 @@ def _fit(
         self:
             The fitted base estimator
         """
+
         raise NotImplementedError()

     def _check_data(
@@ -136,6 +138,7 @@ def _check_data(
             A set of features that are going to be validated (type and dimensionality
             checks) and a encoder fitted in the case the data needs encoding
         """
+
         raise NotImplementedError()

     def transform(
@@ -152,4 +155,30 @@ def transform(
             np.ndarray:
                 The transformed array
         """
+
+        raise NotImplementedError()
+
+    def list_to_dataframe(
+        self,
+        X_train: SUPPORTED_FEAT_TYPES,
+        X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
+    ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
+        """
+        Converts a list to a pandas DataFrame. In this process, column types are inferred.
+
+        If test data is provided, we proactively match its columns to the train data.
+
+        Arguments:
+            X_train (SUPPORTED_FEAT_TYPES):
+                A set of features that are going to be validated (type and dimensionality
+                checks) and an encoder fitted in case the data needs encoding
+            X_test (Optional[SUPPORTED_FEAT_TYPES]):
+                A hold out set of data used for checking
+        Returns:
+            pd.DataFrame:
+                the train data, transformed from a list to a pandas DataFrame
+            pd.DataFrame:
+                the test data, transformed from a list to a pandas DataFrame
+        """
+        raise NotImplementedError()
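Note: the base class leaves `list_to_dataframe` abstract. As a rough illustration of the contract its docstring describes — this is only a sketch, not the autoPyTorch implementation, and the helper name is made up — a concrete version only has to build DataFrames and let pandas infer the dtypes:

```python
from typing import List, Optional, Tuple

import pandas as pd


def list_to_dataframe_sketch(
    X_train: List,
    X_test: Optional[List] = None,
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
    # Let pandas infer one dtype per column from the list contents
    train_df = pd.DataFrame(data=X_train).infer_objects()
    test_df = None
    if X_test is not None:
        # Reuse the train columns so both frames line up
        test_df = pd.DataFrame(data=X_test, columns=train_df.columns).infer_objects()
    return train_df, test_df


train_df, test_df = list_to_dataframe_sketch([[1, 'a'], [2, 'b']], [[3, 'c']])
print(train_df.dtypes.tolist())  # [dtype('int64'), dtype('O')]
```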
-    encoder (typing.Optional[BaseEstimator]):
+    encoder (Optional[BaseEstimator]):
         Host a encoder object if the data requires transformation (for example,
         if provided a categorical column in a pandas DataFrame)
-    enc_columns (typing.List[str])
+    enc_columns (List[str]):
         List of columns that where encoded
     """
     def __init__(self,
                  is_classification: bool = False,
-                 logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger
+                 logger: Optional[Union[PicklableClientLogger, logging.Logger
                                                       ]] = None,
                  ) -> None:
         self.is_classification = is_classification

-        self.data_type = None  # type: typing.Optional[type]
+        self.data_type: Optional[type] = None

-        self.encoder = None  # type: typing.Optional[BaseEstimator]
+        self.encoder: Optional[BaseEstimator] = None

-        self.out_dimensionality = None  # type: typing.Optional[int]
-        self.type_of_target = None  # type: typing.Optional[str]
+        self.out_dimensionality: Optional[int] = None
+        self.type_of_target: Optional[str] = None

-        self.logger: typing.Union[
+        self.logger: Union[
             PicklableClientLogger, logging.Logger
         ] = logger if logger is not None else logging.getLogger(__name__)

         # Store the dtype for remapping to correct type
-        self.dtype = None  # type: typing.Optional[type]
+        self.dtype: Optional[type] = None

         self._is_fitted = False

     def fit(
         self,
         y_train: SUPPORTED_TARGET_TYPES,
-        y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
+        y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
     ) -> BaseEstimator:
         """
         Validates and fit a categorical encoder (if needed) to the targets
@@ -76,7 +76,7 @@ def fit(
         Arguments:
             y_train (SUPPORTED_TARGET_TYPES)
                 A set of targets set aside for training
-            y_test (typing.Union[SUPPORTED_TARGET_TYPES])
+            y_test (Optional[SUPPORTED_TARGET_TYPES])
                 A hold out set of data used of the targets. It is also used to fit the
                 categories of the encoder.
         """
@@ -95,8 +95,8 @@ def fit(
                         np.shape(y_test)
                     ))
             if isinstance(y_train, pd.DataFrame):
-                y_train = typing.cast(pd.DataFrame, y_train)
-                y_test = typing.cast(pd.DataFrame, y_test)
+                y_train = cast(pd.DataFrame, y_train)
+                y_test = cast(pd.DataFrame, y_test)
                 if y_train.columns.tolist() != y_test.columns.tolist():
                     raise ValueError(
                         "Train and test targets must both have the same columns, yet "
@@ -127,21 +127,21 @@ def fit(
     def _fit(
         self,
         y_train: SUPPORTED_TARGET_TYPES,
-        y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
+        y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
     ) -> BaseEstimator:
         """
         Arguments:
             y_train (SUPPORTED_TARGET_TYPES)
                 The labels of the current task. They are going to be encoded in
                 case of classification
-            y_test (typing.Optional[SUPPORTED_TARGET_TYPES])
+            y_test (Optional[SUPPORTED_TARGET_TYPES])
                 A holdout set of labels
         """
         raise NotImplementedError()

     def transform(
         self,
-        y: typing.Union[SUPPORTED_TARGET_TYPES],
+        y: Union[SUPPORTED_TARGET_TYPES],
     ) -> np.ndarray:
         """
         Arguments:
@@ -162,7 +162,7 @@ def inverse_transform(
         Revert any encoding transformation done on a target array

         Arguments:
-            y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]):
+            y (Union[np.ndarray, pd.DataFrame, pd.Series]):
                 Target array to be transformed back to original form before encoding
         Returns:
             np.ndarray:
diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
index 698e92438..3f939bc98 100644
--- a/autoPyTorch/data/tabular_feature_validator.py
+++ b/autoPyTorch/data/tabular_feature_validator.py
@@ -1,5 +1,5 @@
 import functools
-from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from typing import Dict, List, Optional, Tuple, cast

 import numpy as np
@@ -22,7 +22,7 @@
 def _create_column_transformer(
     preprocessors: Dict[str, List[BaseEstimator]],
     numerical_columns: List[str],
-    categorical_columns: List[str]
+    categorical_columns: List[str],
 ) -> ColumnTransformer:
     """
     Given a dictionary of preprocessors, this function
@@ -38,6 +38,7 @@ def _create_column_transformer(
     Returns:
         ColumnTransformer
     """
+
     numerical_pipeline = 'drop'
     categorical_pipeline = 'drop'
     if len(numerical_columns) > 0:
@@ -48,7 +49,7 @@ def _create_column_transformer(
     return ColumnTransformer([
         ('categorical_pipeline', categorical_pipeline, categorical_columns),
         ('numerical_pipeline', numerical_pipeline, numerical_columns)],
-        remainder='passthrough'
+        remainder='drop'
     )


@@ -60,21 +61,23 @@ def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]:
         Dict[str, List[BaseEstimator]]
     """
     preprocessors: Dict[str, List[BaseEstimator]] = dict()
-    preprocessors['numerical'] = list()
-    preprocessors['categorical'] = list()
-    preprocessors['categorical'].append(OneHotEncoder(
-        categories='auto',
-        sparse=False,
-        handle_unknown='ignore'))
-    preprocessors['numerical'].append(SimpleImputer(strategy='median',
-                                                    copy=False))
-    preprocessors['numerical'].append(StandardScaler(with_mean=True, with_std=True, copy=False))
+    # Categorical Preprocessors
+    onehot_encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore')
+    categorical_imputer = SimpleImputer(strategy='constant', copy=False)
+
+    # Numerical Preprocessors
+    numerical_imputer = SimpleImputer(strategy='median', copy=False)
+    standard_scaler = StandardScaler(with_mean=True, with_std=True, copy=False)
+
+    preprocessors['categorical'] = [categorical_imputer, onehot_encoder]
+    preprocessors['numerical'] = [numerical_imputer, standard_scaler]

     return preprocessors
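To see how these pieces are meant to compose, here is a hedged, self-contained sketch on toy data (not repo code; on newer scikit-learn releases `sparse=False` becomes `sparse_output=False`). Each list from `get_tabular_preprocessors` chains into a pipeline, and `remainder='drop'` is what makes a column assigned to neither list — notably the all-NaN columns skipped later in `_get_columns_info` — disappear from the output:

```python
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Toy frame: one categorical, one numerical, and one all-NaN column
X = pd.DataFrame({
    'cat': pd.Series(['a', 'b', 'a'], dtype='category'),
    'num': [1.0, 2.0, 3.0],
    'all_nan': [np.nan, np.nan, np.nan],
})

categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='constant', copy=False),
    OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore'),
)
numerical_pipeline = make_pipeline(
    SimpleImputer(strategy='median', copy=False),
    StandardScaler(with_mean=True, with_std=True, copy=False),
)

# 'all_nan' is named in neither column list, so remainder='drop' removes it
ct = ColumnTransformer([
    ('categorical_pipeline', categorical_pipeline, ['cat']),
    ('numerical_pipeline', numerical_pipeline, ['num']),
], remainder='drop')

# Two one-hot columns for 'cat' plus one scaled column for 'num'
print(ct.fit_transform(X).shape)  # (3, 3)
```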
 class TabularFeatureValidator(BaseFeatureValidator):
+
     def _fit(
         self,
         X: SUPPORTED_FEAT_TYPES,
@@ -96,24 +99,29 @@ def _fit(
         # The final output of a validator is a numpy array. But pandas
         # gives us information about the column dtype
         if isinstance(X, np.ndarray):
+            X = self.numpy_array_to_pandas(X)
+            # X is now a DataFrame, so replace the previously saved data type
+            self.data_type = type(X)

+        # save all the information about the column order and data types
+        self._check_data(X)

         if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
+            X = cast(pd.DataFrame, X)
-            if not X.select_dtypes(include='object').empty:
-                X = self.infer_objects(X)
+            self.all_nan_columns = {column for column in X.columns if X[column].isna().all()}

-            self._check_data(X)
             categorical_columns, numerical_columns, feat_type = self._get_columns_info(X)

             self.enc_columns = categorical_columns
-            if len(categorical_columns) >= 0:
-                X = self.impute_nan_in_categories(X)

+            preprocessors = get_tabular_preprocessors()
-            self.column_transformer = _create_column_transformer(preprocessors=preprocessors,
-                                                                 numerical_columns=numerical_columns,
-                                                                 categorical_columns=categorical_columns)
+            self.column_transformer = _create_column_transformer(
+                preprocessors=preprocessors,
+                numerical_columns=numerical_columns,
+                categorical_columns=categorical_columns,
+            )

             # Mypy redefinition
             assert self.column_transformer is not None
@@ -140,15 +148,8 @@ def comparator(cmp1: str, cmp2: str) -> int:
                 key=functools.cmp_to_key(comparator)
             )

-            if len(categorical_columns) > 0:
-                self.categories = [
-                    # We fit an ordinal encoder, where all categorical
-                    # columns are shifted to the left
-                    list(range(len(cat)))
-                    for cat in self.column_transformer.named_transformers_[
-                        'categorical_pipeline'].named_steps['onehotencoder'].categories_
-                ]
-
+            # Unlike categorical_columns and numerical_columns above, these
+            # attributes store the index of the column rather than its name
             for i, type_ in enumerate(self.feat_type):
                 if 'numerical' in type_:
                     self.numerical_columns.append(i)
@@ -156,7 +157,8 @@ def comparator(cmp1: str, cmp2: str) -> int:
                     self.categorical_columns.append(i)

         # Lastly, store the number of features
-        self.num_features = np.shape(X)[1]
+        self.num_features = len(X.columns)
+
         return self
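The `all_nan_columns` bookkeeping above is a one-liner in pandas. A minimal illustration on a toy frame (not repo code):

```python
import numpy as np
import pandas as pd

X = pd.DataFrame({
    'A': [1.0, np.nan, 1.0],
    'B': [np.nan, np.nan, np.nan],
})

# A column joins all_nan_columns only when every entry is missing;
# equivalent to the set comprehension used in _fit
all_nan_columns = set(X.columns[X.isna().all()])
print(all_nan_columns)  # {'B'}
```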

     def transform(
@@ -189,16 +191,19 @@ def transform(
         if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
             X = cast(pd.DataFrame, X)

-            # Also remove the object dtype for new data
-            if not X.select_dtypes(include='object').empty:
-                X = self.infer_objects(X)
-
         # Check the data here so we catch problems on new test data
         self._check_data(X)
-        # We also need to fillna on the transformation
-        # in case test data is provided
-        if len(self.categorical_columns) >= 0:
-            X = self.impute_nan_in_categories(X)
+
+        # If a categorical column contains only NaN in the test data while the
+        # train data had values for it, pandas interprets the test column as
+        # float. Cast it back to object so the fitted categorical pipeline
+        # can process it
+        if len(self.categorical_columns) > 0:
+            categorical_columns = self.column_transformer.transformers_[0][-1]
+            for column in categorical_columns:
+                if X[column].isna().all():
+                    X[column] = X[column].astype('object')
+
         X = self.column_transformer.transform(X)

         # Sparse related transformations
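The cast above works around a pandas quirk worth seeing in isolation: a column that is entirely missing at transform time is read back as float64, regardless of what it held at fit time. A small, self-contained demonstration:

```python
import numpy as np
import pandas as pd

# At fit time the column held strings, so it was treated as categorical
train_col = pd.Series(['a', 'b'], dtype='category')
print(train_col.dtype)  # category

# At transform time the same column may be entirely missing;
# pandas then infers float64 rather than category/object
test_col = pd.Series([np.nan, np.nan])
print(test_col.dtype)  # float64

# Casting back to object lets the fitted imputer + one-hot encoder
# process the column as they did during fit
print(test_col.astype('object').dtype)  # object
```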
" - "You can cast it to a valid dtype using " - "pandas.Series.astype ." - "If working with string objects, the following " - "tutorial illustrates how to work with text data: " - "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format( - # noqa: E501 - column, - ) - ) - elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype( - X[column].dtype - ): - raise ValueError( - "AutoPyTorch does not support time and/or date datatype as given " - "in column {}. Please convert the time information to a numerical value " - "first. One example on how to do this can be found on " - "https://stats.stackexchange.com/questions/311494/".format( - column, - ) - ) - else: - raise ValueError( - "Input Column {} has unsupported dtype {}. " - "Supported column types are categorical/bool/numerical dtypes. " - "Make sure your data is formatted in a correct way, " - "before feeding it to AutoPyTorch.".format( - column, - X[column].dtype.name, - ) - ) - else: + elif is_numeric_dtype(column_dtype): feat_type.append('numerical') numerical_columns.append(column) + elif column_dtype == 'object': + # TODO verify how would this happen when we always convert the object dtypes to category + raise ValueError( + "{} Cast it to a valid dtype before feeding it to AutoPyTorch. " + "You can cast it to a valid dtype using pandas.Series.astype." + "If you are working with string objects, the following " + "tutorial illustrates how to work with text data: " + "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format( + # noqa: E501 + err_msg, + ) + ) + elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype(column_dtype): + raise ValueError( + "{} Convert the time information to a numerical value" + " before feeding it to AutoPyTorch. " + "One example of the conversion can be found on " + "https://stats.stackexchange.com/questions/311494/".format(err_msg) + ) + else: + raise ValueError( + "{} Make sure your data is formatted in a correct way" + "before feeding it to AutoPyTorch.".format(err_msg) + ) + return categorical_columns, numerical_columns, feat_type def list_to_dataframe( @@ -394,7 +402,7 @@ def list_to_dataframe( """ # If a list was provided, it will be converted to pandas - X_train = pd.DataFrame(data=X_train).infer_objects() + X_train = pd.DataFrame(data=X_train).convert_dtypes() self.logger.warning("The provided feature types to AutoPyTorch are of type list." "Features have been interpreted as: {}".format([(col, t) for col, t in zip(X_train.columns, X_train.dtypes)])) @@ -403,7 +411,8 @@ def list_to_dataframe( self.logger.warning("Train features are a list while the provided test data" "is {}. 

     @staticmethod
@@ -446,65 +455,33 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
                     self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}")
                     pass
         else:
+            # On the first call, infer dtypes and mark non-numeric columns as category
             X = X.infer_objects()
-            for column in X.columns:
-                if not is_numeric_dtype(X[column]):
+            for column, data_type in zip(X.columns, X.dtypes):
+                if not is_numeric_dtype(data_type):
                     X[column] = X[column].astype('category')
-            self.object_dtype_mapping = {column: X[column].dtype for column in X.columns}
+
+            # by now, the mapping holds only numerical dtypes and categories
+            self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)}

         self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}")
+
         return X

-    def impute_nan_in_categories(self,
-                                 X: pd.DataFrame
-                                 ) -> pd.DataFrame:
-        """
-        impute missing values before encoding,
-        remove once sklearn natively supports
-        it in ordinal encoding. Sklearn issue:
-        "https://github.com/scikit-learn/scikit-learn/issues/17123)"
-
-        Arguments:
-            X (pd.DataFrame):
-                data to be interpreted.
-
-        Returns:
-            pd.DataFrame
-        """
-        # To be on the safe side, map always to the same missing
-        # value per column
-        if not hasattr(self, 'dict_nancol_to_missing'):
-            self.dict_missing_value_per_col: Dict[str, Any] = {}
-
-        # First make sure that we do not alter the type of the column which cause:
-        # TypeError: '<' not supported between instances of 'int' and 'str'
-        # in the encoding
-        for column in self.enc_columns:
-            if X[column].isna().any():
-                if column not in self.dict_missing_value_per_col:
-                    try:
-                        float(X[column].dropna().values[0])
-                        can_cast_as_number = True
-                    except Exception:
-                        can_cast_as_number = False
-                    if can_cast_as_number:
-                        # In this case, we expect to have a number as category
-                        # it might be string, but its value represent a number
-                        missing_value: Union[str, int] = '-1' if isinstance(X[column].dropna().values[0], str) else -1
-                    else:
-                        missing_value = 'Missing!'
-
-                    # Make sure this missing value is not seen before
-                    # Do this check for categorical columns
-                    # else modify the value
-                    if hasattr(X[column], 'cat'):
-                        while missing_value in X[column].cat.categories:
-                            if isinstance(missing_value, str):
-                                missing_value += '0'
-                            else:
-                                missing_value += missing_value
-                    self.dict_missing_value_per_col[column] = missing_value
-
-                # Convert the frame in place
-                X[column].cat.add_categories([self.dict_missing_value_per_col[column]],
-                                             inplace=True)
-                X.fillna({column: self.dict_missing_value_per_col[column]}, inplace=True)
-        return X
+
+def has_object_columns(
+    feature_types: pd.Series,
+) -> bool:
+    """
+    Indicate whether a Series of dtypes for a pandas DataFrame
+    contains one or more object columns.
+
+    Arguments:
+        feature_types (pd.Series):
+            The feature types for a DataFrame.
+    Returns:
+        bool:
+            True if the DataFrame dtypes contain an object column, False
+            otherwise.
+    """
+    return np.dtype('O') in feature_types
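The new module-level helper reduces the old `select_dtypes` check to a membership test against the dtype values. Its usage pattern in `_check_data` looks like this (toy frame for illustration):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})

# has_object_columns(df.dtypes.values): membership test against the dtypes
print(np.dtype('O') in df.dtypes.values)  # True: 'b' is an object column

numeric_only = df.select_dtypes(exclude='object')
print(np.dtype('O') in numeric_only.dtypes.values)  # False
```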
+ """ + return np.dtype('O') in feature_types diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index f9ba2855e..535023cd2 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -317,6 +317,93 @@ def test_featurevalidator_get_columns_to_encode(): assert feature_types == ['numerical', 'numerical', 'categorical', 'categorical'] +def test_featurevalidator_remove_nan_catcolumns(): + """ + Make sure categorical columns that have only nan values are removed. + """ + # First case, there exist null columns in the train set + # and the same columns are not all null for the test set. + validator = TabularFeatureValidator() + + df_train = pd.DataFrame( + [ + {'A': 1, 'B': np.nan, 'C': np.nan}, + {'A': np.nan, 'C': np.nan}, + {'A': 1} + ], + dtype='category', + ) + df_test = pd.DataFrame( + [ + {'A': np.nan, 'B': np.nan, 'C': 5}, + {'A': np.nan, 'C': np.nan}, + {'A': 1} + ], + dtype='category', + ) + + validator.fit(df_train) + transformed_df_train = validator.transform(df_train) + transformed_df_test = validator.transform(df_test) + + assert np.array_equal(transformed_df_train, np.array([[0, 1], [1, 0], [0, 1]], dtype=float)) + assert np.array_equal(transformed_df_test, np.array([[1, 0], [1, 0], [0, 1]], dtype=float)) + + # Second case, there exist null columns in the training set and the same + # are null in the test set. + validator = TabularFeatureValidator() + + df_train = pd.DataFrame( + [ + {'A': 1, 'B': np.nan, 'C': np.nan}, + {'A': np.nan, 'C': np.nan}, + {'A': 1} + ], + dtype='category', + ) + df_test = pd.DataFrame( + [ + {'A': np.nan, 'B': np.nan, 'C': np.nan}, + {'A': np.nan, 'C': np.nan}, + {'A': 1} + ], + dtype='category', + ) + + validator.fit(df_train) + transformed_df_train = validator.transform(df_train) + transformed_df_test = validator.transform(df_test) + + assert np.array_equal(transformed_df_train, np.array([[0, 1], [1, 0], [0, 1]], dtype=float)) + assert np.array_equal(transformed_df_test, np.array([[1, 0], [1, 0], [0, 1]], dtype=float)) + + # Third case, there exist no null columns in the training set and a + # few null columns exist in the test set. 
+    validator = TabularFeatureValidator()
+
+    df_train = pd.DataFrame(
+        [
+            {'A': 1, 'B': 1},
+            {'A': 2, 'B': 2}
+        ],
+        dtype='category',
+    )
+    df_test = pd.DataFrame(
+        [
+            {'A': np.nan, 'B': np.nan},
+            {'A': np.nan, 'B': np.nan}
+        ],
+        dtype='category',
+    )
+
+    validator.fit(df_train)
+    transformed_df_train = validator.transform(df_train)
+    transformed_df_test = validator.transform(df_test)
+
+    assert np.array_equal(transformed_df_train, np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=float))
+    assert np.array_equal(transformed_df_test, np.array([[0, 0, 0, 0], [0, 0, 0, 0]], dtype=float))
+
+
 def test_features_unsupported_calls_are_raised():
     """
     Makes sure we raise a proper message to the user,
@@ -550,15 +637,16 @@ def test_feature_validator_imbalanced_data():
     validator.fit(X_train)

     train_feature_types = copy.deepcopy(validator.feat_type)
-    assert train_feature_types == ['numerical', 'numerical', 'numerical', 'numerical']
+    assert train_feature_types == ['numerical']

     # validator will throw an error if the column types are not the same
     transformed_X_test = validator.transform(X_test)
     transformed_X_test = pd.DataFrame(transformed_X_test)
-    null_columns = []
-    for column in transformed_X_test.columns:
-        if transformed_X_test[column].isna().all():
-            null_columns.append(column)
-    assert null_columns == [0, 2, 3]
+    assert sorted(validator.all_nan_columns) == sorted(['A', 'C', 'D'])
+    # as there are no categorical columns, we can make this assertion:
+    # only the all-NaN columns are expected to be dropped
+    total_all_nan_columns = len(validator.all_nan_columns)
+    total_columns = len(validator.column_order)
+    assert total_columns - total_all_nan_columns == len(transformed_X_test.columns)

     # Columns with not all null values in the train split and
     # completely null on the test split.
@@ -577,14 +665,10 @@ def test_feature_validator_imbalanced_data():
     X_test = pd.DataFrame.from_dict(test_features)
     validator = TabularFeatureValidator()
     validator.fit(X_train)
+
     train_feature_types = copy.deepcopy(validator.feat_type)
     assert train_feature_types == ['categorical', 'numerical', 'numerical']

     transformed_X_test = validator.transform(X_test)
     transformed_X_test = pd.DataFrame(transformed_X_test)
-    null_columns = []
-    for column in transformed_X_test.columns:
-        if transformed_X_test[column].isna().all():
-            null_columns.append(column)
-
-    assert null_columns == [1]
+    assert not len(validator.all_nan_columns)
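The arithmetic behind the updated imbalanced-data assertions can be checked in isolation. A hedged sketch with a made-up frame shaped like the test's train_features fixture (the actual fixture values are not shown in this diff):

```python
import numpy as np
import pandas as pd

# Hypothetical stand-in for the test's train_features fixture
X_train = pd.DataFrame({
    'A': [np.nan, np.nan, np.nan],
    'B': [1, 2, 3],
    'C': [np.nan, np.nan, np.nan],
    'D': [np.nan, np.nan, np.nan],
})

# Every column is either recorded as all-NaN or kept by the transformer,
# which is exactly the invariant the test asserts
all_nan_columns = set(X_train.columns[X_train.isna().all()])
kept_columns = [c for c in X_train.columns if c not in all_nan_columns]

assert sorted(all_nan_columns) == ['A', 'C', 'D']
assert len(X_train.columns) - len(all_nan_columns) == len(kept_columns) == 1
```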