diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index 11c6cf577..bdc403a93 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -111,6 +111,20 @@ def _fit( """ raise NotImplementedError() + def _check_data( + self, + X: SUPPORTED_FEAT_TYPES, + ) -> None: + """ + Feature dimensionality and data type checks + + Arguments: + X (SUPPORTED_FEAT_TYPES): + A set of features that are going to be validated (type and dimensionality + checks) and an encoder fitted in the case the data needs encoding + """ + raise NotImplementedError() + def transform( self, X: SupportedFeatTypes, diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 8dad37205..54d162082 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -156,9 +156,13 @@ def _fit( # with nan values. # Columns that are completely made of NaN values are provided to the pipeline # so that later stages decide how to handle them + + # Clear whatever null column markers we had previously + self.null_columns.clear() if np.any(pd.isnull(X)): for column in X.columns: if X[column].isna().all(): + self.null_columns.add(column) X[column] = pd.to_numeric(X[column]) # Also note this change in self.dtypes if len(self.dtypes) != 0: @@ -167,9 +171,8 @@ def _fit( if not X.select_dtypes(include='object').empty: X = self.infer_objects(X) - self.transformed_columns, self.feat_type = self._get_columns_to_encode(X) - - assert self.feat_type is not None + self._check_data(X) + self.enc_columns, self.feat_type = self._get_columns_to_encode(X) if len(self.transformed_columns) > 0: @@ -238,11 +241,25 @@ def transform( if isinstance(X, np.ndarray): X = self.numpy_array_to_pandas(X) - if ispandas(X) and not issparse(X): - if np.any(pd.isnull(X)): - for column in X.columns: - if X[column].isna().all(): - X[column] = 
pd.to_numeric(X[column]) + if hasattr(X, "iloc") and not issparse(X): + X = cast(pd.DataFrame, X) + # If we had null columns in our fit call and we made them numeric, then: + # - If the columns are null even in transform, apply the same procedure. + # - Otherwise, substitute the values with np.NaN and then make the columns numeric. + # If the column is null here, but it was not in fit, it does not matter. + for column in self.null_columns: + # The column is not null, make it null since it was null in fit. + if not X[column].isna().all(): + X[column] = np.NaN + X[column] = pd.to_numeric(X[column]) + + # for the test set, if we have columns with only null values + # they will probably have a numeric type. If these columns were not + # with only null values in the train set, they should be converted + # to the type that they had during fitting. + for column in X.columns: + if X[column].isna().all(): + X[column] = X[column].astype(self.dtypes[list(X.columns).index(column)]) # Also remove the object dtype for new data if not X.select_dtypes(include='object').empty: @@ -250,18 +267,12 @@ def transform( # Check the data here so we catch problems on new test data self._check_data(X) + # We also need to fillna on the transformation + # in case test data is provided + X = self.impute_nan_in_categories(X) - # Pandas related transformations - if ispandas(X) and self.column_transformer is not None: - if np.any(pd.isnull(X)): - # After above check it means that if there is a NaN - # the whole column must be NaN - # Make sure it is numerical and let the pipeline handle it - for column in X.columns: - if X[column].isna().all(): - X[column] = pd.to_numeric(X[column]) - - X = self.column_transformer.transform(X) + if self.encoder is not None: + X = self.encoder.transform(X) # Sparse related transformations # Not all sparse format support index sorting @@ -488,7 +499,7 @@ def numpy_array_to_pandas( Returns: pd.DataFrame """ - return pd.DataFrame(X).infer_objects().convert_dtypes() + 
return pd.DataFrame(X).convert_dtypes() def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame: """ @@ -506,18 +517,13 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame: if hasattr(self, 'object_dtype_mapping'): # Mypy does not process the has attr. This dict is defined below for key, dtype in self.object_dtype_mapping.items(): # type: ignore[has-type] - if 'int' in dtype.name: - # In the case train data was interpreted as int - # and test data was interpreted as float, because of 0.0 - # for example, honor training data - X[key] = X[key].applymap(np.int64) - else: - try: - X[key] = X[key].astype(dtype.name) - except Exception as e: - # Try inference if possible - self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}") - pass + # honor the training data types + try: + X[key] = X[key].astype(dtype.name) + except Exception as e: + # Try inference if possible + self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}") + pass else: X = X.infer_objects() for column in X.columns: diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index 2daa271b7..87d728090 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -1,4 +1,4 @@ -import copy + import copy import functools import numpy as np @@ -139,9 +139,9 @@ def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest): if isinstance(input_data_featuretest, pd.DataFrame): pytest.skip("Column order change in pandas is not supported") elif isinstance(input_data_featuretest, np.ndarray): - complementary_type = pd.DataFrame(input_data_featuretest) + complementary_type = validator.numpy_array_to_pandas(input_data_featuretest) elif isinstance(input_data_featuretest, list): - complementary_type = pd.DataFrame(input_data_featuretest) + complementary_type, _ = validator.list_to_dataframe(input_data_featuretest) elif sparse.issparse(input_data_featuretest): complementary_type = 
sparse.csr_matrix(input_data_featuretest.todense()) else: @@ -331,8 +331,11 @@ def test_unknown_encode_value(): ) @pytest.mark.parametrize('train_data_type', ('numpy', 'pandas', 'list')) @pytest.mark.parametrize('test_data_type', ('numpy', 'pandas', 'list')) -def test_featurevalidator_new_data_after_fit(openml_id, - train_data_type, test_data_type): +def test_feature_validator_new_data_after_fit( + openml_id, + train_data_type, + test_data_type, +): # List is currently not supported as infer_objects # cast list objects to type objects @@ -406,3 +409,109 @@ def test_comparator(): key=functools.cmp_to_key(validator._comparator) ) assert ans == feat_type + + +# Actual checks for the features +@pytest.mark.parametrize( + 'input_data_featuretest', + ( + 'numpy_numericalonly_nonan', + 'numpy_numericalonly_nan', + 'numpy_mixed_nan', + 'pandas_numericalonly_nan', + 'sparse_bsr_nonan', + 'sparse_bsr_nan', + 'sparse_coo_nonan', + 'sparse_coo_nan', + 'sparse_csc_nonan', + 'sparse_csc_nan', + 'sparse_csr_nonan', + 'sparse_csr_nan', + 'sparse_dia_nonan', + 'sparse_dia_nan', + 'sparse_dok_nonan', + 'sparse_dok_nan', + 'openml_40981', # Australian + ), + indirect=True +) +def test_featurevalidator_reduce_precision(input_data_featuretest): + X_train, X_test = sklearn.model_selection.train_test_split( + input_data_featuretest, test_size=0.1, random_state=1) + validator = TabularFeatureValidator(dataset_compression={'memory_allocation': 0, 'methods': ['precision']}) + validator.fit(X_train=X_train) + transformed_X_train = validator.transform(X_train.copy()) + + assert validator._reduced_dtype is not None + assert megabytes(transformed_X_train) < megabytes(X_train) + + transformed_X_test = validator.transform(X_test.copy()) + assert megabytes(transformed_X_test) < megabytes(X_test) + if hasattr(transformed_X_train, 'iloc'): + assert all(transformed_X_train.dtypes == transformed_X_test.dtypes) + assert all(transformed_X_train.dtypes == validator._precision) + else: + assert 
transformed_X_train.dtype == transformed_X_test.dtype + assert transformed_X_test.dtype == validator._reduced_dtype + + +def test_feature_validator_imbalanced_data(): + + # Null columns in the train split but not necessarily in the test split + train_features = { + 'A': [np.NaN, np.NaN, np.NaN], + 'B': [1, 2, 3], + 'C': [np.NaN, np.NaN, np.NaN], + 'D': [np.NaN, np.NaN, np.NaN], + } + test_features = { + 'A': [3, 4, 5], + 'B': [6, 5, 7], + 'C': [np.NaN, np.NaN, np.NaN], + 'D': ['Blue', np.NaN, np.NaN], + } + + X_train = pd.DataFrame.from_dict(train_features) + X_test = pd.DataFrame.from_dict(test_features) + validator = TabularFeatureValidator() + validator.fit(X_train) + + train_feature_types = copy.deepcopy(validator.feat_type) + assert train_feature_types == ['numerical', 'numerical', 'numerical', 'numerical'] + # validator will throw an error if the column types are not the same + transformed_X_test = validator.transform(X_test) + transformed_X_test = pd.DataFrame(transformed_X_test) + null_columns = [] + for column in transformed_X_test.columns: + if transformed_X_test[column].isna().all(): + null_columns.append(column) + assert null_columns == [0, 2, 3] + + # Columns with not all null values in the train split and + # completely null on the test split. 
+ train_features = { + 'A': [np.NaN, np.NaN, 4], + 'B': [1, 2, 3], + 'C': ['Blue', np.NaN, np.NaN], + } + test_features = { + 'A': [np.NaN, np.NaN, np.NaN], + 'B': [6, 5, 7], + 'C': [np.NaN, np.NaN, np.NaN], + } + + X_train = pd.DataFrame.from_dict(train_features) + X_test = pd.DataFrame.from_dict(test_features) + validator = TabularFeatureValidator() + validator.fit(X_train) + train_feature_types = copy.deepcopy(validator.feat_type) + assert train_feature_types == ['categorical', 'numerical', 'numerical'] + + transformed_X_test = validator.transform(X_test) + transformed_X_test = pd.DataFrame(transformed_X_test) + null_columns = [] + for column in transformed_X_test.columns: + if transformed_X_test[column].isna().all(): + null_columns.append(column) + + assert null_columns == [1] diff --git a/test/test_data/test_validation.py b/test/test_data/test_validation.py index f7755e35e..f546357f5 100644 --- a/test/test_data/test_validation.py +++ b/test/test_data/test_validation.py @@ -32,7 +32,6 @@ def test_data_validation_for_classification(openmlid, as_frame): x, y, test_size=0.33, random_state=0) validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) - X_train_t, y_train_t = validator.transform(X_train, y_train) assert np.shape(X_train) == np.shape(X_train_t)