From 1d14de32b4c6a2e0fd36ef2ae7f5c22750b90856 Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Tue, 22 Feb 2022 20:29:22 +0900 Subject: [PATCH] [fix] Fix the task inference issue mentioned in #352 Since sklearn task inference regards targets with integers as a classification task, I modified target_validator so that we always cast targets for regression to float. This workaround is mentioned in the reference below: https://github.com/scikit-learn/scikit-learn/issues/8952 --- autoPyTorch/data/base_feature_validator.py | 18 ++--- autoPyTorch/data/base_target_validator.py | 24 +++--- autoPyTorch/data/base_validator.py | 28 +++---- autoPyTorch/data/tabular_feature_validator.py | 22 ++--- autoPyTorch/data/tabular_target_validator.py | 80 ++++++++++--------- 5 files changed, 88 insertions(+), 84 deletions(-) diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index 6ef7cae6b..2d0ecf988 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -12,7 +12,7 @@ from autoPyTorch.utils.logging_ import PicklableClientLogger -SUPPORTED_FEAT_TYPES = Union[ +SupportedFeatTypes = Union[ List, pd.DataFrame, np.ndarray, @@ -68,8 +68,8 @@ def __init__( def fit( self, - X_train: SUPPORTED_FEAT_TYPES, - X_test: Optional[SUPPORTED_FEAT_TYPES] = None, + X_train: SupportedFeatTypes, + X_test: Optional[SupportedFeatTypes] = None, ) -> BaseEstimator: """ Validates and fit a categorical encoder (if needed) to the features. @@ -77,10 +77,10 @@ def fit( CSR sparse data types are also supported Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding - X_test (Optional[SUPPORTED_FEAT_TYPES]): + X_test (Optional[SupportedFeatTypes]): A hold out set of data used for checking """ @@ -109,11 +109,11 @@ def fit( def _fit( self, - X: SUPPORTED_FEAT_TYPES, + X: SupportedFeatTypes, ) -> BaseEstimator: """ Args: - X (SUPPORTED_FEAT_TYPES): + X (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding Returns: @@ -124,11 +124,11 @@ def _fit( def transform( self, - X: SUPPORTED_FEAT_TYPES, + X: SupportedFeatTypes, ) -> np.ndarray: """ Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features, whose categorical features are going to be transformed diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py index 393f3d85b..1b8ce124a 100644 --- a/autoPyTorch/data/base_target_validator.py +++ b/autoPyTorch/data/base_target_validator.py @@ -12,7 +12,7 @@ from autoPyTorch.utils.logging_ import PicklableClientLogger -SUPPORTED_TARGET_TYPES = Union[ +SupportedTargetTypes = Union[ List, pd.Series, pd.DataFrame, @@ -69,17 +69,17 @@ def __init__(self, def fit( self, - y_train: SUPPORTED_TARGET_TYPES, - y_test: Optional[SUPPORTED_TARGET_TYPES] = None, + y_train: SupportedTargetTypes, + y_test: Optional[SupportedTargetTypes] = None, ) -> BaseEstimator: """ Validates and fit a categorical encoder (if needed) to the targets The supported data types are List, numpy arrays and pandas DataFrames. Args: - y_train (SUPPORTED_TARGET_TYPES) + y_train (SupportedTargetTypes) A set of targets set aside for training - y_test (Union[SUPPORTED_TARGET_TYPES]) + y_test (Union[SupportedTargetTypes]) A hold out set of data used of the targets. It is also used to fit the categories of the encoder. """ @@ -128,26 +128,26 @@ def fit( def _fit( self, - y_train: SUPPORTED_TARGET_TYPES, - y_test: Optional[SUPPORTED_TARGET_TYPES] = None, + y_train: SupportedTargetTypes, + y_test: Optional[SupportedTargetTypes] = None, ) -> BaseEstimator: """ Args: - y_train (SUPPORTED_TARGET_TYPES) + y_train (SupportedTargetTypes) The labels of the current task. They are going to be encoded in case of classification - y_test (Optional[SUPPORTED_TARGET_TYPES]) + y_test (Optional[SupportedTargetTypes]) A holdout set of labels """ raise NotImplementedError() def transform( self, - y: Union[SUPPORTED_TARGET_TYPES], + y: Union[SupportedTargetTypes], ) -> np.ndarray: """ Args: - y (SUPPORTED_TARGET_TYPES) + y (SupportedTargetTypes) A set of targets that are going to be encoded if the current task is classification Returns: @@ -158,7 +158,7 @@ def transform( def inverse_transform( self, - y: SUPPORTED_TARGET_TYPES, + y: SupportedTargetTypes, ) -> np.ndarray: """ Revert any encoding transformation done on a target array diff --git a/autoPyTorch/data/base_validator.py b/autoPyTorch/data/base_validator.py index 13bb421c7..bebddff49 100644 --- a/autoPyTorch/data/base_validator.py +++ b/autoPyTorch/data/base_validator.py @@ -7,8 +7,8 @@ from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError -from autoPyTorch.data.base_feature_validator import SUPPORTED_FEAT_TYPES -from autoPyTorch.data.base_target_validator import SUPPORTED_TARGET_TYPES +from autoPyTorch.data.base_feature_validator import SupportedFeatTypes +from autoPyTorch.data.base_target_validator import SupportedTargetTypes class BaseInputValidator(BaseEstimator): @@ -40,10 +40,10 @@ def __init__( def fit( self, - X_train: SUPPORTED_FEAT_TYPES, - y_train: SUPPORTED_TARGET_TYPES, - X_test: Optional[SUPPORTED_FEAT_TYPES] = None, - y_test: Optional[SUPPORTED_TARGET_TYPES] = None, + X_train: SupportedFeatTypes, + y_train: SupportedTargetTypes, + X_test: Optional[SupportedFeatTypes] = None, + y_test: Optional[SupportedTargetTypes] = None, ) -> BaseEstimator: """ Validates and fit a categorical encoder (if needed) to the features, and @@ -59,15 +59,15 @@ def fit( + If performing a classification task, the data is going to be encoded Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks). If this data contains categorical columns, an encoder is going to be instantiated and trained with this data. - y_train (SUPPORTED_TARGET_TYPES): + y_train (SupportedTargetTypes): A set of targets that are going to be encoded if the task is for classification - X_test (Optional[SUPPORTED_FEAT_TYPES]): + X_test (Optional[SupportedFeatTypes]): A hold out set of features used for checking - y_test (SUPPORTED_TARGET_TYPES): + y_test (SupportedTargetTypes): A hold out set of targets used for checking. Additionally, if the current task is a classification task, this y_test categories are also going to be used to fit a pre-processing encoding (to prevent errors on unseen classes). @@ -96,16 +96,16 @@ def fit( def transform( self, - X: SUPPORTED_FEAT_TYPES, - y: Optional[SUPPORTED_TARGET_TYPES] = None, + X: SupportedFeatTypes, + y: Optional[SupportedTargetTypes] = None, ) -> Tuple[np.ndarray, Optional[np.ndarray]]: """ Transform the given target or features to a numpy array Args: - X (SUPPORTED_FEAT_TYPES): + X (SupportedFeatTypes): A set of features to transform - y (Optional[SUPPORTED_TARGET_TYPES]): + y (Optional[SupportedTargetTypes]): A set of targets to transform Returns: diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 27ed18cfc..4bab001c6 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -16,7 +16,7 @@ from sklearn.impute import SimpleImputer from sklearn.pipeline import make_pipeline -from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES +from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes def _create_column_transformer( @@ -117,7 +117,7 @@ def _comparator(cmp1: str, cmp2: str) -> int: def _fit( self, - X: SUPPORTED_FEAT_TYPES, + X: SupportedFeatTypes, ) -> BaseEstimator: """ In case input data is a pandas DataFrame, this utility encodes the user provided @@ -125,7 +125,7 @@ def _fit( will be able to use Args: - X (SUPPORTED_FEAT_TYPES): + X (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and an encoder fitted in the case the data needs encoding @@ -204,14 +204,14 @@ def _fit( def transform( self, - X: SUPPORTED_FEAT_TYPES, + X: SupportedFeatTypes, ) -> np.ndarray: """ Validates and fit a categorical encoder (if needed) to the features. The supported data types are List, numpy arrays and pandas DataFrames. Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features, whose categorical features are going to be transformed @@ -276,13 +276,13 @@ def transform( def _check_data( self, - X: SUPPORTED_FEAT_TYPES, + X: SupportedFeatTypes, ) -> None: """ Feature dimensionality and data type checks Args: - X (SUPPORTED_FEAT_TYPES): + X (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and an encoder fitted in the case the data needs encoding """ @@ -429,8 +429,8 @@ def _get_columns_to_encode( def list_to_dataframe( self, - X_train: SUPPORTED_FEAT_TYPES, - X_test: Optional[SUPPORTED_FEAT_TYPES] = None, + X_train: SupportedFeatTypes, + X_test: Optional[SupportedFeatTypes] = None, ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]: """ Converts a list to a pandas DataFrame. In this process, column types are inferred. @@ -438,10 +438,10 @@ def list_to_dataframe( If test data is provided, we proactively match it to train data Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding - X_test (Optional[SUPPORTED_FEAT_TYPES]): + X_test (Optional[SupportedFeatTypes]): A hold out set of data used for checking Returns: diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py index c37dc81c3..a60c45831 100644 --- a/autoPyTorch/data/tabular_target_validator.py +++ b/autoPyTorch/data/tabular_target_validator.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Union, cast +from typing import List, Optional, cast import numpy as np @@ -13,14 +13,19 @@ from sklearn.exceptions import NotFittedError from sklearn.utils.multiclass import type_of_target -from autoPyTorch.data.base_target_validator import BaseTargetValidator, SUPPORTED_TARGET_TYPES +from autoPyTorch.data.base_target_validator import BaseTargetValidator, SupportedTargetTypes + + +def _check_and_to_numpy(y: SupportedTargetTypes) -> np.ndarray: + """ sklearn check array will make sure we have the correct numerical features for the array """ + return sklearn.utils.check_array(y, force_all_finite=True, accept_sparse='csr', ensure_2d=False) class TabularTargetValidator(BaseTargetValidator): def _fit( self, - y_train: SUPPORTED_TARGET_TYPES, - y_test: Optional[SUPPORTED_TARGET_TYPES] = None, + y_train: SupportedTargetTypes, + y_test: Optional[SupportedTargetTypes] = None, ) -> BaseEstimator: """ If dealing with classification, this utility encodes the targets. @@ -29,10 +34,10 @@ def _fit( errors Args: - y_train (SUPPORTED_TARGET_TYPES) + y_train (SupportedTargetTypes) The labels of the current task. They are going to be encoded in case of classification - y_test (Optional[SUPPORTED_TARGET_TYPES]) + y_test (Optional[SupportedTargetTypes]) A holdout set of labels """ if not self.is_classification or self.type_of_target == 'multilabel-indicator': @@ -94,16 +99,34 @@ def _fit( return self + def _transform_by_encoder(self, y: SupportedTargetTypes) -> np.ndarray: + if self.encoder is None: + return _check_and_to_numpy(y) + + # remove ravel warning from pandas Series + shape = np.shape(y) + if len(shape) > 1: + y = self.encoder.transform(y) + elif hasattr(y, 'iloc'): + # The Ordinal encoder expects a 2 dimensional input. + # The targets are 1 dimensional, so reshape to match the expected shape + y = cast(pd.DataFrame, y) + y = self.encoder.transform(y.to_numpy().reshape(-1, 1)).reshape(-1) + else: + y = self.encoder.transform(np.array(y).reshape(-1, 1)).reshape(-1) + + return _check_and_to_numpy(y) + def transform( self, - y: Union[SUPPORTED_TARGET_TYPES], + y: SupportedTargetTypes, ) -> np.ndarray: """ Validates and fit a categorical encoder (if needed) to the features. The supported data types are List, numpy arrays and pandas DataFrames. Args: - y (SUPPORTED_TARGET_TYPES) + y (SupportedTargetTypes) A set of targets that are going to be encoded if the current task is classification @@ -116,47 +139,28 @@ def transform( # Check the data here so we catch problems on new test data self._check_data(y) + y = self._transform_by_encoder(y) - if self.encoder is not None: - # remove ravel warning from pandas Series - shape = np.shape(y) - if len(shape) > 1: - y = self.encoder.transform(y) - else: - # The Ordinal encoder expects a 2 dimensional input. - # The targets are 1 dimensional, so reshape to match the expected shape - if hasattr(y, 'iloc'): - y = cast(pd.DataFrame, y) - y = self.encoder.transform(y.to_numpy().reshape(-1, 1)).reshape(-1) - else: - y = self.encoder.transform(np.array(y).reshape(-1, 1)).reshape(-1) - - # sklearn check array will make sure we have the - # correct numerical features for the array - # Also, a numpy array will be created - y = sklearn.utils.check_array( - y, - force_all_finite=True, - accept_sparse='csr', - ensure_2d=False, - ) - - # When translating a dataframe to numpy, make sure we - # honor the ravel requirement + # When translating a dataframe to numpy, make sure we honor the ravel requirement if y.ndim == 2 and y.shape[1] == 1: y = np.ravel(y) + if not self.is_classification: + # Regression targets must be cast to float + # Ref: https://github.com/scikit-learn/scikit-learn/issues/8952 + y = y.astype(dtype=np.float64) + return y def inverse_transform( self, - y: SUPPORTED_TARGET_TYPES, + y: SupportedTargetTypes, ) -> np.ndarray: """ Revert any encoding transformation done on a target array Args: - y (Union[np.ndarray, pd.DataFrame, pd.Series]): + y (SupportedTargetTypes): Target array to be transformed back to original form before encoding Returns: np.ndarray: @@ -187,13 +191,13 @@ def inverse_transform( def _check_data( self, - y: SUPPORTED_TARGET_TYPES, + y: SupportedTargetTypes, ) -> None: """ Perform dimensionality and data type checks on the targets Args: - y (Union[np.ndarray, pd.DataFrame, pd.Series]): + y (SupportedTargetTypes): A set of features whose dimensionality and data type is going to be checked """