This repository has been archived by the owner on Mar 10, 2024. It is now read-only.

Commit

[fix] Fix the task inference issue mentioned in automl#352
Since sklearn's task inference regards integer targets as a
classification task, I modified target_validator so that regression
targets are always cast to float.
This workaround is mentioned in the reference below:
scikit-learn/scikit-learn#8952
nabenabe0928 committed Feb 22, 2022
1 parent b5c1757 commit 1d14de3
Showing 5 changed files with 88 additions and 84 deletions.
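To see the behaviour the commit message refers to, sklearn's own inference helper can be called directly. The snippet below is a standalone illustration (it is not part of this commit) using sklearn.utils.multiclass.type_of_target:

import numpy as np
from sklearn.utils.multiclass import type_of_target

# Integer-valued targets are inferred as a classification problem,
# even when they were meant as regression targets.
print(type_of_target(np.array([10, 20, 30, 40])))    # 'multiclass'

# Targets with a fractional part are recognised as continuous.
print(type_of_target(np.array([10.5, 20.1, 30.7])))  # 'continuous'

# An all-integral float array is still reported as 'multiclass', so a
# validator that already knows the task is regression must treat the
# targets as continuous regardless of what this helper reports.
print(type_of_target(np.array([10.0, 20.0, 30.0])))  # 'multiclass'
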
18 changes: 9 additions & 9 deletions autoPyTorch/data/base_feature_validator.py
@@ -12,7 +12,7 @@
from autoPyTorch.utils.logging_ import PicklableClientLogger


- SUPPORTED_FEAT_TYPES = Union[
+ SupportedFeatTypes = Union[
List,
pd.DataFrame,
np.ndarray,
@@ -68,19 +68,19 @@ def __init__(

def fit(
self,
- X_train: SUPPORTED_FEAT_TYPES,
- X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
+ X_train: SupportedFeatTypes,
+ X_test: Optional[SupportedFeatTypes] = None,
) -> BaseEstimator:
"""
Validates and fit a categorical encoder (if needed) to the features.
The supported data types are List, numpy arrays and pandas DataFrames.
CSR sparse data types are also supported
Args:
- X_train (SUPPORTED_FEAT_TYPES):
+ X_train (SupportedFeatTypes):
A set of features that are going to be validated (type and dimensionality
checks) and a encoder fitted in the case the data needs encoding
- X_test (Optional[SUPPORTED_FEAT_TYPES]):
+ X_test (Optional[SupportedFeatTypes]):
A hold out set of data used for checking
"""

@@ -109,11 +109,11 @@ def fit(

def _fit(
self,
- X: SUPPORTED_FEAT_TYPES,
+ X: SupportedFeatTypes,
) -> BaseEstimator:
"""
Args:
- X (SUPPORTED_FEAT_TYPES):
+ X (SupportedFeatTypes):
A set of features that are going to be validated (type and dimensionality
checks) and a encoder fitted in the case the data needs encoding
Returns:
@@ -124,11 +124,11 @@ def _fit(

def transform(
self,
- X: SUPPORTED_FEAT_TYPES,
+ X: SupportedFeatTypes,
) -> np.ndarray:
"""
Args:
- X_train (SUPPORTED_FEAT_TYPES):
+ X_train (SupportedFeatTypes):
A set of features, whose categorical features are going to be
transformed
24 changes: 12 additions & 12 deletions autoPyTorch/data/base_target_validator.py
@@ -12,7 +12,7 @@
from autoPyTorch.utils.logging_ import PicklableClientLogger


- SUPPORTED_TARGET_TYPES = Union[
+ SupportedTargetTypes = Union[
List,
pd.Series,
pd.DataFrame,
@@ -69,17 +69,17 @@ def __init__(self,

def fit(
self,
- y_train: SUPPORTED_TARGET_TYPES,
- y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
+ y_train: SupportedTargetTypes,
+ y_test: Optional[SupportedTargetTypes] = None,
) -> BaseEstimator:
"""
Validates and fit a categorical encoder (if needed) to the targets
The supported data types are List, numpy arrays and pandas DataFrames.
Args:
- y_train (SUPPORTED_TARGET_TYPES)
+ y_train (SupportedTargetTypes)
A set of targets set aside for training
- y_test (Union[SUPPORTED_TARGET_TYPES])
+ y_test (Union[SupportedTargetTypes])
A hold out set of data used of the targets. It is also used to fit the
categories of the encoder.
"""
@@ -128,26 +128,26 @@ def fit(

def _fit(
self,
- y_train: SUPPORTED_TARGET_TYPES,
- y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
+ y_train: SupportedTargetTypes,
+ y_test: Optional[SupportedTargetTypes] = None,
) -> BaseEstimator:
"""
Args:
- y_train (SUPPORTED_TARGET_TYPES)
+ y_train (SupportedTargetTypes)
The labels of the current task. They are going to be encoded in case
of classification
- y_test (Optional[SUPPORTED_TARGET_TYPES])
+ y_test (Optional[SupportedTargetTypes])
A holdout set of labels
"""
raise NotImplementedError()

def transform(
self,
- y: Union[SUPPORTED_TARGET_TYPES],
+ y: Union[SupportedTargetTypes],
) -> np.ndarray:
"""
Args:
- y (SUPPORTED_TARGET_TYPES)
+ y (SupportedTargetTypes)
A set of targets that are going to be encoded if the current task
is classification
Returns:
@@ -158,7 +158,7 @@ def transform(

def inverse_transform(
self,
- y: SUPPORTED_TARGET_TYPES,
+ y: SupportedTargetTypes,
) -> np.ndarray:
"""
Revert any encoding transformation done on a target array
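The cast described in the commit message is applied in the tabular target validator, whose diff is not rendered on this page. As a rough, hypothetical sketch only (the function name and the is_classification flag are assumptions, not the project's code), the idea is:

import numpy as np

def cast_regression_targets(y: np.ndarray, is_classification: bool) -> np.ndarray:
    # Hand regression targets to the pipeline as floats so that
    # downstream type inference cannot mistake them for class labels.
    if not is_classification and not np.issubdtype(y.dtype, np.floating):
        return y.astype(np.float64)
    return y
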
28 changes: 14 additions & 14 deletions autoPyTorch/data/base_validator.py
@@ -7,8 +7,8 @@
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError

- from autoPyTorch.data.base_feature_validator import SUPPORTED_FEAT_TYPES
- from autoPyTorch.data.base_target_validator import SUPPORTED_TARGET_TYPES
+ from autoPyTorch.data.base_feature_validator import SupportedFeatTypes
+ from autoPyTorch.data.base_target_validator import SupportedTargetTypes


class BaseInputValidator(BaseEstimator):
@@ -40,10 +40,10 @@ def __init__(

def fit(
self,
- X_train: SUPPORTED_FEAT_TYPES,
- y_train: SUPPORTED_TARGET_TYPES,
- X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
- y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
+ X_train: SupportedFeatTypes,
+ y_train: SupportedTargetTypes,
+ X_test: Optional[SupportedFeatTypes] = None,
+ y_test: Optional[SupportedTargetTypes] = None,
) -> BaseEstimator:
"""
Validates and fit a categorical encoder (if needed) to the features, and
@@ -59,15 +59,15 @@ def fit(
+ If performing a classification task, the data is going to be encoded
Args:
- X_train (SUPPORTED_FEAT_TYPES):
+ X_train (SupportedFeatTypes):
A set of features that are going to be validated (type and dimensionality
checks). If this data contains categorical columns, an encoder is going to
be instantiated and trained with this data.
- y_train (SUPPORTED_TARGET_TYPES):
+ y_train (SupportedTargetTypes):
A set of targets that are going to be encoded if the task is for classification
- X_test (Optional[SUPPORTED_FEAT_TYPES]):
+ X_test (Optional[SupportedFeatTypes]):
A hold out set of features used for checking
- y_test (SUPPORTED_TARGET_TYPES):
+ y_test (SupportedTargetTypes):
A hold out set of targets used for checking. Additionally, if the current task
is a classification task, this y_test categories are also going to be used to
fit a pre-processing encoding (to prevent errors on unseen classes).
@@ -96,16 +96,16 @@ def fit(

def transform(
self,
- X: SUPPORTED_FEAT_TYPES,
- y: Optional[SUPPORTED_TARGET_TYPES] = None,
+ X: SupportedFeatTypes,
+ y: Optional[SupportedTargetTypes] = None,
) -> Tuple[np.ndarray, Optional[np.ndarray]]:
"""
Transform the given target or features to a numpy array
Args:
- X (SUPPORTED_FEAT_TYPES):
+ X (SupportedFeatTypes):
A set of features to transform
- y (Optional[SUPPORTED_TARGET_TYPES]):
+ y (Optional[SupportedTargetTypes]):
A set of targets to transform
Returns:
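For orientation, a typical call sequence against this interface is sketched below. TabularInputValidator and its is_classification argument are taken as assumptions about a concrete subclass; they do not appear in the lines above.

import numpy as np
from autoPyTorch.data.tabular_validator import TabularInputValidator  # assumed concrete subclass

X_train = np.random.rand(100, 4)
y_train = np.random.rand(100)  # regression targets

validator = TabularInputValidator(is_classification=False)  # assumed signature
validator.fit(X_train=X_train, y_train=y_train)

# transform() returns numpy arrays ready for the rest of the pipeline
X_arr, y_arr = validator.transform(X_train, y_train)
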
22 changes: 11 additions & 11 deletions autoPyTorch/data/tabular_feature_validator.py
@@ -16,7 +16,7 @@
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

- from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES
+ from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes


def _create_column_transformer(
@@ -117,15 +117,15 @@ def _comparator(cmp1: str, cmp2: str) -> int:

def _fit(
self,
- X: SUPPORTED_FEAT_TYPES,
+ X: SupportedFeatTypes,
) -> BaseEstimator:
"""
In case input data is a pandas DataFrame, this utility encodes the user provided
features (from categorical for example) to a numerical value that further stages
will be able to use
Args:
- X (SUPPORTED_FEAT_TYPES):
+ X (SupportedFeatTypes):
A set of features that are going to be validated (type and dimensionality
checks) and an encoder fitted in the case the data needs encoding
@@ -204,14 +204,14 @@ def _fit(

def transform(
self,
- X: SUPPORTED_FEAT_TYPES,
+ X: SupportedFeatTypes,
) -> np.ndarray:
"""
Validates and fit a categorical encoder (if needed) to the features.
The supported data types are List, numpy arrays and pandas DataFrames.
Args:
- X_train (SUPPORTED_FEAT_TYPES):
+ X_train (SupportedFeatTypes):
A set of features, whose categorical features are going to be
transformed
@@ -276,13 +276,13 @@ def transform(

def _check_data(
self,
- X: SUPPORTED_FEAT_TYPES,
+ X: SupportedFeatTypes,
) -> None:
"""
Feature dimensionality and data type checks
Args:
- X (SUPPORTED_FEAT_TYPES):
+ X (SupportedFeatTypes):
A set of features that are going to be validated (type and dimensionality
checks) and an encoder fitted in the case the data needs encoding
"""
@@ -429,19 +429,19 @@ def _get_columns_to_encode(

def list_to_dataframe(
self,
- X_train: SUPPORTED_FEAT_TYPES,
- X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
+ X_train: SupportedFeatTypes,
+ X_test: Optional[SupportedFeatTypes] = None,
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
"""
Converts a list to a pandas DataFrame. In this process, column types are inferred.
If test data is provided, we proactively match it to train data
Args:
- X_train (SUPPORTED_FEAT_TYPES):
+ X_train (SupportedFeatTypes):
A set of features that are going to be validated (type and dimensionality
checks) and a encoder fitted in the case the data needs encoding
- X_test (Optional[SUPPORTED_FEAT_TYPES]):
+ X_test (Optional[SupportedFeatTypes]):
A hold out set of data used for checking
Returns:
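The column-type inference that list_to_dataframe relies on is pandas' own DataFrame construction; a standalone example of that behaviour (not taken from this file):

import pandas as pd

# pandas infers one dtype per column when building a DataFrame from rows
rows = [[1, "a", 0.5], [2, "b", 1.5]]
df = pd.DataFrame(rows)
print(df.dtypes)  # int64, object, float64

# A column of whole numbers stays integer, which is exactly the situation
# that makes integer regression targets look like class labels.
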
