Bug fixes #249

Merged
Changes from all commits (27 commits)
a7a94e8  Update implementation (ArlindKadra, Jun 4, 2021)
3b7f559  Coding style fixes (ArlindKadra, Jun 7, 2021)
11e7021  Implementation update (ArlindKadra, Jun 7, 2021)
375c055  Style fix (ArlindKadra, Jun 7, 2021)
3413bc3  Turn weighted loss into a constant again, implementation update (ArlindKadra, Jun 8, 2021)
d37d4a5  Cocktail branch inconsistencies (#275) (ravinkohli, Jul 21, 2021)
23466f0  Cocktail fixes time debug (#286) (ravinkohli, Oct 20, 2021)
00f80cb  Addressing Shuhei's comments (ArlindKadra, Oct 20, 2021)
88e0228  flake8 problems fix (ArlindKadra, Oct 20, 2021)
3b6ec03  Update autoPyTorch/api/base_task.py (ArlindKadra, Oct 21, 2021)
a26edbe  Update autoPyTorch/api/base_task.py (ArlindKadra, Oct 21, 2021)
73a11c9  Update autoPyTorch/data/tabular_feature_validator.py (ArlindKadra, Oct 21, 2021)
37e3537  Update autoPyTorch/pipeline/components/setup/network_backbone/utils.py (ArlindKadra, Oct 21, 2021)
dc5e8a2  Update autoPyTorch/data/tabular_feature_validator.py (ArlindKadra, Oct 21, 2021)
48b16a3  Update autoPyTorch/utils/implementations.py (ArlindKadra, Oct 21, 2021)
dab2f76  Allow the number of threads to be given by the user (ArlindKadra, Oct 21, 2021)
6f0aecb  Removing unnecessary argument and refactoring the attribute. (ArlindKadra, Oct 21, 2021)
84d7406  Addressing Ravin's comments (ArlindKadra, Oct 21, 2021)
9f8ebb5  Update autoPyTorch/pipeline/components/setup/network_backbone/utils.py (ArlindKadra, Oct 21, 2021)
1488978  Update autoPyTorch/pipeline/components/setup/network_backbone/utils.py (ArlindKadra, Oct 21, 2021)
a044a19  Merge branch 'refactor_development_regularization_cocktails' into coc… (ravinkohli, Oct 21, 2021)
6c8a55b  add todo for backend and accept changes from shuhei (ravinkohli, Oct 21, 2021)
e9dfea9  Addressing Shuhei's and Ravin's comments (ArlindKadra, Oct 21, 2021)
88893a9  Addressing Shuhei's and Ravin's comments, bug fix (ArlindKadra, Oct 21, 2021)
da6e47c  Update autoPyTorch/pipeline/components/setup/network_backbone/ResNetB… (ArlindKadra, Oct 21, 2021)
2740052  Update autoPyTorch/pipeline/components/setup/network_backbone/ResNetB… (ArlindKadra, Oct 21, 2021)
e597951  bug fix (ArlindKadra, Oct 21, 2021)
348 changes: 290 additions & 58 deletions autoPyTorch/api/base_task.py

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions autoPyTorch/api/tabular_classification.py
@@ -27,11 +27,14 @@
class TabularClassificationTask(BaseTask):
"""
Tabular Classification API to the pipelines.

Args:
seed (int):
seed to be used for reproducibility.
n_jobs (int), (default=1):
number of consecutive processes to spawn.
n_threads (int), (default=1):
number of threads to use for each process.
logging_config (Optional[Dict]):
specifies configuration for logging, if None, it is loaded from the logging.yaml
ensemble_size (int), (default=50):
@@ -63,6 +66,7 @@ def __init__(
self,
seed: int = 1,
n_jobs: int = 1,
n_threads: int = 1,
logging_config: Optional[Dict] = None,
ensemble_size: int = 50,
ensemble_nbest: int = 50,
@@ -83,6 +87,7 @@ def __init__(
super().__init__(
seed=seed,
n_jobs=n_jobs,
n_threads=n_threads,
logging_config=logging_config,
ensemble_size=ensemble_size,
ensemble_nbest=ensemble_nbest,
@@ -277,6 +282,8 @@ def search(
y_test=y_test,
dataset_name=dataset_name)

if self.dataset is None:
raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
return self._search(
dataset=self.dataset,
optimize_metric=optimize_metric,
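
Note: a minimal sketch of how the new n_threads argument is meant to be used, assuming this branch is installed; the toy data, metric, and time limit below are illustrative, not part of the diff:

import numpy as np

from autoPyTorch.api.tabular_classification import TabularClassificationTask

# Toy data; any tabular features/targets work here.
X = np.random.rand(200, 4)
y = np.random.randint(0, 2, size=200)

# n_jobs spawns worker processes; n_threads caps the threads
# (e.g. OpenMP/torch) available to each of those processes.
api = TabularClassificationTask(seed=1, n_jobs=2, n_threads=2)
api.search(
    X_train=X,
    y_train=y,
    optimize_metric='accuracy',
    total_walltime_limit=60,
)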
11 changes: 9 additions & 2 deletions autoPyTorch/api/tabular_regression.py
@@ -27,9 +27,13 @@
class TabularRegressionTask(BaseTask):
"""
Tabular Regression API to the pipelines.

Args:
seed (int): seed to be used for reproducibility.
-n_jobs (int), (default=1): number of consecutive processes to spawn.
+n_jobs (int), (default=1):
+    number of consecutive processes to spawn.
+n_threads (int), (default=1):
+    number of threads to use for each process.
logging_config (Optional[Dict]): specifies configuration
for logging, if None, it is loaded from the logging.yaml
ensemble_size (int), (default=50): Number of models added to the ensemble built by
@@ -50,11 +54,11 @@ class TabularRegressionTask(BaseTask):
Otherwise specifies set of components not to use. Incompatible with include
components
"""

def __init__(
self,
seed: int = 1,
n_jobs: int = 1,
n_threads: int = 1,
logging_config: Optional[Dict] = None,
ensemble_size: int = 50,
ensemble_nbest: int = 50,
@@ -75,6 +79,7 @@ def __init__(
super().__init__(
seed=seed,
n_jobs=n_jobs,
n_threads=n_threads,
logging_config=logging_config,
ensemble_size=ensemble_size,
ensemble_nbest=ensemble_nbest,
@@ -263,6 +268,8 @@ def search(
y_test=y_test,
dataset_name=dataset_name)

if self.dataset is None:
raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
return self._search(
dataset=self.dataset,
optimize_metric=optimize_metric,
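
Note: both search methods now narrow the Optional dataset attribute before delegating to _search. A generic sketch of the guard pattern (hypothetical class, not autoPyTorch code):

from typing import Any, Optional

class SketchTask:
    def __init__(self) -> None:
        self.dataset: Optional[Any] = None  # populated by a validation step

    def search(self) -> Any:
        # Fail fast with a descriptive error (and let mypy narrow the type)
        # instead of hitting an AttributeError deeper in the call stack.
        if self.dataset is None:
            raise ValueError("`dataset` in {} must be initialized, but got None".format(
                self.__class__.__name__))
        return self._search(dataset=self.dataset)

    def _search(self, dataset: Any) -> Any:
        return dataset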
90 changes: 58 additions & 32 deletions autoPyTorch/data/base_feature_validator.py
@@ -1,5 +1,5 @@
import logging
-import typing
+from typing import List, Optional, Set, Tuple, Union

import numpy as np

@@ -12,8 +12,8 @@
from autoPyTorch.utils.logging_ import PicklableClientLogger


-SUPPORTED_FEAT_TYPES = typing.Union[
-    typing.List,
+SUPPORTED_FEAT_TYPES = Union[
+    List,
pd.DataFrame,
np.ndarray,
scipy.sparse.bsr_matrix,
@@ -29,66 +29,64 @@
class BaseFeatureValidator(BaseEstimator):
"""
A class to pre-process features. In this regards, the format of the data is checked,
-and if applicable, features are encoded
+and if applicable, features are encoded.

Attributes:
feat_type (List[str]):
List of the column types found by this estimator during fit.
data_type (str):
Class name of the data type provided during fit.
-encoder (typing.Optional[BaseEstimator])
+encoder (Optional[BaseEstimator])
Host a encoder object if the data requires transformation (for example,
-if provided a categorical column in a pandas DataFrame)
-enc_columns (typing.List[str])
-    List of columns that were encoded.
+if provided a categorical column in a pandas DataFrame).
"""
-def __init__(self,
-             logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger
-                                                  ]] = None,
-             ) -> None:
+def __init__(
+    self,
+    logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None,
+) -> None:
# Register types to detect unsupported data format changes
-self.feat_type = None  # type: typing.Optional[typing.List[str]]
-self.data_type = None  # type: typing.Optional[type]
-self.dtypes = []  # type: typing.List[str]
-self.column_order = []  # type: typing.List[str]
+self.feat_type: Optional[List[str]] = None
+self.data_type: Optional[type] = None
+self.dtypes: List[str] = []
+self.column_order: List[str] = []

-self.encoder = None  # type: typing.Optional[BaseEstimator]
-self.enc_columns = []  # type: typing.List[str]
+self.column_transformer: Optional[BaseEstimator] = None

-self.logger: typing.Union[
+self.logger: Union[
PicklableClientLogger, logging.Logger
] = logger if logger is not None else logging.getLogger(__name__)

# Required for dataset properties
-self.num_features = None  # type: typing.Optional[int]
-self.categories = []  # type: typing.List[typing.List[int]]
-self.categorical_columns: typing.List[int] = []
-self.numerical_columns: typing.List[int] = []
-# column identifiers may be integers or strings
-self.null_columns: typing.Set[str] = set()
+self.num_features: Optional[int] = None
+self.categories: List[List[int]] = []
+self.categorical_columns: List[int] = []
+self.numerical_columns: List[int] = []
+
+self.all_nan_columns: Optional[Set[Union[int, str]]] = None

self._is_fitted = False

def fit(
self,
X_train: SUPPORTED_FEAT_TYPES,
-X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None,
+X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
) -> BaseEstimator:
"""
Validates and fit a categorical encoder (if needed) to the features.
The supported data types are List, numpy arrays and pandas DataFrames.
CSR sparse data types are also supported

-Arguments:
+Args:
X_train (SUPPORTED_FEAT_TYPES):
A set of features that are going to be validated (type and dimensionality
checks) and a encoder fitted in the case the data needs encoding
-X_test (typing.Optional[SUPPORTED_FEAT_TYPES]):
+X_test (Optional[SUPPORTED_FEAT_TYPES]):
A hold out set of data used for checking
"""

# If a list was provided, it will be converted to pandas
if isinstance(X_train, list):
-X_train, X_test = self.list_to_dataframe(X_train, X_test)
+X_train, X_test = self.list_to_pandas(X_train, X_test)

self._check_data(X_train)

@@ -114,14 +112,15 @@ def _fit(
X: SUPPORTED_FEAT_TYPES,
) -> BaseEstimator:
"""
-Arguments:
+Args:
X (SUPPORTED_FEAT_TYPES):
A set of features that are going to be validated (type and dimensionality
checks) and a encoder fitted in the case the data needs encoding
Returns:
self:
The fitted base estimator
"""

raise NotImplementedError()

def _check_data(
@@ -131,19 +130,20 @@ def _check_data(
"""
Feature dimensionality and data type checks

-Arguments:
+Args:
X (SUPPORTED_FEAT_TYPES):
A set of features that are going to be validated (type and dimensionality
checks) and a encoder fitted in the case the data needs encoding
"""

raise NotImplementedError()

def transform(
self,
X: SUPPORTED_FEAT_TYPES,
) -> np.ndarray:
"""
-Arguments:
+Args:
X_train (SUPPORTED_FEAT_TYPES):
A set of features, whose categorical features are going to be
transformed
@@ -152,4 +152,30 @@ def transform(
np.ndarray:
The transformed array
"""

raise NotImplementedError()

def list_to_pandas(
self,
X_train: SUPPORTED_FEAT_TYPES,
X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
"""
Converts a list to a pandas DataFrame. In this process, column types are inferred.

If test data is provided, we proactively match it to train data

Args:
X_train (SUPPORTED_FEAT_TYPES):
A set of features that are going to be validated (type and dimensionality
checks) and a encoder fitted in the case the data needs encoding
X_test (Optional[SUPPORTED_FEAT_TYPES]):
A hold out set of data used for checking
Returns:
pd.DataFrame:
transformed train data from list to pandas DataFrame
pd.DataFrame:
transformed test data from list to pandas DataFrame
"""

raise NotImplementedError()
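
Note: list_to_pandas is abstract here; the concrete behaviour lives in the subclass. A rough sketch of what an implementation could do, assuming pandas' own dtype inference is acceptable (the actual TabularFeatureValidator may differ):

from typing import List, Optional, Tuple

import pandas as pd

def list_to_pandas_sketch(
    X_train: List,
    X_test: Optional[List] = None,
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
    # Build a DataFrame and let pandas infer per-column dtypes.
    train_df = pd.DataFrame(X_train).infer_objects()
    test_df = None
    if X_test is not None:
        # Proactively match the test frame to the train dtypes so both
        # sides are transformed consistently later on.
        test_df = pd.DataFrame(X_test).astype(train_df.dtypes.to_dict())
    return train_df, test_df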
52 changes: 27 additions & 25 deletions autoPyTorch/data/base_target_validator.py
@@ -1,5 +1,5 @@
import logging
-import typing
+from typing import List, Optional, Union, cast

import numpy as np

@@ -12,8 +12,8 @@
from autoPyTorch.utils.logging_ import PicklableClientLogger


-SUPPORTED_TARGET_TYPES = typing.Union[
-    typing.List,
+SUPPORTED_TARGET_TYPES = Union[
+    List,
pd.Series,
pd.DataFrame,
np.ndarray,
@@ -35,48 +35,50 @@ class BaseTargetValidator(BaseEstimator):
is_classification (bool):
A bool that indicates if the validator should operate in classification mode.
During classification, the targets are encoded.
-encoder (typing.Optional[BaseEstimator]):
+encoder (Optional[BaseEstimator]):
Host a encoder object if the data requires transformation (for example,
if provided a categorical column in a pandas DataFrame)
-enc_columns (typing.List[str])
+enc_columns (List[str])
List of columns that where encoded
"""
def __init__(self,
is_classification: bool = False,
-logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger
-                                     ]] = None,
+logger: Optional[Union[PicklableClientLogger,
+                       logging.Logger
+                       ]
+                 ] = None,
) -> None:
self.is_classification = is_classification

-self.data_type = None  # type: typing.Optional[type]
+self.data_type: Optional[type] = None

-self.encoder = None  # type: typing.Optional[BaseEstimator]
+self.encoder: Optional[BaseEstimator] = None

-self.out_dimensionality = None  # type: typing.Optional[int]
-self.type_of_target = None  # type: typing.Optional[str]
+self.out_dimensionality: Optional[int] = None
+self.type_of_target: Optional[str] = None

-self.logger: typing.Union[
+self.logger: Union[
PicklableClientLogger, logging.Logger
] = logger if logger is not None else logging.getLogger(__name__)

# Store the dtype for remapping to correct type
-self.dtype = None  # type: typing.Optional[type]
+self.dtype: Optional[type] = None

self._is_fitted = False

def fit(
self,
y_train: SUPPORTED_TARGET_TYPES,
-y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
+y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
) -> BaseEstimator:
"""
Validates and fit a categorical encoder (if needed) to the targets
The supported data types are List, numpy arrays and pandas DataFrames.

-Arguments:
+Args:
y_train (SUPPORTED_TARGET_TYPES)
A set of targets set aside for training
-y_test (typing.Union[SUPPORTED_TARGET_TYPES])
+y_test (Union[SUPPORTED_TARGET_TYPES])
A hold out set of data used of the targets. It is also used to fit the
categories of the encoder.
"""
@@ -95,8 +97,8 @@ def fit(
np.shape(y_test)
))
if isinstance(y_train, pd.DataFrame):
-y_train = typing.cast(pd.DataFrame, y_train)
-y_test = typing.cast(pd.DataFrame, y_test)
+y_train = cast(pd.DataFrame, y_train)
+y_test = cast(pd.DataFrame, y_test)
if y_train.columns.tolist() != y_test.columns.tolist():
raise ValueError(
"Train and test targets must both have the same columns, yet "
@@ -127,24 +129,24 @@ def fit(
def _fit(
self,
y_train: SUPPORTED_TARGET_TYPES,
-y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
+y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
) -> BaseEstimator:
"""
-Arguments:
+Args:
y_train (SUPPORTED_TARGET_TYPES)
The labels of the current task. They are going to be encoded in case
of classification
-y_test (typing.Optional[SUPPORTED_TARGET_TYPES])
+y_test (Optional[SUPPORTED_TARGET_TYPES])
A holdout set of labels
"""
raise NotImplementedError()

def transform(
self,
-y: typing.Union[SUPPORTED_TARGET_TYPES],
+y: Union[SUPPORTED_TARGET_TYPES],
) -> np.ndarray:
"""
-Arguments:
+Args:
y (SUPPORTED_TARGET_TYPES)
A set of targets that are going to be encoded if the current task
is classification
@@ -161,8 +163,8 @@ def inverse_transform(
"""
Revert any encoding transformation done on a target array

-Arguments:
-    y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]):
+Args:
+    y (Union[np.ndarray, pd.DataFrame, pd.Series]):
Target array to be transformed back to original form before encoding
Returns:
np.ndarray:
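
Note: for classification the fit/transform/inverse_transform contract amounts to an encoder round-trip over the targets. A self-contained sketch with scikit-learn (illustrative only, not the autoPyTorch internals):

import numpy as np
from sklearn.preprocessing import OrdinalEncoder

y_train = np.array(['cat', 'dog', 'cat', 'bird'])

# fit: learn the category -> integer mapping from the targets.
encoder = OrdinalEncoder()
y_encoded = encoder.fit_transform(y_train.reshape(-1, 1)).ravel()  # [1., 2., 1., 0.]

# inverse_transform: map encoded targets back to the original labels.
y_restored = encoder.inverse_transform(y_encoded.reshape(-1, 1)).ravel()
assert (y_restored == y_train).all()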