Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug fixes #249

Merged
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
a7a94e8
Update implementation
ArlindKadra Jun 4, 2021
3b7f559
Coding style fixes
ArlindKadra Jun 7, 2021
11e7021
Implementation update
ArlindKadra Jun 7, 2021
375c055
Style fix
ArlindKadra Jun 7, 2021
3413bc3
Turn weighted loss into a constant again, implementation update
ArlindKadra Jun 8, 2021
d37d4a5
Cocktail branch inconsistencies (#275)
ravinkohli Jul 21, 2021
23466f0
Cocktail fixes time debug (#286)
ravinkohli Oct 20, 2021
00f80cb
Addressing Shuhei's comments
ArlindKadra Oct 20, 2021
88e0228
flake8 problems fix
ArlindKadra Oct 20, 2021
3b6ec03
Update autoPyTorch/api/base_task.py
ArlindKadra Oct 21, 2021
a26edbe
Update autoPyTorch/api/base_task.py
ArlindKadra Oct 21, 2021
73a11c9
Update autoPyTorch/data/tabular_feature_validator.py
ArlindKadra Oct 21, 2021
37e3537
Update autoPyTorch/pipeline/components/setup/network_backbone/utils.py
ArlindKadra Oct 21, 2021
dc5e8a2
Update autoPyTorch/data/tabular_feature_validator.py
ArlindKadra Oct 21, 2021
48b16a3
Update autoPyTorch/utils/implementations.py
ArlindKadra Oct 21, 2021
dab2f76
Allow the number of threads to be given by the user
ArlindKadra Oct 21, 2021
6f0aecb
Removing unnecessary argument and refactoring the attribute.
ArlindKadra Oct 21, 2021
84d7406
Addressing Ravin's comments
ArlindKadra Oct 21, 2021
9f8ebb5
Update autoPyTorch/pipeline/components/setup/network_backbone/utils.py
ArlindKadra Oct 21, 2021
1488978
Update autoPyTorch/pipeline/components/setup/network_backbone/utils.py
ArlindKadra Oct 21, 2021
a044a19
Merge branch 'refactor_development_regularization_cocktails' into coc…
ravinkohli Oct 21, 2021
6c8a55b
add todo for backend and accept changes from shuhei
ravinkohli Oct 21, 2021
e9dfea9
Addressing Shuhei's and Ravin's comments
ArlindKadra Oct 21, 2021
88893a9
Addressing Shuhei's and Ravin's comments, bug fix
ArlindKadra Oct 21, 2021
da6e47c
Update autoPyTorch/pipeline/components/setup/network_backbone/ResNetB…
ArlindKadra Oct 21, 2021
2740052
Update autoPyTorch/pipeline/components/setup/network_backbone/ResNetB…
ArlindKadra Oct 21, 2021
e597951
bug fix
ArlindKadra Oct 21, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
346 changes: 288 additions & 58 deletions autoPyTorch/api/base_task.py

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions autoPyTorch/api/tabular_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,14 @@
class TabularClassificationTask(BaseTask):
"""
Tabular Classification API to the pipelines.

Args:
seed (int):
seed to be used for reproducibility.
n_jobs (int), (default=1):
number of consecutive processes to spawn.
nr_threads (int), (default=1):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

n_threads for the compatibility?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, done.

number of threads to use for each process.
logging_config (Optional[Dict]):
specifies configuration for logging, if None, it is loaded from the logging.yaml
ensemble_size (int), (default=50):
Expand Down Expand Up @@ -63,6 +66,7 @@ def __init__(
self,
seed: int = 1,
n_jobs: int = 1,
nr_threads: int = 1,
logging_config: Optional[Dict] = None,
ensemble_size: int = 50,
ensemble_nbest: int = 50,
Expand All @@ -83,6 +87,7 @@ def __init__(
super().__init__(
seed=seed,
n_jobs=n_jobs,
nr_threads=nr_threads,
logging_config=logging_config,
ensemble_size=ensemble_size,
ensemble_nbest=ensemble_nbest,
Expand Down Expand Up @@ -275,6 +280,8 @@ def search(
y_test=y_test,
dataset_name=dataset_name)

if self.dataset is None:
raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
return self._search(
dataset=self.dataset,
optimize_metric=optimize_metric,
Expand Down
11 changes: 9 additions & 2 deletions autoPyTorch/api/tabular_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,13 @@
class TabularRegressionTask(BaseTask):
"""
Tabular Regression API to the pipelines.

Args:
seed (int): seed to be used for reproducibility.
n_jobs (int), (default=1): number of consecutive processes to spawn.
n_jobs (int), (default=1):
number of consecutive processes to spawn.
nr_threads (int), (default=1):
number of threads to use for each process.
logging_config (Optional[Dict]): specifies configuration
for logging, if None, it is loaded from the logging.yaml
ensemble_size (int), (default=50): Number of models added to the ensemble built by
Expand All @@ -50,11 +54,11 @@ class TabularRegressionTask(BaseTask):
Otherwise specifies set of components not to use. Incompatible with include
components
"""

def __init__(
self,
seed: int = 1,
n_jobs: int = 1,
nr_threads: int = 1,
logging_config: Optional[Dict] = None,
ensemble_size: int = 50,
ensemble_nbest: int = 50,
Expand All @@ -75,6 +79,7 @@ def __init__(
super().__init__(
seed=seed,
n_jobs=n_jobs,
nr_threads=nr_threads,
logging_config=logging_config,
ensemble_size=ensemble_size,
ensemble_nbest=ensemble_nbest,
Expand Down Expand Up @@ -261,6 +266,8 @@ def search(
y_test=y_test,
dataset_name=dataset_name)

if self.dataset is None:
raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
return self._search(
dataset=self.dataset,
optimize_metric=optimize_metric,
Expand Down
90 changes: 58 additions & 32 deletions autoPyTorch/data/base_feature_validator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging
import typing
from typing import List, Optional, Set, Tuple, Union

import numpy as np

Expand All @@ -12,8 +12,8 @@
from autoPyTorch.utils.logging_ import PicklableClientLogger


SUPPORTED_FEAT_TYPES = typing.Union[
typing.List,
SUPPORTED_FEAT_TYPES = Union[
List,
pd.DataFrame,
np.ndarray,
scipy.sparse.bsr_matrix,
Expand All @@ -29,66 +29,64 @@
class BaseFeatureValidator(BaseEstimator):
"""
A class to pre-process features. In this regard, the format of the data is checked,
and if applicable, features are encoded
and if applicable, features are encoded.

Attributes:
feat_type (List[str]):
List of the column types found by this estimator during fit.
data_type (str):
Class name of the data type provided during fit.
encoder (typing.Optional[BaseEstimator])
encoder (Optional[BaseEstimator])
Host an encoder object if the data requires transformation (for example,
if provided a categorical column in a pandas DataFrame)
enc_columns (typing.List[str])
List of columns that were encoded.
if provided a categorical column in a pandas DataFrame).
"""
def __init__(self,
logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger
]] = None,
) -> None:
def __init__(
self,
logger: Optional[Union[PicklableClientLogger, logging.Logger]] = None,
) -> None:
# Register types to detect unsupported data format changes
self.feat_type = None # type: typing.Optional[typing.List[str]]
self.data_type = None # type: typing.Optional[type]
self.dtypes = [] # type: typing.List[str]
self.column_order = [] # type: typing.List[str]
self.feat_type: Optional[List[str]] = None
self.data_type: Optional[type] = None
self.dtypes: List[str] = []
self.column_order: List[str] = []

self.encoder = None # type: typing.Optional[BaseEstimator]
self.enc_columns = [] # type: typing.List[str]
self.column_transformer: Optional[BaseEstimator] = None

self.logger: typing.Union[
self.logger: Union[
PicklableClientLogger, logging.Logger
] = logger if logger is not None else logging.getLogger(__name__)

# Required for dataset properties
self.num_features = None # type: typing.Optional[int]
self.categories = [] # type: typing.List[typing.List[int]]
self.categorical_columns: typing.List[int] = []
self.numerical_columns: typing.List[int] = []
# column identifiers may be integers or strings
self.null_columns: typing.Set[str] = set()
self.num_features: Optional[int] = None
self.categories: List[List[int]] = []
self.categorical_columns: List[int] = []
self.numerical_columns: List[int] = []

self.all_nan_columns: Optional[Set[Union[int, str]]] = None

self._is_fitted = False

def fit(
self,
X_train: SUPPORTED_FEAT_TYPES,
X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None,
X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
) -> BaseEstimator:
"""
Validates and fit a categorical encoder (if needed) to the features.
The supported data types are List, numpy arrays and pandas DataFrames.
CSR sparse data types are also supported

Arguments:
Args:
X_train (SUPPORTED_FEAT_TYPES):
A set of features that are going to be validated (type and dimensionality
checks) and an encoder fitted in the case the data needs encoding
X_test (typing.Optional[SUPPORTED_FEAT_TYPES]):
X_test (Optional[SUPPORTED_FEAT_TYPES]):
A hold out set of data used for checking
"""

# If a list was provided, it will be converted to pandas
if isinstance(X_train, list):
X_train, X_test = self.list_to_dataframe(X_train, X_test)
X_train, X_test = self.list_to_pandas(X_train, X_test)

self._check_data(X_train)

Expand All @@ -114,14 +112,15 @@ def _fit(
X: SUPPORTED_FEAT_TYPES,
) -> BaseEstimator:
"""
Arguments:
Args:
X (SUPPORTED_FEAT_TYPES):
A set of features that are going to be validated (type and dimensionality
checks) and an encoder fitted in the case the data needs encoding
Returns:
self:
The fitted base estimator
"""

raise NotImplementedError()

def _check_data(
Expand All @@ -131,19 +130,20 @@ def _check_data(
"""
Feature dimensionality and data type checks

Arguments:
Args:
X (SUPPORTED_FEAT_TYPES):
A set of features that are going to be validated (type and dimensionality
checks) and an encoder fitted in the case the data needs encoding
"""

raise NotImplementedError()

def transform(
self,
X: SUPPORTED_FEAT_TYPES,
) -> np.ndarray:
"""
Arguments:
Args:
X_train (SUPPORTED_FEAT_TYPES):
A set of features, whose categorical features are going to be
transformed
Expand All @@ -152,4 +152,30 @@ def transform(
np.ndarray:
The transformed array
"""

raise NotImplementedError()

def list_to_pandas(
self,
X_train: SUPPORTED_FEAT_TYPES,
X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
"""
Converts a list to a pandas DataFrame. In this process, column types are inferred.

If test data is provided, we proactively match it to train data

Args:
X_train (SUPPORTED_FEAT_TYPES):
A set of features that are going to be validated (type and dimensionality
checks) and an encoder fitted in the case the data needs encoding
X_test (Optional[SUPPORTED_FEAT_TYPES]):
A hold out set of data used for checking
Returns:
pd.DataFrame:
transformed train data from list to pandas DataFrame
pd.DataFrame:
transformed test data from list to pandas DataFrame
"""

raise NotImplementedError()
52 changes: 27 additions & 25 deletions autoPyTorch/data/base_target_validator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging
import typing
from typing import List, Optional, Union, cast

import numpy as np

Expand All @@ -12,8 +12,8 @@
from autoPyTorch.utils.logging_ import PicklableClientLogger


SUPPORTED_TARGET_TYPES = typing.Union[
typing.List,
SUPPORTED_TARGET_TYPES = Union[
List,
pd.Series,
pd.DataFrame,
np.ndarray,
Expand All @@ -35,48 +35,50 @@ class BaseTargetValidator(BaseEstimator):
is_classification (bool):
A bool that indicates if the validator should operate in classification mode.
During classification, the targets are encoded.
encoder (typing.Optional[BaseEstimator]):
encoder (Optional[BaseEstimator]):
Host an encoder object if the data requires transformation (for example,
if provided a categorical column in a pandas DataFrame)
enc_columns (typing.List[str])
enc_columns (List[str])
List of columns that were encoded
"""
def __init__(self,
is_classification: bool = False,
logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger
]] = None,
logger: Optional[Union[PicklableClientLogger,
logging.Logger
]
] = None,
) -> None:
self.is_classification = is_classification

self.data_type = None # type: typing.Optional[type]
self.data_type: Optional[type] = None

self.encoder = None # type: typing.Optional[BaseEstimator]
self.encoder: Optional[BaseEstimator] = None

self.out_dimensionality = None # type: typing.Optional[int]
self.type_of_target = None # type: typing.Optional[str]
self.out_dimensionality: Optional[int] = None
self.type_of_target: Optional[str] = None

self.logger: typing.Union[
self.logger: Union[
PicklableClientLogger, logging.Logger
] = logger if logger is not None else logging.getLogger(__name__)

# Store the dtype for remapping to correct type
self.dtype = None # type: typing.Optional[type]
self.dtype: Optional[type] = None

self._is_fitted = False

def fit(
self,
y_train: SUPPORTED_TARGET_TYPES,
y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
) -> BaseEstimator:
"""
Validates and fit a categorical encoder (if needed) to the targets
The supported data types are List, numpy arrays and pandas DataFrames.

Arguments:
Args:
y_train (SUPPORTED_TARGET_TYPES)
A set of targets set aside for training
y_test (typing.Union[SUPPORTED_TARGET_TYPES])
y_test (Union[SUPPORTED_TARGET_TYPES])
A hold-out set of targets. It is also used to fit the
categories of the encoder.
"""
Expand All @@ -95,8 +97,8 @@ def fit(
np.shape(y_test)
))
if isinstance(y_train, pd.DataFrame):
y_train = typing.cast(pd.DataFrame, y_train)
y_test = typing.cast(pd.DataFrame, y_test)
y_train = cast(pd.DataFrame, y_train)
y_test = cast(pd.DataFrame, y_test)
if y_train.columns.tolist() != y_test.columns.tolist():
raise ValueError(
"Train and test targets must both have the same columns, yet "
Expand Down Expand Up @@ -127,24 +129,24 @@ def fit(
def _fit(
self,
y_train: SUPPORTED_TARGET_TYPES,
y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
) -> BaseEstimator:
"""
Arguments:
Args:
y_train (SUPPORTED_TARGET_TYPES)
The labels of the current task. They are going to be encoded in case
of classification
y_test (typing.Optional[SUPPORTED_TARGET_TYPES])
y_test (Optional[SUPPORTED_TARGET_TYPES])
A holdout set of labels
"""
raise NotImplementedError()

def transform(
self,
y: typing.Union[SUPPORTED_TARGET_TYPES],
y: Union[SUPPORTED_TARGET_TYPES],
) -> np.ndarray:
"""
Arguments:
Args:
y (SUPPORTED_TARGET_TYPES)
A set of targets that are going to be encoded if the current task
is classification
Expand All @@ -161,8 +163,8 @@ def inverse_transform(
"""
Revert any encoding transformation done on a target array

Arguments:
y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]):
Args:
y (Union[np.ndarray, pd.DataFrame, pd.Series]):
Target array to be transformed back to original form before encoding
Returns:
np.ndarray:
Expand Down
Loading