Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhancement for the tabular validator. #291

Merged
merged 25 commits into from
Oct 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
359b4c9
Initial try at an enhancement for the tabular validator
ArlindKadra Oct 1, 2021
65e8ffb
Adding a few type annotations
ArlindKadra Oct 1, 2021
217c38d
Fixing bugs in implementation
ArlindKadra Oct 1, 2021
f7dd8fe
Adding wrongly deleted code part during rebase
ArlindKadra Oct 1, 2021
92bd535
Fix bug in _get_args
ravinkohli Oct 2, 2021
5f672b5
Fix bug in _get_args
ravinkohli Oct 2, 2021
223c09e
Addressing Shuhei's comments
ArlindKadra Oct 3, 2021
a1ed883
Address Shuhei's comments
ArlindKadra Oct 3, 2021
f585310
Refactoring code
ArlindKadra Oct 6, 2021
f298c46
Refactoring code
ArlindKadra Oct 6, 2021
03bef16
Typos fix and additional comments
ArlindKadra Oct 6, 2021
a7d01f1
Replace nan in categoricals with simple imputer
ravinkohli Oct 7, 2021
38fe9e8
Remove unused function
ravinkohli Oct 7, 2021
7693753
add comment
ravinkohli Oct 7, 2021
f4cd3a4
Merge branch 'cocktail_fixes_time_debug' into tabular_validator_enhan…
ravinkohli Oct 7, 2021
497c546
Update autoPyTorch/data/tabular_feature_validator.py
ravinkohli Oct 7, 2021
9254eb2
Update autoPyTorch/data/tabular_feature_validator.py
ravinkohli Oct 7, 2021
b63ff3c
Adding unit test for only nall columns in the tabular feature categor…
ArlindKadra Oct 8, 2021
d5bbdbe
fix bug in remove all nan columns
ravinkohli Oct 8, 2021
bfe4899
Bug fix for making tests run by arlind
ravinkohli Oct 8, 2021
369edad
fix flake errors in feature validator
ravinkohli Oct 8, 2021
a4fb0cb
made typing code uniform
ravinkohli Oct 8, 2021
44229a6
Apply suggestions from code review
ravinkohli Oct 8, 2021
ba3c1e7
address comments from shuhei
ravinkohli Oct 8, 2021
10a8441
address comments from shuhei (2)
ravinkohli Oct 8, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 51 additions & 22 deletions autoPyTorch/data/base_feature_validator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging
import typing
from typing import List, Optional, Set, Tuple, Union

import numpy as np

Expand All @@ -12,8 +12,8 @@
from autoPyTorch.utils.logging_ import PicklableClientLogger


SUPPORTED_FEAT_TYPES = typing.Union[
typing.List,
SUPPORTED_FEAT_TYPES = Union[
List,
pd.DataFrame,
np.ndarray,
scipy.sparse.bsr_matrix,
Expand All @@ -35,43 +35,44 @@ class BaseFeatureValidator(BaseEstimator):
List of the column types found by this estimator during fit.
data_type (str):
Class name of the data type provided during fit.
encoder (typing.Optional[BaseEstimator])
encoder (Optional[BaseEstimator])
Host an encoder object if the data requires transformation (for example,
if provided a categorical column in a pandas DataFrame)
enc_columns (typing.List[str])
enc_columns (List[str])
List of columns that were encoded.
"""
def __init__(self,
logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger
]] = None,
logger: Optional[Union[PicklableClientLogger, logging.Logger
]
] = None,
) -> None:
# Register types to detect unsupported data format changes
self.feat_type = None # type: typing.Optional[typing.List[str]]
self.data_type = None # type: typing.Optional[type]
self.dtypes = [] # type: typing.List[str]
self.column_order = [] # type: typing.List[str]
self.feat_type: Optional[List[str]] = None
self.data_type: Optional[type] = None
self.dtypes: List[str] = []
self.column_order: List[str] = []

self.encoder = None # type: typing.Optional[BaseEstimator]
self.enc_columns = [] # type: typing.List[str]
self.encoder: Optional[BaseEstimator] = None
self.enc_columns: List[str] = []

self.logger: typing.Union[
self.logger: Union[
PicklableClientLogger, logging.Logger
] = logger if logger is not None else logging.getLogger(__name__)

# Required for dataset properties
self.num_features = None # type: typing.Optional[int]
self.categories = [] # type: typing.List[typing.List[int]]
self.categorical_columns: typing.List[int] = []
self.numerical_columns: typing.List[int] = []
# column identifiers may be integers or strings
self.null_columns: typing.Set[str] = set()
self.num_features: Optional[int] = None
self.categories: List[List[int]] = []
self.categorical_columns: List[int] = []
self.numerical_columns: List[int] = []

self.all_nan_columns: Optional[Set[Union[int, str]]] = None

self._is_fitted = False

def fit(
self,
X_train: SUPPORTED_FEAT_TYPES,
X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None,
X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
) -> BaseEstimator:
"""
Validates and fit a categorical encoder (if needed) to the features.
Expand All @@ -82,7 +83,7 @@ def fit(
X_train (SUPPORTED_FEAT_TYPES):
A set of features that are going to be validated (type and dimensionality
checks) and an encoder fitted in the case the data needs encoding
X_test (typing.Optional[SUPPORTED_FEAT_TYPES]):
X_test (Optional[SUPPORTED_FEAT_TYPES]):
A hold out set of data used for checking
"""

Expand Down Expand Up @@ -122,6 +123,7 @@ def _fit(
self:
The fitted base estimator
"""

raise NotImplementedError()

def _check_data(
Expand All @@ -136,6 +138,7 @@ def _check_data(
A set of features that are going to be validated (type and dimensionality
checks) and an encoder fitted in the case the data needs encoding
"""

raise NotImplementedError()

def transform(
Expand All @@ -152,4 +155,30 @@ def transform(
np.ndarray:
The transformed array
"""

raise NotImplementedError()

def list_to_dataframe(
ravinkohli marked this conversation as resolved.
Show resolved Hide resolved
ravinkohli marked this conversation as resolved.
Show resolved Hide resolved
self,
X_train: SUPPORTED_FEAT_TYPES,
X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
"""
Converts a list to a pandas DataFrame. In this process, column types are inferred.

If test data is provided, we proactively match it to train data

Arguments:
X_train (SUPPORTED_FEAT_TYPES):
A set of features that are going to be validated (type and dimensionality
checks) and an encoder fitted in the case the data needs encoding
X_test (Optional[SUPPORTED_FEAT_TYPES]):
A hold out set of data used for checking
Returns:
pd.DataFrame:
transformed train data from list to pandas DataFrame
pd.DataFrame:
transformed test data from list to pandas DataFrame
"""

raise NotImplementedError()
40 changes: 20 additions & 20 deletions autoPyTorch/data/base_target_validator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging
import typing
from typing import List, Optional, Union, cast

import numpy as np

Expand All @@ -12,8 +12,8 @@
from autoPyTorch.utils.logging_ import PicklableClientLogger


SUPPORTED_TARGET_TYPES = typing.Union[
typing.List,
SUPPORTED_TARGET_TYPES = Union[
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AutoPep8 rule

Suggested change
SUPPORTED_TARGET_TYPES = Union[
SupportedTargetTypes = Union[

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's keep this as part of a separate PR later.

List,
pd.Series,
pd.DataFrame,
np.ndarray,
Expand All @@ -35,39 +35,39 @@ class BaseTargetValidator(BaseEstimator):
is_classification (bool):
A bool that indicates if the validator should operate in classification mode.
During classification, the targets are encoded.
encoder (typing.Optional[BaseEstimator]):
encoder (Optional[BaseEstimator]):
Host an encoder object if the data requires transformation (for example,
if provided a categorical column in a pandas DataFrame)
enc_columns (typing.List[str])
enc_columns (List[str])
List of columns that were encoded
"""
def __init__(self,
is_classification: bool = False,
logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger
logger: Optional[Union[PicklableClientLogger, logging.Logger
]] = None,
) -> None:
self.is_classification = is_classification

self.data_type = None # type: typing.Optional[type]
self.data_type: Optional[type] = None

self.encoder = None # type: typing.Optional[BaseEstimator]
self.encoder: Optional[BaseEstimator] = None

self.out_dimensionality = None # type: typing.Optional[int]
self.type_of_target = None # type: typing.Optional[str]
self.out_dimensionality: Optional[int] = None
self.type_of_target: Optional[str] = None

self.logger: typing.Union[
self.logger: Union[
PicklableClientLogger, logging.Logger
] = logger if logger is not None else logging.getLogger(__name__)

# Store the dtype for remapping to correct type
self.dtype = None # type: typing.Optional[type]
self.dtype: Optional[type] = None

self._is_fitted = False

def fit(
self,
y_train: SUPPORTED_TARGET_TYPES,
y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
) -> BaseEstimator:
"""
Validates and fit a categorical encoder (if needed) to the targets
Expand All @@ -76,7 +76,7 @@ def fit(
Arguments:
y_train (SUPPORTED_TARGET_TYPES)
A set of targets set aside for training
y_test (typing.Union[SUPPORTED_TARGET_TYPES])
y_test (Union[SUPPORTED_TARGET_TYPES])
A hold out set of data used for the targets. It is also used to fit the
categories of the encoder.
"""
Expand All @@ -95,8 +95,8 @@ def fit(
np.shape(y_test)
))
if isinstance(y_train, pd.DataFrame):
y_train = typing.cast(pd.DataFrame, y_train)
y_test = typing.cast(pd.DataFrame, y_test)
y_train = cast(pd.DataFrame, y_train)
y_test = cast(pd.DataFrame, y_test)
if y_train.columns.tolist() != y_test.columns.tolist():
raise ValueError(
"Train and test targets must both have the same columns, yet "
Expand Down Expand Up @@ -127,21 +127,21 @@ def fit(
def _fit(
self,
y_train: SUPPORTED_TARGET_TYPES,
y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
) -> BaseEstimator:
"""
Arguments:
y_train (SUPPORTED_TARGET_TYPES)
The labels of the current task. They are going to be encoded in case
of classification
y_test (typing.Optional[SUPPORTED_TARGET_TYPES])
y_test (Optional[SUPPORTED_TARGET_TYPES])
A holdout set of labels
"""
raise NotImplementedError()

def transform(
self,
y: typing.Union[SUPPORTED_TARGET_TYPES],
y: Union[SUPPORTED_TARGET_TYPES],
) -> np.ndarray:
"""
Arguments:
Expand All @@ -162,7 +162,7 @@ def inverse_transform(
Revert any encoding transformation done on a target array

Arguments:
y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]):
y (Union[np.ndarray, pd.DataFrame, pd.Series]):
Target array to be transformed back to original form before encoding
Returns:
np.ndarray:
Expand Down
Loading