Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhancement for the tabular validator. #291

Merged
merged 25 commits into from
Oct 8, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
359b4c9
Initial try at an enhancement for the tabular validator
ArlindKadra Oct 1, 2021
65e8ffb
Adding a few type annotations
ArlindKadra Oct 1, 2021
217c38d
Fixing bugs in implementation
ArlindKadra Oct 1, 2021
f7dd8fe
Adding wrongly deleted code part during rebase
ArlindKadra Oct 1, 2021
92bd535
Fix bug in _get_args
ravinkohli Oct 2, 2021
5f672b5
Fix bug in _get_args
ravinkohli Oct 2, 2021
223c09e
Addressing Shuhei's comments
ArlindKadra Oct 3, 2021
a1ed883
Address Shuhei's comments
ArlindKadra Oct 3, 2021
f585310
Refactoring code
ArlindKadra Oct 6, 2021
f298c46
Refactoring code
ArlindKadra Oct 6, 2021
03bef16
Typos fix and additional comments
ArlindKadra Oct 6, 2021
a7d01f1
Replace nan in categoricals with simple imputer
ravinkohli Oct 7, 2021
38fe9e8
Remove unused function
ravinkohli Oct 7, 2021
7693753
add comment
ravinkohli Oct 7, 2021
f4cd3a4
Merge branch 'cocktail_fixes_time_debug' into tabular_validator_enhan…
ravinkohli Oct 7, 2021
497c546
Update autoPyTorch/data/tabular_feature_validator.py
ravinkohli Oct 7, 2021
9254eb2
Update autoPyTorch/data/tabular_feature_validator.py
ravinkohli Oct 7, 2021
b63ff3c
Adding unit test for only nan columns in the tabular feature categor…
ArlindKadra Oct 8, 2021
d5bbdbe
fix bug in remove all nan columns
ravinkohli Oct 8, 2021
bfe4899
Bug fix for making tests run by arlind
ravinkohli Oct 8, 2021
369edad
fix flake errors in feature validator
ravinkohli Oct 8, 2021
a4fb0cb
made typing code uniform
ravinkohli Oct 8, 2021
44229a6
Apply suggestions from code review
ravinkohli Oct 8, 2021
ba3c1e7
address comments from shuhei
ravinkohli Oct 8, 2021
10a8441
address comments from shuhei (2)
ravinkohli Oct 8, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions autoPyTorch/data/base_feature_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ def _fit(
self:
The fitted base estimator
"""

raise NotImplementedError()

def _check_data(
Expand All @@ -136,6 +137,7 @@ def _check_data(
A set of features that are going to be validated (type and dimensionality
checks) and an encoder fitted in the case the data needs encoding
"""

raise NotImplementedError()

def transform(
Expand All @@ -152,4 +154,30 @@ def transform(
np.ndarray:
The transformed array
"""

raise NotImplementedError()

def list_to_dataframe(
ravinkohli marked this conversation as resolved.
Show resolved Hide resolved
ravinkohli marked this conversation as resolved.
Show resolved Hide resolved
self,
X_train: SUPPORTED_FEAT_TYPES,
X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None,
) -> typing.Tuple[pd.DataFrame, typing.Optional[pd.DataFrame]]:
"""
Converts a list to a pandas DataFrame. In this process, column types are inferred.

If test data is provided, we proactively match it to train data

Arguments:
X_train (SUPPORTED_FEAT_TYPES):
A set of features that are going to be validated (type and dimensionality
checks) and an encoder fitted in the case the data needs encoding
X_test (typing.Optional[SUPPORTED_FEAT_TYPES]):
A hold out set of data used for checking
Returns:
pd.DataFrame:
transformed train data from list to pandas DataFrame
pd.DataFrame:
transformed test data from list to pandas DataFrame
"""

raise NotImplementedError()
132 changes: 86 additions & 46 deletions autoPyTorch/data/tabular_feature_validator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import functools
from typing import Any, Dict, List, Optional, Tuple, Union, cast


import numpy as np

import pandas as pd
Expand All @@ -22,7 +23,7 @@
def _create_column_transformer(
preprocessors: Dict[str, List[BaseEstimator]],
numerical_columns: List[str],
categorical_columns: List[str]
categorical_columns: List[str],
) -> ColumnTransformer:
"""
Given a dictionary of preprocessors, this function
Expand All @@ -38,6 +39,7 @@ def _create_column_transformer(
Returns:
ColumnTransformer
"""

numerical_pipeline = 'drop'
categorical_pipeline = 'drop'
if len(numerical_columns) > 0:
Expand All @@ -60,21 +62,19 @@ def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]:
Dict[str, List[BaseEstimator]]
"""
preprocessors: Dict[str, List[BaseEstimator]] = dict()
preprocessors['numerical'] = list()
preprocessors['categorical'] = list()

preprocessors['categorical'].append(OneHotEncoder(
categories='auto',
sparse=False,
handle_unknown='ignore'))
preprocessors['numerical'].append(SimpleImputer(strategy='median',
copy=False))
preprocessors['numerical'].append(StandardScaler(with_mean=True, with_std=True, copy=False))
onehot_encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore')
imputer = SimpleImputer(strategy='median', copy=False)
standard_scaler = StandardScaler(with_mean=True, with_std=True, copy=False)

preprocessors['categorical'] = [onehot_encoder]
preprocessors['numerical'] = [imputer, standard_scaler]

return preprocessors


class TabularFeatureValidator(BaseFeatureValidator):

def _fit(
self,
X: SUPPORTED_FEAT_TYPES,
Expand All @@ -96,24 +96,27 @@ def _fit(
# The final output of a validator is a numpy array. But pandas
# gives us information about the column dtype
if isinstance(X, np.ndarray):

X = self.numpy_array_to_pandas(X)
# Replace the data type from the previously saved type.
self.data_type = type(X)
# save all the information about the column order and data types
self._check_data(X)
ravinkohli marked this conversation as resolved.
Show resolved Hide resolved

if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
X = cast(pd.DataFrame, X)

if not X.select_dtypes(include='object').empty:
X = self.infer_objects(X)

self._check_data(X)
ravinkohli marked this conversation as resolved.
Show resolved Hide resolved
X = cast(pd.DataFrame, X)
categorical_columns, numerical_columns, feat_type = self._get_columns_info(X)

self.enc_columns = categorical_columns
if len(categorical_columns) >= 0:
X = self.impute_nan_in_categories(X)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where are we imputing now?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we are using a sklearn imputer also for the categorical columns

preprocessors = get_tabular_preprocessors()
self.column_transformer = _create_column_transformer(preprocessors=preprocessors,
numerical_columns=numerical_columns,
categorical_columns=categorical_columns)
self.column_transformer = _create_column_transformer(
preprocessors=preprocessors,
numerical_columns=numerical_columns,
categorical_columns=categorical_columns,
)

# Mypy redefinition
assert self.column_transformer is not None
Expand Down Expand Up @@ -142,21 +145,24 @@ def comparator(cmp1: str, cmp2: str) -> int:

if len(categorical_columns) > 0:
self.categories = [
# We fit an ordinal encoder, where all categorical
# We fit a one-hot encoder, where all categorical
ravinkohli marked this conversation as resolved.
Show resolved Hide resolved
# columns are shifted to the left
list(range(len(cat)))
for cat in self.column_transformer.named_transformers_[
'categorical_pipeline'].named_steps['onehotencoder'].categories_
]

# differently to categorical_columns and numerical_columns,
# this saves the index of the column.
Comment on lines +151 to +152
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# differently to categorical_columns and numerical_columns,
# this saves the index of the column.
# Memorize the indices of categorical and numerical columns separately

for i, type_ in enumerate(self.feat_type):
if 'numerical' in type_:
self.numerical_columns.append(i)
else:
self.categorical_columns.append(i)

# Lastly, store the number of features
self.num_features = np.shape(X)[1]
self.num_features = len(X.columns)

return self

def transform(
Expand Down Expand Up @@ -189,10 +195,6 @@ def transform(
if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
X = cast(pd.DataFrame, X)

# Also remove the object dtype for new data
if not X.select_dtypes(include='object').empty:
X = self.infer_objects(X)
ravinkohli marked this conversation as resolved.
Show resolved Hide resolved

# Check the data here so we catch problems on new test data
self._check_data(X)
# We also need to fillna on the transformation
Expand Down Expand Up @@ -268,13 +270,13 @@ def _check_data(
X = cast(pd.DataFrame, X)

# Handle objects if possible
if not X.select_dtypes(include='object').empty:
object_columns_indicator = has_object_columns(X.dtypes)
ArlindKadra marked this conversation as resolved.
Show resolved Hide resolved
if object_columns_indicator:
ravinkohli marked this conversation as resolved.
Show resolved Hide resolved
X = self.infer_objects(X)

# Define the column to be encoded here as the feature validator is fitted once
# per estimator
# enc_columns, _ = self._get_columns_to_encode(X)

column_order = [column for column in X.columns]
if len(self.column_order) > 0:
if self.column_order != column_order:
Expand Down Expand Up @@ -310,8 +312,10 @@ def _get_columns_info(
A set of features that are going to be validated (type and dimensionality
checks) and an encoder fitted in the case the data needs encoding
Returns:
enc_columns (List[str]):
Columns to encode, if any
categorical_columns: (List[str])
Categorical columns.
numerical_columns: (List[str])
Numerical columns.
ravinkohli marked this conversation as resolved.
Show resolved Hide resolved
feat_type:
Type of each column numerical/categorical
"""
Expand All @@ -323,14 +327,15 @@ def _get_columns_info(

# Make sure each column is a valid type
for i, column in enumerate(X.columns):
ravinkohli marked this conversation as resolved.
Show resolved Hide resolved
if X[column].dtype.name in ['category', 'bool']:

column_dtype = self.dtypes[i]
if column_dtype in ['category', 'bool']:
categorical_columns.append(column)
feat_type.append('categorical')
# Move away from np.issubdtype as it causes
# TypeError: data type not understood in certain pandas types
elif not is_numeric_dtype(X[column]):
if X[column].dtype.name == 'object':
elif not is_numeric_dtype(column_dtype):
nabenabe0928 marked this conversation as resolved.
Show resolved Hide resolved
# TODO verify how would this happen when we always convert the object dtypes to category
if column_dtype == 'object':
nabenabe0928 marked this conversation as resolved.
Show resolved Hide resolved
raise ValueError(
"Input Column {} has invalid type object. "
"Cast it to a valid dtype before using it in AutoPyTorch. "
Expand All @@ -345,7 +350,7 @@ def _get_columns_info(
)
)
elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype(
X[column].dtype
column_dtype
):
raise ValueError(
"AutoPyTorch does not support time and/or date datatype as given "
Expand All @@ -362,7 +367,7 @@ def _get_columns_info(
"Make sure your data is formatted in a correct way, "
"before feeding it to AutoPyTorch.".format(
column,
X[column].dtype.name,
column_dtype,
nabenabe0928 marked this conversation as resolved.
Show resolved Hide resolved
)
)
else:
Expand Down Expand Up @@ -394,7 +399,7 @@ def list_to_dataframe(
"""

# If a list was provided, it will be converted to pandas
X_train = pd.DataFrame(data=X_train).infer_objects()
X_train = pd.DataFrame(data=X_train).convert_dtypes()
self.logger.warning("The provided feature types to AutoPyTorch are of type list."
"Features have been interpreted as: {}".format([(col, t) for col, t in
zip(X_train.columns, X_train.dtypes)]))
Expand All @@ -403,7 +408,8 @@ def list_to_dataframe(
self.logger.warning("Train features are a list while the provided test data"
"is {}. X_test will be casted as DataFrame.".format(type(X_test))
)
X_test = pd.DataFrame(data=X_test).infer_objects()
X_test = pd.DataFrame(data=X_test).convert_dtypes()

return X_train, X_test

@staticmethod
Expand Down Expand Up @@ -446,17 +452,22 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}")
pass
else:
# Calling for the first time to infer the categories
X = X.infer_objects()
for column in X.columns:
if not is_numeric_dtype(X[column]):
# initial data types
data_types = X.dtypes
for index, column in enumerate(X.columns):
if not is_numeric_dtype(data_types[index]):
ravinkohli marked this conversation as resolved.
Show resolved Hide resolved
X[column] = X[column].astype('category')
self.object_dtype_mapping = {column: X[column].dtype for column in X.columns}
# only numerical attributes and categories
data_types = X.dtypes
self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)}

self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}")

return X

def impute_nan_in_categories(self,
X: pd.DataFrame
) -> pd.DataFrame:
def impute_nan_in_categories(self, X: pd.DataFrame) -> pd.DataFrame:
"""
Impute missing values before encoding;
remove once sklearn natively supports
Expand All @@ -478,17 +489,22 @@ def impute_nan_in_categories(self,
# TypeError: '<' not supported between instances of 'int' and 'str'
# in the encoding
for column in self.enc_columns:
if X[column].isna().any():
# no missing values for categorical column
if not X[column].isna().any():
continue
else:
ravinkohli marked this conversation as resolved.
Show resolved Hide resolved
if column not in self.dict_missing_value_per_col:
try:
float(X[column].dropna().values[0])
first_value = X[column].dropna().values[0]
float(first_value)
can_cast_as_number = True
except Exception:
except ValueError:
can_cast_as_number = False
if can_cast_as_number:
# In this case, we expect to have a number as category
# it might be string, but its value represent a number
missing_value: Union[str, int] = '-1' if isinstance(X[column].dropna().values[0], str) else -1

missing_value: Union[str, int] = '-1' if isinstance(first_value, str) else -1
else:
missing_value = 'Missing!'

nabenabe0928 marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -507,4 +523,28 @@ def impute_nan_in_categories(self,
X[column].cat.add_categories([self.dict_missing_value_per_col[column]],
inplace=True)
X.fillna({column: self.dict_missing_value_per_col[column]}, inplace=True)

return X

def has_object_columns(
nabenabe0928 marked this conversation as resolved.
Show resolved Hide resolved
feature_types: pd.Series,
) -> bool:
"""
Indicate whether a Series of dtypes for a pandas DataFrame
contains one or more object columns.

Arguments:
----------
feature_types: pd.Series
The feature types for a DataFrame.
Returns:
--------
bool
True if the DataFrame dtypes contain an object column, False
otherwise.
"""
for feature_type in feature_types:
if pd.api.types.is_object_dtype(feature_type):
return True
else:
return False
ravinkohli marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# import copy
from typing import Any, Dict, Optional # , Tuple
from typing import Any, Dict, Optional, Tuple

import numpy as np

Expand Down Expand Up @@ -30,8 +30,8 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:

def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module:
raise NotImplementedError
#
# def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]:

def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]:
# # Feature preprocessors can alter numerical columns
# # if len(X['dataset_properties']['numerical_columns']) == 0:
# # num_numerical_columns = 0
Expand All @@ -49,4 +49,4 @@ def build_embedding(self, num_input_features: np.ndarray, num_numerical_features
# # for i, category in enumerate(categories):
# # num_input_features[num_numerical_columns + i, ] = len(category)
# # return num_numerical_columns, num_input_features
# return None, None
return None, None
ravinkohli marked this conversation as resolved.
Show resolved Hide resolved