-
Notifications
You must be signed in to change notification settings - Fork 289
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[FIX] Remove redundant categorical imputation #375
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,70 +13,42 @@ | |
|
||
|
||
class SimpleImputer(BaseImputer): | ||
"""An imputer for categorical and numerical columns | ||
|
||
Impute missing values for categorical columns with 'constant_!missing!' | ||
|
||
Note: | ||
In case of numpy data, the constant value is set to -1, under the assumption | ||
that categorical data is fit with an Ordinal Scaler. | ||
""" | ||
An imputer for numerical columns | ||
|
||
Attributes: | ||
random_state (Optional[np.random.RandomState]): | ||
The random state to use for the imputer. | ||
numerical_strategy (str: default='mean'): | ||
The strategy to use for imputing numerical columns. | ||
Can be one of ['most_frequent', 'constant_!missing!'] | ||
categorical_strategy (str: default='most_frequent') | ||
The strategy to use for imputing categorical columns. | ||
Can be one of ['mean', 'median', 'most_frequent', 'constant_zero'] | ||
""" | ||
|
||
def __init__( | ||
self, | ||
random_state: Optional[np.random.RandomState] = None, | ||
numerical_strategy: str = 'mean', | ||
categorical_strategy: str = 'most_frequent' | ||
): | ||
""" | ||
Note: | ||
'constant' as numerical_strategy uses 0 as the default fill_value while | ||
'constant_!missing!' uses a fill_value of -1. | ||
This behaviour should probably be fixed. | ||
""" | ||
super().__init__() | ||
self.random_state = random_state | ||
self.numerical_strategy = numerical_strategy | ||
self.categorical_strategy = categorical_strategy | ||
|
||
def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer: | ||
""" Fits the underlying model and returns the transformed array. | ||
""" | ||
Builds the preprocessor based on the given fit dictionary 'X'. | ||
|
||
Args: | ||
X (np.ndarray): | ||
The input features to fit on | ||
y (Optional[np.ndarray]): | ||
The labels for the input features `X` | ||
X (Dict[str, Any]): | ||
The fit dictionary | ||
y (Optional[Any]): | ||
Not Used -- to comply with API | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like this change to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah, there is outdated documentation throughout the pipeline components. I'll raise an issue and fix them with a different PR. |
||
|
||
Returns: | ||
SimpleImputer: | ||
returns self | ||
self: | ||
returns an instance of self. | ||
""" | ||
self.check_requirements(X, y) | ||
|
||
# Choose an imputer for any categorical columns | ||
categorical_columns = X['dataset_properties']['categorical_columns'] | ||
|
||
if isinstance(categorical_columns, List) and len(categorical_columns) != 0: | ||
if self.categorical_strategy == 'constant_!missing!': | ||
# Train data is numpy as of this point, where an Ordinal Encoding is used | ||
# for categoricals. Only Numbers are allowed for `fill_value` | ||
imputer = SklearnSimpleImputer(strategy='constant', fill_value=-1, copy=False) | ||
self.preprocessor['categorical'] = imputer | ||
else: | ||
imputer = SklearnSimpleImputer(strategy=self.categorical_strategy, copy=False) | ||
self.preprocessor['categorical'] = imputer | ||
|
||
# Choose an imputer for any numerical columns | ||
numerical_columns = X['dataset_properties']['numerical_columns'] | ||
|
||
|
@@ -98,11 +70,6 @@ def get_hyperparameter_search_space( | |
value_range=("mean", "median", "most_frequent", "constant_zero"), | ||
default_value="mean", | ||
), | ||
categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( | ||
hyperparameter='categorical_strategy', | ||
value_range=("most_frequent", "constant_!missing!"), | ||
default_value="most_frequent" | ||
) | ||
) -> ConfigurationSpace: | ||
"""Get the hyperparameter search space for the SimpleImputer | ||
|
||
|
@@ -112,8 +79,6 @@ def get_hyperparameter_search_space( | |
Note: Not actually Optional, just adhering to its supertype | ||
numerical_strategy (HyperparameterSearchSpace: default = ...) | ||
The strategy to use for numerical imputation | ||
caterogical_strategy (HyperparameterSearchSpace: default = ...) | ||
The strategy to use for categorical imputation | ||
|
||
Returns: | ||
ConfigurationSpace | ||
|
@@ -132,12 +97,6 @@ def get_hyperparameter_search_space( | |
): | ||
add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter) | ||
|
||
if ( | ||
isinstance(dataset_properties['categorical_columns'], List) | ||
and len(dataset_properties['categorical_columns']) | ||
): | ||
add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter) | ||
|
||
return cs | ||
|
||
@staticmethod | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -14,8 +14,7 @@ class BaseImputer(autoPyTorchTabularPreprocessingComponent): | |||||
def __init__(self) -> None: | ||||||
super().__init__() | ||||||
self.add_fit_requirements([ | ||||||
FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True), | ||||||
FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)]) | ||||||
FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)]) | ||||||
|
||||||
def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: | ||||||
""" | ||||||
|
@@ -26,7 +25,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: | |||||
Returns: | ||||||
(Dict[str, Any]): the updated 'X' dictionary | ||||||
""" | ||||||
if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: | ||||||
if self.preprocessor['numerical'] is None and len(X["dataset_properties"]["numerical_columns"]) != 0: | ||||||
raise ValueError("cant call transform on {} without fitting first." | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "can not" is two words :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did not know that "can not" exists, but "cannot" is preferred for formal settings?
|
||||||
.format(self.__class__.__name__)) | ||||||
X.update({'imputer': self.preprocessor}) | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -39,14 +39,14 @@ def test_get_config_space(self): | |
self.assertEqual(param1, param2) | ||
|
||
def test_mean_imputation(self): | ||
data = np.array([['1.0', np.nan, 3], | ||
data = np.array([[1.0, np.nan, 3], | ||
[np.nan, 8, 9], | ||
['4.0', 5, np.nan], | ||
[4.0, 5, np.nan], | ||
[np.nan, 2, 3], | ||
['7.0', np.nan, 9], | ||
['4.0', np.nan, np.nan]], dtype=object) | ||
numerical_columns = [1, 2] | ||
categorical_columns = [0] | ||
[7.0, np.nan, 9], | ||
[4.0, np.nan, np.nan]]) | ||
numerical_columns = [0, 1, 2] | ||
categorical_columns = [] | ||
nabenabe0928 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
train_indices = np.array([0, 2, 3]) | ||
test_indices = np.array([1, 4, 5]) | ||
dataset_properties = { | ||
|
@@ -66,31 +66,29 @@ def test_mean_imputation(self): | |
|
||
# check if the fit dictionary X is modified as expected | ||
self.assertIsInstance(X['imputer'], dict) | ||
self.assertIsInstance(categorical_imputer, BaseEstimator) | ||
self.assertIsNone(categorical_imputer) | ||
self.assertIsInstance(numerical_imputer, BaseEstimator) | ||
|
||
# make column transformer with returned encoder to fit on data | ||
column_transformer = make_column_transformer((categorical_imputer, | ||
X['dataset_properties']['categorical_columns']), | ||
(numerical_imputer, | ||
column_transformer = make_column_transformer((numerical_imputer, | ||
X['dataset_properties']['numerical_columns']), | ||
remainder='passthrough') | ||
column_transformer = column_transformer.fit(X['X_train']) | ||
transformed = column_transformer.transform(data[test_indices]) | ||
|
||
assert_array_equal(transformed.astype(str), np.array([[1.0, 8.0, 9.0], | ||
[7.0, 3.5, 9.0], | ||
[4.0, 3.5, 3.0]], dtype=str)) | ||
assert_array_equal(transformed, np.array([[2.5, 8, 9], | ||
[7, 3.5, 9], | ||
[4, 3.5, 3]])) | ||
|
||
def test_median_imputation(self): | ||
data = np.array([['1.0', np.nan, 3], | ||
data = np.array([[1.0, np.nan, 3], | ||
[np.nan, 8, 9], | ||
['4.0', 5, np.nan], | ||
[4.0, 5, np.nan], | ||
[np.nan, 2, 3], | ||
['7.0', np.nan, 9], | ||
['4.0', np.nan, np.nan]], dtype=object) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because we would like to test the median is correct, shouldn't we test by the data whose median is different from mean? |
||
numerical_columns = [1, 2] | ||
categorical_columns = [0] | ||
[7.0, np.nan, 9], | ||
[4.0, np.nan, np.nan]]) | ||
numerical_columns = [0, 1, 2] | ||
categorical_columns = [] | ||
train_indices = np.array([0, 2, 3]) | ||
test_indices = np.array([1, 4, 5]) | ||
dataset_properties = { | ||
|
@@ -110,31 +108,29 @@ def test_median_imputation(self): | |
|
||
# check if the fit dictionary X is modified as expected | ||
self.assertIsInstance(X['imputer'], dict) | ||
self.assertIsInstance(categorical_imputer, BaseEstimator) | ||
self.assertIsNone(categorical_imputer) | ||
self.assertIsInstance(numerical_imputer, BaseEstimator) | ||
|
||
# make column transformer with returned encoder to fit on data | ||
column_transformer = make_column_transformer( | ||
(categorical_imputer, X['dataset_properties']['categorical_columns']), | ||
(numerical_imputer, X['dataset_properties']['numerical_columns']), | ||
remainder='passthrough' | ||
) | ||
column_transformer = make_column_transformer((numerical_imputer, | ||
X['dataset_properties']['numerical_columns']), | ||
remainder='passthrough') | ||
column_transformer = column_transformer.fit(X['X_train']) | ||
transformed = column_transformer.transform(data[test_indices]) | ||
|
||
assert_array_equal(transformed.astype(str), np.array([[1.0, 8.0, 9.0], | ||
[7.0, 3.5, 9.0], | ||
[4.0, 3.5, 3.0]], dtype=str)) | ||
assert_array_equal(transformed, np.array([[2.5, 8, 9], | ||
[7, 3.5, 9], | ||
[4, 3.5, 3]])) | ||
|
||
def test_frequent_imputation(self): | ||
data = np.array([['1.0', np.nan, 3], | ||
data = np.array([[1.0, np.nan, 3], | ||
[np.nan, 8, 9], | ||
['4.0', 5, np.nan], | ||
[4.0, 5, np.nan], | ||
[np.nan, 2, 3], | ||
['7.0', np.nan, 9], | ||
['4.0', np.nan, np.nan]], dtype=object) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. here as well. We should modify the test dataset so that the imputation value is more obvious. |
||
numerical_columns = [1, 2] | ||
categorical_columns = [0] | ||
[7.0, np.nan, 9], | ||
[4.0, np.nan, np.nan]]) | ||
numerical_columns = [0, 1, 2] | ||
categorical_columns = [] | ||
train_indices = np.array([0, 2, 3]) | ||
test_indices = np.array([1, 4, 5]) | ||
dataset_properties = { | ||
|
@@ -145,8 +141,7 @@ def test_frequent_imputation(self): | |
'X_train': data[train_indices], | ||
'dataset_properties': dataset_properties | ||
} | ||
imputer_component = SimpleImputer(numerical_strategy='most_frequent', | ||
categorical_strategy='most_frequent') | ||
imputer_component = SimpleImputer(numerical_strategy='most_frequent') | ||
|
||
imputer_component = imputer_component.fit(X) | ||
X = imputer_component.transform(X) | ||
|
@@ -155,31 +150,29 @@ def test_frequent_imputation(self): | |
|
||
# check if the fit dictionary X is modified as expected | ||
self.assertIsInstance(X['imputer'], dict) | ||
self.assertIsInstance(categorical_imputer, BaseEstimator) | ||
self.assertIsNone(categorical_imputer) | ||
self.assertIsInstance(numerical_imputer, BaseEstimator) | ||
|
||
# make column transformer with returned encoder to fit on data | ||
column_transformer = make_column_transformer( | ||
(categorical_imputer, X['dataset_properties']['categorical_columns']), | ||
(numerical_imputer, X['dataset_properties']['numerical_columns']), | ||
remainder='passthrough' | ||
) | ||
column_transformer = make_column_transformer((numerical_imputer, | ||
X['dataset_properties']['numerical_columns']), | ||
remainder='passthrough') | ||
column_transformer = column_transformer.fit(X['X_train']) | ||
transformed = column_transformer.transform(data[test_indices]) | ||
|
||
assert_array_equal(transformed.astype(str), np.array([[1.0, 8, 9], | ||
[7.0, 2, 9], | ||
[4.0, 2, 3]], dtype=str)) | ||
assert_array_equal(transformed, np.array([[1, 8, 9], | ||
[7, 2, 9], | ||
[4, 2, 3]])) | ||
|
||
def test_constant_imputation(self): | ||
data = np.array([['1.0', np.nan, 3], | ||
data = np.array([[1.0, np.nan, 3], | ||
[np.nan, 8, 9], | ||
['4.0', 5, np.nan], | ||
[4.0, 5, np.nan], | ||
[np.nan, 2, 3], | ||
['7.0', np.nan, 9], | ||
['4.0', np.nan, np.nan]], dtype=object) | ||
numerical_columns = [1, 2] | ||
categorical_columns = [0] | ||
[7.0, np.nan, 9], | ||
[4.0, np.nan, np.nan]]) | ||
numerical_columns = [0, 1, 2] | ||
categorical_columns = [] | ||
train_indices = np.array([0, 2, 3]) | ||
test_indices = np.array([1, 4, 5]) | ||
dataset_properties = { | ||
|
@@ -190,8 +183,7 @@ def test_constant_imputation(self): | |
'X_train': data[train_indices], | ||
'dataset_properties': dataset_properties | ||
} | ||
imputer_component = SimpleImputer(numerical_strategy='constant_zero', | ||
categorical_strategy='constant_!missing!') | ||
nabenabe0928 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
imputer_component = SimpleImputer(numerical_strategy='constant_zero') | ||
|
||
imputer_component = imputer_component.fit(X) | ||
X = imputer_component.transform(X) | ||
|
@@ -200,20 +192,18 @@ def test_constant_imputation(self): | |
|
||
# check if the fit dictionary X is modified as expected | ||
self.assertIsInstance(X['imputer'], dict) | ||
self.assertIsInstance(categorical_imputer, BaseEstimator) | ||
self.assertIsNone(categorical_imputer) | ||
self.assertIsInstance(numerical_imputer, BaseEstimator) | ||
|
||
# make column transformer with returned encoder to fit on data | ||
column_transformer = make_column_transformer( | ||
(categorical_imputer, X['dataset_properties']['categorical_columns']), | ||
(numerical_imputer, X['dataset_properties']['numerical_columns']), | ||
remainder='passthrough' | ||
) | ||
column_transformer = make_column_transformer((numerical_imputer, | ||
X['dataset_properties']['numerical_columns']), | ||
remainder='passthrough') | ||
column_transformer = column_transformer.fit(X['X_train']) | ||
transformed = column_transformer.transform(data[test_indices]) | ||
assert_array_equal(transformed.astype(str), np.array([['-1', 8, 9], | ||
[7.0, '0', 9], | ||
[4.0, '0', '0']], dtype=str)) | ||
assert_array_equal(transformed, np.array([[0, 8, 9], | ||
[7, 0, 9], | ||
[4, 0, 0]])) | ||
|
||
def test_imputation_without_dataset_properties_raises_error(self): | ||
"""Tests SimpleImputer checks for dataset properties when querying for | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Might be good to give a little example dict in the documentation.