Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Remove redundant categorical imputation #375

Merged
merged 5 commits into from
Feb 9, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,70 +13,42 @@


class SimpleImputer(BaseImputer):
"""An imputer for categorical and numerical columns

Impute missing values for categorical columns with 'constant_!missing!'

Note:
In case of numpy data, the constant value is set to -1, under the assumption
that categorical data is fit with an Ordinal Scaler.
"""
An imputer for numerical columns

Attributes:
random_state (Optional[np.random.RandomState]):
The random state to use for the imputer.
numerical_strategy (str: default='mean'):
The strategy to use for imputing numerical columns.
Can be one of ['mean', 'median', 'most_frequent', 'constant_zero']
categorical_strategy (str: default='most_frequent')
The strategy to use for imputing categorical columns.
Can be one of ['most_frequent', 'constant_!missing!']
"""

def __init__(
self,
random_state: Optional[np.random.RandomState] = None,
numerical_strategy: str = 'mean',
categorical_strategy: str = 'most_frequent'
):
"""
Note:
'constant' as numerical_strategy uses 0 as the default fill_value while
'constant_!missing!' uses a fill_value of -1.
This behaviour should probably be fixed.
"""
super().__init__()
self.random_state = random_state
self.numerical_strategy = numerical_strategy
self.categorical_strategy = categorical_strategy

def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer:
""" Fits the underlying model and returns the transformed array.
"""
Builds the preprocessor based on the given fit dictionary 'X'.

Args:
X (np.ndarray):
The input features to fit on
y (Optional[np.ndarray]):
The labels for the input features `X`
X (Dict[str, Any]):
The fit dictionary
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might be good to give a little example dict in the documentation.

y (Optional[Any]):
Not Used -- to comply with API
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this change to Any, seems more semantically clear (:

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, there is outdated documentation throughout the pipeline components. I'll raise an issue and fix them with a different PR.


Returns:
SimpleImputer:
returns self
self:
returns an instance of self.
"""
self.check_requirements(X, y)

# Choose an imputer for any categorical columns
categorical_columns = X['dataset_properties']['categorical_columns']

if isinstance(categorical_columns, List) and len(categorical_columns) != 0:
if self.categorical_strategy == 'constant_!missing!':
# Train data is numpy as of this point, where an Ordinal Encoding is used
# for categoricals. Only Numbers are allowed for `fill_value`
imputer = SklearnSimpleImputer(strategy='constant', fill_value=-1, copy=False)
self.preprocessor['categorical'] = imputer
else:
imputer = SklearnSimpleImputer(strategy=self.categorical_strategy, copy=False)
self.preprocessor['categorical'] = imputer

# Choose an imputer for any numerical columns
numerical_columns = X['dataset_properties']['numerical_columns']

Expand All @@ -98,11 +70,6 @@ def get_hyperparameter_search_space(
value_range=("mean", "median", "most_frequent", "constant_zero"),
default_value="mean",
),
categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
hyperparameter='categorical_strategy',
value_range=("most_frequent", "constant_!missing!"),
default_value="most_frequent"
)
) -> ConfigurationSpace:
"""Get the hyperparameter search space for the SimpleImputer

Expand All @@ -112,8 +79,6 @@ def get_hyperparameter_search_space(
Note: Not actually Optional, just adhering to its supertype
numerical_strategy (HyperparameterSearchSpace: default = ...)
The strategy to use for numerical imputation
categorical_strategy (HyperparameterSearchSpace: default = ...)
The strategy to use for categorical imputation

Returns:
ConfigurationSpace
Expand All @@ -132,12 +97,6 @@ def get_hyperparameter_search_space(
):
add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter)

if (
isinstance(dataset_properties['categorical_columns'], List)
and len(dataset_properties['categorical_columns'])
):
add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter)

return cs

@staticmethod
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ class BaseImputer(autoPyTorchTabularPreprocessingComponent):
def __init__(self) -> None:
super().__init__()
self.add_fit_requirements([
FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True),
FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)])
FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)])

def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
"""
Expand All @@ -26,7 +25,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
Returns:
(Dict[str, Any]): the updated 'X' dictionary
"""
if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
if self.preprocessor['numerical'] is None and len(X["dataset_properties"]["numerical_columns"]) != 0:
raise ValueError("cant call transform on {} without fitting first."
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
raise ValueError("cant call transform on {} without fitting first."
raise ValueError("cannot call transform on {} without fitting first."

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"can not" is two words :)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did not know that "can not" exists, but "cannot" is preferred for formal settings?

  • article1 --> "You’ll find cannot in formal writing and speech. Research studies, academic reports, and professional presentations are places where cannot is the most appropriate choice."
  • article 2 --> "While can not is an acceptable alternate spelling, cannot is generally preferred by most writers."
  • article 3 --> "Both cannot and can not are perfectly fine, but cannot is far more common and is therefore recommended"

.format(self.__class__.__name__))
X.update({'imputer': self.preprocessor})
Expand Down
114 changes: 52 additions & 62 deletions test/test_pipeline/components/preprocessing/test_imputers.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,14 @@ def test_get_config_space(self):
self.assertEqual(param1, param2)

def test_mean_imputation(self):
data = np.array([['1.0', np.nan, 3],
data = np.array([[1.0, np.nan, 3],
[np.nan, 8, 9],
['4.0', 5, np.nan],
[4.0, 5, np.nan],
[np.nan, 2, 3],
['7.0', np.nan, 9],
['4.0', np.nan, np.nan]], dtype=object)
numerical_columns = [1, 2]
categorical_columns = [0]
[7.0, np.nan, 9],
[4.0, np.nan, np.nan]])
numerical_columns = [0, 1, 2]
categorical_columns = []
nabenabe0928 marked this conversation as resolved.
Show resolved Hide resolved
train_indices = np.array([0, 2, 3])
test_indices = np.array([1, 4, 5])
dataset_properties = {
Expand All @@ -66,31 +66,29 @@ def test_mean_imputation(self):

# check if the fit dictionary X is modified as expected
self.assertIsInstance(X['imputer'], dict)
self.assertIsInstance(categorical_imputer, BaseEstimator)
self.assertIsNone(categorical_imputer)
self.assertIsInstance(numerical_imputer, BaseEstimator)

# make column transformer with returned encoder to fit on data
column_transformer = make_column_transformer((categorical_imputer,
X['dataset_properties']['categorical_columns']),
(numerical_imputer,
column_transformer = make_column_transformer((numerical_imputer,
X['dataset_properties']['numerical_columns']),
remainder='passthrough')
column_transformer = column_transformer.fit(X['X_train'])
transformed = column_transformer.transform(data[test_indices])

assert_array_equal(transformed.astype(str), np.array([[1.0, 8.0, 9.0],
[7.0, 3.5, 9.0],
[4.0, 3.5, 3.0]], dtype=str))
assert_array_equal(transformed, np.array([[2.5, 8, 9],
[7, 3.5, 9],
[4, 3.5, 3]]))

def test_median_imputation(self):
data = np.array([['1.0', np.nan, 3],
data = np.array([[1.0, np.nan, 3],
[np.nan, 8, 9],
['4.0', 5, np.nan],
[4.0, 5, np.nan],
[np.nan, 2, 3],
['7.0', np.nan, 9],
['4.0', np.nan, np.nan]], dtype=object)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because we would like to test the median is correct, shouldn't we test by the data whose median is different from mean?

numerical_columns = [1, 2]
categorical_columns = [0]
[7.0, np.nan, 9],
[4.0, np.nan, np.nan]])
numerical_columns = [0, 1, 2]
categorical_columns = []
train_indices = np.array([0, 2, 3])
test_indices = np.array([1, 4, 5])
dataset_properties = {
Expand All @@ -110,31 +108,29 @@ def test_median_imputation(self):

# check if the fit dictionary X is modified as expected
self.assertIsInstance(X['imputer'], dict)
self.assertIsInstance(categorical_imputer, BaseEstimator)
self.assertIsNone(categorical_imputer)
self.assertIsInstance(numerical_imputer, BaseEstimator)

# make column transformer with returned encoder to fit on data
column_transformer = make_column_transformer(
(categorical_imputer, X['dataset_properties']['categorical_columns']),
(numerical_imputer, X['dataset_properties']['numerical_columns']),
remainder='passthrough'
)
column_transformer = make_column_transformer((numerical_imputer,
X['dataset_properties']['numerical_columns']),
remainder='passthrough')
column_transformer = column_transformer.fit(X['X_train'])
transformed = column_transformer.transform(data[test_indices])

assert_array_equal(transformed.astype(str), np.array([[1.0, 8.0, 9.0],
[7.0, 3.5, 9.0],
[4.0, 3.5, 3.0]], dtype=str))
assert_array_equal(transformed, np.array([[2.5, 8, 9],
[7, 3.5, 9],
[4, 3.5, 3]]))

def test_frequent_imputation(self):
data = np.array([['1.0', np.nan, 3],
data = np.array([[1.0, np.nan, 3],
[np.nan, 8, 9],
['4.0', 5, np.nan],
[4.0, 5, np.nan],
[np.nan, 2, 3],
['7.0', np.nan, 9],
['4.0', np.nan, np.nan]], dtype=object)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

here as well. We should modify the test dataset so that the imputation value is more obvious.
This example requires tie-breaking for the most frequent value.

numerical_columns = [1, 2]
categorical_columns = [0]
[7.0, np.nan, 9],
[4.0, np.nan, np.nan]])
numerical_columns = [0, 1, 2]
categorical_columns = []
train_indices = np.array([0, 2, 3])
test_indices = np.array([1, 4, 5])
dataset_properties = {
Expand All @@ -145,8 +141,7 @@ def test_frequent_imputation(self):
'X_train': data[train_indices],
'dataset_properties': dataset_properties
}
imputer_component = SimpleImputer(numerical_strategy='most_frequent',
categorical_strategy='most_frequent')
imputer_component = SimpleImputer(numerical_strategy='most_frequent')

imputer_component = imputer_component.fit(X)
X = imputer_component.transform(X)
Expand All @@ -155,31 +150,29 @@ def test_frequent_imputation(self):

# check if the fit dictionary X is modified as expected
self.assertIsInstance(X['imputer'], dict)
self.assertIsInstance(categorical_imputer, BaseEstimator)
self.assertIsNone(categorical_imputer)
self.assertIsInstance(numerical_imputer, BaseEstimator)

# make column transformer with returned encoder to fit on data
column_transformer = make_column_transformer(
(categorical_imputer, X['dataset_properties']['categorical_columns']),
(numerical_imputer, X['dataset_properties']['numerical_columns']),
remainder='passthrough'
)
column_transformer = make_column_transformer((numerical_imputer,
X['dataset_properties']['numerical_columns']),
remainder='passthrough')
column_transformer = column_transformer.fit(X['X_train'])
transformed = column_transformer.transform(data[test_indices])

assert_array_equal(transformed.astype(str), np.array([[1.0, 8, 9],
[7.0, 2, 9],
[4.0, 2, 3]], dtype=str))
assert_array_equal(transformed, np.array([[1, 8, 9],
[7, 2, 9],
[4, 2, 3]]))

def test_constant_imputation(self):
data = np.array([['1.0', np.nan, 3],
data = np.array([[1.0, np.nan, 3],
[np.nan, 8, 9],
['4.0', 5, np.nan],
[4.0, 5, np.nan],
[np.nan, 2, 3],
['7.0', np.nan, 9],
['4.0', np.nan, np.nan]], dtype=object)
numerical_columns = [1, 2]
categorical_columns = [0]
[7.0, np.nan, 9],
[4.0, np.nan, np.nan]])
numerical_columns = [0, 1, 2]
categorical_columns = []
train_indices = np.array([0, 2, 3])
test_indices = np.array([1, 4, 5])
dataset_properties = {
Expand All @@ -190,8 +183,7 @@ def test_constant_imputation(self):
'X_train': data[train_indices],
'dataset_properties': dataset_properties
}
imputer_component = SimpleImputer(numerical_strategy='constant_zero',
categorical_strategy='constant_!missing!')
nabenabe0928 marked this conversation as resolved.
Show resolved Hide resolved
imputer_component = SimpleImputer(numerical_strategy='constant_zero')

imputer_component = imputer_component.fit(X)
X = imputer_component.transform(X)
Expand All @@ -200,20 +192,18 @@ def test_constant_imputation(self):

# check if the fit dictionary X is modified as expected
self.assertIsInstance(X['imputer'], dict)
self.assertIsInstance(categorical_imputer, BaseEstimator)
self.assertIsNone(categorical_imputer)
self.assertIsInstance(numerical_imputer, BaseEstimator)

# make column transformer with returned encoder to fit on data
column_transformer = make_column_transformer(
(categorical_imputer, X['dataset_properties']['categorical_columns']),
(numerical_imputer, X['dataset_properties']['numerical_columns']),
remainder='passthrough'
)
column_transformer = make_column_transformer((numerical_imputer,
X['dataset_properties']['numerical_columns']),
remainder='passthrough')
column_transformer = column_transformer.fit(X['X_train'])
transformed = column_transformer.transform(data[test_indices])
assert_array_equal(transformed.astype(str), np.array([['-1', 8, 9],
[7.0, '0', 9],
[4.0, '0', '0']], dtype=str))
assert_array_equal(transformed, np.array([[0, 8, 9],
[7, 0, 9],
[4, 0, 0]]))

def test_imputation_without_dataset_properties_raises_error(self):
"""Tests SimpleImputer checks for dataset properties when querying for
Expand Down