
Commit

remove todos for the preprocessing PR, and apply suggestion from code review
ravinkohli committed Jul 5, 2022
1 parent ae4bd55 commit c27c76d
Showing 6 changed files with 30 additions and 19 deletions.
@@ -12,7 +12,6 @@ def __init__(self) -> None:
         self._processing = True
         self.add_fit_requirements([
             FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True),
-            FitRequirement('categories', (List,), user_defined=True, dataset_property=True)
         ])
 
     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
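For context, a FitRequirement declares a key that a pipeline component expects to find in the fit dictionary X before it can run; this hunk drops the now-unneeded 'categories' requirement. A minimal sketch of that contract follows: the FitRequirement fields match the diff above, while check_requirements is an illustrative assumption, not the repository's implementation.

from typing import Any, Dict, List, NamedTuple, Tuple

class FitRequirement(NamedTuple):
    """Declares a key that a component expects in the fit dictionary X."""
    name: str
    supported_types: Tuple  # e.g. (List,)
    user_defined: bool
    dataset_property: bool

def check_requirements(requirements: List[FitRequirement], X: Dict[str, Any]) -> None:
    # Illustrative: dataset properties live under X['dataset_properties'],
    # everything else sits at the top level of X.
    for req in requirements:
        container = X['dataset_properties'] if req.dataset_property else X
        if req.name not in container:
            raise ValueError(f"Missing fit requirement: {req.name}")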
@@ -1,4 +1,3 @@
-from math import ceil
 from typing import Any, Dict, List, Optional, Union
 
 from ConfigSpace.configuration_space import ConfigurationSpace
@@ -18,11 +17,25 @@
 from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter
 
 
-def get_num_output_dimensions(config, num_categs_per_feature):
-    """ Returns list of embedding sizes for each categorical variable.
+def get_num_output_dimensions(config: Dict[str, Any], num_categs_per_feature: List[int]) -> List[int]:
+    """
+    Returns list of embedding sizes for each categorical variable.
     Selects this adaptively based on training_datset.
     Note: Assumes there is at least one embed feature.
+
+    Args:
+        config (Dict[str, Any]):
+            contains the hyperparameters required to calculate the `num_output_dimensions`
+        num_categs_per_feature (List[int]):
+            list containing number of categories for each feature that is to be embedded,
+            0 if the column is not an embed column
+
+    Returns:
+        List[int]:
+            list containing the output embedding size for each column,
+            1 if the column is not an embed column
     """
+
     max_embedding_dim = config['max_embedding_dim']
     embed_exponent = config['embed_exponent']
     size_factor = config['embedding_size_factor']
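The body of the helper is folded out of view here; a plausible completion is sketched below. The sizing rule follows the AutoGluon-style formula this docstring mirrors (grow with num_categories ** embed_exponent, scale by size_factor, cap at max_embedding_dim); the constant 1.6 and the exact clipping are assumptions, not necessarily the hidden code.

from typing import Any, Dict, List

def get_num_output_dimensions(config: Dict[str, Any],
                              num_categs_per_feature: List[int]) -> List[int]:
    max_embedding_dim = config['max_embedding_dim']
    embed_exponent = config['embed_exponent']
    size_factor = config['embedding_size_factor']
    # Non-embed columns (0 categories) keep width 1; embed columns grow
    # sublinearly with their cardinality, scaled by size_factor and capped.
    return [int(size_factor * max(2, min(max_embedding_dim,
                                         1.6 * num_categs ** embed_exponent)))
            if num_categs > 0 else 1
            for num_categs in num_categs_per_feature]

Under these assumptions, with embed_exponent=0.56, embedding_size_factor=1.0 and max_embedding_dim=100, a feature with 10 categories would get int(max(2, min(100, 1.6 * 10 ** 0.56))) = 5 output dimensions.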
@@ -65,12 +78,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         # before passing it through the model
         concat_seq = []
 
-        x_pointer = 0
-        layer_pointer = 0
         for x_pointer, embed in enumerate(self.embed_features):
             current_feature_slice = x[:, x_pointer]
             if not embed:
-                x_pointer += 1
                 concat_seq.append(current_feature_slice.view(-1, 1))
                 continue
             current_feature_slice = current_feature_slice.to(torch.int)
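The loop above walks the input column by column: non-embedded (numerical) columns pass through as width-1 slices, while categorical columns are cast to integer codes and looked up in a learned embedding table. A self-contained sketch of that pattern; module and attribute names other than embed_features are illustrative:

import torch
import torch.nn as nn

class EntityEmbedding(nn.Module):
    """Sketch: embed categorical columns, pass numerical columns through."""

    def __init__(self, num_categories_per_col, embed_features, output_dims):
        super().__init__()
        self.embed_features = embed_features  # one bool per column, as in the diff
        self.ee_layers = nn.ModuleList(
            nn.Embedding(n_cat, dim)
            for n_cat, dim, embed in zip(num_categories_per_col, output_dims, embed_features)
            if embed)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        concat_seq = []
        layer_pointer = 0
        for x_pointer, embed in enumerate(self.embed_features):
            current_feature_slice = x[:, x_pointer]
            if not embed:
                # numerical column: forward the raw value as a width-1 slice
                concat_seq.append(current_feature_slice.view(-1, 1))
                continue
            # categorical column: integer codes -> learned embedding vectors
            current_feature_slice = current_feature_slice.to(torch.int)
            concat_seq.append(self.ee_layers[layer_pointer](current_feature_slice))
            layer_pointer += 1
        return torch.cat(concat_seq, dim=1)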
@@ -115,7 +126,7 @@ def get_hyperparameter_search_space(
                                                                      value_range=(100,),
                                                                      default_value=100),
         embedding_size_factor: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embedding_size_factor",
-                                                                                     value_range=(1.0, 0.5, 1.5, 0.7, 0.6, 0.8, 0.9, 1.1, 1.2, 1.3, 1.4),
+                                                                                     value_range=(0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5),
                                                                                      default_value=1,
                                                                                      ),
     ) -> ConfigurationSpace:
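Sorting value_range into ascending order is the code-review suggestion applied here; the factors are offered as discrete choices. A sketch of roughly what this declaration expands to in raw ConfigSpace terms, assuming a multi-value range like this one is mapped to a categorical hyperparameter:

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter

cs = ConfigurationSpace()
cs.add_hyperparameter(CategoricalHyperparameter(
    "embedding_size_factor",
    choices=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5],
    default_value=1.0))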
8 changes: 6 additions & 2 deletions autoPyTorch/pipeline/tabular_regression.py
@@ -21,6 +21,9 @@
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import (
     CoalescerChoice
 )
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.column_splitting.ColumnSplitter import (
+    ColumnSplitter
+)
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import (
     EncoderChoice
 )
@@ -235,8 +238,9 @@ def _get_pipeline_steps(
 
         steps.extend([
             ("imputer", SimpleImputer(random_state=self.random_state)),
-            ("variance_threshold", VarianceThreshold(random_state=self.random_state)),
-            ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)),
+            # ("variance_threshold", VarianceThreshold(random_state=self.random_state)),
+            # ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)),
+            ("column_splitter", ColumnSplitter(random_state=self.random_state)),
             ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
             ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
             ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,
@@ -13,8 +13,6 @@
 )
 
 
-# TODO: fix in preprocessing PR
-# @pytest.mark.skip("Skipping tests as preprocessing is not finalised")
 @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_numerical_only',
                                                     'classification_categorical_only',
                                                     'classification_numerical_and_categorical'], indirect=True)
3 changes: 1 addition & 2 deletions test/test_pipeline/components/setup/test_setup_networks.py
@@ -19,8 +19,7 @@ def head(request):
     return request.param
 
 
-# TODO: add 'LearnedEntityEmbedding' after preprocessing dix
-@pytest.fixture(params=['NoEmbedding'])
+@pytest.fixture(params=['NoEmbedding', 'LearnedEntityEmbedding'])
 def embedding(request):
     return request.param
 
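pytest instantiates every test that uses this fixture once per entry in params, so restoring 'LearnedEntityEmbedding' doubles the embedding coverage of this module. A minimal, self-contained illustration; the test body is illustrative, not from the repository:

import pytest

@pytest.fixture(params=['NoEmbedding', 'LearnedEntityEmbedding'])
def embedding(request):
    return request.param

def test_embedding_is_known(embedding):
    # runs twice, once with each fixture param
    assert embedding in {'NoEmbedding', 'LearnedEntityEmbedding'}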
12 changes: 6 additions & 6 deletions test/test_pipeline/test_tabular_regression.py
@@ -61,11 +61,12 @@ def test_pipeline_fit(self, fit_dictionary_tabular):
         """This test makes sure that the pipeline is able to fit
         given random combinations of hyperparameters across the pipeline"""
         # TODO: fix issue where adversarial also works for regression
-        # TODO: Fix issue with learned entity embedding after preprocessing PR
 
         pipeline = TabularRegressionPipeline(
             dataset_properties=fit_dictionary_tabular['dataset_properties'],
-            exclude={'trainer': ['AdversarialTrainer'],
-                     'network_embedding': ['LearnedEntityEmbedding']})
+            exclude={'trainer': ['AdversarialTrainer']})
+        # ,
+        # 'network_embedding': ['LearnedEntityEmbedding']})
         cs = pipeline.get_hyperparameter_search_space()
 
         config = cs.sample_configuration()
@@ -139,11 +140,10 @@ def test_pipeline_transform(self, fit_dictionary_tabular):
         assert fit_dictionary_tabular.items() <= transformed_fit_dictionary_tabular.items()
 
         # Then the pipeline should have added the following keys
-        # Removing 'imputer', 'encoder', 'scaler', these will be
-        # TODO: added back after a PR fixing preprocessing
         expected_keys = {'tabular_transformer', 'preprocess_transforms', 'network',
                          'optimizer', 'lr_scheduler', 'train_data_loader',
-                         'val_data_loader', 'run_summary', 'feature_preprocessor'}
+                         'val_data_loader', 'run_summary', 'feature_preprocessor',
+                         'imputer', 'encoder', 'scaler'}
         assert expected_keys.issubset(set(transformed_fit_dictionary_tabular.keys()))
 
         # Then we need to have transformations being created.
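The assertions in this test rely on dictionary view semantics: dict.items() returns a set-like view, so a.items() <= b.items() holds exactly when every key/value pair of a also appears in b. A small standalone illustration; the data values are made up:

before = {'X_train': [[0.0]], 'dataset_properties': {}}
after = {**before, 'network': 'stub', 'imputer': 'stub', 'encoder': 'stub'}
assert before.items() <= after.items()                    # original pairs preserved
assert {'network', 'imputer', 'encoder'} <= after.keys()  # new keys added by the pipeline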
