Reg cocktails apt1.0+reg cocktails pytorch embedding reduced #454

Merged
@@ -1,9 +1,10 @@
from math import ceil
from typing import Any, Dict, List, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
    CategoricalHyperparameter,
    UniformFloatHyperparameter,
    UniformIntegerHyperparameter,
)

import numpy as np
@@ -16,6 +17,36 @@
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter


def get_num_output_dimensions(config: Dict[str, Any], num_categs_per_feature: List[int]) -> List[int]:
    """
    Returns a list of embedding sizes for each categorical variable,
    selected adaptively based on the training dataset.
    Note: assumes there is at least one feature to embed.

    Args:
        config (Dict[str, Any]):
            contains the hyperparameters required to calculate `num_output_dimensions`
        num_categs_per_feature (List[int]):
            list containing the number of categories for each feature that is to be embedded,
            0 if the column is not an embed column

    Returns:
        List[int]:
            list containing the output embedding size for each column,
            1 if the column is not an embed column
    """
    max_embedding_dim = config['max_embedding_dim']
    embed_exponent = config['embed_exponent']
    size_factor = config['embedding_size_factor']
    num_output_dimensions = [
        int(size_factor * max(2, min(max_embedding_dim, 1.6 * num_categories ** embed_exponent)))
        if num_categories > 0 else 1
        for num_categories in num_categs_per_feature
    ]
    return num_output_dimensions
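As a quick sanity check of the sizing rule, worked out by hand from the formula above (the config dict is an illustrative stand-in that uses the default values proposed in the search space further down, not a sampled configuration):

    config = {'embed_exponent': 0.56, 'max_embedding_dim': 100, 'embedding_size_factor': 1.0}
    # one numerical column (0 categories), one small and one large categorical column:
    get_num_output_dimensions(config, [0, 2, 1000])
    # -> [1, 2, 76]: int(1.6 * 2**0.56) == 2 and int(1.6 * 1000**0.56) == 76,
    # both below the max_embedding_dim cap of 100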


class _LearnedEntityEmbedding(nn.Module):
""" Learned entity embedding module for categorical features"""

@@ -35,9 +66,7 @@ def __init__(self, config: Dict[str, Any], num_categories_per_col: np.ndarray, n

        self.num_embed_features = self.num_categories_per_col[self.embed_features]

        self.num_output_dimensions = [1] * num_features_excl_embed
        self.num_output_dimensions.extend([ceil(config["dimension_reduction_" + str(i)] * num_in) for i, num_in in
                                           enumerate(self.num_embed_features)])
        self.num_output_dimensions = get_num_output_dimensions(config, self.num_categories_per_col)

        self.num_out_feats = num_features_excl_embed + sum(self.num_output_dimensions)

@@ -48,12 +77,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
        # before passing it through the model
        concat_seq = []

        x_pointer = 0
        layer_pointer = 0
        for x_pointer, embed in enumerate(self.embed_features):
            current_feature_slice = x[:, x_pointer]
            if not embed:
                x_pointer += 1
                concat_seq.append(current_feature_slice.view(-1, 1))
                continue
            current_feature_slice = current_feature_slice.to(torch.int)
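To see the pattern in isolation (the rest of the loop is folded out of this diff), here is a minimal, self-contained sketch; the names embed_features and ee_layer are illustrative stand-ins, not the module's real internals:

    import torch
    import torch.nn as nn

    embed_features = [False, True]                     # column 1 is categorical
    ee_layer = nn.Embedding(num_embeddings=3, embedding_dim=2)
    x = torch.tensor([[0.5, 2.0], [1.5, 0.0]])

    concat_seq = []
    for x_pointer, embed in enumerate(embed_features):
        current_feature_slice = x[:, x_pointer]
        if not embed:
            # numerical columns pass through unchanged
            concat_seq.append(current_feature_slice.view(-1, 1))
            continue
        # categorical columns are cast to int and looked up in the embedding table
        concat_seq.append(ee_layer(current_feature_slice.to(torch.int)))
    out = torch.cat(concat_seq, dim=1)                 # shape: (2, 1 + 2)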
@@ -91,28 +118,24 @@ def build_embedding(self, num_categories_per_col: np.ndarray, num_features_excl_
    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dimension_reduction",
                                                                                   value_range=(0, 1),
                                                                                   default_value=0.5),
        embed_exponent: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embed_exponent",
                                                                              value_range=(0.56,),
                                                                              default_value=0.56),
        max_embedding_dim: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="max_embedding_dim",
                                                                                 value_range=(100,),
                                                                                 default_value=100),
        embedding_size_factor: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embedding_size_factor",
                                                                                     value_range=(0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5),
                                                                                     default_value=1,
                                                                                     ),
    ) -> ConfigurationSpace:
        cs = ConfigurationSpace()
        if dataset_properties is not None:
            for i in range(len(dataset_properties['categorical_columns'])
                           if isinstance(dataset_properties['categorical_columns'], List) else 0):
                # Currently, as we don't have information about the embedding columns,
                # we search over more dimensions than necessary. This can be solved by
                # not having `min_unique_values_for_embedding` as a hyperparameter and
                # instead passing it as a parameter to the feature validator, which
                # allows us to pass embed_columns to the dataset properties.
                # TODO: test the trade-off.
                # Another solution is to combine `OneHotEncoding`, `Embedding` and `NoEncoding`
                # into one custom transformer. This would also allow users to use the
                # transformer outside the pipeline.
                ee_dimensions_search_space = HyperparameterSearchSpace(hyperparameter="dimension_reduction_" + str(i),
                                                                       value_range=dimension_reduction.value_range,
                                                                       default_value=dimension_reduction.default_value,
                                                                       log=dimension_reduction.log)
                add_hyperparameter(cs, ee_dimensions_search_space, UniformFloatHyperparameter)
            if len(dataset_properties['categorical_columns']) > 0:
                add_hyperparameter(cs, embed_exponent, UniformFloatHyperparameter)
                add_hyperparameter(cs, max_embedding_dim, UniformIntegerHyperparameter)
                add_hyperparameter(cs, embedding_size_factor, CategoricalHyperparameter)

        return cs

    @staticmethod
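A minimal sketch of inspecting the new search space (the import path is assumed from the repository layout, and the dataset_properties dict is an illustrative stand-in with a single categorical column, not a real dataset's properties):

    from autoPyTorch.pipeline.components.setup.network_embedding.LearnedEntityEmbedding import (
        _LearnedEntityEmbedding
    )

    cs = _LearnedEntityEmbedding.get_hyperparameter_search_space(
        dataset_properties={'categorical_columns': [0]})
    print(cs)  # should now contain embed_exponent, max_embedding_dim and embedding_size_factor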
3 changes: 3 additions & 0 deletions autoPyTorch/pipeline/tabular_regression.py
@@ -20,6 +20,9 @@
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.column_splitting.ColumnSplitter import (
    ColumnSplitter
)
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import (
    EncoderChoice
)
@@ -13,8 +13,6 @@
)


# TODO: fix in preprocessing PR
# @pytest.mark.skip("Skipping tests as preprocessing is not finalised")
@pytest.mark.parametrize("fit_dictionary_tabular", ['classification_numerical_only',
                                                    'classification_categorical_only',
                                                    'classification_numerical_and_categorical'], indirect=True)
1 change: 0 additions & 1 deletion test/test_pipeline/components/setup/test_setup_networks.py
@@ -19,7 +19,6 @@ def head(request):
    return request.param


# TODO: add 'LearnedEntityEmbedding' after preprocessing fix
@pytest.fixture(params=['NoEmbedding', 'LearnedEntityEmbedding'])
def embedding(request):
    return request.param
16 changes: 6 additions & 10 deletions test/test_pipeline/test_tabular_regression.py
@@ -61,11 +61,10 @@ def test_pipeline_fit(self, fit_dictionary_tabular):
"""This test makes sure that the pipeline is able to fit
given random combinations of hyperparameters across the pipeline"""
# TODO: fix issue where adversarial also works for regression
# TODO: Fix issue with learned entity embedding after preprocessing PR

pipeline = TabularRegressionPipeline(
dataset_properties=fit_dictionary_tabular['dataset_properties'],
exclude={'trainer': ['AdversarialTrainer'],
'network_embedding': ['LearnedEntityEmbedding']})
exclude={'trainer': ['AdversarialTrainer']})
cs = pipeline.get_hyperparameter_search_space()

config = cs.sample_configuration()
@@ -91,8 +90,7 @@ def test_pipeline_predict(self, fit_dictionary_tabular):
        X = fit_dictionary_tabular['X_train'].copy()
        pipeline = TabularRegressionPipeline(
            dataset_properties=fit_dictionary_tabular['dataset_properties'],
            exclude={'trainer': ['AdversarialTrainer'],
                     'network_embedding': ['LearnedEntityEmbedding']})
            exclude={'trainer': ['AdversarialTrainer']})

        cs = pipeline.get_hyperparameter_search_space()
        config = cs.sample_configuration()
@@ -121,8 +119,7 @@ def test_pipeline_transform(self, fit_dictionary_tabular):

        pipeline = TabularRegressionPipeline(
            dataset_properties=fit_dictionary_tabular['dataset_properties'],
            exclude={'trainer': ['AdversarialTrainer'],
                     'network_embedding': ['LearnedEntityEmbedding']})
            exclude={'trainer': ['AdversarialTrainer']})
        cs = pipeline.get_hyperparameter_search_space()
        config = cs.sample_configuration()
        pipeline.set_hyperparameters(config)
@@ -139,11 +136,10 @@ def test_pipeline_transform(self, fit_dictionary_tabular):
        assert fit_dictionary_tabular.items() <= transformed_fit_dictionary_tabular.items()

        # Then the pipeline should have added the following keys
        # Removing 'imputer', 'encoder', 'scaler', these will be
        # TODO: added back after a PR fixing preprocessing
        expected_keys = {'tabular_transformer', 'preprocess_transforms', 'network',
                         'optimizer', 'lr_scheduler', 'train_data_loader',
                         'val_data_loader', 'run_summary', 'feature_preprocessor'}
                         'val_data_loader', 'run_summary', 'feature_preprocessor',
                         'imputer', 'encoder', 'scaler'}
        assert expected_keys.issubset(set(transformed_fit_dictionary_tabular.keys()))

        # Then we need to have transformations being created.
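For context, an end-to-end sketch assembled from the calls exercised in the tests above (fit_dictionary_tabular is a pytest fixture in the test suite; outside of it, a dict with 'dataset_properties', 'X_train' and the other fit requirements would have to be built by hand):

    from autoPyTorch.pipeline.tabular_regression import TabularRegressionPipeline

    pipeline = TabularRegressionPipeline(
        dataset_properties=fit_dictionary_tabular['dataset_properties'],
        exclude={'trainer': ['AdversarialTrainer']})
    cs = pipeline.get_hyperparameter_search_space()
    config = cs.sample_configuration()
    pipeline.set_hyperparameters(config)
    pipeline.fit(fit_dictionary_tabular)  # assumption: fit consumes the fit dictionary, as in the tests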