
Commit

remove todos for the preprocessing PR, and apply suggestion from code review
ravinkohli committed Jul 5, 2022
1 parent ae4bd55 commit c27c76d
Showing 6 changed files with 30 additions and 19 deletions.
@@ -12,7 +12,6 @@ def __init__(self) -> None:
         self._processing = True
         self.add_fit_requirements([
             FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True),
-            FitRequirement('categories', (List,), user_defined=True, dataset_property=True)
         ])
 
     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
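For context, a FitRequirement declares a key that a pipeline component expects to find in the fit dictionary X before it can run; this hunk drops the now-unneeded 'categories' requirement. A minimal sketch of that contract follows: the FitRequirement fields match the diff above, while check_requirements is an illustrative assumption, not the repository's implementation.

from typing import Any, Dict, List, NamedTuple, Tuple

class FitRequirement(NamedTuple):
    """Declares a key that a component expects in the fit dictionary X."""
    name: str
    supported_types: Tuple  # e.g. (List,)
    user_defined: bool
    dataset_property: bool

def check_requirements(requirements: List[FitRequirement], X: Dict[str, Any]) -> None:
    # Illustrative: dataset properties live under X['dataset_properties'],
    # everything else sits at the top level of X.
    for req in requirements:
        container = X['dataset_properties'] if req.dataset_property else X
        if req.name not in container:
            raise ValueError(f"Missing fit requirement: {req.name}")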
@@ -1,4 +1,3 @@
-from math import ceil
 from typing import Any, Dict, List, Optional, Union
 
 from ConfigSpace.configuration_space import ConfigurationSpace
@@ -18,11 +17,25 @@
 from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter
 
 
-def get_num_output_dimensions(config, num_categs_per_feature):
-    """ Returns list of embedding sizes for each categorical variable.
+def get_num_output_dimensions(config: Dict[str, Any], num_categs_per_feature: List[int]) -> List[int]:
+    """
+    Returns list of embedding sizes for each categorical variable.
     Selects this adaptively based on training_datset.
     Note: Assumes there is at least one embed feature.
+
+    Args:
+        config (Dict[str, Any]):
+            contains the hyperparameters required to calculate the `num_output_dimensions`
+        num_categs_per_feature (List[int]):
+            list containing number of categories for each feature that is to be embedded,
+            0 if the column is not an embed column
+
+    Returns:
+        List[int]:
+            list containing the output embedding size for each column,
+            1 if the column is not an embed column
     """
+
     max_embedding_dim = config['max_embedding_dim']
     embed_exponent = config['embed_exponent']
     size_factor = config['embedding_size_factor']
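The body of the helper is folded out of view here; a plausible completion is sketched below. The sizing rule follows the AutoGluon-style formula this docstring mirrors (grow with num_categories ** embed_exponent, scale by size_factor, cap at max_embedding_dim); the constant 1.6 and the exact clipping are assumptions, not necessarily the hidden code.

from typing import Any, Dict, List

def get_num_output_dimensions(config: Dict[str, Any],
                              num_categs_per_feature: List[int]) -> List[int]:
    max_embedding_dim = config['max_embedding_dim']
    embed_exponent = config['embed_exponent']
    size_factor = config['embedding_size_factor']
    # Non-embed columns (0 categories) keep width 1; embed columns grow
    # sublinearly with their cardinality, scaled by size_factor and capped.
    return [int(size_factor * max(2, min(max_embedding_dim,
                                         1.6 * num_categs ** embed_exponent)))
            if num_categs > 0 else 1
            for num_categs in num_categs_per_feature]

Under these assumptions, with embed_exponent=0.56, embedding_size_factor=1.0 and max_embedding_dim=100, a feature with 10 categories would get int(max(2, min(100, 1.6 * 10 ** 0.56))) = 5 output dimensions.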
@@ -65,12 +78,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         # before passing it through the model
         concat_seq = []
 
-        x_pointer = 0
-        layer_pointer = 0
         for x_pointer, embed in enumerate(self.embed_features):
             current_feature_slice = x[:, x_pointer]
             if not embed:
-                x_pointer += 1
                 concat_seq.append(current_feature_slice.view(-1, 1))
                 continue
             current_feature_slice = current_feature_slice.to(torch.int)
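The loop above walks the input column by column: non-embedded (numerical) columns pass through as width-1 slices, while categorical columns are cast to integer codes and looked up in a learned embedding table. A self-contained sketch of that pattern; module and attribute names other than embed_features are illustrative:

import torch
import torch.nn as nn

class EntityEmbedding(nn.Module):
    """Sketch: embed categorical columns, pass numerical columns through."""

    def __init__(self, num_categories_per_col, embed_features, output_dims):
        super().__init__()
        self.embed_features = embed_features  # one bool per column, as in the diff
        self.ee_layers = nn.ModuleList(
            nn.Embedding(n_cat, dim)
            for n_cat, dim, embed in zip(num_categories_per_col, output_dims, embed_features)
            if embed)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        concat_seq = []
        layer_pointer = 0
        for x_pointer, embed in enumerate(self.embed_features):
            current_feature_slice = x[:, x_pointer]
            if not embed:
                # numerical column: forward the raw value as a width-1 slice
                concat_seq.append(current_feature_slice.view(-1, 1))
                continue
            # categorical column: integer codes -> learned embedding vectors
            current_feature_slice = current_feature_slice.to(torch.int)
            concat_seq.append(self.ee_layers[layer_pointer](current_feature_slice))
            layer_pointer += 1
        return torch.cat(concat_seq, dim=1)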
@@ -115,7 +126,7 @@ def get_hyperparameter_search_space(
                                                                      value_range=(100,),
                                                                      default_value=100),
         embedding_size_factor: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embedding_size_factor",
-                                                                                     value_range=(1.0, 0.5, 1.5, 0.7, 0.6, 0.8, 0.9, 1.1, 1.2, 1.3, 1.4),
+                                                                                     value_range=(0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5),
                                                                                      default_value=1,
                                                                                      ),
     ) -> ConfigurationSpace:
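Sorting value_range into ascending order is the code-review suggestion applied here; the factors are offered as discrete choices. A sketch of roughly what this declaration expands to in raw ConfigSpace terms, assuming a multi-value range like this one is mapped to a categorical hyperparameter:

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter

cs = ConfigurationSpace()
cs.add_hyperparameter(CategoricalHyperparameter(
    "embedding_size_factor",
    choices=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5],
    default_value=1.0))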
8 changes: 6 additions & 2 deletions autoPyTorch/pipeline/tabular_regression.py
@@ -21,6 +21,9 @@
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import (
     CoalescerChoice
 )
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.column_splitting.ColumnSplitter import (
+    ColumnSplitter
+)
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import (
     EncoderChoice
 )
@@ -235,8 +238,9 @@ def _get_pipeline_steps(
 
         steps.extend([
             ("imputer", SimpleImputer(random_state=self.random_state)),
-            ("variance_threshold", VarianceThreshold(random_state=self.random_state)),
-            ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)),
+            # ("variance_threshold", VarianceThreshold(random_state=self.random_state)),
+            # ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)),
+            ("column_splitter", ColumnSplitter(random_state=self.random_state)),
             ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
             ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
             ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,
@@ -13,8 +13,6 @@
 )
 
 
-# TODO: fix in preprocessing PR
-# @pytest.mark.skip("Skipping tests as preprocessing is not finalised")
 @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_numerical_only',
                                                     'classification_categorical_only',
                                                     'classification_numerical_and_categorical'], indirect=True)
3 changes: 1 addition & 2 deletions test/test_pipeline/components/setup/test_setup_networks.py
@@ -19,8 +19,7 @@ def head(request):
     return request.param
 
 
-# TODO: add 'LearnedEntityEmbedding' after preprocessing dix
-@pytest.fixture(params=['NoEmbedding'])
+@pytest.fixture(params=['NoEmbedding', 'LearnedEntityEmbedding'])
 def embedding(request):
     return request.param
 
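pytest instantiates every test that uses this fixture once per entry in params, so restoring 'LearnedEntityEmbedding' doubles the embedding coverage of this module. A minimal, self-contained illustration; the test body is illustrative, not from the repository:

import pytest

@pytest.fixture(params=['NoEmbedding', 'LearnedEntityEmbedding'])
def embedding(request):
    return request.param

def test_embedding_is_known(embedding):
    # runs twice, once with each fixture param
    assert embedding in {'NoEmbedding', 'LearnedEntityEmbedding'}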
12 changes: 6 additions & 6 deletions test/test_pipeline/test_tabular_regression.py
@@ -61,11 +61,12 @@ def test_pipeline_fit(self, fit_dictionary_tabular):
         """This test makes sure that the pipeline is able to fit
         given random combinations of hyperparameters across the pipeline"""
         # TODO: fix issue where adversarial also works for regression
-        # TODO: Fix issue with learned entity embedding after preprocessing PR
 
         pipeline = TabularRegressionPipeline(
             dataset_properties=fit_dictionary_tabular['dataset_properties'],
-            exclude={'trainer': ['AdversarialTrainer'],
-                     'network_embedding': ['LearnedEntityEmbedding']})
+            exclude={'trainer': ['AdversarialTrainer']})
+        # ,
+        # 'network_embedding': ['LearnedEntityEmbedding']})
         cs = pipeline.get_hyperparameter_search_space()
 
         config = cs.sample_configuration()
@@ -139,11 +140,10 @@ def test_pipeline_transform(self, fit_dictionary_tabular):
         assert fit_dictionary_tabular.items() <= transformed_fit_dictionary_tabular.items()
 
         # Then the pipeline should have added the following keys
-        # Removing 'imputer', 'encoder', 'scaler', these will be
-        # TODO: added back after a PR fixing preprocessing
         expected_keys = {'tabular_transformer', 'preprocess_transforms', 'network',
                          'optimizer', 'lr_scheduler', 'train_data_loader',
-                         'val_data_loader', 'run_summary', 'feature_preprocessor'}
+                         'val_data_loader', 'run_summary', 'feature_preprocessor',
+                         'imputer', 'encoder', 'scaler'}
         assert expected_keys.issubset(set(transformed_fit_dictionary_tabular.keys()))
 
         # Then we need to have transformations being created.
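The assertions in this test rely on dictionary view semantics: dict.items() returns a set-like view, so a.items() <= b.items() holds exactly when every key/value pair of a also appears in b. A small standalone illustration; the data values are made up:

before = {'X_train': [[0.0]], 'dataset_properties': {}}
after = {**before, 'network': 'stub', 'imputer': 'stub', 'encoder': 'stub'}
assert before.items() <= after.items()                    # original pairs preserved
assert {'network', 'imputer', 'encoder'} <= after.keys()  # new keys added by the pipeline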
