From d85c16e5cb25009f39740b09feee306cc4ff3e58 Mon Sep 17 00:00:00 2001
From: chico <francisco.rivera.valverde@gmail.com>
Date: Tue, 25 May 2021 23:04:40 +0200
Subject: [PATCH 1/9] [ADD] first push of coalescer

---
 .../coalescer/MinorityCoalescer.py            |  52 +++++++
 .../coalescer/NoCoalescer.py                  |  51 +++++++
 .../coalescer/__init__.py                     | 137 ++++++++++++++++++
 .../coalescer/base_coalescer.py               |  32 ++++
 autoPyTorch/utils/implementations.py          |  79 +++++++++-
 .../components/preprocessing/test_coalesce.py |  53 +++++++
 test/test_utils/test_coalescer.py             |  90 ++++++++++++
 7 files changed, 493 insertions(+), 1 deletion(-)
 create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py
 create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py
 create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
 create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py
 create mode 100644 test/test_pipeline/components/preprocessing/test_coalesce.py
 create mode 100644 test/test_utils/test_coalescer.py

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py
new file mode 100644
index 000000000..a770a58a9
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py
@@ -0,0 +1,52 @@
+from typing import Any, Dict, Optional, Union
+
+from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import (
+    UniformFloatHyperparameter,
+)
+
+import numpy as np
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer
+from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter
+from autoPyTorch.utils.implementations import MinorityCoalescing
+
+
+class MinorityCoalescer(BaseCoalescer):
+    """
+    Groups together classes in a categorical feature if the frequency
+    of occurrence is less than minimum_fraction
+    """
+    def __init__(self, minimum_fraction: float, random_state: Optional[Union[np.random.RandomState, int]] = None):
+        super().__init__()
+        self.minimum_fraction = minimum_fraction
+        self.random_state = random_state
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer:
+
+        self.check_requirements(X, y)
+
+        self.preprocessor['categorical'] = MinorityCoalescing(minimum_fraction=self.minimum_fraction)
+        return self
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
+        return {
+            'shortname': 'MinorityCoalescer',
+            'name': 'Minority Feature-class coalescer',
+            'handles_sparse': False
+        }
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[Dict] = None,
+        minimum_fraction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="minimum_fraction",
+                                                                                value_range=(0.0001, 0.5),
+                                                                                default_value=0.01,
+                                                                                log=True),
+    ) -> ConfigurationSpace:
+        cs = ConfigurationSpace()
+
+        add_hyperparameter(cs, minimum_fraction, UniformFloatHyperparameter)
+
+        return cs
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py
new file mode 100644
index 000000000..da93bbe4b
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py
@@ -0,0 +1,51 @@
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer
+
+
+class NoCoalescer(BaseCoalescer):
+    """
+    Don't perform coalescing on categorical features
+    """
+    def __init__(self,
+                 random_state: Optional[Union[np.random.RandomState, int]] = None
+                 ):
+        super().__init__()
+        self.random_state = random_state
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer:
+        """
+        Checks that the fit requirements are met. No coalescing is
+        performed, so there is no underlying model to fit.
+        Args:
+            X (Dict[str, Any]): fit dictionary
+            y (Any): only forwarded to the requirement check
+
+        Returns:
+            instance of self
+        """
+        self.check_requirements(X, y)
+
+        return self
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Adds the preprocessor to the 'X' dictionary under the 'coalescer' key and returns it.
+        Args:
+            X (Dict[str, Any]): 'X' dictionary
+
+        Returns:
+            (Dict[str, Any]): the updated 'X' dictionary
+        """
+        X.update({'coalescer': self.preprocessor})
+        return X
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
+        return {
+            'shortname': 'NoCoalescer',
+            'name': 'No Coalescer',
+            'handles_sparse': True
+        }
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
new file mode 100644
index 000000000..b39e87a3e
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
@@ -0,0 +1,137 @@
+import os
+from collections import OrderedDict
+from typing import Any, Dict, List, Optional
+
+import ConfigSpace.hyperparameters as CSH
+from ConfigSpace.configuration_space import ConfigurationSpace
+
+from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice
+from autoPyTorch.pipeline.components.base_component import (
+    ThirdPartyComponents,
+    autoPyTorchComponent,
+    find_components,
+)
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer
+
+
+coalescer_directory = os.path.split(__file__)[0]
+_coalescer = find_components(__package__,
+                             coalescer_directory,
+                             BaseCoalescer)
+_addons = ThirdPartyComponents(BaseCoalescer)
+
+
+def add_coalescer(coalescer: BaseCoalescer) -> None:
+    _addons.add_component(coalescer)
+
+
+class CoalescerChoice(autoPyTorchChoice):
+    """
+    Allows for dynamically choosing coalescer component at runtime
+    """
+
+    def get_components(self) -> Dict[str, autoPyTorchComponent]:
+        """Returns the available coalescer components
+
+        Args:
+            None
+
+        Returns:
+            Dict[str, autoPyTorchComponent]: all BaseCoalescer components available
+                as choices for coalescing the categorical columns
+        """
+        components = OrderedDict()
+        components.update(_coalescer)
+        components.update(_addons.components)
+        return components
+
+    def get_hyperparameter_search_space(self,
+                                        dataset_properties: Optional[Dict[str, Any]] = None,
+                                        default: Optional[str] = None,
+                                        include: Optional[List[str]] = None,
+                                        exclude: Optional[List[str]] = None) -> ConfigurationSpace:
+        cs = ConfigurationSpace()
+
+        if dataset_properties is None:
+            dataset_properties = dict()
+
+        dataset_properties = {**self.dataset_properties, **dataset_properties}
+
+        available_preprocessors = self.get_available_components(dataset_properties=dataset_properties,
+                                                                include=include,
+                                                                exclude=exclude)
+
+        if len(available_preprocessors) == 0:
+            raise ValueError("no coalescer found, please add a coalescer")
+
+        if default is None:
+            defaults = ['MinorityCoalescer', 'NoCoalescer']
+            for default_ in defaults:
+                if default_ in available_preprocessors:
+                    if include is not None and default_ not in include:
+                        continue
+                    if exclude is not None and default_ in exclude:
+                        continue
+                    default = default_
+                    break
+
+        updates = self._get_search_space_updates()
+        if '__choice__' in updates.keys():
+            choice_hyperparameter = updates['__choice__']
+            if not set(choice_hyperparameter.value_range).issubset(available_preprocessors):
+                raise ValueError("Expected given update for {} to have "
+                                 "choices in {} got {}".format(self.__class__.__name__,
+                                                               available_preprocessors,
+                                                               choice_hyperparameter.value_range))
+            if len(dataset_properties['categorical_columns']) == 0:
+                assert len(choice_hyperparameter.value_range) == 1
+                assert 'MinorityCoalescer' in choice_hyperparameter.value_range, \
+                    "Provided {} in choices, however, the dataset " \
+                    "is incompatible with it".format(choice_hyperparameter.value_range)
+
+            preprocessor = CSH.CategoricalHyperparameter('__choice__',
+                                                         choice_hyperparameter.value_range,
+                                                         default_value=choice_hyperparameter.default_value)
+        else:
+            # add only no coalescer to choice hyperparameters in case the dataset is only numerical
+            if len(dataset_properties['categorical_columns']) == 0:
+                default = 'NoCoalescer'
+                if include is not None and default not in include:
+                    raise ValueError("Provided {} in include, however, the dataset "
+                                     "is incompatible with it".format(include))
+                preprocessor = CSH.CategoricalHyperparameter('__choice__',
+                                                             ['NoCoalescer'],
+                                                             default_value=default)
+            else:
+                preprocessor = CSH.CategoricalHyperparameter('__choice__',
+                                                             list(available_preprocessors.keys()),
+                                                             default_value=default)
+
+        cs.add_hyperparameter(preprocessor)
+
+        # add only child hyperparameters of early_preprocessor choices
+        for name in preprocessor.choices:
+            preprocessor_configuration_space = available_preprocessors[name].\
+                get_hyperparameter_search_space(dataset_properties)
+            parent_hyperparameter = {'parent': preprocessor, 'value': name}
+            cs.add_configuration_space(name, preprocessor_configuration_space,
+                                       parent_hyperparameter=parent_hyperparameter)
+
+        self.configuration_space = cs
+        self.dataset_properties = dataset_properties
+        return cs
+
+    def _check_dataset_properties(self, dataset_properties: Dict[str, Any]) -> None:
+        """
+        A mechanism in code to ensure the correctness of the fit dictionary
+        It recursively makes sure that the children and parent level requirements
+        are honored before fit.
+        Args:
+            dataset_properties:
+
+        """
+        super()._check_dataset_properties(dataset_properties)
+        assert 'numerical_columns' in dataset_properties.keys(), \
+            "Dataset properties must contain information about numerical columns"
+        assert 'categorical_columns' in dataset_properties.keys(), \
+            "Dataset properties must contain information about categorical columns"
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py
new file mode 100644
index 000000000..280ddd979
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py
@@ -0,0 +1,32 @@
+from typing import Any, Dict, List
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import (
+    autoPyTorchTabularPreprocessingComponent
+)
+from autoPyTorch.utils.common import FitRequirement
+
+
+class BaseCoalescer(autoPyTorchTabularPreprocessingComponent):
+    """
+    Base class for coalescing
+    """
+    def __init__(self) -> None:
+        super().__init__()
+        self.add_fit_requirements([
+            FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True),
+            FitRequirement('categories', (List,), user_defined=True, dataset_property=True)])
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Adds the preprocessor to the 'X' dictionary under the 'coalescer' key and returns it.
+        Args:
+            X (Dict[str, Any]): 'X' dictionary
+
+        Returns:
+            (Dict[str, Any]): the updated 'X' dictionary
+        """
+        if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
+            raise ValueError("cant call transform on {} without fitting first."
+                             .format(self.__class__.__name__))
+        X.update({'coalescer': self.preprocessor})
+        return X
diff --git a/autoPyTorch/utils/implementations.py b/autoPyTorch/utils/implementations.py
index 2130cfd6b..e78baa5e4 100644
--- a/autoPyTorch/utils/implementations.py
+++ b/autoPyTorch/utils/implementations.py
@@ -1,7 +1,11 @@
-from typing import Any, Callable, Dict, Type, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Type, Union
 
 import numpy as np
 
+from scipy import sparse
+
+from sklearn.base import BaseEstimator, TransformerMixin
+
 import torch
 
 
@@ -61,3 +65,76 @@ def __call__(self, y: Union[np.ndarray, torch.Tensor]) -> np.ndarray:
     @staticmethod
     def get_properties() -> Dict[str, Any]:
         return {'supported_losses': ['BCEWithLogitsLoss']}
+
+
+class MinorityCoalescing(BaseEstimator, TransformerMixin):
+    """ Group together categories whose occurrence is less than a specified
+    minimum fraction. Coalesced categories are replaced by the value -2.
+    """
+
+    def __init__(self, minimum_fraction: Optional[float] = None):
+        self.minimum_fraction = minimum_fraction
+
+    def check_X(self, X: np.array) -> None:
+        X_data = X.data if sparse.issparse(X) else X
+        if np.nanmin(X_data) <= -2:
+            raise ValueError("X needs to contain only integers greater than -2.")
+
+    def fit(self, X: np.array, y: Optional[np.ndarray] = None) -> 'MinorityCoalescing':
+        self.check_X(X)
+
+        if self.minimum_fraction is None:
+            return self
+
+        # Remember which values should not be coalesced
+        do_not_coalesce: List[Set[int]] = list()
+        for column in range(X.shape[1]):
+            do_not_coalesce.append(set())
+
+            if sparse.issparse(X):
+                indptr_start = X.indptr[column]
+                indptr_end = X.indptr[column + 1]
+                unique, counts = np.unique(
+                    X.data[indptr_start:indptr_end], return_counts=True)
+                colsize = indptr_end - indptr_start
+            else:
+                unique, counts = np.unique(X[:, column], return_counts=True)
+                colsize = X.shape[0]
+
+            for unique_value, count in zip(unique, counts):
+                fraction = float(count) / colsize
+                if fraction >= self.minimum_fraction:
+                    do_not_coalesce[-1].add(unique_value)
+
+        self.do_not_coalesce_ = do_not_coalesce
+        return self
+
+    def transform(self, X: np.ndarray) -> np.ndarray:
+        self.check_X(X)
+
+        if self.minimum_fraction is None:
+            return X
+
+        for column in range(X.shape[1]):
+            if sparse.issparse(X):
+                indptr_start = X.indptr[column]
+                indptr_end = X.indptr[column + 1]
+                unique = np.unique(X.data[indptr_start:indptr_end])
+                for unique_value in unique:
+                    if unique_value not in self.do_not_coalesce_[column]:
+                        indptr_start = X.indptr[column]
+                        indptr_end = X.indptr[column + 1]
+                        X.data[indptr_start:indptr_end][
+                            X.data[indptr_start:indptr_end] == unique_value] = -2
+            else:
+                unique = np.unique(X[:, column])
+                unique_values = [unique_value for unique_value in unique
+                                 if unique_value not in self.do_not_coalesce_[column]]
+                mask = np.isin(X[:, column], unique_values)
+                # The imputer uses -1 for unknown categories
+                # Then -2 means coalesced categories
+                X[mask, column] = -2
+        return X
+
+    def fit_transform(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray:
+        return self.fit(X, y).transform(X)
diff --git a/test/test_pipeline/components/preprocessing/test_coalesce.py b/test/test_pipeline/components/preprocessing/test_coalesce.py
new file mode 100644
index 000000000..9aa73880f
--- /dev/null
+++ b/test/test_pipeline/components/preprocessing/test_coalesce.py
@@ -0,0 +1,53 @@
+import copy
+import unittest
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import (
+    CoalescerChoice
+)
+
+
+class TestCoalescerChoice(unittest.TestCase):
+    def test_get_set_config_space(self):
+        """Make sure that we can setup a valid choice in the Coalescer
+        choice"""
+        dataset_properties = {'numerical_columns': list(range(4)), 'categorical_columns': [5]}
+        coalescer_choice = CoalescerChoice(dataset_properties)
+        cs = coalescer_choice.get_hyperparameter_search_space()
+
+        # Make sure that all hyperparameters are part of the search space
+        self.assertListEqual(
+            sorted(cs.get_hyperparameter('__choice__').choices),
+            sorted(list(coalescer_choice.get_components().keys()))
+        )
+
+        # Make sure we can properly set some random configs
+        # Whereas just one iteration will make sure the algorithm works,
+        # doing five iterations increases the confidence. We will be able to
+        # catch component specific crashes
+        for i in range(5):
+            config = cs.sample_configuration()
+            config_dict = copy.deepcopy(config.get_dictionary())
+            coalescer_choice.set_hyperparameters(config)
+
+            self.assertEqual(coalescer_choice.choice.__class__,
+                             coalescer_choice.get_components()[config_dict['__choice__']])
+
+            # Then check the choice configuration
+            selected_choice = config_dict.pop('__choice__', None)
+            for key, value in config_dict.items():
+                # Remove the selected_choice string from the parameter
+                # so we can query in the object for it
+                key = key.replace(selected_choice + ':', '')
+                self.assertIn(key, vars(coalescer_choice.choice))
+                self.assertEqual(value, coalescer_choice.choice.__dict__[key])
+
+    def test_only_numerical(self):
+        dataset_properties = {'numerical_columns': list(range(4)), 'categorical_columns': []}
+
+        chooser = CoalescerChoice(dataset_properties)
+        configspace = chooser.get_hyperparameter_search_space().sample_configuration().get_dictionary()
+        self.assertEqual(configspace['__choice__'], 'NoCoalescer')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/test_utils/test_coalescer.py b/test/test_utils/test_coalescer.py
new file mode 100644
index 000000000..482ae74de
--- /dev/null
+++ b/test/test_utils/test_coalescer.py
@@ -0,0 +1,90 @@
+import numpy as np
+
+import pytest
+
+import scipy.sparse
+
+from autoPyTorch.utils.implementations import MinorityCoalescing
+
+
+@pytest.fixture
+def X1():
+    # Generates an array with categories 3, 4, 5, 6, 7 and occurrences of 30%,
+    # 30%, 30%, 5% and 5% respectively
+    X = np.vstack((
+        np.ones((30, 10)) * 3,
+        np.ones((30, 10)) * 4,
+        np.ones((30, 10)) * 5,
+        np.ones((5, 10)) * 6,
+        np.ones((5, 10)) * 7,
+    ))
+    for col in range(X.shape[1]):
+        np.random.shuffle(X[:, col])
+    return X
+
+
+@pytest.fixture
+def X2():
+    # Generates an array with categories 3, 4, 5, 6, 7 and occurrences of 5%,
+    # 5%, 5%, 35% and 50% respectively
+    X = np.vstack((
+        np.ones((5, 10)) * 3,
+        np.ones((5, 10)) * 4,
+        np.ones((5, 10)) * 5,
+        np.ones((35, 10)) * 6,
+        np.ones((50, 10)) * 7,
+    ))
+    for col in range(X.shape[1]):
+        np.random.shuffle(X[:, col])
+    return X
+
+
+def test_default(X1):
+    X = X1
+    X_copy = np.copy(X)
+    Y = MinorityCoalescing().fit_transform(X)
+    np.testing.assert_array_almost_equal(Y, X_copy)
+    # Assert no copies were made
+    assert id(X) == id(Y)
+
+
+def test_coalesce_10_percent(X1):
+    X = X1
+    Y = MinorityCoalescing(minimum_fraction=.1).fit_transform(X)
+    for col in range(Y.shape[1]):
+        hist = np.histogram(Y[:, col], bins=np.arange(-2, 7))
+        np.testing.assert_array_almost_equal(hist[0], [10, 0, 0, 0, 0, 30, 30, 30])
+    # Assert no copies were made
+    assert id(X) == id(Y)
+
+
+def test_coalesce_10_percent_sparse(X1):
+    X = scipy.sparse.csc_matrix(X1)
+    Y = MinorityCoalescing(minimum_fraction=.1).fit_transform(X)
+    # Assert no copies were made
+    assert id(X) == id(Y)
+    Y = Y.todense()
+    for col in range(Y.shape[1]):
+        hist = np.histogram(Y[:, col], bins=np.arange(-2, 7))
+        np.testing.assert_array_almost_equal(hist[0], [10, 0, 0, 0, 0, 30, 30, 30])
+
+
+def test_invalid_X(X1):
+    X = X1 - 5
+    with pytest.raises(ValueError):
+        MinorityCoalescing().fit_transform(X)
+
+
+def test_transform_after_fit(X1, X2):
+    # On both X_fit and X_transf, the categories 3, 4, 5, 6, 7 are present.
+    X_fit = X1  # Here categories 3, 4, 5 have occurrence above 10%
+    X_transf = X2  # Here it is the opposite, just categs 6 and 7 are above 10%
+
+    mc = MinorityCoalescing(minimum_fraction=.1).fit(X_fit)
+
+    # transform() should coalesce categories as learned during fit.
+    # Category distribution in X_transf should be irrelevant.
+    Y = mc.transform(X_transf)
+    for col in range(Y.shape[1]):
+        hist = np.histogram(Y[:, col], bins=np.arange(-2, 7))
+        np.testing.assert_array_almost_equal(hist[0], [85, 0, 0, 0, 0, 5, 5, 5])

From ae0652e623f755489c67ff128fe89a4f377aeb45 Mon Sep 17 00:00:00 2001
From: chico <francisco.rivera.valverde@gmail.com>
Date: Tue, 25 May 2021 23:13:57 +0200
Subject: [PATCH 2/9] Add coalescer

---
 autoPyTorch/pipeline/tabular_classification.py | 4 ++++
 autoPyTorch/pipeline/tabular_regression.py     | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py
index 65abcf3c2..a0216a79a 100644
--- a/autoPyTorch/pipeline/tabular_classification.py
+++ b/autoPyTorch/pipeline/tabular_classification.py
@@ -20,6 +20,9 @@
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import (
     TabularColumnTransformer
 )
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import (
+    CoalescerChoice
+)
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import (
     EncoderChoice
 )
@@ -277,6 +280,7 @@ def _get_pipeline_steps(
 
         steps.extend([
             ("imputer", SimpleImputer(random_state=self.random_state)),
+            ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)),
             ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
             ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
             ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,
diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py
index 1aa17a593..e613b11f0 100644
--- a/autoPyTorch/pipeline/tabular_regression.py
+++ b/autoPyTorch/pipeline/tabular_regression.py
@@ -19,6 +19,9 @@
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import (
     TabularColumnTransformer
 )
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import (
+    CoalescerChoice
+)
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import (
     EncoderChoice
 )
@@ -219,6 +222,7 @@ def _get_pipeline_steps(
 
         steps.extend([
             ("imputer", SimpleImputer(random_state=self.random_state)),
+            ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)),
             ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
             ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
             ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,

From 90c71eb2264c2687f7536a3ff4dccfb4ae2a9b10 Mon Sep 17 00:00:00 2001
From: chico <francisco.rivera.valverde@gmail.com>
Date: Tue, 1 Jun 2021 14:12:37 +0200
Subject: [PATCH 3/9] Fix Unit test

---
 .../tabular_preprocessing/coalescer/__init__.py             | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
index b39e87a3e..67ad64f5d 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
@@ -65,7 +65,7 @@ def get_hyperparameter_search_space(self,
             raise ValueError("no coalescer found, please add a coalescer")
 
         if default is None:
-            defaults = ['MinorityCoalescer', 'NoCoalescer']
+            defaults = ['NoCoalescer', 'MinorityCoalescer']
             for default_ in defaults:
                 if default_ in available_preprocessors:
                     if include is not None and default_ not in include:
@@ -111,8 +111,10 @@ def get_hyperparameter_search_space(self,
 
         # add only child hyperparameters of early_preprocessor choices
         for name in preprocessor.choices:
+            updates = self._get_search_space_updates(prefix=name)
             preprocessor_configuration_space = available_preprocessors[name].\
-                get_hyperparameter_search_space(dataset_properties)
+                get_hyperparameter_search_space(dataset_properties,
+                                                **updates)
             parent_hyperparameter = {'parent': preprocessor, 'value': name}
             cs.add_configuration_space(name, preprocessor_configuration_space,
                                        parent_hyperparameter=parent_hyperparameter)

From 43213bb91333a4000a8722fa8197e345a005d65f Mon Sep 17 00:00:00 2001
From: chico <francisco.rivera.valverde@gmail.com>
Date: Tue, 1 Jun 2021 14:44:01 +0200
Subject: [PATCH 4/9] Further fixes

---
 autoPyTorch/configs/greedy_portfolio.json             |  3 ++-
 .../tabular_preprocessing/coalescer/__init__.py       |  4 +++-
 test/test_api/.tmp_api/runhistory.json                | 11 ++++++++++-
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/autoPyTorch/configs/greedy_portfolio.json b/autoPyTorch/configs/greedy_portfolio.json
index a8e640a4e..341a034eb 100644
--- a/autoPyTorch/configs/greedy_portfolio.json
+++ b/autoPyTorch/configs/greedy_portfolio.json
@@ -1,5 +1,6 @@
 [{"data_loader:batch_size": 60,
  "encoder:__choice__": "OneHotEncoder",
+ "coalescer:__choice__": "NoCoalescer",
  "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
  "imputer:categorical_strategy": "most_frequent",
  "imputer:numerical_strategy": "mean",
@@ -506,4 +507,4 @@
  "network_backbone:ShapedResNetBackbone:max_shake_drop_probability": 0.034431265307095615,
  "network_head:fully_connected:activation": "relu",
  "network_head:fully_connected:units_layer_1": 128,
- "network_backbone:ShapedResNetBackbone:max_dropout": 0.6296079567189131}]
\ No newline at end of file
+ "network_backbone:ShapedResNetBackbone:max_dropout": 0.6296079567189131}]
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
index 67ad64f5d..13fc53dd2 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
@@ -112,8 +112,10 @@ def get_hyperparameter_search_space(self,
         # add only child hyperparameters of early_preprocessor choices
         for name in preprocessor.choices:
             updates = self._get_search_space_updates(prefix=name)
+            # Call arg is ignored on mypy as the search space dynamically
+            # provides different args
             preprocessor_configuration_space = available_preprocessors[name].\
-                get_hyperparameter_search_space(dataset_properties,
+                get_hyperparameter_search_space(dataset_properties,  # type:ignore[call-arg]
                                                 **updates)
             parent_hyperparameter = {'parent': preprocessor, 'value': name}
             cs.add_configuration_space(name, preprocessor_configuration_space,
diff --git a/test/test_api/.tmp_api/runhistory.json b/test/test_api/.tmp_api/runhistory.json
index 6f61e1395..d065c431f 100644
--- a/test/test_api/.tmp_api/runhistory.json
+++ b/test/test_api/.tmp_api/runhistory.json
@@ -705,6 +705,7 @@
     "1": {
       "data_loader:batch_size": 64,
       "encoder:__choice__": "NoEncoder",
+      "coalescer:__choice__": "NoCoalescer",
       "feature_preprocessor:__choice__": "NoFeaturePreprocessor",
       "imputer:numerical_strategy": "mean",
       "lr_scheduler:__choice__": "ReduceLROnPlateau",
@@ -737,6 +738,7 @@
     "2": {
       "data_loader:batch_size": 101,
       "encoder:__choice__": "NoEncoder",
+      "coalescer:__choice__": "NoCoalescer",
       "feature_preprocessor:__choice__": "PowerTransformer",
       "imputer:numerical_strategy": "most_frequent",
       "lr_scheduler:__choice__": "CyclicLR",
@@ -801,6 +803,7 @@
     "3": {
       "data_loader:batch_size": 242,
       "encoder:__choice__": "NoEncoder",
+      "coalescer:__choice__": "NoCoalescer",
       "feature_preprocessor:__choice__": "RandomKitchenSinks",
       "imputer:numerical_strategy": "median",
       "lr_scheduler:__choice__": "NoScheduler",
@@ -831,6 +834,7 @@
     "4": {
       "data_loader:batch_size": 115,
       "encoder:__choice__": "NoEncoder",
+      "coalescer:__choice__": "NoCoalescer",
       "feature_preprocessor:__choice__": "Nystroem",
       "imputer:numerical_strategy": "median",
       "lr_scheduler:__choice__": "CosineAnnealingLR",
@@ -864,6 +868,7 @@
     "5": {
       "data_loader:batch_size": 185,
       "encoder:__choice__": "NoEncoder",
+      "coalescer:__choice__": "NoCoalescer",
       "feature_preprocessor:__choice__": "RandomKitchenSinks",
       "imputer:numerical_strategy": "median",
       "lr_scheduler:__choice__": "ReduceLROnPlateau",
@@ -904,6 +909,7 @@
     "6": {
       "data_loader:batch_size": 95,
       "encoder:__choice__": "NoEncoder",
+      "coalescer:__choice__": "NoCoalescer",
       "feature_preprocessor:__choice__": "RandomKitchenSinks",
       "imputer:numerical_strategy": "most_frequent",
       "lr_scheduler:__choice__": "ExponentialLR",
@@ -937,6 +943,7 @@
     "7": {
       "data_loader:batch_size": 119,
       "encoder:__choice__": "NoEncoder",
+      "coalescer:__choice__": "NoCoalescer",
       "feature_preprocessor:__choice__": "Nystroem",
       "imputer:numerical_strategy": "mean",
       "lr_scheduler:__choice__": "StepLR",
@@ -979,6 +986,7 @@
     "8": {
       "data_loader:batch_size": 130,
       "encoder:__choice__": "NoEncoder",
+      "coalescer:__choice__": "NoCoalescer",
       "feature_preprocessor:__choice__": "PolynomialFeatures",
       "imputer:numerical_strategy": "median",
       "lr_scheduler:__choice__": "CyclicLR",
@@ -1032,6 +1040,7 @@
     "9": {
       "data_loader:batch_size": 137,
       "encoder:__choice__": "NoEncoder",
+      "coalescer:__choice__": "NoCoalescer",
       "feature_preprocessor:__choice__": "Nystroem",
       "imputer:numerical_strategy": "mean",
       "lr_scheduler:__choice__": "CosineAnnealingLR",
@@ -1077,4 +1086,4 @@
       "network_head:fully_connected:units_layer_1": 294
     }
   }
-}
\ No newline at end of file
+}

From e107e930cf647e33772219601cc5e1174a9ddaf5 Mon Sep 17 00:00:00 2001
From: chico <francisco.rivera.valverde@gmail.com>
Date: Tue, 1 Jun 2021 14:51:08 +0200
Subject: [PATCH 5/9] Ignore as in preprocessing

---
 .../preprocessing/tabular_preprocessing/coalescer/__init__.py  | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
index 13fc53dd2..6455ee4d9 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
@@ -115,8 +115,7 @@ def get_hyperparameter_search_space(self,
             # Call arg is ignored on mypy as the search space dynamically
             # provides different args
             preprocessor_configuration_space = available_preprocessors[name].\
-                get_hyperparameter_search_space(dataset_properties,  # type:ignore[call-arg]
-                                                **updates)
+                get_hyperparameter_search_space(dataset_properties, **updates)  # type:ignore
             parent_hyperparameter = {'parent': preprocessor, 'value': name}
             cs.add_configuration_space(name, preprocessor_configuration_space,
                                        parent_hyperparameter=parent_hyperparameter)

From e40ed308eeadcf66da7dfac92aef06ac48ad2995 Mon Sep 17 00:00:00 2001
From: chico <francisco.rivera.valverde@gmail.com>
Date: Tue, 1 Jun 2021 14:57:54 +0200
Subject: [PATCH 6/9] call args still failing

---
 .../tabular_preprocessing/coalescer/__init__.py              | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
index 6455ee4d9..305fab99c 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
@@ -114,8 +114,9 @@ def get_hyperparameter_search_space(self,
             updates = self._get_search_space_updates(prefix=name)
             # Call arg is ignored on mypy as the search space dynamically
             # provides different args
-            preprocessor_configuration_space = available_preprocessors[name].\
-                get_hyperparameter_search_space(dataset_properties, **updates)  # type:ignore
+            preprocessor_configuration_space = available_preprocessors[       # type:ignore[call-arg]
+                name                                                          # type:ignore[call-arg]
+            ].get_hyperparameter_search_space(dataset_properties, **updates)  # type:ignore[call-arg]
             parent_hyperparameter = {'parent': preprocessor, 'value': name}
             cs.add_configuration_space(name, preprocessor_configuration_space,
                                        parent_hyperparameter=parent_hyperparameter)

From 3d25f37aed2e64066a47caff3e3f6d453ce8790f Mon Sep 17 00:00:00 2001
From: chico <francisco.rivera.valverde@gmail.com>
Date: Fri, 18 Jun 2021 16:17:00 +0200
Subject: [PATCH 7/9] FIX_refit

---
 autoPyTorch/api/base_task.py | 42 ++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
index 9e59601a8..6ff99bde3 100644
--- a/autoPyTorch/api/base_task.py
+++ b/autoPyTorch/api/base_task.py
@@ -1072,11 +1072,17 @@ def refit(
             self
         """
 
-        self.dataset_name = dataset.dataset_name
-
         if self._logger is None:
             self._logger = self._get_logger(str(self.dataset_name))
 
+        if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None:
+            self._load_models()
+
+        # Refit is not applicable when ensemble_size is set to zero.
+        if self.ensemble_ is None:
+            raise ValueError("Refit can only be called if 'ensemble_size != 0'")
+
+        self.dataset_name = dataset.dataset_name
         dataset_requirements = get_dataset_requirements(
             info=dataset.get_required_dataset_info(),
             include=self.include_components,
@@ -1085,26 +1091,20 @@ def refit(
         dataset_properties = dataset.get_dataset_properties(dataset_requirements)
         self._backend.save_datamanager(dataset)
 
-        X: Dict[str, Any] = dict({'dataset_properties': dataset_properties,
-                                  'backend': self._backend,
-                                  'X_train': dataset.train_tensors[0],
-                                  'y_train': dataset.train_tensors[1],
-                                  'X_test': dataset.test_tensors[0] if dataset.test_tensors is not None else None,
-                                  'y_test': dataset.test_tensors[1] if dataset.test_tensors is not None else None,
-                                  'train_indices': dataset.splits[split_id][0],
-                                  'val_indices': dataset.splits[split_id][1],
-                                  'split_id': split_id,
-                                  'num_run': self._backend.get_next_num_run(),
-                                  })
-        X.update({**self.pipeline_options, **budget_config})
-        if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None:
-            self._load_models()
-
-        # Refit is not applicable when ensemble_size is set to zero.
-        if self.ensemble_ is None:
-            raise ValueError("Refit can only be called if 'ensemble_size != 0'")
-
         for identifier in self.models_:
+            X: Dict[str, Any] = dict({'dataset_properties': dataset_properties,
+                                      'backend': self._backend,
+                                      'X_train': dataset.train_tensors[0].copy(),
+                                      'y_train': dataset.train_tensors[1].copy(),
+                                      'X_test': dataset.test_tensors[0] if dataset.test_tensors is not None else None,
+                                      'y_test': dataset.test_tensors[1] if dataset.test_tensors is not None else None,
+                                      'train_indices': dataset.splits[split_id][0],
+                                      'val_indices': dataset.splits[split_id][1],
+                                      'split_id': split_id,
+                                      'num_run': self._backend.get_next_num_run(),
+                                      })
+            X.update({**self.pipeline_options, **budget_config})
+
             model = self.models_[identifier]
             # this updates the model inplace, it can then later be used in
             # predict method

From 2e08e99b58ec6320f19a295dd02037f1e98491e6 Mon Sep 17 00:00:00 2001
From: Francisco Rivera Valverde
 <44504424+franchuterivera@users.noreply.github.com>
Date: Mon, 21 Jun 2021 21:58:03 +0200
Subject: [PATCH 8/9] Update
 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py

Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com>
---
 .../preprocessing/tabular_preprocessing/coalescer/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
index 305fab99c..469c753d3 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
@@ -97,8 +97,8 @@ def get_hyperparameter_search_space(self,
             if len(dataset_properties['categorical_columns']) == 0:
                 default = 'NoCoalescer'
                 if include is not None and default not in include:
-                    raise ValueError("Provided {} in include, however, the dataset "
-                                     "is incompatible with it".format(include))
+                    raise ValueError("Provided coalescer {} are incompatible with "
+                                     "the dataset without categorical columns.".format(include))
                 preprocessor = CSH.CategoricalHyperparameter('__choice__',
                                                              ['NoCoalescer'],
                                                              default_value=default)

From c8aa62632a76adf00d0677c0a250ca0b90c9ef9e Mon Sep 17 00:00:00 2001
From: chico <francisco.rivera.valverde@gmail.com>
Date: Mon, 21 Jun 2021 22:36:06 +0200
Subject: [PATCH 9/9] Feedback from Shuhei

---
 .../coalescer/MinorityCoalescer.py            |  2 +-
 .../coalescer/NoCoalescer.py                  | 16 +++---
 .../coalescer/__init__.py                     | 12 +++--
 .../coalescer/base_coalescer.py               |  8 ++-
 autoPyTorch/utils/implementations.py          | 53 +++++++++++++++----
 .../components/preprocessing/test_coalesce.py |  2 +-
 6 files changed, 68 insertions(+), 25 deletions(-)

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py
index a770a58a9..5d2e13bf3 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py
@@ -17,7 +17,7 @@ class MinorityCoalescer(BaseCoalescer):
     Groups together classes in a categorical feature if the frequency
     of occurrence is less than minimum_fraction
     """
-    def __init__(self, minimum_fraction: float, random_state: Optional[Union[np.random.RandomState, int]] = None):
+    def __init__(self, minimum_fraction: float, random_state: np.random.RandomState):
         super().__init__()
         self.minimum_fraction = minimum_fraction
         self.random_state = random_state
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py
index da93bbe4b..5d4448249 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py
@@ -10,18 +10,20 @@ class NoCoalescer(BaseCoalescer):
     Don't perform NoCoalescer on categorical features
     """
     def __init__(self,
-                 random_state: Optional[Union[np.random.RandomState, int]] = None
+                 random_state: np.random.RandomState,
                  ):
         super().__init__()
         self.random_state = random_state
 
-    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer:
+    def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseCoalescer:
         """
-        The fit function calls the fit function of the underlying model
-        and returns the transformed array.
+        As no coalescing happens, the input fit dictionary is unchanged.
+
         Args:
-            X (np.ndarray): input features
-            y (Optional[np.ndarray]): input labels
+            X (Dict[str, Any]):
+                input fit dictionary
+            y (Optional[Any]):
+                Parameter to comply with scikit-learn API. Not used.
 
         Returns:
             instance of self
@@ -32,7 +34,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer:
 
     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         """
-        Adds the self into the 'X' dictionary and returns it.
+        Add self into the 'X' dictionary and return the modified dict.
         Args:
             X (Dict[str, Any]): 'X' dictionary
 
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
index 305fab99c..66528eb67 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py
@@ -62,7 +62,9 @@ def get_hyperparameter_search_space(self,
                                                                 exclude=exclude)
 
         if len(available_preprocessors) == 0:
-            raise ValueError("no coalescer found, please add a coalescer")
+            raise ValueError("No coalescer found, please add a coalescer via the include "
+                             "argument of the pipeline. Additionally, coalescer as a step "
+                             "can be removed as a pipeline step. ")
 
         if default is None:
             defaults = ['NoCoalescer', 'MinorityCoalescer']
@@ -79,10 +81,10 @@ def get_hyperparameter_search_space(self,
         if '__choice__' in updates.keys():
             choice_hyperparameter = updates['__choice__']
             if not set(choice_hyperparameter.value_range).issubset(available_preprocessors):
-                raise ValueError("Expected given update for {} to have "
-                                 "choices in {} got {}".format(self.__class__.__name__,
-                                                               available_preprocessors,
-                                                               choice_hyperparameter.value_range))
+                raise ValueError("The update for {} was expected to be a subset of {} "
+                                 "but was {}".format(self.__class__.__name__,
+                                                     available_preprocessors,
+                                                     choice_hyperparameter.value_range))
             if len(dataset_properties['categorical_columns']) == 0:
                 assert len(choice_hyperparameter.value_range) == 1
                 assert 'MinorityCoalescer' in choice_hyperparameter.value_range, \
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py
index 280ddd979..7eae84025 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py
@@ -18,7 +18,11 @@ def __init__(self) -> None:
 
     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         """
-        Adds the self into the 'X' dictionary and returns it.
+        The input X is the fit dictionary, that contains both the train data as
+        well as fit directives. For example, it indicates whether or not to use the gpu
+        or perform a cpu only run.
+
+        This method adds self to the 'X' dictionary and returns it.
         Args:
             X (Dict[str, Any]): 'X' dictionary
 
@@ -26,7 +30,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
             (Dict[str, Any]): the updated 'X' dictionary
         """
         if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
-            raise ValueError("cant call transform on {} without fitting first."
+            raise ValueError("Cannot call transform() on {} without calling fit() first."
                              .format(self.__class__.__name__))
         X.update({'coalescer': self.preprocessor})
         return X
diff --git a/autoPyTorch/utils/implementations.py b/autoPyTorch/utils/implementations.py
index e78baa5e4..aabdee91e 100644
--- a/autoPyTorch/utils/implementations.py
+++ b/autoPyTorch/utils/implementations.py
@@ -68,19 +68,46 @@ def get_properties() -> Dict[str, Any]:
 
 
 class MinorityCoalescing(BaseEstimator, TransformerMixin):
-    """ Group together categories which occurence is less than a specified
+    """ Group together categories whose occurrence is less than a specified
     minimum fraction. Coalesced categories get index of one.
     """
 
     def __init__(self, minimum_fraction: Optional[float] = None):
         self.minimum_fraction = minimum_fraction
 
-    def check_X(self, X: np.array) -> None:
+    def check_X(self, X: Union[np.ndarray, sparse.csr_matrix]) -> None:
+        """
+        This estimator takes as input a set of features coming in a tabular fashion.
+        The classes in the columns, i.e. features, are coalesced together, if the ratio of
+        occurrence of each class is less than self.minimum_fraction.
+
+        Those classes with low occurrence frequency are coalesced into a new class: -2.
+        The classification pipeline shifts the classes to be positive through encoding.
+        The imputation tags missing elements as -1. Then the coalescer can use -2 as a
+        'coalesced' indicator.
+
+        Failure can only happen if the pipeline failed to fit encoding/imputation.
+
+        Args:
+            X (Union[np.ndarray, sparse.csr_matrix]):
+                The input features from the user, likely transformed by an encoder and imputer.
+        """
         X_data = X.data if sparse.issparse(X) else X
         if np.nanmin(X_data) <= -2:
-            raise ValueError("X needs to contain only integers greater than -2.")
-
-    def fit(self, X: np.array, y: Optional[np.ndarray] = None) -> 'MinorityCoalescing':
+            raise ValueError("The input features to the MinorityCoalescing "
+                             "need to contain only integers greater than -2.")
+
+    def fit(self, X: Union[np.ndarray, sparse.csr_matrix],
+            y: Optional[np.ndarray] = None) -> 'MinorityCoalescing':
+        """
+        Trains the estimator to identify low frequency classes on the input train data.
+
+        Args:
+            X (Union[np.ndarray, sparse.csr_matrix]):
+                The input features from the user, likely transformed by an encoder and imputer.
+            y (Optional[np.ndarray]):
+                Optional labels for the given task, not used by this estimator.
+        """
         self.check_X(X)
 
         if self.minimum_fraction is None:
@@ -106,10 +133,18 @@ def fit(self, X: np.array, y: Optional[np.ndarray] = None) -> 'MinorityCoalescin
                 if fraction >= self.minimum_fraction:
                     do_not_coalesce[-1].add(unique_value)
 
-        self.do_not_coalesce_ = do_not_coalesce
+        self._do_not_coalesce = do_not_coalesce
         return self
 
-    def transform(self, X: np.ndarray) -> np.ndarray:
+    def transform(self, X: Union[np.ndarray, sparse.csr_matrix]) -> Union[np.ndarray,
+                                                                          sparse.csr_matrix]:
+        """
+        Coalesces categories with low frequency on the input array X.
+
+        Args:
+            X (Union[np.ndarray, sparse.csr_matrix]):
+                The input features from the user, likely transformed by an encoder and imputer.
+        """
         self.check_X(X)
 
         if self.minimum_fraction is None:
@@ -121,7 +156,7 @@ def transform(self, X: np.ndarray) -> np.ndarray:
                 indptr_end = X.indptr[column + 1]
                 unique = np.unique(X.data[indptr_start:indptr_end])
                 for unique_value in unique:
-                    if unique_value not in self.do_not_coalesce_[column]:
+                    if unique_value not in self._do_not_coalesce[column]:
                         indptr_start = X.indptr[column]
                         indptr_end = X.indptr[column + 1]
                         X.data[indptr_start:indptr_end][
@@ -129,7 +164,7 @@ def transform(self, X: np.ndarray) -> np.ndarray:
             else:
                 unique = np.unique(X[:, column])
                 unique_values = [unique_value for unique_value in unique
-                                 if unique_value not in self.do_not_coalesce_[column]]
+                                 if unique_value not in self._do_not_coalesce[column]]
                 mask = np.isin(X[:, column], unique_values)
                 # The imputer uses -1 for unknown categories
                 # Then -2 means coalesced categories
diff --git a/test/test_pipeline/components/preprocessing/test_coalesce.py b/test/test_pipeline/components/preprocessing/test_coalesce.py
index 9aa73880f..812256073 100644
--- a/test/test_pipeline/components/preprocessing/test_coalesce.py
+++ b/test/test_pipeline/components/preprocessing/test_coalesce.py
@@ -24,7 +24,7 @@ def test_get_set_config_space(self):
         # Whereas just one iteration will make sure the algorithm works,
         # doing five iterations increase the confidence. We will be able to
         # catch component specific crashes
-        for i in range(5):
+        for _ in range(5):
             config = cs.sample_configuration()
             config_dict = copy.deepcopy(config.get_dictionary())
             coalescer_choice.set_hyperparameters(config)