Replace all occurrences of Pandas' get_dummies() with sklearn OneHotEncoder (#1135)

* For consistency and to avoid future issues, replace all occurrences of Pandas' get_dummies with sklearn's OneHotEncoder. Encoder lifespan: encoders are reused across estimate_effect() calls, and existing encoders are replaced when CausalEstimator.fit() is called. Uses of get_dummies in the do-sampler and propensity-score utilities, which had no side effects or encoding-consistency issues, are also replaced for consistency.

Signed-off-by: DAVID RAWLINSON <[email protected]>

* Add categorical encoding consistency tests for CausalEstimators. Fix bug in arg order for RegressionEstimator._do().

Signed-off-by: DAVID RAWLINSON <[email protected]>

---------

Signed-off-by: DAVID RAWLINSON <[email protected]>
Co-authored-by: DAVID RAWLINSON <[email protected]>
drawlinson and DAVID RAWLINSON authored Mar 26, 2024
1 parent dfbbbca commit 65f3031
Showing 14 changed files with 409 additions and 113 deletions.
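
As a hypothetical illustration of the consistency problem the commit message describes (not part of the diff): pd.get_dummies derives its output columns only from the values present in the frame it is handed, whereas a scikit-learn OneHotEncoder fitted once keeps the same column mapping for every later transform.

# Illustration only; names below are not from the DoWhy codebase.
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

train = pd.DataFrame({"x": ["a", "b", "c"]})
new = pd.DataFrame({"x": ["a", "a"]})  # categories "b" and "c" absent

# get_dummies: the column set depends on the values present in *this* frame.
print(pd.get_dummies(train, drop_first=True).columns.tolist())  # ['x_b', 'x_c']
print(pd.get_dummies(new, drop_first=True).columns.tolist())    # []

# OneHotEncoder: fitted once, the column mapping is fixed for later transforms.
# (sparse_output requires scikit-learn >= 1.2; older versions use `sparse`.)
encoder = OneHotEncoder(drop="first", sparse_output=False).fit(train)
print(encoder.get_feature_names_out(["x"]).tolist())  # ['x_b', 'x_c']
print(encoder.transform(new).shape)                   # (2, 2)
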
37 changes: 35 additions & 2 deletions dowhy/causal_estimator.py
@@ -11,6 +11,7 @@
import dowhy.interpreters as interpreters
from dowhy.causal_identifier.identified_estimand import IdentifiedEstimand
from dowhy.utils.api import parse_state
from dowhy.utils.encoding import Encoders

logger = logging.getLogger(__name__)

@@ -112,6 +113,35 @@ def __init__(
self._bootstrap_estimates = None
self._bootstrap_null_estimates = None

self._encoders = Encoders()

def reset_encoders(self):
"""
Removes any reference to data encoders, causing them to be re-created on next `fit()`.
It's important that data is consistently encoded otherwise models will produce inconsistent output.
In particular, categorical variables are one-hot encoded; the mapping of original data values
must be identical between model training/fitting and inference time.
Encoders are reset when `fit()` is called again, as the data is assumed to have changed.
A separate encoder is used for each subset of variables (treatment, common causes and effect modifiers).
"""
self._encoders.reset()

def _encode(self, data: pd.DataFrame, encoder_name: str):
"""
Encodes categorical columns in the given data, returning a new dataframe containing
all original data and the encoded columns. Numerical data is unchanged, categorical
types are one-hot encoded. `encoder_name` identifies a specific encoder to be used
if available, or created if not. The encoder can be reused in subsequent calls.
:param data: Data to encode.
:param encoder_name: The name for the encoder to be used.
:returns: The encoded data.
"""
return self._encoders.encode(data, encoder_name)

def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optional[List[str]] = None):
"""Sets the effect modifiers for the estimator
Modifies need_conditional_estimates accordingly to effect modifiers value
@@ -124,7 +154,7 @@ def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optio
self._effect_modifier_names = [cname for cname in effect_modifier_names if cname in data.columns]
if len(self._effect_modifier_names) > 0:
self._effect_modifiers = data[self._effect_modifier_names]
self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True)
self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
self.logger.debug("Effect modifiers: " + ",".join(self._effect_modifier_names))
else:
self._effect_modifier_names = []
@@ -234,7 +264,10 @@ def _estimate_conditional_effects(
effect_modifier_names[i] = prefix + str(em)
# Grouping by effect modifiers and computing effect separately
by_effect_mods = data.groupby(effect_modifier_names)
cond_est_fn = lambda x: self._do(self._treatment_value, x) - self._do(self._control_value, x)

def cond_est_fn(x):
return self._do(self._treatment_value, x) - self._do(self._control_value, x)

conditional_estimates = by_effect_mods.apply(estimate_effect_fn)
# Deleting the temporary categorical columns
for em in effect_modifier_names:
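
For readers without the rest of the diff: the Encoders helper imported above is defined in dowhy/utils/encoding.py, which is not shown in this excerpt. The sketch below is a hypothetical reconstruction of such a registry of named, reusable one-hot encoders, modelled on the per-name dict pattern and one_hot_encode helper that the removed RegressionEstimator code (further down) used; the real class may differ.

# Sketch only -- the actual dowhy.utils.encoding.Encoders may differ.
# Assumes scikit-learn >= 1.2 for the sparse_output argument.
import pandas as pd
from sklearn.preprocessing import OneHotEncoder


def _one_hot_encode(data: pd.DataFrame, drop_first: bool = True, encoder=None):
    """Encode categorical columns, reusing `encoder` if given so the column
    mapping is identical between fit time and inference time."""
    categorical = data.select_dtypes(include=["object", "category"]).columns.tolist()
    if not categorical:
        return data, encoder
    if encoder is None:
        encoder = OneHotEncoder(drop="first" if drop_first else None, sparse_output=False)
        encoder.fit(data[categorical])
    encoded = pd.DataFrame(
        encoder.transform(data[categorical]),
        columns=encoder.get_feature_names_out(categorical),
        index=data.index,
    )
    return pd.concat([data.drop(columns=categorical), encoded], axis=1), encoder


class EncodersSketch:
    """One encoder per variable group (treatment, observed_common_causes,
    effect_modifiers, ...), created lazily and reused until reset()."""

    def __init__(self):
        self._encoders = {}

    def reset(self):
        self._encoders = {}

    def encode(self, data: pd.DataFrame, encoder_name: str) -> pd.DataFrame:
        encoded, encoder = _one_hot_encode(data, encoder=self._encoders.get(encoder_name))
        self._encoders[encoder_name] = encoder
        return encoded
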
5 changes: 3 additions & 2 deletions dowhy/causal_estimators/causalml.py
@@ -116,6 +116,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

# Check the backdoor variables being used
@@ -127,7 +128,7 @@
# Get the data of the unobserved confounders
self._observed_common_causes = data[self._observed_common_causes_names]
# One hot encode the data if they are categorical
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
else:
self._observed_common_causes = []

@@ -138,7 +139,7 @@
self._instrumental_variable_names = self._target_estimand.instrumental_variables
if self._instrumental_variable_names:
self._instrumental_variables = data[self._instrumental_variable_names]
self._instrumental_variables = pd.get_dummies(self._instrumental_variables, drop_first=True)
self._instrumental_variables = self._encode(self._instrumental_variables, "instrumental_variables")
else:
self._instrumental_variables = []

3 changes: 2 additions & 1 deletion dowhy/causal_estimators/distance_matching_estimator.py
@@ -122,6 +122,7 @@ def fit(
"""
self.exact_match_cols = exact_match_cols

self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

# Check if the treatment is one-dimensional
@@ -146,7 +147,7 @@
# Convert the categorical variables into dummy/indicator variables
# Basically, this gives a one hot encoding for each category
# The first category is taken to be the base line.
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
else:
self._observed_common_causes = None
error_msg = "No common causes/confounders present. Distance matching methods are not applicable"
9 changes: 5 additions & 4 deletions dowhy/causal_estimators/econml.py
@@ -120,6 +120,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)
# Save parameters for later refutter fitting
self._econml_fit_params = kwargs
@@ -148,12 +149,12 @@
# Also only update self._effect_modifiers, and create a copy of self._effect_modifier_names
# the latter can be used by other estimator methods later
self._effect_modifiers = data[effect_modifier_names]
self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True)
self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
self._effect_modifier_names = effect_modifier_names
self.logger.debug("Effect modifiers: " + ",".join(effect_modifier_names))
if self._observed_common_causes_names:
self._observed_common_causes = data[self._observed_common_causes_names]
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
else:
self._observed_common_causes = None
self.logger.debug("Back-door variables used:" + ",".join(self._observed_common_causes_names))
@@ -165,7 +166,7 @@
self.estimating_instrument_names = parse_state(self.iv_instrument_name)
if self.estimating_instrument_names:
self._estimating_instruments = data[self.estimating_instrument_names]
self._estimating_instruments = pd.get_dummies(self._estimating_instruments, drop_first=True)
self._estimating_instruments = self._encode(self._estimating_instruments, "estimating_instruments")
else:
self._estimating_instruments = None

@@ -277,7 +278,7 @@ def _estimate_confidence_intervals(self, confidence_level=None, method=None):
"""Returns None if the confidence interval has not been calculated."""
return self.effect_intervals

def _do(self, x):
def _do(self, x, data_df=None):
raise NotImplementedError

def construct_symbolic_estimator(self, estimand):
1 change: 1 addition & 0 deletions dowhy/causal_estimators/instrumental_variable_estimator.py
@@ -92,6 +92,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

self.estimating_instrument_names = self._target_estimand.instrumental_variables
4 changes: 3 additions & 1 deletion dowhy/causal_estimators/propensity_score_estimator.py
@@ -93,6 +93,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

self.logger.debug("Back-door variables used:" + ",".join(self._target_estimand.get_backdoor_variables()))
@@ -103,7 +104,8 @@
# Convert the categorical variables into dummy/indicator variables
# Basically, this gives a one hot encoding for each category
# The first category is taken to be the base line.
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")

else:
self._observed_common_causes = None
error_msg = "No common causes/confounders present. Propensity score based methods are not applicable"
@@ -98,6 +98,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

self.rd_variable = data[self.rd_variable_name]
81 changes: 6 additions & 75 deletions dowhy/causal_estimators/regression_estimator.py
@@ -5,7 +5,6 @@
import statsmodels.api as sm

from dowhy.causal_estimator import CausalEstimate, CausalEstimator, IdentifiedEstimand
from dowhy.utils.encoding import one_hot_encode


class RegressionEstimator(CausalEstimator):
@@ -71,53 +70,6 @@ def __init__(

self.model = None

# Data encoders
# encoder_drop_first will not encode the first category value with a bit in 1-hot encoding.
# It will be implicit instead, by the absence of any bit representing this value in the relevant columns.
# Set to False to include a bit for each value of every categorical variable.
self.encoder_drop_first = True
self.reset_encoders()

def reset_encoders(self):
"""
Removes any reference to data encoders, causing them to be re-created on next `fit()`.
It's important that data is consistently encoded otherwise models will produce inconsistent output.
In particular, categorical variables are one-hot encoded; the mapping of original data values
must be identical between model training/fitting and inference time.
Encoders are reset when `fit()` is called again, as the data is assumed to have changed.
A separate encoder is used for each subset of variables (treatment, common causes and effect modifiers).
"""
self._encoders = {
"treatment": None,
"observed_common_causes": None,
"effect_modifiers": None,
}

def _encode(self, data: pd.DataFrame, encoder_name: str):
"""
Encodes categorical columns in the given data, returning a new dataframe containing
all original data and the encoded columns. Numerical data is unchanged, categorical
types are one-hot encoded. `encoder_name` identifies a specific encoder to be used
if available, or created if not. The encoder can be reused in subsequent calls.
:param data: Data to encode.
:param encoder_name: The name for the encoder to be used.
:returns: The encoded data.
"""
existing_encoder = self._encoders.get(encoder_name)
encoded_variables, encoder = one_hot_encode(
data,
drop_first=self.encoder_drop_first,
encoder=existing_encoder,
)

# Remember encoder
self._encoders[encoder_name] = encoder
return encoded_variables

def fit(
self,
data: pd.DataFrame,
@@ -170,7 +122,7 @@ def estimate_effect(
need_conditional_estimates = self.need_conditional_estimates
# TODO make treatment_value and control value also as local parameters
# All treatments are set to the same constant value
effect_estimate = self._do(data, treatment_value) - self._do(data, control_value)
effect_estimate = self._do(treatment_value, data) - self._do(control_value, data)
conditional_effect_estimates = None
if need_conditional_estimates:
conditional_effect_estimates = self._estimate_conditional_effects(
@@ -197,31 +149,6 @@ def _estimate_effect_fn(self, data_df):
est = self.estimate_effect(data=data_df, need_conditional_estimates=False)
return est.value

def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optional[List[str]] = None):
"""Sets the effect modifiers for the estimator
Modifies need_conditional_estimates accordingly to effect modifiers value
:param effect_modifiers: Variables on which to compute separate
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self._effect_modifiers = effect_modifier_names
if effect_modifier_names is not None:
self._effect_modifier_names = [cname for cname in effect_modifier_names if cname in data.columns]
if len(self._effect_modifier_names) > 0:
self._effect_modifiers = data[self._effect_modifier_names]
self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
self.logger.debug("Effect modifiers: " + ",".join(self._effect_modifier_names))
else:
self._effect_modifier_names = []
else:
self._effect_modifier_names = []

self.need_conditional_estimates = (
self.need_conditional_estimates
if self.need_conditional_estimates != "auto"
else (self._effect_modifier_names and len(self._effect_modifier_names) > 0)
)

def _build_features(self, data_df: pd.DataFrame, treatment_values=None):
treatment_vals = self._encode(data_df[self._target_estimand.treatment_variable], "treatment")

@@ -295,6 +222,10 @@ def predict(self, data_df):
interventional_outcomes = self.predict_fn(data_df, self.model, new_features)
return interventional_outcomes

def _do(self, data_df: pd.DataFrame, treatment_val):
def _do(
self,
treatment_val,
data_df: pd.DataFrame,
):
interventional_outcomes = self.interventional_outcomes(data_df, treatment_val)
return interventional_outcomes.mean()
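
As a hypothetical note on the argument-order fix mentioned in the commit message (not part of the diff): callers now pass the treatment value first — cond_est_fn in CausalEstimator and estimate_effect above both call self._do(treatment_value, data) — so RegressionEstimator._do takes (treatment_val, data_df). A minimal stub of that contract:

# Illustration only; this class is not from the DoWhy codebase.
import pandas as pd


class _DoContractSketch:
    _treatment_value = 1
    _control_value = 0

    def _do(self, treatment_val, data_df: pd.DataFrame):
        # Stand-in for "mean interventional outcome under treatment_val".
        return float(treatment_val) * len(data_df)

    def effect(self, data_df: pd.DataFrame):
        # Mirrors the fixed call sites: treatment value first, then data.
        return self._do(self._treatment_value, data_df) - self._do(self._control_value, data_df)


print(_DoContractSketch().effect(pd.DataFrame({"x": [1, 2, 3]})))  # 3.0
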
7 changes: 5 additions & 2 deletions dowhy/causal_estimators/two_stage_regression_estimator.py
@@ -167,6 +167,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

if len(self._target_estimand.treatment_variable) > 1:
@@ -315,10 +316,12 @@ def build_first_stage_features(self, data_df: pd.DataFrame):
treatment_vals = data_df[self._target_estimand.treatment_variable]
if len(self._observed_common_causes_names) > 0:
observed_common_causes_vals = data_df[self._observed_common_causes_names]
observed_common_causes_vals = pd.get_dummies(observed_common_causes_vals, drop_first=True)
observed_common_causes_vals = self._encode(observed_common_causes_vals, "observed_common_causes")

if self._effect_modifier_names:
effect_modifiers_vals = data_df[self._effect_modifier_names]
effect_modifiers_vals = pd.get_dummies(effect_modifiers_vals, drop_first=True)
effect_modifiers_vals = self._encode(effect_modifiers_vals, "effect_modifiers")

if type(treatment_vals) is not np.ndarray:
treatment_vals = treatment_vals.to_numpy()
if treatment_vals.shape[0] != data_df.shape[0]: