From 65f3031bb103d7219fee4c62cbfc8d2747d08d9c Mon Sep 17 00:00:00 2001
From: drawlinson
Date: Wed, 27 Mar 2024 03:33:17 +1100
Subject: [PATCH] Replace all occurrences of Pandas' get_dummies() with
 sklearn OneHotEncoder (#1135)

* For consistency and to avoid future issues, replace all occurrences of
  Pandas' get_dummies with sklearn's OneHotEncoder. Encoder lifespan:
  encoders are reused across estimate_effect() calls and replaced on
  CausalEstimator.fit(). Additional uses of get_dummies in the do-sampler
  propensity score utilities, which have no side effects or
  encoding-consistency issues, are also replaced for consistency.

Signed-off-by: DAVID RAWLINSON

* Add categorical encoding consistency tests for CausalEstimators. Fix bug
  in arg order for RegressionEstimator._do().

Signed-off-by: DAVID RAWLINSON

---------

Signed-off-by: DAVID RAWLINSON
Co-authored-by: DAVID RAWLINSON
---
 dowhy/causal_estimator.py                      | 37 +++-
 dowhy/causal_estimators/causalml.py            |  5 +-
 .../distance_matching_estimator.py             |  3 +-
 dowhy/causal_estimators/econml.py              |  9 +-
 .../instrumental_variable_estimator.py         |  1 +
 .../propensity_score_estimator.py              |  4 +-
 .../regression_discontinuity_estimator.py      |  1 +
 .../causal_estimators/regression_estimator.py  | 81 +------
 .../two_stage_regression_estimator.py          |  7 +-
 .../add_unobserved_common_cause.py             | 66 +++---
 dowhy/utils/encoding.py                        | 50 +++++
 dowhy/utils/propensity_score.py                | 11 +-
 tests/causal_estimators/base.py                | 198 ++++++++++++++++++
 .../test_estimator_consistency.py              | 49 +++++
 14 files changed, 409 insertions(+), 113 deletions(-)
 create mode 100755 tests/causal_estimators/test_estimator_consistency.py

diff --git a/dowhy/causal_estimator.py b/dowhy/causal_estimator.py
index 061361720d..fccca7c418 100755
--- a/dowhy/causal_estimator.py
+++ b/dowhy/causal_estimator.py
@@ -11,6 +11,7 @@
 import dowhy.interpreters as interpreters
 from dowhy.causal_identifier.identified_estimand import IdentifiedEstimand
 from dowhy.utils.api import parse_state
+from dowhy.utils.encoding import Encoders

 logger = logging.getLogger(__name__)

@@ -112,6 +113,35 @@ def __init__(
         self._bootstrap_estimates = None
         self._bootstrap_null_estimates = None

+        self._encoders = Encoders()
+
+    def reset_encoders(self):
+        """
+        Removes any reference to data encoders, causing them to be re-created on next `fit()`.
+
+        It's important that data is consistently encoded, otherwise models will produce inconsistent output.
+        In particular, categorical variables are one-hot encoded; the mapping of original data values
+        must be identical between model training/fitting and inference time.
+
+        Encoders are reset when `fit()` is called again, as the data is assumed to have changed.
+
+        A separate encoder is used for each subset of variables (treatment, common causes and effect modifiers).
+        """
+        self._encoders.reset()
+
+    def _encode(self, data: pd.DataFrame, encoder_name: str):
+        """
+        Encodes categorical columns in the given data, returning a new dataframe containing
+        all original data and the encoded columns. Numerical data is unchanged; categorical
+        types are one-hot encoded. `encoder_name` identifies a specific encoder to be used
+        if available, or created if not. The encoder can be reused in subsequent calls.
+
+        :param data: Data to encode.
+        :param encoder_name: The name for the encoder to be used.
+        :returns: The encoded data.
+ """ + return self._encoders.encode(data, encoder_name) + def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optional[List[str]] = None): """Sets the effect modifiers for the estimator Modifies need_conditional_estimates accordingly to effect modifiers value @@ -124,7 +154,7 @@ def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optio self._effect_modifier_names = [cname for cname in effect_modifier_names if cname in data.columns] if len(self._effect_modifier_names) > 0: self._effect_modifiers = data[self._effect_modifier_names] - self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True) + self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers") self.logger.debug("Effect modifiers: " + ",".join(self._effect_modifier_names)) else: self._effect_modifier_names = [] @@ -234,7 +264,10 @@ def _estimate_conditional_effects( effect_modifier_names[i] = prefix + str(em) # Grouping by effect modifiers and computing effect separately by_effect_mods = data.groupby(effect_modifier_names) - cond_est_fn = lambda x: self._do(self._treatment_value, x) - self._do(self._control_value, x) + + def cond_est_fn(x): + return self._do(self._treatment_value, x) - self._do(self._control_value, x) + conditional_estimates = by_effect_mods.apply(estimate_effect_fn) # Deleting the temporary categorical columns for em in effect_modifier_names: diff --git a/dowhy/causal_estimators/causalml.py b/dowhy/causal_estimators/causalml.py index 49e9ea0f20..d44f0d9ac8 100644 --- a/dowhy/causal_estimators/causalml.py +++ b/dowhy/causal_estimators/causalml.py @@ -116,6 +116,7 @@ def fit( effects, or return a heterogeneous effect function. Not all methods support this currently. """ + self.reset_encoders() # Forget any existing encoders self._set_effect_modifiers(data, effect_modifier_names) # Check the backdoor variables being used @@ -127,7 +128,7 @@ def fit( # Get the data of the unobserved confounders self._observed_common_causes = data[self._observed_common_causes_names] # One hot encode the data if they are categorical - self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True) + self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes") else: self._observed_common_causes = [] @@ -138,7 +139,7 @@ def fit( self._instrumental_variable_names = self._target_estimand.instrumental_variables if self._instrumental_variable_names: self._instrumental_variables = data[self._instrumental_variable_names] - self._instrumental_variables = pd.get_dummies(self._instrumental_variables, drop_first=True) + self._instrumental_variables = self._encode(self._instrumental_variables, "instrumental_variables") else: self._instrumental_variables = [] diff --git a/dowhy/causal_estimators/distance_matching_estimator.py b/dowhy/causal_estimators/distance_matching_estimator.py index e63222404a..3f44593314 100644 --- a/dowhy/causal_estimators/distance_matching_estimator.py +++ b/dowhy/causal_estimators/distance_matching_estimator.py @@ -122,6 +122,7 @@ def fit( """ self.exact_match_cols = exact_match_cols + self.reset_encoders() # Forget any existing encoders self._set_effect_modifiers(data, effect_modifier_names) # Check if the treatment is one-dimensional @@ -146,7 +147,7 @@ def fit( # Convert the categorical variables into dummy/indicator variables # Basically, this gives a one hot encoding for each category # The first category is taken to be the base line. 
-            self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
+            self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
         else:
             self._observed_common_causes = None
             error_msg = "No common causes/confounders present. Distance matching methods are not applicable"
diff --git a/dowhy/causal_estimators/econml.py b/dowhy/causal_estimators/econml.py
index 819d3211cb..33cd4b0b82 100755
--- a/dowhy/causal_estimators/econml.py
+++ b/dowhy/causal_estimators/econml.py
@@ -120,6 +120,7 @@ def fit(
             effects, or return a heterogeneous effect function. Not all
             methods support this currently.
         """
+        self.reset_encoders()  # Forget any existing encoders
         self._set_effect_modifiers(data, effect_modifier_names)
         # Save parameters for later refutter fitting
         self._econml_fit_params = kwargs
@@ -148,12 +149,12 @@
             # Also only update self._effect_modifiers, and create a copy of self._effect_modifier_names
             # the latter can be used by other estimator methods later
             self._effect_modifiers = data[effect_modifier_names]
-            self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True)
+            self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
             self._effect_modifier_names = effect_modifier_names
         self.logger.debug("Effect modifiers: " + ",".join(effect_modifier_names))
         if self._observed_common_causes_names:
             self._observed_common_causes = data[self._observed_common_causes_names]
-            self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
+            self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
         else:
             self._observed_common_causes = None
         self.logger.debug("Back-door variables used:" + ",".join(self._observed_common_causes_names))
@@ -165,7 +166,7 @@
             self.estimating_instrument_names = parse_state(self.iv_instrument_name)
         if self.estimating_instrument_names:
             self._estimating_instruments = data[self.estimating_instrument_names]
-            self._estimating_instruments = pd.get_dummies(self._estimating_instruments, drop_first=True)
+            self._estimating_instruments = self._encode(self._estimating_instruments, "estimating_instruments")
         else:
             self._estimating_instruments = None
@@ -277,7 +278,7 @@ def _estimate_confidence_intervals(self, confidence_level=None, method=None):
         """Returns None if the confidence interval has not been calculated."""
         return self.effect_intervals

-    def _do(self, x):
+    def _do(self, x, data_df=None):
         raise NotImplementedError

     def construct_symbolic_estimator(self, estimand):
diff --git a/dowhy/causal_estimators/instrumental_variable_estimator.py b/dowhy/causal_estimators/instrumental_variable_estimator.py
index 5cc34f06fc..6570f0a540 100755
--- a/dowhy/causal_estimators/instrumental_variable_estimator.py
+++ b/dowhy/causal_estimators/instrumental_variable_estimator.py
@@ -92,6 +92,7 @@ def fit(
             effects, or return a heterogeneous effect function. Not all
             methods support this currently.
""" + self.reset_encoders() # Forget any existing encoders self._set_effect_modifiers(data, effect_modifier_names) self.estimating_instrument_names = self._target_estimand.instrumental_variables diff --git a/dowhy/causal_estimators/propensity_score_estimator.py b/dowhy/causal_estimators/propensity_score_estimator.py index a6a2f2df39..0ffd7ae41c 100644 --- a/dowhy/causal_estimators/propensity_score_estimator.py +++ b/dowhy/causal_estimators/propensity_score_estimator.py @@ -93,6 +93,7 @@ def fit( effects, or return a heterogeneous effect function. Not all methods support this currently. """ + self.reset_encoders() # Forget any existing encoders self._set_effect_modifiers(data, effect_modifier_names) self.logger.debug("Back-door variables used:" + ",".join(self._target_estimand.get_backdoor_variables())) @@ -103,7 +104,8 @@ def fit( # Convert the categorical variables into dummy/indicator variables # Basically, this gives a one hot encoding for each category # The first category is taken to be the base line. - self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True) + self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes") + else: self._observed_common_causes = None error_msg = "No common causes/confounders present. Propensity score based methods are not applicable" diff --git a/dowhy/causal_estimators/regression_discontinuity_estimator.py b/dowhy/causal_estimators/regression_discontinuity_estimator.py index 47c1211711..daed8a8492 100755 --- a/dowhy/causal_estimators/regression_discontinuity_estimator.py +++ b/dowhy/causal_estimators/regression_discontinuity_estimator.py @@ -98,6 +98,7 @@ def fit( effects, or return a heterogeneous effect function. Not all methods support this currently. """ + self.reset_encoders() # Forget any existing encoders self._set_effect_modifiers(data, effect_modifier_names) self.rd_variable = data[self.rd_variable_name] diff --git a/dowhy/causal_estimators/regression_estimator.py b/dowhy/causal_estimators/regression_estimator.py index f7c86fb357..a0457b1f47 100644 --- a/dowhy/causal_estimators/regression_estimator.py +++ b/dowhy/causal_estimators/regression_estimator.py @@ -5,7 +5,6 @@ import statsmodels.api as sm from dowhy.causal_estimator import CausalEstimate, CausalEstimator, IdentifiedEstimand -from dowhy.utils.encoding import one_hot_encode class RegressionEstimator(CausalEstimator): @@ -71,53 +70,6 @@ def __init__( self.model = None - # Data encoders - # encoder_drop_first will not encode the first category value with a bit in 1-hot encoding. - # It will be implicit instead, by the absence of any bit representing this value in the relevant columns. - # Set to False to include a bit for each value of every categorical variable. - self.encoder_drop_first = True - self.reset_encoders() - - def reset_encoders(self): - """ - Removes any reference to data encoders, causing them to be re-created on next `fit()`. - - It's important that data is consistently encoded otherwise models will produce inconsistent output. - In particular, categorical variables are one-hot encoded; the mapping of original data values - must be identical between model training/fitting and inference time. - - Encoders are reset when `fit()` is called again, as the data is assumed to have changed. - - A separate encoder is used for each subset of variables (treatment, common causes and effect modifiers). 
- """ - self._encoders = { - "treatment": None, - "observed_common_causes": None, - "effect_modifiers": None, - } - - def _encode(self, data: pd.DataFrame, encoder_name: str): - """ - Encodes categorical columns in the given data, returning a new dataframe containing - all original data and the encoded columns. Numerical data is unchanged, categorical - types are one-hot encoded. `encoder_name` identifies a specific encoder to be used - if available, or created if not. The encoder can be reused in subsequent calls. - - :param data: Data to encode. - :param encoder_name: The name for the encoder to be used. - :returns: The encoded data. - """ - existing_encoder = self._encoders.get(encoder_name) - encoded_variables, encoder = one_hot_encode( - data, - drop_first=self.encoder_drop_first, - encoder=existing_encoder, - ) - - # Remember encoder - self._encoders[encoder_name] = encoder - return encoded_variables - def fit( self, data: pd.DataFrame, @@ -170,7 +122,7 @@ def estimate_effect( need_conditional_estimates = self.need_conditional_estimates # TODO make treatment_value and control value also as local parameters # All treatments are set to the same constant value - effect_estimate = self._do(data, treatment_value) - self._do(data, control_value) + effect_estimate = self._do(treatment_value, data) - self._do(control_value, data) conditional_effect_estimates = None if need_conditional_estimates: conditional_effect_estimates = self._estimate_conditional_effects( @@ -197,31 +149,6 @@ def _estimate_effect_fn(self, data_df): est = self.estimate_effect(data=data_df, need_conditional_estimates=False) return est.value - def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optional[List[str]] = None): - """Sets the effect modifiers for the estimator - Modifies need_conditional_estimates accordingly to effect modifiers value - :param effect_modifiers: Variables on which to compute separate - effects, or return a heterogeneous effect function. Not all - methods support this currently. 
- """ - self._effect_modifiers = effect_modifier_names - if effect_modifier_names is not None: - self._effect_modifier_names = [cname for cname in effect_modifier_names if cname in data.columns] - if len(self._effect_modifier_names) > 0: - self._effect_modifiers = data[self._effect_modifier_names] - self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers") - self.logger.debug("Effect modifiers: " + ",".join(self._effect_modifier_names)) - else: - self._effect_modifier_names = [] - else: - self._effect_modifier_names = [] - - self.need_conditional_estimates = ( - self.need_conditional_estimates - if self.need_conditional_estimates != "auto" - else (self._effect_modifier_names and len(self._effect_modifier_names) > 0) - ) - def _build_features(self, data_df: pd.DataFrame, treatment_values=None): treatment_vals = self._encode(data_df[self._target_estimand.treatment_variable], "treatment") @@ -295,6 +222,10 @@ def predict(self, data_df): interventional_outcomes = self.predict_fn(data_df, self.model, new_features) return interventional_outcomes - def _do(self, data_df: pd.DataFrame, treatment_val): + def _do( + self, + treatment_val, + data_df: pd.DataFrame, + ): interventional_outcomes = self.interventional_outcomes(data_df, treatment_val) return interventional_outcomes.mean() diff --git a/dowhy/causal_estimators/two_stage_regression_estimator.py b/dowhy/causal_estimators/two_stage_regression_estimator.py index ebb01558ed..659ba7f955 100644 --- a/dowhy/causal_estimators/two_stage_regression_estimator.py +++ b/dowhy/causal_estimators/two_stage_regression_estimator.py @@ -167,6 +167,7 @@ def fit( effects, or return a heterogeneous effect function. Not all methods support this currently. """ + self.reset_encoders() # Forget any existing encoders self._set_effect_modifiers(data, effect_modifier_names) if len(self._target_estimand.treatment_variable) > 1: @@ -315,10 +316,12 @@ def build_first_stage_features(self, data_df: pd.DataFrame): treatment_vals = data_df[self._target_estimand.treatment_variable] if len(self._observed_common_causes_names) > 0: observed_common_causes_vals = data_df[self._observed_common_causes_names] - observed_common_causes_vals = pd.get_dummies(observed_common_causes_vals, drop_first=True) + observed_common_causes_vals = self._encode(observed_common_causes_vals, "observed_common_causes") + if self._effect_modifier_names: effect_modifiers_vals = data_df[self._effect_modifier_names] - effect_modifiers_vals = pd.get_dummies(effect_modifiers_vals, drop_first=True) + effect_modifiers_vals = self._encode(effect_modifiers_vals, "effect_modifiers") + if type(treatment_vals) is not np.ndarray: treatment_vals = treatment_vals.to_numpy() if treatment_vals.shape[0] != data_df.shape[0]: diff --git a/dowhy/causal_refuters/add_unobserved_common_cause.py b/dowhy/causal_refuters/add_unobserved_common_cause.py index 333dc72060..80f0ef18f2 100755 --- a/dowhy/causal_refuters/add_unobserved_common_cause.py +++ b/dowhy/causal_refuters/add_unobserved_common_cause.py @@ -19,6 +19,7 @@ from dowhy.causal_refuter import CausalRefutation, CausalRefuter, choose_variables from dowhy.causal_refuters.evalue_sensitivity_analyzer import EValueSensitivityAnalyzer from dowhy.causal_refuters.linear_sensitivity_analyzer import LinearSensitivityAnalyzer +from dowhy.utils.encoding import Encoders logger = logging.getLogger(__name__) @@ -201,6 +202,41 @@ def include_simulated_confounder( ) +def preprocess_observed_common_causes( + data: pd.DataFrame, + target_estimand: IdentifiedEstimand, + 
+    no_common_causes_error_message: str,
+):
+    """
+    Preprocesses backdoor variables (observed common causes) and returns the pre-processed matrix.
+
+    At least one backdoor (common cause) variable is required. Raises an exception if none present.
+
+    Preprocessing has two steps:
+    1. Categorical encoding.
+    2. Standardization.
+
+    :param data: All data, some of which needs preprocessing.
+    :param target_estimand: Estimand for desired effect including definition of backdoor variables.
+    :param no_common_causes_error_message: Message to be displayed with ValueError if no backdoor variable present.
+    :return: DataFrame containing pre-processed data.
+    """
+
+    # 1. Categorical encoding of relevant variables
+    observed_common_causes_names = target_estimand.get_backdoor_variables()
+    if len(observed_common_causes_names) > 0:
+        # The encoded data is only used to calculate a parameter, so the encoder can be discarded.
+        observed_common_causes = data[observed_common_causes_names]
+        encoders = Encoders()
+        observed_common_causes = encoders.encode(observed_common_causes, "observed_common_causes")
+    else:
+        raise ValueError(no_common_causes_error_message)
+
+    # 2. Standardizing the data
+    observed_common_causes = StandardScaler().fit_transform(observed_common_causes)
+    return observed_common_causes
+
+
 def _infer_default_kappa_t(
     data: pd.DataFrame,
     target_estimand: IdentifiedEstimand,
@@ -210,19 +246,10 @@
     len_kappa_t: int = 10,
 ):
     """Infer default effect strength of simulated confounder on treatment."""
-    observed_common_causes_names = target_estimand.get_backdoor_variables()
-    if len(observed_common_causes_names) > 0:
-        observed_common_causes = data[observed_common_causes_names]
-        observed_common_causes = pd.get_dummies(observed_common_causes, drop_first=True)
-    else:
-        raise ValueError(
-            "There needs to be at least one common cause to"
-            + "automatically compute the default value of kappa_t."
-            + " Provide a value for kappa_t"
-        )
     t = data[treatment_name]
-    # Standardizing the data
-    observed_common_causes = StandardScaler().fit_transform(observed_common_causes)
+    no_common_causes_error_message = "There needs to be at least one common cause to automatically compute the default value of kappa_t. Provide a value for kappa_t"
+    observed_common_causes = preprocess_observed_common_causes(data, target_estimand, no_common_causes_error_message)
+
     if effect_on_t == "binary_flip":
         # Fit a model containing all confounders and compare predictions
         # using all features compared to all features except a given
@@ -272,19 +299,10 @@
     len_kappa_y: int = 10,
 ):
     """Infer default effect strength of simulated confounder on treatment."""
-    observed_common_causes_names = target_estimand.get_backdoor_variables()
-    if len(observed_common_causes_names) > 0:
-        observed_common_causes = data[observed_common_causes_names]
-        observed_common_causes = pd.get_dummies(observed_common_causes, drop_first=True)
-    else:
-        raise ValueError(
-            "There needs to be at least one common cause to"
-            + "automatically compute the default value of kappa_y."
-            + " Provide a value for kappa_y"
-        )
     y = data[outcome_name]
-    # Standardizing the data
-    observed_common_causes = StandardScaler().fit_transform(observed_common_causes)
+    no_common_causes_error_message = "There needs to be at least one common cause to automatically compute the default value of kappa_y. Provide a value for kappa_y"
+    observed_common_causes = preprocess_observed_common_causes(data, target_estimand, no_common_causes_error_message)
+
     if effect_on_y == "binary_flip":
         # Fit a model containing all confounders and compare predictions
         # using all features compared to all features except a given
diff --git a/dowhy/utils/encoding.py b/dowhy/utils/encoding.py
index 722e1473ae..2c4929acc1 100644
--- a/dowhy/utils/encoding.py
+++ b/dowhy/utils/encoding.py
@@ -60,3 +60,53 @@ def one_hot_encode(data: pd.DataFrame, columns=None, drop_first: bool = False, e
     df_result = pd.concat([df_columns_to_keep, df_encoded], axis=1)

     return df_result, encoder
+
+
+class Encoders:
+    """Categorical data one-hot encoding helper object.
+
+    A factory object which manages a set of sklearn.preprocessing.OneHotEncoder instances,
+    although the `encode()` method can be overridden to replace these with your preferred encoder.
+
+    Each encoder instance is given a name by which it can be retrieved later, and is used to encode
+    a different set of variables.
+    """
+
+    def __init__(self, drop_first=True):
+        """Initializes an instance and calls `reset()`.
+
+        :param drop_first: If True, the first category value will not be encoded with a bit in 1-hot encoding.
+        It will be implicit instead, by the absence of any bit representing this value in the relevant columns.
+        Set to False to include a bit for each value of every categorical variable.
+        """
+        self.drop_first = drop_first
+        self.reset()
+
+    def reset(self):
+        """
+        Removes any reference to data encoders, causing them to be re-created on next `encode()`.
+        A separate encoder is used for each named set of variables.
+        """
+        self._encoders = {}
+
+    def encode(self, data: pd.DataFrame, encoder_name: str):
+        """
+        Encodes categorical columns in the given data, returning a new dataframe containing
+        all original data and the encoded columns. Numerical data is unchanged; categorical
+        types are one-hot encoded. `encoder_name` identifies a specific encoder to be used
+        if available, or created if not. The encoder can be reused in subsequent calls.
+
+        :param data: Data to encode.
+        :param encoder_name: The name for the encoder to be used.
+        :returns: The encoded data.
+        """
+        existing_encoder = self._encoders.get(encoder_name)
+        encoded_variables, encoder = one_hot_encode(
+            data,
+            drop_first=self.drop_first,
+            encoder=existing_encoder,
+        )
+
+        # Remember encoder
+        self._encoders[encoder_name] = encoder
+        return encoded_variables
diff --git a/dowhy/utils/propensity_score.py b/dowhy/utils/propensity_score.py
index df49d58bbc..e8f68ceb87 100644
--- a/dowhy/utils/propensity_score.py
+++ b/dowhy/utils/propensity_score.py
@@ -1,10 +1,11 @@
 import numpy as np
 import pandas as pd
-from pandas import get_dummies
 from sklearn.linear_model import LogisticRegression
 from sklearn.preprocessing import LabelEncoder
 from statsmodels.nonparametric.kernel_density import EstimatorSettings, KDEMultivariateConditional

+from dowhy.utils.encoding import one_hot_encode
+

 def propensity_of_treatment_score(data, covariates, treatment, model="logistic", variable_types=None):
     if model == "logistic":
@@ -114,8 +115,14 @@ def binarize_discrete(data, covariates, variable_types):
     if variable_types:
         for variable in covariates:
             variable_type = variable_types[variable]
+            # variable_types maps each variable's name to its type code: 'c' for continuous,
+            # 'o' for ordered, 'd' for discrete, and 'u' for unordered discrete.
             if variable_type in ["d", "o", "u"]:
-                dummies = get_dummies(data[variable])
+                # [] notation to retain a DataFrame rather than a Series.
+                # For one_hot_encode, the dtype must be categorical (or string), or the column won't be encoded.
+                variable_data = data.loc[:, [variable]].astype(str)
+                dummies, _ = one_hot_encode(variable_data)  # Original impl. used pd.get_dummies, whose drop_first default is False
                 dummies.columns = [variable + str(col) for col in dummies.columns]
                 dummies = dummies[dummies.columns[:-1]]
                 covariates += list(dummies.columns)
diff --git a/tests/causal_estimators/base.py b/tests/causal_estimators/base.py
index f3ff3772a8..c7bfeebb60 100755
--- a/tests/causal_estimators/base.py
+++ b/tests/causal_estimators/base.py
@@ -1,5 +1,8 @@
 import itertools

+import numpy as np
+import pandas as pd
+
 import dowhy.datasets
 from dowhy import EstimandType, identify_effect_auto
 from dowhy.graph import build_graph_from_str
@@ -179,3 +182,198 @@ def custom_data_average_treatment_effect_test(self, data):
     )
     res = True if (error < true_ate * self._error_tolerance) else False
     assert res
+
+
+class SimpleEstimatorWithModelParams(object):
+    def __init__(self, Estimator, method_params, identifier_method="backdoor"):
+        self._Estimator = Estimator
+        self._method_params = method_params
+        self._identifier_method = identifier_method
+
+    def consistent_estimator_encoding_test(self):
+        """
+        This test tries to verify and enforce consistent encoding of categorical variables
+        by Estimators. The desired behaviour is that encodings of new values are produced
+        during `fit()`, which is also when the model is learned/trained/fitted.
+
+        In `estimator.estimate_effect()` and `do(x)` the same encodings should be reused.
+        """
+
+        # Generate a dataset with some categorical variables (common causes).
+        # This configuration is necessary for the test and should not be varied.
+        data = dowhy.datasets.linear_dataset(
+            beta=1,
+            num_common_causes=3,
+            num_discrete_common_causes=2,
+            num_instruments=2,
+            num_effect_modifiers=0,
+            num_discrete_effect_modifiers=0,
+            num_treatments=1,
+            num_frontdoor_variables=0,
+            num_samples=500,
+            treatment_is_binary=True,
+            treatment_is_category=False,
+            outcome_is_binary=False,
+        )
+
+        # For the purposes of the test, these are the categorical columns.
+        encoded_categorical_columns = ["W1", "W2"]
+
+        # Since their values are integers, convert them to string type to ensure
+        # categorical handling.
+        df_1 = data["df"]
+        df_1[encoded_categorical_columns] = df_1[encoded_categorical_columns].astype(str)
+
+        def fit_estimator(data, method_params):
+            """
+            Creates an Estimator, identifies the effect, and fits the Estimator to the data.
+            The Estimator is returned.
+            """
+
+            target_estimand = identify_effect_auto(
+                build_graph_from_str(data["gml_graph"]),
+                observed_nodes=list(data["df"].columns),
+                action_nodes=data["treatment_name"],
+                outcome_nodes=data["outcome_name"],
+                estimand_type=EstimandType.NONPARAMETRIC_ATE,
+            )
+            target_estimand.set_identifier_method(self._identifier_method)
+
+            estimator = self._Estimator(
+                identified_estimand=target_estimand,
+                **method_params,
+            )
+
+            estimator.fit(
+                df_1,
+                effect_modifier_names=data["effect_modifier_names"],
+            )
+            return estimator
+
+        def estimate(estimator, df):
+            """
+            Returns an Estimate of the ATE, given the data provided.
+ """ + print(f"Est,,, {type(estimator)}") + estimate = estimator.estimate_effect( + df, + control_value=0, + treatment_value=1, + test_significance=False, + evaluate_effect_strength=False, + confidence_intervals=False, + target_units="ate", + ) + return estimate + + def swap_first_row(df: pd.DataFrame, columns: list): + """ + A property of some categorical encoders (e.g. Pandas' `get_dummies()`) is that the + values of encoded variables are assigned on the order in which each unique value is + encountered in the data. Therefore, by swapping the *first* row of some data with + some other row, we can try to try the encoder into a different encoding of the data. + + This function finds a row which is dissimilar to the first row in terms of all the + values in `columns`. This row is then swapped with row 0. A copy of the data with the + rows swapped is returned. + + :param df: A DataFrame. + :param columns: A list of column names, the values of which must be dissimilar to the first row + :returns: A copy of df in which the first row is swapped with another row. + """ + # Get the values of row 0 for the specified columns + row_0_values = df.loc[0, columns].tolist() + + # Find rows where values differ from row 0 in terms of all values in the specified columns + n = df[(df[columns] != row_0_values).all(axis=1)].index[0] + + # Create a copy of the data and swap the rows. + df_swap = df.copy() + df_swap.iloc[0] = df.iloc[n] + df_swap.iloc[n] = df.iloc[0] + return df_swap + + # Test 1: Permuting data order does not affect Effect estimate. + # This test will not likely fail with a RegressionEstimator, because + # the effect of common cause variables is additive and does not contribute to + # the estimated effect. However, it could fail with other Estimators. + estimator = fit_estimator(data, self._method_params) + estimate_1 = estimate(estimator, df_1) + df_2 = swap_first_row(df_1, encoded_categorical_columns) + estimate_2 = estimate(estimator, df_2) + + error = abs(estimate_1.value - estimate_2.value) + error_tolerance = 1.0e-6 # tiny errors OK due to e.g. precision errors + print( + "Difference {0} between ATE estimates 1: {1} and 2: {2} must be < {3}".format( + error, + estimate_1.value, + estimate_2.value, + error_tolerance, + ) + ) + assert error < error_tolerance + + # Test 2: Verify that estimated Outcomes from "do-operator" are unchanged + # While for some Estimators the Effect is unaffected by changes to common-cause + # data, and data ordering, predicted Outcomes should be as all variables can + # contribute to these Outcomes. However, they are only available in a standard + # interface for Estimators which support `do(x)`. + # + # In this test, we verify that the result of `do(x)` does not change when we + # present our two datasets (one has swapped first row). If the result differs, + # this is likely due to the encoding of these new data, since the model is + # unchanged. + # + # Unlike the Effect test #1 above, this test is verifiable; we can randomize + # the values and combinations of the encoded values and verify that under these + # conditions the result of `do(x)` *does* change. This is the type of error we + # expect to observe if there's an encoding error - all the encoded variables + # would change. + def randomize_column_values(df, columns): + """ + Returns a copy of `df` with randomized values for specified `columns`. + Randomized values are chosen uniformly from the set of unique values + in each specified column. 
+            This action should disrupt the result of `do(x)` on the data.
+            """
+            df = df.copy()
+            num_rows = len(df)
+            for column in columns:
+                possible_values = df[column].unique()
+                df[column] = np.random.choice(possible_values, num_rows)
+            return df
+
+        try:
+            df_3 = randomize_column_values(df_1, encoded_categorical_columns)
+
+            treatment_value = 1
+            do_x_with_df_1 = estimator.do(x=treatment_value, data_df=df_1)
+            do_x_with_df_2 = estimator.do(x=treatment_value, data_df=df_2)
+            do_x_with_df_3 = estimator.do(x=treatment_value, data_df=df_3)
+
+            # Test that the do(x) result is unchanged despite the row permutation
+            error_2 = abs(do_x_with_df_1 - do_x_with_df_2)
+            print(
+                "Difference {0} between do(x) 1: {1} and 2: {2} must be < {3}".format(
+                    error_2,
+                    do_x_with_df_1,
+                    do_x_with_df_2,
+                    error_tolerance,
+                )
+            )
+            assert error_2 < error_tolerance
+
+            # Verify that this test *does* detect errors when the common-cause data is changed.
+            error_3 = abs(do_x_with_df_1 - do_x_with_df_3)
+            print(
+                "Difference {0} between do(x) 1: {1} and 3: {2} must be > {3}".format(
+                    error_3,
+                    do_x_with_df_1,
+                    do_x_with_df_3,
+                    error_tolerance,
+                )
+            )
+            assert error_3 > error_tolerance
+        except NotImplementedError:
+            pass  # Expected, for many Estimators
diff --git a/tests/causal_estimators/test_estimator_consistency.py b/tests/causal_estimators/test_estimator_consistency.py
new file mode 100755
index 0000000000..139bfb32ec
--- /dev/null
+++ b/tests/causal_estimators/test_estimator_consistency.py
@@ -0,0 +1,49 @@
+import statsmodels.api as sm
+from pytest import mark
+
+from dowhy.causal_estimators.generalized_linear_model_estimator import GeneralizedLinearModelEstimator
+from dowhy.causal_estimators.linear_regression_estimator import LinearRegressionEstimator
+from dowhy.causal_estimators.propensity_score_matching_estimator import PropensityScoreMatchingEstimator
+from dowhy.causal_estimators.propensity_score_stratification_estimator import PropensityScoreStratificationEstimator
+from dowhy.causal_estimators.propensity_score_weighting_estimator import PropensityScoreWeightingEstimator
+
+from .base import SimpleEstimatorWithModelParams
+
+
+@mark.usefixtures("fixed_seed")
+class TestEstimatorConsistency(object):
+    @mark.parametrize(
+        [
+            "Estimator",
+            "method_params",
+        ],
+        [
+            (
+                PropensityScoreMatchingEstimator,
+                {},
+            ),
+            (
+                PropensityScoreStratificationEstimator,
+                {},
+            ),
+            (
+                PropensityScoreWeightingEstimator,
+                {},
+            ),
+            (
+                LinearRegressionEstimator,
+                {},
+            ),
+            (
+                GeneralizedLinearModelEstimator,
+                {"glm_family": sm.families.Poisson()},
+            ),
+        ],
+    )
+    def test_encoding_consistency(
+        self,
+        Estimator,
+        method_params,
+    ):
+        estimator_tester = SimpleEstimatorWithModelParams(Estimator, method_params)
+        estimator_tester.consistent_estimator_encoding_test()
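
---

Usage sketch: a minimal illustration of the consistency guarantee this patch introduces,
using the Encoders helper added in dowhy/utils/encoding.py. The DataFrames and column
names below are hypothetical example data, not part of the patch or the dowhy datasets.

    import pandas as pd

    from dowhy.utils.encoding import Encoders

    # Hypothetical training data: W1 is string-typed, so it is treated as categorical;
    # W2 is numeric and passes through unchanged.
    train = pd.DataFrame({"W1": ["a", "b", "c", "a"], "W2": [0.5, 1.2, 0.7, 0.3]})
    # New data at inference time, with the same categories seen in a different order.
    new = pd.DataFrame({"W1": ["c", "a", "b"], "W2": [0.9, 0.1, 0.4]})

    # drop_first=True by default, mirroring the old pd.get_dummies(..., drop_first=True).
    encoders = Encoders()

    # The first call under a given name fits a OneHotEncoder and remembers it.
    encoded_train = encoders.encode(train, "observed_common_causes")
    # Later calls under the same name reuse the fitted encoder, so the
    # category-to-column mapping is identical at fit and inference time.
    encoded_new = encoders.encode(new, "observed_common_causes")

    # Both frames should share the same encoded columns. By contrast, calling
    # pd.get_dummies on each frame separately could map categories to different
    # columns depending on the order in which values were encountered.
    assert list(encoded_train.columns) == list(encoded_new.columns)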