From 65f3031bb103d7219fee4c62cbfc8d2747d08d9c Mon Sep 17 00:00:00 2001
From: drawlinson
Date: Wed, 27 Mar 2024 03:33:17 +1100
Subject: [PATCH] Replace all occurrences of Pandas' get_dummies() with
 sklearn OneHotEncoder (#1135)

* For consistency and to avoid future issues, replace all occurrences of
  Pandas' get_dummies with sklearn's OneHotEncoder. Encoder lifespan:
  encoders are reused across estimate_effect() calls and replaced on
  CausalEstimator.fit(). Additional uses of get_dummies in the do-sampler
  propensity score utilities, which have no side effects or
  encoding-consistency issues, are also replaced for consistency.

Signed-off-by: DAVID RAWLINSON

* Add categorical encoding consistency tests for CausalEstimators. Fix bug
  in arg order for RegressionEstimator._do().

Signed-off-by: DAVID RAWLINSON

---------

Signed-off-by: DAVID RAWLINSON
Co-authored-by: DAVID RAWLINSON
---
 dowhy/causal_estimator.py                      | 37 +++-
 dowhy/causal_estimators/causalml.py            |  5 +-
 .../distance_matching_estimator.py             |  3 +-
 dowhy/causal_estimators/econml.py              |  9 +-
 .../instrumental_variable_estimator.py         |  1 +
 .../propensity_score_estimator.py              |  4 +-
 .../regression_discontinuity_estimator.py      |  1 +
 .../causal_estimators/regression_estimator.py  | 81 +------
 .../two_stage_regression_estimator.py          |  7 +-
 .../add_unobserved_common_cause.py             | 66 +++---
 dowhy/utils/encoding.py                        | 50 +++++
 dowhy/utils/propensity_score.py                | 11 +-
 tests/causal_estimators/base.py                | 198 ++++++++++++++++++
 .../test_estimator_consistency.py              | 49 +++++
 14 files changed, 409 insertions(+), 113 deletions(-)
 create mode 100755 tests/causal_estimators/test_estimator_consistency.py

diff --git a/dowhy/causal_estimator.py b/dowhy/causal_estimator.py
index 061361720d..fccca7c418 100755
--- a/dowhy/causal_estimator.py
+++ b/dowhy/causal_estimator.py
@@ -11,6 +11,7 @@
 import dowhy.interpreters as interpreters
 from dowhy.causal_identifier.identified_estimand import IdentifiedEstimand
 from dowhy.utils.api import parse_state
+from dowhy.utils.encoding import Encoders

 logger = logging.getLogger(__name__)

@@ -112,6 +113,35 @@ def __init__(
         self._bootstrap_estimates = None
         self._bootstrap_null_estimates = None

+        self._encoders = Encoders()
+
+    def reset_encoders(self):
+        """
+        Removes any reference to data encoders, causing them to be re-created on next `fit()`.
+
+        It's important that data is consistently encoded, otherwise models will produce inconsistent output.
+        In particular, categorical variables are one-hot encoded; the mapping of original data values
+        must be identical between model training/fitting and inference time.
+
+        Encoders are reset when `fit()` is called again, as the data is assumed to have changed.
+
+        A separate encoder is used for each subset of variables (treatment, common causes and effect modifiers).
+        """
+        self._encoders.reset()
+
+    def _encode(self, data: pd.DataFrame, encoder_name: str):
+        """
+        Encodes categorical columns in the given data, returning a new dataframe containing
+        all original data and the encoded columns. Numerical data is unchanged; categorical
+        types are one-hot encoded. `encoder_name` identifies a specific encoder to be used
+        if available, or created if not. The encoder can be reused in subsequent calls.
+
+        :param data: Data to encode.
+        :param encoder_name: The name for the encoder to be used.
+        :returns: The encoded data.
+ """ + return self._encoders.encode(data, encoder_name) + def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optional[List[str]] = None): """Sets the effect modifiers for the estimator Modifies need_conditional_estimates accordingly to effect modifiers value @@ -124,7 +154,7 @@ def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optio self._effect_modifier_names = [cname for cname in effect_modifier_names if cname in data.columns] if len(self._effect_modifier_names) > 0: self._effect_modifiers = data[self._effect_modifier_names] - self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True) + self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers") self.logger.debug("Effect modifiers: " + ",".join(self._effect_modifier_names)) else: self._effect_modifier_names = [] @@ -234,7 +264,10 @@ def _estimate_conditional_effects( effect_modifier_names[i] = prefix + str(em) # Grouping by effect modifiers and computing effect separately by_effect_mods = data.groupby(effect_modifier_names) - cond_est_fn = lambda x: self._do(self._treatment_value, x) - self._do(self._control_value, x) + + def cond_est_fn(x): + return self._do(self._treatment_value, x) - self._do(self._control_value, x) + conditional_estimates = by_effect_mods.apply(estimate_effect_fn) # Deleting the temporary categorical columns for em in effect_modifier_names: diff --git a/dowhy/causal_estimators/causalml.py b/dowhy/causal_estimators/causalml.py index 49e9ea0f20..d44f0d9ac8 100644 --- a/dowhy/causal_estimators/causalml.py +++ b/dowhy/causal_estimators/causalml.py @@ -116,6 +116,7 @@ def fit( effects, or return a heterogeneous effect function. Not all methods support this currently. """ + self.reset_encoders() # Forget any existing encoders self._set_effect_modifiers(data, effect_modifier_names) # Check the backdoor variables being used @@ -127,7 +128,7 @@ def fit( # Get the data of the unobserved confounders self._observed_common_causes = data[self._observed_common_causes_names] # One hot encode the data if they are categorical - self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True) + self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes") else: self._observed_common_causes = [] @@ -138,7 +139,7 @@ def fit( self._instrumental_variable_names = self._target_estimand.instrumental_variables if self._instrumental_variable_names: self._instrumental_variables = data[self._instrumental_variable_names] - self._instrumental_variables = pd.get_dummies(self._instrumental_variables, drop_first=True) + self._instrumental_variables = self._encode(self._instrumental_variables, "instrumental_variables") else: self._instrumental_variables = [] diff --git a/dowhy/causal_estimators/distance_matching_estimator.py b/dowhy/causal_estimators/distance_matching_estimator.py index e63222404a..3f44593314 100644 --- a/dowhy/causal_estimators/distance_matching_estimator.py +++ b/dowhy/causal_estimators/distance_matching_estimator.py @@ -122,6 +122,7 @@ def fit( """ self.exact_match_cols = exact_match_cols + self.reset_encoders() # Forget any existing encoders self._set_effect_modifiers(data, effect_modifier_names) # Check if the treatment is one-dimensional @@ -146,7 +147,7 @@ def fit( # Convert the categorical variables into dummy/indicator variables # Basically, this gives a one hot encoding for each category # The first category is taken to be the base line. 
-            self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
+            self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
         else:
             self._observed_common_causes = None
             error_msg = "No common causes/confounders present. Distance matching methods are not applicable"
diff --git a/dowhy/causal_estimators/econml.py b/dowhy/causal_estimators/econml.py
index 819d3211cb..33cd4b0b82 100755
--- a/dowhy/causal_estimators/econml.py
+++ b/dowhy/causal_estimators/econml.py
@@ -120,6 +120,7 @@ def fit(
             effects, or return a heterogeneous effect function. Not all
             methods support this currently.
         """
+        self.reset_encoders()  # Forget any existing encoders
         self._set_effect_modifiers(data, effect_modifier_names)
         # Save parameters for later refutter fitting
         self._econml_fit_params = kwargs
@@ -148,12 +149,12 @@
             # Also only update self._effect_modifiers, and create a copy of self._effect_modifier_names
             # the latter can be used by other estimator methods later
             self._effect_modifiers = data[effect_modifier_names]
-            self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True)
+            self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
             self._effect_modifier_names = effect_modifier_names
         self.logger.debug("Effect modifiers: " + ",".join(effect_modifier_names))
         if self._observed_common_causes_names:
             self._observed_common_causes = data[self._observed_common_causes_names]
-            self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
+            self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
         else:
             self._observed_common_causes = None
         self.logger.debug("Back-door variables used:" + ",".join(self._observed_common_causes_names))
@@ -165,7 +166,7 @@
             self.estimating_instrument_names = parse_state(self.iv_instrument_name)
         if self.estimating_instrument_names:
             self._estimating_instruments = data[self.estimating_instrument_names]
-            self._estimating_instruments = pd.get_dummies(self._estimating_instruments, drop_first=True)
+            self._estimating_instruments = self._encode(self._estimating_instruments, "estimating_instruments")
         else:
             self._estimating_instruments = None
@@ -277,7 +278,7 @@ def _estimate_confidence_intervals(self, confidence_level=None, method=None):
         """Returns None if the confidence interval has not been calculated."""
         return self.effect_intervals

-    def _do(self, x):
+    def _do(self, x, data_df=None):
         raise NotImplementedError

     def construct_symbolic_estimator(self, estimand):
diff --git a/dowhy/causal_estimators/instrumental_variable_estimator.py b/dowhy/causal_estimators/instrumental_variable_estimator.py
index 5cc34f06fc..6570f0a540 100755
--- a/dowhy/causal_estimators/instrumental_variable_estimator.py
+++ b/dowhy/causal_estimators/instrumental_variable_estimator.py
@@ -92,6 +92,7 @@ def fit(
             effects, or return a heterogeneous effect function. Not all
             methods support this currently.
""" + self.reset_encoders() # Forget any existing encoders self._set_effect_modifiers(data, effect_modifier_names) self.estimating_instrument_names = self._target_estimand.instrumental_variables diff --git a/dowhy/causal_estimators/propensity_score_estimator.py b/dowhy/causal_estimators/propensity_score_estimator.py index a6a2f2df39..0ffd7ae41c 100644 --- a/dowhy/causal_estimators/propensity_score_estimator.py +++ b/dowhy/causal_estimators/propensity_score_estimator.py @@ -93,6 +93,7 @@ def fit( effects, or return a heterogeneous effect function. Not all methods support this currently. """ + self.reset_encoders() # Forget any existing encoders self._set_effect_modifiers(data, effect_modifier_names) self.logger.debug("Back-door variables used:" + ",".join(self._target_estimand.get_backdoor_variables())) @@ -103,7 +104,8 @@ def fit( # Convert the categorical variables into dummy/indicator variables # Basically, this gives a one hot encoding for each category # The first category is taken to be the base line. - self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True) + self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes") + else: self._observed_common_causes = None error_msg = "No common causes/confounders present. Propensity score based methods are not applicable" diff --git a/dowhy/causal_estimators/regression_discontinuity_estimator.py b/dowhy/causal_estimators/regression_discontinuity_estimator.py index 47c1211711..daed8a8492 100755 --- a/dowhy/causal_estimators/regression_discontinuity_estimator.py +++ b/dowhy/causal_estimators/regression_discontinuity_estimator.py @@ -98,6 +98,7 @@ def fit( effects, or return a heterogeneous effect function. Not all methods support this currently. """ + self.reset_encoders() # Forget any existing encoders self._set_effect_modifiers(data, effect_modifier_names) self.rd_variable = data[self.rd_variable_name] diff --git a/dowhy/causal_estimators/regression_estimator.py b/dowhy/causal_estimators/regression_estimator.py index f7c86fb357..a0457b1f47 100644 --- a/dowhy/causal_estimators/regression_estimator.py +++ b/dowhy/causal_estimators/regression_estimator.py @@ -5,7 +5,6 @@ import statsmodels.api as sm from dowhy.causal_estimator import CausalEstimate, CausalEstimator, IdentifiedEstimand -from dowhy.utils.encoding import one_hot_encode class RegressionEstimator(CausalEstimator): @@ -71,53 +70,6 @@ def __init__( self.model = None - # Data encoders - # encoder_drop_first will not encode the first category value with a bit in 1-hot encoding. - # It will be implicit instead, by the absence of any bit representing this value in the relevant columns. - # Set to False to include a bit for each value of every categorical variable. - self.encoder_drop_first = True - self.reset_encoders() - - def reset_encoders(self): - """ - Removes any reference to data encoders, causing them to be re-created on next `fit()`. - - It's important that data is consistently encoded otherwise models will produce inconsistent output. - In particular, categorical variables are one-hot encoded; the mapping of original data values - must be identical between model training/fitting and inference time. - - Encoders are reset when `fit()` is called again, as the data is assumed to have changed. - - A separate encoder is used for each subset of variables (treatment, common causes and effect modifiers). 
- """ - self._encoders = { - "treatment": None, - "observed_common_causes": None, - "effect_modifiers": None, - } - - def _encode(self, data: pd.DataFrame, encoder_name: str): - """ - Encodes categorical columns in the given data, returning a new dataframe containing - all original data and the encoded columns. Numerical data is unchanged, categorical - types are one-hot encoded. `encoder_name` identifies a specific encoder to be used - if available, or created if not. The encoder can be reused in subsequent calls. - - :param data: Data to encode. - :param encoder_name: The name for the encoder to be used. - :returns: The encoded data. - """ - existing_encoder = self._encoders.get(encoder_name) - encoded_variables, encoder = one_hot_encode( - data, - drop_first=self.encoder_drop_first, - encoder=existing_encoder, - ) - - # Remember encoder - self._encoders[encoder_name] = encoder - return encoded_variables - def fit( self, data: pd.DataFrame, @@ -170,7 +122,7 @@ def estimate_effect( need_conditional_estimates = self.need_conditional_estimates # TODO make treatment_value and control value also as local parameters # All treatments are set to the same constant value - effect_estimate = self._do(data, treatment_value) - self._do(data, control_value) + effect_estimate = self._do(treatment_value, data) - self._do(control_value, data) conditional_effect_estimates = None if need_conditional_estimates: conditional_effect_estimates = self._estimate_conditional_effects( @@ -197,31 +149,6 @@ def _estimate_effect_fn(self, data_df): est = self.estimate_effect(data=data_df, need_conditional_estimates=False) return est.value - def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optional[List[str]] = None): - """Sets the effect modifiers for the estimator - Modifies need_conditional_estimates accordingly to effect modifiers value - :param effect_modifiers: Variables on which to compute separate - effects, or return a heterogeneous effect function. Not all - methods support this currently. 
- """ - self._effect_modifiers = effect_modifier_names - if effect_modifier_names is not None: - self._effect_modifier_names = [cname for cname in effect_modifier_names if cname in data.columns] - if len(self._effect_modifier_names) > 0: - self._effect_modifiers = data[self._effect_modifier_names] - self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers") - self.logger.debug("Effect modifiers: " + ",".join(self._effect_modifier_names)) - else: - self._effect_modifier_names = [] - else: - self._effect_modifier_names = [] - - self.need_conditional_estimates = ( - self.need_conditional_estimates - if self.need_conditional_estimates != "auto" - else (self._effect_modifier_names and len(self._effect_modifier_names) > 0) - ) - def _build_features(self, data_df: pd.DataFrame, treatment_values=None): treatment_vals = self._encode(data_df[self._target_estimand.treatment_variable], "treatment") @@ -295,6 +222,10 @@ def predict(self, data_df): interventional_outcomes = self.predict_fn(data_df, self.model, new_features) return interventional_outcomes - def _do(self, data_df: pd.DataFrame, treatment_val): + def _do( + self, + treatment_val, + data_df: pd.DataFrame, + ): interventional_outcomes = self.interventional_outcomes(data_df, treatment_val) return interventional_outcomes.mean() diff --git a/dowhy/causal_estimators/two_stage_regression_estimator.py b/dowhy/causal_estimators/two_stage_regression_estimator.py index ebb01558ed..659ba7f955 100644 --- a/dowhy/causal_estimators/two_stage_regression_estimator.py +++ b/dowhy/causal_estimators/two_stage_regression_estimator.py @@ -167,6 +167,7 @@ def fit( effects, or return a heterogeneous effect function. Not all methods support this currently. """ + self.reset_encoders() # Forget any existing encoders self._set_effect_modifiers(data, effect_modifier_names) if len(self._target_estimand.treatment_variable) > 1: @@ -315,10 +316,12 @@ def build_first_stage_features(self, data_df: pd.DataFrame): treatment_vals = data_df[self._target_estimand.treatment_variable] if len(self._observed_common_causes_names) > 0: observed_common_causes_vals = data_df[self._observed_common_causes_names] - observed_common_causes_vals = pd.get_dummies(observed_common_causes_vals, drop_first=True) + observed_common_causes_vals = self._encode(observed_common_causes_vals, "observed_common_causes") + if self._effect_modifier_names: effect_modifiers_vals = data_df[self._effect_modifier_names] - effect_modifiers_vals = pd.get_dummies(effect_modifiers_vals, drop_first=True) + effect_modifiers_vals = self._encode(effect_modifiers_vals, "effect_modifiers") + if type(treatment_vals) is not np.ndarray: treatment_vals = treatment_vals.to_numpy() if treatment_vals.shape[0] != data_df.shape[0]: diff --git a/dowhy/causal_refuters/add_unobserved_common_cause.py b/dowhy/causal_refuters/add_unobserved_common_cause.py index 333dc72060..80f0ef18f2 100755 --- a/dowhy/causal_refuters/add_unobserved_common_cause.py +++ b/dowhy/causal_refuters/add_unobserved_common_cause.py @@ -19,6 +19,7 @@ from dowhy.causal_refuter import CausalRefutation, CausalRefuter, choose_variables from dowhy.causal_refuters.evalue_sensitivity_analyzer import EValueSensitivityAnalyzer from dowhy.causal_refuters.linear_sensitivity_analyzer import LinearSensitivityAnalyzer +from dowhy.utils.encoding import Encoders logger = logging.getLogger(__name__) @@ -201,6 +202,41 @@ def include_simulated_confounder( ) +def preprocess_observed_common_causes( + data: pd.DataFrame, + target_estimand: IdentifiedEstimand, + 
+    no_common_causes_error_message: str,
+):
+    """
+    Preprocesses backdoor variables (observed common causes) and returns the pre-processed matrix.
+
+    At least one backdoor (common cause) variable is required. Raises an exception if none present.
+
+    Preprocessing has two steps:
+    1. Categorical encoding.
+    2. Standardization.
+
+    :param data: All data, some of which needs preprocessing.
+    :param target_estimand: Estimand for desired effect including definition of backdoor variables.
+    :param no_common_causes_error_message: Message to be displayed with ValueError if no backdoor variable present.
+    :return: DataFrame containing pre-processed data.
+    """
+
+    # 1. Categorical encoding of relevant variables
+    observed_common_causes_names = target_estimand.get_backdoor_variables()
+    if len(observed_common_causes_names) > 0:
+        # The encoded data is only used to calculate a parameter, so the encoder can be discarded.
+        observed_common_causes = data[observed_common_causes_names]
+        encoders = Encoders()
+        observed_common_causes = encoders.encode(observed_common_causes, "observed_common_causes")
+    else:
+        raise ValueError(no_common_causes_error_message)
+
+    # 2. Standardizing the data
+    observed_common_causes = StandardScaler().fit_transform(observed_common_causes)
+    return observed_common_causes
+
+
 def _infer_default_kappa_t(
     data: pd.DataFrame,
     target_estimand: IdentifiedEstimand,
@@ -210,19 +246,10 @@
     len_kappa_t: int = 10,
 ):
     """Infer default effect strength of simulated confounder on treatment."""
-    observed_common_causes_names = target_estimand.get_backdoor_variables()
-    if len(observed_common_causes_names) > 0:
-        observed_common_causes = data[observed_common_causes_names]
-        observed_common_causes = pd.get_dummies(observed_common_causes, drop_first=True)
-    else:
-        raise ValueError(
-            "There needs to be at least one common cause to"
-            + "automatically compute the default value of kappa_t."
-            + " Provide a value for kappa_t"
-        )
     t = data[treatment_name]
-    # Standardizing the data
-    observed_common_causes = StandardScaler().fit_transform(observed_common_causes)
+    no_common_causes_error_message = "There needs to be at least one common cause to automatically compute the default value of kappa_t. Provide a value for kappa_t"
+    observed_common_causes = preprocess_observed_common_causes(data, target_estimand, no_common_causes_error_message)
+
     if effect_on_t == "binary_flip":
         # Fit a model containing all confounders and compare predictions
         # using all features compared to all features except a given
@@ -272,19 +299,10 @@
     len_kappa_y: int = 10,
 ):
     """Infer default effect strength of simulated confounder on treatment."""
-    observed_common_causes_names = target_estimand.get_backdoor_variables()
-    if len(observed_common_causes_names) > 0:
-        observed_common_causes = data[observed_common_causes_names]
-        observed_common_causes = pd.get_dummies(observed_common_causes, drop_first=True)
-    else:
-        raise ValueError(
-            "There needs to be at least one common cause to"
-            + "automatically compute the default value of kappa_y."
-            + " Provide a value for kappa_y"
-        )
     y = data[outcome_name]
-    # Standardizing the data
-    observed_common_causes = StandardScaler().fit_transform(observed_common_causes)
+    no_common_causes_error_message = "There needs to be at least one common cause to automatically compute the default value of kappa_y. Provide a value for kappa_y"
+    observed_common_causes = preprocess_observed_common_causes(data, target_estimand, no_common_causes_error_message)
+
     if effect_on_y == "binary_flip":
         # Fit a model containing all confounders and compare predictions
         # using all features compared to all features except a given
diff --git a/dowhy/utils/encoding.py b/dowhy/utils/encoding.py
index 722e1473ae..2c4929acc1 100644
--- a/dowhy/utils/encoding.py
+++ b/dowhy/utils/encoding.py
@@ -60,3 +60,53 @@ def one_hot_encode(data: pd.DataFrame, columns=None, drop_first: bool = False, e
     df_result = pd.concat([df_columns_to_keep, df_encoded], axis=1)

     return df_result, encoder
+
+
+class Encoders:
+    """Categorical data one-hot encoding helper object.
+
+    A factory object which manages a set of sklearn.preprocessing.OneHotEncoder instances,
+    although the `encode()` method can be overridden to replace these with your preferred encoder.
+
+    Each encoder instance is given a name by which it can be retrieved later, and is used to encode
+    a different set of variables.
+    """
+
+    def __init__(self, drop_first=True):
+        """Initializes an instance and calls `reset()`.
+
+        :param drop_first: If True, the first category value will not be encoded with a bit in 1-hot encoding.
+        It will be implicit instead, by the absence of any bit representing this value in the relevant columns.
+        Set to False to include a bit for each value of every categorical variable.
+        """
+        self.drop_first = drop_first
+        self.reset()
+
+    def reset(self):
+        """
+        Removes any reference to data encoders, causing them to be re-created on next `encode()`.
+        A separate encoder is used for each named set of variables.
+        """
+        self._encoders = {}
+
+    def encode(self, data: pd.DataFrame, encoder_name: str):
+        """
+        Encodes categorical columns in the given data, returning a new dataframe containing
+        all original data and the encoded columns. Numerical data is unchanged; categorical
+        types are one-hot encoded. `encoder_name` identifies a specific encoder to be used
+        if available, or created if not. The encoder can be reused in subsequent calls.
+
+        :param data: Data to encode.
+        :param encoder_name: The name for the encoder to be used.
+        :returns: The encoded data.
+        """
+        existing_encoder = self._encoders.get(encoder_name)
+        encoded_variables, encoder = one_hot_encode(
+            data,
+            drop_first=self.drop_first,
+            encoder=existing_encoder,
+        )
+
+        # Remember encoder
+        self._encoders[encoder_name] = encoder
+        return encoded_variables
diff --git a/dowhy/utils/propensity_score.py b/dowhy/utils/propensity_score.py
index df49d58bbc..e8f68ceb87 100644
--- a/dowhy/utils/propensity_score.py
+++ b/dowhy/utils/propensity_score.py
@@ -1,10 +1,11 @@
 import numpy as np
 import pandas as pd
-from pandas import get_dummies
 from sklearn.linear_model import LogisticRegression
 from sklearn.preprocessing import LabelEncoder
 from statsmodels.nonparametric.kernel_density import EstimatorSettings, KDEMultivariateConditional

+from dowhy.utils.encoding import one_hot_encode
+

 def propensity_of_treatment_score(data, covariates, treatment, model="logistic", variable_types=None):
     if model == "logistic":
@@ -114,8 +115,14 @@ def binarize_discrete(data, covariates, variable_types):
     if variable_types:
         for variable in covariates:
             variable_type = variable_types[variable]
+            # variable_types maps each variable's name to its type code: 'c' for continuous,
+            # 'o' for ordered, 'd' for discrete, and 'u' for unordered discrete.
             if variable_type in ["d", "o", "u"]:
-                dummies = get_dummies(data[variable])
+                # [] notation to retain a DataFrame rather than a Series.
+                # For one_hot_encode, the dtype must be categorical (or string), or the column won't be encoded.
+                variable_data = data.loc[:, [variable]].astype(str)
+                dummies, _ = one_hot_encode(variable_data)  # Original impl. used pd.get_dummies, whose drop_first default is False
                 dummies.columns = [variable + str(col) for col in dummies.columns]
                 dummies = dummies[dummies.columns[:-1]]
                 covariates += list(dummies.columns)
diff --git a/tests/causal_estimators/base.py b/tests/causal_estimators/base.py
index f3ff3772a8..c7bfeebb60 100755
--- a/tests/causal_estimators/base.py
+++ b/tests/causal_estimators/base.py
@@ -1,5 +1,8 @@
 import itertools

+import numpy as np
+import pandas as pd
+
 import dowhy.datasets
 from dowhy import EstimandType, identify_effect_auto
 from dowhy.graph import build_graph_from_str
@@ -179,3 +182,198 @@ def custom_data_average_treatment_effect_test(self, data):
     )
     res = True if (error < true_ate * self._error_tolerance) else False
     assert res
+
+
+class SimpleEstimatorWithModelParams(object):
+    def __init__(self, Estimator, method_params, identifier_method="backdoor"):
+        self._Estimator = Estimator
+        self._method_params = method_params
+        self._identifier_method = identifier_method
+
+    def consistent_estimator_encoding_test(self):
+        """
+        This test tries to verify and enforce consistent encoding of categorical variables
+        by Estimators. The desired behaviour is that encodings of new values are produced
+        during `fit()`, which is also when the model is learned/trained/fitted.
+
+        In `estimator.estimate_effect()` and `do(x)` the same encodings should be reused.
+        """
+
+        # Generate a dataset with some categorical variables (common causes).
+        # This configuration is necessary for the test and should not be varied.
+        data = dowhy.datasets.linear_dataset(
+            beta=1,
+            num_common_causes=3,
+            num_discrete_common_causes=2,
+            num_instruments=2,
+            num_effect_modifiers=0,
+            num_discrete_effect_modifiers=0,
+            num_treatments=1,
+            num_frontdoor_variables=0,
+            num_samples=500,
+            treatment_is_binary=True,
+            treatment_is_category=False,
+            outcome_is_binary=False,
+        )
+
+        # For the purposes of the test, these are the categorical columns.
+        encoded_categorical_columns = ["W1", "W2"]
+
+        # Since their values are integers, convert them to string type to ensure
+        # categorical handling.
+        df_1 = data["df"]
+        df_1[encoded_categorical_columns] = df_1[encoded_categorical_columns].astype(str)
+
+        def fit_estimator(data, method_params):
+            """
+            Creates an Estimator, identifies the effect, and fits the Estimator to the data.
+            The Estimator is returned.
+            """
+
+            target_estimand = identify_effect_auto(
+                build_graph_from_str(data["gml_graph"]),
+                observed_nodes=list(data["df"].columns),
+                action_nodes=data["treatment_name"],
+                outcome_nodes=data["outcome_name"],
+                estimand_type=EstimandType.NONPARAMETRIC_ATE,
+            )
+            target_estimand.set_identifier_method(self._identifier_method)
+
+            estimator = self._Estimator(
+                identified_estimand=target_estimand,
+                **method_params,
+            )
+
+            estimator.fit(
+                df_1,
+                effect_modifier_names=data["effect_modifier_names"],
+            )
+            return estimator
+
+        def estimate(estimator, df):
+            """
+            Returns an Estimate of the ATE, given the data provided.
+ """ + print(f"Est,,, {type(estimator)}") + estimate = estimator.estimate_effect( + df, + control_value=0, + treatment_value=1, + test_significance=False, + evaluate_effect_strength=False, + confidence_intervals=False, + target_units="ate", + ) + return estimate + + def swap_first_row(df: pd.DataFrame, columns: list): + """ + A property of some categorical encoders (e.g. Pandas' `get_dummies()`) is that the + values of encoded variables are assigned on the order in which each unique value is + encountered in the data. Therefore, by swapping the *first* row of some data with + some other row, we can try to try the encoder into a different encoding of the data. + + This function finds a row which is dissimilar to the first row in terms of all the + values in `columns`. This row is then swapped with row 0. A copy of the data with the + rows swapped is returned. + + :param df: A DataFrame. + :param columns: A list of column names, the values of which must be dissimilar to the first row + :returns: A copy of df in which the first row is swapped with another row. + """ + # Get the values of row 0 for the specified columns + row_0_values = df.loc[0, columns].tolist() + + # Find rows where values differ from row 0 in terms of all values in the specified columns + n = df[(df[columns] != row_0_values).all(axis=1)].index[0] + + # Create a copy of the data and swap the rows. + df_swap = df.copy() + df_swap.iloc[0] = df.iloc[n] + df_swap.iloc[n] = df.iloc[0] + return df_swap + + # Test 1: Permuting data order does not affect Effect estimate. + # This test will not likely fail with a RegressionEstimator, because + # the effect of common cause variables is additive and does not contribute to + # the estimated effect. However, it could fail with other Estimators. + estimator = fit_estimator(data, self._method_params) + estimate_1 = estimate(estimator, df_1) + df_2 = swap_first_row(df_1, encoded_categorical_columns) + estimate_2 = estimate(estimator, df_2) + + error = abs(estimate_1.value - estimate_2.value) + error_tolerance = 1.0e-6 # tiny errors OK due to e.g. precision errors + print( + "Difference {0} between ATE estimates 1: {1} and 2: {2} must be < {3}".format( + error, + estimate_1.value, + estimate_2.value, + error_tolerance, + ) + ) + assert error < error_tolerance + + # Test 2: Verify that estimated Outcomes from "do-operator" are unchanged + # While for some Estimators the Effect is unaffected by changes to common-cause + # data, and data ordering, predicted Outcomes should be as all variables can + # contribute to these Outcomes. However, they are only available in a standard + # interface for Estimators which support `do(x)`. + # + # In this test, we verify that the result of `do(x)` does not change when we + # present our two datasets (one has swapped first row). If the result differs, + # this is likely due to the encoding of these new data, since the model is + # unchanged. + # + # Unlike the Effect test #1 above, this test is verifiable; we can randomize + # the values and combinations of the encoded values and verify that under these + # conditions the result of `do(x)` *does* change. This is the type of error we + # expect to observe if there's an encoding error - all the encoded variables + # would change. + def randomize_column_values(df, columns): + """ + Returns a copy of `df` with randomized values for specified `columns`. + Randomized values are chosen uniformly from the set of unique values + in each specified column. 
+            This action should disrupt the result of `do(x)` on the data.
+            """
+            df = df.copy()
+            num_rows = len(df)
+            for column in columns:
+                possible_values = df[column].unique()
+                df[column] = np.random.choice(possible_values, num_rows)
+            return df
+
+        try:
+            df_3 = randomize_column_values(df_1, encoded_categorical_columns)
+
+            treatment_value = 1
+            do_x_with_df_1 = estimator.do(x=treatment_value, data_df=df_1)
+            do_x_with_df_2 = estimator.do(x=treatment_value, data_df=df_2)
+            do_x_with_df_3 = estimator.do(x=treatment_value, data_df=df_3)
+
+            # Test that the do(x) result is unchanged despite the row permutation
+            error_2 = abs(do_x_with_df_1 - do_x_with_df_2)
+            print(
+                "Difference {0} between do(x) 1: {1} and 2: {2} must be < {3}".format(
+                    error_2,
+                    do_x_with_df_1,
+                    do_x_with_df_2,
+                    error_tolerance,
+                )
+            )
+            assert error_2 < error_tolerance
+
+            # Verify that this test *does* detect errors when the common-cause data is changed.
+            error_3 = abs(do_x_with_df_1 - do_x_with_df_3)
+            print(
+                "Difference {0} between do(x) 1: {1} and 3: {2} must be > {3}".format(
+                    error_3,
+                    do_x_with_df_1,
+                    do_x_with_df_3,
+                    error_tolerance,
+                )
+            )
+            assert error_3 > error_tolerance
+        except NotImplementedError:
+            pass  # Expected, for many Estimators
diff --git a/tests/causal_estimators/test_estimator_consistency.py b/tests/causal_estimators/test_estimator_consistency.py
new file mode 100755
index 0000000000..139bfb32ec
--- /dev/null
+++ b/tests/causal_estimators/test_estimator_consistency.py
@@ -0,0 +1,49 @@
+import statsmodels.api as sm
+from pytest import mark
+
+from dowhy.causal_estimators.generalized_linear_model_estimator import GeneralizedLinearModelEstimator
+from dowhy.causal_estimators.linear_regression_estimator import LinearRegressionEstimator
+from dowhy.causal_estimators.propensity_score_matching_estimator import PropensityScoreMatchingEstimator
+from dowhy.causal_estimators.propensity_score_stratification_estimator import PropensityScoreStratificationEstimator
+from dowhy.causal_estimators.propensity_score_weighting_estimator import PropensityScoreWeightingEstimator
+
+from .base import SimpleEstimatorWithModelParams
+
+
+@mark.usefixtures("fixed_seed")
+class TestEstimatorConsistency(object):
+    @mark.parametrize(
+        [
+            "Estimator",
+            "method_params",
+        ],
+        [
+            (
+                PropensityScoreMatchingEstimator,
+                {},
+            ),
+            (
+                PropensityScoreStratificationEstimator,
+                {},
+            ),
+            (
+                PropensityScoreWeightingEstimator,
+                {},
+            ),
+            (
+                LinearRegressionEstimator,
+                {},
+            ),
+            (
+                GeneralizedLinearModelEstimator,
+                {"glm_family": sm.families.Poisson()},
+            ),
+        ],
+    )
+    def test_encoding_consistency(
+        self,
+        Estimator,
+        method_params,
+    ):
+        estimator_tester = SimpleEstimatorWithModelParams(Estimator, method_params)
+        estimator_tester.consistent_estimator_encoding_test()
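
---

Usage sketch: a minimal illustration of the consistency guarantee this patch introduces,
using the Encoders helper added in dowhy/utils/encoding.py. The DataFrames and column
names below are hypothetical example data, not part of the patch or the dowhy datasets.

    import pandas as pd

    from dowhy.utils.encoding import Encoders

    # Hypothetical training data: W1 is string-typed, so it is treated as categorical;
    # W2 is numeric and passes through unchanged.
    train = pd.DataFrame({"W1": ["a", "b", "c", "a"], "W2": [0.5, 1.2, 0.7, 0.3]})
    # New data at inference time, with the same categories seen in a different order.
    new = pd.DataFrame({"W1": ["c", "a", "b"], "W2": [0.9, 0.1, 0.4]})

    # drop_first=True by default, mirroring the old pd.get_dummies(..., drop_first=True).
    encoders = Encoders()

    # The first call under a given name fits a OneHotEncoder and remembers it.
    encoded_train = encoders.encode(train, "observed_common_causes")
    # Later calls under the same name reuse the fitted encoder, so the
    # category-to-column mapping is identical at fit and inference time.
    encoded_new = encoders.encode(new, "observed_common_causes")

    # Both frames should share the same encoded columns. By contrast, calling
    # pd.get_dummies on each frame separately could map categories to different
    # columns depending on the order in which values were encountered.
    assert list(encoded_train.columns) == list(encoded_new.columns)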