Commit
fixed some bugs related to merging with shap and new attribute names
linting

fixed failing tests

fixed notebooks

fixed ortholearner docstring

reverted to allowing n_crossfit_splits and raising deprecation warning. fixed ortholearner doctest

fixed bugs for failing tests

fixed failing test bugs

added extra kwargs to _strata

old sklearn crashes with new scipy. we can revert once we enforce new sklearn

disallowing bootstrap inference when refitting

added many refit tests. fixed some leftover bugs based on new tests

typo in tests

changed monte_carlo_iterations to mc_iters and added mc_agg parameter to change the aggregation method {mean, median}

fixed nuisance aggregation when nuisances have different dimensions

removed _models_nuisance initialization at init.

added rlearner residuals_ property

fixed flaky cate interpreter test
vasilismsr committed Jan 4, 2021
1 parent 72a297a commit ba0dbf0
Showing 19 changed files with 1,022 additions and 333 deletions.
167 changes: 151 additions & 16 deletions econml/_ortho_learner.py
@@ -38,6 +38,7 @@ class in this module implements the general logic in a very versatile way

from .cate_estimator import (BaseCateEstimator, LinearCateEstimator,
                             TreatmentExpansionMixin)
from .inference import BootstrapInference
from .utilities import (_deprecate_positional, _EncoderWrapper, check_input_arrays,
                        cross_product, filter_none_kwargs,
                        inverse_onehot, ndim, reshape, shape, transpose)
@@ -281,9 +282,130 @@ class _OrthoLearner(ABC, TreatmentExpansionMixin, LinearCateEstimator):
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
by :mod:`np.random<numpy.random>`.

monte_carlo_iterations: int, optional (default=None)
mc_iters: int, optional (default=None)
    The number of times to rerun the first stage models to reduce the variance of the nuisances.

mc_agg: {'mean', 'median'}, optional (default='mean')
    How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of
    cross-fitting.

Examples
--------
The example code below implements a very simple version of the double machine learning
method on top of the :class:`._OrthoLearner` class, for expository purposes.
For a more elaborate implementation of a Double Machine Learning child class of the class
:class:`._OrthoLearner`, check out :class:`.DML` and its child classes:

.. testcode::

    import numpy as np
    from sklearn.linear_model import LinearRegression
    from econml._ortho_learner import _OrthoLearner

    class ModelNuisance:
        def __init__(self, model_t, model_y):
            self._model_t = model_t
            self._model_y = model_y

        def fit(self, Y, T, W=None):
            self._model_t.fit(W, T)
            self._model_y.fit(W, Y)
            return self

        def predict(self, Y, T, W=None):
            return Y - self._model_y.predict(W), T - self._model_t.predict(W)

    class ModelFinal:
        def __init__(self):
            return

        def fit(self, Y, T, W=None, nuisances=None):
            Y_res, T_res = nuisances
            self.model = LinearRegression(fit_intercept=False).fit(T_res.reshape(-1, 1), Y_res)
            return self

        def predict(self, X=None):
            return self.model.coef_[0]

        def score(self, Y, T, W=None, nuisances=None):
            Y_res, T_res = nuisances
            return np.mean((Y_res - self.model.predict(T_res.reshape(-1, 1)))**2)

    class OrthoLearner(_OrthoLearner):
        def _gen_ortho_learner_model_nuisance(self):
            return ModelNuisance(LinearRegression(), LinearRegression())

        def _gen_ortho_learner_model_final(self):
            return ModelFinal()

    np.random.seed(123)
    X = np.random.normal(size=(100, 3))
    y = X[:, 0] + X[:, 1] + np.random.normal(0, 0.1, size=(100,))
    est = OrthoLearner(n_splits=2, discrete_treatment=False, discrete_instrument=False,
                       categories='auto', random_state=None)
    est.fit(y, X[:, 0], W=X[:, 1:])

>>> est.score_
0.00756830...
>>> est.const_marginal_effect()
1.02364992...
>>> est.effect()
array([1.023649...])
>>> est.effect(T0=0, T1=10)
array([10.236499...])
>>> est.score(y, X[:, 0], W=X[:, 1:])
0.00727995...
>>> est.ortho_learner_model_final.model
LinearRegression(fit_intercept=False)
>>> est.ortho_learner_model_final.model.coef_
array([1.023649...])
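
Note that the new `mc_iters` and `mc_agg` options from this commit plug into the same constructor. A minimal sketch continuing from the example above (the particular values are illustrative, not taken from this diff):

    # Rerun the first-stage cross-fitting 5 times and aggregate each nuisance by
    # its median across iterations (mc_iters was previously monte_carlo_iterations;
    # mc_agg is new in this commit).
    est_mc = OrthoLearner(n_splits=2, discrete_treatment=False, discrete_instrument=False,
                          categories='auto', random_state=123,
                          mc_iters=5, mc_agg='median')
    est_mc.fit(y, X[:, 0], W=X[:, 1:])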
The following example shows how to do double machine learning with discrete treatments, using
the _OrthoLearner:
.. testcode::

    import scipy.special
    from sklearn.linear_model import LinearRegression, LogisticRegression

    class ModelNuisance:
        def __init__(self, model_t, model_y):
            self._model_t = model_t
            self._model_y = model_y

        def fit(self, Y, T, W=None):
            self._model_t.fit(W, np.matmul(T, np.arange(1, T.shape[1]+1)))
            self._model_y.fit(W, Y)
            return self

        def predict(self, Y, T, W=None):
            return Y - self._model_y.predict(W), T - self._model_t.predict_proba(W)[:, 1:]

    class ModelFinal:
        def __init__(self):
            return

        def fit(self, Y, T, W=None, nuisances=None):
            Y_res, T_res = nuisances
            self.model = LinearRegression(fit_intercept=False).fit(T_res.reshape(-1, 1), Y_res)
            return self

        def predict(self):
            # theta needs to be of dimension (1, d_t) if T is (n, d_t)
            return np.array([[self.model.coef_[0]]])

        def score(self, Y, T, W=None, nuisances=None):
            Y_res, T_res = nuisances
            return np.mean((Y_res - self.model.predict(T_res.reshape(-1, 1)))**2)

    class OrthoLearner(_OrthoLearner):
        def _gen_ortho_learner_model_nuisance(self):
            return ModelNuisance(LogisticRegression(solver='lbfgs'), LinearRegression())

        def _gen_ortho_learner_model_final(self):
            return ModelFinal()

    np.random.seed(123)
    W = np.random.normal(size=(100, 3))
    T = np.random.binomial(1, scipy.special.expit(W[:, 0]))
    y = T + W[:, 0] + np.random.normal(0, 0.01, size=(100,))
    est = OrthoLearner(n_splits=2, discrete_treatment=True, discrete_instrument=False,
                       categories='auto', random_state=None)
    est.fit(y, T, W=W)

>>> est.score_
0.00673015...
>>> est.const_marginal_effect()
array([[1.008401...]])
>>> est.effect()
array([1.008401...])
>>> est.score(y, T, W=W)
0.00310431...
>>> est.ortho_learner_model_final.model.coef_[0]
1.00840170...
Attributes
----------
models_nuisance: list of objects of type(model_nuisance)
@@ -301,14 +423,14 @@ class _OrthoLearner(ABC, TreatmentExpansionMixin, LinearCateEstimator):

def __init__(self, *,
             discrete_treatment, discrete_instrument, categories, n_splits, random_state,
             monte_carlo_iterations=None):
    self._models_nuisance = None
             mc_iters=None, mc_agg='mean'):
    self.n_splits = n_splits
    self.discrete_treatment = discrete_treatment
    self.discrete_instrument = discrete_instrument
    self.random_state = random_state
    self.categories = categories
    self.monte_carlo_iterations = monte_carlo_iterations
    self.mc_iters = mc_iters
    self.mc_agg = mc_agg
    super().__init__()

@abstractmethod
@@ -350,13 +472,13 @@ def _gen_ortho_learner_model_final(self):
sample_weight=sample_weight, sample_var=sample_var)
model_final.predict(X=X)
Predict, should just take the features X and return the constant marginal effect. In fact we allow
for the model method signatures to skip any of the keyword arguments as long as the class is always
called with the omitted keyword argument set to ``None``. Moreover, the predict function of the final
model can take no argument if the class is always called with ``X=None``. This can be enforced in child
classes by re-implementing the fit and the various effect methods. If ``discrete_treatment=True``,
then the input ``T`` to both above calls will be the one-hot encoding of the original input ``T``,
excluding the first column of the one-hot.
"""
pass
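
As a concrete illustration of the signature flexibility described in the docstring above, a final model that is only ever called with `X=None` and `W=None` may omit those keywords entirely. A hypothetical sketch (not econml code):

    # Valid final model when the learner is always called with X=None and W=None:
    # fit drops the W keyword and predict takes no arguments at all.
    class SlimModelFinal:
        def fit(self, Y, T, nuisances=None):
            Y_res, T_res = nuisances
            self.theta_ = (T_res * Y_res).sum() / (T_res ** 2).sum()  # OLS through the origin
            return self

        def predict(self):
            return self.theta_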

@@ -390,7 +512,7 @@ def _subinds_check_none(self, var, inds):

def _strata(self, Y, T, X=None, W=None, Z=None,
            sample_weight=None, sample_var=None, groups=None,
            cache_values=False, monte_carlo_iterations=None):
            cache_values=False, only_final=False, check_input=True):
    if self.discrete_instrument:
        Z = LabelEncoder().fit_transform(np.ravel(Z))
@@ -492,17 +614,24 @@ def fit(self, Y, T, X=None, W=None, Z=None, *, sample_weight=None, sample_var=None,
    all_nuisances = []
    fitted_inds = None

    for _ in range(self.monte_carlo_iterations or 1):
    for _ in range(self.mc_iters or 1):
        nuisances, new_inds = self._fit_nuisances(Y, T, X, W, Z, sample_weight=sample_weight, groups=groups)
        all_nuisances.append(nuisances)
        if fitted_inds is None:
            fitted_inds = new_inds
        elif not np.array_equal(fitted_inds, new_inds):
            raise AttributeError("Different indices were fit by different folds, so they cannot be aggregated")

    if self.monte_carlo_iterations is not None:
        # TODO: support different ways to aggregate, like median?
        nuisances = np.mean(np.array(all_nuisances), axis=0)
    if self.mc_iters is not None:
        if self.mc_agg == 'mean':
            nuisances = tuple(np.mean(nuisance_mc_variants, axis=0)
                              for nuisance_mc_variants in list(zip(*all_nuisances)))
        elif self.mc_agg == 'median':
            nuisances = tuple(np.median(nuisance_mc_variants, axis=0)
                              for nuisance_mc_variants in list(zip(*all_nuisances)))
        else:
            raise ValueError(
                "Parameter `mc_agg` must be one of {'mean', 'median'}. Got {}".format(self.mc_agg))

    Y, T, X, W, Z, sample_weight, sample_var = (self._subinds_check_none(arr, fitted_inds)
                                                for arr in (Y, T, X, W, Z, sample_weight, sample_var))
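
The per-nuisance aggregation above is what the commit message's "fixed nuisance aggregation when nuisances have different dimensions" refers to: the removed `np.mean(np.array(all_nuisances), axis=0)` stacked every nuisance into a single array and broke when the tuple's elements had different shapes. A standalone sketch of the new behavior, with made-up shapes:

    import numpy as np

    # Two monte carlo iterations; each returns a tuple of nuisances whose elements
    # have different shapes, e.g. Y residuals (n,) and T residuals (n, 2).
    iter1 = (np.zeros(100), np.zeros((100, 2)))
    iter2 = (np.ones(100), np.ones((100, 2)))
    all_nuisances = [iter1, iter2]

    # Aggregate each nuisance separately across iterations, as the new code does.
    nuisances = tuple(np.mean(variants, axis=0) for variants in zip(*all_nuisances))
    print([n.shape for n in nuisances])  # prints [(100,), (100, 2)]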
@@ -526,6 +655,10 @@ def fit(self, Y, T, X=None, W=None, Z=None, *, sample_weight=None, sample_var=None,

    return self

@property
def _illegal_refit_inference_methods(self):
    return (BootstrapInference,)

def refit(self, inference=None):
    """
    Estimate the counterfactual model using a new final model specification but with cached first stage results.
Expand All @@ -539,6 +672,8 @@ def refit(self, inference=None):
    This instance
    """
    assert self._cached_values, "Refit can only be called if values were cached during the original fit"
    if isinstance(self._get_inference(inference), self._illegal_refit_inference_methods):
        raise ValueError("The chosen inference method does not allow for re-fitting only the final model.")
    cached = self._cached_values
    kwargs = filter_none_kwargs(
        Y=cached.Y, T=cached.T, X=cached.X, W=cached.W, Z=cached.Z,
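The new guard means `refit` can no longer be combined with bootstrap inference, which would require re-fitting the first stages as well. A hypothetical usage sketch (the fitted estimator `est` and its data are placeholders, not part of this diff):

    from econml.inference import BootstrapInference

    est.fit(y, T, W=W, cache_values=True)  # caching is required before refit
    est.refit()                            # OK: re-fits only the final model
    est.refit(inference=BootstrapInference(n_bootstrap_samples=20))
    # -> ValueError: bootstrap inference is disallowed when refitting
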
26 changes: 23 additions & 3 deletions econml/_rlearner.py
@@ -170,9 +170,13 @@ class _RLearner(_OrthoLearner):
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
by :mod:`np.random<numpy.random>`.

monte_carlo_iterations: int, optional
mc_iters: int, optional
    The number of times to rerun the first stage models to reduce the variance of the nuisances.

mc_agg: {'mean', 'median'}, optional (default='mean')
    How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of
    cross-fitting.

Attributes
----------
models_y: list of objects of type(model_y)
@@ -198,13 +202,14 @@ class _RLearner(_OrthoLearner):
is multidimensional, then the average of the MSEs for each dimension of Y is returned.
"""

def __init__(self, *, discrete_treatment, categories, n_splits, random_state, monte_carlo_iterations=None):
def __init__(self, *, discrete_treatment, categories, n_splits, random_state, mc_iters=None, mc_agg='mean'):
    super().__init__(discrete_treatment=discrete_treatment,
                     discrete_instrument=False,  # no instrument, so doesn't matter
                     categories=categories,
                     n_splits=n_splits,
                     random_state=random_state,
                     monte_carlo_iterations=monte_carlo_iterations)
                     mc_iters=mc_iters,
                     mc_agg=mc_agg)

@abstractmethod
def _gen_model_y(self):
@@ -348,3 +353,18 @@ def nuisance_scores_y(self):
@property
def nuisance_scores_t(self):
    return self.nuisance_scores_[1]

@property
def residuals_(self):
    """
    A tuple (y_res, T_res, X, W) of the residuals from the first stage estimation
    along with the associated X and W. Samples are not guaranteed to be in the same
    order as the input order.
    """
    if not hasattr(self, '_cached_values'):
        raise AttributeError("Estimator is not fitted yet!")
    if self._cached_values is None:
        raise AttributeError("`fit` was called with `cache_values=False`. "
                             "Set to `True` to enable residual storage.")
    Y_res, T_res = self._cached_values.nuisances
    return Y_res, T_res, self._cached_values.X, self._cached_values.W
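
A short usage sketch for the new property (the fitted estimator `est` stands in for any _RLearner-derived class and is a placeholder, not part of this diff):

    est.fit(y, T, X=X, W=W, cache_values=True)   # residuals are only kept when caching
    y_res, T_res, X_out, W_out = est.residuals_  # first-stage residuals plus X and W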
29 changes: 16 additions & 13 deletions econml/cate_estimator.py
@@ -525,8 +525,11 @@ class LinearModelFinalCateEstimatorMixin(BaseCateEstimator):
"""
Base class for models where the final stage is a linear model.
Subclasses must expose a ``model_final`` attribute containing the model's
final stage model.
Such an estimator must implement a :attr:`model_final_` attribute that points
to the fitted final :class:`.StatsModelsLinearRegression` object that
represents the fitted CATE model. Also must implement :attr:`featurizer_` that points
to the fitted featurizer and :attr:`bias_part_of_coef` that designates
if the intercept is the first element of the :attr:`model_final_` coefficient.
Attributes
----------
@@ -561,7 +564,7 @@ def coef_(self):
"""
return parse_final_model_params(self.model_final_.coef_, self.model_final_.intercept_,
self._d_y, self._d_t, self._d_t_in, self.bias_part_of_coef,
self.fit_cate_intercept)[0]
self.fit_cate_intercept_)[0]

@property
def intercept_(self):
@@ -576,11 +579,11 @@ def intercept_(self):
    a vector and not a 2D array. For binary treatment the n_t dimension is
    also omitted.
    """
    if not self.fit_cate_intercept:
    if not self.fit_cate_intercept_:
        raise AttributeError("No intercept was fitted!")
    return parse_final_model_params(self.model_final_.coef_, self.model_final_.intercept_,
                                    self._d_y, self._d_t, self._d_t_in, self.bias_part_of_coef,
                                    self.fit_cate_intercept)[1]
                                    self.fit_cate_intercept_)[1]

@BaseCateEstimator._defer_to_inference
def coef__interval(self, *, alpha=0.1):
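
An illustrative-only skeleton of the mixin contract described in the docstring above (the private field names here are hypothetical, not econml code):

    class MyCateEstimator(LinearModelFinalCateEstimatorMixin):
        bias_part_of_coef = False  # intercept is not folded into coef_

        @property
        def model_final_(self):
            return self._fitted_final_model  # the fitted linear final model

        @property
        def featurizer_(self):
            return self._fitted_featurizer   # the fitted featurizer, or None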
@@ -718,16 +721,14 @@ def summary(self, alpha=0.1, value=0, decimals=3, feature_names=None, treatment_

def shap_values(self, X, *, feature_names=None, treatment_names=None, output_names=None):
    (dt, dy, treatment_names, output_names) = _define_names(self._d_t, self._d_y, treatment_names, output_names)
    if hasattr(self, "featurizer") and self.featurizer is not None:
        X = self.featurizer.transform(X)
    if hasattr(self, "featurizer_") and self.featurizer_ is not None:
        X = self.featurizer_.transform(X)
    X, T = broadcast_unit_treatments(X, dt)
    d_x = X.shape[1]
    X_new = cross_product(X, T)
    feature_names = self.cate_feature_names(feature_names)
    return _shap_explain_joint_linear_model_cate(self.model_final, X_new, T, dt, dy, self.fit_cate_intercept,
    return _shap_explain_joint_linear_model_cate(self.model_final_, X_new, T, dt, dy, self.bias_part_of_coef,
                                                 feature_names=feature_names, treatment_names=treatment_names,
                                                 output_names=output_names)

shap_values.__doc__ = LinearCateEstimator.shap_values.__doc__


@@ -736,9 +737,11 @@ class StatsModelsCateEstimatorMixin(LinearModelFinalCateEstimatorMixin):
Mixin class that offers `inference='statsmodels'` options to the CATE estimator
that inherits it.

Such an estimator must implement a :attr:`model_final` attribute that points
Such an estimator must implement a :attr:`model_final_` attribute that points
to the fitted final :class:`.StatsModelsLinearRegression` object that
represents the fitted CATE model. It must also implement a :attr:`featurizer_` attribute that points
to the fitted featurizer and a :attr:`bias_part_of_coef` attribute that designates
whether the intercept is the first element of the :attr:`model_final_` coefficients.
"""

def _get_inference_options(self):
@@ -822,7 +825,7 @@ def intercept_(self, T):
    -------
    intercept: float or (n_y,) array like
    """
    if not self.fit_cate_intercept:
    if not self.fit_cate_intercept_:
        raise AttributeError("No intercept was fitted!")
    _, T = self._expand_treatments(None, T)
    ind = inverse_onehot(T).item() - 1