From cf7122a7b95d2ef991a030b0f855ef1e34e117f4 Mon Sep 17 00:00:00 2001 From: Vasilis Date: Fri, 1 Jan 2021 23:55:20 -0500 Subject: [PATCH] abstract class approach to refitting fixed small issue with _d_t storage enabled refitting in drlearner enabled refit in ortho_iv enabled monte_carlo_iterations in ortho_iv added cache_values param to ortho_iv fixed docstrings to remove the model parameters and added dosctrings to the abstract methods. Removed doctest examples from ortholearner and rlearner as these classes can no longer be used as standalone and are abstract classes --- econml/_ortho_learner.py | 484 +++++++++------------------ econml/_rlearner.py | 186 ++++------- econml/cate_estimator.py | 10 +- econml/dml.py | 683 ++++++++++++--------------------------- econml/drlearner.py | 271 ++++++++++++---- econml/inference.py | 8 +- econml/ortho_iv.py | 407 ++++++++++++++++------- 7 files changed, 918 insertions(+), 1131 deletions(-) diff --git a/econml/_ortho_learner.py b/econml/_ortho_learner.py index 13e5489cc..67bbd9115 100644 --- a/econml/_ortho_learner.py +++ b/econml/_ortho_learner.py @@ -27,6 +27,7 @@ class in this module implements the general logic in a very versatile way import copy from collections import namedtuple from warnings import warn +from abc import ABC, abstractmethod import numpy as np from sklearn.base import clone @@ -189,10 +190,10 @@ def predict(self, X, y, W=None): CachedValues = namedtuple('_CachedValues', ['nuisances', - 'Y', 'T', 'X', 'W', 'Z', 'sample_weight', 'sample_var']) + 'Y', 'T', 'X', 'W', 'Z', 'sample_weight', 'sample_var', 'groups']) -class _OrthoLearner(TreatmentExpansionMixin, LinearCateEstimator): +class _OrthoLearner(ABC, TreatmentExpansionMixin, LinearCateEstimator): """ Base class for all orthogonal learners. This class is a parent class to any method that has the following architecture: @@ -247,39 +248,6 @@ class _OrthoLearner(TreatmentExpansionMixin, LinearCateEstimator): Parameters ---------- - model_nuisance: estimator - The estimator for fitting the nuisance function. Must implement - `fit` and `predict` methods that both have signatures:: - - model_nuisance.fit(Y, T, X=X, W=W, Z=Z, - sample_weight=sample_weight, sample_var=sample_var) - model_nuisance.predict(Y, T, X=X, W=W, Z=Z, - sample_weight=sample_weight, sample_var=sample_var) - - In fact we allow for the model method signatures to skip any of the keyword arguments - as long as the class is always called with the omitted keyword argument set to ``None``. - This can be enforced in child classes by re-implementing the fit and the various effect - methods. If ``discrete_treatment=True``, then the input ``T`` to both above calls will be the - one-hot encoding of the original input ``T``, excluding the first column of the one-hot. - - If the estimator also provides a score method with the same arguments as fit, it will be used to - calculate scores during training. - - model_final: estimator for fitting the response residuals to the features and treatment residuals - Must implement `fit` and `predict` methods that must have signatures:: - - model_final.fit(Y, T, X=X, W=W, Z=Z, nuisances=nuisances, - sample_weight=sample_weight, sample_var=sample_var) - model_final.predict(X=X) - - Predict, should just take the features X and return the constant marginal effect. In fact we allow for the - model method signatures to skip any of the keyword arguments as long as the class is always called with the - omitted keyword argument set to ``None``. 
Moreover, the predict function of the final model can take no - argument if the class is always called with ``X=None``. This can be enforced in child classes by - re-implementing the fit and the various effect methods. If ``discrete_treatment=True``, then the input ``T`` - to both above calls will be the one-hot encoding of the original input ``T``, excluding the first column of - the one-hot. - discrete_treatment: bool Whether the treatment values should be treated as categorical, rather than continuous, quantities @@ -313,127 +281,9 @@ class _OrthoLearner(TreatmentExpansionMixin, LinearCateEstimator): If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used by :mod:`np.random`. - monte_carlo_iterations: int, optional + monte_carlo_iterations: int, optional (default=None) The number of times to rerun the first stage models to reduce the variance of the nuisances. - Examples - -------- - - The example code below implements a very simple version of the double machine learning - method on top of the :class:`._OrthoLearner` class, for expository purposes. - For a more elaborate implementation of a Double Machine Learning child class of the class - :class:`._OrthoLearner` check out :class:`.DML` - and its child classes: - - .. testcode:: - - import numpy as np - from sklearn.linear_model import LinearRegression - from econml._ortho_learner import _OrthoLearner - class ModelNuisance: - def __init__(self, model_t, model_y): - self._model_t = model_t - self._model_y = model_y - def fit(self, Y, T, W=None): - self._model_t.fit(W, T) - self._model_y.fit(W, Y) - return self - def predict(self, Y, T, W=None): - return Y - self._model_y.predict(W), T - self._model_t.predict(W) - class ModelFinal: - def __init__(self): - return - def fit(self, Y, T, W=None, nuisances=None): - Y_res, T_res = nuisances - self.model = LinearRegression(fit_intercept=False).fit(T_res.reshape(-1, 1), Y_res) - return self - def predict(self, X=None): - return self.model.coef_[0] - def score(self, Y, T, W=None, nuisances=None): - Y_res, T_res = nuisances - return np.mean((Y_res - self.model.predict(T_res.reshape(-1, 1)))**2) - np.random.seed(123) - X = np.random.normal(size=(100, 3)) - y = X[:, 0] + X[:, 1] + np.random.normal(0, 0.1, size=(100,)) - est = _OrthoLearner(ModelNuisance(LinearRegression(), LinearRegression()), - ModelFinal(), - n_splits=2, discrete_treatment=False, discrete_instrument=False, - categories='auto', random_state=None) - est.fit(y, X[:, 0], W=X[:, 1:]) - - >>> est.score_ - 0.00756830... - >>> est.const_marginal_effect() - 1.02364992... - >>> est.effect() - array([1.023649...]) - >>> est.effect(T0=0, T1=10) - array([10.236499...]) - >>> est.score(y, X[:, 0], W=X[:, 1:]) - 0.00727995... - >>> est.model_final.model - LinearRegression(fit_intercept=False) - >>> est.model_final.model.coef_ - array([1.023649...]) - - The following example shows how to do double machine learning with discrete treatments, using - the _OrthoLearner: - - .. 
testcode:: - - class ModelNuisance: - def __init__(self, model_t, model_y): - self._model_t = model_t - self._model_y = model_y - - def fit(self, Y, T, W=None): - self._model_t.fit(W, np.matmul(T, np.arange(1, T.shape[1]+1))) - self._model_y.fit(W, Y) - return self - - def predict(self, Y, T, W=None): - return Y - self._model_y.predict(W), T - self._model_t.predict_proba(W)[:, 1:] - - class ModelFinal: - - def __init__(self): - return - - def fit(self, Y, T, W=None, nuisances=None): - Y_res, T_res = nuisances - self.model = LinearRegression(fit_intercept=False).fit(T_res.reshape(-1, 1), Y_res) - return self - - def predict(self): - # theta needs to be of dimension (1, d_t) if T is (n, d_t) - return np.array([[self.model.coef_[0]]]) - - def score(self, Y, T, W=None, nuisances=None): - Y_res, T_res = nuisances - return np.mean((Y_res - self.model.predict(T_res.reshape(-1, 1)))**2) - - np.random.seed(123) - W = np.random.normal(size=(100, 3)) - import scipy.special - from sklearn.linear_model import LogisticRegression - T = np.random.binomial(1, scipy.special.expit(W[:, 0])) - y = T + W[:, 0] + np.random.normal(0, 0.01, size=(100,)) - est = _OrthoLearner(ModelNuisance(LogisticRegression(solver='lbfgs'), LinearRegression()), - ModelFinal(), n_splits=2, discrete_treatment=True, discrete_instrument=False, - categories='auto', random_state=None) - est.fit(y, T, W=W) - - >>> est.score_ - 0.00673015... - >>> est.const_marginal_effect() - array([[1.008401...]]) - >>> est.effect() - array([1.008401...]) - >>> est.score(y, T, W=W) - 0.00310431... - >>> est.model_final.model.coef_[0] - 1.00840170... - Attributes ---------- models_nuisance: list of objects of type(model_nuisance) @@ -449,21 +299,67 @@ def score(self, Y, T, W=None, nuisances=None): The out-of-sample scores from training each nuisance model """ - def __init__(self, model_nuisance, model_final, *, + def __init__(self, *, discrete_treatment, discrete_instrument, categories, n_splits, random_state, monte_carlo_iterations=None): - self._ortho_learner_model_nuisance = clone(model_nuisance, safe=False) self._models_nuisance = None - self._ortho_learner_model_final = clone(model_final, safe=False) - self._n_splits = n_splits - self._discrete_treatment = discrete_treatment - self._discrete_instrument = discrete_instrument - self._init_random_state = random_state - self._random_state = check_random_state(random_state) - self._categories = categories - self._monte_carlo_iterations = monte_carlo_iterations + self.n_splits = n_splits + self.discrete_treatment = discrete_treatment + self.discrete_instrument = discrete_instrument + self.random_state = random_state + self.categories = categories + self.monte_carlo_iterations = monte_carlo_iterations super().__init__() + @abstractmethod + def _gen_ortho_learner_model_nuisance(self): + """ Must return a fresh instance of a nuisance model + + Returns + ------- + model_nuisance: estimator + The estimator for fitting the nuisance function. Must implement + `fit` and `predict` methods that both have signatures:: + + model_nuisance.fit(Y, T, X=X, W=W, Z=Z, + sample_weight=sample_weight, sample_var=sample_var) + model_nuisance.predict(Y, T, X=X, W=W, Z=Z, + sample_weight=sample_weight, sample_var=sample_var) + + In fact we allow for the model method signatures to skip any of the keyword arguments + as long as the class is always called with the omitted keyword argument set to ``None``. + This can be enforced in child classes by re-implementing the fit and the various effect + methods. 
If ``discrete_treatment=True``, then the input ``T`` to both above calls will be the + one-hot encoding of the original input ``T``, excluding the first column of the one-hot. + + If the estimator also provides a score method with the same arguments as fit, it will be used to + calculate scores during training. + """ + pass + + @abstractmethod + def _gen_ortho_learner_model_final(self): + """ Must return a fresh instance of a final model + + Returns + ------- + model_final: estimator for fitting the response residuals to the features and treatment residuals + Must implement `fit` and `predict` methods that must have signatures:: + + model_final.fit(Y, T, X=X, W=W, Z=Z, nuisances=nuisances, + sample_weight=sample_weight, sample_var=sample_var) + model_final.predict(X=X) + + Predict, should just take the features X and return the constant marginal effect. In fact we allow for the + model method signatures to skip any of the keyword arguments as long as the class is always called with the + omitted keyword argument set to ``None``. Moreover, the predict function of the final model can take no + argument if the class is always called with ``X=None``. This can be enforced in child classes by + re-implementing the fit and the various effect methods. If ``discrete_treatment=True``, then the input ``T`` + to both above calls will be the one-hot encoding of the original input ``T``, excluding the first column of + the one-hot. + """ + pass + def _check_input_dims(self, Y, T, X=None, W=None, Z=None, *other_arrays): assert shape(Y)[0] == shape(T)[0], "Dimension mis-match!" for arr in [X, W, Z, *other_arrays]: @@ -495,27 +391,28 @@ def _subinds_check_none(self, var, inds): def _strata(self, Y, T, X=None, W=None, Z=None, sample_weight=None, sample_var=None, groups=None, cache_values=False, monte_carlo_iterations=None): - if self._discrete_instrument: + if self.discrete_instrument: Z = LabelEncoder().fit_transform(np.ravel(Z)) - if self._discrete_treatment: + if self.discrete_treatment: enc = LabelEncoder() T = enc.fit_transform(np.ravel(T)) - if self._discrete_instrument: + if self.discrete_instrument: return T + Z * len(enc.classes_) else: return T - elif self._discrete_instrument: + elif self.discrete_instrument: return Z else: return None - def _prefit(self, Y, T, *args, **kwargs): - categories = self._categories - if self._discrete_treatment: - if categories != 'auto': - categories = [categories] # OneHotEncoder expects a 2D array with features per column - self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first') + def _prefit(self, Y, T, *args, only_final=False, **kwargs): + + # generate an instance of the final model + self._ortho_learner_model_final = self._gen_ortho_learner_model_final() + if not only_final: + # generate an instance of the nuisance model + self._ortho_learner_model_nuisance = self._gen_ortho_learner_model_nuisance() super()._prefit(Y, T, *args, **kwargs) @@ -523,7 +420,7 @@ def _prefit(self, Y, T, *args, **kwargs): "we will disallow passing X, W, and Z by position.", ['X', 'W', 'Z']) @BaseCateEstimator._wrap_fit def fit(self, Y, T, X=None, W=None, Z=None, *, sample_weight=None, sample_var=None, groups=None, - cache_values=False, inference=None): + cache_values=False, inference=None, only_final=False, check_input=True): """ Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. 
@@ -552,44 +449,81 @@ def fit(self, Y, T, X=None, W=None, Z=None, *, sample_weight=None, sample_var=No inference: string, :class:`.Inference` instance, or None Method for performing inference. This estimator supports 'bootstrap' (or an instance of :class:`.BootstrapInference`). + only_final: bool, defaul False + Whether to fit the nuisance models or use the existing cached values + check_input: bool, default True + Whether to check if the input is valid Returns ------- self : _OrthoLearner instance """ - self._random_state = check_random_state(self._init_random_state) - Y, T, X, W, Z, sample_weight, sample_var, groups = check_input_arrays( - Y, T, X, W, Z, sample_weight, sample_var, groups) - self._check_input_dims(Y, T, X, W, Z, sample_weight, sample_var, groups) - - all_nuisances = [] - fitted_inds = None - - for _ in range(self._monte_carlo_iterations or 1): - nuisances, new_inds = self._fit_nuisances(Y, T, X, W, Z, sample_weight=sample_weight, groups=groups) - all_nuisances.append(nuisances) - if fitted_inds is None: - fitted_inds = new_inds - elif not np.array_equal(fitted_inds, new_inds): - raise AttributeError("Different indices were fit by different folds, so they cannot be aggregated") - - if self._monte_carlo_iterations is not None: - # TODO: support different ways to aggregate, like median? - nuisances = np.mean(np.array(all_nuisances), axis=0) - - Y, T, X, W, Z, sample_weight, sample_var = (self._subinds_check_none(arr, fitted_inds) - for arr in (Y, T, X, W, Z, sample_weight, sample_var)) - nuisances = tuple([self._subinds_check_none(nuis, fitted_inds) for nuis in nuisances]) - self._cached_values = CachedValues(nuisances=nuisances, - Y=Y, T=T, X=X, W=W, Z=Z, - sample_weight=sample_weight, - sample_var=sample_var) if cache_values else None + self._random_state = check_random_state(self.random_state) + if check_input: + Y, T, X, W, Z, sample_weight, sample_var, groups = check_input_arrays( + Y, T, X, W, Z, sample_weight, sample_var, groups) + self._check_input_dims(Y, T, X, W, Z, sample_weight, sample_var, groups) + + if not only_final: + + if self.discrete_treatment: + categories = self.categories + if categories != 'auto': + categories = [categories] # OneHotEncoder expects a 2D array with features per column + self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first') + self._one_hot_encoder.fit(reshape(T, (-1, 1))) + self._d_t = (len(self._one_hot_encoder.categories_[0]) - 1,) + self.transformer = FunctionTransformer( + func=_EncoderWrapper(self._one_hot_encoder).encode, + validate=False) + else: + self.transformer = None + + if self.discrete_instrument: + z_enc = LabelEncoder() + z_ohe = OneHotEncoder(categories='auto', sparse=False, drop='first') + z_ohe.fit(reshape(z_enc.fit_transform(Z.ravel()), (-1, 1))) + self.z_transformer = FunctionTransformer( + func=_EncoderWrapper(z_ohe, z_enc).encode, + validate=False) + else: + self.z_transformer = None + + all_nuisances = [] + fitted_inds = None + + for _ in range(self.monte_carlo_iterations or 1): + nuisances, new_inds = self._fit_nuisances(Y, T, X, W, Z, sample_weight=sample_weight, groups=groups) + all_nuisances.append(nuisances) + if fitted_inds is None: + fitted_inds = new_inds + elif not np.array_equal(fitted_inds, new_inds): + raise AttributeError("Different indices were fit by different folds, so they cannot be aggregated") + + if self.monte_carlo_iterations is not None: + # TODO: support different ways to aggregate, like median? 
+ nuisances = np.mean(np.array(all_nuisances), axis=0) + + Y, T, X, W, Z, sample_weight, sample_var = (self._subinds_check_none(arr, fitted_inds) + for arr in (Y, T, X, W, Z, sample_weight, sample_var)) + nuisances = tuple([self._subinds_check_none(nuis, fitted_inds) for nuis in nuisances]) + self._cached_values = CachedValues(nuisances=nuisances, + Y=Y, T=T, X=X, W=W, Z=Z, + sample_weight=sample_weight, + sample_var=sample_var, + groups=groups) if cache_values else None + else: + nuisances = self._cached_values.nuisances + # _d_t is altered by fit nuisances to what prefit does. So we need to perform the same + # alteration even when we only want to fit_final. + if self.transformer is not None: + self._d_t = (len(self._one_hot_encoder.categories_[0]) - 1,) + self._fit_final(Y=Y, T=T, X=X, W=W, Z=Z, nuisances=nuisances, sample_weight=sample_weight, sample_var=sample_var) - self._cache_invalid_message = None return self def refit(self, inference=None): @@ -605,74 +539,40 @@ def refit(self, inference=None): This instance """ assert self._cached_values, "Refit can only be called if values were cached during the original fit" - assert self._cache_invalid_message is None, "Can't refit: " + self._cache_invalid_message cached = self._cached_values kwargs = filter_none_kwargs( - X=cached.X, - W=cached.W, - Z=cached.Z, - sample_weight=cached.sample_weight, - sample_var=cached.sample_var + Y=cached.Y, T=cached.T, X=cached.X, W=cached.W, Z=cached.Z, + sample_weight=cached.sample_weight, sample_var=cached.sample_var, + groups=cached.groups, ) - - # in case the final model uses randomness, make sure to reset the internal random state - self._random_state = check_random_state(self._init_random_state) - - # this sequence of calls parallels that in BaseCateEstimator.wrap_fit - - inference = self._get_inference(inference) - - # not strictly necessary when refitting - self._prefit(cached.Y, cached.T, **kwargs) - if inference is not None: - # this *is* necessary even when refitting - # because we might have a new inference object - inference.prefit(self, cached.Y, cached.T, **kwargs) - - # fit only the final model - self._fit_final(cached.Y, - cached.T, - nuisances=cached.nuisances, - **kwargs) - - if inference is not None: - # NOTE: we call inference fit *after* calling the main fit method - inference.fit(self, cached.Y, cached.T, **kwargs) - self._inference = inference - + _OrthoLearner.fit(self, **kwargs, + cache_values=True, inference=inference, only_final=True, check_input=False) return self def _fit_nuisances(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + # use a binary array to get stratified split in case of discrete treatment - stratify = self._discrete_treatment or self._discrete_instrument + stratify = self.discrete_treatment or self.discrete_instrument strata = self._strata(Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight, groups=groups) if strata is None: strata = T # always safe to pass T as second arg to split even if we're not actually stratifying - if self._discrete_treatment: - T = self._one_hot_encoder.fit_transform(reshape(T, (-1, 1))) - - if self._discrete_instrument: - z_enc = LabelEncoder() - Z = z_enc.fit_transform(Z.ravel()) - z_ohe = OneHotEncoder(categories='auto', sparse=False, drop='first') - Z = z_ohe.fit_transform(reshape(Z, (-1, 1))) - self.z_transformer = FunctionTransformer( - func=_EncoderWrapper(z_ohe, z_enc).encode, - validate=False) - else: - self.z_transformer = None + if self.discrete_treatment: + T = self.transformer.transform(reshape(T, (-1, 
1))) + + if self.discrete_instrument: + Z = self.z_transformer.transform(reshape(Z, (-1, 1))) - if self._n_splits == 1: # special case, no cross validation + if self.n_splits == 1: # special case, no cross validation folds = None else: - splitter = check_cv(self._n_splits, [0], classifier=stratify) + splitter = check_cv(self.n_splits, [0], classifier=stratify) # if check_cv produced a new KFold or StratifiedKFold object, we need to set shuffle and random_state # TODO: ideally, we'd also infer whether we need a GroupKFold (if groups are passed) # however, sklearn doesn't support both stratifying and grouping (see # https://github.com/scikit-learn/scikit-learn/issues/13621), so for now the user needs to supply # their own object that supports grouping if they want to use groups. - if splitter != self._n_splits and isinstance(splitter, (KFold, StratifiedKFold)): + if splitter != self.n_splits and isinstance(splitter, (KFold, StratifiedKFold)): splitter.shuffle = True splitter.random_state = self._random_state @@ -687,14 +587,6 @@ def _fit_nuisances(self, Y, T, X=None, W=None, Z=None, sample_weight=None, group else: folds = splitter.split(to_split, strata) - if self._discrete_treatment: - self._d_t = shape(T)[1:] - self.transformer = FunctionTransformer( - func=_EncoderWrapper(self._one_hot_encoder).encode, - validate=False) - else: - self.transformer = None - nuisances, fitted_models, fitted_inds, scores = _crossfit(self._ortho_learner_model_nuisance, folds, Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight, groups=groups) @@ -806,76 +698,12 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): @property def ortho_learner_model_final(self): + if not hasattr(self, '_ortho_learner_model_final'): + raise AttributeError("Model is not fitted!") return self._ortho_learner_model_final - @ortho_learner_model_final.setter - def ortho_learner_model_final(self, model_final): - self._ortho_learner_model_final = clone(model_final, safe=False) - - @property - def ortho_learner_model_nuisance(self): - return self._ortho_learner_model_nuisance - - @ortho_learner_model_nuisance.setter - def ortho_learner_model_nuisance(self, model_nuisance): - self._ortho_learner_model_nuisance = clone(model_nuisance, safe=False) - self._cache_invalid_message = "Setting the nuisance model invalidates the cached nuisances" - - @property - def n_splits(self): - return self._n_splits - - @n_splits.setter - def n_splits(self, n_splits): - self._n_splits = n_splits - self._cache_invalid_message = "Setting the number of splits invalidates the cached nuisances" - - @property - def discrete_treatment(self): - return self._discrete_treatment - - @discrete_treatment.setter - def discrete_treatment(self, discrete_treatment): - self._discrete_treatment = discrete_treatment - self._cache_invalid_message = "Setting discrete_treatment invalidates the cached nuisances" - - @property - def discrete_instrument(self): - return self._discrete_instrument - - @discrete_instrument.setter - def discrete_instrument(self, discrete_instrument): - self._discrete_instrument = discrete_instrument - self._cache_invalid_message = "Setting discrete_instrument invalidates the cached nuisances" - - @property - def random_state(self): - return self._init_random_state - - @random_state.setter - def random_state(self, random_state): - self._init_random_state = random_state - self._random_state = check_random_state(random_state) - self._cache_invalid_message = "Setting the random state invalidates the cached nuisances" - - @property - def 
categories(self): - return self._categories - - @categories.setter - def categories(self, categories): - self._categories = categories - self._cache_invalid_message = "Setting the categories invalidates the cached nuisances" - - @property - def monte_carlo_iterations(self): - return self._monte_carlo_iterations - - @monte_carlo_iterations.setter - def monte_carlo_iterations(self, monte_carlo_iterations): - self._monte_carlo_iterations = monte_carlo_iterations - self._cache_invalid_message = "Setting the number of monte carlo iterations invalidates the cached nuisances" - @property def models_nuisance(self): + if not hasattr(self, '_models_nuisance'): + raise AttributeError("Model is not fitted!") return self._models_nuisance diff --git a/econml/_rlearner.py b/econml/_rlearner.py index 640b450cb..c62e52770 100644 --- a/econml/_rlearner.py +++ b/econml/_rlearner.py @@ -25,6 +25,7 @@ The Econometrics Journal. https://arxiv.org/abs/1608.00060 """ +from abc import abstractmethod import numpy as np import copy from warnings import warn @@ -139,31 +140,6 @@ class _RLearner(_OrthoLearner): Parameters ---------- - model_y: estimator of E[Y | X, W] - The estimator for fitting the response to the features and controls. Must implement - `fit` and `predict` methods. Unlike sklearn estimators both methods must - take an extra second argument (the controls), i.e. :: - - model_y.fit(X, W, Y, sample_weight=sample_weight) - model_y.predict(X, W) - - model_t: estimator of E[T | X, W] - The estimator for fitting the treatment to the features and controls. Must implement - `fit` and `predict` methods. Unlike sklearn estimators both methods must - take an extra second argument (the controls), i.e. :: - - model_t.fit(X, W, T, sample_weight=sample_weight) - model_t.predict(X, W) - - model_final: estimator for fitting the response residuals to the features and treatment residuals - Must implement `fit` and `predict` methods. Unlike sklearn estimators the fit methods must - take an extra second argument (the treatment residuals). Predict, on the other hand, - should just take the features and return the constant marginal effect. More, concretely:: - - model_final.fit(X, T_res, Y_res, - sample_weight=sample_weight, sample_var=sample_var) - model_final.predict(X) - discrete_treatment: bool Whether the treatment values should be treated as categorical, rather than continuous, quantities @@ -197,60 +173,6 @@ class _RLearner(_OrthoLearner): monte_carlo_iterations: int, optional The number of times to rerun the first stage models to reduce the variance of the nuisances. - Examples - -------- - The example code below implements a very simple version of the double machine learning - method on top of the :class:`._RLearner` class, for expository purposes. - For a more elaborate implementation of a Double Machine Learning child class of the class - checkout :class:`.DML` and its child classes: - - .. 
testcode:: - - import numpy as np - from sklearn.linear_model import LinearRegression - from econml._rlearner import _RLearner - from sklearn.base import clone - class ModelFirst: - def __init__(self, model): - self._model = clone(model, safe=False) - def fit(self, X, W, Y, sample_weight=None): - self._model.fit(np.hstack([X, W]), Y) - return self - def predict(self, X, W): - return self._model.predict(np.hstack([X, W])) - class ModelFinal: - def fit(self, X, T_res, Y_res, sample_weight=None, sample_var=None): - self.model = LinearRegression(fit_intercept=False).fit(X * T_res.reshape(-1, 1), - Y_res) - return self - def predict(self, X): - return self.model.predict(X) - np.random.seed(123) - X = np.random.normal(size=(1000, 3)) - y = X[:, 0] + X[:, 1] + np.random.normal(0, 0.01, size=(1000,)) - est = _RLearner(ModelFirst(LinearRegression()), - ModelFirst(LinearRegression()), - ModelFinal(), - n_splits=2, discrete_treatment=False, categories='auto', random_state=None) - est.fit(y, X[:, 0], X=np.ones((X.shape[0], 1)), W=X[:, 1:]) - - >>> est.const_marginal_effect(np.ones((1,1))) - array([0.999631...]) - >>> est.effect(np.ones((1,1)), T0=0, T1=10) - array([9.996314...]) - >>> est.score(y, X[:, 0], X=np.ones((X.shape[0], 1)), W=X[:, 1:]) - 9.73638006...e-05 - >>> est.rlearner_model_final.model - LinearRegression(fit_intercept=False) - >>> est.rlearner_model_final.model.coef_ - array([0.999631...]) - >>> est.score_ - 9.82623204...e-05 - >>> [mdl._model for mdl in est.models_y] - [LinearRegression(), LinearRegression()] - >>> [mdl._model for mdl in est.models_t] - [LinearRegression(), LinearRegression()] - Attributes ---------- models_y: list of objects of type(model_y) @@ -276,20 +198,66 @@ def predict(self, X): is multidimensional, then the average of the MSEs for each dimension of Y is returned. """ - def __init__(self, model_y, model_t, model_final, - discrete_treatment, categories, n_splits, random_state, monte_carlo_iterations=None): - self._rlearner_model_final = model_final - self._rlearner_model_y = model_y - self._rlearner_model_t = model_t - super().__init__(_ModelNuisance(clone(model_y, safe=False), clone(model_t, safe=False)), - _ModelFinal(clone(model_final, safe=False)), - discrete_treatment=discrete_treatment, + def __init__(self, *, discrete_treatment, categories, n_splits, random_state, monte_carlo_iterations=None): + super().__init__(discrete_treatment=discrete_treatment, discrete_instrument=False, # no instrument, so doesn't matter categories=categories, n_splits=n_splits, random_state=random_state, monte_carlo_iterations=monte_carlo_iterations) + @abstractmethod + def _gen_model_y(self): + """ + Returns + ------- + model_y: estimator of E[Y | X, W] + The estimator for fitting the response to the features and controls. Must implement + `fit` and `predict` methods. Unlike sklearn estimators both methods must + take an extra second argument (the controls), i.e. :: + + model_y.fit(X, W, Y, sample_weight=sample_weight) + model_y.predict(X, W) + """ + pass + + @abstractmethod + def _gen_model_t(self): + """ + Returns + ------- + model_t: estimator of E[T | X, W] + The estimator for fitting the treatment to the features and controls. Must implement + `fit` and `predict` methods. Unlike sklearn estimators both methods must + take an extra second argument (the controls), i.e. 
:: + + model_t.fit(X, W, T, sample_weight=sample_weight) + model_t.predict(X, W) + """ + pass + + @abstractmethod + def _gen_rlearner_model_final(self): + """ + Returns + ------- + model_final: estimator for fitting the response residuals to the features and treatment residuals + Must implement `fit` and `predict` methods. Unlike sklearn estimators the fit methods must + take an extra second argument (the treatment residuals). Predict, on the other hand, + should just take the features and return the constant marginal effect. More, concretely:: + + model_final.fit(X, T_res, Y_res, + sample_weight=sample_weight, sample_var=sample_var) + model_final.predict(X) + """ + pass + + def _gen_ortho_learner_model_nuisance(self): + return _ModelNuisance(self._gen_model_y(), self._gen_model_t()) + + def _gen_ortho_learner_model_final(self): + return _ModelFinal(self._gen_rlearner_model_final()) + @_deprecate_positional("X, and should be passed by keyword only. In a future release " "we will disallow passing X and W by position.", ['X', 'W']) def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, @@ -363,35 +331,7 @@ def score(self, Y, T, X=None, W=None): def rlearner_model_final(self): # NOTE: important to get parent's wrapped copy so that # after training wrapped featurizer is also trained, etc. - return self._ortho_learner_model_final._model_final - - @rlearner_model_final.setter - def rlearner_model_final(self, model_final): - model_final = clone(model_final, safe=False) - self._rlearner_model_final = model_final - self._ortho_learner_model_final = _ModelFinal(model_final) - - @property - def rlearner_model_y(self): - return self._ortho_learner_model_nuisance._model_y - - @rlearner_model_y.setter - def rlearner_model_y(self, model_y): - model_y = clone(model_y, safe=False) - self._rlearner_model_y = model_y - self._ortho_learner_model_nuisance = _ModelNuisance(model_y, self._rlearner_model_t) - self._cache_invalid_message = "Setting the Y model invalidates cached nuisances" - - @property - def rlearner_model_t(self): - return self._ortho_learner_model_nuisance._model_t - - @rlearner_model_t.setter - def rlearner_model_t(self, model_t): - model_t = clone(model_t, safe=False) - self._rlearner_model_t = model_t - self._ortho_learner_model_nuisance = _ModelNuisance(self._rlearner_model_y, model_t) - self._cache_invalid_message = "Setting the T model invalidates cached nuisances" + return self.ortho_learner_model_final._model_final @property def models_y(self): @@ -408,17 +348,3 @@ def nuisance_scores_y(self): @property def nuisance_scores_t(self): return self.nuisance_scores_[1] - - @_OrthoLearner.ortho_learner_model_final.setter - def ortho_learner_model_final(self, model): - raise AttributeError("OrthoLearner's final model cannot be set directly on an _RLearner instance; " - "set the rlearner_model_final isntead.") - - @_OrthoLearner.ortho_learner_model_nuisance.setter - def ortho_learner_model_nuisance(self, model): - raise AttributeError("Nuisance model cannot be set directly on an _RLearner instance; " - "set the Y and T model attributes instead.") - - @_OrthoLearner.discrete_instrument.setter - def discrete_instrument(self, flag): - raise AttributeError("_RLearners don't support instruments, so discrete_instrument will always be False") diff --git a/econml/cate_estimator.py b/econml/cate_estimator.py index 31a245107..593c73300 100644 --- a/econml/cate_estimator.py +++ b/econml/cate_estimator.py @@ -511,7 +511,9 @@ def _get_inference_options(self): 
options.update(auto=LinearModelFinalInference) return options - bias_part_of_coef = False + @property + def bias_part_of_coef(self): + return False @property def coef_(self): @@ -528,7 +530,7 @@ def coef_(self): a vector and not a 2D array. For binary treatment the n_t dimension is also omitted. """ - return parse_final_model_params(self.model_final.coef_, self.model_final.intercept_, + return parse_final_model_params(self.model_final_.coef_, self.model_final_.intercept_, self._d_y, self._d_t, self._d_t_in, self.bias_part_of_coef, self.fit_cate_intercept)[0] @@ -547,7 +549,7 @@ def intercept_(self): """ if not self.fit_cate_intercept: raise AttributeError("No intercept was fitted!") - return parse_final_model_params(self.model_final.coef_, self.model_final.intercept_, + return parse_final_model_params(self.model_final_.coef_, self.model_final_.intercept_, self._d_y, self._d_t, self._d_t_in, self.bias_part_of_coef, self.fit_cate_intercept)[1] @@ -726,7 +728,7 @@ def _get_inference_options(self): @property def feature_importances_(self): - return self.model_final.feature_importances_ + return self.model_final_.feature_importances_ class LinearModelFinalCateEstimatorDiscreteMixin(BaseCateEstimator): diff --git a/econml/dml.py b/econml/dml.py index ac07393da..70ee975f1 100644 --- a/econml/dml.py +++ b/econml/dml.py @@ -52,7 +52,7 @@ ForestModelFinalCateEstimatorMixin, LinearModelFinalCateEstimatorMixin, StatsModelsCateEstimatorMixin) -from .inference import StatsModelsInference +from .inference import StatsModelsInference, GenericSingleTreatmentModelFinalInference from .sklearn_extensions.ensemble import SubsampledHonestForest from .sklearn_extensions.linear_model import (MultiOutputDebiasedLasso, StatsModelsLinearRegression, @@ -235,30 +235,17 @@ def original_featurizer(self): return self.rlearner_model_final._original_featurizer @property - def featurizer(self): + def featurizer_(self): # NOTE This is used by the inference methods and has to be the overall featurizer. intended # for internal use by the library return self.rlearner_model_final._featurizer @property - def model_final(self): + def model_final_(self): # NOTE This is used by the inference methods and is more for internal use to the library # We need to use the rlearner's copy to retain the information from fitting return self.rlearner_model_final._model - @model_final.setter - def model_final(self, model): - model = _FinalWrapper(model, - fit_cate_intercept=self.rlearner_model_final._fit_cate_intercept, - featurizer=self.rlearner_model_final._original_featurizer, - use_weight_trick=self.rlearner_model_final._use_weight_trick) - self._rlearner_model_final = model - - @_RLearner.rlearner_model_final.setter - def rlearner_model_final(self, model): - raise AttributeError("rlearner_final_model cannot be set directly on a DML instance; " - "set the model_final attributes instead.") - @property def model_cate(self): """ @@ -270,7 +257,7 @@ def model_cate(self): An instance of the model_final object that was fitted after calling fit which corresponds to the constant marginal CATE model. """ - return self._model_final + return self.rlearner_model_final._model @property def models_y(self): @@ -443,6 +430,9 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. 
+ monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; @@ -461,53 +451,55 @@ def __init__(self, discrete_treatment=False, categories='auto', n_splits=2, - random_state=None, - monte_carlo_iterations=None): + monte_carlo_iterations=None, + random_state=None): # set random_state and discrete_treatment now even though they're set by super's init # so that they can be used to initialize models - self._random_state = check_random_state(random_state) - self._discrete_treatment = discrete_treatment + self.random_state = random_state + self.discrete_treatment = discrete_treatment # TODO: consider whether we need more care around stateful featurizers, # since we clone it and fit separate copies - self._fit_cate_intercept = fit_cate_intercept - self._linear_first_stages = linear_first_stages - self._featurizer = clone(featurizer, safe=False) - super().__init__(model_y=self._prepare_model_y(clone(model_y, safe=False)), - model_t=self._prepare_model_t(clone(model_t, safe=False)), - model_final=self._prepare_final_model(model_final), - discrete_treatment=discrete_treatment, + self.fit_cate_intercept = fit_cate_intercept + self.linear_first_stages = linear_first_stages + self.featurizer = clone(featurizer, safe=False) + self.model_y = clone(model_y, safe=False) + self.model_t = clone(model_t, safe=False) + self.model_final = clone(model_final, safe=False) + super().__init__(discrete_treatment=discrete_treatment, categories=categories, n_splits=n_splits, - random_state=random_state, - monte_carlo_iterations=monte_carlo_iterations) - - def _prepare_model_y(self, model_y): - self._model_y = model_y - if model_y == 'auto': - model_y = WeightedLassoCVWrapper(random_state=self._random_state) - return _FirstStageWrapper(model_y, True, self._featurizer, self._linear_first_stages, self._discrete_treatment) - - def _prepare_model_t(self, model_t): - self._model_t = model_t - if model_t == 'auto': - if self._discrete_treatment: - model_t = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self._random_state), - random_state=self._random_state) + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state) + + def _gen_model_y(self): + if self.model_y == 'auto': + model_y = WeightedLassoCVWrapper(random_state=self.random_state) + else: + model_y = clone(self.model_y, safe=False) + return _FirstStageWrapper(model_y, True, self.featurizer, self.linear_first_stages, self.discrete_treatment) + + def _gen_model_t(self): + if self.model_t == 'auto': + if self.discrete_treatment: + model_t = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), + random_state=self.random_state) else: - model_t = WeightedLassoCVWrapper(random_state=self._random_state) - return _FirstStageWrapper(model_t, False, self._featurizer, - self._linear_first_stages, self._discrete_treatment) + model_t = WeightedLassoCVWrapper(random_state=self.random_state) + else: + model_t = clone(self.model_t, safe=False) + return _FirstStageWrapper(model_t, False, self.featurizer, + self.linear_first_stages, self.discrete_treatment) + + def _gen_featurizer(self): + return clone(self.featurizer, safe=False) - def _prepare_final_model(self, model): 
- self._model_final = model - return _FinalWrapper(self._model_final, self._fit_cate_intercept, self._featurizer, False) + def _gen_model_final(self): + return clone(self.model_final, safe=False) - def _update_models(self): - self._rlearner_model_y = self._prepare_model_y(self.model_y) - self._rlearner_model_t = self._prepare_model_t(self.model_t) - self._rlearner_model_final = self._prepare_model_final(self.model_final) + def _gen_rlearner_model_final(self): + return _FinalWrapper(self._gen_model_final(), self.fit_cate_intercept, self.featurizer, False) # override only so that we can update the docstring to indicate support for `StatsModelsInference` @_deprecate_positional("X and W should be passed by keyword only. In a future release " @@ -535,7 +527,6 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou must support a 'groups' argument to its split method. cache_values: bool, default False Whether to cache inputs and first stage results, which will allow refitting a different final model - inference: string, :class:`.Inference` instance, or None Method for performing inference. This estimator supports 'bootstrap' (or an instance of :class:`.BootstrapInference`) and 'auto' @@ -549,90 +540,12 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou cache_values=cache_values, inference=inference) - @property - def linear_first_stages(self): - return self._linear_first_stages - - @linear_first_stages.setter - def linear_first_stages(self, linear_first_stages): - self._linear_first_stages = linear_first_stages - self._update_models() - # updating the models will set a misleading invalidation message, so overwrite it - self._cache_invalid_message = "Changing linear_first_stages invalidates stored nuisance results" - - @_BaseDML.featurizer.setter - def featurizer(self, featurizer): - self._featurizer = clone(featurizer, safe=False) - if self._linear_first_stages: - self._update_models() - # updating the models will set a misleading invalidation message, so overwrite it - self._cache_invalid_message = ("Changing the featurizer when linear_first_stages is True " - "invalidates stored nuisance results") - else: - # only the final model needs to change - self._rlearner_model_final = self._prepare_final_model(self._model_final) - - @property - def fit_cate_intercept(self): - return self._fit_cate_intercept - - @fit_cate_intercept.setter - def fit_cate_intercept(self, fit_cate_intercept): - self._fit_cate_intercept = fit_cate_intercept - # only the final model needs to change - self._rlearner_model_final = self._prepare_final_model(self._model_final) + def refit(self, *, inference='auto'): + super().refit(inference=inference) @property def bias_part_of_coef(self): - return self.fit_cate_intercept - - @_OrthoLearner.discrete_treatment.setter - def discrete_treatment(self, discrete_treatement): - # super().discrete_treatment = discrete_treatment - super(DML, DML).discrete_treatment.__set__(self, discrete_treatement) - # need to modify first-stage models in response, although unless model_t is 'auto' - # the treatment model probably also needs to be updated from a classifier to a regressor or vice-versa... 
- self._update_models() - - @property - def model_t(self): - return self._model_t - - @model_t.setter - def model_t(self, model_t): - model_t = clone(model_t, safe=False) - self._rlearner_model_t = self._prepare_model_y(model_t) - - @property - def model_y(self): - return self._model_y - - @model_y.setter - def model_y(self, model_y): - model_y = clone(model_y, safe=False) - self._rlearner_model_y = self._prepare_model_y(model_y) - - @_RLearner.rlearner_model_y.setter - def rlearner_model_y(self, model): - raise AttributeError("rlearner_model_y cannot be set directly on a DML instance; " - "set the model_y attribute instead.") - - @_RLearner.rlearner_model_t.setter - def rlearner_model_t(self, model): - raise AttributeError("rlearner_model_t cannot be set directly on a DML instance; " - "set the model_t attribute instead.") - - @_RLearner.rlearner_model_final.setter - def rlearner_model_final(self, model): - raise AttributeError("rlearner_model_final cannot be set directly on a DML instance; " - "set the model_final attribute instead.") - - # Setting the random_state affects Y and T nuisances models if they are auto - @_OrthoLearner.random_state.setter - def random_state(self, random_state): - # super().random_state = random_state - super(DML, DML).random_state.__set__(self, random_state) - self._update_models() + return self.rlearner_model_final._fit_cate_intercept class LinearDML(StatsModelsCateEstimatorMixin, DML): @@ -688,6 +601,9 @@ class LinearDML(StatsModelsCateEstimatorMixin, DML): Unless an iterable is used, we call `split(X,T)` to generate the splits. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; @@ -706,19 +622,22 @@ def __init__(self, discrete_treatment=False, categories='auto', n_splits=2, - random_state=None, - monte_carlo_iterations=None): + monte_carlo_iterations=None, + random_state=None): super().__init__(model_y=model_y, model_t=model_t, - model_final=StatsModelsLinearRegression(fit_intercept=False), + model_final=None, featurizer=featurizer, fit_cate_intercept=fit_cate_intercept, linear_first_stages=linear_first_stages, discrete_treatment=discrete_treatment, categories=categories, n_splits=n_splits, - random_state=random_state, - monte_carlo_iterations=monte_carlo_iterations) + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state,) + + def _gen_model_final(self): + return StatsModelsLinearRegression(fit_intercept=False) # override only so that we can update the docstring to indicate support for `StatsModelsInference` @_deprecate_positional("X and W should be passed by keyword only. In a future release " @@ -748,7 +667,6 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou must support a 'groups' argument to its split method. cache_values: bool, default False Whether to cache inputs and first stage results, which will allow refitting a different final model - inference: string, :class:`.Inference` instance, or None Method for performing inference. 
This estimator supports 'bootstrap' (or an instance of :class:`.BootstrapInference`) and 'statsmodels' @@ -763,10 +681,14 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou cache_values=cache_values, inference=inference) - @DML.model_final.setter + @property + def model_final(self): + return self._gen_model_final() + + @model_final.setter def model_final(self, model): - raise AttributeError("LinearDML final model can't be chnaged from " - "StatsModelsLinearRegression(fit_intercept=False)") + if model is not None: + raise ValueError("Parameter `model_final` cannot be altered for this estimator!") class SparseLinearDML(DebiasedLassoCateEstimatorMixin, DML): @@ -844,6 +766,9 @@ class SparseLinearDML(DebiasedLassoCateEstimatorMixin, DML): Unless an iterable is used, we call `split(X,T)` to generate the splits. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; @@ -865,28 +790,29 @@ def __init__(self, discrete_treatment=False, categories='auto', n_splits=2, - random_state=None, - monte_carlo_iterations=None): - self._alpha = alpha - self._max_iter = max_iter - self._tol = tol - model_final = MultiOutputDebiasedLasso( - alpha=alpha, - fit_intercept=False, - max_iter=max_iter, - tol=tol, - random_state=random_state) + monte_carlo_iterations=None, + random_state=None): + self.alpha = alpha + self.max_iter = max_iter + self.tol = tol super().__init__(model_y=model_y, model_t=model_t, - model_final=model_final, + model_final=None, featurizer=featurizer, fit_cate_intercept=fit_cate_intercept, linear_first_stages=linear_first_stages, discrete_treatment=discrete_treatment, categories=categories, n_splits=n_splits, - random_state=random_state, - monte_carlo_iterations=monte_carlo_iterations) + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state) + + def _gen_model_final(self): + return MultiOutputDebiasedLasso(alpha=self.alpha, + fit_intercept=False, + max_iter=self.max_iter, + tol=self.tol, + random_state=self.random_state) @_deprecate_positional("X and W should be passed by keyword only. 
In a future release " "we will disallow passing X and W by position.", ['X', 'W']) @@ -937,79 +863,31 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou sample_weight=sample_weight, sample_var=None, groups=groups, cache_values=cache_values, inference=inference) - @DML.model_final.setter - def model_final(self, model): - raise AttributeError("SparseLinearDML final model can't be set directly;" - "instead set alpha, tol, or max_iter to change the debiased lasso settings") - - @property - def alpha(self): - return self._alpha - - @alpha.setter - def alpha(self, alpha): - self._alpha = alpha - self._model_final = MultiOutputDebiasedLasso( - alpha=self.alpha, - fit_intercept=False, - max_iter=self.max_iter, - tol=self.tol, - random_state=self.random_state) - @property - def max_iter(self): - return self._max_iter - - @max_iter.setter - def max_iter(self, max_iter): - self.max_iter = max_iter - self._model_final = MultiOutputDebiasedLasso( - alpha=self.alpha, - fit_intercept=False, - max_iter=self.max_iter, - tol=self.tol, - random_state=self.random_state) + def model_final(self): + return self._gen_model_final() - @property - def tol(self): - return self._tol - - @tol.setter - def tol(self, tol): - self._tol = tol - self._model_final = MultiOutputDebiasedLasso( - alpha=self.alpha, - fit_intercept=False, - max_iter=self.max_iter, - tol=self.tol, - random_state=self.random_state) - - @_OrthoLearner.random_state.setter - def random_state(self, random_state): - # super().random_state = random_state - super(SparseLinearDML, SparseLinearDML).random_state.__set__(self, random_state) - self._model_final = MultiOutputDebiasedLasso( - alpha=self.alpha, - fit_intercept=False, - max_iter=self.max_iter, - tol=self.tol, - random_state=self.random_state) + @model_final.setter + def model_final(self, model): + if model is not None: + raise ValueError("Parameter `model_final` cannot be altered for this estimator!") class _RandomFeatures(TransformerMixin): def __init__(self, dim, bw, random_state): - self._dim = dim - self._bw = bw - self._random_state = random_state + self.dim = dim + self.bw = bw + self.random_state = random_state def fit(self, X): - random_state = check_random_state(self._random_state) - self.omegas = random_state.normal(0, 1 / self._bw, size=(shape(X)[1], self._dim)) - self.biases = random_state.uniform(0, 2 * np.pi, size=(1, self._dim)) + random_state = check_random_state(self.random_state) + self.omegas_ = random_state.normal(0, 1 / self.bw, size=(shape(X)[1], self.dim)) + self.biases_ = random_state.uniform(0, 2 * np.pi, size=(1, self.dim)) + self.dim_ = self.dim return self def transform(self, X): - return np.sqrt(2 / self._dim) * np.cos(np.matmul(X, self.omegas) + self.biases) + return np.sqrt(2 / self.dim_) * np.cos(np.matmul(X, self.omegas_) + self.biases_) class KernelDML(DML): @@ -1064,6 +942,9 @@ class KernelDML(DML): Unless an iterable is used, we call `split(X,T)` to generate the splits. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. 
+ random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; @@ -1075,53 +956,44 @@ class KernelDML(DML): """ def __init__(self, model_y='auto', model_t='auto', fit_cate_intercept=True, - dim=20, bw=1.0, discrete_treatment=False, categories='auto', n_splits=2, random_state=None, - monte_carlo_iterations=None): - super().__init__(model_y=model_y, model_t=model_t, - model_final=ElasticNetCV(fit_intercept=False, random_state=random_state), - featurizer=_RandomFeatures(dim, bw, random_state), + dim=20, bw=1.0, discrete_treatment=False, categories='auto', n_splits=2, + monte_carlo_iterations=None, random_state=None): + self.dim = dim + self.bw = bw + super().__init__(model_y=model_y, + model_t=model_t, + model_final=None, + featurizer=None, fit_cate_intercept=fit_cate_intercept, discrete_treatment=discrete_treatment, categories=categories, - n_splits=n_splits, random_state=random_state, - monte_carlo_iterations=monte_carlo_iterations) + n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state) - @property - def bw(self): - return self._bw + def _gen_model_final(self): + return ElasticNetCV(fit_intercept=False, random_state=self.random_state) - @bw.setter - def bw(self, bw): - self._bw = bw - # super().featurizer = _RandomFeatures(self._dim, self._bw, self.random_state) - super(KernelDML, KernelDML).featurizer.__set__(self, _RandomFeatures(self._dim, self._bw, self.random_state)) + def _gen_featurizer(self): + return _RandomFeatures(self.dim, self.bw, self.random_state) @property - def dim(self): - return self._dim + def featurizer(self): + return self._gen_featurizer() - @dim.setter - def dim(self, dim): - self._dim = dim - # super().featurizer = _RandomFeatures(self._dim, self._bw, self.random_state) - super(KernelDML, KernelDML).featurizer.__set__(self, _RandomFeatures(self._dim, self._bw, self.random_state)) + @featurizer.setter + def featurizer(self, value): + if value is not None: + raise ValueError("Parameter `featurizer` cannot be altered for this estimator!") - @_BaseDML.featurizer.setter - def featurizer(self, dim): - raise AttributeError("KernelDML featurizer can't be set directly; " - "instead set the bw and dim attributes to modify the kernel") + @property + def model_final(self): + return self._gen_model_final() - @DML.model_final.setter + @model_final.setter def model_final(self, model): - raise AttributeError("KernelDML final model can't be changed from ElasticNetCV") - - @_OrthoLearner.random_state.setter - def random_state(self, random_state): - # super().random_state = random_state - super(KernelDML, KernelDML).random_state.__set__(self, random_state) - # super().model_final = ElasticNetCV(fit_intercept=False, random_state=random_state) - self._model_final = ElasticNetCV(fit_intercept=False, - random_state=random_state) + if model is not None: + raise ValueError("Parameter `model_final` cannot be altered for this estimator!") class NonParamDML(_BaseDML): @@ -1174,6 +1046,9 @@ class NonParamDML(_BaseDML): Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. 
+ random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; @@ -1190,60 +1065,46 @@ def __init__(self, discrete_treatment=False, categories='auto', n_splits=2, - random_state=None, - monte_carlo_iterations=None): + monte_carlo_iterations=None, + random_state=None): # TODO: consider whether we need more care around stateful featurizers, # since we clone it and fit separate copies - model_y = clone(model_y, safe=False) - model_t = clone(model_t, safe=False) - self._model_y = model_y - self._model_t = model_t - self._featurizer = clone(featurizer, safe=False) - - super().__init__(model_y=_FirstStageWrapper(model_y, True, - featurizer, False, discrete_treatment), - model_t=_FirstStageWrapper(model_t, False, - featurizer, False, discrete_treatment), - model_final=_FinalWrapper(model_final, False, featurizer, True), - discrete_treatment=discrete_treatment, + self.model_y = clone(model_y, safe=False) + self.model_t = clone(model_t, safe=False) + self.featurizer = clone(featurizer, safe=False) + self.model_final = clone(model_final, safe=False) + super().__init__(discrete_treatment=discrete_treatment, categories=categories, n_splits=n_splits, - random_state=random_state, - monte_carlo_iterations=monte_carlo_iterations) + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state) - @_BaseDML.featurizer.setter - def featurizer(self, featurizer): - self._featurizer = clone(featurizer, safe=False) - # featurizer isn't actually used by first stage models since linear_first_stages is False, - # so we only need to update model_final - # _BaseDML's final model setter reuses its old featurizer - # so we need to pass _BaseDML, not DML to the super() call - # super(_BasaeDML).model_final = _FinalWrapper(...) 
- super(_BaseDML, _BaseDML).model_final.__set__(self, _FinalWrapper( - self.model_final, False, self._featurizer, True)) + def _get_inference_options(self): + # add blb to parent's options + options = super()._get_inference_options() + options.update(auto=GenericSingleTreatmentModelFinalInference) + return options - @property - def model_t(self): - return self._model_t + def _gen_model_y(self): + return _FirstStageWrapper(clone(self.model_y, safe=False), True, + self.featurizer, False, self.discrete_treatment) - @model_t.setter - def model_t(self, model_t): - model_t = clone(model_t, safe=False) - self._model_t = model_t - self._rlearner_model_t = _FirstStageWrapper(model_t, False, - self.featurizer, False, self.discrete_treatment) + def _gen_model_t(self): + return _FirstStageWrapper(clone(self.model_t, safe=False), False, + self.featurizer, False, self.discrete_treatment) - @property - def model_y(self): - return self._model_y + def _gen_featurizer(self): + return clone(self.featurizer, safe=False) - @model_y.setter - def model_y(self, model_y): - model_y = clone(model_y, safe=False) - self._model_y = model_y - self._rlearner_model_y = _FirstStageWrapper(model_y, False, - self.featurizer, False, self.discrete_treatment) + def _gen_model_final(self): + return clone(self.model_final, safe=False) + + def _gen_rlearner_model_final(self): + return _FinalWrapper(self._gen_model_final(), False, self.featurizer, True) + + def refit(self, *, inference='auto'): + super().refit(inference=inference) class ForestDML(ForestModelFinalCateEstimatorMixin, NonParamDML): @@ -1268,7 +1129,7 @@ class ForestDML(ForestModelFinalCateEstimatorMixin, NonParamDML): The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). The first category will be treated as the control treatment. - n_crossfit_splits: int, cross-validation generator or an iterable, optional (Default=2) + n_splits: int, cross-validation generator or an iterable, optional (Default=2) Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -1285,6 +1146,9 @@ class ForestDML(ForestModelFinalCateEstimatorMixin, NonParamDML): Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + n_estimators : integer, optional (default=100) The total number of trees in the forest. 
The forest consists of a forest of sqrt(n_estimators) sub-forests, where each sub-forest @@ -1404,9 +1268,11 @@ class ForestDML(ForestModelFinalCateEstimatorMixin, NonParamDML): def __init__(self, model_y, model_t, + featurizer=None, discrete_treatment=False, categories='auto', - n_crossfit_splits=2, + n_splits=2, + monte_carlo_iterations=None, n_estimators=100, criterion="mse", max_depth=None, @@ -1420,41 +1286,45 @@ def __init__(self, honest=True, n_jobs=None, verbose=0, - random_state=None, - monte_carlo_iterations=None): - self._n_estimators = n_estimators - self._criterion = criterion - self._max_depth = max_depth - self._min_samples_split = min_samples_split - self._min_samples_leaf = min_samples_leaf - self._min_weight_fraction_leaf = min_weight_fraction_leaf - self._max_features = max_features - self._max_leaf_nodes = max_leaf_nodes - self._min_impurity_decrease = min_impurity_decrease - self._subsample_fr = subsample_fr - self._honest = honest - self._n_jobs = n_jobs - self._verbose = verbose - model_final = SubsampledHonestForest(n_estimators=n_estimators, - criterion=criterion, - max_depth=max_depth, - min_samples_split=min_samples_split, - min_samples_leaf=min_samples_leaf, - min_weight_fraction_leaf=min_weight_fraction_leaf, - max_features=max_features, - max_leaf_nodes=max_leaf_nodes, - min_impurity_decrease=min_impurity_decrease, - subsample_fr=subsample_fr, - honest=honest, - n_jobs=n_jobs, - random_state=random_state, - verbose=verbose) - super().__init__(model_y=model_y, model_t=model_t, - model_final=model_final, featurizer=None, + random_state=None): + self.n_estimators = n_estimators + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + self.subsample_fr = subsample_fr + self.honest = honest + self.n_jobs = n_jobs + self.verbose = verbose + super().__init__(model_y=model_y, + model_t=model_t, + model_final=None, + featurizer=featurizer, discrete_treatment=discrete_treatment, categories=categories, - n_splits=n_crossfit_splits, random_state=random_state, - monte_carlo_iterations=monte_carlo_iterations) + n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state) + + def _gen_model_final(self): + return SubsampledHonestForest(n_estimators=self.n_estimators, + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + max_features=self.max_features, + max_leaf_nodes=self.max_leaf_nodes, + min_impurity_decrease=self.min_impurity_decrease, + subsample_fr=self.subsample_fr, + honest=self.honest, + n_jobs=self.n_jobs, + random_state=self.random_state, + verbose=self.verbose) @_deprecate_positional("X and W should be passed by keyword only. In a future release " "we will disallow passing X and W by position.", ['X', 'W']) @@ -1484,7 +1354,7 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou must support a 'groups' argument to its split method. 
cache_values: bool, default False Whether to cache inputs and first stage results, which will allow refitting a different final model - inference: string, `Inference` instance, or None + inference: string, `Inference` instance, or None Method for performing inference. This estimator supports 'bootstrap' (or an instance of :class:`.BootstrapInference`) and 'blb' (for Bootstrap-of-Little-Bags based inference) @@ -1498,143 +1368,14 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou cache_values=cache_values, inference=inference) - @DML.model_final.setter - def model_final(self, model): - raise AttributeError("ForestDML final model can't be set directly;" - "instead set individual attributes to modify the SubsampledHonestForest settings") - - def _set_model_final(self): - self._rlearner_model_final = SubsampledHonestForest(n_estimators=self._n_estimators, - criterion=self._criterion, - max_depth=self._max_depth, - min_samples_split=self._min_samples_split, - min_samples_leaf=self._min_samples_leaf, - min_weight_fraction_leaf=self._min_weight_fraction_leaf, - max_features=self._max_features, - max_leaf_nodes=self._max_leaf_nodes, - min_impurity_decrease=self._min_impurity_decrease, - subsample_fr=self._subsample_fr, - honest=self._honest, - n_jobs=self._n_jobs, - random_state=self._random_state, - verbose=self._verbose) - - @property - def n_estimators(self): - return self._n_estimators - - @n_estimators.setter - def n_estimators(self, n_estimators): - self._n_estimators = n_estimators - self._set_model_final() - - @property - def criterion(s): - return self._criterion - - @criterion.setter - def criterion(self, criterion): - self._criterion = criterion - self._set_model_final() - @property - def max_depth(s): - return self._max_depth - - @max_depth.setter - def max_depth(self, max_depth): - self._max_depth = max_depth - self._set_model_final() - - @property - def min_samples_split(self, min): - return self._min_samples_split - - @min_samples_split.setter - def min_samples_split(self, min_samples_split): - self._min_samples_split = min_samples_split - self._set_model_final() - - @property - def min_samples_leaf(self, mi): - return self._min_samples_leaf - - @min_samples_leaf.setter - def min_samples_leaf(self, min_samples_leaf): - self._min_samples_leaf = min_samples_leaf - self._set_model_final() - - @property - def min_weight_fraction_leaf(self, min_weight): - return self._min_weight_fraction_leaf - - @min_weight_fraction_leaf.setter - def min_weight_fraction_leaf(self, min_weight_fraction_leaf): - self._min_weight_fraction_leaf = min_weight_fraction_leaf - self._set_model_final() - - @property - def max_features(self): - return self._max_features - - @max_features.setter - def max_features(self, max_features): - self._max_features = max_features - self._set_model_final() - - @property - def max_leaf_nodes(self, ): - return self._max_leaf_nodes - - @max_leaf_nodes.setter - def max_leaf_nodes(self, max_leaf_nodes): - self._max_leaf_nodes = max_leaf_nodes - self._set_model_final() - - @property - def min_impurity_decrease(self, min_imp): - return self._min_impurity_decrease - - @min_impurity_decrease.setter - def min_impurity_decrease(self, min_impurity_decrease): - self._min_impurity_decrease = min_impurity_decrease - self._set_model_final() - - @property - def subsample_fr(self): - return self._subsample_fr - - @subsample_fr.setter - def subsample_fr(self, subsample_fr): - self._subsample_fr = subsample_fr - self._set_model_final() - - @property - def 
honest(self): - return self._honest - - @honest.setter - def honest(self, honest): - self._honest = honest - self._set_model_final() - - @property - def n_jobs(self): - return self._n_jobs - - @n_jobs.setter - def n_jobs(self, n_jobs): - self._n_jobs = n_jobs - self._set_model_final() - - @property - def verbose(self): - return self._verbose + def model_final(self): + return self._gen_model_final() - @verbose.setter - def verbose(self, verbose): - self._verbose = verbose - self._set_model_final() + @model_final.setter + def model_final(self, model): + if model is not None: + raise ValueError("Parameter `model_final` cannot be altered for this estimator!") @deprecated("The DMLCateEstimator class has been renamed to DML; " diff --git a/econml/drlearner.py b/econml/drlearner.py index b98e15326..7c5668d05 100644 --- a/econml/drlearner.py +++ b/econml/drlearner.py @@ -258,6 +258,9 @@ class takes as input the parameter ``model_regressor``, which is an arbitrary sc Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; @@ -367,23 +370,48 @@ def __init__(self, model_propensity='auto', min_propensity=1e-6, categories='auto', n_splits=2, + monte_carlo_iterations=None, random_state=None): - if model_propensity == 'auto': - model_propensity = LogisticRegressionCV(cv=3, solver='lbfgs', multi_class='auto', - random_state=random_state) - if model_regression == 'auto': - model_regression = WeightedLassoCVWrapper(cv=3, random_state=random_state) - self._multitask_model_final = multitask_model_final - super().__init__(_ModelNuisance(model_propensity, model_regression, min_propensity), - _ModelFinal(model_final, featurizer, multitask_model_final), - n_splits=n_splits, discrete_treatment=True, + self.model_propensity = clone(model_propensity, safe=False) + self.model_regression = clone(model_regression, safe=False) + self.model_final = clone(model_final, safe=False) + self.multitask_model_final = multitask_model_final + self.featurizer = clone(featurizer, safe=False) + self.min_propensity = min_propensity + super().__init__(n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, + discrete_treatment=True, discrete_instrument=False, # no instrument, so doesn't matter categories=categories, random_state=random_state) + def _gen_ortho_learner_model_nuisance(self): + if self.model_propensity == 'auto': + model_propensity = LogisticRegressionCV(cv=3, solver='lbfgs', multi_class='auto', + random_state=self.random_state) + else: + model_propensity = clone(self.model_propensity, safe=False) + + if self.model_regression == 'auto': + model_regression = WeightedLassoCVWrapper(cv=3, random_state=self.random_state) + else: + model_regression = clone(self.model_regression, safe=False) + + return _ModelNuisance(model_propensity, model_regression, self.min_propensity) + + def _gen_featurizer(self): + return clone(self.featurizer, safe=False) + + def _gen_model_final(self): + return clone(self.model_final, safe=False) + + def _gen_ortho_learner_model_final(self): + return _ModelFinal(self._gen_model_final(), 
self._gen_featurizer(), self.multitask_model_final) + @_deprecate_positional("X and W should be passed by keyword only. In a future release " "we will disallow passing X and W by position.", ['X', 'W']) - def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, inference=None): + def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference=None): """ Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. @@ -405,6 +433,8 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou All rows corresponding to the same group will be kept together during splitting. If groups is not None, the n_splits argument passed to this class's initializer must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model inference: string, :class:`.Inference` instance, or None Method for performing inference. This estimator supports 'bootstrap' (or an instance of :class:`.BootstrapInference`). @@ -416,7 +446,7 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou # Replacing fit from _OrthoLearner, to enforce Z=None and improve the docstring return super().fit(Y, T, X=X, W=W, sample_weight=sample_weight, sample_var=sample_var, groups=groups, - inference=inference) + cache_values=cache_values, inference=inference) def score(self, Y, T, X=None, W=None): """ @@ -458,9 +488,9 @@ def multitask_model_cate(self): vector of outcomes correspond to the CATE model for each treatment, compared to baseline. Available only when multitask_model_final=True. """ - if not self._multitask_model_final: + if not self.ortho_learner_model_final._multitask_model_final: raise AttributeError("Separate CATE models were fitted for each treatment! Use model_cate.") - return super().model_final.model_cate + return self.ortho_learner_model_final.model_cate def model_cate(self, T=1): """ @@ -477,12 +507,12 @@ def model_cate(self, T=1): An instance of the model_final object that was fitted after calling fit which corresponds to the CATE model for treatment T=t, compared to baseline. Available when multitask_model_final=False. """ - if self._multitask_model_final: + if self.ortho_learner_model_final._multitask_model_final: raise AttributeError("A single multitask model was fitted for all treatments! Use multitask_model_cate.") _, T = self._expand_treatments(None, T) ind = inverse_onehot(T).item() - 1 assert ind >= 0, "No model was fitted for the control" - return super().model_final.models_cate[ind] + return self.ortho_learner_model_final.models_cate[ind] @property def models_propensity(self): @@ -521,7 +551,7 @@ def nuisance_scores_regression(self): return self.nuisance_scores_[1] @property - def featurizer(self): + def featurizer_(self): """ Get the fitted featurizer. @@ -531,7 +561,7 @@ def featurizer(self): An instance of the fitted featurizer that was used to preprocess X in the final CATE model training. Available only when featurizer is not None and X is not None. 
""" - return super().model_final._featurizer + return self.ortho_learner_model_final._featurizer def cate_feature_names(self, feature_names=None): """ @@ -556,14 +586,22 @@ def cate_feature_names(self, feature_names=None): return None if feature_names is None: feature_names = self._input_names["feature_names"] - if self.featurizer is None: + if self.featurizer_ is None: return feature_names - elif hasattr(self.featurizer, 'get_feature_names'): + elif hasattr(self.featurizer_, 'get_feature_names'): # This fails if X=None and featurizer is not None, but that case is handled above - return self.featurizer.get_feature_names(feature_names) + return self.featurizer_.get_feature_names(feature_names) else: raise AttributeError("Featurizer does not have a method: get_feature_names!") + @property + def model_final_(self): + return self.ortho_learner_model_final._model_final + + @property + def fitted_models_final(self): + return self.ortho_learner_model_final.models_cate + class LinearDRLearner(StatsModelsCateEstimatorDiscreteMixin, DRLearner): """ @@ -642,6 +680,9 @@ class LinearDRLearner(StatsModelsCateEstimatorDiscreteMixin, DRLearner): Unless an iterable is used, we call `split(X,T)` to generate the splits. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; @@ -699,21 +740,31 @@ def __init__(self, fit_cate_intercept=True, min_propensity=1e-6, categories='auto', - n_splits=2, random_state=None): + n_splits=2, + monte_carlo_iterations=None, + random_state=None): self.fit_cate_intercept = fit_cate_intercept super().__init__(model_propensity=model_propensity, model_regression=model_regression, - model_final=StatsModelsLinearRegression(fit_intercept=fit_cate_intercept), + model_final=None, featurizer=featurizer, multitask_model_final=False, min_propensity=min_propensity, categories=categories, n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, random_state=random_state) + def _gen_model_final(self): + return StatsModelsLinearRegression(fit_intercept=self.fit_cate_intercept) + + def _gen_ortho_learner_model_final(self): + return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), False) + @_deprecate_positional("X and W should be passed by keyword only. In a future release " "we will disallow passing X and W by position.", ['X', 'W']) - def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, inference='auto'): + def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference='auto'): """ Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. @@ -735,6 +786,8 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou All rows corresponding to the same group will be kept together during splitting. If groups is not None, the n_splits argument passed to this class's initializer must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model inference: string, :class:`.Inference` instance, or None Method for performing inference. 
This estimator supports ``'bootstrap'``
            (or an instance of :class:`.BootstrapInference`) and ``'statsmodels'``
@@ -747,7 +800,10 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou
         # Replacing fit from DRLearner, to add statsmodels inference in docstring
         return super().fit(Y, T, X=X, W=W,
                            sample_weight=sample_weight, sample_var=sample_var, groups=groups,
-                           inference=inference)
+                           cache_values=cache_values, inference=inference)
+
+    def refit(self, *, inference='auto'):
+        super().refit(inference=inference)
 
     @property
     def multitask_model_cate(self):
@@ -756,12 +812,22 @@ def multitask_model_cate(self):
         return super().multitask_model_cate
 
     @property
-    def model_final(self):
-        return super().model_final._model_final
+    def multitask_model_final(self):
+        return False
+
+    @multitask_model_final.setter
+    def multitask_model_final(self, value):
+        if value:
+            raise ValueError("Parameter `multitask_model_final` cannot change from `False` for this estimator!")
 
     @property
-    def fitted_models_final(self):
-        return super().model_final.models_cate
+    def model_final(self):
+        return self._gen_model_final()
+
+    @model_final.setter
+    def model_final(self, model):
+        if model is not None:
+            raise ValueError("Parameter `model_final` cannot be altered for this estimator!")
 
 
 class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner):
     """
@@ -853,6 +919,9 @@ class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner):
 
         Unless an iterable is used, we call `split(X,T)` to generate the splits.
 
+    monte_carlo_iterations: int, optional (default=None)
+        The number of times to rerun the first stage models to reduce the variance of the nuisances.
+
     random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None
         If int, random_state is the seed used by the random number generator;
         If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
@@ -913,26 +982,34 @@ def __init__(self,
                 tol=1e-4,
                 min_propensity=1e-6,
                 categories='auto',
-                 n_splits=2, random_state=None):
+                 n_splits=2,
+                 monte_carlo_iterations=None,
+                 random_state=None):
         self.fit_cate_intercept = fit_cate_intercept
-        model_final = DebiasedLasso(
-            alpha=alpha,
-            fit_intercept=fit_cate_intercept,
-            max_iter=max_iter,
-            tol=tol)
+        self.alpha = alpha
+        self.max_iter = max_iter
+        self.tol = tol
         super().__init__(model_propensity=model_propensity,
                          model_regression=model_regression,
-                         model_final=model_final,
+                         model_final=None,
                          featurizer=featurizer,
                          multitask_model_final=False,
                          min_propensity=min_propensity,
                          categories=categories,
                          n_splits=n_splits,
+                         monte_carlo_iterations=monte_carlo_iterations,
                          random_state=random_state)
 
+    def _gen_model_final(self):
+        return DebiasedLasso(alpha=self.alpha,
+                             fit_intercept=self.fit_cate_intercept,
+                             max_iter=self.max_iter,
+                             tol=self.tol)
+
+    def _gen_ortho_learner_model_final(self):
+        return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), False)
+
     @_deprecate_positional("X and W should be passed by keyword only. In a future release "
                            "we will disallow passing X and W by position.", ['X', 'W'])
-    def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, inference='auto'):
+    def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None,
+            cache_values=False, inference='auto'):
         """
         Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`.
@@ -954,6 +1031,8 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou All rows corresponding to the same group will be kept together during splitting. If groups is not None, the n_splits argument passed to this class's initializer must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model inference: string, :class:`.Inference` instance, or None Method for performing inference. This estimator supports ``'bootstrap'`` (or an instance of :class:`.BootstrapInference`) and ``'debiasedlasso'`` @@ -969,26 +1048,33 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou warn("This estimator does not yet support sample variances and inference does not take " "sample variances into account. This feature will be supported in a future release.") check_high_dimensional(X, T, threshold=5, featurizer=self.featurizer, - discrete_treatment=self._discrete_treatment, + discrete_treatment=self.discrete_treatment, msg="The number of features in the final model (< 5) is too small for a sparse model. " "We recommend using the LinearDRLearner for this low-dimensional setting.") return super().fit(Y, T, X=X, W=W, sample_weight=sample_weight, sample_var=None, groups=groups, - inference=inference) + cache_values=cache_values, inference=inference) + + def refit(self, *, inference='auto'): + super().refit(inference=inference) @property - def multitask_model_cate(self): - # Replacing this method which is invalid for this class, so that we make the - # dosctring empty and not appear in the docs. - return super().multitask_model_cate + def multitask_model_final(self): + return False + + @multitask_model_final.setter + def multitask_model_final(self, value): + if value: + raise ValueError("Parameter `multitask_model_final` cannot change from `False` for this estimator!") @property def model_final(self): - return super().model_final._model_final + return self._gen_model_final() - @property - def fitted_models_final(self): - return super().model_final.models_cate + @model_final.setter + def model_final(self, model): + if model is not None: + raise ValueError("Parameter `model_final` cannot be altered for this estimator!") class ForestDRLearner(ForestModelFinalCateEstimatorDiscreteMixin, DRLearner): @@ -1015,7 +1101,7 @@ class ForestDRLearner(ForestModelFinalCateEstimatorDiscreteMixin, DRLearner): The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). The first category will be treated as the control treatment. - n_crossfit_splits: int, cross-validation generator or an iterable, optional (Default=2) + n_splits: int, cross-validation generator or an iterable, optional (Default=2) Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -1032,6 +1118,9 @@ class ForestDRLearner(ForestModelFinalCateEstimatorDiscreteMixin, DRLearner): Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + n_estimators : integer, optional (default=100) The total number of trees in the forest. 
The forest consists of a forest of sqrt(n_estimators) sub-forests, where each sub-forest @@ -1148,9 +1237,11 @@ class ForestDRLearner(ForestModelFinalCateEstimatorDiscreteMixin, DRLearner): def __init__(self, model_regression, model_propensity, + featurizer=None, min_propensity=1e-6, categories='auto', - n_crossfit_splits=2, + n_splits=2, + monte_carlo_iterations=None, n_estimators=1000, criterion="mse", max_depth=None, @@ -1165,30 +1256,53 @@ def __init__(self, n_jobs=None, verbose=0, random_state=None): - model_final = SubsampledHonestForest(n_estimators=n_estimators, - criterion=criterion, - max_depth=max_depth, - min_samples_split=min_samples_split, - min_samples_leaf=min_samples_leaf, - min_weight_fraction_leaf=min_weight_fraction_leaf, - max_features=max_features, - max_leaf_nodes=max_leaf_nodes, - min_impurity_decrease=min_impurity_decrease, - subsample_fr=subsample_fr, - honest=honest, - n_jobs=n_jobs, - random_state=random_state, - verbose=verbose) - super().__init__(model_regression=model_regression, model_propensity=model_propensity, - model_final=model_final, featurizer=None, + self.n_estimators = n_estimators + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + self.subsample_fr = subsample_fr + self.honest = honest + self.n_jobs = n_jobs + self.verbose = verbose + super().__init__(model_regression=model_regression, + model_propensity=model_propensity, + model_final=None, + featurizer=featurizer, multitask_model_final=False, min_propensity=min_propensity, categories=categories, - n_splits=n_crossfit_splits, random_state=random_state) + n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state) + + def _gen_model_final(self): + return SubsampledHonestForest(n_estimators=self.n_estimators, + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + max_features=self.max_features, + max_leaf_nodes=self.max_leaf_nodes, + min_impurity_decrease=self.min_impurity_decrease, + subsample_fr=self.subsample_fr, + honest=self.honest, + n_jobs=self.n_jobs, + random_state=self.random_state, + verbose=self.verbose) + + def _gen_ortho_learner_model_final(self): + return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), False) @_deprecate_positional("X and W should be passed by keyword only. In a future release " "we will disallow passing X and W by position.", ['X', 'W']) - def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, inference='auto'): + def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference='auto'): """ Estimate the counterfactual model from data, i.e. estimates functions τ(·,·,·), ∂τ(·,·). @@ -1211,6 +1325,8 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou All rows corresponding to the same group will be kept together during splitting. If groups is not None, the n_splits argument passed to this class's initializer must support a 'groups' argument to its split method. 
+ cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model inference: string, `Inference` instance, or None Method for performing inference. This estimator supports 'bootstrap' (or an instance of :class:`.BootstrapInference`) and 'blb' @@ -1222,16 +1338,29 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou """ return super().fit(Y, T, X=X, W=W, sample_weight=sample_weight, sample_var=None, groups=groups, - inference=inference) + cache_values=cache_values, inference=inference) + + def refit(self, *, inference='auto'): + super().refit(inference=inference) def multitask_model_cate(self): # Replacing to remove docstring super().multitask_model_cate() @property - def model_final(self): - return super().model_final._model_final + def multitask_model_final(self): + return False + + @multitask_model_final.setter + def multitask_model_final(self, value): + if value: + raise ValueError("Parameter `multitask_model_final` cannot change from `False` for this estimator!") @property - def fitted_models_final(self): - return super().model_final.models_cate + def model_final(self): + return self._gen_model_final() + + @model_final.setter + def model_final(self, model): + if model is not None: + raise ValueError("Parameter `model_final` cannot be altered for this estimator!") diff --git a/econml/inference.py b/econml/inference.py index 1e696561f..0ba29f20a 100644 --- a/econml/inference.py +++ b/econml/inference.py @@ -95,8 +95,8 @@ class has a model_final method, whose predict(cross_product(X, [0, ..., 1, ..., """ def prefit(self, estimator, *args, **kwargs): - self.model_final = estimator.model_final - self.featurizer = estimator.featurizer if hasattr(estimator, 'featurizer') else None + self.model_final = estimator.model_final_ + self.featurizer = estimator.featurizer_ if hasattr(estimator, 'featurizer_') else None def fit(self, estimator, *args, **kwargs): # once the estimator has been fit, it's kosher to store d_t here @@ -326,8 +326,8 @@ class GenericModelFinalInferenceDiscrete(Inference): """ def prefit(self, estimator, *args, **kwargs): - self.model_final = estimator.model_final - self.featurizer = estimator.featurizer if hasattr(estimator, 'featurizer') else None + self.model_final = estimator.model_final_ + self.featurizer = estimator.featurizer_ if hasattr(estimator, 'featurizer_') else None def fit(self, estimator, *args, **kwargs): # once the estimator has been fit, it's kosher to store d_t here diff --git a/econml/ortho_iv.py b/econml/ortho_iv.py index 0ad0134e5..021cd15f0 100644 --- a/econml/ortho_iv.py +++ b/econml/ortho_iv.py @@ -136,19 +136,26 @@ def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None class _BaseDMLATEIV(_OrthoLearner): - def __init__(self, model_nuisance, - discrete_instrument=False, discrete_treatment=False, + def __init__(self, discrete_instrument=False, + discrete_treatment=False, categories='auto', - n_splits=2, random_state=None): - - super().__init__(model_nuisance, _BaseDMLATEIVModelFinal(), - discrete_treatment=discrete_treatment, discrete_instrument=discrete_instrument, + n_splits=2, + monte_carlo_iterations=None, + random_state=None): + super().__init__(discrete_treatment=discrete_treatment, + discrete_instrument=discrete_instrument, categories=categories, - n_splits=n_splits, random_state=random_state) + n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state) + + def 
_gen_ortho_learner_model_final(self): + return _BaseDMLATEIVModelFinal() @_deprecate_positional("W and Z should be passed by keyword only. In a future release " "we will disallow passing W and Z by position.", ['W', 'Z']) - def fit(self, Y, T, Z, W=None, *, sample_weight=None, sample_var=None, groups=None, inference=None): + def fit(self, Y, T, Z, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference=None): """ Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. @@ -170,6 +177,8 @@ def fit(self, Y, T, Z, W=None, *, sample_weight=None, sample_var=None, groups=No All rows corresponding to the same group will be kept together during splitting. If groups is not None, the n_splits argument passed to this class's initializer must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model inference: string,:class:`.Inference` instance, or None Method for performing inference. This estimator supports 'bootstrap' (or an instance of:class:`.BootstrapInference`). @@ -181,7 +190,7 @@ def fit(self, Y, T, Z, W=None, *, sample_weight=None, sample_var=None, groups=No # Replacing fit from _OrthoLearner, to enforce W=None and improve the docstring return super().fit(Y, T, W=W, Z=Z, sample_weight=sample_weight, sample_var=sample_var, groups=groups, - inference=inference) + cache_values=cache_values, inference=inference) def score(self, Y, T, Z, W=None): """ @@ -271,22 +280,32 @@ class DMLATEIV(_BaseDMLATEIV): a biased ATE. """ - def __init__(self, model_Y_W, model_T_W, model_Z_W, - discrete_treatment=False, discrete_instrument=False, + def __init__(self, *, model_Y_W, model_T_W, model_Z_W, + discrete_treatment=False, + discrete_instrument=False, categories='auto', - n_splits=2, random_state=None): - - super().__init__(_DMLATEIVModelNuisance(model_Y_W=_FirstStageWrapper(model_Y_W, discrete_target=False), - model_T_W=_FirstStageWrapper( - model_T_W, discrete_target=discrete_treatment), - model_Z_W=_FirstStageWrapper( - model_Z_W, discrete_target=discrete_instrument)), - discrete_instrument=discrete_instrument, discrete_treatment=discrete_treatment, + n_splits=2, + monte_carlo_iterations=None, + random_state=None): + self.model_Y_W = clone(model_Y_W, safe=False) + self.model_T_W = clone(model_T_W, safe=False) + self.model_Z_W = clone(model_Z_W, safe=False) + super().__init__(discrete_instrument=discrete_instrument, + discrete_treatment=discrete_treatment, categories=categories, - n_splits=n_splits, random_state=random_state) + n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state) + + def _gen_ortho_learner_model_nuisance(self): + return _DMLATEIVModelNuisance( + model_Y_W=_FirstStageWrapper(clone(self.model_Y_W, safe=False), discrete_target=False), + model_T_W=_FirstStageWrapper(clone(self.model_T_W, safe=False), discrete_target=self.discrete_treatment), + model_Z_W=_FirstStageWrapper(clone(self.model_Z_W, safe=False), discrete_target=self.discrete_instrument)) class _ProjectedDMLATEIVModelNuisance: + def __init__(self, model_Y_W, model_T_W, model_T_WZ): self._model_Y_W = clone(model_Y_W, safe=False) self._model_T_W = clone(model_T_W, safe=False) @@ -330,21 +349,30 @@ def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): class ProjectedDMLATEIV(_BaseDMLATEIV): - def __init__(self, model_Y_W, model_T_W, model_T_WZ, - discrete_treatment=False, 
discrete_instrument=False,
+
+    def __init__(self, *, model_Y_W, model_T_W, model_T_WZ,
+                 discrete_treatment=False,
+                 discrete_instrument=False,
                  categories='auto',
-                 n_splits=2, random_state=None):
+                 n_splits=2,
+                 monte_carlo_iterations=None,
+                 random_state=None):
+        self.model_Y_W = clone(model_Y_W, safe=False)
+        self.model_T_W = clone(model_T_W, safe=False)
+        self.model_T_WZ = clone(model_T_WZ, safe=False)
+        super().__init__(discrete_instrument=discrete_instrument,
+                         discrete_treatment=discrete_treatment,
+                         categories=categories,
+                         n_splits=n_splits,
+                         monte_carlo_iterations=monte_carlo_iterations,
+                         random_state=random_state)
 
-        super().__init__(_ProjectedDMLATEIVModelNuisance(
-            model_Y_W=_FirstStageWrapper(
-                model_Y_W, discrete_target=False),
-            model_T_W=_FirstStageWrapper(
-                model_T_W, discrete_target=discrete_treatment),
-            model_T_WZ=_FirstStageWrapper(
-                model_T_WZ, discrete_target=discrete_treatment)),
-                         discrete_treatment=discrete_treatment, discrete_instrument=discrete_instrument,
-                         categories=categories,
-                         n_splits=n_splits, random_state=random_state)
+    def _gen_ortho_learner_model_nuisance(self):
+        return _ProjectedDMLATEIVModelNuisance(
+            model_Y_W=_FirstStageWrapper(clone(self.model_Y_W, safe=False), discrete_target=False),
+            model_T_W=_FirstStageWrapper(clone(self.model_T_W, safe=False), discrete_target=self.discrete_treatment),
+            model_T_WZ=_FirstStageWrapper(clone(self.model_T_WZ, safe=False),
+                                          discrete_target=self.discrete_treatment))
 
 
 class _BaseDMLIVModelNuisance:
@@ -494,6 +522,9 @@ class _BaseDMLIV(_OrthoLearner):
         Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
         W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
 
+    monte_carlo_iterations: int, optional (default=None)
+        The number of times to rerun the first stage models to reduce the variance of the nuisances.
+
     random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
         If int, random_state is the seed used by the random number generator;
         If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
@@ -501,18 +532,19 @@ class _BaseDMLIV(_OrthoLearner):
         by :mod:`np.random`.
     """
 
-    def __init__(self, model_Y_X, model_T_X, model_T_XZ, model_final,
-                 discrete_instrument=False, discrete_treatment=False, categories='auto',
-                 n_splits=2, random_state=None):
-        super().__init__(_BaseDMLIVModelNuisance(model_Y_X, model_T_X, model_T_XZ),
-                         _BaseDMLIVModelFinal(model_final),
-                         discrete_treatment=discrete_treatment, discrete_instrument=discrete_instrument,
+    def __init__(self, discrete_instrument=False, discrete_treatment=False, categories='auto',
+                 n_splits=2, monte_carlo_iterations=None, random_state=None):
+        super().__init__(discrete_treatment=discrete_treatment,
+                         discrete_instrument=discrete_instrument,
                          categories=categories,
-                         n_splits=n_splits, random_state=random_state)
+                         n_splits=n_splits,
+                         monte_carlo_iterations=monte_carlo_iterations,
+                         random_state=random_state)
 
     @_deprecate_positional("Z and X should be passed by keyword only. In a future release "
                            "we will disallow passing Z and X by position.", ['X', 'Z'])
-    def fit(self, Y, T, Z, X=None, *, sample_weight=None, sample_var=None, groups=None, inference=None):
+    def fit(self, Y, T, Z, X=None, *, sample_weight=None, sample_var=None, groups=None,
+            cache_values=False, inference=None):
         """
         Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`.
@@ -534,6 +566,8 @@ def fit(self, Y, T, Z, X=None, *, sample_weight=None, sample_var=None, groups=No All rows corresponding to the same group will be kept together during splitting. If groups is not None, the n_splits argument passed to this class's initializer must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model inference: string,:class:`.Inference` instance, or None Method for performing inference. This estimator supports 'bootstrap' (or an instance of:class:`.BootstrapInference`). @@ -545,7 +579,7 @@ def fit(self, Y, T, Z, X=None, *, sample_weight=None, sample_var=None, groups=No # Replacing fit from _OrthoLearner, to enforce W=None and improve the docstring return super().fit(Y, T, X=X, Z=Z, sample_weight=sample_weight, sample_var=sample_var, groups=groups, - inference=inference) + cache_values=cache_values, inference=inference) def score(self, Y, T, Z, X=None): """ @@ -578,18 +612,18 @@ def score(self, Y, T, Z, X=None): @property def original_featurizer(self): - return super().model_final._model_final._original_featurizer + return self.ortho_learner_model_final._model_final._original_featurizer @property - def featurizer(self): + def featurizer_(self): # NOTE This is used by the inference methods and has to be the overall featurizer. intended # for internal use by the library - return super().model_final._model_final._featurizer + return self.ortho_learner_model_final._model_final._featurizer @property - def model_final(self): + def model_final_(self): # NOTE This is used by the inference methods and is more for internal use to the library - return super().model_final._model_final._model + return self.ortho_learner_model_final._model_final._model @property def model_cate(self): @@ -602,7 +636,7 @@ def model_cate(self): An instance of the model_final object that was fitted after calling fit which corresponds to the constant marginal CATE model. """ - return super().model_final._model_final._model + return self.ortho_learner_model_final._model_final._model @property def models_Y_X(self): @@ -615,7 +649,7 @@ def models_Y_X(self): A list of instances of the `model_Y_X` object. Each element corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [mdl._model for mdl in super().models_Y_X] + return [mdl._model_Y_X._model for mdl in super().models_nuisance] @property def models_T_X(self): @@ -628,7 +662,7 @@ def models_T_X(self): A list of instances of the `model_T_X` object. Each element corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [mdl._model for mdl in super().models_T_X] + return [mdl._model_T_X._model for mdl in super().models_nuisance] @property def models_T_XZ(self): @@ -641,7 +675,7 @@ def models_T_XZ(self): A list of instances of the `model_T_XZ` object. Each element corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [mdl._model for mdl in super().models_T_XZ] + return [mdl._model_T_XZ._model for mdl in super().models_nuisance] @property def nuisance_scores_Y_X(self): @@ -746,6 +780,9 @@ class DMLIV(_BaseDMLIV): Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. 
+    monte_carlo_iterations: int, optional (default=None)
+        The number of times to rerun the first stage models to reduce the variance of the nuisances.
+
     discrete_instrument: bool, optional, default False
         Whether the instrument values should be treated as categorical, rather than continuous, quantities
 
@@ -763,25 +800,41 @@ class DMLIV(_BaseDMLIV):
         by :mod:`np.random`.
     """
 
-    def __init__(self, model_Y_X, model_T_X, model_T_XZ, model_final, featurizer=None,
+    def __init__(self, *, model_Y_X, model_T_X, model_T_XZ, model_final,
+                 featurizer=None,
                  fit_cate_intercept=True,
-                 n_splits=2, discrete_instrument=False, discrete_treatment=False,
+                 n_splits=2,
+                 monte_carlo_iterations=None,
+                 discrete_instrument=False, discrete_treatment=False,
                  categories='auto', random_state=None):
-        self.bias_part_of_coef = fit_cate_intercept
+        self.model_Y_X = clone(model_Y_X, safe=False)
+        self.model_T_X = clone(model_T_X, safe=False)
+        self.model_T_XZ = clone(model_T_XZ, safe=False)
+        self.model_final = clone(model_final, safe=False)
+        self.featurizer = clone(featurizer, safe=False)
         self.fit_cate_intercept = fit_cate_intercept
-        super().__init__(_FirstStageWrapper(model_Y_X, False),
-                         _FirstStageWrapper(model_T_X, discrete_treatment),
-                         _FirstStageWrapper(model_T_XZ, discrete_treatment),
-                         _FinalWrapper(model_final,
-                                       fit_cate_intercept=fit_cate_intercept,
-                                       featurizer=featurizer,
-                                       use_weight_trick=False),
-                         n_splits=n_splits,
+        super().__init__(n_splits=n_splits,
+                         monte_carlo_iterations=monte_carlo_iterations,
                          discrete_instrument=discrete_instrument, discrete_treatment=discrete_treatment,
                          categories=categories, random_state=random_state)
 
+    def _gen_ortho_learner_model_nuisance(self):
+        return _BaseDMLIVModelNuisance(_FirstStageWrapper(clone(self.model_Y_X, safe=False), False),
+                                       _FirstStageWrapper(clone(self.model_T_X, safe=False), self.discrete_treatment),
+                                       _FirstStageWrapper(clone(self.model_T_XZ, safe=False), self.discrete_treatment))
+
+    def _gen_ortho_learner_model_final(self):
+        return _BaseDMLIVModelFinal(_FinalWrapper(clone(self.model_final, safe=False),
+                                                  fit_cate_intercept=self.fit_cate_intercept,
+                                                  featurizer=clone(self.featurizer, safe=False),
+                                                  use_weight_trick=False))
+
+    @property
+    def bias_part_of_coef(self):
+        return self.ortho_learner_model_final._model_final._fit_cate_intercept
+
 
 class NonParamDMLIV(_BaseDMLIV):
     """
@@ -841,6 +894,9 @@ class NonParamDMLIV(_BaseDMLIV):
         Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
         W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
 
+    monte_carlo_iterations: int, optional (default=None)
+        The number of times to rerun the first stage models to reduce the variance of the nuisances.
+
     discrete_instrument: bool, optional, default False
         Whether the instrument values should be treated as categorical, rather than continuous, quantities
 
@@ -859,23 +915,38 @@
     """
 
-    def __init__(self, model_Y_X, model_T_X, model_T_XZ, model_final,
+    def __init__(self, *, model_Y_X, model_T_X, model_T_XZ, model_final,
                  featurizer=None,
                  fit_cate_intercept=True,
-                 n_splits=2, discrete_instrument=False, discrete_treatment=False, categories='auto',
+                 n_splits=2,
+                 monte_carlo_iterations=None,
+                 discrete_instrument=False,
+                 discrete_treatment=False,
+                 categories='auto',
                  random_state=None):
-        super().__init__(_FirstStageWrapper(model_Y_X, False),
-                         _FirstStageWrapper(model_T_X, discrete_treatment),
-                         _FirstStageWrapper(model_T_XZ, discrete_treatment),
-                         _FinalWrapper(model_final,
-                                       fit_cate_intercept=fit_cate_intercept,
-                                       featurizer=featurizer,
-                                       use_weight_trick=True),
-                         n_splits=n_splits,
+        self.model_Y_X = clone(model_Y_X, safe=False)
+        self.model_T_X = clone(model_T_X, safe=False)
+        self.model_T_XZ = clone(model_T_XZ, safe=False)
+        self.model_final = clone(model_final, safe=False)
+        self.featurizer = clone(featurizer, safe=False)
+        self.fit_cate_intercept = fit_cate_intercept
+        super().__init__(n_splits=n_splits,
+                         monte_carlo_iterations=monte_carlo_iterations,
                          discrete_instrument=discrete_instrument,
                          discrete_treatment=discrete_treatment,
                          categories=categories,
                          random_state=random_state)
 
+    def _gen_ortho_learner_model_nuisance(self):
+        return _BaseDMLIVModelNuisance(_FirstStageWrapper(clone(self.model_Y_X, safe=False), False),
+                                       _FirstStageWrapper(clone(self.model_T_X, safe=False), self.discrete_treatment),
+                                       _FirstStageWrapper(clone(self.model_T_XZ, safe=False), self.discrete_treatment))
+
+    def _gen_ortho_learner_model_final(self):
+        return _BaseDMLIVModelFinal(_FinalWrapper(clone(self.model_final, safe=False),
+                                                  fit_cate_intercept=self.fit_cate_intercept,
+                                                  featurizer=clone(self.featurizer, safe=False),
+                                                  use_weight_trick=True))
+
 
 class _BaseDRIVModelFinal:
     """
@@ -1036,6 +1107,9 @@ class _BaseDRIV(_OrthoLearner):
         Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
         W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
 
+    monte_carlo_iterations: int, optional (default=None)
+        The number of times to rerun the first stage models to reduce the variance of the nuisances.
+ random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; @@ -1044,33 +1118,42 @@ class _BaseDRIV(_OrthoLearner): """ def __init__(self, - nuisance_models, model_final, featurizer=None, fit_cate_intercept=True, - cov_clip=0.1, opt_reweighted=False, - discrete_instrument=False, discrete_treatment=False, + cov_clip=0.1, + opt_reweighted=False, + discrete_instrument=False, + discrete_treatment=False, categories='auto', - n_splits=2, random_state=None): - + n_splits=2, + monte_carlo_iterations=None, + random_state=None): + self.model_final = clone(model_final, safe=False) + self.featurizer = clone(featurizer, safe=False) self.fit_cate_intercept = fit_cate_intercept - self.bias_part_of_coef = fit_cate_intercept - self.cov_clip = cov_clip self.opt_reweighted = opt_reweighted - super().__init__(nuisance_models, _BaseDRIVModelFinal(model_final, - featurizer, - discrete_treatment, - discrete_instrument, - fit_cate_intercept, - cov_clip, - opt_reweighted), - discrete_instrument=discrete_instrument, discrete_treatment=discrete_treatment, - categories=categories, n_splits=n_splits, random_state=random_state) + super().__init__(discrete_instrument=discrete_instrument, discrete_treatment=discrete_treatment, + categories=categories, n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, random_state=random_state) + + def _gen_model_final(self): + return clone(self.model_final, safe=False) + + def _gen_ortho_learner_model_final(self): + return _BaseDRIVModelFinal(self._gen_model_final(), + clone(self.featurizer, safe=False), + self.discrete_treatment, + self.discrete_instrument, + self.fit_cate_intercept, + self.cov_clip, + self.opt_reweighted) @_deprecate_positional("X, W, and Z should be passed by keyword only. In a future release " "we will disallow passing X, W, and Z by position.", ['X', 'W', 'Z']) - def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, inference=None): + def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference=None): """ Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. @@ -1094,6 +1177,8 @@ def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, g All rows corresponding to the same group will be kept together during splitting. If groups is not None, the n_splits argument passed to this class's initializer must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model inference: string,:class:`.Inference` instance, or None Method for performing inference. This estimator supports 'bootstrap' (or an instance of:class:`.BootstrapInference`). 
@@ -1105,7 +1190,7 @@ def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, g # Replacing fit from _OrthoLearner, to reorder arguments and improve the docstring return super().fit(Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight, sample_var=sample_var, groups=groups, - inference=inference) + cache_values=cache_values, inference=inference) def score(self, Y, T, Z, X=None, W=None, sample_weight=None): """ @@ -1143,18 +1228,18 @@ def score(self, Y, T, Z, X=None, W=None, sample_weight=None): @property def original_featurizer(self): - return super().model_final._original_featurizer + return self.ortho_learner_model_final._original_featurizer @property - def featurizer(self): + def featurizer_(self): # NOTE This is used by the inference methods and has to be the overall featurizer. intended # for internal use by the library - return super().model_final._featurizer + return self.ortho_learner_model_final._featurizer @property - def model_final(self): + def model_final_(self): # NOTE This is used by the inference methods and is more for internal use to the library - return super().model_final._model_final + return self.ortho_learner_model_final._model_final def cate_feature_names(self, feature_names=None): """ @@ -1243,31 +1328,46 @@ class _IntentToTreatDRIV(_BaseDRIV): Helper class for the DRIV algorithm for the intent-to-treat A/B test setting """ - def __init__(self, model_Y_X, model_T_XZ, + def __init__(self, *, model_Y_X, + model_T_XZ, prel_model_effect, - model_effect, + model_final, featurizer=None, fit_cate_intercept=True, cov_clip=.1, n_splits=3, + monte_carlo_iterations=None, opt_reweighted=False, categories='auto', random_state=None): """ """ - + self.model_Y_X = clone(model_Y_X, safe=False) + self.model_T_XZ = clone(model_T_XZ, safe=False) + self.prel_model_effect = clone(prel_model_effect, safe=False) # TODO: check that Y, T, Z do not have multiple columns - super().__init__(_IntentToTreatDRIVModelNuisance(model_Y_X, model_T_XZ, prel_model_effect), - model_effect, + super().__init__(model_final, featurizer=featurizer, fit_cate_intercept=fit_cate_intercept, cov_clip=cov_clip, n_splits=n_splits, - discrete_instrument=True, discrete_treatment=True, + monte_carlo_iterations=monte_carlo_iterations, + discrete_instrument=True, + discrete_treatment=True, categories=categories, opt_reweighted=opt_reweighted, random_state=random_state) + def _gen_prel_model_effect(self): + return clone(self.prel_model_effect, safe=False) + + def _gen_ortho_learner_model_nuisance(self): + return _IntentToTreatDRIVModelNuisance(_FirstStageWrapper(clone(self.model_Y_X, safe=False), + discrete_target=False), + _FirstStageWrapper(clone(self.model_T_XZ, safe=False), + discrete_target=True), + self._gen_prel_model_effect()) + class _DummyCATE: """ @@ -1277,7 +1377,7 @@ class _DummyCATE: def __init__(self): return - def fit(self, y, T, *, Z, X, W=None, sample_weight=None, groups=None): + def fit(self, y, T, *, Z, X, W=None, sample_weight=None, groups=None, **kwargs): return self def effect(self, X): @@ -1332,6 +1432,9 @@ class IntentToTreatDRIV(_IntentToTreatDRIV): Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + opt_reweighted : bool, optional, default False Whether to reweight the samples to minimize variance. 
If True then final_model_effect.fit must accept sample_weight as a kw argument (WeightWrapper from @@ -1351,37 +1454,46 @@ class IntentToTreatDRIV(_IntentToTreatDRIV): by :mod:`np.random`. """ - def __init__(self, model_Y_X, model_T_XZ, + def __init__(self, *, model_Y_X, model_T_XZ, flexible_model_effect, - final_model_effect=None, + model_final=None, featurizer=None, fit_cate_intercept=True, cov_clip=.1, n_splits=3, + monte_carlo_iterations=None, opt_reweighted=False, categories='auto', random_state=None): - model_Y_X = _FirstStageWrapper(model_Y_X, discrete_target=False) - model_T_XZ = _FirstStageWrapper(model_T_XZ, discrete_target=True) - prel_model_effect = _IntentToTreatDRIV(model_Y_X, - model_T_XZ, - _DummyCATE(), - flexible_model_effect, - cov_clip=1e-7, n_splits=1, - opt_reweighted=True, - random_state=random_state) - if final_model_effect is None: - final_model_effect = flexible_model_effect - super().__init__(model_Y_X, model_T_XZ, prel_model_effect, - final_model_effect, + self.flexible_model_effect = flexible_model_effect + super().__init__(model_Y_X=model_Y_X, + model_T_XZ=model_T_XZ, + prel_model_effect=None, + model_final=model_final, featurizer=featurizer, fit_cate_intercept=fit_cate_intercept, cov_clip=cov_clip, n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, opt_reweighted=opt_reweighted, categories=categories, random_state=random_state) + def _gen_model_final(self): + if self.model_final is None: + return clone(self.flexible_model_effect, safe=False) + return clone(self.model_final, safe=False) + + def _gen_prel_model_effect(self): + return _IntentToTreatDRIV(model_Y_X=clone(self.model_Y_X, safe=False), + model_T_XZ=clone(self.model_T_XZ, safe=False), + prel_model_effect=_DummyCATE(), + model_final=clone(self.flexible_model_effect, safe=False), + cov_clip=1e-7, + n_splits=1, + opt_reweighted=True, + random_state=self.random_state) + @property def models_Y_X(self): return [mdl._model_Y_X._model for mdl in super().models_nuisance] @@ -1402,6 +1514,15 @@ def nuisance_scores_T_XZ(self): def nuisance_scores_effect(self): return self.nuisance_scores_[2] + @property + def prel_model_effect(self): + return self._gen_prel_model_effect() + + @prel_model_effect.setter + def prel_model_effect(self, value): + if value is not None: + raise ValueError("Parameter `prel_model_effect` cannot be altered for this estimator.") + class LinearIntentToTreatDRIV(StatsModelsCateEstimatorMixin, IntentToTreatDRIV): """ @@ -1446,6 +1567,9 @@ class LinearIntentToTreatDRIV(StatsModelsCateEstimatorMixin, IntentToTreatDRIV): Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + categories: 'auto' or list, default 'auto' The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). The first category will be treated as the control treatment. @@ -1457,26 +1581,36 @@ class LinearIntentToTreatDRIV(StatsModelsCateEstimatorMixin, IntentToTreatDRIV): by :mod:`np.random`. 
""" - def __init__(self, model_Y_X, model_T_XZ, + def __init__(self, *, model_Y_X, + model_T_XZ, flexible_model_effect, featurizer=None, fit_cate_intercept=True, cov_clip=.1, n_splits=3, + monte_carlo_iterations=None, categories='auto', random_state=None): - super().__init__(model_Y_X, model_T_XZ, + super().__init__(model_Y_X=model_Y_X, + model_T_XZ=model_T_XZ, flexible_model_effect=flexible_model_effect, featurizer=featurizer, fit_cate_intercept=fit_cate_intercept, - final_model_effect=StatsModelsLinearRegression(fit_intercept=False), - cov_clip=cov_clip, n_splits=n_splits, opt_reweighted=False, + model_final=None, + cov_clip=cov_clip, + n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, + opt_reweighted=False, categories=categories, random_state=random_state) + def _gen_model_final(self): + return StatsModelsLinearRegression(fit_intercept=False) + # override only so that we can update the docstring to indicate support for `StatsModelsInference` @_deprecate_positional("X, W, and Z should be passed by keyword only. In a future release " "we will disallow passing X, W, and Z by position.", ['X', 'W', 'Z']) - def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, inference='auto'): + def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference='auto'): """ Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. @@ -1500,6 +1634,8 @@ def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, g All rows corresponding to the same group will be kept together during splitting. If groups is not None, the n_splits argument passed to this class's initializer must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model inference: string,:class:`.Inference` instance, or None Method for performing inference. This estimator supports 'bootstrap' (or an instance of:class:`.BootstrapInference`) and 'statsmodels' @@ -1511,4 +1647,29 @@ def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, g """ return super().fit(Y, T, Z=Z, X=X, W=W, sample_weight=sample_weight, sample_var=sample_var, groups=groups, - inference=inference) + cache_values=cache_values, inference=inference) + + def refit(self, inference='auto'): + return super().refit(inference=inference) + + @property + def bias_part_of_coef(self): + return self.ortho_learner_model_final._fit_cate_intercept + + @property + def model_final(self): + return self._gen_model_final() + + @model_final.setter + def model_final(self, value): + if value is not None: + raise ValueError("Parameter `model_final` cannot be altered for this estimator.") + + @property + def opt_reweighted(self): + return False + + @opt_reweighted.setter + def opt_reweighted(self, value): + if not (value == False): + raise ValueError("Parameter `value` cannot be altered from `False` for this estimator.")