diff --git a/econml/_ortho_learner.py b/econml/_ortho_learner.py index 13e5489cc..67bbd9115 100644 --- a/econml/_ortho_learner.py +++ b/econml/_ortho_learner.py @@ -27,6 +27,7 @@ class in this module implements the general logic in a very versatile way import copy from collections import namedtuple from warnings import warn +from abc import ABC, abstractmethod import numpy as np from sklearn.base import clone @@ -189,10 +190,10 @@ def predict(self, X, y, W=None): CachedValues = namedtuple('_CachedValues', ['nuisances', - 'Y', 'T', 'X', 'W', 'Z', 'sample_weight', 'sample_var']) + 'Y', 'T', 'X', 'W', 'Z', 'sample_weight', 'sample_var', 'groups']) -class _OrthoLearner(TreatmentExpansionMixin, LinearCateEstimator): +class _OrthoLearner(ABC, TreatmentExpansionMixin, LinearCateEstimator): """ Base class for all orthogonal learners. This class is a parent class to any method that has the following architecture: @@ -247,39 +248,6 @@ class _OrthoLearner(TreatmentExpansionMixin, LinearCateEstimator): Parameters ---------- - model_nuisance: estimator - The estimator for fitting the nuisance function. Must implement - `fit` and `predict` methods that both have signatures:: - - model_nuisance.fit(Y, T, X=X, W=W, Z=Z, - sample_weight=sample_weight, sample_var=sample_var) - model_nuisance.predict(Y, T, X=X, W=W, Z=Z, - sample_weight=sample_weight, sample_var=sample_var) - - In fact we allow for the model method signatures to skip any of the keyword arguments - as long as the class is always called with the omitted keyword argument set to ``None``. - This can be enforced in child classes by re-implementing the fit and the various effect - methods. If ``discrete_treatment=True``, then the input ``T`` to both above calls will be the - one-hot encoding of the original input ``T``, excluding the first column of the one-hot. - - If the estimator also provides a score method with the same arguments as fit, it will be used to - calculate scores during training. - - model_final: estimator for fitting the response residuals to the features and treatment residuals - Must implement `fit` and `predict` methods that must have signatures:: - - model_final.fit(Y, T, X=X, W=W, Z=Z, nuisances=nuisances, - sample_weight=sample_weight, sample_var=sample_var) - model_final.predict(X=X) - - Predict, should just take the features X and return the constant marginal effect. In fact we allow for the - model method signatures to skip any of the keyword arguments as long as the class is always called with the - omitted keyword argument set to ``None``. Moreover, the predict function of the final model can take no - argument if the class is always called with ``X=None``. This can be enforced in child classes by - re-implementing the fit and the various effect methods. If ``discrete_treatment=True``, then the input ``T`` - to both above calls will be the one-hot encoding of the original input ``T``, excluding the first column of - the one-hot. - discrete_treatment: bool Whether the treatment values should be treated as categorical, rather than continuous, quantities @@ -313,127 +281,9 @@ class _OrthoLearner(TreatmentExpansionMixin, LinearCateEstimator): If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used by :mod:`np.random`. - monte_carlo_iterations: int, optional + monte_carlo_iterations: int, optional (default=None) The number of times to rerun the first stage models to reduce the variance of the nuisances. 
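Since ``_OrthoLearner`` now derives from ``ABC`` and no longer takes the nuisance and final models in its constructor, the expository example that used to live in this docstring (removed just below) would instead be written as a small subclass overriding the two generator methods introduced later in this patch. A rough sketch, for illustration only and not part of the patch::

    import numpy as np
    from sklearn.linear_model import LinearRegression
    from econml._ortho_learner import _OrthoLearner

    class ModelNuisance:
        def __init__(self, model_t, model_y):
            self._model_t = model_t
            self._model_y = model_y

        def fit(self, Y, T, W=None):
            self._model_t.fit(W, T)
            self._model_y.fit(W, Y)
            return self

        def predict(self, Y, T, W=None):
            # residualize Y and T using the first-stage fits
            return Y - self._model_y.predict(W), T - self._model_t.predict(W)

    class ModelFinal:
        def fit(self, Y, T, W=None, nuisances=None):
            Y_res, T_res = nuisances
            self.model = LinearRegression(fit_intercept=False).fit(T_res.reshape(-1, 1), Y_res)
            return self

        def predict(self, X=None):
            return self.model.coef_[0]

    class MyOrthoLearner(_OrthoLearner):
        # models are generated lazily instead of being passed to __init__
        def _gen_ortho_learner_model_nuisance(self):
            return ModelNuisance(LinearRegression(), LinearRegression())

        def _gen_ortho_learner_model_final(self):
            return ModelFinal()

    np.random.seed(123)
    X = np.random.normal(size=(100, 3))
    y = X[:, 0] + X[:, 1] + np.random.normal(0, 0.1, size=(100,))
    est = MyOrthoLearner(n_splits=2, discrete_treatment=False, discrete_instrument=False,
                         categories='auto', random_state=None)
    est.fit(y, X[:, 0], W=X[:, 1:])
    print(est.const_marginal_effect())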
- Examples - -------- - - The example code below implements a very simple version of the double machine learning - method on top of the :class:`._OrthoLearner` class, for expository purposes. - For a more elaborate implementation of a Double Machine Learning child class of the class - :class:`._OrthoLearner` check out :class:`.DML` - and its child classes: - - .. testcode:: - - import numpy as np - from sklearn.linear_model import LinearRegression - from econml._ortho_learner import _OrthoLearner - class ModelNuisance: - def __init__(self, model_t, model_y): - self._model_t = model_t - self._model_y = model_y - def fit(self, Y, T, W=None): - self._model_t.fit(W, T) - self._model_y.fit(W, Y) - return self - def predict(self, Y, T, W=None): - return Y - self._model_y.predict(W), T - self._model_t.predict(W) - class ModelFinal: - def __init__(self): - return - def fit(self, Y, T, W=None, nuisances=None): - Y_res, T_res = nuisances - self.model = LinearRegression(fit_intercept=False).fit(T_res.reshape(-1, 1), Y_res) - return self - def predict(self, X=None): - return self.model.coef_[0] - def score(self, Y, T, W=None, nuisances=None): - Y_res, T_res = nuisances - return np.mean((Y_res - self.model.predict(T_res.reshape(-1, 1)))**2) - np.random.seed(123) - X = np.random.normal(size=(100, 3)) - y = X[:, 0] + X[:, 1] + np.random.normal(0, 0.1, size=(100,)) - est = _OrthoLearner(ModelNuisance(LinearRegression(), LinearRegression()), - ModelFinal(), - n_splits=2, discrete_treatment=False, discrete_instrument=False, - categories='auto', random_state=None) - est.fit(y, X[:, 0], W=X[:, 1:]) - - >>> est.score_ - 0.00756830... - >>> est.const_marginal_effect() - 1.02364992... - >>> est.effect() - array([1.023649...]) - >>> est.effect(T0=0, T1=10) - array([10.236499...]) - >>> est.score(y, X[:, 0], W=X[:, 1:]) - 0.00727995... - >>> est.model_final.model - LinearRegression(fit_intercept=False) - >>> est.model_final.model.coef_ - array([1.023649...]) - - The following example shows how to do double machine learning with discrete treatments, using - the _OrthoLearner: - - .. testcode:: - - class ModelNuisance: - def __init__(self, model_t, model_y): - self._model_t = model_t - self._model_y = model_y - - def fit(self, Y, T, W=None): - self._model_t.fit(W, np.matmul(T, np.arange(1, T.shape[1]+1))) - self._model_y.fit(W, Y) - return self - - def predict(self, Y, T, W=None): - return Y - self._model_y.predict(W), T - self._model_t.predict_proba(W)[:, 1:] - - class ModelFinal: - - def __init__(self): - return - - def fit(self, Y, T, W=None, nuisances=None): - Y_res, T_res = nuisances - self.model = LinearRegression(fit_intercept=False).fit(T_res.reshape(-1, 1), Y_res) - return self - - def predict(self): - # theta needs to be of dimension (1, d_t) if T is (n, d_t) - return np.array([[self.model.coef_[0]]]) - - def score(self, Y, T, W=None, nuisances=None): - Y_res, T_res = nuisances - return np.mean((Y_res - self.model.predict(T_res.reshape(-1, 1)))**2) - - np.random.seed(123) - W = np.random.normal(size=(100, 3)) - import scipy.special - from sklearn.linear_model import LogisticRegression - T = np.random.binomial(1, scipy.special.expit(W[:, 0])) - y = T + W[:, 0] + np.random.normal(0, 0.01, size=(100,)) - est = _OrthoLearner(ModelNuisance(LogisticRegression(solver='lbfgs'), LinearRegression()), - ModelFinal(), n_splits=2, discrete_treatment=True, discrete_instrument=False, - categories='auto', random_state=None) - est.fit(y, T, W=W) - - >>> est.score_ - 0.00673015... 
- >>> est.const_marginal_effect() - array([[1.008401...]]) - >>> est.effect() - array([1.008401...]) - >>> est.score(y, T, W=W) - 0.00310431... - >>> est.model_final.model.coef_[0] - 1.00840170... - Attributes ---------- models_nuisance: list of objects of type(model_nuisance) @@ -449,21 +299,67 @@ def score(self, Y, T, W=None, nuisances=None): The out-of-sample scores from training each nuisance model """ - def __init__(self, model_nuisance, model_final, *, + def __init__(self, *, discrete_treatment, discrete_instrument, categories, n_splits, random_state, monte_carlo_iterations=None): - self._ortho_learner_model_nuisance = clone(model_nuisance, safe=False) self._models_nuisance = None - self._ortho_learner_model_final = clone(model_final, safe=False) - self._n_splits = n_splits - self._discrete_treatment = discrete_treatment - self._discrete_instrument = discrete_instrument - self._init_random_state = random_state - self._random_state = check_random_state(random_state) - self._categories = categories - self._monte_carlo_iterations = monte_carlo_iterations + self.n_splits = n_splits + self.discrete_treatment = discrete_treatment + self.discrete_instrument = discrete_instrument + self.random_state = random_state + self.categories = categories + self.monte_carlo_iterations = monte_carlo_iterations super().__init__() + @abstractmethod + def _gen_ortho_learner_model_nuisance(self): + """ Must return a fresh instance of a nuisance model + + Returns + ------- + model_nuisance: estimator + The estimator for fitting the nuisance function. Must implement + `fit` and `predict` methods that both have signatures:: + + model_nuisance.fit(Y, T, X=X, W=W, Z=Z, + sample_weight=sample_weight, sample_var=sample_var) + model_nuisance.predict(Y, T, X=X, W=W, Z=Z, + sample_weight=sample_weight, sample_var=sample_var) + + In fact we allow for the model method signatures to skip any of the keyword arguments + as long as the class is always called with the omitted keyword argument set to ``None``. + This can be enforced in child classes by re-implementing the fit and the various effect + methods. If ``discrete_treatment=True``, then the input ``T`` to both above calls will be the + one-hot encoding of the original input ``T``, excluding the first column of the one-hot. + + If the estimator also provides a score method with the same arguments as fit, it will be used to + calculate scores during training. + """ + pass + + @abstractmethod + def _gen_ortho_learner_model_final(self): + """ Must return a fresh instance of a final model + + Returns + ------- + model_final: estimator for fitting the response residuals to the features and treatment residuals + Must implement `fit` and `predict` methods that must have signatures:: + + model_final.fit(Y, T, X=X, W=W, Z=Z, nuisances=nuisances, + sample_weight=sample_weight, sample_var=sample_var) + model_final.predict(X=X) + + Predict, should just take the features X and return the constant marginal effect. In fact we allow for the + model method signatures to skip any of the keyword arguments as long as the class is always called with the + omitted keyword argument set to ``None``. Moreover, the predict function of the final model can take no + argument if the class is always called with ``X=None``. This can be enforced in child classes by + re-implementing the fit and the various effect methods. If ``discrete_treatment=True``, then the input ``T`` + to both above calls will be the one-hot encoding of the original input ``T``, excluding the first column of + the one-hot. 
+        """
+        pass
+
     def _check_input_dims(self, Y, T, X=None, W=None, Z=None, *other_arrays):
         assert shape(Y)[0] == shape(T)[0], "Dimension mis-match!"
         for arr in [X, W, Z, *other_arrays]:
@@ -495,27 +391,28 @@ def _subinds_check_none(self, var, inds):
 
     def _strata(self, Y, T, X=None, W=None, Z=None,
                 sample_weight=None, sample_var=None, groups=None,
                 cache_values=False, monte_carlo_iterations=None):
-        if self._discrete_instrument:
+        if self.discrete_instrument:
             Z = LabelEncoder().fit_transform(np.ravel(Z))
 
-        if self._discrete_treatment:
+        if self.discrete_treatment:
             enc = LabelEncoder()
             T = enc.fit_transform(np.ravel(T))
-            if self._discrete_instrument:
+            if self.discrete_instrument:
                 return T + Z * len(enc.classes_)
             else:
                 return T
-        elif self._discrete_instrument:
+        elif self.discrete_instrument:
             return Z
         else:
             return None
 
-    def _prefit(self, Y, T, *args, **kwargs):
-        categories = self._categories
-        if self._discrete_treatment:
-            if categories != 'auto':
-                categories = [categories]  # OneHotEncoder expects a 2D array with features per column
-            self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first')
+    def _prefit(self, Y, T, *args, only_final=False, **kwargs):
+
+        # generate an instance of the final model
+        self._ortho_learner_model_final = self._gen_ortho_learner_model_final()
+        if not only_final:
+            # generate an instance of the nuisance model
+            self._ortho_learner_model_nuisance = self._gen_ortho_learner_model_nuisance()
 
         super()._prefit(Y, T, *args, **kwargs)
@@ -523,7 +420,7 @@ def _prefit(self, Y, T, *args, **kwargs):
                           "we will disallow passing X, W, and Z by position.", ['X', 'W', 'Z'])
     @BaseCateEstimator._wrap_fit
     def fit(self, Y, T, X=None, W=None, Z=None, *, sample_weight=None, sample_var=None, groups=None,
-            cache_values=False, inference=None):
+            cache_values=False, inference=None, only_final=False, check_input=True):
         """
         Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`.
 
@@ -552,44 +449,81 @@ def fit(self, Y, T, X=None, W=None, Z=None, *, sample_weight=None, sample_var=No
         inference: string, :class:`.Inference` instance, or None
             Method for performing inference.  This estimator supports 'bootstrap'
             (or an instance of :class:`.BootstrapInference`).
+        only_final: bool, default False
+            Whether to skip refitting the nuisance models and fit only the final model,
+            reusing the previously cached nuisance values
+        check_input: bool, default True
+            Whether to check if the input is valid
 
         Returns
         -------
         self : _OrthoLearner instance
         """
-        self._random_state = check_random_state(self._init_random_state)
-        Y, T, X, W, Z, sample_weight, sample_var, groups = check_input_arrays(
-            Y, T, X, W, Z, sample_weight, sample_var, groups)
-        self._check_input_dims(Y, T, X, W, Z, sample_weight, sample_var, groups)
-
-        all_nuisances = []
-        fitted_inds = None
-
-        for _ in range(self._monte_carlo_iterations or 1):
-            nuisances, new_inds = self._fit_nuisances(Y, T, X, W, Z, sample_weight=sample_weight, groups=groups)
-            all_nuisances.append(nuisances)
-            if fitted_inds is None:
-                fitted_inds = new_inds
-            elif not np.array_equal(fitted_inds, new_inds):
-                raise AttributeError("Different indices were fit by different folds, so they cannot be aggregated")
-
-        if self._monte_carlo_iterations is not None:
-            # TODO: support different ways to aggregate, like median?
- nuisances = np.mean(np.array(all_nuisances), axis=0) - - Y, T, X, W, Z, sample_weight, sample_var = (self._subinds_check_none(arr, fitted_inds) - for arr in (Y, T, X, W, Z, sample_weight, sample_var)) - nuisances = tuple([self._subinds_check_none(nuis, fitted_inds) for nuis in nuisances]) - self._cached_values = CachedValues(nuisances=nuisances, - Y=Y, T=T, X=X, W=W, Z=Z, - sample_weight=sample_weight, - sample_var=sample_var) if cache_values else None + self._random_state = check_random_state(self.random_state) + if check_input: + Y, T, X, W, Z, sample_weight, sample_var, groups = check_input_arrays( + Y, T, X, W, Z, sample_weight, sample_var, groups) + self._check_input_dims(Y, T, X, W, Z, sample_weight, sample_var, groups) + + if not only_final: + + if self.discrete_treatment: + categories = self.categories + if categories != 'auto': + categories = [categories] # OneHotEncoder expects a 2D array with features per column + self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first') + self._one_hot_encoder.fit(reshape(T, (-1, 1))) + self._d_t = (len(self._one_hot_encoder.categories_[0]) - 1,) + self.transformer = FunctionTransformer( + func=_EncoderWrapper(self._one_hot_encoder).encode, + validate=False) + else: + self.transformer = None + + if self.discrete_instrument: + z_enc = LabelEncoder() + z_ohe = OneHotEncoder(categories='auto', sparse=False, drop='first') + z_ohe.fit(reshape(z_enc.fit_transform(Z.ravel()), (-1, 1))) + self.z_transformer = FunctionTransformer( + func=_EncoderWrapper(z_ohe, z_enc).encode, + validate=False) + else: + self.z_transformer = None + + all_nuisances = [] + fitted_inds = None + + for _ in range(self.monte_carlo_iterations or 1): + nuisances, new_inds = self._fit_nuisances(Y, T, X, W, Z, sample_weight=sample_weight, groups=groups) + all_nuisances.append(nuisances) + if fitted_inds is None: + fitted_inds = new_inds + elif not np.array_equal(fitted_inds, new_inds): + raise AttributeError("Different indices were fit by different folds, so they cannot be aggregated") + + if self.monte_carlo_iterations is not None: + # TODO: support different ways to aggregate, like median? + nuisances = np.mean(np.array(all_nuisances), axis=0) + + Y, T, X, W, Z, sample_weight, sample_var = (self._subinds_check_none(arr, fitted_inds) + for arr in (Y, T, X, W, Z, sample_weight, sample_var)) + nuisances = tuple([self._subinds_check_none(nuis, fitted_inds) for nuis in nuisances]) + self._cached_values = CachedValues(nuisances=nuisances, + Y=Y, T=T, X=X, W=W, Z=Z, + sample_weight=sample_weight, + sample_var=sample_var, + groups=groups) if cache_values else None + else: + nuisances = self._cached_values.nuisances + # _d_t is altered by fit nuisances to what prefit does. So we need to perform the same + # alteration even when we only want to fit_final. 
+ if self.transformer is not None: + self._d_t = (len(self._one_hot_encoder.categories_[0]) - 1,) + self._fit_final(Y=Y, T=T, X=X, W=W, Z=Z, nuisances=nuisances, sample_weight=sample_weight, sample_var=sample_var) - self._cache_invalid_message = None return self def refit(self, inference=None): @@ -605,74 +539,40 @@ def refit(self, inference=None): This instance """ assert self._cached_values, "Refit can only be called if values were cached during the original fit" - assert self._cache_invalid_message is None, "Can't refit: " + self._cache_invalid_message cached = self._cached_values kwargs = filter_none_kwargs( - X=cached.X, - W=cached.W, - Z=cached.Z, - sample_weight=cached.sample_weight, - sample_var=cached.sample_var + Y=cached.Y, T=cached.T, X=cached.X, W=cached.W, Z=cached.Z, + sample_weight=cached.sample_weight, sample_var=cached.sample_var, + groups=cached.groups, ) - - # in case the final model uses randomness, make sure to reset the internal random state - self._random_state = check_random_state(self._init_random_state) - - # this sequence of calls parallels that in BaseCateEstimator.wrap_fit - - inference = self._get_inference(inference) - - # not strictly necessary when refitting - self._prefit(cached.Y, cached.T, **kwargs) - if inference is not None: - # this *is* necessary even when refitting - # because we might have a new inference object - inference.prefit(self, cached.Y, cached.T, **kwargs) - - # fit only the final model - self._fit_final(cached.Y, - cached.T, - nuisances=cached.nuisances, - **kwargs) - - if inference is not None: - # NOTE: we call inference fit *after* calling the main fit method - inference.fit(self, cached.Y, cached.T, **kwargs) - self._inference = inference - + _OrthoLearner.fit(self, **kwargs, + cache_values=True, inference=inference, only_final=True, check_input=False) return self def _fit_nuisances(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + # use a binary array to get stratified split in case of discrete treatment - stratify = self._discrete_treatment or self._discrete_instrument + stratify = self.discrete_treatment or self.discrete_instrument strata = self._strata(Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight, groups=groups) if strata is None: strata = T # always safe to pass T as second arg to split even if we're not actually stratifying - if self._discrete_treatment: - T = self._one_hot_encoder.fit_transform(reshape(T, (-1, 1))) - - if self._discrete_instrument: - z_enc = LabelEncoder() - Z = z_enc.fit_transform(Z.ravel()) - z_ohe = OneHotEncoder(categories='auto', sparse=False, drop='first') - Z = z_ohe.fit_transform(reshape(Z, (-1, 1))) - self.z_transformer = FunctionTransformer( - func=_EncoderWrapper(z_ohe, z_enc).encode, - validate=False) - else: - self.z_transformer = None + if self.discrete_treatment: + T = self.transformer.transform(reshape(T, (-1, 1))) + + if self.discrete_instrument: + Z = self.z_transformer.transform(reshape(Z, (-1, 1))) - if self._n_splits == 1: # special case, no cross validation + if self.n_splits == 1: # special case, no cross validation folds = None else: - splitter = check_cv(self._n_splits, [0], classifier=stratify) + splitter = check_cv(self.n_splits, [0], classifier=stratify) # if check_cv produced a new KFold or StratifiedKFold object, we need to set shuffle and random_state # TODO: ideally, we'd also infer whether we need a GroupKFold (if groups are passed) # however, sklearn doesn't support both stratifying and grouping (see # 
https://github.com/scikit-learn/scikit-learn/issues/13621), so for now the user needs to supply # their own object that supports grouping if they want to use groups. - if splitter != self._n_splits and isinstance(splitter, (KFold, StratifiedKFold)): + if splitter != self.n_splits and isinstance(splitter, (KFold, StratifiedKFold)): splitter.shuffle = True splitter.random_state = self._random_state @@ -687,14 +587,6 @@ def _fit_nuisances(self, Y, T, X=None, W=None, Z=None, sample_weight=None, group else: folds = splitter.split(to_split, strata) - if self._discrete_treatment: - self._d_t = shape(T)[1:] - self.transformer = FunctionTransformer( - func=_EncoderWrapper(self._one_hot_encoder).encode, - validate=False) - else: - self.transformer = None - nuisances, fitted_models, fitted_inds, scores = _crossfit(self._ortho_learner_model_nuisance, folds, Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight, groups=groups) @@ -806,76 +698,12 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None): @property def ortho_learner_model_final(self): + if not hasattr(self, '_ortho_learner_model_final'): + raise AttributeError("Model is not fitted!") return self._ortho_learner_model_final - @ortho_learner_model_final.setter - def ortho_learner_model_final(self, model_final): - self._ortho_learner_model_final = clone(model_final, safe=False) - - @property - def ortho_learner_model_nuisance(self): - return self._ortho_learner_model_nuisance - - @ortho_learner_model_nuisance.setter - def ortho_learner_model_nuisance(self, model_nuisance): - self._ortho_learner_model_nuisance = clone(model_nuisance, safe=False) - self._cache_invalid_message = "Setting the nuisance model invalidates the cached nuisances" - - @property - def n_splits(self): - return self._n_splits - - @n_splits.setter - def n_splits(self, n_splits): - self._n_splits = n_splits - self._cache_invalid_message = "Setting the number of splits invalidates the cached nuisances" - - @property - def discrete_treatment(self): - return self._discrete_treatment - - @discrete_treatment.setter - def discrete_treatment(self, discrete_treatment): - self._discrete_treatment = discrete_treatment - self._cache_invalid_message = "Setting discrete_treatment invalidates the cached nuisances" - - @property - def discrete_instrument(self): - return self._discrete_instrument - - @discrete_instrument.setter - def discrete_instrument(self, discrete_instrument): - self._discrete_instrument = discrete_instrument - self._cache_invalid_message = "Setting discrete_instrument invalidates the cached nuisances" - - @property - def random_state(self): - return self._init_random_state - - @random_state.setter - def random_state(self, random_state): - self._init_random_state = random_state - self._random_state = check_random_state(random_state) - self._cache_invalid_message = "Setting the random state invalidates the cached nuisances" - - @property - def categories(self): - return self._categories - - @categories.setter - def categories(self, categories): - self._categories = categories - self._cache_invalid_message = "Setting the categories invalidates the cached nuisances" - - @property - def monte_carlo_iterations(self): - return self._monte_carlo_iterations - - @monte_carlo_iterations.setter - def monte_carlo_iterations(self, monte_carlo_iterations): - self._monte_carlo_iterations = monte_carlo_iterations - self._cache_invalid_message = "Setting the number of monte carlo iterations invalidates the cached nuisances" - @property def models_nuisance(self): + if not 
hasattr(self, '_models_nuisance'): + raise AttributeError("Model is not fitted!") return self._models_nuisance diff --git a/econml/_rlearner.py b/econml/_rlearner.py index 640b450cb..c62e52770 100644 --- a/econml/_rlearner.py +++ b/econml/_rlearner.py @@ -25,6 +25,7 @@ The Econometrics Journal. https://arxiv.org/abs/1608.00060 """ +from abc import abstractmethod import numpy as np import copy from warnings import warn @@ -139,31 +140,6 @@ class _RLearner(_OrthoLearner): Parameters ---------- - model_y: estimator of E[Y | X, W] - The estimator for fitting the response to the features and controls. Must implement - `fit` and `predict` methods. Unlike sklearn estimators both methods must - take an extra second argument (the controls), i.e. :: - - model_y.fit(X, W, Y, sample_weight=sample_weight) - model_y.predict(X, W) - - model_t: estimator of E[T | X, W] - The estimator for fitting the treatment to the features and controls. Must implement - `fit` and `predict` methods. Unlike sklearn estimators both methods must - take an extra second argument (the controls), i.e. :: - - model_t.fit(X, W, T, sample_weight=sample_weight) - model_t.predict(X, W) - - model_final: estimator for fitting the response residuals to the features and treatment residuals - Must implement `fit` and `predict` methods. Unlike sklearn estimators the fit methods must - take an extra second argument (the treatment residuals). Predict, on the other hand, - should just take the features and return the constant marginal effect. More, concretely:: - - model_final.fit(X, T_res, Y_res, - sample_weight=sample_weight, sample_var=sample_var) - model_final.predict(X) - discrete_treatment: bool Whether the treatment values should be treated as categorical, rather than continuous, quantities @@ -197,60 +173,6 @@ class _RLearner(_OrthoLearner): monte_carlo_iterations: int, optional The number of times to rerun the first stage models to reduce the variance of the nuisances. - Examples - -------- - The example code below implements a very simple version of the double machine learning - method on top of the :class:`._RLearner` class, for expository purposes. - For a more elaborate implementation of a Double Machine Learning child class of the class - checkout :class:`.DML` and its child classes: - - .. 
testcode:: - - import numpy as np - from sklearn.linear_model import LinearRegression - from econml._rlearner import _RLearner - from sklearn.base import clone - class ModelFirst: - def __init__(self, model): - self._model = clone(model, safe=False) - def fit(self, X, W, Y, sample_weight=None): - self._model.fit(np.hstack([X, W]), Y) - return self - def predict(self, X, W): - return self._model.predict(np.hstack([X, W])) - class ModelFinal: - def fit(self, X, T_res, Y_res, sample_weight=None, sample_var=None): - self.model = LinearRegression(fit_intercept=False).fit(X * T_res.reshape(-1, 1), - Y_res) - return self - def predict(self, X): - return self.model.predict(X) - np.random.seed(123) - X = np.random.normal(size=(1000, 3)) - y = X[:, 0] + X[:, 1] + np.random.normal(0, 0.01, size=(1000,)) - est = _RLearner(ModelFirst(LinearRegression()), - ModelFirst(LinearRegression()), - ModelFinal(), - n_splits=2, discrete_treatment=False, categories='auto', random_state=None) - est.fit(y, X[:, 0], X=np.ones((X.shape[0], 1)), W=X[:, 1:]) - - >>> est.const_marginal_effect(np.ones((1,1))) - array([0.999631...]) - >>> est.effect(np.ones((1,1)), T0=0, T1=10) - array([9.996314...]) - >>> est.score(y, X[:, 0], X=np.ones((X.shape[0], 1)), W=X[:, 1:]) - 9.73638006...e-05 - >>> est.rlearner_model_final.model - LinearRegression(fit_intercept=False) - >>> est.rlearner_model_final.model.coef_ - array([0.999631...]) - >>> est.score_ - 9.82623204...e-05 - >>> [mdl._model for mdl in est.models_y] - [LinearRegression(), LinearRegression()] - >>> [mdl._model for mdl in est.models_t] - [LinearRegression(), LinearRegression()] - Attributes ---------- models_y: list of objects of type(model_y) @@ -276,20 +198,66 @@ def predict(self, X): is multidimensional, then the average of the MSEs for each dimension of Y is returned. """ - def __init__(self, model_y, model_t, model_final, - discrete_treatment, categories, n_splits, random_state, monte_carlo_iterations=None): - self._rlearner_model_final = model_final - self._rlearner_model_y = model_y - self._rlearner_model_t = model_t - super().__init__(_ModelNuisance(clone(model_y, safe=False), clone(model_t, safe=False)), - _ModelFinal(clone(model_final, safe=False)), - discrete_treatment=discrete_treatment, + def __init__(self, *, discrete_treatment, categories, n_splits, random_state, monte_carlo_iterations=None): + super().__init__(discrete_treatment=discrete_treatment, discrete_instrument=False, # no instrument, so doesn't matter categories=categories, n_splits=n_splits, random_state=random_state, monte_carlo_iterations=monte_carlo_iterations) + @abstractmethod + def _gen_model_y(self): + """ + Returns + ------- + model_y: estimator of E[Y | X, W] + The estimator for fitting the response to the features and controls. Must implement + `fit` and `predict` methods. Unlike sklearn estimators both methods must + take an extra second argument (the controls), i.e. :: + + model_y.fit(X, W, Y, sample_weight=sample_weight) + model_y.predict(X, W) + """ + pass + + @abstractmethod + def _gen_model_t(self): + """ + Returns + ------- + model_t: estimator of E[T | X, W] + The estimator for fitting the treatment to the features and controls. Must implement + `fit` and `predict` methods. Unlike sklearn estimators both methods must + take an extra second argument (the controls), i.e. 
:: + + model_t.fit(X, W, T, sample_weight=sample_weight) + model_t.predict(X, W) + """ + pass + + @abstractmethod + def _gen_rlearner_model_final(self): + """ + Returns + ------- + model_final: estimator for fitting the response residuals to the features and treatment residuals + Must implement `fit` and `predict` methods. Unlike sklearn estimators the fit methods must + take an extra second argument (the treatment residuals). Predict, on the other hand, + should just take the features and return the constant marginal effect. More, concretely:: + + model_final.fit(X, T_res, Y_res, + sample_weight=sample_weight, sample_var=sample_var) + model_final.predict(X) + """ + pass + + def _gen_ortho_learner_model_nuisance(self): + return _ModelNuisance(self._gen_model_y(), self._gen_model_t()) + + def _gen_ortho_learner_model_final(self): + return _ModelFinal(self._gen_rlearner_model_final()) + @_deprecate_positional("X, and should be passed by keyword only. In a future release " "we will disallow passing X and W by position.", ['X', 'W']) def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, @@ -363,35 +331,7 @@ def score(self, Y, T, X=None, W=None): def rlearner_model_final(self): # NOTE: important to get parent's wrapped copy so that # after training wrapped featurizer is also trained, etc. - return self._ortho_learner_model_final._model_final - - @rlearner_model_final.setter - def rlearner_model_final(self, model_final): - model_final = clone(model_final, safe=False) - self._rlearner_model_final = model_final - self._ortho_learner_model_final = _ModelFinal(model_final) - - @property - def rlearner_model_y(self): - return self._ortho_learner_model_nuisance._model_y - - @rlearner_model_y.setter - def rlearner_model_y(self, model_y): - model_y = clone(model_y, safe=False) - self._rlearner_model_y = model_y - self._ortho_learner_model_nuisance = _ModelNuisance(model_y, self._rlearner_model_t) - self._cache_invalid_message = "Setting the Y model invalidates cached nuisances" - - @property - def rlearner_model_t(self): - return self._ortho_learner_model_nuisance._model_t - - @rlearner_model_t.setter - def rlearner_model_t(self, model_t): - model_t = clone(model_t, safe=False) - self._rlearner_model_t = model_t - self._ortho_learner_model_nuisance = _ModelNuisance(self._rlearner_model_y, model_t) - self._cache_invalid_message = "Setting the T model invalidates cached nuisances" + return self.ortho_learner_model_final._model_final @property def models_y(self): @@ -408,17 +348,3 @@ def nuisance_scores_y(self): @property def nuisance_scores_t(self): return self.nuisance_scores_[1] - - @_OrthoLearner.ortho_learner_model_final.setter - def ortho_learner_model_final(self, model): - raise AttributeError("OrthoLearner's final model cannot be set directly on an _RLearner instance; " - "set the rlearner_model_final isntead.") - - @_OrthoLearner.ortho_learner_model_nuisance.setter - def ortho_learner_model_nuisance(self, model): - raise AttributeError("Nuisance model cannot be set directly on an _RLearner instance; " - "set the Y and T model attributes instead.") - - @_OrthoLearner.discrete_instrument.setter - def discrete_instrument(self, flag): - raise AttributeError("_RLearners don't support instruments, so discrete_instrument will always be False") diff --git a/econml/cate_estimator.py b/econml/cate_estimator.py index 31a245107..593c73300 100644 --- a/econml/cate_estimator.py +++ b/econml/cate_estimator.py @@ -511,7 +511,9 @@ def _get_inference_options(self): 
options.update(auto=LinearModelFinalInference) return options - bias_part_of_coef = False + @property + def bias_part_of_coef(self): + return False @property def coef_(self): @@ -528,7 +530,7 @@ def coef_(self): a vector and not a 2D array. For binary treatment the n_t dimension is also omitted. """ - return parse_final_model_params(self.model_final.coef_, self.model_final.intercept_, + return parse_final_model_params(self.model_final_.coef_, self.model_final_.intercept_, self._d_y, self._d_t, self._d_t_in, self.bias_part_of_coef, self.fit_cate_intercept)[0] @@ -547,7 +549,7 @@ def intercept_(self): """ if not self.fit_cate_intercept: raise AttributeError("No intercept was fitted!") - return parse_final_model_params(self.model_final.coef_, self.model_final.intercept_, + return parse_final_model_params(self.model_final_.coef_, self.model_final_.intercept_, self._d_y, self._d_t, self._d_t_in, self.bias_part_of_coef, self.fit_cate_intercept)[1] @@ -726,7 +728,7 @@ def _get_inference_options(self): @property def feature_importances_(self): - return self.model_final.feature_importances_ + return self.model_final_.feature_importances_ class LinearModelFinalCateEstimatorDiscreteMixin(BaseCateEstimator): diff --git a/econml/dml.py b/econml/dml.py index ac07393da..70ee975f1 100644 --- a/econml/dml.py +++ b/econml/dml.py @@ -52,7 +52,7 @@ ForestModelFinalCateEstimatorMixin, LinearModelFinalCateEstimatorMixin, StatsModelsCateEstimatorMixin) -from .inference import StatsModelsInference +from .inference import StatsModelsInference, GenericSingleTreatmentModelFinalInference from .sklearn_extensions.ensemble import SubsampledHonestForest from .sklearn_extensions.linear_model import (MultiOutputDebiasedLasso, StatsModelsLinearRegression, @@ -235,30 +235,17 @@ def original_featurizer(self): return self.rlearner_model_final._original_featurizer @property - def featurizer(self): + def featurizer_(self): # NOTE This is used by the inference methods and has to be the overall featurizer. intended # for internal use by the library return self.rlearner_model_final._featurizer @property - def model_final(self): + def model_final_(self): # NOTE This is used by the inference methods and is more for internal use to the library # We need to use the rlearner's copy to retain the information from fitting return self.rlearner_model_final._model - @model_final.setter - def model_final(self, model): - model = _FinalWrapper(model, - fit_cate_intercept=self.rlearner_model_final._fit_cate_intercept, - featurizer=self.rlearner_model_final._original_featurizer, - use_weight_trick=self.rlearner_model_final._use_weight_trick) - self._rlearner_model_final = model - - @_RLearner.rlearner_model_final.setter - def rlearner_model_final(self, model): - raise AttributeError("rlearner_final_model cannot be set directly on a DML instance; " - "set the model_final attributes instead.") - @property def model_cate(self): """ @@ -270,7 +257,7 @@ def model_cate(self): An instance of the model_final object that was fitted after calling fit which corresponds to the constant marginal CATE model. """ - return self._model_final + return self.rlearner_model_final._model @property def models_y(self): @@ -443,6 +430,9 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. 
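Because the constructor arguments are now kept as plain attributes and the wrapped first-stage and final models are regenerated through the ``_gen_*`` methods on every fit, a cached fit can be reused to swap in a different final model, as the fit docstring below describes. An illustrative sketch on hypothetical data (not part of the patch; assumes the refit workflow added in this diff)::

    import numpy as np
    from sklearn.linear_model import Lasso, LinearRegression
    from econml.dml import DML

    np.random.seed(0)
    X = np.random.normal(size=(500, 3))
    T = X[:, 0] + np.random.normal(size=(500,))
    Y = 2 * T + X[:, 0] + np.random.normal(size=(500,))

    est = DML(model_y=LinearRegression(), model_t=LinearRegression(),
              model_final=LinearRegression(fit_intercept=False))
    est.fit(Y, T, X=X, cache_values=True, inference=None)     # first-stage results are cached
    est.model_final = Lasso(alpha=0.01, fit_intercept=False)  # now a plain attribute, so it can be replaced
    est.refit(inference=None)                                 # refits only the final stage from the cache
    print(est.const_marginal_effect(X[:3]))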
+ monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; @@ -461,53 +451,55 @@ def __init__(self, discrete_treatment=False, categories='auto', n_splits=2, - random_state=None, - monte_carlo_iterations=None): + monte_carlo_iterations=None, + random_state=None): # set random_state and discrete_treatment now even though they're set by super's init # so that they can be used to initialize models - self._random_state = check_random_state(random_state) - self._discrete_treatment = discrete_treatment + self.random_state = random_state + self.discrete_treatment = discrete_treatment # TODO: consider whether we need more care around stateful featurizers, # since we clone it and fit separate copies - self._fit_cate_intercept = fit_cate_intercept - self._linear_first_stages = linear_first_stages - self._featurizer = clone(featurizer, safe=False) - super().__init__(model_y=self._prepare_model_y(clone(model_y, safe=False)), - model_t=self._prepare_model_t(clone(model_t, safe=False)), - model_final=self._prepare_final_model(model_final), - discrete_treatment=discrete_treatment, + self.fit_cate_intercept = fit_cate_intercept + self.linear_first_stages = linear_first_stages + self.featurizer = clone(featurizer, safe=False) + self.model_y = clone(model_y, safe=False) + self.model_t = clone(model_t, safe=False) + self.model_final = clone(model_final, safe=False) + super().__init__(discrete_treatment=discrete_treatment, categories=categories, n_splits=n_splits, - random_state=random_state, - monte_carlo_iterations=monte_carlo_iterations) - - def _prepare_model_y(self, model_y): - self._model_y = model_y - if model_y == 'auto': - model_y = WeightedLassoCVWrapper(random_state=self._random_state) - return _FirstStageWrapper(model_y, True, self._featurizer, self._linear_first_stages, self._discrete_treatment) - - def _prepare_model_t(self, model_t): - self._model_t = model_t - if model_t == 'auto': - if self._discrete_treatment: - model_t = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self._random_state), - random_state=self._random_state) + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state) + + def _gen_model_y(self): + if self.model_y == 'auto': + model_y = WeightedLassoCVWrapper(random_state=self.random_state) + else: + model_y = clone(self.model_y, safe=False) + return _FirstStageWrapper(model_y, True, self.featurizer, self.linear_first_stages, self.discrete_treatment) + + def _gen_model_t(self): + if self.model_t == 'auto': + if self.discrete_treatment: + model_t = LogisticRegressionCV(cv=WeightedStratifiedKFold(random_state=self.random_state), + random_state=self.random_state) else: - model_t = WeightedLassoCVWrapper(random_state=self._random_state) - return _FirstStageWrapper(model_t, False, self._featurizer, - self._linear_first_stages, self._discrete_treatment) + model_t = WeightedLassoCVWrapper(random_state=self.random_state) + else: + model_t = clone(self.model_t, safe=False) + return _FirstStageWrapper(model_t, False, self.featurizer, + self.linear_first_stages, self.discrete_treatment) + + def _gen_featurizer(self): + return clone(self.featurizer, safe=False) - def _prepare_final_model(self, model): 
- self._model_final = model - return _FinalWrapper(self._model_final, self._fit_cate_intercept, self._featurizer, False) + def _gen_model_final(self): + return clone(self.model_final, safe=False) - def _update_models(self): - self._rlearner_model_y = self._prepare_model_y(self.model_y) - self._rlearner_model_t = self._prepare_model_t(self.model_t) - self._rlearner_model_final = self._prepare_model_final(self.model_final) + def _gen_rlearner_model_final(self): + return _FinalWrapper(self._gen_model_final(), self.fit_cate_intercept, self.featurizer, False) # override only so that we can update the docstring to indicate support for `StatsModelsInference` @_deprecate_positional("X and W should be passed by keyword only. In a future release " @@ -535,7 +527,6 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou must support a 'groups' argument to its split method. cache_values: bool, default False Whether to cache inputs and first stage results, which will allow refitting a different final model - inference: string, :class:`.Inference` instance, or None Method for performing inference. This estimator supports 'bootstrap' (or an instance of :class:`.BootstrapInference`) and 'auto' @@ -549,90 +540,12 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou cache_values=cache_values, inference=inference) - @property - def linear_first_stages(self): - return self._linear_first_stages - - @linear_first_stages.setter - def linear_first_stages(self, linear_first_stages): - self._linear_first_stages = linear_first_stages - self._update_models() - # updating the models will set a misleading invalidation message, so overwrite it - self._cache_invalid_message = "Changing linear_first_stages invalidates stored nuisance results" - - @_BaseDML.featurizer.setter - def featurizer(self, featurizer): - self._featurizer = clone(featurizer, safe=False) - if self._linear_first_stages: - self._update_models() - # updating the models will set a misleading invalidation message, so overwrite it - self._cache_invalid_message = ("Changing the featurizer when linear_first_stages is True " - "invalidates stored nuisance results") - else: - # only the final model needs to change - self._rlearner_model_final = self._prepare_final_model(self._model_final) - - @property - def fit_cate_intercept(self): - return self._fit_cate_intercept - - @fit_cate_intercept.setter - def fit_cate_intercept(self, fit_cate_intercept): - self._fit_cate_intercept = fit_cate_intercept - # only the final model needs to change - self._rlearner_model_final = self._prepare_final_model(self._model_final) + def refit(self, *, inference='auto'): + super().refit(inference=inference) @property def bias_part_of_coef(self): - return self.fit_cate_intercept - - @_OrthoLearner.discrete_treatment.setter - def discrete_treatment(self, discrete_treatement): - # super().discrete_treatment = discrete_treatment - super(DML, DML).discrete_treatment.__set__(self, discrete_treatement) - # need to modify first-stage models in response, although unless model_t is 'auto' - # the treatment model probably also needs to be updated from a classifier to a regressor or vice-versa... 
- self._update_models() - - @property - def model_t(self): - return self._model_t - - @model_t.setter - def model_t(self, model_t): - model_t = clone(model_t, safe=False) - self._rlearner_model_t = self._prepare_model_y(model_t) - - @property - def model_y(self): - return self._model_y - - @model_y.setter - def model_y(self, model_y): - model_y = clone(model_y, safe=False) - self._rlearner_model_y = self._prepare_model_y(model_y) - - @_RLearner.rlearner_model_y.setter - def rlearner_model_y(self, model): - raise AttributeError("rlearner_model_y cannot be set directly on a DML instance; " - "set the model_y attribute instead.") - - @_RLearner.rlearner_model_t.setter - def rlearner_model_t(self, model): - raise AttributeError("rlearner_model_t cannot be set directly on a DML instance; " - "set the model_t attribute instead.") - - @_RLearner.rlearner_model_final.setter - def rlearner_model_final(self, model): - raise AttributeError("rlearner_model_final cannot be set directly on a DML instance; " - "set the model_final attribute instead.") - - # Setting the random_state affects Y and T nuisances models if they are auto - @_OrthoLearner.random_state.setter - def random_state(self, random_state): - # super().random_state = random_state - super(DML, DML).random_state.__set__(self, random_state) - self._update_models() + return self.rlearner_model_final._fit_cate_intercept class LinearDML(StatsModelsCateEstimatorMixin, DML): @@ -688,6 +601,9 @@ class LinearDML(StatsModelsCateEstimatorMixin, DML): Unless an iterable is used, we call `split(X,T)` to generate the splits. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; @@ -706,19 +622,22 @@ def __init__(self, discrete_treatment=False, categories='auto', n_splits=2, - random_state=None, - monte_carlo_iterations=None): + monte_carlo_iterations=None, + random_state=None): super().__init__(model_y=model_y, model_t=model_t, - model_final=StatsModelsLinearRegression(fit_intercept=False), + model_final=None, featurizer=featurizer, fit_cate_intercept=fit_cate_intercept, linear_first_stages=linear_first_stages, discrete_treatment=discrete_treatment, categories=categories, n_splits=n_splits, - random_state=random_state, - monte_carlo_iterations=monte_carlo_iterations) + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state,) + + def _gen_model_final(self): + return StatsModelsLinearRegression(fit_intercept=False) # override only so that we can update the docstring to indicate support for `StatsModelsInference` @_deprecate_positional("X and W should be passed by keyword only. In a future release " @@ -748,7 +667,6 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou must support a 'groups' argument to its split method. cache_values: bool, default False Whether to cache inputs and first stage results, which will allow refitting a different final model - inference: string, :class:`.Inference` instance, or None Method for performing inference. 
This estimator supports 'bootstrap' (or an instance of :class:`.BootstrapInference`) and 'statsmodels' @@ -763,10 +681,14 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou cache_values=cache_values, inference=inference) - @DML.model_final.setter + @property + def model_final(self): + return self._gen_model_final() + + @model_final.setter def model_final(self, model): - raise AttributeError("LinearDML final model can't be chnaged from " - "StatsModelsLinearRegression(fit_intercept=False)") + if model is not None: + raise ValueError("Parameter `model_final` cannot be altered for this estimator!") class SparseLinearDML(DebiasedLassoCateEstimatorMixin, DML): @@ -844,6 +766,9 @@ class SparseLinearDML(DebiasedLassoCateEstimatorMixin, DML): Unless an iterable is used, we call `split(X,T)` to generate the splits. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; @@ -865,28 +790,29 @@ def __init__(self, discrete_treatment=False, categories='auto', n_splits=2, - random_state=None, - monte_carlo_iterations=None): - self._alpha = alpha - self._max_iter = max_iter - self._tol = tol - model_final = MultiOutputDebiasedLasso( - alpha=alpha, - fit_intercept=False, - max_iter=max_iter, - tol=tol, - random_state=random_state) + monte_carlo_iterations=None, + random_state=None): + self.alpha = alpha + self.max_iter = max_iter + self.tol = tol super().__init__(model_y=model_y, model_t=model_t, - model_final=model_final, + model_final=None, featurizer=featurizer, fit_cate_intercept=fit_cate_intercept, linear_first_stages=linear_first_stages, discrete_treatment=discrete_treatment, categories=categories, n_splits=n_splits, - random_state=random_state, - monte_carlo_iterations=monte_carlo_iterations) + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state) + + def _gen_model_final(self): + return MultiOutputDebiasedLasso(alpha=self.alpha, + fit_intercept=False, + max_iter=self.max_iter, + tol=self.tol, + random_state=self.random_state) @_deprecate_positional("X and W should be passed by keyword only. 
In a future release " "we will disallow passing X and W by position.", ['X', 'W']) @@ -937,79 +863,31 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou sample_weight=sample_weight, sample_var=None, groups=groups, cache_values=cache_values, inference=inference) - @DML.model_final.setter - def model_final(self, model): - raise AttributeError("SparseLinearDML final model can't be set directly;" - "instead set alpha, tol, or max_iter to change the debiased lasso settings") - - @property - def alpha(self): - return self._alpha - - @alpha.setter - def alpha(self, alpha): - self._alpha = alpha - self._model_final = MultiOutputDebiasedLasso( - alpha=self.alpha, - fit_intercept=False, - max_iter=self.max_iter, - tol=self.tol, - random_state=self.random_state) - @property - def max_iter(self): - return self._max_iter - - @max_iter.setter - def max_iter(self, max_iter): - self.max_iter = max_iter - self._model_final = MultiOutputDebiasedLasso( - alpha=self.alpha, - fit_intercept=False, - max_iter=self.max_iter, - tol=self.tol, - random_state=self.random_state) + def model_final(self): + return self._gen_model_final() - @property - def tol(self): - return self._tol - - @tol.setter - def tol(self, tol): - self._tol = tol - self._model_final = MultiOutputDebiasedLasso( - alpha=self.alpha, - fit_intercept=False, - max_iter=self.max_iter, - tol=self.tol, - random_state=self.random_state) - - @_OrthoLearner.random_state.setter - def random_state(self, random_state): - # super().random_state = random_state - super(SparseLinearDML, SparseLinearDML).random_state.__set__(self, random_state) - self._model_final = MultiOutputDebiasedLasso( - alpha=self.alpha, - fit_intercept=False, - max_iter=self.max_iter, - tol=self.tol, - random_state=self.random_state) + @model_final.setter + def model_final(self, model): + if model is not None: + raise ValueError("Parameter `model_final` cannot be altered for this estimator!") class _RandomFeatures(TransformerMixin): def __init__(self, dim, bw, random_state): - self._dim = dim - self._bw = bw - self._random_state = random_state + self.dim = dim + self.bw = bw + self.random_state = random_state def fit(self, X): - random_state = check_random_state(self._random_state) - self.omegas = random_state.normal(0, 1 / self._bw, size=(shape(X)[1], self._dim)) - self.biases = random_state.uniform(0, 2 * np.pi, size=(1, self._dim)) + random_state = check_random_state(self.random_state) + self.omegas_ = random_state.normal(0, 1 / self.bw, size=(shape(X)[1], self.dim)) + self.biases_ = random_state.uniform(0, 2 * np.pi, size=(1, self.dim)) + self.dim_ = self.dim return self def transform(self, X): - return np.sqrt(2 / self._dim) * np.cos(np.matmul(X, self.omegas) + self.biases) + return np.sqrt(2 / self.dim_) * np.cos(np.matmul(X, self.omegas_) + self.biases_) class KernelDML(DML): @@ -1064,6 +942,9 @@ class KernelDML(DML): Unless an iterable is used, we call `split(X,T)` to generate the splits. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. 
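The refactored ``_RandomFeatures`` transformer above follows the sklearn convention of keeping only hyperparameters in ``__init__`` and storing fitted state in trailing-underscore attributes; it computes the random Fourier feature map ``sqrt(2 / dim) * cos(X @ omegas_ + biases_)``. A small illustrative check (not part of the patch)::

    import numpy as np
    from econml.dml import _RandomFeatures

    X = np.random.normal(size=(10, 4))
    rf = _RandomFeatures(dim=20, bw=1.0, random_state=0).fit(X)
    print(rf.transform(X).shape)   # (10, 20)
    print(rf.omegas_.shape)        # (4, 20), entries drawn with standard deviation 1 / bw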
+ random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; @@ -1075,53 +956,44 @@ class KernelDML(DML): """ def __init__(self, model_y='auto', model_t='auto', fit_cate_intercept=True, - dim=20, bw=1.0, discrete_treatment=False, categories='auto', n_splits=2, random_state=None, - monte_carlo_iterations=None): - super().__init__(model_y=model_y, model_t=model_t, - model_final=ElasticNetCV(fit_intercept=False, random_state=random_state), - featurizer=_RandomFeatures(dim, bw, random_state), + dim=20, bw=1.0, discrete_treatment=False, categories='auto', n_splits=2, + monte_carlo_iterations=None, random_state=None): + self.dim = dim + self.bw = bw + super().__init__(model_y=model_y, + model_t=model_t, + model_final=None, + featurizer=None, fit_cate_intercept=fit_cate_intercept, discrete_treatment=discrete_treatment, categories=categories, - n_splits=n_splits, random_state=random_state, - monte_carlo_iterations=monte_carlo_iterations) + n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state) - @property - def bw(self): - return self._bw + def _gen_model_final(self): + return ElasticNetCV(fit_intercept=False, random_state=self.random_state) - @bw.setter - def bw(self, bw): - self._bw = bw - # super().featurizer = _RandomFeatures(self._dim, self._bw, self.random_state) - super(KernelDML, KernelDML).featurizer.__set__(self, _RandomFeatures(self._dim, self._bw, self.random_state)) + def _gen_featurizer(self): + return _RandomFeatures(self.dim, self.bw, self.random_state) @property - def dim(self): - return self._dim + def featurizer(self): + return self._gen_featurizer() - @dim.setter - def dim(self, dim): - self._dim = dim - # super().featurizer = _RandomFeatures(self._dim, self._bw, self.random_state) - super(KernelDML, KernelDML).featurizer.__set__(self, _RandomFeatures(self._dim, self._bw, self.random_state)) + @featurizer.setter + def featurizer(self, value): + if value is not None: + raise ValueError("Parameter `featurizer` cannot be altered for this estimator!") - @_BaseDML.featurizer.setter - def featurizer(self, dim): - raise AttributeError("KernelDML featurizer can't be set directly; " - "instead set the bw and dim attributes to modify the kernel") + @property + def model_final(self): + return self._gen_model_final() - @DML.model_final.setter + @model_final.setter def model_final(self, model): - raise AttributeError("KernelDML final model can't be changed from ElasticNetCV") - - @_OrthoLearner.random_state.setter - def random_state(self, random_state): - # super().random_state = random_state - super(KernelDML, KernelDML).random_state.__set__(self, random_state) - # super().model_final = ElasticNetCV(fit_intercept=False, random_state=random_state) - self._model_final = ElasticNetCV(fit_intercept=False, - random_state=random_state) + if model is not None: + raise ValueError("Parameter `model_final` cannot be altered for this estimator!") class NonParamDML(_BaseDML): @@ -1174,6 +1046,9 @@ class NonParamDML(_BaseDML): Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. 
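``NonParamDML`` likewise keeps its models as constructor attributes, wraps them lazily through the new ``_gen_*`` methods, and maps 'auto' inference to ``GenericSingleTreatmentModelFinalInference`` (added below). A usage sketch on hypothetical data (illustrative only)::

    import numpy as np
    from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
    from econml.dml import NonParamDML

    np.random.seed(0)
    X = np.random.normal(size=(1000, 5))
    T = np.random.binomial(1, 0.5, size=(1000,))
    Y = (1 + X[:, 0]) * T + X[:, 1] + np.random.normal(size=(1000,))

    est = NonParamDML(model_y=GradientBoostingRegressor(),
                      model_t=GradientBoostingClassifier(),
                      model_final=GradientBoostingRegressor(),
                      discrete_treatment=True)
    est.fit(Y, T, X=X, cache_values=True, inference=None)
    print(est.effect(X[:5]))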
+ random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; @@ -1190,60 +1065,46 @@ def __init__(self, discrete_treatment=False, categories='auto', n_splits=2, - random_state=None, - monte_carlo_iterations=None): + monte_carlo_iterations=None, + random_state=None): # TODO: consider whether we need more care around stateful featurizers, # since we clone it and fit separate copies - model_y = clone(model_y, safe=False) - model_t = clone(model_t, safe=False) - self._model_y = model_y - self._model_t = model_t - self._featurizer = clone(featurizer, safe=False) - - super().__init__(model_y=_FirstStageWrapper(model_y, True, - featurizer, False, discrete_treatment), - model_t=_FirstStageWrapper(model_t, False, - featurizer, False, discrete_treatment), - model_final=_FinalWrapper(model_final, False, featurizer, True), - discrete_treatment=discrete_treatment, + self.model_y = clone(model_y, safe=False) + self.model_t = clone(model_t, safe=False) + self.featurizer = clone(featurizer, safe=False) + self.model_final = clone(model_final, safe=False) + super().__init__(discrete_treatment=discrete_treatment, categories=categories, n_splits=n_splits, - random_state=random_state, - monte_carlo_iterations=monte_carlo_iterations) + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state) - @_BaseDML.featurizer.setter - def featurizer(self, featurizer): - self._featurizer = clone(featurizer, safe=False) - # featurizer isn't actually used by first stage models since linear_first_stages is False, - # so we only need to update model_final - # _BaseDML's final model setter reuses its old featurizer - # so we need to pass _BaseDML, not DML to the super() call - # super(_BasaeDML).model_final = _FinalWrapper(...) 
- super(_BaseDML, _BaseDML).model_final.__set__(self, _FinalWrapper( - self.model_final, False, self._featurizer, True)) + def _get_inference_options(self): + # add blb to parent's options + options = super()._get_inference_options() + options.update(auto=GenericSingleTreatmentModelFinalInference) + return options - @property - def model_t(self): - return self._model_t + def _gen_model_y(self): + return _FirstStageWrapper(clone(self.model_y, safe=False), True, + self.featurizer, False, self.discrete_treatment) - @model_t.setter - def model_t(self, model_t): - model_t = clone(model_t, safe=False) - self._model_t = model_t - self._rlearner_model_t = _FirstStageWrapper(model_t, False, - self.featurizer, False, self.discrete_treatment) + def _gen_model_t(self): + return _FirstStageWrapper(clone(self.model_t, safe=False), False, + self.featurizer, False, self.discrete_treatment) - @property - def model_y(self): - return self._model_y + def _gen_featurizer(self): + return clone(self.featurizer, safe=False) - @model_y.setter - def model_y(self, model_y): - model_y = clone(model_y, safe=False) - self._model_y = model_y - self._rlearner_model_y = _FirstStageWrapper(model_y, False, - self.featurizer, False, self.discrete_treatment) + def _gen_model_final(self): + return clone(self.model_final, safe=False) + + def _gen_rlearner_model_final(self): + return _FinalWrapper(self._gen_model_final(), False, self.featurizer, True) + + def refit(self, *, inference='auto'): + super().refit(inference=inference) class ForestDML(ForestModelFinalCateEstimatorMixin, NonParamDML): @@ -1268,7 +1129,7 @@ class ForestDML(ForestModelFinalCateEstimatorMixin, NonParamDML): The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). The first category will be treated as the control treatment. - n_crossfit_splits: int, cross-validation generator or an iterable, optional (Default=2) + n_splits: int, cross-validation generator or an iterable, optional (Default=2) Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -1285,6 +1146,9 @@ class ForestDML(ForestModelFinalCateEstimatorMixin, NonParamDML): Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + n_estimators : integer, optional (default=100) The total number of trees in the forest. 
The forest consists of a forest of sqrt(n_estimators) sub-forests, where each sub-forest @@ -1404,9 +1268,11 @@ class ForestDML(ForestModelFinalCateEstimatorMixin, NonParamDML): def __init__(self, model_y, model_t, + featurizer=None, discrete_treatment=False, categories='auto', - n_crossfit_splits=2, + n_splits=2, + monte_carlo_iterations=None, n_estimators=100, criterion="mse", max_depth=None, @@ -1420,41 +1286,45 @@ def __init__(self, honest=True, n_jobs=None, verbose=0, - random_state=None, - monte_carlo_iterations=None): - self._n_estimators = n_estimators - self._criterion = criterion - self._max_depth = max_depth - self._min_samples_split = min_samples_split - self._min_samples_leaf = min_samples_leaf - self._min_weight_fraction_leaf = min_weight_fraction_leaf - self._max_features = max_features - self._max_leaf_nodes = max_leaf_nodes - self._min_impurity_decrease = min_impurity_decrease - self._subsample_fr = subsample_fr - self._honest = honest - self._n_jobs = n_jobs - self._verbose = verbose - model_final = SubsampledHonestForest(n_estimators=n_estimators, - criterion=criterion, - max_depth=max_depth, - min_samples_split=min_samples_split, - min_samples_leaf=min_samples_leaf, - min_weight_fraction_leaf=min_weight_fraction_leaf, - max_features=max_features, - max_leaf_nodes=max_leaf_nodes, - min_impurity_decrease=min_impurity_decrease, - subsample_fr=subsample_fr, - honest=honest, - n_jobs=n_jobs, - random_state=random_state, - verbose=verbose) - super().__init__(model_y=model_y, model_t=model_t, - model_final=model_final, featurizer=None, + random_state=None): + self.n_estimators = n_estimators + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + self.subsample_fr = subsample_fr + self.honest = honest + self.n_jobs = n_jobs + self.verbose = verbose + super().__init__(model_y=model_y, + model_t=model_t, + model_final=None, + featurizer=featurizer, discrete_treatment=discrete_treatment, categories=categories, - n_splits=n_crossfit_splits, random_state=random_state, - monte_carlo_iterations=monte_carlo_iterations) + n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state) + + def _gen_model_final(self): + return SubsampledHonestForest(n_estimators=self.n_estimators, + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + max_features=self.max_features, + max_leaf_nodes=self.max_leaf_nodes, + min_impurity_decrease=self.min_impurity_decrease, + subsample_fr=self.subsample_fr, + honest=self.honest, + n_jobs=self.n_jobs, + random_state=self.random_state, + verbose=self.verbose) @_deprecate_positional("X and W should be passed by keyword only. In a future release " "we will disallow passing X and W by position.", ['X', 'W']) @@ -1484,7 +1354,7 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou must support a 'groups' argument to its split method. 
cache_values: bool, default False Whether to cache inputs and first stage results, which will allow refitting a different final model - inference: string, `Inference` instance, or None + inference: string, `Inference` instance, or None Method for performing inference. This estimator supports 'bootstrap' (or an instance of :class:`.BootstrapInference`) and 'blb' (for Bootstrap-of-Little-Bags based inference) @@ -1498,143 +1368,14 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou cache_values=cache_values, inference=inference) - @DML.model_final.setter - def model_final(self, model): - raise AttributeError("ForestDML final model can't be set directly;" - "instead set individual attributes to modify the SubsampledHonestForest settings") - - def _set_model_final(self): - self._rlearner_model_final = SubsampledHonestForest(n_estimators=self._n_estimators, - criterion=self._criterion, - max_depth=self._max_depth, - min_samples_split=self._min_samples_split, - min_samples_leaf=self._min_samples_leaf, - min_weight_fraction_leaf=self._min_weight_fraction_leaf, - max_features=self._max_features, - max_leaf_nodes=self._max_leaf_nodes, - min_impurity_decrease=self._min_impurity_decrease, - subsample_fr=self._subsample_fr, - honest=self._honest, - n_jobs=self._n_jobs, - random_state=self._random_state, - verbose=self._verbose) - - @property - def n_estimators(self): - return self._n_estimators - - @n_estimators.setter - def n_estimators(self, n_estimators): - self._n_estimators = n_estimators - self._set_model_final() - - @property - def criterion(s): - return self._criterion - - @criterion.setter - def criterion(self, criterion): - self._criterion = criterion - self._set_model_final() - @property - def max_depth(s): - return self._max_depth - - @max_depth.setter - def max_depth(self, max_depth): - self._max_depth = max_depth - self._set_model_final() - - @property - def min_samples_split(self, min): - return self._min_samples_split - - @min_samples_split.setter - def min_samples_split(self, min_samples_split): - self._min_samples_split = min_samples_split - self._set_model_final() - - @property - def min_samples_leaf(self, mi): - return self._min_samples_leaf - - @min_samples_leaf.setter - def min_samples_leaf(self, min_samples_leaf): - self._min_samples_leaf = min_samples_leaf - self._set_model_final() - - @property - def min_weight_fraction_leaf(self, min_weight): - return self._min_weight_fraction_leaf - - @min_weight_fraction_leaf.setter - def min_weight_fraction_leaf(self, min_weight_fraction_leaf): - self._min_weight_fraction_leaf = min_weight_fraction_leaf - self._set_model_final() - - @property - def max_features(self): - return self._max_features - - @max_features.setter - def max_features(self, max_features): - self._max_features = max_features - self._set_model_final() - - @property - def max_leaf_nodes(self, ): - return self._max_leaf_nodes - - @max_leaf_nodes.setter - def max_leaf_nodes(self, max_leaf_nodes): - self._max_leaf_nodes = max_leaf_nodes - self._set_model_final() - - @property - def min_impurity_decrease(self, min_imp): - return self._min_impurity_decrease - - @min_impurity_decrease.setter - def min_impurity_decrease(self, min_impurity_decrease): - self._min_impurity_decrease = min_impurity_decrease - self._set_model_final() - - @property - def subsample_fr(self): - return self._subsample_fr - - @subsample_fr.setter - def subsample_fr(self, subsample_fr): - self._subsample_fr = subsample_fr - self._set_model_final() - - @property - def 
honest(self): - return self._honest - - @honest.setter - def honest(self, honest): - self._honest = honest - self._set_model_final() - - @property - def n_jobs(self): - return self._n_jobs - - @n_jobs.setter - def n_jobs(self, n_jobs): - self._n_jobs = n_jobs - self._set_model_final() - - @property - def verbose(self): - return self._verbose + def model_final(self): + return self._gen_model_final() - @verbose.setter - def verbose(self, verbose): - self._verbose = verbose - self._set_model_final() + @model_final.setter + def model_final(self, model): + if model is not None: + raise ValueError("Parameter `model_final` cannot be altered for this estimator!") @deprecated("The DMLCateEstimator class has been renamed to DML; " diff --git a/econml/drlearner.py b/econml/drlearner.py index b98e15326..7c5668d05 100644 --- a/econml/drlearner.py +++ b/econml/drlearner.py @@ -258,6 +258,9 @@ class takes as input the parameter ``model_regressor``, which is an arbitrary sc Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; @@ -367,23 +370,48 @@ def __init__(self, model_propensity='auto', min_propensity=1e-6, categories='auto', n_splits=2, + monte_carlo_iterations=None, random_state=None): - if model_propensity == 'auto': - model_propensity = LogisticRegressionCV(cv=3, solver='lbfgs', multi_class='auto', - random_state=random_state) - if model_regression == 'auto': - model_regression = WeightedLassoCVWrapper(cv=3, random_state=random_state) - self._multitask_model_final = multitask_model_final - super().__init__(_ModelNuisance(model_propensity, model_regression, min_propensity), - _ModelFinal(model_final, featurizer, multitask_model_final), - n_splits=n_splits, discrete_treatment=True, + self.model_propensity = clone(model_propensity, safe=False) + self.model_regression = clone(model_regression, safe=False) + self.model_final = clone(model_final, safe=False) + self.multitask_model_final = multitask_model_final + self.featurizer = clone(featurizer, safe=False) + self.min_propensity = min_propensity + super().__init__(n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, + discrete_treatment=True, discrete_instrument=False, # no instrument, so doesn't matter categories=categories, random_state=random_state) + def _gen_ortho_learner_model_nuisance(self): + if self.model_propensity == 'auto': + model_propensity = LogisticRegressionCV(cv=3, solver='lbfgs', multi_class='auto', + random_state=self.random_state) + else: + model_propensity = clone(self.model_propensity, safe=False) + + if self.model_regression == 'auto': + model_regression = WeightedLassoCVWrapper(cv=3, random_state=self.random_state) + else: + model_regression = clone(self.model_regression, safe=False) + + return _ModelNuisance(model_propensity, model_regression, self.min_propensity) + + def _gen_featurizer(self): + return clone(self.featurizer, safe=False) + + def _gen_model_final(self): + return clone(self.model_final, safe=False) + + def _gen_ortho_learner_model_final(self): + return _ModelFinal(self._gen_model_final(), 
self._gen_featurizer(), self.multitask_model_final) + @_deprecate_positional("X and W should be passed by keyword only. In a future release " "we will disallow passing X and W by position.", ['X', 'W']) - def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, inference=None): + def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference=None): """ Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. @@ -405,6 +433,8 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou All rows corresponding to the same group will be kept together during splitting. If groups is not None, the n_splits argument passed to this class's initializer must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model inference: string, :class:`.Inference` instance, or None Method for performing inference. This estimator supports 'bootstrap' (or an instance of :class:`.BootstrapInference`). @@ -416,7 +446,7 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou # Replacing fit from _OrthoLearner, to enforce Z=None and improve the docstring return super().fit(Y, T, X=X, W=W, sample_weight=sample_weight, sample_var=sample_var, groups=groups, - inference=inference) + cache_values=cache_values, inference=inference) def score(self, Y, T, X=None, W=None): """ @@ -458,9 +488,9 @@ def multitask_model_cate(self): vector of outcomes correspond to the CATE model for each treatment, compared to baseline. Available only when multitask_model_final=True. """ - if not self._multitask_model_final: + if not self.ortho_learner_model_final._multitask_model_final: raise AttributeError("Separate CATE models were fitted for each treatment! Use model_cate.") - return super().model_final.model_cate + return self.ortho_learner_model_final.model_cate def model_cate(self, T=1): """ @@ -477,12 +507,12 @@ def model_cate(self, T=1): An instance of the model_final object that was fitted after calling fit which corresponds to the CATE model for treatment T=t, compared to baseline. Available when multitask_model_final=False. """ - if self._multitask_model_final: + if self.ortho_learner_model_final._multitask_model_final: raise AttributeError("A single multitask model was fitted for all treatments! Use multitask_model_cate.") _, T = self._expand_treatments(None, T) ind = inverse_onehot(T).item() - 1 assert ind >= 0, "No model was fitted for the control" - return super().model_final.models_cate[ind] + return self.ortho_learner_model_final.models_cate[ind] @property def models_propensity(self): @@ -521,7 +551,7 @@ def nuisance_scores_regression(self): return self.nuisance_scores_[1] @property - def featurizer(self): + def featurizer_(self): """ Get the fitted featurizer. @@ -531,7 +561,7 @@ def featurizer(self): An instance of the fitted featurizer that was used to preprocess X in the final CATE model training. Available only when featurizer is not None and X is not None. 
""" - return super().model_final._featurizer + return self.ortho_learner_model_final._featurizer def cate_feature_names(self, feature_names=None): """ @@ -556,14 +586,22 @@ def cate_feature_names(self, feature_names=None): return None if feature_names is None: feature_names = self._input_names["feature_names"] - if self.featurizer is None: + if self.featurizer_ is None: return feature_names - elif hasattr(self.featurizer, 'get_feature_names'): + elif hasattr(self.featurizer_, 'get_feature_names'): # This fails if X=None and featurizer is not None, but that case is handled above - return self.featurizer.get_feature_names(feature_names) + return self.featurizer_.get_feature_names(feature_names) else: raise AttributeError("Featurizer does not have a method: get_feature_names!") + @property + def model_final_(self): + return self.ortho_learner_model_final._model_final + + @property + def fitted_models_final(self): + return self.ortho_learner_model_final.models_cate + class LinearDRLearner(StatsModelsCateEstimatorDiscreteMixin, DRLearner): """ @@ -642,6 +680,9 @@ class LinearDRLearner(StatsModelsCateEstimatorDiscreteMixin, DRLearner): Unless an iterable is used, we call `split(X,T)` to generate the splits. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; @@ -699,21 +740,31 @@ def __init__(self, fit_cate_intercept=True, min_propensity=1e-6, categories='auto', - n_splits=2, random_state=None): + n_splits=2, + monte_carlo_iterations=None, + random_state=None): self.fit_cate_intercept = fit_cate_intercept super().__init__(model_propensity=model_propensity, model_regression=model_regression, - model_final=StatsModelsLinearRegression(fit_intercept=fit_cate_intercept), + model_final=None, featurizer=featurizer, multitask_model_final=False, min_propensity=min_propensity, categories=categories, n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, random_state=random_state) + def _gen_model_final(self): + return StatsModelsLinearRegression(fit_intercept=self.fit_cate_intercept) + + def _gen_ortho_learner_model_final(self): + return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), False) + @_deprecate_positional("X and W should be passed by keyword only. In a future release " "we will disallow passing X and W by position.", ['X', 'W']) - def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, inference='auto'): + def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference='auto'): """ Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. @@ -735,6 +786,8 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou All rows corresponding to the same group will be kept together during splitting. If groups is not None, the n_splits argument passed to this class's initializer must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model inference: string, :class:`.Inference` instance, or None Method for performing inference. 
This estimator supports ``'bootstrap'`` (or an instance of :class:`.BootstrapInference`) and ``'statsmodels'`` @@ -747,7 +800,10 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou # Replacing fit from DRLearner, to add statsmodels inference in docstring return super().fit(Y, T, X=X, W=W, sample_weight=sample_weight, sample_var=sample_var, groups=groups, - inference=inference) + cache_values=cache_values, inference=inference) + + def refit(self, *, inference='auto'): + super().refit(inference=inference) @property def multitask_model_cate(self): @@ -756,12 +812,22 @@ def multitask_model_cate(self): return super().multitask_model_cate @property - def model_final(self): - return super().model_final._model_final + def multitask_model_final(self): + return False + + @multitask_model_final.setter + def multitask_model_final(self, value): + if value: + raise ValueError("Parameter `multitask_model_final` cannot change from `False` for this estimator!") @property - def fitted_models_final(self): - return super().model_final.models_cate + def model_final(self): + return self._gen_model_final() + + @model_final.setter + def model_final(self, model): + if model is not None: + raise ValueError("Parameter `model_final` cannot be altered for this estimator!") class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner): @@ -853,6 +919,9 @@ class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner): Unless an iterable is used, we call `split(X,T)` to generate the splits. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; @@ -913,26 +982,34 @@ def __init__(self, tol=1e-4, min_propensity=1e-6, categories='auto', - n_splits=2, random_state=None): + n_splits=2, + monte_carlo_iterations=None, + random_state=None): self.fit_cate_intercept = fit_cate_intercept - model_final = DebiasedLasso( - alpha=alpha, - fit_intercept=fit_cate_intercept, - max_iter=max_iter, - tol=tol) super().__init__(model_propensity=model_propensity, model_regression=model_regression, - model_final=model_final, + model_final=None, featurizer=featurizer, multitask_model_final=False, min_propensity=min_propensity, categories=categories, n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, random_state=random_state) + def _gen_model_final(self): + return DebiasedLasso(alpha=self.alpha, + fit_intercept=self.fit_cate_intercept, + max_iter=self.max_iter, + tol=self.tol) + + def _gen_ortho_learner_model_final(self): + return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), False) + @_deprecate_positional("X and W should be passed by keyword only. In a future release " "we will disallow passing X and W by position.", ['X', 'W']) - def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, inference='auto'): + def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference='auto'): """ Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. 
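Note on the pattern above: the preceding hunks for LinearDRLearner and SparseLinearDRLearner stop constructing the final model eagerly in `__init__`; instead `_gen_model_final` builds it on demand, `fit` gains a `cache_values` flag, and `refit` re-runs only the final stage. A minimal sketch of the intended workflow under those semantics (data, seed and hyperparameter choices are illustrative)::

    import numpy as np
    from econml.drlearner import LinearDRLearner

    # toy data with a binary treatment
    np.random.seed(0)
    X = np.random.normal(size=(500, 3))
    T = np.random.binomial(1, 0.5, size=500)
    y = T * (1 + X[:, 0]) + np.random.normal(size=500)

    est = LinearDRLearner(n_splits=2)
    # cache_values=True keeps the first-stage nuisance estimates around
    est.fit(y, T, X=X, cache_values=True)

    # change a final-stage setting on the estimator ...
    est.fit_cate_intercept = False
    # ... and re-fit only the final stage, reusing the cached nuisances
    est.refit(inference='auto')
    print(est.effect(X[:5]))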
@@ -954,6 +1031,8 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou All rows corresponding to the same group will be kept together during splitting. If groups is not None, the n_splits argument passed to this class's initializer must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model inference: string, :class:`.Inference` instance, or None Method for performing inference. This estimator supports ``'bootstrap'`` (or an instance of :class:`.BootstrapInference`) and ``'debiasedlasso'`` @@ -969,26 +1048,33 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou warn("This estimator does not yet support sample variances and inference does not take " "sample variances into account. This feature will be supported in a future release.") check_high_dimensional(X, T, threshold=5, featurizer=self.featurizer, - discrete_treatment=self._discrete_treatment, + discrete_treatment=self.discrete_treatment, msg="The number of features in the final model (< 5) is too small for a sparse model. " "We recommend using the LinearDRLearner for this low-dimensional setting.") return super().fit(Y, T, X=X, W=W, sample_weight=sample_weight, sample_var=None, groups=groups, - inference=inference) + cache_values=cache_values, inference=inference) + + def refit(self, *, inference='auto'): + super().refit(inference=inference) @property - def multitask_model_cate(self): - # Replacing this method which is invalid for this class, so that we make the - # dosctring empty and not appear in the docs. - return super().multitask_model_cate + def multitask_model_final(self): + return False + + @multitask_model_final.setter + def multitask_model_final(self, value): + if value: + raise ValueError("Parameter `multitask_model_final` cannot change from `False` for this estimator!") @property def model_final(self): - return super().model_final._model_final + return self._gen_model_final() - @property - def fitted_models_final(self): - return super().model_final.models_cate + @model_final.setter + def model_final(self, model): + if model is not None: + raise ValueError("Parameter `model_final` cannot be altered for this estimator!") class ForestDRLearner(ForestModelFinalCateEstimatorDiscreteMixin, DRLearner): @@ -1015,7 +1101,7 @@ class ForestDRLearner(ForestModelFinalCateEstimatorDiscreteMixin, DRLearner): The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). The first category will be treated as the control treatment. - n_crossfit_splits: int, cross-validation generator or an iterable, optional (Default=2) + n_splits: int, cross-validation generator or an iterable, optional (Default=2) Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -1032,6 +1118,9 @@ class ForestDRLearner(ForestModelFinalCateEstimatorDiscreteMixin, DRLearner): Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + n_estimators : integer, optional (default=100) The total number of trees in the forest. 
The forest consists of a forest of sqrt(n_estimators) sub-forests, where each sub-forest @@ -1148,9 +1237,11 @@ class ForestDRLearner(ForestModelFinalCateEstimatorDiscreteMixin, DRLearner): def __init__(self, model_regression, model_propensity, + featurizer=None, min_propensity=1e-6, categories='auto', - n_crossfit_splits=2, + n_splits=2, + monte_carlo_iterations=None, n_estimators=1000, criterion="mse", max_depth=None, @@ -1165,30 +1256,53 @@ def __init__(self, n_jobs=None, verbose=0, random_state=None): - model_final = SubsampledHonestForest(n_estimators=n_estimators, - criterion=criterion, - max_depth=max_depth, - min_samples_split=min_samples_split, - min_samples_leaf=min_samples_leaf, - min_weight_fraction_leaf=min_weight_fraction_leaf, - max_features=max_features, - max_leaf_nodes=max_leaf_nodes, - min_impurity_decrease=min_impurity_decrease, - subsample_fr=subsample_fr, - honest=honest, - n_jobs=n_jobs, - random_state=random_state, - verbose=verbose) - super().__init__(model_regression=model_regression, model_propensity=model_propensity, - model_final=model_final, featurizer=None, + self.n_estimators = n_estimators + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + self.subsample_fr = subsample_fr + self.honest = honest + self.n_jobs = n_jobs + self.verbose = verbose + super().__init__(model_regression=model_regression, + model_propensity=model_propensity, + model_final=None, + featurizer=featurizer, multitask_model_final=False, min_propensity=min_propensity, categories=categories, - n_splits=n_crossfit_splits, random_state=random_state) + n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state) + + def _gen_model_final(self): + return SubsampledHonestForest(n_estimators=self.n_estimators, + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + max_features=self.max_features, + max_leaf_nodes=self.max_leaf_nodes, + min_impurity_decrease=self.min_impurity_decrease, + subsample_fr=self.subsample_fr, + honest=self.honest, + n_jobs=self.n_jobs, + random_state=self.random_state, + verbose=self.verbose) + + def _gen_ortho_learner_model_final(self): + return _ModelFinal(self._gen_model_final(), self._gen_featurizer(), False) @_deprecate_positional("X and W should be passed by keyword only. In a future release " "we will disallow passing X and W by position.", ['X', 'W']) - def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, inference='auto'): + def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference='auto'): """ Estimate the counterfactual model from data, i.e. estimates functions τ(·,·,·), ∂τ(·,·). @@ -1211,6 +1325,8 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou All rows corresponding to the same group will be kept together during splitting. If groups is not None, the n_splits argument passed to this class's initializer must support a 'groups' argument to its split method. 
+ cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model inference: string, `Inference` instance, or None Method for performing inference. This estimator supports 'bootstrap' (or an instance of :class:`.BootstrapInference`) and 'blb' @@ -1222,16 +1338,29 @@ def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, grou """ return super().fit(Y, T, X=X, W=W, sample_weight=sample_weight, sample_var=None, groups=groups, - inference=inference) + cache_values=cache_values, inference=inference) + + def refit(self, *, inference='auto'): + super().refit(inference=inference) def multitask_model_cate(self): # Replacing to remove docstring super().multitask_model_cate() @property - def model_final(self): - return super().model_final._model_final + def multitask_model_final(self): + return False + + @multitask_model_final.setter + def multitask_model_final(self, value): + if value: + raise ValueError("Parameter `multitask_model_final` cannot change from `False` for this estimator!") @property - def fitted_models_final(self): - return super().model_final.models_cate + def model_final(self): + return self._gen_model_final() + + @model_final.setter + def model_final(self, model): + if model is not None: + raise ValueError("Parameter `model_final` cannot be altered for this estimator!") diff --git a/econml/inference.py b/econml/inference.py index 1e696561f..0ba29f20a 100644 --- a/econml/inference.py +++ b/econml/inference.py @@ -95,8 +95,8 @@ class has a model_final method, whose predict(cross_product(X, [0, ..., 1, ..., """ def prefit(self, estimator, *args, **kwargs): - self.model_final = estimator.model_final - self.featurizer = estimator.featurizer if hasattr(estimator, 'featurizer') else None + self.model_final = estimator.model_final_ + self.featurizer = estimator.featurizer_ if hasattr(estimator, 'featurizer_') else None def fit(self, estimator, *args, **kwargs): # once the estimator has been fit, it's kosher to store d_t here @@ -326,8 +326,8 @@ class GenericModelFinalInferenceDiscrete(Inference): """ def prefit(self, estimator, *args, **kwargs): - self.model_final = estimator.model_final - self.featurizer = estimator.featurizer if hasattr(estimator, 'featurizer') else None + self.model_final = estimator.model_final_ + self.featurizer = estimator.featurizer_ if hasattr(estimator, 'featurizer_') else None def fit(self, estimator, *args, **kwargs): # once the estimator has been fit, it's kosher to store d_t here diff --git a/econml/ortho_iv.py b/econml/ortho_iv.py index 0ad0134e5..021cd15f0 100644 --- a/econml/ortho_iv.py +++ b/econml/ortho_iv.py @@ -136,19 +136,26 @@ def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None class _BaseDMLATEIV(_OrthoLearner): - def __init__(self, model_nuisance, - discrete_instrument=False, discrete_treatment=False, + def __init__(self, discrete_instrument=False, + discrete_treatment=False, categories='auto', - n_splits=2, random_state=None): - - super().__init__(model_nuisance, _BaseDMLATEIVModelFinal(), - discrete_treatment=discrete_treatment, discrete_instrument=discrete_instrument, + n_splits=2, + monte_carlo_iterations=None, + random_state=None): + super().__init__(discrete_treatment=discrete_treatment, + discrete_instrument=discrete_instrument, categories=categories, - n_splits=n_splits, random_state=random_state) + n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state) + + def 
_gen_ortho_learner_model_final(self): + return _BaseDMLATEIVModelFinal() @_deprecate_positional("W and Z should be passed by keyword only. In a future release " "we will disallow passing W and Z by position.", ['W', 'Z']) - def fit(self, Y, T, Z, W=None, *, sample_weight=None, sample_var=None, groups=None, inference=None): + def fit(self, Y, T, Z, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference=None): """ Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. @@ -170,6 +177,8 @@ def fit(self, Y, T, Z, W=None, *, sample_weight=None, sample_var=None, groups=No All rows corresponding to the same group will be kept together during splitting. If groups is not None, the n_splits argument passed to this class's initializer must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model inference: string,:class:`.Inference` instance, or None Method for performing inference. This estimator supports 'bootstrap' (or an instance of:class:`.BootstrapInference`). @@ -181,7 +190,7 @@ def fit(self, Y, T, Z, W=None, *, sample_weight=None, sample_var=None, groups=No # Replacing fit from _OrthoLearner, to enforce W=None and improve the docstring return super().fit(Y, T, W=W, Z=Z, sample_weight=sample_weight, sample_var=sample_var, groups=groups, - inference=inference) + cache_values=cache_values, inference=inference) def score(self, Y, T, Z, W=None): """ @@ -271,22 +280,32 @@ class DMLATEIV(_BaseDMLATEIV): a biased ATE. """ - def __init__(self, model_Y_W, model_T_W, model_Z_W, - discrete_treatment=False, discrete_instrument=False, + def __init__(self, *, model_Y_W, model_T_W, model_Z_W, + discrete_treatment=False, + discrete_instrument=False, categories='auto', - n_splits=2, random_state=None): - - super().__init__(_DMLATEIVModelNuisance(model_Y_W=_FirstStageWrapper(model_Y_W, discrete_target=False), - model_T_W=_FirstStageWrapper( - model_T_W, discrete_target=discrete_treatment), - model_Z_W=_FirstStageWrapper( - model_Z_W, discrete_target=discrete_instrument)), - discrete_instrument=discrete_instrument, discrete_treatment=discrete_treatment, + n_splits=2, + monte_carlo_iterations=None, + random_state=None): + self.model_Y_W = clone(model_Y_W, safe=False) + self.model_T_W = clone(model_T_W, safe=False) + self.model_Z_W = clone(model_Z_W, safe=False) + super().__init__(discrete_instrument=discrete_instrument, + discrete_treatment=discrete_treatment, categories=categories, - n_splits=n_splits, random_state=random_state) + n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state) + + def _gen_ortho_learner_model_nuisance(self): + return _DMLATEIVModelNuisance( + model_Y_W=_FirstStageWrapper(clone(self.model_Y_W, safe=False), discrete_target=False), + model_T_W=_FirstStageWrapper(clone(self.model_T_W, safe=False), discrete_target=self.discrete_treatment), + model_Z_W=_FirstStageWrapper(clone(self.model_Z_W, safe=False), discrete_target=self.discrete_instrument)) class _ProjectedDMLATEIVModelNuisance: + def __init__(self, model_Y_W, model_T_W, model_T_WZ): self._model_Y_W = clone(model_Y_W, safe=False) self._model_T_W = clone(model_T_W, safe=False) @@ -330,21 +349,30 @@ def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None): class ProjectedDMLATEIV(_BaseDMLATEIV): - def __init__(self, model_Y_W, model_T_W, model_T_WZ, - discrete_treatment=False, 
discrete_instrument=False, + + def __init__(self, *, model_Y_W, model_T_W, model_T_WZ, + discrete_treatment=False, + discrete_instrument=False, categories='auto', - n_splits=2, random_state=None): + n_splits=2, + monte_carlo_iterations=None, + random_state=None): + self.model_Y_W = clone(model_Y_W, safe=False) + self.model_T_W = clone(model_T_W, safe=False) + self.model_T_WZ = clone(model_T_WZ, safe=False) + super().__init__(discrete_instrument=discrete_instrument, + discrete_treatment=discrete_treatment, + categories=categories, + n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state) - super().__init__(_ProjectedDMLATEIVModelNuisance( - model_Y_W=_FirstStageWrapper( - model_Y_W, discrete_target=False), - model_T_W=_FirstStageWrapper( - model_T_W, discrete_target=discrete_treatment), - model_T_WZ=_FirstStageWrapper( - model_T_WZ, discrete_target=discrete_treatment)), - discrete_treatment=discrete_treatment, discrete_instrument=discrete_instrument, - categories=categories, - n_splits=n_splits, random_state=random_state) + def _gen_ortho_learner_model_nuisance(self): + return _ProjectedDMLATEIVModelNuisance( + model_Y_W=_FirstStageWrapper(clone(self.model_Y_W, safe=False), discrete_target=False), + model_T_W=_FirstStageWrapper(clone(self.model_T_W, safe=False), discrete_target=self.discrete_treatment), + model_T_WZ=_FirstStageWrapper(clone(self.model_T_WZ, safe=False), + discrete_target=self.discrete_instrument)) class _BaseDMLIVModelNuisance: @@ -494,6 +522,9 @@ class _BaseDMLIV(_OrthoLearner): Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; @@ -501,18 +532,19 @@ class _BaseDMLIV(_OrthoLearner): by :mod:`np.random`. """ - def __init__(self, model_Y_X, model_T_X, model_T_XZ, model_final, - discrete_instrument=False, discrete_treatment=False, categories='auto', - n_splits=2, random_state=None): - super().__init__(_BaseDMLIVModelNuisance(model_Y_X, model_T_X, model_T_XZ), - _BaseDMLIVModelFinal(model_final), - discrete_treatment=discrete_treatment, discrete_instrument=discrete_instrument, + def __init__(self, discrete_instrument=False, discrete_treatment=False, categories='auto', + n_splits=2, monte_carlo_iterations=None, random_state=None): + super().__init__(discrete_treatment=discrete_treatment, + discrete_instrument=discrete_instrument, categories=categories, - n_splits=n_splits, random_state=random_state) + n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, + random_state=random_state) @_deprecate_positional("Z and X should be passed by keyword only. In a future release " "we will disallow passing Z and X by position.", ['X', 'Z']) - def fit(self, Y, T, Z, X=None, *, sample_weight=None, sample_var=None, groups=None, inference=None): + def fit(self, Y, T, Z, X=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference=None): """ Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. 
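As a usage note for the hunks above: DMLATEIV and ProjectedDMLATEIV now take their first-stage models as keyword-only arguments, store them unmodified, and only wrap them in `_FirstStageWrapper` inside `_gen_ortho_learner_model_nuisance` when fitting. A small sketch under that assumption (model choices are illustrative)::

    from sklearn.linear_model import LinearRegression, LogisticRegression
    from econml.ortho_iv import DMLATEIV

    # after this change the models must be passed by keyword
    est = DMLATEIV(model_Y_W=LinearRegression(),
                   model_T_W=LogisticRegression(),
                   model_Z_W=LogisticRegression(),
                   discrete_treatment=True,
                   discrete_instrument=True,
                   n_splits=2)

    # the stored model is the bare estimator, so tweaking it before fitting
    # is picked up when the nuisance wrappers are generated at fit time
    est.model_T_W.set_params(C=0.1)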
@@ -534,6 +566,8 @@ def fit(self, Y, T, Z, X=None, *, sample_weight=None, sample_var=None, groups=No All rows corresponding to the same group will be kept together during splitting. If groups is not None, the n_splits argument passed to this class's initializer must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model inference: string,:class:`.Inference` instance, or None Method for performing inference. This estimator supports 'bootstrap' (or an instance of:class:`.BootstrapInference`). @@ -545,7 +579,7 @@ def fit(self, Y, T, Z, X=None, *, sample_weight=None, sample_var=None, groups=No # Replacing fit from _OrthoLearner, to enforce W=None and improve the docstring return super().fit(Y, T, X=X, Z=Z, sample_weight=sample_weight, sample_var=sample_var, groups=groups, - inference=inference) + cache_values=cache_values, inference=inference) def score(self, Y, T, Z, X=None): """ @@ -578,18 +612,18 @@ def score(self, Y, T, Z, X=None): @property def original_featurizer(self): - return super().model_final._model_final._original_featurizer + return self.ortho_learner_model_final._model_final._original_featurizer @property - def featurizer(self): + def featurizer_(self): # NOTE This is used by the inference methods and has to be the overall featurizer. intended # for internal use by the library - return super().model_final._model_final._featurizer + return self.ortho_learner_model_final._model_final._featurizer @property - def model_final(self): + def model_final_(self): # NOTE This is used by the inference methods and is more for internal use to the library - return super().model_final._model_final._model + return self.ortho_learner_model_final._model_final._model @property def model_cate(self): @@ -602,7 +636,7 @@ def model_cate(self): An instance of the model_final object that was fitted after calling fit which corresponds to the constant marginal CATE model. """ - return super().model_final._model_final._model + return self.ortho_learner_model_final._model_final._model @property def models_Y_X(self): @@ -615,7 +649,7 @@ def models_Y_X(self): A list of instances of the `model_Y_X` object. Each element corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [mdl._model for mdl in super().models_Y_X] + return [mdl._model_Y_X._model for mdl in super().models_nuisance] @property def models_T_X(self): @@ -628,7 +662,7 @@ def models_T_X(self): A list of instances of the `model_T_X` object. Each element corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [mdl._model for mdl in super().models_T_X] + return [mdl._model_T_X._model for mdl in super().models_nuisance] @property def models_T_XZ(self): @@ -641,7 +675,7 @@ def models_T_XZ(self): A list of instances of the `model_T_XZ` object. Each element corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [mdl._model for mdl in super().models_T_XZ] + return [mdl._model_T_XZ._model for mdl in super().models_nuisance] @property def nuisance_scores_Y_X(self): @@ -746,6 +780,9 @@ class DMLIV(_BaseDMLIV): Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. 
+ monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + + discrete_instrument: bool, optional, default False Whether the instrument values should be treated as categorical, rather than continuous, quantities @@ -763,25 +800,41 @@ class DMLIV(_BaseDMLIV): by :mod:`np.random`. """ - def __init__(self, model_Y_X, model_T_X, model_T_XZ, model_final, featurizer=None, + def __init__(self, *, model_Y_X, model_T_X, model_T_XZ, model_final, + featurizer=None, fit_cate_intercept=True, - n_splits=2, discrete_instrument=False, discrete_treatment=False, + n_splits=2, + monte_carlo_iterations=None, + discrete_instrument=False, discrete_treatment=False, categories='auto', random_state=None): - self.bias_part_of_coef = fit_cate_intercept + self.model_Y_X = clone(model_Y_X, safe=False) + self.model_T_X = clone(model_T_X, safe=False) + self.model_T_XZ = clone(model_T_XZ, safe=False) + self.model_final = clone(model_final, safe=False) + self.featurizer = clone(featurizer, safe=False) self.fit_cate_intercept = fit_cate_intercept - super().__init__(_FirstStageWrapper(model_Y_X, False), - _FirstStageWrapper(model_T_X, discrete_treatment), - _FirstStageWrapper(model_T_XZ, discrete_treatment), - _FinalWrapper(model_final, - fit_cate_intercept=fit_cate_intercept, - featurizer=featurizer, - use_weight_trick=False), - n_splits=n_splits, + super().__init__(n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, discrete_instrument=discrete_instrument, discrete_treatment=discrete_treatment, categories=categories, random_state=random_state) + def _gen_ortho_learner_model_nuisance(self): + return _BaseDMLIVModelNuisance(_FirstStageWrapper(clone(self.model_Y_X, safe=False), False), + _FirstStageWrapper(clone(self.model_T_X, safe=False), self.discrete_treatment), + _FirstStageWrapper(clone(self.model_T_XZ, safe=False), self.discrete_treatment)) + + def _gen_ortho_learner_model_final(self): + return _BaseDMLIVModelFinal(_FinalWrapper(clone(self.model_final, safe=False), + fit_cate_intercept=self.fit_cate_intercept, + featurizer=clone(self.featurizer, safe=False), + use_weight_trick=False)) + + @property + def bias_part_of_coef(self): + return self.ortho_learner_model_final._model_final._fit_cate_intercept + class NonParamDMLIV(_BaseDMLIV): """ @@ -841,6 +894,9 @@ class NonParamDMLIV(_BaseDMLIV): Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances.
+ discrete_instrument: bool, optional, default False Whether the instrument values should be treated as categorical, rather than continuous, quantities @@ -859,23 +915,38 @@ """ - def __init__(self, model_Y_X, model_T_X, model_T_XZ, model_final, + def __init__(self, *, model_Y_X, model_T_X, model_T_XZ, model_final, featurizer=None, fit_cate_intercept=True, - n_splits=2, discrete_instrument=False, discrete_treatment=False, categories='auto', + n_splits=2, + monte_carlo_iterations=None, + discrete_instrument=False, + discrete_treatment=False, + categories='auto', random_state=None): - super().__init__(_FirstStageWrapper(model_Y_X, False), - _FirstStageWrapper(model_T_X, discrete_treatment), - _FirstStageWrapper(model_T_XZ, discrete_treatment), - _FinalWrapper(model_final, - fit_cate_intercept=fit_cate_intercept, - featurizer=featurizer, - use_weight_trick=True), - n_splits=n_splits, + self.model_Y_X = clone(model_Y_X, safe=False) + self.model_T_X = clone(model_T_X, safe=False) + self.model_T_XZ = clone(model_T_XZ, safe=False) + self.model_final = clone(model_final, safe=False) + self.featurizer = clone(featurizer, safe=False) + self.fit_cate_intercept = fit_cate_intercept + super().__init__(n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, discrete_instrument=discrete_instrument, discrete_treatment=discrete_treatment, categories=categories, random_state=random_state) + def _gen_ortho_learner_model_nuisance(self): + return _BaseDMLIVModelNuisance(_FirstStageWrapper(clone(self.model_Y_X, safe=False), False), + _FirstStageWrapper(clone(self.model_T_X, safe=False), self.discrete_treatment), + _FirstStageWrapper(clone(self.model_T_XZ, safe=False), self.discrete_treatment)) + + def _gen_ortho_learner_model_final(self): + return _BaseDMLIVModelFinal(_FinalWrapper(clone(self.model_final, safe=False), + fit_cate_intercept=self.fit_cate_intercept, + featurizer=clone(self.featurizer, safe=False), + use_weight_trick=True)) + class _BaseDRIVModelFinal: """ @@ -1036,6 +1107,9 @@ class _BaseDRIV(_OrthoLearner): Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances.
+ random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; @@ -1044,33 +1118,42 @@ class _BaseDRIV(_OrthoLearner): """ def __init__(self, - nuisance_models, model_final, featurizer=None, fit_cate_intercept=True, - cov_clip=0.1, opt_reweighted=False, - discrete_instrument=False, discrete_treatment=False, + cov_clip=0.1, + opt_reweighted=False, + discrete_instrument=False, + discrete_treatment=False, categories='auto', - n_splits=2, random_state=None): - + n_splits=2, + monte_carlo_iterations=None, + random_state=None): + self.model_final = clone(model_final, safe=False) + self.featurizer = clone(featurizer, safe=False) self.fit_cate_intercept = fit_cate_intercept - self.bias_part_of_coef = fit_cate_intercept - self.cov_clip = cov_clip self.opt_reweighted = opt_reweighted - super().__init__(nuisance_models, _BaseDRIVModelFinal(model_final, - featurizer, - discrete_treatment, - discrete_instrument, - fit_cate_intercept, - cov_clip, - opt_reweighted), - discrete_instrument=discrete_instrument, discrete_treatment=discrete_treatment, - categories=categories, n_splits=n_splits, random_state=random_state) + super().__init__(discrete_instrument=discrete_instrument, discrete_treatment=discrete_treatment, + categories=categories, n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, random_state=random_state) + + def _gen_model_final(self): + return clone(self.model_final, safe=False) + + def _gen_ortho_learner_model_final(self): + return _BaseDRIVModelFinal(self._gen_model_final(), + clone(self.featurizer, safe=False), + self.discrete_treatment, + self.discrete_instrument, + self.fit_cate_intercept, + self.cov_clip, + self.opt_reweighted) @_deprecate_positional("X, W, and Z should be passed by keyword only. In a future release " "we will disallow passing X, W, and Z by position.", ['X', 'W', 'Z']) - def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, inference=None): + def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference=None): """ Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. @@ -1094,6 +1177,8 @@ def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, g All rows corresponding to the same group will be kept together during splitting. If groups is not None, the n_splits argument passed to this class's initializer must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model inference: string,:class:`.Inference` instance, or None Method for performing inference. This estimator supports 'bootstrap' (or an instance of:class:`.BootstrapInference`). 
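The recurring refactor in these hunks replaces eager construction of wrapped final models with small `_gen_*` factory methods that subclasses override, so the wrapped object is only built when needed and always reflects the current hyperparameters. A self-contained sketch of that pattern, with hypothetical class names standing in for the real hierarchy::

    from abc import ABC, abstractmethod

    import numpy as np
    from sklearn.base import clone
    from sklearn.linear_model import LinearRegression

    class _BaseLearner(ABC):
        @abstractmethod
        def _gen_model_final(self):
            """Return a fresh, unfitted final model."""

        def fit(self, X, y):
            # the final model is generated at fit time, so hyperparameters
            # changed on the estimator between fits are picked up automatically
            self.model_final_ = self._gen_model_final().fit(X, y)
            return self

    class _MyLearner(_BaseLearner):
        def __init__(self, model_final):
            self.model_final = clone(model_final, safe=False)

        def _gen_model_final(self):
            # clone so that repeated fits never share fitted state
            return clone(self.model_final, safe=False)

    X, y = np.random.normal(size=(50, 2)), np.random.normal(size=50)
    est = _MyLearner(LinearRegression()).fit(X, y)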
@@ -1105,7 +1190,7 @@ def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, g # Replacing fit from _OrthoLearner, to reorder arguments and improve the docstring return super().fit(Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight, sample_var=sample_var, groups=groups, - inference=inference) + cache_values=cache_values, inference=inference) def score(self, Y, T, Z, X=None, W=None, sample_weight=None): """ @@ -1143,18 +1228,18 @@ def score(self, Y, T, Z, X=None, W=None, sample_weight=None): @property def original_featurizer(self): - return super().model_final._original_featurizer + return self.ortho_learner_model_final._original_featurizer @property - def featurizer(self): + def featurizer_(self): # NOTE This is used by the inference methods and has to be the overall featurizer. intended # for internal use by the library - return super().model_final._featurizer + return self.ortho_learner_model_final._featurizer @property - def model_final(self): + def model_final_(self): # NOTE This is used by the inference methods and is more for internal use to the library - return super().model_final._model_final + return self.ortho_learner_model_final._model_final def cate_feature_names(self, feature_names=None): """ @@ -1243,31 +1328,46 @@ class _IntentToTreatDRIV(_BaseDRIV): Helper class for the DRIV algorithm for the intent-to-treat A/B test setting """ - def __init__(self, model_Y_X, model_T_XZ, + def __init__(self, *, model_Y_X, + model_T_XZ, prel_model_effect, - model_effect, + model_final, featurizer=None, fit_cate_intercept=True, cov_clip=.1, n_splits=3, + monte_carlo_iterations=None, opt_reweighted=False, categories='auto', random_state=None): """ """ - + self.model_Y_X = clone(model_Y_X, safe=False) + self.model_T_XZ = clone(model_T_XZ, safe=False) + self.prel_model_effect = clone(prel_model_effect, safe=False) # TODO: check that Y, T, Z do not have multiple columns - super().__init__(_IntentToTreatDRIVModelNuisance(model_Y_X, model_T_XZ, prel_model_effect), - model_effect, + super().__init__(model_final, featurizer=featurizer, fit_cate_intercept=fit_cate_intercept, cov_clip=cov_clip, n_splits=n_splits, - discrete_instrument=True, discrete_treatment=True, + monte_carlo_iterations=monte_carlo_iterations, + discrete_instrument=True, + discrete_treatment=True, categories=categories, opt_reweighted=opt_reweighted, random_state=random_state) + def _gen_prel_model_effect(self): + return clone(self.prel_model_effect, safe=False) + + def _gen_ortho_learner_model_nuisance(self): + return _IntentToTreatDRIVModelNuisance(_FirstStageWrapper(clone(self.model_Y_X, safe=False), + discrete_target=False), + _FirstStageWrapper(clone(self.model_T_XZ, safe=False), + discrete_target=True), + self._gen_prel_model_effect()) + class _DummyCATE: """ @@ -1277,7 +1377,7 @@ class _DummyCATE: def __init__(self): return - def fit(self, y, T, *, Z, X, W=None, sample_weight=None, groups=None): + def fit(self, y, T, *, Z, X, W=None, sample_weight=None, groups=None, **kwargs): return self def effect(self, X): @@ -1332,6 +1432,9 @@ class IntentToTreatDRIV(_IntentToTreatDRIV): Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + opt_reweighted : bool, optional, default False Whether to reweight the samples to minimize variance. 
If True then final_model_effect.fit must accept sample_weight as a kw argument (WeightWrapper from @@ -1351,37 +1454,46 @@ class IntentToTreatDRIV(_IntentToTreatDRIV): by :mod:`np.random`. """ - def __init__(self, model_Y_X, model_T_XZ, + def __init__(self, *, model_Y_X, model_T_XZ, flexible_model_effect, - final_model_effect=None, + model_final=None, featurizer=None, fit_cate_intercept=True, cov_clip=.1, n_splits=3, + monte_carlo_iterations=None, opt_reweighted=False, categories='auto', random_state=None): - model_Y_X = _FirstStageWrapper(model_Y_X, discrete_target=False) - model_T_XZ = _FirstStageWrapper(model_T_XZ, discrete_target=True) - prel_model_effect = _IntentToTreatDRIV(model_Y_X, - model_T_XZ, - _DummyCATE(), - flexible_model_effect, - cov_clip=1e-7, n_splits=1, - opt_reweighted=True, - random_state=random_state) - if final_model_effect is None: - final_model_effect = flexible_model_effect - super().__init__(model_Y_X, model_T_XZ, prel_model_effect, - final_model_effect, + self.flexible_model_effect = flexible_model_effect + super().__init__(model_Y_X=model_Y_X, + model_T_XZ=model_T_XZ, + prel_model_effect=None, + model_final=model_final, featurizer=featurizer, fit_cate_intercept=fit_cate_intercept, cov_clip=cov_clip, n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, opt_reweighted=opt_reweighted, categories=categories, random_state=random_state) + def _gen_model_final(self): + if self.model_final is None: + return clone(self.flexible_model_effect, safe=False) + return clone(self.model_final, safe=False) + + def _gen_prel_model_effect(self): + return _IntentToTreatDRIV(model_Y_X=clone(self.model_Y_X, safe=False), + model_T_XZ=clone(self.model_T_XZ, safe=False), + prel_model_effect=_DummyCATE(), + model_final=clone(self.flexible_model_effect, safe=False), + cov_clip=1e-7, + n_splits=1, + opt_reweighted=True, + random_state=self.random_state) + @property def models_Y_X(self): return [mdl._model_Y_X._model for mdl in super().models_nuisance] @@ -1402,6 +1514,15 @@ def nuisance_scores_T_XZ(self): def nuisance_scores_effect(self): return self.nuisance_scores_[2] + @property + def prel_model_effect(self): + return self._gen_prel_model_effect() + + @prel_model_effect.setter + def prel_model_effect(self, value): + if value is not None: + raise ValueError("Parameter `prel_model_effect` cannot be altered for this estimator.") + class LinearIntentToTreatDRIV(StatsModelsCateEstimatorMixin, IntentToTreatDRIV): """ @@ -1446,6 +1567,9 @@ class LinearIntentToTreatDRIV(StatsModelsCateEstimatorMixin, IntentToTreatDRIV): Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + monte_carlo_iterations: int, optional (default=None) + The number of times to rerun the first stage models to reduce the variance of the nuisances. + categories: 'auto' or list, default 'auto' The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). The first category will be treated as the control treatment. @@ -1457,26 +1581,36 @@ class LinearIntentToTreatDRIV(StatsModelsCateEstimatorMixin, IntentToTreatDRIV): by :mod:`np.random`. 
""" - def __init__(self, model_Y_X, model_T_XZ, + def __init__(self, *, model_Y_X, + model_T_XZ, flexible_model_effect, featurizer=None, fit_cate_intercept=True, cov_clip=.1, n_splits=3, + monte_carlo_iterations=None, categories='auto', random_state=None): - super().__init__(model_Y_X, model_T_XZ, + super().__init__(model_Y_X=model_Y_X, + model_T_XZ=model_T_XZ, flexible_model_effect=flexible_model_effect, featurizer=featurizer, fit_cate_intercept=fit_cate_intercept, - final_model_effect=StatsModelsLinearRegression(fit_intercept=False), - cov_clip=cov_clip, n_splits=n_splits, opt_reweighted=False, + model_final=None, + cov_clip=cov_clip, + n_splits=n_splits, + monte_carlo_iterations=monte_carlo_iterations, + opt_reweighted=False, categories=categories, random_state=random_state) + def _gen_model_final(self): + return StatsModelsLinearRegression(fit_intercept=False) + # override only so that we can update the docstring to indicate support for `StatsModelsInference` @_deprecate_positional("X, W, and Z should be passed by keyword only. In a future release " "we will disallow passing X, W, and Z by position.", ['X', 'W', 'Z']) - def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, inference='auto'): + def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, + cache_values=False, inference='auto'): """ Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. @@ -1500,6 +1634,8 @@ def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, g All rows corresponding to the same group will be kept together during splitting. If groups is not None, the n_splits argument passed to this class's initializer must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model inference: string,:class:`.Inference` instance, or None Method for performing inference. This estimator supports 'bootstrap' (or an instance of:class:`.BootstrapInference`) and 'statsmodels' @@ -1511,4 +1647,29 @@ def fit(self, Y, T, Z, X=None, W=None, *, sample_weight=None, sample_var=None, g """ return super().fit(Y, T, Z=Z, X=X, W=W, sample_weight=sample_weight, sample_var=sample_var, groups=groups, - inference=inference) + cache_values=cache_values, inference=inference) + + def refit(self, inference='auto'): + return super().refit(inference=inference) + + @property + def bias_part_of_coef(self): + return self.ortho_learner_model_final._fit_cate_intercept + + @property + def model_final(self): + return self._gen_model_final() + + @model_final.setter + def model_final(self, value): + if value is not None: + raise ValueError("Parameter `model_final` cannot be altered for this estimator.") + + @property + def opt_reweighted(self): + return False + + @opt_reweighted.setter + def opt_reweighted(self, value): + if not (value == False): + raise ValueError("Parameter `value` cannot be altered from `False` for this estimator.")