Add fit_cate_intercept to DML, rework feature generation #174

Merged · 23 commits · Nov 21, 2019

Changes shown are from 3 of the 23 commits.

Commits
8497b06  Add fit_cate_intercept to DML, rework feature generation (kbattocchi, Nov 19, 2019)
7459973  Switch order of columns in cross product result (kbattocchi, Nov 20, 2019)
d93ee45  Pull intercept out of coef when exposing final model (kbattocchi, Nov 20, 2019)
2c89a0c  Tweak first stage logic when W is None (kbattocchi, Nov 21, 2019)
5b1eed1  Fix statsmodels test (kbattocchi, Nov 21, 2019)
0aeb037  Merge branch 'master' into kebatt/dmlIntercept (kbattocchi, Nov 21, 2019)
9f04338  added cate_feature_names method and also model_cate method and added … (vasilismsr, Nov 21, 2019)
c0ff420  linting errors (vasilismsr, Nov 21, 2019)
1c353ec  linting errors (vasilismsr, Nov 21, 2019)
6a18b9e  finalized cate intercept interface change. Added reshaping of effects… (vasilismsr, Nov 21, 2019)
7d33a55  rerun and added dml notebook (vasilismsr, Nov 21, 2019)
a64da77  Merge branch 'master' into kebatt/dmlIntercept (vasilismsr, Nov 21, 2019)
0fc03de  linting (vasilismsr, Nov 21, 2019)
f5f1641  Merge branch 'kebatt/dmlIntercept' of d.zyszy.best-microsoft:Microsoft/… (vasilismsr, Nov 21, 2019)
e1b4b1f  fixed cross product test due to reversion (vasilismsr, Nov 21, 2019)
d653825  dml fit cate_intercept (vasilismsr, Nov 21, 2019)
f19de34  added property in model_cate (vasilismsr, Nov 21, 2019)
8fa00c8  get feature names docstring (vasilismsr, Nov 21, 2019)
67f42ce  Update econml/utilities.py (vasilismsr, Nov 21, 2019)
1f550f8  docstring of cross _product (vasilismsr, Nov 21, 2019)
a730b12  Merge branch 'kebatt/dmlIntercept' of d.zyszy.best-microsoft:Microsoft/… (vasilismsr, Nov 21, 2019)
2945be5  removing TODO from cross product (vasilismsr, Nov 21, 2019)
2b1f75b  Merge branch 'master' into kebatt/dmlIntercept (kbattocchi, Nov 21, 2019)
53 changes: 48 additions & 5 deletions econml/cate_estimator.py
@@ -383,7 +383,20 @@ def effect(self, X=None, *, T0=0, T1=1):


class LinearModelFinalCateEstimatorMixin(BaseCateEstimator):
"""Base class for models where the final stage is a linear model."""
"""
Base class for models where the final stage is a linear model.

Subclasses must expose a ``model_final`` attribute containing the model's
final stage model.

Attributes
----------
bias_part_of_coef: bool
Whether the CATE model's intercept is contained in the final model's ``coef_`` rather
than as a separate ``intercept_``
"""

bias_part_of_coef = False

@property
def coef_(self):
@@ -400,7 +413,11 @@ def coef_(self):
the first block of n_x columns are the coefficients associated with treatment 0,
the next n_x columns are the coefficients associated with treatment 1 etc.
"""
return self.model_final.coef_
all_coefs = self.model_final.coef_
if self.bias_part_of_coef:
return all_coefs[..., 1:]
else:
return all_coefs

@property
def intercept_(self):
@@ -411,7 +428,11 @@ def intercept_(self):
-------
intercept: float or (n_y,) array like
"""
return self.model_final.intercept_
if self.bias_part_of_coef:
all_coefs = self.model_final.coef_
return all_coefs[..., 0]
else:
return self.model_final.intercept_

@BaseCateEstimator._defer_to_inference
def coef__interval(self, *, alpha=0.1):
@@ -479,6 +500,20 @@ def _get_inference_options(self):

class LinearModelFinalCateEstimatorDiscreteMixin(BaseCateEstimator):
# TODO Share some logic with non-discrete version
"""
Base class for models where the final stage is a linear model.

Subclasses must expose a ``fitted_models_final`` attribute
returning an array of the fitted models for each non-control treatment

Attributes
----------
bias_part_of_coef: bool
Whether the CATE model's intercept is contained in each final model's ``coef_`` rather
than as a separate ``intercept_``
"""

bias_part_of_coef = False

def coef_(self, T):
""" The coefficients in the linear model of the constant marginal treatment
@@ -498,7 +533,11 @@ def coef_(self, T):
"""
_, T = self._expand_treatments(None, T)
ind = (T @ np.arange(T.shape[1])).astype(int)[0]
return self.fitted_models_final[ind].coef_
all_coefs = self.fitted_models_final[ind].coef_
if self.bias_part_of_coef:
return all_coefs[..., 1:]
else:
return all_coefs

def intercept_(self, T):
""" The intercept in the linear model of the constant marginal treatment
@@ -515,7 +554,11 @@ def intercept_(self, T):
"""
_, T = self._expand_treatments(None, T)
ind = (T @ np.arange(1, T.shape[1] + 1)).astype(int)[0] - 1
return self.fitted_models_final[ind].intercept_
if self.bias_part_of_coef:
all_coefs = self.fitted_models_final[ind].coef_
return all_coefs[..., 0]
else:
return self.fitted_models_final[ind].intercept_

@BaseCateEstimator._defer_to_inference
def coef__interval(self, T, *, alpha=0.1):
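The `bias_part_of_coef` convention introduced above is worth a concrete illustration. Below is a minimal, self-contained sketch (not the actual econml classes; `FakeFinalModel` and its numbers are invented) of how a final model fit with the intercept folded into `coef_` exposes separate `coef_` and `intercept_` views:

```python
import numpy as np

class FakeFinalModel:
    # Invented example: the final model was fit on [1, features] with no
    # separate intercept, so column 0 of coef_ is the CATE intercept.
    coef_ = np.array([[0.5, 1.0, 2.0],
                      [0.3, 4.0, 5.0]])  # shape (n_y, 1 + n_features)

class SketchMixin:
    bias_part_of_coef = True
    model_final = FakeFinalModel()

    @property
    def coef_(self):
        all_coefs = self.model_final.coef_
        return all_coefs[..., 1:] if self.bias_part_of_coef else all_coefs

    @property
    def intercept_(self):
        if self.bias_part_of_coef:
            return self.model_final.coef_[..., 0]
        return self.model_final.intercept_

est = SketchMixin()
print(est.coef_)       # [[1. 2.], [4. 5.]] -- intercept column stripped
print(est.intercept_)  # [0.5 0.3]
```

The `...` in the slices keeps the same logic working whether `coef_` is one- or two-dimensional.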
104 changes: 69 additions & 35 deletions econml/dml.py
@@ -124,18 +124,22 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn
The estimator for fitting the response residuals to the treatment residuals. Must implement
`fit` and `predict` methods, and must be a linear model for correctness.

featurizer: transformer
The transformer used to featurize the raw features when fitting the final model. Must implement
a `fit_transform` method.
featurizer: :term:`transformer`, optional, default None
Must support fit_transform and transform. Used to create composite features in the final CATE regression.
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
If featurizer=None, then CATE is trained on X.

fit_cate_intercept : bool, optional, default True
Whether the linear CATE model should have a constant term.

linear_first_stages: bool
Whether the first stage models are linear (in which case we will expand the features passed to
`model_y` accordingly)

discrete_treatment: bool, optional (default is ``False``)
discrete_treatment: bool, optional, default False
Whether the treatment values should be treated as categorical, rather than continuous, quantities

n_splits: int, cross-validation generator or an iterable, optional (Default=2)
n_splits: int, cross-validation generator or an iterable, optional, default 2
Determines the cross-validation splitting strategy.
Possible inputs for cv are:

@@ -161,7 +165,8 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn

def __init__(self,
model_y, model_t, model_final,
featurizer,
featurizer=None,
fit_cate_intercept=True,
linear_first_stages=False,
discrete_treatment=False,
n_splits=2,
Expand All @@ -177,22 +182,23 @@ def __init__(self, model, is_Y):
self._is_Y = is_Y

 def _combine(self, X, W, n_samples, fitting=True):
+    no_x = X is None
+    if no_x:
+        X = np.ones((n_samples, 1))
+    if W is None:
+        W = np.empty((n_samples, 0))
+    XW = hstack([X, W])
     if self._is_Y and linear_first_stages:
-        if X is not None:
-            F = self._featurizer.fit_transform(X) if fitting else self._featurizer.transform(X)
+        if no_x:
+            return XW
+
+        if self._featurizer is None:
+            F = X
         else:
-            X = np.ones((n_samples, 1))
-            F = np.ones((n_samples, 1))
-        if W is None:
-            W = np.empty((n_samples, 0))
-        XW = hstack([X, W])
-        return cross_product(XW, hstack([np.ones((shape(XW)[0], 1)), F, W]))
+            F = self._featurizer.fit_transform(X) if fitting else self._featurizer.transform(X)
+        return cross_product(XW, hstack([np.ones((shape(XW)[0], 1)), F]))
     else:
-        if X is None:
-            X = np.ones((n_samples, 1))
-        if W is None:
-            W = np.empty((n_samples, 0))
-        return hstack([X, W])
+        return XW

def fit(self, X, W, Target, sample_weight=None):
if (not self._is_Y) and discrete_treatment:
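To make the first-stage expansion above concrete: with `linear_first_stages=True`, the outcome model is fed the cross products of `[X, W]` with `[1, F]`, so a linear `model_y` can represent main effects of `X` and `W` as well as their interactions with the CATE features. A rough sketch, where the local `cross_product` is a stand-in for `econml.utilities.cross_product` (assumed to form all pairwise column products; its exact column ordering may differ):

```python
import numpy as np

def cross_product(A, B):
    # Stand-in: every column of A times every column of B, row by row.
    n = A.shape[0]
    return (A[:, :, None] * B[:, None, :]).reshape(n, -1)

X = np.array([[1.0, 2.0]])  # one sample, two X features
W = np.array([[3.0]])       # one control variable
XW = np.hstack([X, W])
F = X                       # featurizer=None, so F is just X
first_stage_features = cross_product(XW, np.hstack([np.ones((1, 1)), F]))
print(first_stage_features.shape)  # (1, 9): 3 columns of XW times 3 columns of [1, F]
```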
@@ -220,13 +226,30 @@ class FinalWrapper:
def __init__(self):
self._model = clone(model_final, safe=False)
self._featurizer = clone(featurizer, safe=False)
if fit_cate_intercept:
add_intercept = FunctionTransformer(lambda F:
hstack([np.ones((F.shape[0], 1)), F]))
if featurizer:
self._featurizer = Pipeline([('featurize', self._featurizer),
('add_intercept', add_intercept)])
else:
self._featurizer = add_intercept

def _combine(self, X, T, fitting=True):
if X is not None:
if self._featurizer is not None:
F = self._featurizer.fit_transform(X) if fitting else self._featurizer.transform(X)
else:
F = X
else:
F = np.ones((T.shape[0], 1))
return cross_product(F, T)

def fit(self, X, T_res, Y_res, sample_weight=None, sample_var=None):
# Track training dimensions to see if Y or T is a vector instead of a 2-dimensional array
self._d_t = shape(T_res)[1:]
self._d_y = shape(Y_res)[1:]
F = self._featurizer.fit_transform(X) if X is not None else np.ones((T_res.shape[0], 1))
fts = cross_product(F, T_res)
fts = self._combine(X, T_res)
if sample_weight is not None:
if sample_var is not None:
self._model.fit(fts,
@@ -246,14 +269,14 @@ def fit(self, X, T_res, Y_res, sample_weight=None, sample_var=None):
self._intercept = intercept

def predict(self, X):
F = self._featurizer.transform(X) if X is not None else np.ones((1, 1))
F, T = broadcast_unit_treatments(F, self._d_t[0] if self._d_t else 1)
prediction = self._model.predict(cross_product(F, T))
X2, T = broadcast_unit_treatments(X if X is not None else np.empty((1, 0)),
self._d_t[0] if self._d_t else 1)
prediction = self._model.predict(self._combine(None if X is None else X2, T, fitting=False))
if self._intercept is not None:
prediction -= self._intercept
return reshape_treatmentwise_effects(prediction,
self._d_t, self._d_y)

self.bias_part_of_coef = fit_cate_intercept
super().__init__(model_y=FirstStageWrapper(model_y, is_Y=True),
model_t=FirstStageWrapper(model_t, is_Y=False),
model_final=FinalWrapper(),
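The intercept handling in FinalWrapper above amounts to prepending a constant column to the featurized `X` before it is crossed with the treatment residuals, which is also why the final linear model is constructed with `fit_intercept=False`. A small standalone sketch of the same construction (`PolynomialFeatures` here is just an example featurizer, not a default):

```python
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, PolynomialFeatures

# Transformer that prepends a ones column, exactly like add_intercept above.
add_intercept = FunctionTransformer(
    lambda F: np.hstack([np.ones((F.shape[0], 1)), F]))

featurizer = Pipeline([('featurize', PolynomialFeatures(degree=2, include_bias=False)),
                       ('add_intercept', add_intercept)])

X = np.array([[1.0], [2.0]])
print(featurizer.fit_transform(X))
# [[1. 1. 1.]
#  [1. 2. 4.]] -- the first column becomes the CATE intercept term
```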
@@ -292,10 +315,13 @@ class LinearDMLCateEstimator(StatsModelsCateEstimatorMixin, DMLCateEstimator):
The estimator for fitting the treatment to the features. Must implement
`fit` and `predict` methods.

featurizer: transformer, optional (default is \
:class:`PolynomialFeatures(degree=1, include_bias=True) <sklearn.preprocessing.PolynomialFeatures>`)
The transformer used to featurize the raw features when fitting the final model. Must implement
a `fit_transform` method.
featurizer : :term:`transformer`, optional, default None
Must support fit_transform and transform. Used to create composite features in the final CATE regression.
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
If featurizer=None, then CATE is trained on X.

fit_cate_intercept : bool, optional, default True
Whether the linear CATE model should have a constant term.

linear_first_stages: bool
Whether the first stage models are linear (in which case we will expand the features passed to
@@ -330,7 +356,8 @@ class LinearDMLCateEstimator(StatsModelsCateEstimatorMixin, DMLCateEstimator):

def __init__(self,
model_y=LassoCV(), model_t=LassoCV(),
featurizer=PolynomialFeatures(degree=1, include_bias=True),
featurizer=None,
fit_cate_intercept=True,
linear_first_stages=True,
discrete_treatment=False,
n_splits=2,
@@ -339,6 +366,7 @@ def __init__(self,
model_t=model_t,
model_final=StatsModelsLinearRegression(fit_intercept=False),
featurizer=featurizer,
fit_cate_intercept=fit_cate_intercept,
linear_first_stages=linear_first_stages,
discrete_treatment=discrete_treatment,
n_splits=n_splits,
@@ -410,10 +438,13 @@ class SparseLinearDMLCateEstimator(DebiasedLassoCateEstimatorMixin, DMLCateEstimator):
dual gap for optimality and continues until it is smaller
than ``tol``.

featurizer: transformer, optional
(default is :class:`PolynomialFeatures(degree=1, include_bias=True) <sklearn.preprocessing.PolynomialFeatures>`)
The transformer used to featurize the raw features when fitting the final model. Must implement
a `fit_transform` method.
featurizer : :term:`transformer`, optional, default None
Must support fit_transform and transform. Used to create composite features in the final CATE regression.
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
If featurizer=None, then CATE is trained on X.

fit_cate_intercept : bool, optional, default True
Whether the linear CATE model should have a constant term.

linear_first_stages: bool
Whether the first stage models are linear (in which case we will expand the features passed to
@@ -450,7 +481,8 @@ def __init__(self,
alpha='auto',
max_iter=1000,
tol=1e-4,
featurizer=PolynomialFeatures(degree=1, include_bias=True),
featurizer=None,
fit_cate_intercept=True,
linear_first_stages=True,
discrete_treatment=False,
n_splits=2,
@@ -464,6 +496,7 @@ def __init__(self,
model_t=model_t,
model_final=model_final,
featurizer=featurizer,
fit_cate_intercept=fit_cate_intercept,
linear_first_stages=linear_first_stages,
discrete_treatment=discrete_treatment,
n_splits=n_splits,
@@ -568,4 +601,5 @@ def transform(self, X):
super().__init__(model_y=model_y, model_t=model_t,
model_final=ElasticNetCV(),
featurizer=RandomFeatures(random_state),
fit_cate_intercept=False,
discrete_treatment=discrete_treatment, n_splits=n_splits, random_state=random_state)
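For reference, a hedged end-to-end sketch of the new defaults (`featurizer=None`, `fit_cate_intercept=True`) on synthetic data; it assumes the `fit(Y, T, X=..., W=...)` signature of this era of the library:

```python
import numpy as np
from econml.dml import LinearDMLCateEstimator

rng = np.random.RandomState(0)
X = rng.normal(size=(500, 2))
W = rng.normal(size=(500, 3))
T = X[:, 0] + rng.normal(size=500)
Y = (1.0 + 2.0 * X[:, 0]) * T + rng.normal(size=500)  # true CATE = 1 + 2*x0

est = LinearDMLCateEstimator(fit_cate_intercept=True)  # CATE fit on raw X plus a constant
est.fit(Y, T, X=X, W=W)
print(est.intercept_)  # should be near 1.0
print(est.coef_)       # should be near [2.0, 0.0]
```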
24 changes: 12 additions & 12 deletions econml/drlearner.py
@@ -114,15 +114,15 @@ class takes as input the parameter `model_regressor`, which is an arbitrary scikit-learn
mono-task model and a separate clone of the model is trained for each outcome. Then predict(X) of the t-th
clone will be the CATE of the t-th lexicographically ordered treatment compared to the baseline.

multitask_model_final : optional bool (default=False)
multitask_model_final : bool, optional, default False
Whether the model_final should be treated as a multi-task model. See description of model_final.

featurizer : sklearn featurizer or None
featurizer : :term:`transformer`, optional, default None
Must support fit_transform and transform. Used to create composite features in the final CATE regression.
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
If featurizer=None, then CATE is trained on X.

n_splits: int, cross-validation generator or an iterable, optional (Default=2)
n_splits: int, cross-validation generator or an iterable, optional (default is 2)
Determines the cross-validation splitting strategy.
Possible inputs for cv are:

@@ -535,15 +535,15 @@ class LinearDRLearner(StatsModelsCateEstimatorDiscreteMixin, DRLearner):
`predict` methods. If different models per treatment arm are desired, see the
:class:`~econml.utilities.MultiModelWrapper` helper class.

featurizer : sklearn featurizer or None
featurizer : :term:`transformer`, optional, default None
Must support fit_transform and transform. Used to create composite features in the final CATE regression.
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
If featurizer=None, then CATE is trained on X.

fit_cate_intercept : bool, optional (Default=True)
fit_cate_intercept : bool, optional, default True
Whether the linear CATE model should have a constant term.

n_splits: int, cross-validation generator or an iterable, optional (Default=2)
n_splits: int, cross-validation generator or an iterable, optional (default is 2)
Determines the cross-validation splitting strategy.
Possible inputs for cv are:

@@ -711,28 +711,28 @@ class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner):
`predict` methods. If different models per treatment arm are desired, see the
:class:`~econml.utilities.MultiModelWrapper` helper class.

featurizer : sklearn featurizer or None
featurizer : :term:`transformer`, optional, default None
Must support fit_transform and transform. Used to create composite features in the final CATE regression.
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
If featurizer=None, then CATE is trained on X.

fit_cate_intercept : bool, optional (Default=True)
fit_cate_intercept : bool, optional, default True
Whether the linear CATE model should have a constant term.

alpha: string | float, optional. Default='auto'.
alpha: string | float, optional, default 'auto'.
CATE L1 regularization applied through the debiased lasso in the final model.
'auto' corresponds to a CV form of the :class:`DebiasedLasso`.

max_iter : int, optional, default=1000
max_iter : int, optional, default 1000
The maximum number of iterations in the Debiased Lasso

tol : float, optional, default=1e-4
tol : float, optional, default 1e-4
The tolerance for the optimization: if the updates are
smaller than ``tol``, the optimization code checks the
dual gap for optimality and continues until it is smaller
than ``tol``.

n_splits: int, cross-validation generator or an iterable, optional (Default=2)
n_splits: int, cross-validation generator or an iterable, optional, default 2
Determines the cross-validation splitting strategy.
Possible inputs for cv are:

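Similarly, a brief hedged sketch for the discrete-treatment learner (synthetic data; assumes `LinearDRLearner`'s default nuisance models and its `fit(y, T, X=...)` signature):

```python
import numpy as np
from econml.drlearner import LinearDRLearner

rng = np.random.RandomState(0)
X = rng.normal(size=(1000, 2))
T = rng.binomial(1, 0.5, size=1000)              # binary treatment; 0 is control
y = (1.0 + X[:, 0]) * T + rng.normal(size=1000)  # true CATE = 1 + x0

est = LinearDRLearner(fit_cate_intercept=True)
est.fit(y, T, X=X)
print(est.intercept_(T=1))  # constant term of the CATE of treatment 1 vs control
print(est.coef_(T=1))       # feature coefficients, intercept excluded
```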
12 changes: 10 additions & 2 deletions econml/inference.py
@@ -96,10 +96,18 @@ def const_marginal_effect_interval(self, X, *, alpha=0.1):
for pred in preds)

def coef__interval(self, *, alpha=0.1):
return self.model_final.coef__interval(alpha)
if self._est.bias_part_of_coef:
lo, hi = self.model_final.coef__interval(alpha)
return lo[..., 1:], hi[..., 1:]
else:
return self.model_final.coef__interval(alpha)

def intercept__interval(self, *, alpha=0.1):
return self.model_final.intercept__interval(alpha)
if self._est.bias_part_of_coef:
lo, hi = self.model_final.coef__interval(alpha)
return lo[..., 0], hi[..., 0]
else:
return self.model_final.intercept__interval(alpha)

def _predict_interval(self, X, alpha):
return self.model_final.predict_interval(X, alpha=alpha)
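The interval logic mirrors the point-estimate slicing: when the intercept is folded into the coefficient vector, column 0 of the joint `coef_` interval is split off as the intercept interval. An illustrative sketch with made-up bounds:

```python
import numpy as np

# Made-up joint bounds, as if returned by model_final.coef__interval(alpha);
# column 0 corresponds to the intercept.
lo = np.array([0.2, 0.9, 1.8])
hi = np.array([0.8, 1.1, 2.2])

coef_interval = (lo[..., 1:], hi[..., 1:])     # what coef__interval returns
intercept_interval = (lo[..., 0], hi[..., 0])  # what intercept__interval returns
print(coef_interval, intercept_interval)
```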