Add fit_cate_intercept to DML, rework feature generation

kbattocchi · kbattocchi · commit 8c28f52ed53e · 2019-11-19T18:09:29.000-05:00
diff --git a/econml/dml.py b/econml/dml.py
@@ -124,18 +124,19 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn
         The estimator for fitting the response residuals to the treatment residuals. Must implement
         `fit` and `predict` methods, and must be a linear model for correctness.
 
-    featurizer: transformer
-        The transformer used to featurize the raw features when fitting the final model.  Must implement
-        a `fit_transform` method.
+    featurizer: :term:`transformer`, optional, default None
+        Must support fit_transform and transform. Used to create composite features in the final CATE regression.
+        It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
+        If featurizer=None, then CATE is trained on X.
 
     linear_first_stages: bool
         Whether the first stage models are linear (in which case we will expand the features passed to
         `model_y` accordingly)
 
-    discrete_treatment: bool, optional (default is ``False``)
+    discrete_treatment: bool, optional, default False
         Whether the treatment values should be treated as categorical, rather than continuous, quantities
 
-    n_splits: int, cross-validation generator or an iterable, optional (Default=2)
+    n_splits: int, cross-validation generator or an iterable, optional, default 2
         Determines the cross-validation splitting strategy.
         Possible inputs for cv are:
 
@@ -161,7 +162,7 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn
 
     def __init__(self,
                  model_y, model_t, model_final,
-                 featurizer,
+                 featurizer=None,
                  linear_first_stages=False,
                  discrete_treatment=False,
                  n_splits=2,
@@ -177,22 +178,23 @@ def __init__(self, model, is_Y):
                 self._is_Y = is_Y
 
             def _combine(self, X, W, n_samples, fitting=True):
+                no_x = X is None
+                if no_x:
+                    X = np.ones((n_samples, 1))
+                if W is None:
+                    W = np.empty((n_samples, 0))
+                XW = hstack([X, W])
                 if self._is_Y and linear_first_stages:
-                    if X is not None:
-                        F = self._featurizer.fit_transform(X) if fitting else self._featurizer.transform(X)
+                    if no_x:
+                        return XW
+
+                    if self._featurizer is None:
+                        F = X
                     else:
-                        X = np.ones((n_samples, 1))
-                        F = np.ones((n_samples, 1))
-                    if W is None:
-                        W = np.empty((n_samples, 0))
-                    XW = hstack([X, W])
-                    return cross_product(XW, hstack([np.ones((shape(XW)[0], 1)), F, W]))
+                        F = self._featurizer.fit_transform(X) if fitting else self._featurizer.transform(X)
+                    return cross_product(XW, hstack([np.ones((shape(XW)[0], 1)), F]))
                 else:
-                    if X is None:
-                        X = np.ones((n_samples, 1))
-                    if W is None:
-                        W = np.empty((n_samples, 0))
-                    return hstack([X, W])
+                    return XW
 
             def fit(self, X, W, Target, sample_weight=None):
                 if (not self._is_Y) and discrete_treatment:
@@ -221,12 +223,21 @@ def __init__(self):
                 self._model = clone(model_final, safe=False)
                 self._featurizer = clone(featurizer, safe=False)
 
+            def _combine(self, X, T, fitting=True):
+                if X is not None:
+                    if self._featurizer is not None:
+                        F = self._featurizer.fit_transform(X) if fitting else self._featurizer.transform(X)
+                    else:
+                        F = X
+                else:
+                    F = np.ones((T.shape[0], 1))
+                return cross_product(F, T)
+
             def fit(self, X, T_res, Y_res, sample_weight=None, sample_var=None):
                 # Track training dimensions to see if Y or T is a vector instead of a 2-dimensional array
                 self._d_t = shape(T_res)[1:]
                 self._d_y = shape(Y_res)[1:]
-                F = self._featurizer.fit_transform(X) if X is not None else np.ones((T_res.shape[0], 1))
-                fts = cross_product(F, T_res)
+                fts = self._combine(X, T_res)
                 if sample_weight is not None:
                     if sample_var is not None:
                         self._model.fit(fts,
@@ -246,9 +257,9 @@ def fit(self, X, T_res, Y_res, sample_weight=None, sample_var=None):
                     self._intercept = intercept
 
             def predict(self, X):
-                F = self._featurizer.transform(X) if X is not None else np.ones((1, 1))
-                F, T = broadcast_unit_treatments(F, self._d_t[0] if self._d_t else 1)
-                prediction = self._model.predict(cross_product(F, T))
+                X2, T = broadcast_unit_treatments(X if X is not None else np.empty((1, 0)),
+                                                  self._d_t[0] if self._d_t else 1)
+                prediction = self._model.predict(self._combine(None if X is None else X2, T, fitting=False))
                 if self._intercept is not None:
                     prediction -= self._intercept
                 return reshape_treatmentwise_effects(prediction,
@@ -292,10 +303,13 @@ class LinearDMLCateEstimator(StatsModelsCateEstimatorMixin, DMLCateEstimator):
         The estimator for fitting the treatment to the features. Must implement
         `fit` and `predict` methods.
 
-    featurizer: transformer, optional (default is \
-        :class:`PolynomialFeatures(degree=1, include_bias=True) <sklearn.preprocessing.PolynomialFeatures>`)
-        The transformer used to featurize the raw features when fitting the final model.  Must implement
-        a `fit_transform` method.
+    featurizer : :term:`transformer`, optional, default None
+        Must support fit_transform and transform. Used to create composite features in the final CATE regression.
+        It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
+        If featurizer=None, then CATE is trained on X.
+
+    fit_cate_intercept : bool, optional, default True
+        Whether the linear CATE model should have a constant term.
 
     linear_first_stages: bool
         Whether the first stage models are linear (in which case we will expand the features passed to
@@ -330,14 +344,15 @@ class LinearDMLCateEstimator(StatsModelsCateEstimatorMixin, DMLCateEstimator):
 
     def __init__(self,
                  model_y=LassoCV(), model_t=LassoCV(),
-                 featurizer=PolynomialFeatures(degree=1, include_bias=True),
+                 featurizer=None,
+                 fit_cate_intercept=True,
                  linear_first_stages=True,
                  discrete_treatment=False,
                  n_splits=2,
                  random_state=None):
         super().__init__(model_y=model_y,
                          model_t=model_t,
-                         model_final=StatsModelsLinearRegression(fit_intercept=False),
+                         model_final=StatsModelsLinearRegression(fit_intercept=fit_cate_intercept),
                          featurizer=featurizer,
                          linear_first_stages=linear_first_stages,
                          discrete_treatment=discrete_treatment,
@@ -410,10 +425,13 @@ class SparseLinearDMLCateEstimator(DebiasedLassoCateEstimatorMixin, DMLCateEstim
         dual gap for optimality and continues until it is smaller
         than ``tol``.
 
-    featurizer: transformer, optional
-    (default is :class:`PolynomialFeatures(degree=1, include_bias=True) <sklearn.preprocessing.PolynomialFeatures>`)
-        The transformer used to featurize the raw features when fitting the final model.  Must implement
-        a `fit_transform` method.
+    featurizer : :term:`transformer`, optional, default None
+        Must support fit_transform and transform. Used to create composite features in the final CATE regression.
+        It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
+        If featurizer=None, then CATE is trained on X.
+
+    fit_cate_intercept : bool, optional, default True
+        Whether the linear CATE model should have a constant term.
 
     linear_first_stages: bool
         Whether the first stage models are linear (in which case we will expand the features passed to
@@ -450,7 +468,8 @@ def __init__(self,
                  alpha='auto',
                  max_iter=1000,
                  tol=1e-4,
-                 featurizer=PolynomialFeatures(degree=1, include_bias=True),
+                 featurizer=None,
+                 fit_cate_intercept=True,
                  linear_first_stages=True,
                  discrete_treatment=False,
                  n_splits=2,
diff --git a/econml/drlearner.py b/econml/drlearner.py
@@ -114,15 +114,15 @@ class takes as input the parameter `model_regressor``, which is an arbitrary sci
           mono-task model and a separate clone of the model is trained for each outcome. Then predict(X) of the t-th
           clone will be the CATE of the t-th lexicographically ordered treatment compared to the baseline.
 
-    multitask_model_final : optional bool (default=False)
+    multitask_model_final : bool, optional, default False
         Whether the model_final should be treated as a multi-task model. See description of model_final.
 
-    featurizer : sklearn featurizer or None
+    featurizer : :term:`transformer`, optional, default None
         Must support fit_transform and transform. Used to create composite features in the final CATE regression.
         It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
         If featurizer=None, then CATE is trained on X.
 
-    n_splits: int, cross-validation generator or an iterable, optional (Default=2)
+    n_splits: int, cross-validation generator or an iterable, optional, default 2
         Determines the cross-validation splitting strategy.
         Possible inputs for cv are:
 
@@ -535,15 +535,15 @@ class LinearDRLearner(StatsModelsCateEstimatorDiscreteMixin, DRLearner):
         `predict` methods. If different models per treatment arm are desired, see the
         :class:`~econml.utilities.MultiModelWrapper` helper class.
 
-    featurizer : sklearn featurizer or None
+    featurizer : :term:`transformer`, optional, default None
         Must support fit_transform and transform. Used to create composite features in the final CATE regression.
         It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
         If featurizer=None, then CATE is trained on X.
 
-    fit_cate_intercept : bool, optional (Default=True)
+    fit_cate_intercept : bool, optional, default True
         Whether the linear CATE model should have a constant term.
 
-    n_splits: int, cross-validation generator or an iterable, optional (Default=2)
+    n_splits: int, cross-validation generator or an iterable, optional, default 2
         Determines the cross-validation splitting strategy.
         Possible inputs for cv are:
 
@@ -711,28 +711,28 @@ class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner):
         `predict` methods. If different models per treatment arm are desired, see the
         :class:`~econml.utilities.MultiModelWrapper` helper class.
 
-    featurizer : sklearn featurizer or None
+    featurizer : :term:`transformer`, optional, default None
         Must support fit_transform and transform. Used to create composite features in the final CATE regression.
         It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
         If featurizer=None, then CATE is trained on X.
 
-    fit_cate_intercept : bool, optional (Default=True)
+    fit_cate_intercept : bool, optional, default True
         Whether the linear CATE model should have a constant term.
 
-    alpha: string | float, optional. Default='auto'.
+    alpha: string | float, optional, default 'auto'
         CATE L1 regularization applied through the debiased lasso in the final model.
         'auto' corresponds to a CV form of the :class:`DebiasedLasso`.
 
-    max_iter : int, optional, default=1000
+    max_iter : int, optional, default 1000
         The maximum number of iterations in the Debiased Lasso
 
-    tol : float, optional, default=1e-4
+    tol : float, optional, default 1e-4
         The tolerance for the optimization: if the updates are
         smaller than ``tol``, the optimization code checks the
         dual gap for optimality and continues until it is smaller
         than ``tol``.
 
-    n_splits: int, cross-validation generator or an iterable, optional (Default=2)
+    n_splits: int, cross-validation generator or an iterable, optional, default 2
         Determines the cross-validation splitting strategy.
         Possible inputs for cv are: