Added SparseLinearDRLearner #167

Merged
7 commits merged on Nov 18, 2019
39 changes: 24 additions & 15 deletions econml/cate_estimator.py
@@ -11,7 +11,8 @@
from .bootstrap import BootstrapEstimator
from .inference import BootstrapInference
from .utilities import tensordot, ndim, reshape, shape
from .inference import StatsModelsInference, StatsModelsInferenceDiscrete, LinearModelFinalInference
from .inference import StatsModelsInference, StatsModelsInferenceDiscrete, LinearModelFinalInference,\
LinearModelFinalInferenceDiscrete


class BaseCateEstimator(metaclass=abc.ABCMeta):
@@ -415,20 +416,9 @@ def _get_inference_options(self):
return options


class StatsModelsCateEstimatorDiscreteMixin(BaseCateEstimator):
class LinearModelFinalCateEstimatorDiscreteMixin(BaseCateEstimator):
# TODO Create parent StatsModelsCateEstimatorMixin class so that some functionality can be shared

def _get_inference_options(self):
# add statsmodels to parent's options
options = super()._get_inference_options()
options.update(statsmodels=StatsModelsInferenceDiscrete)
return options

@property
@abc.abstractmethod
def statsmodels(self):
pass

def coef_(self, T):
""" The coefficients in the linear model of the constant marginal treatment
effect associated with treatment T.
@@ -447,7 +437,7 @@ def coef_(self, T):
"""
_, T = self._expand_treatments(None, T)
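# T is one-hot encoded, so the inner product with arange recovers the treatment's column index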
ind = (T @ np.arange(T.shape[1])).astype(int)[0]
return self.statsmodels_fitted[ind].coef_
return self.fitted_models_final[ind].coef_

def intercept_(self, T):
""" The intercept in the linear model of the constant marginal treatment
@@ -464,7 +454,7 @@ def intercept_(self, T):
"""
_, T = self._expand_treatments(None, T)
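# the inner product with arange(1, d_t + 1) gives the treatment level; subtract 1 for a zero-based index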
ind = (T @ np.arange(1, T.shape[1] + 1)).astype(int)[0] - 1
return self.statsmodels_fitted[ind].intercept_
return self.fitted_models_final[ind].intercept_

@BaseCateEstimator._defer_to_inference
def coef__interval(self, T, *, alpha=0.1):
@@ -505,3 +495,22 @@ def intercept__interval(self, T, *, alpha=0.1):
The lower and upper bounds of the confidence interval.
"""
pass


class StatsModelsCateEstimatorDiscreteMixin(LinearModelFinalCateEstimatorDiscreteMixin):
# TODO Create parent StatsModelsCateEstimatorMixin class so that some functionality can be shared

def _get_inference_options(self):
# add statsmodels to parent's options
options = super()._get_inference_options()
options.update(statsmodels=StatsModelsInferenceDiscrete)
return options


class DebiasedLassoCateEstimatorDiscreteMixin(LinearModelFinalCateEstimatorDiscreteMixin):

def _get_inference_options(self):
# add debiasedlasso to parent's options
options = super()._get_inference_options()
options.update(debiasedlasso=LinearModelFinalInferenceDiscrete)
return options
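
For orientation, the mixins above each register a named inference option: at fit time, the string passed as ``inference=...`` is looked up in the dict returned by `_get_inference_options` and resolved to an inference class. A minimal, self-contained sketch of this pattern (toy class names throughout, not econml's actual plumbing):

class ToyInference:
    def fit(self, estimator):
        # a real Inference object would compute standard errors here
        print("fitting", type(self).__name__, "for", type(estimator).__name__)

class ToyBootstrapInference(ToyInference):
    pass

class ToyDebiasedLassoInference(ToyInference):
    pass

class ToyBaseEstimator:
    def _get_inference_options(self):
        # options every estimator supports
        return {'bootstrap': ToyBootstrapInference}

    def fit(self, inference=None):
        options = self._get_inference_options()
        if isinstance(inference, str):
            # resolve the option name to an inference instance
            inference = options[inference]()
        if inference is not None:
            inference.fit(self)
        return self

class ToyDebiasedLassoMixin(ToyBaseEstimator):
    def _get_inference_options(self):
        # each mixin layers its own named option on top of its parent's
        options = super()._get_inference_options()
        options.update(debiasedlasso=ToyDebiasedLassoInference)
        return options

ToyDebiasedLassoMixin().fit(inference='debiasedlasso')
# prints: fitting ToyDebiasedLassoInference for ToyDebiasedLassoMixin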
235 changes: 229 additions & 6 deletions econml/drlearner.py
@@ -28,12 +28,13 @@
"""

import numpy as np
from warnings import warn
from sklearn.linear_model import LogisticRegressionCV, LinearRegression, LassoCV
from econml.utilities import inverse_onehot
from econml.sklearn_extensions.linear_model import WeightedLassoCV
from econml.sklearn_extensions.linear_model import WeightedLassoCV, DebiasedLasso
from sklearn.base import clone
from econml._ortho_learner import _OrthoLearner
from econml.cate_estimator import StatsModelsCateEstimatorDiscreteMixin
from econml.cate_estimator import StatsModelsCateEstimatorDiscreteMixin, DebiasedLassoCateEstimatorDiscreteMixin
from econml.utilities import StatsModelsLinearRegression
from sklearn.preprocessing import PolynomialFeatures

@@ -539,6 +540,9 @@ class LinearDRLearner(StatsModelsCateEstimatorDiscreteMixin, DRLearner):
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
If featurizer=None, then CATE is trained on X.

fit_cate_intercept : bool, optional (Default=True)
Whether the linear CATE model should have a constant term.

n_splits: int, cross-validation generator or an iterable, optional (Default=2)
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
@@ -656,9 +660,228 @@ def multitask_model_cate(self):
return super().multitask_model_cate

@property
def statsmodels(self):
return self.model_final._model_final
def model_final(self):
return super().model_final._model_final

@property
def fitted_models_final(self):
return super().model_final.models_cate


class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner):
"""
Special case of the :class:`~econml.drlearner.DRLearner` where the final stage
is a Debiased Lasso Regression. In this case, inference can be performed via the debiased lasso approach
and its asymptotic normal characterization of the estimated parameters. This is computationally
faster than bootstrap inference. Set ``inference='debiasedlasso'`` at fit time to enable inference
via asymptotic normality.

More concretely, this estimator assumes that the final CATE model for each treatment takes a linear form:

.. math ::
\\theta_t(X) = \\left\\langle \\theta_t, \\phi(X) \\right\\rangle + \\beta_t

where :math:`\\phi(X)` is the output features of the featurizer, or `X` if featurizer is None. :math:`\\beta_t`
is the intercept of the CATE, included if ``fit_cate_intercept=True`` (the default). The estimator fits this
model by running a debiased lasso regression (an :math:`\\ell_1`-penalized regression with a debiasing
correction), regressing the doubly robust outcome differences on X: it first solves the penalized square loss problem

.. math ::
\\min_{\\theta_t, \\beta_t}\
E_n\\left[\\left(Y_{i, t}^{DR} - Y_{i, 0}^{DR}\
- \\left\\langle \\theta_t, \\phi(X_i) \\right\\rangle - \\beta_t\\right)^2\\right]\
+ \\lambda \\left\\lVert \\theta_t \\right\\rVert_1

and then adds a debiasing correction to the solution. If alpha='auto' (recommended), then the penalty
weight :math:`\\lambda` is set optimally via cross-validation.

This approach is valid even if the CATE model is not linear in :math:`\\phi(X)`. In this case it performs
inference on the best sparse linear approximation of the CATE model (a step-by-step sketch of this final
stage appears after the example below).

Parameters
----------
model_propensity : scikit-learn classifier
Estimator for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated.
Must implement `fit` and `predict_proba` methods. The `fit` method must be able to accept X and T,
where T is a shape (n, ) array.

model_regression : scikit-learn regressor
Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments)
concatenated. The one-hot-encoding excludes the baseline treatment. Must implement `fit` and
`predict` methods. If different models per treatment arm are desired, see the
:class:`~econml.utilities.MultiModelWrapper` helper class.

featurizer : sklearn featurizer or None
Must support fit_transform and transform. Used to create composite features in the final CATE regression.
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
If featurizer=None, then CATE is trained on X.

fit_cate_intercept : bool, optional (Default=True)
Whether the linear CATE model should have a constant term.

alpha: string | float, optional. Default='auto'.
CATE L1 regularization applied through the debiased lasso in the final model.
'auto' corresponds to a CV form of the :class:`DebiasedLasso`.

max_iter : int, optional, default=1000
The maximum number of iterations in the Debiased Lasso

tol : float, optional, default=1e-4
The tolerance for the optimization: if the updates are
smaller than ``tol``, the optimization code checks the
dual gap for optimality and continues until it is smaller
than ``tol``.

n_splits: int, cross-validation generator or an iterable, optional (Default=2)
Determines the cross-validation splitting strategy.
Possible inputs for cv are:

- None, to use the default 3-fold cross-validation,
- integer, to specify the number of folds.
- :term:`CV splitter`
- An iterable yielding (train, test) splits as arrays of indices.

For integer/None inputs, if the treatment is discrete
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
:class:`~sklearn.model_selection.KFold` is used
(with a random shuffle in either case).

Unless an iterable is used, we call `split(X,T)` to generate the splits.

random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None
If int, random_state is the seed used by the random number generator;
If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
by :mod:`np.random<numpy.random>`.

Examples
--------
A simple example with the default models::

import numpy as np
import scipy.special
from econml.drlearner import DRLearner, SparseLinearDRLearner

np.random.seed(123)
X = np.random.normal(size=(1000, 3))
T = np.random.binomial(2, scipy.special.expit(X[:, 0]))
y = (1 + .5*X[:, 0]) * T + X[:, 0] + np.random.normal(size=(1000,))
est = SparseLinearDRLearner()
est.fit(y, T, X=X, W=None, inference='debiasedlasso')

>>> est.effect(X[:3])
array([ 0.46089994, 0.31993838, -0.07386204])
>>> est.effect_interval(X[:3])
(array([ 0.11912103, -0.16476616, -0.64903889]),
array([0.80267886, 0.80464292, 0.50131482]))
>>> est.coef_(T=1)
array([0.40984866, 0.02624624, 0.05320565])
>>> est.coef__interval(T=1)
(array([ 0.21365131, -0.15865431, -0.13733605]),
array([0.606046 , 0.21114678, 0.24374735]))
>>> est.intercept_(T=1)
0.864611566994208
>>> est.intercept__interval(T=1)
(0.6779174404370922, 1.0513056935513236)
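
For intuition, the final stage is roughly equivalent to the sketch below for a binary treatment.
This is an illustrative simplification: the names ``e``, ``mu`` and ``Y_dr`` are hypothetical,
the nuisance models are fit in-sample here whereas the estimator cross-fits them out of fold,
and the pseudo-outcomes follow the standard doubly robust (AIPW) construction::

import numpy as np
import scipy.special
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from econml.sklearn_extensions.linear_model import DebiasedLasso

np.random.seed(123)
X = np.random.normal(size=(2000, 10))
T = np.random.binomial(1, scipy.special.expit(X[:, 0]))
y = (1 + .5 * X[:, 0]) * T + X[:, 1] + np.random.normal(size=(2000,))

# first stage: nuisance models (the estimator cross-fits these)
e = LogisticRegression().fit(X, T)  # Pr[T=1 | X]
mu = {t: RandomForestRegressor(n_estimators=50).fit(X[T == t], y[T == t])
for t in (0, 1)}

# doubly robust pseudo-outcomes Y_{i, t}^{DR} for t = 0, 1
p1 = e.predict_proba(X)[:, 1]
p = {0: 1 - p1, 1: p1}
Y_dr = {t: mu[t].predict(X) + (T == t) * (y - mu[t].predict(X)) / p[t]
for t in (0, 1)}

# final stage: debiased lasso of the DR outcome difference on phi(X) = X
final = DebiasedLasso(alpha='auto', fit_intercept=True)
final.fit(X, Y_dr[1] - Y_dr[0])
final.coef_, final.intercept_  # sparse linear CATE coefficients and intercept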

Attributes
----------
score_ : float
The MSE in the final doubly robust potential outcome regressions, i.e.

.. math::
\\frac{1}{n_t} \\sum_{t=1}^{n_t} \\frac{1}{n} \\sum_{i=1}^n (Y_{i, t}^{DR} - \\hat{\\theta}_t(X_i))^2

where n_t is the number of treatments (excluding control).

If `sample_weight` is not None at fit time, then a weighted average across samples is returned.

"""

def __init__(self,
model_propensity=LogisticRegressionCV(cv=3, solver='lbfgs', multi_class='auto'),
model_regression=WeightedLassoCV(cv=3),
featurizer=None,
fit_cate_intercept=True,
alpha='auto',
max_iter=1000,
tol=1e-4,
n_splits=2, random_state=None):
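# The final CATE model is a debiased lasso; one clone is fit per non-baseline
# treatment arm (multitask_model_final=False below), which enables the
# per-treatment coef_ / intercept_ inference exposed by the mixin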
model_final = DebiasedLasso(
alpha=alpha,
fit_intercept=fit_cate_intercept,
max_iter=max_iter,
tol=tol)
super().__init__(model_propensity=model_propensity,
model_regression=model_regression,
model_final=model_final,
featurizer=featurizer,
multitask_model_final=False,
n_splits=n_splits,
random_state=random_state)

def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, inference=None):
"""
Estimate the counterfactual model from data, i.e. estimates the function :math:`\\theta(\\cdot)`.

Parameters
----------
Y: (n,) vector of length n
Outcomes for each sample
T: (n,) vector of length n
Treatments for each sample
X: optional(n, d_x) matrix or None (Default=None)
Features for each sample
W: optional(n, d_w) matrix or None (Default=None)
Controls for each sample
sample_weight: optional(n,) vector or None (Default=None)
Weights for each sample
sample_var: optional(n,) vector or None (Default=None)
Sample variance for each sample
inference: string, :class:`.Inference` instance, or None
Method for performing inference. This estimator supports ``'bootstrap'``
(or an instance of :class:`.BootstrapInference`) and ``'debiasedlasso'``
(or an instance of :class:`.LinearModelFinalInferenceDiscrete`).

Returns
-------
self: DRLearner instance
"""
# Replacing fit from DRLearner to document the debiasedlasso inference option
# TODO: support sample_var
if sample_var is not None and inference is not None:
warn("This estimator does not yet support sample variances and inference does not take "
"sample variances into account. This feature will be supported in a future release.")
self._check_sparsity(X, T)
return super().fit(Y, T, X=X, W=W, sample_weight=sample_weight, sample_var=None, inference=inference)

def _check_sparsity(self, X, T):
# Check that the final model has enough features to warrant a sparse model
if X is None:
d_x = 1
elif self.featurizer is None:
d_x = X.shape[1]
else:
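# featurize a single row just to learn the dimension of phi(X)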
d_x = clone(self.featurizer, safe=False).fit_transform(X[[0], :]).shape[1]
if self._discrete_treatment:
d_t = len(set(T.flatten())) - 1
else:
d_t = 1 if np.ndim(T) < 2 else T.shape[1]
if d_x * d_t < 5:
warn("The number of features in the final model (< 5) is too small for a sparse model. "
"We recommend using the LinearDMLCateEstimator for this low-dimensional setting.",
UserWarning)

@property
def multitask_model_cate(self):
# Overriding this property, which is invalid for this class, so that the
# docstring is empty and it does not appear in the docs.
return super().multitask_model_cate

@property
def model_final(self):
return super().model_final._model_final

@property
def statsmodels_fitted(self):
return self.model_final.models_cate
def fitted_models_final(self):
return super().model_final.models_cate