# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""
The R Learner is an approach for estimating flexible non-parametric models
of conditional average treatment effects in the setting with no unobserved confounders.
The method is based on the idea of Neyman orthogonality and estimates a CATE
whose mean squared error is robust to the estimation errors of auxiliary submodels
that also need to be estimated from data:
1) the outcome or regression model
2) the treatment or propensity or policy or logging policy model
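
Concretely, the final stage minimizes the residual-on-residual squared loss
(a sketch of the objective, matching the `score_` attribute documented below;
notation follows the Nie & Wager reference):

.. math::
    \\hat{\\theta} = \\arg\\min_{\\theta}
        \\frac{1}{n} \\sum_{i=1}^n \\left((Y_i - \\hat{E}[Y | X_i, W_i])
        - \\theta(X_i) \\cdot (T_i - \\hat{E}[T | X_i, W_i])\\right)^2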

References
----------

Xinkun Nie, Stefan Wager (2017). Quasi-Oracle Estimation of Heterogeneous Treatment Effects.
    https://arxiv.org/abs/1712.04912

Dylan Foster, Vasilis Syrgkanis (2019). Orthogonal Statistical Learning.
    ACM Conference on Learning Theory. https://arxiv.org/abs/1901.09036

Chernozhukov et al. (2017). Double/debiased machine learning for treatment and structural parameters.
    The Econometrics Journal. https://arxiv.org/abs/1608.00060
"""

import numpy as np
import copy
from warnings import warn
from .utilities import (shape, reshape, ndim, hstack, cross_product, transpose,
                        broadcast_unit_treatments, reshape_treatmentwise_effects,
                        StatsModelsLinearRegression, LassoCVWrapper)
from sklearn.model_selection import KFold, StratifiedKFold, check_cv
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.preprocessing import (PolynomialFeatures, LabelEncoder, OneHotEncoder,
                                   FunctionTransformer)
from sklearn.base import clone, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.utils import check_random_state
from .cate_estimator import (BaseCateEstimator, LinearCateEstimator,
                             TreatmentExpansionMixin, StatsModelsCateEstimatorMixin)
from .inference import StatsModelsInference
from ._ortho_learner import _OrthoLearner

class _RLearner(_OrthoLearner):
    """
    Base class for orthogonal learners that estimate a CATE via a residual-on-residual
    (R-learner) regression.

    Parameters
    ----------
    model_y: estimator of E[Y | X, W]
        The estimator for fitting the response to the features and controls. Must implement
        `fit` and `predict` methods. Unlike sklearn estimators, both methods must
        take an extra second argument (the controls), i.e. ::

            model_y.fit(X, W, Y, sample_weight=sample_weight)
            model_y.predict(X, W)

    model_t: estimator of E[T | X, W]
        The estimator for fitting the treatment to the features and controls. Must implement
        `fit` and `predict` methods. Unlike sklearn estimators, both methods must
        take an extra second argument (the controls), i.e. ::

            model_t.fit(X, W, T, sample_weight=sample_weight)
            model_t.predict(X, W)

    model_final: estimator for fitting the response residuals to the features and treatment residuals
        Must implement `fit` and `predict` methods. Unlike sklearn estimators, the fit method must
        take an extra second argument (the treatment residuals). Predict, on the other hand,
        should just take the features and return the constant marginal effect. More concretely ::

            model_final.fit(X, T_res, Y_res,
                            sample_weight=sample_weight, sample_var=sample_var)
            model_final.predict(X)

    discrete_treatment: bool
        Whether the treatment values should be treated as categorical, rather than continuous, quantities

    n_splits: int, cross-validation generator or an iterable
        Determines the cross-validation splitting strategy.
        Possible inputs for `n_splits` are:

        - None, to use the default 3-fold cross-validation,
        - integer, to specify the number of folds,
        - :term:`CV splitter`,
        - an iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if the treatment is discrete
        :class:`~sklearn.model_selection.StratifiedKFold` is used, else
        :class:`~sklearn.model_selection.KFold` is used
        (with a random shuffle in either case).

        Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
        W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
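
        For example, a caller could pass an explicit splitter instead of an integer
        (a hypothetical configuration, shown only for illustration)::

            from sklearn.model_selection import KFold

            splitter = KFold(n_splits=5, shuffle=True, random_state=123)
            est = _RLearner(model_y, model_t, model_final,
                            discrete_treatment=False, n_splits=splitter,
                            random_state=None)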

    random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None
        If int, random_state is the seed used by the random number generator;
        If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
        If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
        by `np.random`.

    Examples
    --------
    The example code below implements a very simple version of the double machine learning
    method on top of the :py:class:`~econml._rlearner._RLearner` class, for expository purposes.
    For a more elaborate implementation of a Double Machine Learning child class of the class,
    check out :py:class:`~econml.dml.DMLCateEstimator` and its child classes::

        import numpy as np
        from sklearn.linear_model import LinearRegression
        from econml._rlearner import _RLearner
        from sklearn.base import clone

        class ModelFirst:
            def __init__(self, model):
                self._model = clone(model, safe=False)

            def fit(self, X, W, Y, sample_weight=None):
                self._model.fit(np.hstack([X, W]), Y)
                return self

            def predict(self, X, W):
                return self._model.predict(np.hstack([X, W]))

        class ModelFinal:
            def fit(self, X, T_res, Y_res, sample_weight=None, sample_var=None):
                self.model = LinearRegression(fit_intercept=False).fit(X * T_res.reshape(-1, 1),
                                                                       Y_res)
                return self

            def predict(self, X):
                return self.model.predict(X)

        np.random.seed(123)
        X = np.random.normal(size=(1000, 3))
        y = X[:, 0] + X[:, 1] + np.random.normal(0, 0.01, size=(1000,))
        est = _RLearner(ModelFirst(LinearRegression()),
                        ModelFirst(LinearRegression()),
                        ModelFinal(),
                        n_splits=2, discrete_treatment=False, random_state=None)
        est.fit(y, X[:, 0], X=np.ones((X.shape[0], 1)), W=X[:, 1:])

    >>> est.const_marginal_effect(np.ones((1,1)))
    array([0.99963147])
    >>> est.effect(np.ones((1,1)), T0=0, T1=10)
    array([9.99631472])
    >>> est.score(y, X[:, 0], X=np.ones((X.shape[0], 1)), W=X[:, 1:])
    9.736380060274913e-05
    >>> est.model_final.model
    LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None,
                     normalize=False)
    >>> est.model_final.model.coef_
    array([0.99963147])
    >>> est.score_
    9.826232040878233e-05
    >>> [mdl._model for mdl in est.models_y]
    [LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                      normalize=False),
     LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                      normalize=False)]
    >>> [mdl._model for mdl in est.models_t]
    [LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                      normalize=False),
     LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                      normalize=False)]

    Attributes
    ----------
    models_y: list of objects of type(model_y)
        A list of instances of the model_y object. Each element corresponds to a crossfitting
        fold and is the model instance that was fitted for that training fold.

    models_t: list of objects of type(model_t)
        A list of instances of the model_t object. Each element corresponds to a crossfitting
        fold and is the model instance that was fitted for that training fold.

    model_final: object of type(model_final)
        An instance of the model_final object that was fitted after calling fit.

    score_: float
        The MSE in the final residual-on-residual regression, i.e.

        .. math::
            \\frac{1}{n} \\sum_{i=1}^n \\left((Y_i - \\hat{E}[Y | X_i, W_i])
                - \\hat{\\theta}(X_i) \\cdot (T_i - \\hat{E}[T | X_i, W_i])\\right)^2

        If `sample_weight` is not None at fit time, then a weighted average is returned. If the outcome Y
        is multidimensional, then the average of the MSEs for each dimension of Y is returned.
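
        For intuition, with a single outcome and a single continuous treatment this is just
        ``np.mean((Y_res - theta_hat(X) * T_res) ** 2)`` over the training data, where
        ``theta_hat`` stands for the fitted final model (a name used here only for illustration).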
"""
def __init__(self, model_y, model_t, model_final,
discrete_treatment, n_splits, random_state):
class ModelNuisance:
"""
Nuisance model fits the model_y and model_t at fit time and at predict time
calculates the residual Y and residual T based on the fitted models and returns
the residuals as two nuisance parameters.
"""
def __init__(self, model_y, model_t):
self._model_y = clone(model_y, safe=False)
self._model_t = clone(model_t, safe=False)
def fit(self, Y, T, X=None, W=None, Z=None, sample_weight=None):
assert Z is None, "Cannot accept instrument!"
self._model_t.fit(X, W, T, sample_weight=sample_weight)
self._model_y.fit(X, W, Y, sample_weight=sample_weight)
return self
def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None):
Y_pred = self._model_y.predict(X, W)
T_pred = self._model_t.predict(X, W)
if (X is None) and (W is None): # In this case predict above returns a single row
Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1))
T_pred = np.tile(T_pred.reshape(1, -1), (T.shape[0], 1))
Y_res = Y - Y_pred.reshape(Y.shape)
T_res = T - T_pred.reshape(T.shape)
return Y_res, T_res
class ModelFinal:
"""
Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient
that depends on X, i.e.
.. math ::
Y - E[Y | X, W] = \\theta(X) \\cdot (T - E[T | X, W]) + \\epsilon
and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final
residual on residual regression.
"""
def __init__(self, model_final):
self._model_final = clone(model_final, safe=False)
def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None):
Y_res, T_res = nuisances
self._model_final.fit(X, T_res, Y_res, sample_weight=sample_weight, sample_var=sample_var)
return self
def predict(self, X=None):
return self._model_final.predict(X)
def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, sample_var=None):
Y_res, T_res = nuisances
if Y_res.ndim == 1:
Y_res = Y_res.reshape((-1, 1))
if T_res.ndim == 1:
T_res = T_res.reshape((-1, 1))
effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1]))
Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape)
if sample_weight is not None:
return np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0))
else:
return np.mean((Y_res - Y_res_pred)**2)
super().__init__(ModelNuisance(model_y, model_t),
ModelFinal(model_final), discrete_treatment, n_splits, random_state)

    def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, inference=None):
        """
        Estimate the counterfactual model from data, i.e. estimate the function :math:`\\theta(\\cdot)`.

        Parameters
        ----------
        Y: (n, d_y) matrix or vector of length n
            Outcomes for each sample
        T: (n, d_t) matrix or vector of length n
            Treatments for each sample
        X: optional (n, d_x) matrix or None (Default=None)
            Features for each sample
        W: optional (n, d_w) matrix or None (Default=None)
            Controls for each sample
        sample_weight: optional (n,) vector or None (Default=None)
            Weights for each sample
        sample_var: optional (n,) vector or None (Default=None)
            Sample variance for each sample
        inference: string, `Inference` instance, or None
            Method for performing inference. This estimator supports 'bootstrap'
            (or an instance of `BootstrapInference`).

        Returns
        -------
        self: _RLearner instance
        """
        # Replacing fit from _OrthoLearner, to enforce Z=None and improve the docstring
        return super().fit(Y, T, X=X, W=W,
                           sample_weight=sample_weight, sample_var=sample_var, inference=inference)

    def score(self, Y, T, X=None, W=None):
        """
        Score the fitted CATE model on a new data set. Generates nuisance parameters
        for the new data set based on the fitted residual nuisance models created at fit time.
        It uses the mean prediction of the models fitted by the different crossfit folds, and
        then calculates the MSE of the final residual Y on residual T regression.

        If model_final does not have a score method, then this raises an :exc:`AttributeError`.

        Parameters
        ----------
        Y: (n, d_y) matrix or vector of length n
            Outcomes for each sample
        T: (n, d_t) matrix or vector of length n
            Treatments for each sample
        X: optional (n, d_x) matrix or None (Default=None)
            Features for each sample
        W: optional (n, d_w) matrix or None (Default=None)
            Controls for each sample

        Returns
        -------
        score: float
            The MSE of the final CATE model on the new data.
        """
        # Replacing score from _OrthoLearner, to enforce Z=None and improve the docstring
        return super().score(Y, T, X=X, W=W)

    @property
    def model_final(self):
        # The user-supplied final model, unwrapped from the internal ModelFinal adapter
        return super().model_final._model_final

    @property
    def models_y(self):
        # The fitted outcome nuisance models, one per crossfitting fold
        return [mdl._model_y for mdl in super().models_nuisance]

    @property
    def models_t(self):
        # The fitted treatment nuisance models, one per crossfitting fold
        return [mdl._model_t for mdl in super().models_nuisance]