-
Notifications
You must be signed in to change notification settings - Fork 199
/
Copy pathdobson.py
382 lines (296 loc) · 11.9 KB
/
dobson.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
'''This module provides Python/statsmodel solutions to all code examples in
Dobson AJ & Barnett AG: "An Introduction to Generalized Linear Models"
3rd ed
CRC Press(2008)
Points that still need to be done are marked with "tbd" below:
- [Unclear] Clarify the exact definition of "loglog" and "cloglog" in the different
languages.
- [Unclear] in "senility_and_WAIS" I don't understand what the "grouped response"
is supposed to mean
- [Missing 1] "ordinal_logistic_regression" is not yet implemented in statsmodels
- [Missing 2] Cox proportional hazards are not yet implemented in statsmodels
- [Missing 3] Repeated measures models are not yet implemented in statsmodels
author: thomas haslwanter
date: may 2013
ver: 0.21
'''
# Standard libraries
import numpy as np
import pandas as pd
import statsmodels.api as sm
import patsy
from statsmodels.stats.api import anova_lm
from statsmodels.formula.api import glm, ols
from statsmodels.genmod.families import Poisson, Binomial, Gaussian, links
# for data import
import urllib2
import zipfile
from StringIO import StringIO
def get_data(inFile):
    '''Get data from original Excel-file'''
    # Open the workbook and pull "Sheet1", skipping the two header rows
    # that precede the column names.
    workbook = pd.ExcelFile(inFile)
    return workbook.parse('Sheet1', skiprows=2)
def regression():
    '''Poisson regression example
    chapter 4.4, p.69'''

    # Get the example data (local copy of the book's dataset)
    inFile = r'GLM_data/Table 4.3 Poisson regression.xls'
    df = get_data(inFile)

    # Fit a Poisson GLM with an identity link, matching Dobson's example.
    # NOTE(review): passing the link class (links.identity) worked in the
    # statsmodels of that era; newer releases expect an instance
    # (links.identity()) - confirm against the installed version.
    p = glm('y~x', family=Poisson(links.identity), data=df)

    # print(...) with a single argument is valid in both Python 2 and 3
    print(p.fit().summary())
def multiple_linear_regression():
    '''Multiple linear regression
    chapter 6.3, p. 98'''

    # Get the example data
    inFile = r'GLM_data/Table 6.3 Carbohydrate diet.xls'
    df = get_data(inFile)

    # Do the fit, for the original model ...
    model = ols('carbohydrate ~ age + weight + protein', data=df).fit()
    print(model.summary())
    print(anova_lm(model))

    # ... and the same model as a GLM with Gaussian errors
    p = glm('carbohydrate ~ age + weight + protein',
            family=Gaussian(), data=df).fit()
    print('Same model, calculated with GLM')
    # The confidence intervals are different than those from OLS.
    # The reason (from Nathaniel Smith):
    # OLS uses a method that gives exact results, but only works in the special
    # case where all the usual OLS criteria apply - iid Gaussian noise etc. GLM
    # instead uses an approximate method which is correct asymptotically but may
    # be off for small samples; the tradeoff you get in return is that this method
    # works the same way for all GLM models, including those with non-Gaussian
    # error terms and non-trivial link functions. So that's why they're different.
    print(p.summary())

    # ... and for the reduced model 1 (without "age")
    model1 = ols('carbohydrate ~ weight + protein', data=df).fit()
    print(model1.summary())
    print(anova_lm(model1))
def anova():
    '''ANOVA
    chapter 6.4, p. 108, and p. 113
    GLM does not work with anova_lm.
    '''

    # Get the example data
    inFile = r'GLM_data/Table 6.6 Plant experiment.xls'
    df = get_data(inFile)

    # Fit the model as a GLM (p 109) ...
    p = glm('weight~group', family=Gaussian(), data=df)
    print(p.fit().summary())

    # ... and as OLS, which is what anova_lm accepts
    print('-'*65)
    print('OLS')
    model = ols('weight~group', data=df)
    print(model.fit().summary())
    print(anova_lm(model.fit()))

    # The model corresponding to the null hypothesis of no treatment effect is
    model0 = ols('weight~1', data=df)

    # Get the data for the two-factor ANOVA (p 113)
    inFile = r'GLM_data/Table 6.9 Two-factor data.xls'
    df = get_data(inFile)

    # adjust the header names from the Excel-file
    df.columns = ['A', 'B', 'data']

    # two-factor anova, with interactions
    ols_int = ols('data~A*B', data=df)
    anova_lm(ols_int.fit())

    # The python commands for the other four models are shown but not
    # printed; they exist to mirror the book's model hierarchy.
    ols_add = ols('data~A+B', data=df)
    ols_A = ols('data~A', data=df)
    ols_B = ols('data~B', data=df)
    ols_mean = ols('data~1', data=df)
def ancova():
    ''' ANCOVA
    chapter 6.5, p 117 '''

    # Get the example data
    inFile = r'GLM_data/Table 6.12 Achievement scores.xls'
    df = get_data(inFile)

    # Fit a model with the treatment ("method") plus the covariate ("x")
    model = ols('y~x+method', data=df).fit()
    print(anova_lm(model))
    print(model.summary())
def logistic_regression():
    '''Logistic regression example
    chapter 7.3, p 130
    [tbd]: the cloglog values are inconsistent with those mentioned in the book.
    This is probably due to the specific definitions of "loglog" and "cloglog"
    in the respective languages.
    '''

    inFile = r'GLM_data/Table 7.2 Beetle mortality.xls'
    df = get_data(inFile)

    # adjust the unusual column names in the Excel file
    colNames = [name.split(',')[1].lstrip() for name in df.columns.values]
    df.columns = colNames

    # Set up explicit success/failure counts for the binomial fit
    df['tested'] = df['n']
    df['killed'] = df['y']
    df['survived'] = df['tested'] - df['killed']

    # Binomial GLM on (successes, failures); default link is the logit
    model = glm('survived + killed ~ x', data=df, family=Binomial()).fit()
    print(model.summary())

    print('-'*65)
    print('Equivalent solution:')

    # Same fit, computing the failure count inline with patsy's I(...)
    model = glm('I(n - y) + y ~ x', data=df, family=Binomial()).fit()
    print(model.summary())

    # The fitted number of survivors can be obtained by
    fits = df['n']*(1-model.fittedvalues)
    print('Fits Logit:')
    print(fits)

    # The fits for other link functions are:
    model_probit = glm('I(n - y) + y ~ x', data=df, family=Binomial(links.probit)).fit()
    print(model_probit.summary())

    fits_probit = df['n']*(1-model_probit.fittedvalues)
    print('Fits Probit:')
    print(fits_probit)

    model_cll = glm('I(n - y) + y ~ x', data=df, family=Binomial(links.cloglog)).fit()
    print(model_cll.summary())
    fits_cll = df['n']*(1-model_cll.fittedvalues)
    print('Fits Extreme Value:')
    print(fits_cll)
def general_logistic_regression():
    '''Example General Logistic Recression,
    Example 7.4.1, p. 135'''

    # Get the data
    inFile = r'GLM_data/Table 7.5 Embryogenic anthers.xls'
    df = get_data(inFile)

    # Define the variables so that they match Dobson
    df['n_y'] = df['n'] - df['y']
    df['newstor'] = df['storage']-1
    df['x'] = np.log(df['centrifuge'])

    # Model 1: interaction between storage and log(centrifuge)
    model1 = glm('n_y + y ~ newstor*x', data=df, family=Binomial()).fit()
    print(model1.summary())

    # Model 2: additive effects only
    model2 = glm('n_y + y ~ newstor+x', data=df, family=Binomial()).fit()
    print(model2.summary())

    # Model 3: log(centrifuge) alone
    model3 = glm('n_y + y ~ x', data=df, family=Binomial()).fit()
    print(model3.summary())
def senility_and_WAIS():
    '''Another example of logistic regression.
    chapter 7.8, p 143
    [tbd]: I don't understand how the "Binomial model" (grouped response)
    is supposed to work, in either language'''

    inFile = r'GLM_data/Table 7.8 Senility and WAIS.xls'
    df = get_data(inFile)

    # ungrouped: one Bernoulli observation per row
    model = glm('s ~ x', data=df, family=Binomial()).fit()
    print(model.summary())

    # Hosmer-Lemeshow
    # grouped: Here I don't get how the grouping is supposed to be achieved,
    # either in R or in Python
    # [tbd]
def nominal_logistic_regression():
    '''Nominal Logistic Regression
    chapter 8.3, p. 155

    At this point, nominal logistic regression cannot be done with the formula approach.
    Regarding the output, note that R produces log(pi2/pi1) and log(pi3/pi1), while
    statsmodels produces log(pi2/pi1) and log(pi3/pi2)
    '''

    # Get the data
    inFile = r'GLM_data/Table 8.1 Car preferences.xls'
    df = get_data(inFile)

    # to make sure that "women" and "no/little" are the reference,
    # adjust them such that they come first alphabetically.
    # Use .loc instead of chained indexing (df[col][mask] = ...), which
    # assigns to a temporary copy and is unreliable in pandas.
    df.loc[df['response'] == 'no/little', 'response'] = '_no/little'
    df.loc[df['sex'] == 'women', 'sex'] = '_women'
    print(df)

    # Generate the design matrices using patsy
    pm = patsy.dmatrices('response~sex+age', data=df)

    # Generate the endog and exog matrices, expanding each row by its
    # observed frequency so MNLogit sees one record per subject
    endog = np.repeat(np.array(df['response']), df['frequency'].values.astype(int), axis=0)
    exog = np.array(np.repeat(pm[1], df['frequency'].values.astype(int), axis=0))
    exog = pd.DataFrame(exog, columns=pm[1].design_info.column_names)

    # Fit the model, and print the summary
    model = sm.MNLogit(endog, exog, method='nm').fit()
    print(model.summary())
def ordinal_logistic_regression_tbd():
    '''Ordinal Logistic Regression
    chapter 8.4, p161
    This function is not implemented in statsmodels yet. One solution can be found at
    http://fabianp.net/blog/2013/logistic-ordinal-regression/
    '''

    # Placeholder: only loads the dataset; the fit itself is still missing.
    inFile = r'GLM_data/Table 8.1 Car preferences.xls'
    df = get_data(inFile)
def poisson_regression():
    '''Poisson Regression
    chapter 9.2, p.170 & 171 '''

    inFile = r"GLM_data/Table 9.1 British doctors' smoking and coronary death.xls"
    df = get_data(inFile)
    print(df)

    # Generate the required variables.
    # Use .loc instead of chained indexing (df[col][mask] = ...), which
    # assigns to a temporary copy and is unreliable in pandas.
    df['smoke'] = np.zeros(len(df))
    df.loc[df['smoking'] == 'smoker', 'smoke'] = 1

    df['agecat'] = np.array([1,2,3,4,5,1,2,3,4,5])
    df['agesq'] = df['agecat']**2

    # smoke/age interaction term: the age category for smokers, 0 otherwise
    df['smkage'] = df['agecat']
    df.loc[df['smoking'] == 'non-smoker', 'smkage'] = 0

    # Poisson rate model: "person-years" enters as the exposure
    model = glm('deaths~agecat+agesq+smoke+smkage',
                family=Poisson(), data=df,
                exposure=df["person-years"]).fit()
    print(model.summary())
def log_linear_models():
    '''Log-linear models
    chapter 9.7, p 180 & 182 '''

    # Malignant melanoma, p 180 --------------------------------
    inFile = r'GLM_data/Table 9.4 Malignant melanoma.xls'
    df = get_data(inFile)

    # Minimal model
    model_min = glm('frequency~1', family = Poisson(), data=df).fit()
    print('Malignant melanoma')
    print(model_min.fittedvalues[0])

    # Additive model
    model_add = glm('frequency~site+type', family = Poisson(), data=df).fit()
    print(model_add.fittedvalues[0])

    # Saturated model
    # model_sat = glm('frequency~site*type', family = Poisson(), data=df).fit()
    #
    # The saturated model gives a perfect fit, and the fitted data are equal to
    # the original data. Statsmodels indicates a "PerfectSeparationError"

    # Ulcer and aspirin, p. 182 -------------------------------------
    inFile = r'GLM_data/Table 9.7 Ulcer and aspirin use.xls'
    df = get_data(inFile)
    df.columns = ['GD', 'CC', 'AP', 'freq']

    # Nested sequence of log-linear models, from the book's hierarchy
    model1 = glm('freq~GD+CC+GD*CC', family = Poisson(), data=df).fit()
    model2 = glm('freq~GD+CC+GD*CC + AP', family = Poisson(), data=df).fit()
    model3 = glm('freq~GD+CC+GD*CC + AP + AP*CC', family = Poisson(), data=df).fit()
    model4 = glm('freq~GD+CC+GD*CC + AP + AP*CC + AP*GD', family = Poisson(), data=df).fit()

    print('Ulcer and aspirin')
    print(model4.fittedvalues)
def remission_times_tbd():
    '''Survival analysis / Remission times
    chapter 10.7, p. 201
    These models, also known as "Cox proportional hazards model",
    are currently under development but not yet available in statsmodels.'''

    # Placeholder: only loads and shows the dataset.
    inFile = r'GLM_data/Table 10.1 Remission times.xls'
    df = get_data(inFile)
    print(df)
def longitudinal_data_tbd():
    '''Stroke example
    chapter 11.6, p. 222
    Clustered and Longitudinal Data are described by repeated measures models.
    These are under development, but not yet available in statsmodels.'''

    # Placeholder: only loads and shows the dataset.
    inFile = r'GLM_data/Table 11.1 Recovery from stroke.xls'
    df = get_data(inFile)
    print(df)
if __name__ == '__main__':
    # Run every example exactly once. The original script invoked nine of
    # these functions a second time in a duplicated call list.
    regression()
    multiple_linear_regression()
    anova()
    ancova()
    logistic_regression()
    general_logistic_regression()
    senility_and_WAIS()
    nominal_logistic_regression()
    ordinal_logistic_regression_tbd()
    poisson_regression()
    log_linear_models()
    remission_times_tbd()
    longitudinal_data_tbd()