-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfit_models.py
171 lines (144 loc) · 5.71 KB
/
fit_models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import CNN
import aws_funcs as af
import numpy as np
import pandas as pd
import os
class EmployModel(object):
    '''
    Trains a CNN on batched image arrays stored in S3, holds out the last
    batch file pair for validation, and saves the fitted weights locally
    and back to S3.

    NOTE(review): all of the heavy work (S3 download, fitting, weight
    upload) runs inside __init__, so constructing the object has network
    side effects.
    '''
    def __init__(self, model, X_file,
                 arr_bucket = 'ajfcapstonearrays',
                 weight_bucket = 'ajfcapstoneweights',
                 weights_filename = None, img_size = 50,
                 lr = 0.001, epochs = 25, batch_size = 250):
        '''
        input: model = factory callable that builds the model; invoked as
                       model(img_size=..., lr=...)
               X_file = base filename of independent variable files
               arr_bucket = S3 bucket to retrieve data arrays from
               weight_bucket = S3 bucket to save weights into
               weights_filename = base name to save trained model weights into
                                  (required; save_weights_local raises if
                                  left as None)
               img_size = size of images being fed into model
               lr = learning rate of model
               epochs = # of epochs to use
               batch_size = size of batches in each epoch
        '''
        self.X_file = X_file
        self.arr_bucket = arr_bucket
        self.weight_bucket = weight_bucket
        self.weights_filename = weights_filename
        self.img_size = img_size
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        # build the model instance directly (the original assigned the
        # factory to self.model and immediately overwrote it)
        self.model = model(img_size=self.img_size, lr=self.lr)
        self._b = af.connect_2_s3_bucket(self.arr_bucket)
        self.X_files = self.get_X_files()
        self.y_files = self.get_y_files()
        # hold out the last (X, y) file pair for validation
        self.X_train = self.X_files[:-1]
        self.y_train = self.y_files[:-1]
        self.X_test, self.y_test = self.get_test_files()
        self.num_batches = len(self.X_train)
        self.fit_model_batches()
        self._bw = af.connect_2_s3_bucket(self.weight_bucket)
        self.save_weights_local()
        self.save_weights_remote()

    def fit_model_batches(self):
        '''
        input: none
        output: none

        Fits the model iteratively, one (X, y) batch file pair at a time,
        so training sets larger than memory can be used.
        '''
        for i, (X_file, y_file) in enumerate(zip(self.X_train,
                                                 self.y_train)):
            # i + 1 so the progress message is 1-based ("batch 1 of N",
            # not "batch 0 of N"); print() call form is valid in both
            # Python 2 and 3 for a single argument
            print('fitting batch {} of {}:\n X = {}, y = {}'.format(
                i + 1, self.num_batches, X_file, y_file))
            X, y = af.get_Xy_data(X_file, y_file, bucket=self.arr_bucket)
            # nb_epoch is the Keras 1.x keyword this project targets
            self.model.fit(X, y, nb_epoch=self.epochs,
                           batch_size=self.batch_size)
        return

    def get_X_files(self):
        '''
        output: sorted list of S3 key names for the independent variable
        (image array) files whose names contain self.X_file
        '''
        # sorted() makes batch order -- and therefore which batch is held
        # out for validation -- deterministic regardless of listing order
        return sorted(f.name for f in self._b.list()
                      if self.X_file in f.name)

    def get_y_files(self):
        '''
        output: list of dependent variable (category array) file names,
        one per entry in self.X_files
        '''
        return [self.get_y_filename(f) for f in self.X_files]

    def get_test_files(self):
        '''
        output: (X_test, y_test) arrays from the held-out last batch;
        y_test is collapsed from one-hot rows to integer class labels
        '''
        X_test, y_test = af.get_Xy_data(self.X_files[-1],
                                        self.y_files[-1],
                                        bucket=self.arr_bucket)
        y_test = self.reshape_y_test_arr(y_test)
        return X_test, y_test

    def get_y_filename(self, x_filename):
        '''
        INPUT: x_filename with X training/testing data
        OUTPUT: y_filename with y data corresponding to x_filename data

        Relies on the naming convention '<prefix>_X_<rest>': the second
        underscore-delimited token is swapped from 'X' to 'y'.
        '''
        parts = x_filename.split('_')
        parts[1] = 'y'
        return '_'.join(parts)

    def save_weights_local(self):
        '''
        output: None

        Saves the fitted model's weights into the local 'weights' folder
        as '<weights_filename>.h5'.

        raises: ValueError if weights_filename was not supplied -- the
        original failed here with an opaque TypeError (None + str) after
        the entire training run had already completed.
        '''
        if self.weights_filename is None:
            raise ValueError('weights_filename is required to save weights')
        self.model.save_weights('weights/' + self.weights_filename + '.h5')
        return

    def save_weights_remote(self):
        '''
        input: none (uploads into the bucket opened as self._bw)
        output: None

        Copies every file in the local 'weights' folder into the S3
        weights bucket, keyed by its bare filename.
        '''
        for fname in os.listdir('weights'):
            path = os.path.join('weights', fname)
            key = self._bw.new_key(fname)
            key.set_contents_from_filename(path)
        return

    def reshape_y_test_arr(self, y_arr):
        '''
        input: y_arr = one-hot y test array shaped (n, 5)
        output: flat integer array shaped (n,) of class labels 0-4
                (the original docstring claimed (n, 1); the returned
                array has always been 1-D)
        '''
        # For one-hot rows, argmax over axis 1 is exactly the original
        # pandas column-weighting + row-sum - 1 computation.
        return np.argmax(y_arr, axis=1)

    def test_probabilities(self):
        '''
        output: (n, n_classes) array of per-class probabilities for each
        image in the held-out validation set
        '''
        return self.model.predict_proba(self.X_test)

    def test_classifications(self):
        '''
        output: most likely class label for each image in the held-out
        validation set
        '''
        return self.model.predict_classes(self.X_test)

    def accuracy(self):
        '''
        output: fraction (float) of validation images whose predicted
        class matches the true label
        '''
        pred_classes = self.test_classifications()
        correct = sum(1 for a, b in zip(self.y_test, pred_classes) if a == b)
        return float(correct) / len(self.y_test)
if __name__ == '__main__':
    # Build and train the model; EmployModel downloads data, fits, and
    # saves weights as a side effect of construction.
    model = CNN.vgg_basic
    cnn = EmployModel(model, 'arr_X_50_full', lr = .0005, batch_size=999,
                      weights_filename = '50_full_basic_batchfit_3')
    # Single-argument print() call form is valid in both Python 2 and 3
    # (the original 'print x, y' statement is a SyntaxError under py3).
    print('accuracy: {}'.format(cnn.accuracy()))