train.py
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, classification_report)  # evaluation metrics
from mklaren.kernel.kinterface import Kinterface
from mklaren.kernel.kernel import linear_kernel, poly_kernel, matern_kernel, rbf_kernel
from bayes_opt import BayesianOptimization
import pandas as pd
import numpy as np
# reading data once at module level; the kernel interfaces below are built on it
X = pd.read_csv('data/train_X.csv', low_memory=False)
y = pd.read_csv('data/train_y.csv', low_memory=False)
columns = np.array(X.columns)
rows = np.array(X.index)
# defining the base kernels
K_exp = Kinterface(data=X, kernel=rbf_kernel, kernel_args={"sigma": 0.0003})  # RBF kernel
K_poly = Kinterface(data=X, kernel=poly_kernel, kernel_args={"b": 3})         # polynomial kernel with degree b=3
K_lin = Kinterface(data=X, kernel=linear_kernel)                              # linear kernel
K_mat = Kinterface(data=X, kernel=matern_kernel)                              # Matern kernel
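# The four base kernels above are combined below as a weighted sum
#   alph * K_exp + beta * K_lin + epsilon * K_poly + psi * K_mat,
# whose weights are tuned by Bayesian optimization in train(). SVC accepts a
# callable kernel that returns the Gram matrix between its two arguments, and
# each Kinterface call does exactly that, so the weighted sum is itself a
# valid kernel callable.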
# reading routine
def read_data(path):
    # NOTE: the data is currently fetched from the remote breast-cancer dataset
    # rather than from `path`; the commented lines show the local alternative.
    X = pd.read_table('http://members.cbio.mines-paristech.fr/~jvert/svn/tutorials/data/breastcancerwang/xtrain.txt',
                      names=columns, index_col=0, low_memory=False)
    y = pd.read_table('http://members.cbio.mines-paristech.fr/~jvert/svn/tutorials/data/breastcancerwang/ytrain.txt',
                      names=['label'], index_col=False, low_memory=False)
    # X = pd.read_csv(path + 'train_X.csv')
    # y = pd.read_csv(path + 'train_y.csv')
    # return plain numpy arrays so the integer fold indices below work directly
    return X.values, y['label'].values
# K-fold training helper function
def KFold_train(X, Y_train, kf, clf, metrics, print_report=False):
    """Fit clf on each fold and return the mean of each metric over the folds."""
    kf.get_n_splits(X)
    n, d = kf.n_splits, len(metrics)
    score = np.zeros((n, d))
    i = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y_train[train_index], Y_train[test_index]
        clf.fit(X_train, y_train)
        pred_y = clf.predict(X_test)
        for metric, j in zip(metrics, range(d)):
            score[i, j] = metric(y_test, pred_y)
        if print_report:
            print(classification_report(y_test, pred_y))
            print(score[i, :])
        i += 1
    return np.mean(score, axis=0)
def KFold_train_score(X, Y_train, kf, clf, metrics, print_report=False):
    """Like KFold_train, but scores on decision-function values (for e.g. ROC AUC)."""
    kf.get_n_splits(X)
    n, d = kf.n_splits, len(metrics)
    score = np.zeros((n, d))
    i = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y_train[train_index], Y_train[test_index]
        clf.fit(X_train, y_train)
        pred_y = clf.decision_function(X_test)  # continuous scores, not labels
        for metric, j in zip(metrics, range(d)):
            score[i, j] = metric(y_test, pred_y)
        if print_report:
            print(score[i, :])
        i += 1
    return np.mean(score, axis=0)
# Robust K-fold CV: repeat the whole cross-validation n_rand times
def robust_KCV(n_rand, X, Y_train, kf, clf, metrics, print_report=False):
    d = len(metrics)
    robust_score = np.zeros((n_rand, d))
    for n in range(n_rand):
        robust_score[n, :] = KFold_train(X, Y_train, kf, clf, metrics, print_report=print_report)
    return np.mean(robust_score, axis=0), np.std(robust_score, axis=0)

def robust_KCV_score(n_rand, X, Y_train, kf, clf, metrics, print_report=False):
    d = len(metrics)
    robust_score = np.zeros((n_rand, d))
    for n in range(n_rand):
        robust_score[n, :] = KFold_train_score(X, Y_train, kf, clf, metrics, print_report=print_report)
    return np.mean(robust_score, axis=0), np.std(robust_score, axis=0)
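# Because kf is created with shuffle=True and no fixed random_state, each call
# to kf.split() draws a fresh permutation, so the n_rand repetitions above
# average over different fold assignments rather than re-scoring one split.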
# training routine
def train(X, y, kf, alph_bound=(0, 5), beta_bound=(0, 5), epsilon_bound=(0, 5), psi_bound=(0, 5)):
    # defining the black-box function for Bayesian optimization
    def black_box_function(alph, beta, epsilon, psi):
        """Cross-validated accuracy of an SVM with the given kernel weights.

        Bayesian optimization treats this as a black box to be maximized.
        """
        combined_kernel = lambda x, z: \
            alph * K_exp(x, z) + beta * K_lin(x, z) + epsilon * K_poly(x, z) + psi * K_mat(x, z)
        svm_clf = SVC(kernel=combined_kernel)
        m = KFold_train(X, y, kf, svm_clf, [accuracy_score, precision_score, recall_score])
        return m[0]  # maximize mean accuracy
    # bounded region of parameter space
    pbounds = {'alph': alph_bound, 'beta': beta_bound, 'epsilon': epsilon_bound, 'psi': psi_bound}
    # BO optimizer
    optimizer = BayesianOptimization(
        f=black_box_function,
        pbounds=pbounds,
        random_state=1,
    )
    # perform the optimization
    optimizer.maximize(
        init_points=4,
        n_iter=50,
    )
    # print the best kernel weights and rebuild the combined kernel with them
    print(optimizer.max['params'])
    best = optimizer.max['params']
    combined_kernel = lambda x, z: \
        best['alph'] * K_exp(x, z) + best['beta'] * K_lin(x, z) + \
        best['epsilon'] * K_poly(x, z) + best['psi'] * K_mat(x, z)
    svm_clf = SVC(kernel=combined_kernel)
    return svm_clf
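# Note: train() returns an *unfitted* SVC configured with the best kernel
# weights; print_result and save_result below fit it fold by fold inside
# their own cross-validation runs.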
def print_result(svm_clf, X, y, kf):
    print('MKLBO with 4 kernels\n')
    print('Accuracy score  Precision score  Recall score  F1 score\n')
    print(robust_KCV(5, X, y, kf, svm_clf, [accuracy_score, precision_score, recall_score, f1_score]))
    print('AUC ROC\n')
    print(robust_KCV_score(5, X, y, kf, svm_clf, [roc_auc_score]))
# writing the result to a file
def save_result(path, svm_clf, X, y, kf):
    with open(path + '/result.txt', 'w') as f:
        f.write('MKLBO with 4 kernels\n')
        f.write('Accuracy score  Precision score  Recall score  F1 score\n')
        f.write(str(robust_KCV(5, X, y, kf, svm_clf, [accuracy_score, precision_score, recall_score, f1_score])) + '\n')
        f.write('AUC ROC\n')
        f.write(str(robust_KCV_score(5, X, y, kf, svm_clf, [roc_auc_score])) + '\n')
def main():
    # reading data
    print('Reading data ----------------- \n')
    X, y = read_data('data')
    kf = KFold(n_splits=3, shuffle=True)
    print(kf)
    print('Training --------------------- \n')
    svm_clf = train(X, y, kf)
    print('\n Result -------------------- \n')
    print_result(svm_clf, X, y, kf)
    print('Saving result ---------------- \n')
    save_result('result', svm_clf, X, y, kf)

if __name__ == '__main__':
    main()