-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathexperiments_mp.py
264 lines (233 loc) · 9.45 KB
/
experiments_mp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
# -*- coding: utf-8 -*-
"""
This module was created for testing initial benchmarks of the various
clustering approaches.
"""
import numpy as np
import os
from sklearn import metrics
from time import time
from dkmeans.data import get_dataset
from dkmeans.data import DEFAULT_DATASET, DEFAULT_THETA, DEFAULT_WINDOW
from dkmeans.data import DEFAULT_M, DEFAULT_N
from dkmeans.data import choose_best_centroids
from dkmeans.metrics import average_distance
import dkmeans.singleshot as dkss
import dkmeans.multishot as dkms
import dkmeans.kmeans_pooled as kp
from multiprocessing import Process, Queue
# Maps each method name to (entry-point callable, fixed kwargs for that
# optimizer). The distributed variants are currently disabled.
METHODS = {
    'pooled': (kp.main, {}),
    #'singleshot_lloyd': (dkss.main, {'optimization': 'lloyd'}),
    #'singleshot_gradient': (dkss.main, {'optimization': 'gradient'}),
    #'multishot_lloyd': (dkms.main, {'optimization': 'lloyd'}),
    #'multishot_gradient': (dkms.main, {'optimization': 'gradient'}),
}  # STORES the method mains and the kwarg for the corresponding opt
METHOD_NAMES = METHODS.keys()  # live view over METHODS' keys
# Scoring functions keyed by metric name; each takes (X, labels).
METRICS = {'silhouette': metrics.silhouette_score,
           #'avg_dist': average_distance,
           }
METRIC_NAMES = METRICS.keys()  # live view over METRICS' keys
DEFAULT_METHOD = "pooled"  # method used when the caller names none
DEFAULT_VERBOSE = True     # default verbosity flag
def evaluate_metric(X, labels, metric):
    """Score a clustering of X under the named metric.

    Args:
        X: iterable of arrays; each is flattened to one sample vector.
        labels: cluster assignment per sample.
        metric: key into the module-level METRICS table.

    Returns:
        The metric score, or -1 for a degenerate clustering where every
        sample landed in a single cluster (silhouette cannot score that).

    Raises:
        ValueError: re-raised when the metric fails for any reason other
        than the single-cluster case.
    """
    flat_X = [x.flatten() for x in X]
    try:
        return METRICS[metric](flat_X, np.array(labels))
    except ValueError:
        if len(set(labels)) == 1:
            print(labels)
            print("Bad Clustering - all labels assigned to one cluster")
            return -1
        # Previously fell through and returned None implicitly, hiding
        # real errors from the metric; surface them instead.
        raise
def run_method(X, k, method=DEFAULT_METHOD, subjects=None, **kwargs):
    """Run one clustering method by name and time it.

    Args:
        X: dataset passed straight through to the method's main().
        k: number of clusters.
        method: key into the module-level METHODS table.
        subjects: optional subject metadata, stored on the result.
        **kwargs: extra arguments for the method; if 'init_centroids' is
            a dict keyed by method name, the entry for this method is
            selected before the call.

    Returns:
        The method's result dict, augmented with 'rtime' (wall-clock
        seconds) and 'subjects'.
    """
    start = time()
    # A dict of init_centroids carries one entry per method; pick ours.
    if isinstance(kwargs.get('init_centroids'), dict):
        kwargs['init_centroids'] = kwargs['init_centroids'][method]
    func, method_kwargs = METHODS[method]
    res = func(X, k, **method_kwargs, **kwargs)
    res['rtime'] = time() - start
    res['subjects'] = subjects
    return res
def run_experiment(k, N, dataset=DEFAULT_DATASET, theta=DEFAULT_THETA,
                   dfnc_window=DEFAULT_WINDOW, m=DEFAULT_M, n=DEFAULT_N,
                   metrics=METRIC_NAMES,
                   methods=METHOD_NAMES, **kwargs):
    """Run one experiment over every requested method and metric.

    Args:
        k: number of clusters.
        N: number of samples to draw.
        dataset, theta, dfnc_window, m, n: data-generation parameters
            forwarded to get_dataset.
        metrics: metric names to evaluate (keys of METRICS).
        methods: method names to run (keys of METHODS).
        **kwargs: extra method parameters forwarded to run_method.

    Returns:
        (measures, res): measures maps result-name -> {metric: score};
        res maps method name -> the raw result dict from run_method.
    """
    # NOTE: the old dead `subjects = None` pre-assignment was removed;
    # subjects comes straight from the dataset loader.
    X, subjects = get_dataset(N, dataset=dataset, theta=theta,
                              dfnc_window=dfnc_window, m=m, n=n)
    res = {method: run_method(X, k, subjects=subjects, method=method, **kwargs)
           for method in methods}
    measures = {res[r]['name']: {metric: evaluate_metric(res[r]['X'],
                                                         res[r]['cluster_labels'],
                                                         metric)
                                 for metric in metrics} for r in res}
    return measures, res
def run_repeated_experiment_mp(R, k, N, metrics=METRIC_NAMES,
                               methods=METHOD_NAMES, **kwargs):
    """Run R independent experiments in parallel, one process each.

    Args:
        R: number of repetitions.
        k: number of clusters.
        N: number of samples per run.
        metrics: metric names to evaluate.
        methods: method names to run.
        **kwargs: forwarded to run_experiment via one_run_exp.

    Returns:
        (measures, results): measures[method][metric] is a list of R
        scores; results[method] is a list of R raw result dicts.
    """
    measures = {method: {metric: [] for metric in metrics}
                for method in methods}
    results = {method: [] for method in methods}
    processes = []
    queues = []
    for _ in range(R):
        q = Queue()
        p = Process(target=one_run_exp, args=(q, k, N, metrics, methods),
                    kwargs=kwargs)
        p.start()
        processes.append(p)
        queues.append(q)
    # Drain each queue BEFORE joining its process: a child blocks until
    # its queued payload is consumed, so join-first can deadlock.
    for p, q in zip(processes, queues):
        meas, res = q.get()
        p.join()
        for method in methods:
            results[method].append(res[method])
            for metric in metrics:
                measures[method][metric].append(meas[method][metric])
    return measures, results
def run_repeated_experiment(R, k, N, metrics=METRIC_NAMES,
                            methods=METHOD_NAMES, **kwargs):
    """Run R independent experiments serially, printing per-run bests.

    Args:
        R: number of repetitions.
        k: number of clusters.
        N: number of samples per run.
        metrics: metric names to evaluate.
        methods: method names to run.
        **kwargs: forwarded to run_experiment via one_run_exp.

    Returns:
        (measures, results): measures[method][metric] is a list of R
        scores; results[method] is a list of R raw result dicts.
    """
    measures = {method: {metric: [] for metric in metrics}
                for method in methods}
    results = {method: [] for method in methods}
    for _ in range(R):
        meas, res = one_run_exp(None, k, N, metrics, methods, **kwargs)
        print("Printing Results k=%d, R=%d, N=%d in process %s" %
              (k, R, N, os.getpid()))
        for method in meas:
            for measure in meas[method]:
                # Report the best score seen for this method/measure.
                print('\t%s, %s: %s' % (method, measure,
                                        np.max(meas[method][measure])))
        for method in methods:
            results[method].append(res[method])
            for metric in metrics:
                measures[method][metric].append(meas[method][metric])
    return measures, results
def one_run_exp(q, k, N, metrics, methods, **kwargs):
    """Run a single experiment, reseeding numpy per-process.

    Designed as a multiprocessing target: each worker seeds numpy's RNG
    with its own PID so parallel runs don't share a random stream.

    Args:
        q: a multiprocessing.Queue to receive [meas, res], or None when
            called synchronously.
        k, N, metrics, methods, **kwargs: forwarded to run_experiment.

    Returns:
        (meas, res) from run_experiment (also put on q when provided).
    """
    # int(str(pid)) was a redundant round-trip; the pid is already an int.
    np.random.seed(seed=os.getpid())
    meas, res = run_experiment(k, N, metrics=metrics, methods=methods,
                               **kwargs)
    if q is not None:
        q.put([meas, res])
    return meas, res
def dfnc_pipe(k, N, R, s=2, rep=100):
    """Two-stage dFNC clustering pipeline.

    Stage 1 clusters the exemplar dataset `rep` times in parallel and
    saves the measures/results per method under results/. For each
    method, the best stage-1 centroids are then chosen and used to
    initialise a single stage-2 run on the full real fMRI data, whose
    results are saved and summarised.

    Args:
        k: number of clusters.
        N: number of samples per run.
        R: second-stage repetition count (used in log messages here).
        s: number of sites; forwarded to the experiment runs.
        rep: number of stage-1 (exemplar) repetitions.
    """
    print("Running dFNC exemplars k=%d, R=%d, N=%d, s=%d in process %s" %
          (k, rep, N, s, os.getpid()))
    # Stage 1: repeated exemplar clustering, in parallel.
    meas, res = run_repeated_experiment_mp(rep, k, N,
                                           dataset='real_fmri_exemplar',
                                           s=s,
                                           verbose=True)
    for method in meas:
        print("saving exemplars")
        np.save('results/exemplar_N%d_s%d_k%d_%s_meas.npy' %
                (N, s, k, method), meas)
        np.save('results/exemplar_N%d_s%d_k%d_%s_res.npy' %
                (N, s, k, method), res)
    print("Printing Exemplar Results k=%d, R=%d, N=%d in process %s" %
          (k, rep, N, os.getpid()))
    for method in meas:
        for measure in meas[method]:
            # Best score observed across the rep exemplar runs.
            print('\t%s, %s: %s' % (method, measure,
                                    np.max(meas[method][measure])))
    # Stage 2: per method, pick the best exemplar centroids and run once
    # on the full dataset, initialised from them.
    # NOTE(review): loop nesting reconstructed from a flattened source —
    # confirm against the original file's indentation.
    for method in meas:
        exemplars = choose_best_centroids('results/exemplar_N%d_s%d_k%d_%s_res.npy'
                                          % (N, s, k, method),
                                          'results/exemplar_N%d_s%d_k%d_%s_meas.npy'
                                          % (N, s, k, method),
                                          list(meas.keys()))
        print("Running dFNC second stage k=%d, R=%d, N=%d in process %s" %
              (k, R, N, os.getpid()))
        measR, resR = run_repeated_experiment_mp(1, k, N,
                                                 dataset='real_fmri',
                                                 verbose=True,
                                                 s=s,
                                                 init_centroids=exemplars)
        print("Saving measure results")
        np.save('results/fbirn_N%d_s%d_k%d_%s_meas.npy' %
                (N, s, k, method), measR)
        print("Saving results")
        np.save('results/fbirn_N%d_s%d_k%d_%s_res.npy' %
                (N, s, k, method), resR)
        print("Printing Second Stage Results k=%d, R=%d, N=%d in process %s" %
              (k, R, N, os.getpid()))
        for method in measR:
            for measure in measR[method]:
                print('\t%s, %s: %s' % (method, measure,
                                        np.max(measR[method][measure])))
def main():
    """Entry point: run the experiment suite (currently only the dFNC
    two-stage pipeline; earlier gaussian/known-k and increasing-k
    benchmark suites are disabled)."""
    repetition_count = 5   # repetitions per experiment
    sample_count = 314     # number of samples
    # Planned but unimplemented follow-ups:
    #   - increasing N with fixed k (iterations, runtime, scores)
    #   - increasing subject count on simulated / real fMRI data
    print("Running K experiment dFNC pipeline")
    site_count = 2
    # Single pipeline run with k=5; the per-k fan-out over worker
    # processes remains disabled.
    dfnc_pipe(5, sample_count, repetition_count, site_count)
if __name__ == '__main__':
    # TODO Arg-Parsing: expose k, N, R, s as command-line options.
    main()