
Commit 6c1b0ad: Add files via upload
Parent: fff4ea7

File tree: 2 files changed, +254 -0 lines


Synthetic data/Synthetic_CNN.py

+171
@@ -0,0 +1,171 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 26 11:54:05 2018

@author: obazgir
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import keras
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import pickle
import Toolbox
from Toolbox import NRMSE, Random_Image_Gen, two_d_norm, two_d_eq, Assign_features_to_pixels, MDS_Im_Gen, Bias_Calc, REFINED_Im_Gen
from scipy.stats import pearsonr
import os

## Simulating the data
P = [800]           # Number of features
Results_Dic = {}
for p in P:

    # Generate a highly correlated covariance matrix
    COV_X = 0.5*np.random.random((p,p))
    COV_X = np.maximum(COV_X, COV_X.transpose())    # Symmetrize

    # Boost the correlation of each feature with its neighboring features
    for i in range(p):
        if i - int(p/20) < 0:
            COV_X[i, 0:i+int(p/20)] = 0.2*np.random.random(i+int(p/20)) + 0.5
        elif i + int(p/20) > p:
            COV_X[i, i-int(p/20):] = 0.2*np.random.random(abs(p-i+int(p/20))) + 0.5
        #else:
        #    COV_X[i, i-int(p/20):i+int(p/20)] = 0.2*np.random.random(int(p/10)) + 0.5
    COV_X = np.maximum(COV_X, COV_X.transpose())
    np.fill_diagonal(COV_X, 1)
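    # (Added sanity check, not in the original upload) This construction does
    # not guarantee a positive semi-definite matrix, so
    # np.random.multivariate_normal may warn and sample from a nearby valid
    # covariance. A small ridge restores PSD:
    min_eig = np.linalg.eigvalsh(COV_X).min()
    if min_eig < 0:
        COV_X += (1e-6 - min_eig)*np.eye(p)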
    Columns_PD = p*[None]; index_PD = p*[None]

    for i in range(p):
        Columns_PD[i] = "F" + str(i)
        index_PD[i] = "F" + str(i)

    Mu = np.repeat(0.3, p)    # Mean vector; missing in the original upload, taken from Synthetic_Data.py
    NN = int(math.sqrt(p)) + 1

    Samples = [10000]         # Sample size, which can vary as described in the REFINED manuscript

    ## Synthetic data
    for n in Samples:
        N = round(n)          # Number of samples
        X = np.random.multivariate_normal(Mu, COV_X, size=N)
        SPR_Ratio = [0.2, 0.5, 0.8]    # Spurious feature ratio
        for spr in SPR_Ratio:
            sz = round(spr*p)
            B1 = 3*np.random.random((sz,)) + 6; B2 = np.zeros(p - sz); B = np.concatenate((B1, B2))    # Weights: sz nonzero, the rest zero
            Y = np.matmul(X, B)
            Y = (Y - Y.min())/(Y.max() - Y.min())    # Target values scaled to [0, 1]

            CNN_Dic = {}
            # Reading the REFINED coordinates
            with open('theMapping_Synth'+str(p)+'.pickle','rb') as file:
                gene_names, coords, map_in_int = pickle.load(file)

            Results_CNN = np.zeros((5,3))
            i = 0
            # Using 5-fold cross-validation for performance measurement
            kf = KFold(n_splits=5)
            for train_index, test_index in kf.split(X):
                X_Train, X_Test = X[train_index], X[test_index]
                Y_Train, Y_Test = Y[train_index], Y[test_index]
                Y_Test = Y_Test.reshape(len(Y_Test),1)

                ################################################
                from keras.layers.core import Activation, Flatten
                from keras.layers.convolutional import Conv2D
                from keras.layers.convolutional import MaxPooling2D
                from keras.optimizers import RMSprop, Adam, Adadelta, SGD, Nadam
                from keras.layers.normalization import BatchNormalization

                Y_Train_CNN = Y[train_index]
                Y_Test_CNN = Y[test_index]; Y_Test_CNN = Y_Test_CNN.reshape(len(Y_Test_CNN),1)

                nn = math.ceil(np.sqrt(p))    # Image dimension
                Nn = p

                X_REFINED_Train = REFINED_Im_Gen(X_Train, nn, map_in_int, gene_names, coords)
                X_REFINED_Test  = REFINED_Im_Gen(X_Test,  nn, map_in_int, gene_names, coords)

                Width = nn
                Height = nn

                X_Training = X_REFINED_Train.reshape(-1, Width, Height, 1)
                X_Testing  = X_REFINED_Test.reshape(-1, Width, Height, 1)
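                # (Added sketch, not in the original upload) matplotlib is
                # imported but unused; to eyeball one REFINED image, uncomment:
                # plt.imshow(X_Training[0, :, :, 0]); plt.colorbar(); plt.show()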
                # Defining the CNN model
                def CNN_model():
                    nb_filters = 8
                    nb_conv = 3

                    model = Sequential()
                    model.add(Conv2D(nb_filters*1, (nb_conv, nb_conv), padding='valid', input_shape=(Width, Height, 1)))
                    model.add(BatchNormalization())
                    model.add(Activation('relu'))
                    #model.add(MaxPooling2D(pool_size=(2, 2)))

                    model.add(Conv2D(nb_filters*3, (nb_conv, nb_conv)))
                    model.add(BatchNormalization())
                    model.add(Activation('relu'))
                    #model.add(MaxPooling2D(pool_size=(2, 2)))

                    model.add(Flatten())

                    model.add(Dense(256))
                    model.add(BatchNormalization())
                    model.add(Activation('relu'))

                    model.add(Dense(128))
                    model.add(BatchNormalization())
                    model.add(Activation('relu'))
                    model.add(Dropout(0.3))    # originally written as 1 - 0.7

                    model.add(Dense(1))

                    opt = Adam(lr=0.0001)
                    model.compile(loss='mse', optimizer=opt)
                    return model

                # Training the CNN model
                model = CNN_model()
                model.fit(X_Training, Y_Train_CNN, batch_size=100, epochs=50, verbose=0)    # callbacks = callbacks_list
                Y_Pred_CNN = model.predict(X_Testing, batch_size=100, verbose=0)

                # Collecting per-fold results
                NRMSE_CNN, MSE_CNN = NRMSE(Y_Test_CNN, Y_Pred_CNN)
                print(NRMSE_CNN, "CNN NRMSE")
                # pearsonr expects 1-D arrays, so flatten the (n, 1) columns
                PearsonCorr_CNN, p_value = pearsonr(Y_Test_CNN.ravel(), Y_Pred_CNN.ravel())
                Results_CNN[i,0] = NRMSE_CNN; Results_CNN[i,1] = MSE_CNN; Results_CNN[i,2] = PearsonCorr_CNN
                i = i + 1

            # Average the metrics over the five folds
            NRMSE_CNN = np.mean(Results_CNN[:,0]); MSE_CNN = np.mean(Results_CNN[:,1]); Corr_CNN = np.mean(Results_CNN[:,2])

            Results_Sample = np.zeros((1,3))
            Results_Sample[0,:] = np.array([NRMSE_CNN, MSE_CNN, Corr_CNN])
            Results = pd.DataFrame(data=Results_Sample, index=["CNN"], columns=["NRMSE","MSE","Corr"])
            Results_Dic[spr, n, p] = Results

with open('Results_Dic'+str(p)+'_5.csv', 'w') as f:
    for key, value in Results_Dic.items():
        f.write('{0},{1}\n'.format(key, value))
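
Note: Toolbox.py is not part of this commit, but the call above shows that Toolbox.NRMSE must return the pair (NRMSE, MSE). A plausible sketch, assuming the usual normalization of the error norm by the target norm (hypothetical, not the author's verified implementation):

import numpy as np

def NRMSE_sketch(y_true, y_pred):
    """Return (NRMSE, MSE) for 1-D or column-vector inputs (hypothetical stand-in for Toolbox.NRMSE)."""
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    mse = np.mean((y_true - y_pred)**2)
    nrmse = np.sqrt(np.sum((y_true - y_pred)**2) / np.sum(y_true**2))
    return nrmse, mse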

Synthetic data/Synthetic_Data.py

+83
@@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 26 11:54:05 2018

@author: obazgir
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import keras
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from scipy.stats import pearsonr
import os
os.chdir('C:\\Users\\obazgir\\Desktop\\CMDS_IMAGES_NEW\\Dr.Ghosh')
from MDS_RR import MDS_RR

## Simulating the data

P = [20,50,100,200,400,800,1000,2000,4000]    # Number of features
Results_Dic = {}
for p in P:

    # Generate a highly correlated covariance matrix
    COV_X = 0.5*np.random.random((p,p))
    COV_X = np.maximum(COV_X, COV_X.transpose())    # Symmetrize
    #COV_X = 0.7*np.ones((p,p))

    # Boost the correlation of each feature with its neighboring features
    for i in range(p):
        if i - int(p/20) < 0:
            COV_X[i, 0:i+int(p/20)] = 0.2*np.random.random(i+int(p/20)) + 0.5
        elif i + int(p/20) > p:
            COV_X[i, i-int(p/20):] = 0.2*np.random.random(abs(p-i+int(p/20))) + 0.5
        #else:
        #    COV_X[i, i-int(p/20):i+int(p/20)] = 0.2*np.random.random(int(p/10)) + 0.5
    COV_X = np.maximum(COV_X, COV_X.transpose())
    np.fill_diagonal(COV_X, 1)
    COV_X = np.ones((p,p)) - COV_X    # Turn the similarity matrix into a dissimilarity matrix for MDS
    Columns_PD = p*[None]; index_PD = p*[None]

    for i in range(p):
        Columns_PD[i] = "F" + str(i)
        index_PD[i] = "F" + str(i)

    COV_X_PD = pd.DataFrame(data=COV_X, index=index_PD, columns=Columns_PD)
    Mu = np.repeat(0.3, p)

    #%% Init MDS
    import Toolbox
    from Toolbox import two_d_eq, Assign_features_to_pixels, Random_Image_Gen, REFINED_Im_Gen
    from sklearn.manifold import MDS
    from sklearn.metrics.pairwise import euclidean_distances
    import pickle

    #%% MDS
    nn = math.ceil(np.sqrt(p))    # Image dimension
    Nn = p                        # Number of features
    Euc_Dist = COV_X              # Symmetric dissimilarity matrix fed to MDS

    # Reduce the dimensionality by MDS into 2 components; the matrix is a
    # precomputed dissimilarity, so MDS should not recompute Euclidean distances
    embedding = MDS(n_components=2, dissimilarity='precomputed')
    mds_xy = embedding.fit_transform(COV_X)    # Apply MDS
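    # (Added sketch, not in the original upload) matplotlib is imported but
    # unused; to eyeball the 2-D feature embedding, uncomment:
    # plt.scatter(mds_xy[:,0], mds_xy[:,1], s=4); plt.show()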

    print(">>>> MDS dimensionality reduction is done")

    eq_xy = two_d_eq(mds_xy, Nn)
    Img = Assign_features_to_pixels(eq_xy, nn, verbose=1)    # Img holds the non-overlapping pixel coordinates derived from MDS

    Desc = Columns_PD    # Feature (descriptor) names
    Dist = pd.DataFrame(data=Euc_Dist, columns=Desc, index=Desc)    # Distance matrix holding the pairwise distance between every two descriptors
    data = (Desc, Dist, Img)    # Packing the hill-climbing inputs

    # The hill-climbing stage reads a pickle, so everything is saved as one pickle
    with open("Init_Synth"+str(p)+".pickle", 'wb') as f:
        pickle.dump(data, f)
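
Note: the hill-climbing script itself is not in this commit. Synthetic_Data.py writes Init_Synth<p>.pickle as its input, and Synthetic_CNN.py reads theMapping_Synth<p>.pickle as its output, so the two pickles define the interface between the stages. A minimal sketch of both ends, assuming p = 800 (file names follow the patterns in the scripts above):

import pickle

# Produced by Synthetic_Data.py, consumed by the hill-climbing script
with open("Init_Synth800.pickle", "rb") as f:
    Desc, Dist, Img = pickle.load(f)      # feature names, distance matrix, initial pixel layout

# Produced by the hill-climbing script, consumed by Synthetic_CNN.py
with open("theMapping_Synth800.pickle", "rb") as f:
    gene_names, coords, map_in_int = pickle.load(f)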
