-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStep5-LDAK-GWAS.py
293 lines (225 loc) · 10.6 KB
/
Step5-LDAK-GWAS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
import os
import pandas as pd
import numpy as np
import sys
filedirec = sys.argv[1]
gwas_file = sys.argv[2]
if "." in gwas_file:
gwasfilewithoutextension = gwas_file.split(".")[0]
save_path = filedirec + os.sep + gwasfilewithoutextension
foldnumber = sys.argv[3]
gwas_path = save_path + os.sep + filedirec+".gz"
def check_phenotype_is_binary_or_continous(filedirec):
# Read the processed quality controlled file for a phenotype
df = pd.read_csv(save_path+os.sep+filedirec+'_QC.fam',sep="\s+",header=None)
column_values = df[5].unique()
if len(set(column_values)) == 2:
return "Binary"
else:
return "Continous"
print("Phenotype",sys.argv[1])
print("Gwas File",sys.argv[2])
print("Fold Number",sys.argv[3])
print("Save Path",save_path)
print("GWAS Path",gwas_path)
if ".gz"in gwas_file:
df = pd.read_csv(gwas_path,compression= "gzip",sep="\s+")
else:
df = pd.read_csv(gwas_path,sep="\s+")
if "BETA" in df.columns.to_list():
# For Continous Phenotype.
df = df[['CHR', 'BP', 'SNP', 'A1', 'A2', 'N', 'SE', 'P', 'BETA', 'INFO', 'MAF']]
else:
df["BETA"] = np.log(df["OR"])
df["SE"] = df["SE"]/df["OR"]
df = df[['CHR', 'BP', 'SNP', 'A1', 'A2', 'N', 'SE', 'P', 'BETA', 'INFO', 'MAF']]
print(df.head().to_markdown())
df_transformed = pd.DataFrame({
'Predictor': df['CHR'].astype(str) + ":" + df['BP'].astype(str),
'A1': df['A1'],
'A2': df['A2'],
'n': df['N'],
'Z': df['BETA']/df['SE'],
'SNP':df['SNP']
})
df.to_csv(save_path + os.sep +filedirec+".txt",sep="\t",index=False)
# Remove SNPs where the number of alleles are more than 1
df_transformed = df_transformed[df_transformed['A1'].apply(len) == 1]
df_transformed = df_transformed[df_transformed['A2'].apply(len) == 1]
df_transformed = df_transformed.drop_duplicates(subset=['Predictor'], keep='first')
# Optionally, reset index
df_transformed.reset_index(drop=True, inplace=True)
df_transformed.to_csv(save_path + os.sep +filedirec+".ldak",sep="\t",index=False)
from operator import index
import pandas as pd
import numpy as np
import os
import subprocess
import sys
import pandas as pd
import statsmodels.api as sm
import pandas as pd
from sklearn.metrics import roc_auc_score, confusion_matrix
from statsmodels.stats.contingency_tables import mcnemar
def create_directory(directory):
"""Function to create a directory if it doesn't exist."""
if not os.path.exists(directory): # Checking if the directory doesn't exist
os.makedirs(directory) # Creating the directory if it doesn't exist
return directory # Returning the created or existing directory
#foldnumber = "0" # Setting 'foldnumber' to "0"
folddirec = save_path + os.sep + "Fold_" + foldnumber # Creating a directory path for the specific fold
trainfilename = "train_data" # Setting the name of the training data file
newtrainfilename = "train_data.QC" # Setting the name of the new training data file
validationfilename = "validation_data"
newvalidationfilename = "validation_data.QC"
testfilename = "test_data" # Setting the name of the test data file
newtestfilename = "test_data.QC" # Setting the name of the new test data file
# Number of PCA to be included as a covariate.
numberofpca = ["6"] # Setting the number of PCA components to be included
# Clumping parameters.
clump_p1 = [1] # List containing clump parameter 'p1'
clump_r2 = [0.1] # List containing clump parameter 'r2'
clump_kb = [200] # List containing clump parameter 'kb'
# Pruning parameters.
p_window_size = [200] # List containing pruning parameter 'window_size'
p_slide_size = [50] # List containing pruning parameter 'slide_size'
p_LD_threshold = [0.25] # List containing pruning parameter 'LD_threshold'
pvaluefile = folddirec + os.sep + 'range_list'
# Initializing an empty DataFrame with specified column names
prs_result = pd.DataFrame(columns=["clump_p1", "clump_r2", "clump_kb", "p_window_size", "p_slide_size", "p_LD_threshold",
"pvalue", "numberofpca","numberofvariants","Train_pure_prs", "Train_null_model", "Train_best_model",
"Test_pure_prs", "Test_null_model", "Test_best_model"])
import os
import subprocess
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import explained_variance_score
# Define a global variable to store results
prs_result = pd.DataFrame()
def transform_ldak_data(traindirec, newtrainfilename,numberofpca,ldakmodel,power, p1_val, p2_val, p3_val, c1_val, c2_val, c3_val,Name,pvaluefile):
os.system("awk "+"\'"+"{print $3,$8}"+"\'"+" ./"+save_path+os.sep+filedirec+".txt > ./"+traindirec+os.sep+"SNP.pvalue")
files_to_remove = [
traindirec + os.sep + ldakmodel +gwas_file+"_ldak_gwas_final",
traindirec+os.sep+ldakmodel+gwas_file+".effects",
]
# Loop through the files and remove them if they exist
for file_path in files_to_remove:
if os.path.exists(file_path):
os.remove(file_path)
print(f"Removed: {file_path}")
else:
print(f"File does not exist: {file_path}")
command1 = [
'./ldak',
'--cut-genes', 'highld',
'--bfile', traindirec+os.sep+newtrainfilename+".clumped.pruned",
'--genefile', 'highld.txt'
]
subprocess.run(command1)
command2 = [
'./ldak',
'--calc-cors',traindirec+os.sep+'cors',
'--bfile', traindirec+os.sep+newtrainfilename+".clumped.pruned"
]
subprocess.run(command2)
print(" ".join(command2))
# Here we need to update the cors.bim file and it should be the same as the GWAS file.
# LDAK accepts gwas in a specific format and the LDAK
df = pd.read_csv(save_path + os.sep +filedirec+".ldak",sep="\s+")
t1 = pd.read_csv(traindirec+os.sep+'cors.cors.bim',sep="\s+",header=None)
t1[1] = t1[0].astype(str)+":"+t1[3].astype(str)
t1.to_csv(traindirec+os.sep+'cors.cors.bim',sep="\t",header=False,index=None)
command3 = [
'./ldak',
'--mega-prs', traindirec+os.sep+ldakmodel+filedirec,
'--model', ldakmodel,
'--summary', save_path + os.sep +filedirec+".ldak",
'--power', str(power),
'--skip-cv','YES',
'--cors', traindirec+os.sep+'cors',
#'--check-high-LD', 'NO',
#'--high-LD', 'highld/genes.predictors.used',
'--allow-ambiguous', 'YES',
]
subprocess.run(command3)
#exit(0)
# Read the original gwas
df = pd.read_csv(save_path + os.sep +filedirec+".ldak",sep="\s+")
# Read the effect size for each SNP generated by LDAK.
df2 = pd.read_csv(traindirec+os.sep+ldakmodel+filedirec+".effects",sep="\s+")
# Get the SNP information as it is required by Plink, because the orginal data have SNPs like rsXXX and
# Effects generated by LDAK have SNPs in 2:16937 CHR:POSITION format.
df = df[df["Predictor"].isin(df2["Predictor"].values)]
df2["SNP"] = df["SNP"].values
df2.to_csv(traindirec+os.sep+ldakmodel+".effects",index=False,sep="\t")
numberofcolumns1 = pd.read_csv(traindirec+os.sep+ldakmodel+filedirec+".effects",sep="\s+").shape[1]
print(numberofcolumns1)
numberofcolumns = numberofcolumns1 - 4
# Read the effect size.
# It contains the effect sizes from multiple model.
temp = pd.read_csv(traindirec + os.sep + ldakmodel + ".effects", sep="\s+")
# Loop through effect sizes, modify values for binary phenotypes, and save specific columns
for loop in range(4, numberofcolumns1 - 1):
# If phenotype is binary, apply the exponential transformation
if check_phenotype_is_binary_or_continous(filedirec) == "Binary":
temp.iloc[:, loop] = np.exp(temp.iloc[:, loop])
else:
pass
# Save last, second, and current loop column in specified order
ordered_columns = [temp.columns[-1], temp.columns[1], temp.columns[loop]]
temp[ordered_columns].to_csv(
traindirec + os.sep + ldakmodel +filedirec+"_ldak_gwas_final",
sep="\t",
index=False
)
print(temp.head())
print(temp[ordered_columns].head())
command = [
"./plink",
"--bfile", traindirec+os.sep+newtrainfilename,
### SNP column = 3, Effect allele column 1 = 4, OR column=9
"--score", traindirec + os.sep + ldakmodel +filedirec+"_ldak_gwas_final", "1", "2", "3", "header",
"--q-score-range", traindirec+os.sep+"range_list",traindirec+os.sep+"SNP.pvalue",
"--extract", traindirec+os.sep+trainfilename+".valid.snp",
"--out", Name+os.sep+trainfilename
]
#exit(0)
subprocess.run(command)
# Calculate the PRS for the test data using the same set of SNPs and also calculate the PCA.
command = [
"./plink",
"--bfile", folddirec+os.sep+testfilename,
### SNP column = 3, Effect allele column 1 = 4, OR column=9
"--score", traindirec + os.sep + ldakmodel +filedirec+"_ldak_gwas_final", "1", "2", "3", "header",
"--q-score-range", traindirec+os.sep+"range_list",traindirec+os.sep+"SNP.pvalue",
"--extract", traindirec+os.sep+trainfilename+".valid.snp",
"--out", Name+os.sep+testfilename
]
subprocess.run(command)
command = [
"./plink",
"--bfile", folddirec+os.sep+validationfilename,
### SNP column = 3, Effect allele column 1 = 4, OR column=9
"--score", traindirec + os.sep + ldakmodel +filedirec+"_ldak_gwas_final", "1", "2", "3", "header",
"--q-score-range", traindirec+os.sep+"range_list",traindirec+os.sep+"SNP.pvalue",
"--extract", traindirec+os.sep+trainfilename+".valid.snp",
"--out", Name+os.sep+validationfilename
]
subprocess.run(command)
powers = [-0.25]
ldakmodels =["lasso","lasso-sparse","ridge","bolt","bayesr","elastic"]
#ldakmodels =["lasso-sparse","ridge","bolt","bayesr","elastic"]
ldakmodels =["lasso" ]
result_directory = "LDAK-GWAS"
create_directory(folddirec+os.sep+result_directory)
result_directory = folddirec+os.sep+result_directory
for p1_val in p_window_size:
for p2_val in p_slide_size:
for p3_val in p_LD_threshold:
for c1_val in clump_p1:
for c2_val in clump_r2:
for c3_val in clump_kb:
for p in numberofpca:
for ldakmodel in ldakmodels:
for power in powers:
transform_ldak_data(folddirec, newtrainfilename, p,ldakmodel,power,str(p1_val), str(p2_val), str(p3_val), str(c1_val), str(c2_val), str(c3_val),result_directory, pvaluefile)