@@ -48,7 +48,7 @@ def run_intersect():
48
48
# Process & sort target variants
49
49
# ToDo: check if it works for bim format files?
50
50
with xopen ('target_variants.txt' , 'wt' ) as outf :
51
- outf .write ('CHR:POS:A0:A1\t ID_TARGET\t REF_TARGET\t IS_MA_TARGET\t ALT_FREQ \t F_MISS_DOSAGE\n ' )
51
+ outf .write ('CHR:POS:A0:A1\t ID_TARGET\t REF_TARGET\t IS_MA_TARGET\t MAF \t F_MISS_DOSAGE\n ' )
52
52
target_heap = []
53
53
for path in args .target :
54
54
logger .info ("Reading & sorting TARGET variants: {}" .format (path ))
@@ -64,7 +64,7 @@ def run_intersect():
64
64
# if v['ID'] != freq['ID'] != miss['ID']:
65
65
# print(v)
66
66
ALTs = v ['ALT' ].split (',' )
67
- ALT_FREQS = freq ['ALT_FREQS' ].split (',' )
67
+ ALT_FREQS = [ float ( x ) for x in freq ['ALT_FREQS' ].split (',' )]
68
68
F_MISS_DOSAGE = miss ['F_MISS_DOSAGE' ]
69
69
IS_MA_TARGET = len (ALTs ) > 1
70
70
for i , ALT in enumerate (ALTs ):
@@ -74,7 +74,8 @@ def run_intersect():
74
74
key = '{}:{}:{}:{}' .format (v ['#CHROM' ], v ['POS' ], ALT , v ['REF' ])
75
75
# outf.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(key, v['ID'], v['REF'], str(IS_MA_TARGET), ALT_FREQS[i],
76
76
# F_MISS_DOSAGE))
77
- heapq .heappush (target_heap , ([key , v ['ID' ], v ['REF' ]], [IS_MA_TARGET , ALT_FREQS [i ],F_MISS_DOSAGE ]))
77
+ MAF = AAF2MAF (ALT_FREQS [i ])
78
+ heapq .heappush (target_heap , ([key , v ['ID' ], v ['REF' ]], [IS_MA_TARGET , MAF ,F_MISS_DOSAGE ]))
78
79
79
80
# Output the sorted reference variants
80
81
logger .info ("Outputting TARGET variants -> target_variants.txt" )
@@ -91,6 +92,15 @@ def run_intersect():
91
92
for vmatch in sorted_join_variants ('reference_variants.txt' , 'target_variants.txt' ):
92
93
n_matched += 1
93
94
vmatch ['SAME_REF' ] = vmatch ['REF_REF' ] == vmatch ['REF_REF' ]
95
+
96
+ # Define variant's eligibility for PCA
97
+ # From original implementation: ((IS_MA_REF == FALSE) && (IS_MA_TARGET == FALSE)) && (((IS_INDEL == FALSE) && (STRANDAMB == FALSE)) || ((IS_INDEL == TRUE) && (SAME_REF == TRUE)))
98
+ PCA_ELIGIBLE = ((vmatch ['IS_MA_REF' ] is False ) and (vmatch ['IS_MA_TARGET' ] is False )) and \
99
+ (((vmatch ['IS_INDEL' ] is False ) and (vmatch ['STRANDAMB' ] is False )) or ((vmatch ['IS_INDEL' ] is True ) and (vmatch ['SAME_REF' ] is True )))
100
+
101
+ PCA_ELIGIBLE = PCA_ELIGIBLE and (vmatch ['MAF' ] > args .maf_filter ) and (vmatch ['F_MISS_DOSAGE' ] < args .maf_filter )
102
+ vmatch ['PCA_ELIGIBLE' ] = PCA_ELIGIBLE
103
+
94
104
if n_matched == 1 :
95
105
writer = csv .DictWriter (csvfile , fieldnames = vmatch .keys (), delimiter = '\t ' )
96
106
writer .writeheader ()
@@ -119,27 +129,56 @@ def sorted_join_variants(reffile, targetfile):
119
129
f1_iter = read_var_general (reffile )
120
130
f2_iter = read_var_general (targetfile )
121
131
132
+ prev_key1 = None # Initialize previous key for file 1
133
+ prev_key2 = None # Initialize previous key for file 2
134
+
122
135
line1 = next (f1_iter , None )
123
136
line2 = next (f2_iter , None )
124
137
125
138
while line1 is not None and line2 is not None :
126
139
key1 = line1 ['CHR:POS:A0:A1' ]
127
140
key2 = line2 ['CHR:POS:A0:A1' ]
128
141
142
+ # Check if lines are sorted by the key for each file
143
+ if prev_key1 is not None and key1 < prev_key1 :
144
+ raise ValueError ("REFERENCE keys are not sorted" )
145
+ if prev_key2 is not None and key2 < prev_key2 :
146
+ raise ValueError ("TARGET keys are not sorted" )
147
+
129
148
if key1 == key2 :
130
149
line1 .update (line2 )
131
150
yield line1
151
+ prev_key1 = key1 # Update previous key for file 1
152
+ prev_key2 = key2 # Update previous key for file 2
153
+
132
154
line1 = next (f1_iter , None )
133
155
line2 = next (f2_iter , None )
134
156
elif key1 < key2 :
157
+ prev_key1 = key1 # Update previous key for file 1
135
158
line1 = next (f1_iter , None )
136
159
else :
160
+ prev_key2 = key2 # Update previous key for file 2
137
161
line2 = next (f2_iter , None )
138
162
139
163
140
164
def allele_complement (s ):
165
+ '''
166
+ Complement alleles
167
+ :param s: allele to be complemented
168
+ :return: complement
169
+ '''
141
170
return s .replace ("A" , "V" ).replace ("T" , "X" ).replace ("C" , "Y" ).replace ("G" , "Z" ).replace ("V" , "T" ).replace ("X" , "A" ).replace ("Y" , "G" ).replace ("Z" , "C" )
142
171
172
+ def AAF2MAF (aaf ):
173
+ '''
174
+ Convert alternative allele frequency (AAF) to minor allele frequency (MAF)
175
+ :param aaf: alternative allele frequency
176
+ :return: minor allele frequency (MAF)
177
+ '''
178
+ if aaf > 0.5 :
179
+ return 1 - aaf
180
+ else :
181
+ return aaf
143
182
144
183
def parse_args (args = None ):
145
184
parser = argparse .ArgumentParser (
@@ -167,7 +206,21 @@ def parse_args(args=None):
167
206
"--chrom" ,
168
207
dest = "filter_chrom" ,
169
208
required = False ,
170
- help = "whether to limit matches to specific chromosome of the reference" ,
209
+ help = "Whether to limit matches to specific chromosome of the reference" ,
210
+ )
211
+ parser .add_argument (
212
+ "--maf_target" ,
213
+ dest = "maf_filter" ,
214
+ default = 0.1 ,
215
+ required = False ,
216
+ help = "Filter: Minimum minor Allele Frequency for PCA eligibility" ,
217
+ )
218
+ parser .add_argument (
219
+ "--geno_miss" ,
220
+ dest = "maf_filter" ,
221
+ default = 0.1 ,
222
+ required = False ,
223
+ help = "Filter: Maximum Genotype missingness for PCA eligibility" ,
171
224
)
172
225
parser .add_argument (
173
226
"-v" ,
0 commit comments