@@ -55,17 +55,17 @@ def run_intersect():
     target_heap = []
     for path in args.target:
         logger.info("Reading TARGET variants: {}".format(path))
-        pvar = read_var_general(path, chrom=None)  # essential not to filter if it is target (messes up common line indexing)
+        pvar = read_var_general(path, chrom=None)  # essential not to filter target (messes up common line indexing)

         loc_afreq = path.replace('.pvar.zst', '.afreq.gz')
-        afreq = read_var_general(loc_afreq, chrom=None)  # essential not to filter if it is target (messes up common line indexing)
+        afreq = read_var_general(loc_afreq, chrom=None)  # essential not to filter target (messes up common line indexing)

         loc_vmiss = path.replace('.pvar.zst', '.vmiss.gz')
-        vmiss = read_var_general(loc_vmiss, chrom=None)  # essential not to filter if it is target (messes up common line indexing)
+        vmiss = read_var_general(loc_vmiss, chrom=None)  # essential not to filter target (messes up common line indexing)

         for v, freq, miss in zip(pvar, afreq, vmiss):
-            # if v['ID'] != freq['ID'] != miss['ID']:
-            #     print(v)
+            if all([v['ID'], freq['ID'], miss['#ID']]) is False:
+                raise ValueError("TARGET variant files are not sorted")
             ALTs = v['ALT'].split(',')
             ALT_FREQS = [float(x) for x in freq['ALT_FREQS'].split(',')]
             F_MISS_DOSAGE = miss['F_MISS_DOSAGE']
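
The new check above depends on the three plink2 outputs being line-aligned, since `zip` pairs rows by position; note that `all([...]) is False` only verifies that each row carries a non-empty ID, not that the IDs are equal. A minimal, self-contained sketch of that behaviour (the dict rows and IDs below are made up):

    # Stand-ins for rows parsed from .pvar.zst / .afreq.gz / .vmiss.gz (hypothetical data)
    pvar = [{'ID': 'rs1'}, {'ID': 'rs2'}]
    afreq = [{'ID': 'rs1'}, {'ID': 'rs2'}]
    vmiss = [{'#ID': 'rs1'}, {'#ID': ''}]  # empty ID simulates a truncated or misaligned file

    for v, freq, miss in zip(pvar, afreq, vmiss):
        if all([v['ID'], freq['ID'], miss['#ID']]) is False:  # any empty ID -> files not line-aligned
            raise ValueError("TARGET variant files are not sorted")
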
@@ -75,10 +75,8 @@ def run_intersect():
                     key = '{}:{}:{}:{}'.format(v['#CHROM'], v['POS'], v['REF'], ALT)
                 else:
                     key = '{}:{}:{}:{}'.format(v['#CHROM'], v['POS'], ALT, v['REF'])
-                # outf.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(key, v['ID'], v['REF'], str(IS_MA_TARGET), ALT_FREQS[i],
-                #                                              F_MISS_DOSAGE))
-                MAF = AAF2MAF(ALT_FREQS[i])
-                target_heap.append(([key, v['ID'], v['REF']], [IS_MA_TARGET, MAF,F_MISS_DOSAGE]))
+                MAF = aaf2maf(ALT_FREQS[i])
+                target_heap.append(([key, v['ID'], v['REF']], [IS_MA_TARGET, MAF, F_MISS_DOSAGE]))

     logger.info("Sorting TARGET variants (heapify)")
     heapq.heapify(target_heap)
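
For context, `heapq.heapify` orders these entries by their first element, so the heap is effectively keyed on the `CHROM:POS:allele1:allele2` string built above (lexicographic string order, not numeric chromosome order). A small self-contained sketch with made-up keys:

    import heapq

    # Entries shaped like ([key, ID, REF], [IS_MA_TARGET, MAF, F_MISS_DOSAGE]) -- values are illustrative
    target_heap = [
        (['2:500:A:G', 'rs3', 'A'], [False, 0.12, '0']),
        (['1:100:C:T', 'rs1', 'C'], [False, 0.40, '0.01']),
        (['1:200:G:T', 'rs2', 'G'], [True, 0.05, '0']),
    ]
    heapq.heapify(target_heap)
    while target_heap:
        print(heapq.heappop(target_heap)[0][0])  # 1:100:C:T, then 1:200:G:T, then 2:500:A:G
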
@@ -105,7 +103,8 @@ def run_intersect():
         PCA_ELIGIBLE = ((vmatch['IS_MA_REF'] == 'False') and (vmatch['IS_MA_TARGET'] == 'False')) and \
                        (((vmatch['IS_INDEL'] == 'False') and (vmatch['STRANDAMB'] == 'False')) or ((vmatch['IS_INDEL'] == 'True') and (vmatch['SAME_REF'] == 'True')))

-        PCA_ELIGIBLE = PCA_ELIGIBLE and (float(vmatch['MAF']) > args.maf_filter) and (float(vmatch['F_MISS_DOSAGE']) < args.maf_filter)
+        PCA_ELIGIBLE = PCA_ELIGIBLE and (float(vmatch['MAF']) > args.maf_filter) and \
+                       (float(vmatch['F_MISS_DOSAGE']) < args.vmiss_filter)
         vmatch['PCA_ELIGIBLE'] = PCA_ELIGIBLE
         if PCA_ELIGIBLE is True:
             n_PCA_ELIGIBLE += 1
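
This hunk fixes what looks like a copy-paste slip: the missingness term was previously compared against `args.maf_filter` rather than the new `args.vmiss_filter` (registered under `--geno_miss` further down). A minimal sketch of the corrected predicate, using a hypothetical helper name and illustrative thresholds:

    def pca_eligible(maf, f_miss_dosage, maf_filter, vmiss_filter):
        # common enough AND well genotyped; before this commit both terms used maf_filter
        return (maf > maf_filter) and (f_miss_dosage < vmiss_filter)

    print(pca_eligible(0.20, 0.02, maf_filter=0.05, vmiss_filter=0.1))  # True
    print(pca_eligible(0.20, 0.50, maf_filter=0.05, vmiss_filter=0.1))  # False: too much missing dosage
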
@@ -114,7 +113,8 @@ def run_intersect():
             writer = csv.DictWriter(csvfile, fieldnames=vmatch.keys(), delimiter='\t')
             writer.writeheader()
         writer.writerow(vmatch)
-    logger.info("{}/{} ({:.2f} variants are eligible for PCA".format(n_PCA_ELIGIBLE, n_matched, 100 * n_PCA_ELIGIBLE / n_matched))
+    logger.info("{}/{} ({:.2f}%) variants are eligible for PCA".format(n_PCA_ELIGIBLE, n_matched,
+                                                                       100 * n_PCA_ELIGIBLE / n_matched))

     # Output counts
     logger.info("Outputting variant counts -> intersect_counts_$.txt")
@@ -123,9 +123,15 @@ def run_intersect():


 def read_var_general(path, chrom=None):
+    """
+    General function for reading variant files from plink2 outputs
+    :param path: path to variant file
+    :param chrom: filter to specific chromosome
+    :return: row of a df as a dict
+    """
     with xopen(path, "rt") as f:
         # ToDo: check if this is memory inefficent
-        reader = csv.DictReader(filter(lambda row: row[:2]!='##', f), delimiter="\t")  # need to remove comments of VCF-like characters, might be fully in memory though
+        reader = csv.DictReader(filter(lambda r: r[:2] != '##', f), delimiter="\t")  # need to remove comments of VCF-like characters, might be fully in memory though
         if (chrom is None) or (chrom == 'ALL'):
             for row in reader:
                 yield row
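
`read_var_general` yields one dict per data row and skips the `##` meta-lines that plink2/VCF-style headers can contain. A self-contained sketch of that header-skipping pattern, using an in-memory stand-in for a .pvar file:

    import csv
    import io

    text = "##fileformat=PVARv1.0\n#CHROM\tPOS\tID\tREF\tALT\n22\t100\trs1\tA\tG\n"
    reader = csv.DictReader(filter(lambda r: r[:2] != '##', io.StringIO(text)), delimiter="\t")
    for row in reader:
        print(row['#CHROM'], row['POS'], row['ID'])  # 22 100 rs1
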
@@ -135,9 +141,9 @@ def read_var_general(path, chrom=None):
                 yield row


-def sorted_join_variants(reffile, targetfile):
-    f1_iter = read_var_general(reffile)
-    f2_iter = read_var_general(targetfile)
+def sorted_join_variants(path_ref, path_target):
+    f1_iter = read_var_general(path_ref)
+    f2_iter = read_var_general(path_target)

     prev_key1 = None  # Initialize previous key for file 1
     prev_key2 = None  # Initialize previous key for file 2
@@ -172,24 +178,26 @@ def sorted_join_variants(reffile, targetfile):


 def allele_complement(s):
-    '''
+    """
     Complement alleles
     :param s: allele to be complemented
     :return: complement
-    '''
+    """
     return s.replace("A", "V").replace("T", "X").replace("C", "Y").replace("G", "Z").replace("V", "T").replace("X", "A").replace("Y", "G").replace("Z", "C")

-def AAF2MAF(aaf):
-    '''
+
+def aaf2maf(aaf):
+    """
     Convert alternative allele frequency (AAF) to minor allele frequency (MAF)
     :param aaf: alternative allele frequency
     :return: minor allele frequency (MAF)
-    '''
+    """
     if aaf > 0.5:
         return 1 - aaf
     else:
         return aaf

+
 def parse_args(args=None):
     parser = argparse.ArgumentParser(
         description=_description_text(),
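
As a quick reference for the renamed helper: `allele_complement` routes each base through a placeholder letter (V/X/Y/Z) so that earlier replacements are not clobbered by later ones, and `aaf2maf` folds any frequency above 0.5 onto the minor allele. Hypothetical calls to the functions defined above:

    print(allele_complement("AC"))  # 'TG' -- A->T and C->G via the placeholder letters
    print(aaf2maf(0.8))             # 1 - 0.8, folded to the minor allele frequency
    print(aaf2maf(0.3))             # 0.3, already at or below 0.5
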
@@ -227,7 +235,7 @@ def parse_args(args=None):
     )
     parser.add_argument(
         "--geno_miss",
-        dest="maf_filter",
+        dest="vmiss_filter",
         default=0.1,
         required=False,
         help="Filter: Maximum Genotype missingness for PCA eligibility",