import argparse
import logging
- import sys
- import os
from xopen import xopen
import csv
import textwrap
@@ -33,23 +31,25 @@ def run_intersect():
heapq.heappush(ref_heap, ([key, v['ID'], v['REF']], [IS_INDEL, STRANDAMB, IS_MA_REF]))

# Output the sorted reference variants
- for i in range(len(ref_heap)):
+ n_ref = len(ref_heap)
+ for i in range(n_ref):
popped = heapq.heappop(ref_heap)
outf.write('\t'.join([str(x) for x in popped[0] + popped[1]]) + '\n')
del ref_heap
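# (note) popping the min-heap until it is empty writes reference_variants.txt in ascending
# CHR:POS:A0:A1 key order, which the sorted merge join further down relies on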

# Process & sort target variants
+ # ToDo: check whether this works for bim-format files
with xopen('target_variants.txt', 'wt') as outf:
outf.write('CHR:POS:A0:A1\tID_TARGET\tREF_TARGET\tIS_MA_TARGET\tALT_FREQ\tF_MISS_DOSAGE\n')
target_heap = []
for path in args.target:
- pvar = read_var_general(path, chrom=None)  # essential not to filter if it is target (messes up common indexing)
+ pvar = read_var_general(path, chrom=None)  # essential not to filter the target (it messes up the common line indexing)

loc_afreq = path.replace('.pvar.zst', '.afreq.gz')
- afreq = read_var_general(loc_afreq, chrom=None)  # essential not to filter if it is target (messes up common indexing)
+ afreq = read_var_general(loc_afreq, chrom=None)  # essential not to filter the target (it messes up the common line indexing)

loc_vmiss = path.replace('.pvar.zst', '.vmiss.gz')
- vmiss = read_var_general(loc_vmiss, chrom=None)  # essential not to filter if it is target (messes up common indexing)
+ vmiss = read_var_general(loc_vmiss, chrom=None)  # essential not to filter the target (it messes up the common line indexing)

for v, freq, miss in zip(pvar, afreq, vmiss):
# if v['ID'] != freq['ID'] != miss['ID']:
@@ -68,19 +68,33 @@ def run_intersect():
heapq.heappush(target_heap, ([key, v['ID'], v['REF']], [IS_MA_TARGET, ALT_FREQS[i], F_MISS_DOSAGE]))

# Output the sorted target variants
- for i in range(len(target_heap)):
+ n_target = len(target_heap)
+ for i in range(n_target):
popped = heapq.heappop(target_heap)
outf.write('\t'.join([str(x) for x in popped[0] + popped[1]]) + '\n')
del target_heap

- # ToDo: implement merge (on the same keys) of the two sorted files
+ # Merge matched variants from the two sorted files
+ n_matched = 0
+ with open('matched_variants.txt', 'w') as csvfile:
+ for vmatch in sorted_join_variants('reference_variants.txt', 'target_variants.txt'):
+ n_matched += 1
+ vmatch['SAME_REF'] = vmatch['REF_REF'] == vmatch['REF_TARGET']
+ if n_matched == 1:
+ writer = csv.DictWriter(csvfile, fieldnames=vmatch.keys(), delimiter='\t')
+ writer.writeheader()
+ writer.writerow(vmatch)
+
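+ # (note) both inputs were written in ascending key order above, so sorted_join_variants()
+ # can stream one row from each file at a time instead of loading either file fully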
+ # Output counts
+ with open('intersect_counts_{}.txt'.format(chrom), 'w') as outf:
+ outf.write('\n'.join(map(str, [n_target, n_ref, n_matched])))


def read_var_general(path, chrom=None):
with xopen(path, "rt") as f:
# ToDo: check if this is memory inefficient
reader = csv.DictReader(filter(lambda row: row[:2] != '##', f), delimiter="\t")  # need to skip VCF-style '##' comment lines; might be fully in memory though
- if chrom is None:
+ if chrom is None or chrom == 'ALL':
for row in reader:
yield row
else:
@@ -89,34 +103,32 @@ def read_var_general(path, chrom=None):
yield row


- def sorted_join(reffile, targetfile):
- with read_var_general(reffile) as f1, read_var_general(targetfile) as f2:
- f1_iter = iter(f1)
- f2_iter = iter(f2)
-
- line1 = next(f1_iter, None)
- line2 = next(f2_iter, None)
-
- while line1 is not None and line2 is not None:
- key1 = line1['CHR:POS:A0:A1']
- key2 = line2['CHR:POS:A0:A1']
-
- if key1 == key2:
- yield line1.strip() + delimiter + line2.strip()
- line1 = next(f1_iter, None)
- line2 = next(f2_iter, None)
- elif key1 < key2:
- line1 = next(f1_iter, None)
- else:
- line2 = next(f2_iter, None)
+ def sorted_join_variants(reffile, targetfile):
+ f1_iter = read_var_general(reffile)
+ f2_iter = read_var_general(targetfile)

+ line1 = next(f1_iter, None)
+ line2 = next(f2_iter, None)

+ while line1 is not None and line2 is not None:
+ key1 = line1['CHR:POS:A0:A1']
+ key2 = line2['CHR:POS:A0:A1']

+ if key1 == key2:
+ line1.update(line2)
+ yield line1
+ line1 = next(f1_iter, None)
+ line2 = next(f2_iter, None)
+ elif key1 < key2:
+ line1 = next(f1_iter, None)
+ else:
+ line2 = next(f2_iter, None)
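+ # e.g. (hypothetical keys): if the reference file holds 1:100:A:G and 1:200:C:T while the
+ # target file holds 1:200:C:T and 1:300:G:A, only the 1:200:C:T row is yielded, with the
+ # target columns merged into the reference row via dict.update()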


def allele_complement(s):
return s.replace("A", "V").replace("T", "X").replace("C", "Y").replace("G", "Z").replace("V", "T").replace("X", "A").replace("Y", "G").replace("Z", "C")
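# e.g. allele_complement("ACGT") returns "TGCA"; the V/X/Y/Z placeholders stop the chained
# replace() calls from re-complementing bases that an earlier replace already swapped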

+
def parse_args(args=None):
parser = argparse.ArgumentParser(
description=_description_text(),