1
1
import argparse
2
2
import logging
3
3
import sys
4
+ import os
4
5
from xopen import xopen
5
6
import csv
6
7
import textwrap
8
+ import heapq
7
9
8
10
9
- from pgscatalog .core import TargetVariants
10
-
11
11
logger = logging .getLogger (__name__ )
12
12
13
13
14
14
def run_intersect ():
15
15
args = parse_args ()
16
16
17
- # Process reference variants
17
+ # Process & sort reference variants
18
18
with xopen ('reference_variants.txt' , 'wt' ) as outf :
19
19
outf .write ('CHR:POS:A0:A1\t ID_REF\t REF_REF\t IS_INDEL\t STRANDAMB\t IS_MA_REF\n ' )
20
+ ref_heap = []
20
21
ref_pvar = read_var_general (args .reference , chrom = args .filter_chrom )
21
22
for v in ref_pvar :
22
23
ALTs = v ['ALT' ].split (',' )
@@ -29,11 +30,18 @@ def run_intersect():
29
30
30
31
IS_INDEL = (len (v ['REF' ]) > 1 ) | (len (ALT ) > 1 )
31
32
STRANDAMB = (v ['REF' ] == allele_complement (ALT ))
32
- outf .write ('{}\t {}\t {}\t {}\t {}\t {}\n ' .format (key ,v ['ID' ], v ['REF' ], IS_INDEL , STRANDAMB , IS_MA_REF ))
33
+ heapq .heappush (ref_heap , ([key , v ['ID' ], v ['REF' ]],[IS_INDEL , STRANDAMB , IS_MA_REF ]))
34
+
35
+ # Output the sorted variants
36
+ for i in range (len (ref_heap )):
37
+ popped = heapq .heappop (ref_heap )
38
+ outf .write ('\t ' .join ([str (x ) for x in popped [0 ] + popped [1 ]]) + '\n ' )
39
+ del ref_heap
33
40
34
- # Process target variants
41
+ # Process & sort target variants
35
42
with xopen ('target_variants.txt' , 'wt' ) as outf :
36
43
outf .write ('CHR:POS:A0:A1\t ID_TARGET\t REF_TARGET\t IS_MA_TARGET\t ALT_FREQ\t F_MISS_DOSAGE\n ' )
44
+ target_heap = []
37
45
for path in args .target :
38
46
pvar = read_var_general (path )
39
47
@@ -56,17 +64,20 @@ def run_intersect():
56
64
key = '{}:{}:{}:{}' .format (v ['#CHROM' ], v ['POS' ], v ['REF' ], ALT )
57
65
else :
58
66
key = '{}:{}:{}:{}' .format (v ['#CHROM' ], v ['POS' ], ALT , v ['REF' ])
59
- outf .write ('{}\t {}\t {}\t {}\t {}\t {}\n ' .format (key ,v ['ID' ],v ['REF' ], str (IS_MA_TARGET ), ALT_FREQS [i ], F_MISS_DOSAGE ))
67
+ heapq .heappush (target_heap , ([key , v ['ID' ], v ['REF' ]], [IS_MA_TARGET , ALT_FREQS [i ], F_MISS_DOSAGE ]))
68
+
69
+ for i in range (len (target_heap )):
70
+ popped = heapq .heappop (target_heap )
71
+ outf .write ('\t ' .join ([str (x ) for x in popped [0 ] + popped [1 ]]) + '\n ' )
72
+ del target_heap
73
+
74
+ # ToDo: implement merge (on the same keys) of the two sorted files
60
75
61
76
62
77
def read_var_general (path , chrom = None ):
63
78
with xopen (path , "rt" ) as f :
64
- for line in f :
65
- if line .startswith ("##" ):
66
- continue
67
- else :
68
- fieldnames = line .strip ().split ("\t " )
69
- reader = csv .DictReader (f , fieldnames = fieldnames , delimiter = "\t " )
79
+ # ToDo: check if this is memory inefficient
80
+ reader = csv .DictReader (filter (lambda row : row [:2 ]!= '##' , f ), delimiter = "\t " ) # need to remove comments of VCF-like characters, might be fully in memory though
70
81
if chrom is None :
71
82
for row in reader :
72
83
yield row
0 commit comments