Skip to content

Commit 50855b7

Browse files
committed
Read & sort the variants from reference and target using heaps (this requires the full file to be in memory)
1 parent 171db8b commit 50855b7

File tree

1 file changed

+23
-12
lines changed

1 file changed

+23
-12
lines changed

pgscatalog.match/src/pgscatalog/match/cli/intersect_cli.py

+23-12
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,23 @@
11
import argparse
22
import logging
33
import sys
4+
import os
45
from xopen import xopen
56
import csv
67
import textwrap
8+
import heapq
79

810

9-
from pgscatalog.core import TargetVariants
10-
1111
logger = logging.getLogger(__name__)
1212

1313

1414
def run_intersect():
1515
args = parse_args()
1616

17-
# Process reference variants
17+
# Process & sort reference variants
1818
with xopen('reference_variants.txt', 'wt') as outf:
1919
outf.write('CHR:POS:A0:A1\tID_REF\tREF_REF\tIS_INDEL\tSTRANDAMB\tIS_MA_REF\n')
20+
ref_heap = []
2021
ref_pvar = read_var_general(args.reference, chrom=args.filter_chrom)
2122
for v in ref_pvar:
2223
ALTs = v['ALT'].split(',')
@@ -29,11 +30,18 @@ def run_intersect():
2930

3031
IS_INDEL = (len(v['REF']) > 1) | (len(ALT) > 1)
3132
STRANDAMB = (v['REF'] == allele_complement(ALT))
32-
outf.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(key,v['ID'], v['REF'], IS_INDEL, STRANDAMB, IS_MA_REF))
33+
heapq.heappush(ref_heap, ([key, v['ID'], v['REF']],[IS_INDEL, STRANDAMB, IS_MA_REF]))
34+
35+
# Output the sorted variants
36+
for i in range(len(ref_heap)):
37+
popped = heapq.heappop(ref_heap)
38+
outf.write('\t'.join([str(x) for x in popped[0] + popped[1]]) + '\n')
39+
del ref_heap
3340

34-
# Process target variants
41+
# Process & sort target variants
3542
with xopen('target_variants.txt', 'wt') as outf:
3643
outf.write('CHR:POS:A0:A1\tID_TARGET\tREF_TARGET\tIS_MA_TARGET\tALT_FREQ\tF_MISS_DOSAGE\n')
44+
target_heap = []
3745
for path in args.target:
3846
pvar = read_var_general(path)
3947

@@ -56,17 +64,20 @@ def run_intersect():
5664
key = '{}:{}:{}:{}'.format(v['#CHROM'], v['POS'], v['REF'], ALT)
5765
else:
5866
key = '{}:{}:{}:{}'.format(v['#CHROM'], v['POS'], ALT, v['REF'])
59-
outf.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(key,v['ID'],v['REF'], str(IS_MA_TARGET), ALT_FREQS[i], F_MISS_DOSAGE))
67+
heapq.heappush(target_heap, ([key, v['ID'], v['REF']], [IS_MA_TARGET, ALT_FREQS[i], F_MISS_DOSAGE]))
68+
69+
for i in range(len(target_heap)):
70+
popped = heapq.heappop(target_heap)
71+
outf.write('\t'.join([str(x) for x in popped[0] + popped[1]]) + '\n')
72+
del target_heap
73+
74+
# ToDo: implement merge (on the same keys) of the two sorted files
6075

6176

6277
def read_var_general(path, chrom=None):
6378
with xopen(path, "rt") as f:
64-
for line in f:
65-
if line.startswith("##"):
66-
continue
67-
else:
68-
fieldnames = line.strip().split("\t")
69-
reader = csv.DictReader(f, fieldnames=fieldnames, delimiter="\t")
79+
# ToDo: check if this is memory inefficent
80+
reader = csv.DictReader(filter(lambda row: row[:2]!='##', f), delimiter="\t") # need to remove comments of VCF-like characters, might be fully in memory though
7081
if chrom is None:
7182
for row in reader:
7283
yield row

0 commit comments

Comments
 (0)