
Commit 2dbea73

Merge sorted variants, output the variant counts. pure python.
1 parent da3e3ad commit 2dbea73

File tree

1 file changed: +41 -29 lines

pgscatalog.match/src/pgscatalog/match/cli/intersect_cli.py (+41 -29)
@@ -1,7 +1,5 @@
 import argparse
 import logging
-import sys
-import os
 from xopen import xopen
 import csv
 import textwrap
@@ -33,23 +31,25 @@ def run_intersect():
             heapq.heappush(ref_heap, ([key, v['ID'], v['REF']],[IS_INDEL, STRANDAMB, IS_MA_REF]))

         # Output the sorted reference variants
-        for i in range(len(ref_heap)):
+        n_ref = len(ref_heap)
+        for i in range(n_ref):
             popped = heapq.heappop(ref_heap)
             outf.write('\t'.join([str(x) for x in popped[0] + popped[1]]) + '\n')
         del ref_heap

     # Process & sort target variants
+    # ToDo: check if it works for bim format files?
     with xopen('target_variants.txt', 'wt') as outf:
         outf.write('CHR:POS:A0:A1\tID_TARGET\tREF_TARGET\tIS_MA_TARGET\tALT_FREQ\tF_MISS_DOSAGE\n')
         target_heap = []
         for path in args.target:
-            pvar = read_var_general(path, chrom=None) # essential not to filter if it is target (messes up common indexing)
+            pvar = read_var_general(path, chrom=None) # essential not to filter if it is target (messes up common line indexing)

             loc_afreq = path.replace('.pvar.zst', '.afreq.gz')
-            afreq = read_var_general(loc_afreq, chrom=None) # essential not to filter if it is target (messes up common indexing)
+            afreq = read_var_general(loc_afreq, chrom=None) # essential not to filter if it is target (messes up common line indexing)

             loc_vmiss = path.replace('.pvar.zst', '.vmiss.gz')
-            vmiss = read_var_general(loc_vmiss, chrom=None) # essential not to filter if it is target (messes up common indexing)
+            vmiss = read_var_general(loc_vmiss, chrom=None) # essential not to filter if it is target (messes up common line indexing)

             for v, freq, miss in zip(pvar, afreq, vmiss):
                 # if v['ID'] != freq['ID'] != miss['ID']:
@@ -68,19 +68,33 @@ def run_intersect():
                 heapq.heappush(target_heap, ([key, v['ID'], v['REF']], [IS_MA_TARGET, ALT_FREQS[i],F_MISS_DOSAGE]))

         # Output the sorted reference variants
-        for i in range(len(target_heap)):
+        n_target = len(target_heap)
+        for i in range(n_target):
             popped = heapq.heappop(target_heap)
             outf.write('\t'.join([str(x) for x in popped[0] + popped[1]]) + '\n')
         del target_heap

-    # ToDo: implement merge (on the same keys) of the two sorted files
+    # Merge matched variants on sorted files
+    n_matched = 0
+    with open('matched_variants.txt', 'w') as csvfile:
+        for vmatch in sorted_join_variants('reference_variants.txt', 'target_variants.txt'):
+            n_matched += 1
+            vmatch['SAME_REF'] = vmatch['REF_REF'] == vmatch['REF_TARGET']
+            if n_matched == 1:
+                writer = csv.DictWriter(csvfile, fieldnames=vmatch.keys(), delimiter='\t')
+                writer.writeheader()
+            writer.writerow(vmatch)
+
+    # Output counts
+    with open('intersect_counts_{}.txt'.format(chrom), 'w') as outf:
+        outf.write('\n'.join(map(str, [n_target, n_ref, n_matched])))


 def read_var_general(path, chrom=None):
     with xopen(path, "rt") as f:
         # ToDo: check if this is memory inefficent
         reader = csv.DictReader(filter(lambda row: row[:2]!='##', f), delimiter="\t") # need to remove comments of VCF-like characters, might be fully in memory though
-        if chrom is None:
+        if chrom is None or chrom == 'ALL':
             for row in reader:
                 yield row
         else:
@@ -89,34 +103,32 @@ def read_var_general(path, chrom=None):
                     yield row


-def sorted_join(reffile, targetfile):
-    with read_var_general(reffile) as f1, read_var_general(targetfile) as f2:
-        f1_iter = iter(f1)
-        f2_iter = iter(f2)
-
-        line1 = next(f1_iter, None)
-        line2 = next(f2_iter, None)
-
-        while line1 is not None and line2 is not None:
-            key1 = line1['CHR:POS:A0:A1']
-            key2 = line2['CHR:POS:A0:A1']
-
-            if key1 == key2:
-                yield line1.strip() + delimiter + line2.strip()
-                line1 = next(f1_iter, None)
-                line2 = next(f2_iter, None)
-            elif key1 < key2:
-                line1 = next(f1_iter, None)
-            else:
-                line2 = next(f2_iter, None)
+def sorted_join_variants(reffile, targetfile):
+    f1_iter = read_var_general(reffile)
+    f2_iter = read_var_general(targetfile)

+    line1 = next(f1_iter, None)
+    line2 = next(f2_iter, None)

+    while line1 is not None and line2 is not None:
+        key1 = line1['CHR:POS:A0:A1']
+        key2 = line2['CHR:POS:A0:A1']

+        if key1 == key2:
+            line1.update(line2)
+            yield line1
+            line1 = next(f1_iter, None)
+            line2 = next(f2_iter, None)
+        elif key1 < key2:
+            line1 = next(f1_iter, None)
+        else:
+            line2 = next(f2_iter, None)


 def allele_complement(s):
     return s.replace("A", "V").replace("T", "X").replace("C", "Y").replace("G", "Z").replace("V", "T").replace("X", "A").replace("Y", "G").replace("Z", "C")

+
 def parse_args(args=None):
     parser = argparse.ArgumentParser(
         description=_description_text(),

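The shape of this change: each side (reference and target) is pushed onto a heap keyed on the 'CHR:POS:A0:A1' string, popped back out in sorted order, and the two sorted streams are then merged with a two-pointer join (sorted_join_variants) while the three counts are collected. The sketch below is a minimal, self-contained replay of that pipeline on toy in-memory rows; the helper name two_pointer_join and the example variants are hypothetical and not part of this commit.

import heapq

def two_pointer_join(left_rows, right_rows, key='CHR:POS:A0:A1'):
    # Classic merge join: both inputs must already be sorted on `key`.
    left_iter, right_iter = iter(left_rows), iter(right_rows)
    left, right = next(left_iter, None), next(right_iter, None)
    while left is not None and right is not None:
        if left[key] == right[key]:
            merged = dict(left)
            merged.update(right)            # target columns join the reference columns
            yield merged
            left, right = next(left_iter, None), next(right_iter, None)
        elif left[key] < right[key]:
            left = next(left_iter, None)    # reference key is behind: advance it
        else:
            right = next(right_iter, None)  # target key is behind: advance it

# Toy "reference" and "target" variants, pushed unsorted onto heaps
# in the same ([key, ID, REF], [extra fields]) shape used above.
ref_heap, target_heap = [], []
heapq.heappush(ref_heap, (['1:200:C:T', 'rs2', 'C'], []))
heapq.heappush(ref_heap, (['1:100:A:G', 'rs1', 'A'], []))
heapq.heappush(target_heap, (['1:300:G:T', '1:300:G:T', 'G'], []))
heapq.heappush(target_heap, (['1:200:C:T', '1:200:C:T', 'C'], []))

# Popping yields rows sorted on the composite string key.
n_ref = len(ref_heap)
ref_rows = []
for _ in range(n_ref):
    (k, vid, ref), _extra = heapq.heappop(ref_heap)
    ref_rows.append({'CHR:POS:A0:A1': k, 'ID_REF': vid, 'REF_REF': ref})

n_target = len(target_heap)
target_rows = []
for _ in range(n_target):
    (k, vid, ref), _extra = heapq.heappop(target_heap)
    target_rows.append({'CHR:POS:A0:A1': k, 'ID_TARGET': vid, 'REF_TARGET': ref})

# Merge the two sorted streams and count matches.
matches = list(two_pointer_join(ref_rows, target_rows))
n_matched = len(matches)
print(n_ref, n_target, n_matched)                              # 2 2 1
print(matches[0]['REF_REF'] == matches[0]['REF_TARGET'])       # True, i.e. SAME_REF

Because the merge compares the composite key strings directly, it only relies on both files being written in the same order, which the identical heap keys guarantee; that order is lexicographic rather than numeric (for example '10:...' sorts before '2:...'), which is fine for matching as long as both sides use it.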