Skip to content

Commit da3e3ad

Browse files
committed
Read & sort the variants from reference & target using heaps (this requires the full file to be in memory?)
1 parent 50855b7 commit da3e3ad

File tree

1 file changed

+33
-6
lines changed

1 file changed

+33
-6
lines changed

pgscatalog.match/src/pgscatalog/match/cli/intersect_cli.py

+33-6
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def run_intersect():
3232
STRANDAMB = (v['REF'] == allele_complement(ALT))
3333
heapq.heappush(ref_heap, ([key, v['ID'], v['REF']],[IS_INDEL, STRANDAMB, IS_MA_REF]))
3434

35-
# Output the sorted variants
35+
# Output the sorted reference variants
3636
for i in range(len(ref_heap)):
3737
popped = heapq.heappop(ref_heap)
3838
outf.write('\t'.join([str(x) for x in popped[0] + popped[1]]) + '\n')
@@ -43,15 +43,14 @@ def run_intersect():
4343
outf.write('CHR:POS:A0:A1\tID_TARGET\tREF_TARGET\tIS_MA_TARGET\tALT_FREQ\tF_MISS_DOSAGE\n')
4444
target_heap = []
4545
for path in args.target:
46-
pvar = read_var_general(path)
46+
pvar = read_var_general(path, chrom=None) # essential not to filter if it is target (messes up common indexing)
4747

4848
loc_afreq = path.replace('.pvar.zst', '.afreq.gz')
49-
afreq = read_var_general(loc_afreq)
49+
afreq = read_var_general(loc_afreq, chrom=None) # essential not to filter if it is target (messes up common indexing)
5050

5151
loc_vmiss = path.replace('.pvar.zst', '.vmiss.gz')
52-
vmiss = read_var_general(loc_vmiss)
52+
vmiss = read_var_general(loc_vmiss, chrom=None) # essential not to filter if it is target (messes up common indexing)
5353

54-
## ToDo: figure out if it needs to change to bim files
5554
for v, freq, miss in zip(pvar, afreq, vmiss):
5655
# if v['ID'] != freq['ID'] != miss['ID']:
5756
# print(v)
@@ -64,8 +63,11 @@ def run_intersect():
6463
key = '{}:{}:{}:{}'.format(v['#CHROM'], v['POS'], v['REF'], ALT)
6564
else:
6665
key = '{}:{}:{}:{}'.format(v['#CHROM'], v['POS'], ALT, v['REF'])
67-
heapq.heappush(target_heap, ([key, v['ID'], v['REF']], [IS_MA_TARGET, ALT_FREQS[i], F_MISS_DOSAGE]))
66+
# outf.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(key, v['ID'], v['REF'], str(IS_MA_TARGET), ALT_FREQS[i],
67+
# F_MISS_DOSAGE))
68+
heapq.heappush(target_heap, ([key, v['ID'], v['REF']], [IS_MA_TARGET, ALT_FREQS[i],F_MISS_DOSAGE]))
6869

70+
# Output the sorted reference variants
6971
for i in range(len(target_heap)):
7072
popped = heapq.heappop(target_heap)
7173
outf.write('\t'.join([str(x) for x in popped[0] + popped[1]]) + '\n')
@@ -87,6 +89,31 @@ def read_var_general(path, chrom=None):
8789
yield row
8890

8991

92+
def sorted_join(reffile, targetfile):
93+
with read_var_general(reffile) as f1, read_var_general(targetfile) as f2:
94+
f1_iter = iter(f1)
95+
f2_iter = iter(f2)
96+
97+
line1 = next(f1_iter, None)
98+
line2 = next(f2_iter, None)
99+
100+
while line1 is not None and line2 is not None:
101+
key1 = line1['CHR:POS:A0:A1']
102+
key2 = line2['CHR:POS:A0:A1']
103+
104+
if key1 == key2:
105+
yield line1.strip() + delimiter + line2.strip()
106+
line1 = next(f1_iter, None)
107+
line2 = next(f2_iter, None)
108+
elif key1 < key2:
109+
line1 = next(f1_iter, None)
110+
else:
111+
line2 = next(f2_iter, None)
112+
113+
114+
115+
116+
90117
def allele_complement(s):
91118
return s.replace("A", "V").replace("T", "X").replace("C", "Y").replace("G", "Z").replace("V", "T").replace("X", "A").replace("Y", "G").replace("Z", "C")
92119

0 commit comments

Comments
 (0)