Read & sort the variants from reference & target using heaps (this requires the full file to be in memory?)

smlmbrt · smlmbrt · commit da3e3ada59c2 · 2024-03-25T19:14:45.000Z
diff --git a/pgscatalog.match/src/pgscatalog/match/cli/intersect_cli.py b/pgscatalog.match/src/pgscatalog/match/cli/intersect_cli.py
@@ -32,7 +32,7 @@ def run_intersect():
                 STRANDAMB = (v['REF'] == allele_complement(ALT))
                 heapq.heappush(ref_heap, ([key, v['ID'], v['REF']],[IS_INDEL, STRANDAMB, IS_MA_REF]))
 
-        # Output the sorted variants
+        # Output the sorted reference variants
         for i in range(len(ref_heap)):
             popped = heapq.heappop(ref_heap)
             outf.write('\t'.join([str(x) for x in popped[0] + popped[1]]) + '\n')
@@ -43,15 +43,14 @@ def run_intersect():
         outf.write('CHR:POS:A0:A1\tID_TARGET\tREF_TARGET\tIS_MA_TARGET\tALT_FREQ\tF_MISS_DOSAGE\n')
         target_heap = []
         for path in args.target:
-            pvar = read_var_general(path)
+            pvar = read_var_general(path, chrom=None)  # essential not to filter if it is target (messes up common indexing)
 
             loc_afreq = path.replace('.pvar.zst', '.afreq.gz')
-            afreq = read_var_general(loc_afreq)
+            afreq = read_var_general(loc_afreq, chrom=None)  # essential not to filter if it is target (messes up common indexing)
 
             loc_vmiss = path.replace('.pvar.zst', '.vmiss.gz')
-            vmiss = read_var_general(loc_vmiss)
+            vmiss = read_var_general(loc_vmiss, chrom=None)  # essential not to filter if it is target (messes up common indexing)
 
-            ## ToDo: figure out if it needs to change to bim files
             for v, freq, miss in zip(pvar, afreq, vmiss):
                 # if v['ID'] != freq['ID'] != miss['ID']:
                 #     print(v)
@@ -64,8 +63,11 @@ def run_intersect():
                         key = '{}:{}:{}:{}'.format(v['#CHROM'], v['POS'], v['REF'], ALT)
                     else:
                         key = '{}:{}:{}:{}'.format(v['#CHROM'], v['POS'], ALT, v['REF'])
-                    heapq.heappush(target_heap, ([key, v['ID'], v['REF']], [IS_MA_TARGET, ALT_FREQS[i], F_MISS_DOSAGE]))
+                    # outf.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(key, v['ID'], v['REF'], str(IS_MA_TARGET), ALT_FREQS[i],
+                    #                                              F_MISS_DOSAGE))
+                    heapq.heappush(target_heap, ([key, v['ID'], v['REF']], [IS_MA_TARGET, ALT_FREQS[i],F_MISS_DOSAGE]))
 
+        # Output the sorted reference variants
         for i in range(len(target_heap)):
             popped = heapq.heappop(target_heap)
             outf.write('\t'.join([str(x) for x in popped[0] + popped[1]]) + '\n')
@@ -87,6 +89,31 @@ def read_var_general(path, chrom=None):
                     yield row
 
 
+def sorted_join(reffile, targetfile):
+    with read_var_general(reffile) as f1, read_var_general(targetfile) as f2:
+        f1_iter = iter(f1)
+        f2_iter = iter(f2)
+
+        line1 = next(f1_iter, None)
+        line2 = next(f2_iter, None)
+
+        while line1 is not None and line2 is not None:
+            key1 = line1['CHR:POS:A0:A1']
+            key2 = line2['CHR:POS:A0:A1']
+
+            if key1 == key2:
+                yield line1.strip() + delimiter + line2.strip()
+                line1 = next(f1_iter, None)
+                line2 = next(f2_iter, None)
+            elif key1 < key2:
+                line1 = next(f1_iter, None)
+            else:
+                line2 = next(f2_iter, None)
+
+
+
+
+
 def allele_complement(s):
     return s.replace("A", "V").replace("T", "X").replace("C", "Y").replace("G", "Z").replace("V", "T").replace("X", "A").replace("Y", "G").replace("Z", "C")