Skip to content

Commit 14e55a4

Browse files
committed
Add PCA eligibility checks, and add the check for file sort while merging
1 parent 3cdabfc commit 14e55a4

File tree

1 file changed

+57
-4
lines changed

1 file changed

+57
-4
lines changed

pgscatalog.match/src/pgscatalog/match/cli/intersect_cli.py

+57-4
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def run_intersect():
4848
# Process & sort target variants
4949
# ToDo: check if it works for bim format files?
5050
with xopen('target_variants.txt', 'wt') as outf:
51-
outf.write('CHR:POS:A0:A1\tID_TARGET\tREF_TARGET\tIS_MA_TARGET\tALT_FREQ\tF_MISS_DOSAGE\n')
51+
outf.write('CHR:POS:A0:A1\tID_TARGET\tREF_TARGET\tIS_MA_TARGET\tMAF\tF_MISS_DOSAGE\n')
5252
target_heap = []
5353
for path in args.target:
5454
logger.info("Reading & sorting TARGET variants: {}".format(path))
@@ -64,7 +64,7 @@ def run_intersect():
6464
# if v['ID'] != freq['ID'] != miss['ID']:
6565
# print(v)
6666
ALTs = v['ALT'].split(',')
67-
ALT_FREQS = freq['ALT_FREQS'].split(',')
67+
ALT_FREQS = [float(x) for x in freq['ALT_FREQS'].split(',')]
6868
F_MISS_DOSAGE = miss['F_MISS_DOSAGE']
6969
IS_MA_TARGET = len(ALTs) > 1
7070
for i, ALT in enumerate(ALTs):
@@ -74,7 +74,8 @@ def run_intersect():
7474
key = '{}:{}:{}:{}'.format(v['#CHROM'], v['POS'], ALT, v['REF'])
7575
# outf.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(key, v['ID'], v['REF'], str(IS_MA_TARGET), ALT_FREQS[i],
7676
# F_MISS_DOSAGE))
77-
heapq.heappush(target_heap, ([key, v['ID'], v['REF']], [IS_MA_TARGET, ALT_FREQS[i],F_MISS_DOSAGE]))
77+
MAF = AAF2MAF(ALT_FREQS[i])
78+
heapq.heappush(target_heap, ([key, v['ID'], v['REF']], [IS_MA_TARGET, MAF,F_MISS_DOSAGE]))
7879

7980
# Output the sorted target variants
8081
logger.info("Outputting TARGET variants -> target_variants.txt")
@@ -91,6 +92,15 @@ def run_intersect():
9192
for vmatch in sorted_join_variants('reference_variants.txt', 'target_variants.txt'):
9293
n_matched += 1
9394
vmatch['SAME_REF'] = vmatch['REF_REF'] == vmatch['REF_REF']
95+
96+
# Define variant's eligibility for PCA
97+
# From original implementation: ((IS_MA_REF == FALSE) && (IS_MA_TARGET == FALSE)) && (((IS_INDEL == FALSE) && (STRANDAMB == FALSE)) || ((IS_INDEL == TRUE) && (SAME_REF == TRUE)))
98+
PCA_ELIGIBLE = ((vmatch['IS_MA_REF'] is False) and (vmatch['IS_MA_TARGET'] is False)) and \
99+
(((vmatch['IS_INDEL'] is False) and (vmatch['STRANDAMB'] is False)) or ((vmatch['IS_INDEL'] is True) and (vmatch['SAME_REF'] is True)))
100+
101+
PCA_ELIGIBLE = PCA_ELIGIBLE and (vmatch['MAF'] > args.maf_filter) and (vmatch['F_MISS_DOSAGE'] < args.maf_filter)
102+
vmatch['PCA_ELIGIBLE'] = PCA_ELIGIBLE
103+
94104
if n_matched == 1:
95105
writer = csv.DictWriter(csvfile, fieldnames=vmatch.keys(), delimiter='\t')
96106
writer.writeheader()
@@ -119,27 +129,56 @@ def sorted_join_variants(reffile, targetfile):
119129
f1_iter = read_var_general(reffile)
120130
f2_iter = read_var_general(targetfile)
121131

132+
prev_key1 = None # Initialize previous key for file 1
133+
prev_key2 = None # Initialize previous key for file 2
134+
122135
line1 = next(f1_iter, None)
123136
line2 = next(f2_iter, None)
124137

125138
while line1 is not None and line2 is not None:
126139
key1 = line1['CHR:POS:A0:A1']
127140
key2 = line2['CHR:POS:A0:A1']
128141

142+
# Check if lines are sorted by the key for each file
143+
if prev_key1 is not None and key1 < prev_key1:
144+
raise ValueError("REFERENCE keys are not sorted")
145+
if prev_key2 is not None and key2 < prev_key2:
146+
raise ValueError("TARGET keys are not sorted")
147+
129148
if key1 == key2:
130149
line1.update(line2)
131150
yield line1
151+
prev_key1 = key1 # Update previous key for file 1
152+
prev_key2 = key2 # Update previous key for file 2
153+
132154
line1 = next(f1_iter, None)
133155
line2 = next(f2_iter, None)
134156
elif key1 < key2:
157+
prev_key1 = key1 # Update previous key for file 1
135158
line1 = next(f1_iter, None)
136159
else:
160+
prev_key2 = key2 # Update previous key for file 2
137161
line2 = next(f2_iter, None)
138162

139163

140164
# Translation table for the four canonical upper-case bases, built once.
_COMPLEMENT_TABLE = str.maketrans("ACGT", "TGCA")


def allele_complement(s):
    '''
    Complement alleles

    Each upper-case base is swapped with its partner (A<->T, C<->G); every
    other character is returned unchanged.

    :param s: allele to be complemented
    :return: complement
    '''
    # A single-pass translation needs no temporary placeholder letters.
    # Chaining str.replace() through V/X/Y/Z placeholders rewrote any of
    # those letters already present in the input (e.g. "<INV>" -> "<INT>").
    return s.translate(_COMPLEMENT_TABLE)
142171

172+
def AAF2MAF(aaf):
    '''
    Fold an alternative allele frequency (AAF) into a minor allele frequency (MAF)

    :param aaf: alternative allele frequency
    :return: ``1 - aaf`` when ``aaf`` is above 0.5, otherwise ``aaf`` unchanged
    '''
    # Above one half the alternative allele is the major one, so the minor
    # allele frequency is the remainder.
    return 1 - aaf if aaf > 0.5 else aaf
143182

144183
def parse_args(args=None):
145184
parser = argparse.ArgumentParser(
@@ -167,7 +206,21 @@ def parse_args(args=None):
167206
"--chrom",
168207
dest="filter_chrom",
169208
required=False,
170-
help="whether to limit matches to specific chromosome of the reference",
209+
help="Whether to limit matches to specific chromosome of the reference",
210+
)
211+
parser.add_argument(
212+
"--maf_target",
213+
dest="maf_filter",
214+
default=0.1,
215+
required=False,
216+
help="Filter: Minimum minor Allele Frequency for PCA eligibility",
217+
)
218+
parser.add_argument(
219+
"--geno_miss",
220+
dest="maf_filter",
221+
default=0.1,
222+
required=False,
223+
help="Filter: Maximum Genotype missingness for PCA eligibility",
171224
)
172225
parser.add_argument(
173226
"-v",

0 commit comments

Comments
 (0)