Skip to content

Commit 486b2e5

Browse files
committed
QC checks and formatting fixes
1 parent b20acc2 commit 486b2e5

File tree

1 file changed

+29
-21
lines changed

1 file changed

+29
-21
lines changed

pgscatalog.match/src/pgscatalog/match/cli/intersect_cli.py

+29-21
Original file line number | Diff line number | Diff line change
@@ -55,17 +55,17 @@ def run_intersect():
5555
target_heap = []
5656
for path in args.target:
5757
logger.info("Reading TARGET variants: {}".format(path))
58-
pvar = read_var_general(path, chrom=None) # essential not to filter if it is target (messes up common line indexing)
58+
pvar = read_var_general(path, chrom=None) # essential not to filter target (messes up common line indexing)
5959

6060
loc_afreq = path.replace('.pvar.zst', '.afreq.gz')
61-
afreq = read_var_general(loc_afreq, chrom=None) # essential not to filter if it is target (messes up common line indexing)
61+
afreq = read_var_general(loc_afreq, chrom=None) # essential not to filter target (messes up common line indexing)
6262

6363
loc_vmiss = path.replace('.pvar.zst', '.vmiss.gz')
64-
vmiss = read_var_general(loc_vmiss, chrom=None) # essential not to filter if it is target (messes up common line indexing)
64+
vmiss = read_var_general(loc_vmiss, chrom=None) # essential not to filter target (messes up common line indexing)
6565

6666
for v, freq, miss in zip(pvar, afreq, vmiss):
67-
# if v['ID'] != freq['ID'] != miss['ID']:
68-
# print(v)
67+
if all([v['ID'], freq['ID'], miss['#ID']]) is False:
68+
raise ValueError("TARGET variant files are not sorted")
6969
ALTs = v['ALT'].split(',')
7070
ALT_FREQS = [float(x) for x in freq['ALT_FREQS'].split(',')]
7171
F_MISS_DOSAGE = miss['F_MISS_DOSAGE']
@@ -75,10 +75,8 @@ def run_intersect():
7575
key = '{}:{}:{}:{}'.format(v['#CHROM'], v['POS'], v['REF'], ALT)
7676
else:
7777
key = '{}:{}:{}:{}'.format(v['#CHROM'], v['POS'], ALT, v['REF'])
78-
# outf.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(key, v['ID'], v['REF'], str(IS_MA_TARGET), ALT_FREQS[i],
79-
# F_MISS_DOSAGE))
80-
MAF = AAF2MAF(ALT_FREQS[i])
81-
target_heap.append(([key, v['ID'], v['REF']], [IS_MA_TARGET, MAF,F_MISS_DOSAGE]))
78+
MAF = aaf2maf(ALT_FREQS[i])
79+
target_heap.append(([key, v['ID'], v['REF']], [IS_MA_TARGET, MAF, F_MISS_DOSAGE]))
8280

8381
logger.info("Sorting TARGET variants (heapify)")
8482
heapq.heapify(target_heap)
@@ -105,7 +103,8 @@ def run_intersect():
105103
PCA_ELIGIBLE = ((vmatch['IS_MA_REF'] == 'False') and (vmatch['IS_MA_TARGET'] == 'False')) and \
106104
(((vmatch['IS_INDEL'] == 'False') and (vmatch['STRANDAMB'] == 'False')) or ((vmatch['IS_INDEL'] == 'True') and (vmatch['SAME_REF'] == 'True')))
107105

108-
PCA_ELIGIBLE = PCA_ELIGIBLE and (float(vmatch['MAF']) > args.maf_filter) and (float(vmatch['F_MISS_DOSAGE']) < args.maf_filter)
106+
PCA_ELIGIBLE = PCA_ELIGIBLE and (float(vmatch['MAF']) > args.maf_filter) and \
107+
(float(vmatch['F_MISS_DOSAGE']) < args.vmiss_filter)
109108
vmatch['PCA_ELIGIBLE'] = PCA_ELIGIBLE
110109
if PCA_ELIGIBLE is True:
111110
n_PCA_ELIGIBLE += 1
@@ -114,7 +113,8 @@ def run_intersect():
114113
writer = csv.DictWriter(csvfile, fieldnames=vmatch.keys(), delimiter='\t')
115114
writer.writeheader()
116115
writer.writerow(vmatch)
117-
logger.info("{}/{} ({:.2f} variants are eligible for PCA".format(n_PCA_ELIGIBLE, n_matched, 100*n_PCA_ELIGIBLE/n_matched))
116+
logger.info("{}/{} ({:.2f}%) variants are eligible for PCA".format(n_PCA_ELIGIBLE, n_matched,
117+
100*n_PCA_ELIGIBLE/n_matched))
118118

119119
# Output counts
120120
logger.info("Outputting variant counts -> intersect_counts_$.txt")
@@ -123,9 +123,15 @@ def run_intersect():
123123

124124

125125
def read_var_general(path, chrom=None):
126+
"""
127+
General function for reading variant files from plink2 outputs
128+
:param path: path to variant file
129+
:param chrom: filter to specific chromosome
130+
:return: generator yielding each row of the file as a dict (column name -> value)
131+
"""
126132
with xopen(path, "rt") as f:
127133
# ToDo: check if this is memory inefficient
128-
reader = csv.DictReader(filter(lambda row: row[:2]!='##', f), delimiter="\t") # need to remove comments of VCF-like characters, might be fully in memory though
134+
reader = csv.DictReader(filter(lambda r: r[:2] != '##', f), delimiter="\t") # need to remove comments of VCF-like characters, might be fully in memory though
129135
if (chrom is None) or (chrom == 'ALL'):
130136
for row in reader:
131137
yield row
@@ -135,9 +141,9 @@ def read_var_general(path, chrom=None):
135141
yield row
136142

137143

138-
def sorted_join_variants(reffile, targetfile):
139-
f1_iter = read_var_general(reffile)
140-
f2_iter = read_var_general(targetfile)
144+
def sorted_join_variants(path_ref, path_target):
145+
f1_iter = read_var_general(path_ref)
146+
f2_iter = read_var_general(path_target)
141147

142148
prev_key1 = None # Initialize previous key for file 1
143149
prev_key2 = None # Initialize previous key for file 2
@@ -172,24 +178,26 @@ def sorted_join_variants(reffile, targetfile):
172178

173179

174180
def allele_complement(s):
175-
'''
181+
"""
176182
Complement alleles
177183
:param s: allele to be complemented
178184
:return: complement
179-
'''
185+
"""
180186
return s.replace("A", "V").replace("T", "X").replace("C", "Y").replace("G", "Z").replace("V", "T").replace("X", "A").replace("Y", "G").replace("Z", "C")
181187

182-
def AAF2MAF(aaf):
183-
'''
188+
189+
def aaf2maf(aaf):
190+
"""
184191
Convert alternative allele frequency (AAF) to minor allele frequency (MAF)
185192
:param aaf: alternative allele frequency
186193
:return: minor allele frequency (MAF)
187-
'''
194+
"""
188195
if aaf > 0.5:
189196
return 1-aaf
190197
else:
191198
return aaf
192199

200+
193201
def parse_args(args=None):
194202
parser = argparse.ArgumentParser(
195203
description=_description_text(),
@@ -227,7 +235,7 @@ def parse_args(args=None):
227235
)
228236
parser.add_argument(
229237
"--geno_miss",
230-
dest="maf_filter",
238+
dest="vmiss_filter",
231239
default=0.1,
232240
required=False,
233241
help="Filter: Maximum Genotype missingness for PCA eligibility",

0 commit comments

Comments
 (0)