-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcalculate_res_gene_depth_qc.py
executable file
·91 lines (70 loc) · 3.34 KB
/
calculate_res_gene_depth_qc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python3
import csv
import argparse
# calculate mean depth coverage for each resistance gene
def calculate_mean_depth(gene_positions, depth):
total_depth = 0
total_positions = 0
for position in range(gene_positions[0], gene_positions[1] + 1):
if position in depth:
total_depth += depth[position]
total_positions += 1
if total_positions != 0:
mean_depth = total_depth / total_positions
else:
mean_depth = 0
return round(mean_depth, 3)
# calculate percent of gene above depth threshold
def calculate_percent_gene_covered(gene_positions, depth, depth_threshold):
covered_positions = 0
for position in range(gene_positions[0], gene_positions[1] + 1):
if position in depth and depth[position] >= depth_threshold:
covered_positions += 1
total_positions = gene_positions[1] - gene_positions[0] + 1
if total_positions != 0:
pct_cov = (covered_positions / total_positions) * 100
else:
pct_cov = 0
return round(pct_cov, 3)
def main(args):
# Read gene positions from the resistance genes bed file
gene_data = []
with open(args.bed, 'r') as bed_file:
for line in bed_file:
fields = line.strip().split()
gene_name = fields[3]
start_position = int(fields[1]) +1 # add +1 because bed file is 0 indexed and depth file is 1 indexed
end_position = int(fields[2])
gene_data.append((gene_name, (start_position, end_position)))
# Read depth data from intermediate mpileup tsv file
depth_at_position = {}
with open(args.depth, 'r') as tsv_file:
for line in tsv_file:
line = line.strip()
if not line:
continue # Skip empty lines
chrom, pos, ref, depth = line.split('\t')
#skipping header line
try:
position = int(pos)
depth = float(depth)
except ValueError:
continue
depth_at_position[position] = depth
# Calculate qc metrics and write results to csv
output_file = args.output
with open(output_file, 'w', newline='') as csv_file:
writer = csv.writer(csv_file)
writer.writerow(['gene_name', 'gene_position_start', 'gene_position_end', 'mean_depth_coverage', 'percent_of_gene_covered_above_depth_threshold'])
for gene_name, gene_positions in gene_data:
mean_depth = calculate_mean_depth(gene_positions, depth_at_position)
percent_covered = calculate_percent_gene_covered(gene_positions, depth_at_position, args.threshold)
writer.writerow([gene_name, gene_positions[0], gene_positions[1], mean_depth, percent_covered])
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Calculate resistance gene coverage qc metrics.')
parser.add_argument('--bed', required=True, help='Path for resistance genes bed file.')
parser.add_argument('--depth', required=True, help='Path for tsv file with depth for each position (intermediate mpileup output).')
parser.add_argument('--threshold', type=float, default=10, help='min_depth threshold (default: 10).')
parser.add_argument('--output', help='Output CSV file name.')
args = parser.parse_args()
main(args)