-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathweighted_peaks.py
106 lines (86 loc) · 3 KB
/
weighted_peaks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#input bed file with peaks and text file found using samtools depth with chr, NT position, and depth
#outputs txt file with chr, start, end, strand, Total Reads, Max Depth, Relative Depth, and reads/NT for each peak
import numpy as np
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("-p","--peaks",help="path to peak file in .bed format")
parser.add_argument("-d","--depths",help="path to depth file in .txt format")
parser.add_argument("-o","--output",help="path to file output in .txt format")
args = parser.parse_args()
peaks = open(args.peaks,"r")
depth = open(args.depths,"r")
#extract column data
peaklines = []
depthlines = []
for line in peaks:
peaklines.append(line.split("\t"))
bedchr = list(zip(*peaklines)[0])
bedchrStart = list(zip(*peaklines)[1])
bedchrEnd = list(zip(*peaklines)[2])
bedchrStrand = list(zip(*peaklines)[5])
for line in depth:
depthlines.append(line.split("\t"))
depthchr = list(zip(*depthlines)[0]) #make sure depthchr and bedchr have same names!
depthNT = list(zip(*depthlines)[1]) #position of depth
depthValue = list(zip(*depthlines)[2])
depthValue = np.array([ele.replace('\n','') for ele in depthValue])
depthValue = depthValue.astype(np.float)
#lists to store data you want
totalReads = []
maxPeakHeights = []
peakLength = []
readsperNT = []
for i in range(len(bedchr)):
indPeakD= [] #list to hold depths for nt of each individual peak
for x in range(len(depthchr)):
if (bedchr[i] == depthchr[x]) & (depthNT[x] >= bedchrStart[i]) & (depthNT[x] <= bedchrEnd[i]):
indPeakD.append(depthValue[x])
else:
continue
totalReads.append(sum(indPeakD))
maxPeakHeights.append(max(indPeakD))
peakLength.append(len(indPeakD))
readsperNT.append(sum(indPeakD)/len(indPeakD))
relativePeakHeights = np.array(maxPeakHeights)/(min(maxPeakHeights))
#write to file
output=open(args.output,"w")
output.write("chr")
output.write("\t")
output.write("Start")
output.write("\t")
output.write("End")
output.write("\t")
output.write("Strand")
output.write("\t")
output.write("Tot Reads")
output.write("\t")
output.write("Max Depth")
output.write("\t")
output.write("Relative Depth")
output.write("\t")
output.write("Length")
output.write("\t")
output.write("Reads/NT")
output.write("\n")
for i in range(len(peaklines)):
#for peaks, write columns:
#chr, Start pos, End pos, strand, total reads, maxPeakDepth, relpeakDepth, len, reads/NT
output.write(str(bedchr[i]))
output.write("\t")
output.write(str(bedchrStart[i]))
output.write("\t")
output.write(str(bedchrEnd[i]))
output.write("\t")
output.write(str(bedchrStrand[i]))
output.write("\t")
output.write(str(totalReads[i]))
output.write("\t")
output.write(str(maxPeakHeights[i]))
output.write("\t")
output.write(str(relativePeakHeights[i]))
output.write("\t")
output.write(str(peakLength[i]))
output.write("\t")
output.write(str(readsperNT[i]))
output.write("\n")
output.close()