-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalysis.sh
73 lines (48 loc) · 3.56 KB
/
analysis.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# This file contains some interactive exploratory
# analysis using uniwig.
# From within a folder with all your .bed files in it:
```
for i in *.bed; do sort -k1,1V -k3,3n $i > $i.sorted; done;
sort -k1,1V -k2,2n --merge *.sorted > all_sorted.bed
cut -f1,2 all_sorted.bed | uniq -c > starts2.txt
```
# In order for this to work, the file has to be sorted
# we do the sort in 2 steps: First sort each file,
# then use the sort --merge to combine them. This is much faster
# than merge them first, and then sorting the combined file.
# to sort by start coordinate, we first sort by chrom then start:
for i in `findname Ctcf *.bed`; do sort -k1,1V -k2,2n $i > $i.sorted; done;
sort -k1,1V -k2,2n --merge *.sorted > all_sorted.bed
# you could then use a command like this to count the frequency of each position:
cut -f1,2 all_sorted.bed | uniq -c > starts.txt
# to sort by end coordinate, we first sort by chrom then end:
for i in `findname Ctcf *.bed`; do sort -k1,1V -k3,3n $i > $i.sorted; done;
sort -k1,1V -k3,3n --merge *.sorted > all_sorted.bed
# Here's how you'd create it for the whole genome? not sure if this works.
#time ./uniwig -m 25 -e non_ctcf_combined_endsort.bed | wigToBigWig -clip stdin `refgenie seek hg38/fasta.chrom_sizes` non-ctcf-combined-ends.bw
# Once we have it sorted by start, we could just resort that by end
sort -k1,1V -k3,3n non_ctcf_combined.bed > non_ctcf_combined_endsort.bed
# Here is how to take the combined file and filter it to just chr1
grep -P "chr1\t" non_ctcf_combined.bed > non_ctcf_combined_chr1.bed
grep -P "chr1\t" non_ctcf_combined_endsort.bed > non_ctcf_combined_chr1_endsort.bed
# confirm that it worked:
wc -l non_ctcf_combined_chr1.bed
wc -l non_ctcf_combined_chr1_endsort.bed
tail non_ctcf_combined_chr1_endsort.bed
tail non_ctcf_combined_chr1.bed
# Given the sorted files, we can pass to uniwig to smooth
time ./uniwig -m 25 -e non_ctcf_combined_chr1_endsort.bed | wigToBigWig -clip stdin `refgenie seek hg38/fasta.chrom_sizes` data/non-ctcf-combined-ends-chr1.bw
time ./uniwig -m 25 non_ctcf_combined_chr1.bed | wigToBigWig -clip stdin `refgenie seek hg38/fasta.chrom_sizes` data/non-ctcf-combined-chr1.bw
# Now copy this data over to rivanna
ls data/*chr1*
scp data/*chr1* rivi:/project/shefflab/processed/bedhmm/
scp data/*coverage*chr1*.bw rivi:/project/shefflab/processed/bedhmm/
scp data/
# Here are some other related commands to play around with
echo "fixedStep chrom=chr1 start=1 step=1" > data/non-ctcf-combined-coverage-chr1.wig
bedtools genomecov -d -i non_ctcf_combined_chr1_endsort.bed -g `refgenie seek hg38/fasta.chrom_sizes` | grep "chr1" | cut -f3 >> data/non-ctcf-combined-coverage-chr1.wig
wigToBigWig -clip data/non-ctcf-combined-coverage-chr1.wig `refgenie seek hg38/fasta.chrom_sizes` data/non-ctcf-combined-coverage-chr1.bw
bedtools genomecov -d -i non_ctcf_combined_chr1_endsort.bed -g `refgenie seek hg38/fasta.chrom_sizes` | grep "chr1" | cut -f3 | sed 1i"fixedStep chrom=chr1 start=1 step=1" | head
bedtools genomecov -d -i non_ctcf_combined_chr1.bed-g `refgenie seek hg38/fasta.chrom_sizes` | grep "chr1" | cut -f3 | sed 1i"fixedStep chrom=chr1 start=1 step=1" | wigToBigWig -clip stdin `refgenie seek hg38/fasta.chrom_sizes` data/non-ctcf-combined-coverage-chr1.bw
time bedtools genomecov -d -i non_ctcf_combined_chr1.bed -g `refgenie seek hg38/fasta.chrom_sizes` | grep "chr1" | cut -f3 | sed 1i"fixedStep chrom=chr1 start=1 step=1" > data/non-ctcf-combined-coverage-chr1.wig
wigToBigWig -clip data/non-ctcf-combined-coverage-chr1.wig `refgenie seek hg38/fasta.chrom_sizes` data/non-ctcf-combined-coverage-chr1.bw