Skip to content

Commit 47ad044

Browse files
authored
Merge pull request #44 from Han-Cao/faster_repmask_vcf
Speedup repmask_vcf.sh
2 parents bd2f1de + af729bc commit 47ad044

File tree

1 file changed

+31
-16
lines changed

1 file changed

+31
-16
lines changed

bin/repmask_vcf.sh

+31-16
Original file line numberDiff line numberDiff line change
@@ -71,13 +71,17 @@ echo "compute repeat proportion for each SVs..."
7171
samtools faidx indels.fa
7272
awk '{print $1"\t"$2}' indels.fa.fai > indels.length
7373
awk 'NR > 3 {print $5"\t"$6-1"\t"$7"\t"$10}' ${REPMASK_ONECODE_OUT} | bedtools merge > merge.bed # add -1 to start to meet .bed format
74-
RMQUERIES=$(awk 'NR > 3 {print $5}' ${REPMASK_ONECODE_OUT} | sort | uniq)
7574
rm -rf span &> /dev/null # clean in case there is a "span" file already
7675
rm -rf ${ANNOT_FILE}.gz &> /dev/null # clean in case there was a ${ANNOT_FILE}.gz file already
77-
for i in ${RMQUERIES}
78-
do
79-
paste -d "\t" <(echo -e "${i}") <(grep -w "${i}" merge.bed | awk '{print ($3-$2)}' | paste -sd+ | bc) <(grep -w "${i}" indels.length) | awk '{print $1"\t"$2"\t"$4"\t"($2/$4)}' >> span
80-
done
76+
# join allele length with TE span
77+
# input 1: sum of TE length group by SV ID
78+
# input 2: SV length
79+
# output: id, sum of TE length, SV length, TE span; sorted by SV ID
80+
join -11 -21 \
81+
<(awk '{sum[$1] += $3-$2} END {for (i in sum) print i"\t"sum[i]}' merge.bed | sort -k1,1) \
82+
<(sort -k1,1 indels.length) | \
83+
awk '{print $1"\t"$2"\t"$3"\t"($2/$3)}' > span
84+
8185
# merge with ${ANNOT_FILE}_1
8286
join -13 -21 -a1 <(sort -k3,3 ${ANNOT_FILE}_1) <(sort -k1,1 span) | sed 's/ /\t/g' | \
8387
awk '{print $2"\t"$3"\t"$1"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12"\t"$13"\t"$15}' | \
@@ -89,22 +93,33 @@ if [[ ${MAM} == "MAM" ]]
8993
then
9094
echo "Mammalian filters ON. Filtering..."
9195
# FILTER 1: L1 Twin Priming and similar
92-
TwP=$(awk '{if ($6 == 2 && $11 == "C,+" && $10~/LINE/) {names = split($10,n,",",seps); ids = split($12, i,",",seps); if (names n[1] == names n[2] && ids i[1] == ids i[2]) {print $0"\tTwP"}}}' vcf_annotation | awk '{print $3}')
9396
rm TwP.txt &> /dev/null
94-
for i in $TwP
95-
do
96-
grep -w $(grep -w "${i}" <(grep -v "#" genotypes.vcf | awk '{print $1"_"$2"\t"$3}') | cut -f 2) repeatmasker_dir/indels.fa.onecode.out | \
97-
awk 'getline second {line=split(second, a, "\t", sep); print $12"\t"$13"\t"$14"\t"a[12]"\t"a[13]"\t"a[14]}' | \
98-
awk -v coord=${i} '{if ($2<$5) {print coord"\t5P_INV:plus"} else {print coord"\t5P_INV:minus"}}' >> TwP.txt
99-
done
97+
# awk 1: find SVs IDs only matched by TwP, sort by SV ID
98+
# join: join SV ID with its matched TEs from the OneCode output (sort by SV ID and reverse sort by strand to keep the order of "C,+")
99+
# awk 2: write SV ID and coordiante of C L1 (odd line) and + L1 (even line) to one line
100+
# awk 3: compare coordinate and annotate 5P_INV
101+
awk '{
102+
if ($6 == 2 && $11 == "C,+" && $10 ~ /LINE/) {
103+
names = split($10, n, ",", seps);
104+
ids = split($12, i, ",", seps);
105+
106+
if (n[1] == n[2] && i[1] == i[2]) {
107+
print $3;
108+
}
109+
}
110+
}' vcf_annotation | sort | \
111+
join -11 -21 - <(awk '{print $5"\t"$9"\t"$12"\t"$13"\t"$14}' repeatmasker_dir/indels.fa.onecode.out | sort -k1,1 -k2,2r) | \
112+
awk 'NR % 2 == 1 {odd_line = $0; getline; print odd_line"\t"$3"\t"$4"\t"$5}' | \
113+
awk '{if ($4<$7) {print $1"\t5P_INV:plus"} else {print $1"\t5P_INV:minus"} }' > TwP.txt
100114

101115
# FILTER 2: SVA VNTR
102116
# get coordinates of each putative TE in TE (single TE hits)
103-
awk '$6 == 1 && $14 > 0.9 {print $1"_"$2"\t"$1"\t"$2"\t"($2+1)"\t"$8"\t"$10}' vcf_annotation | grep 'SVA' | sort -k1,1 > SVA_candidates
104-
join -17 -25 <(join -11 -21 SVA_candidates <(awk 'NR > 3 && !/^#/ {print $1"_"$2"\t"$3}' genotypes.vcf | \
105-
sort -k1,1) | sort -k7,7) <(sort -k5,5 repeatmasker_dir/indels.fa.onecode.out) | \
117+
# it now extracts SV ID from vcf_annotation rather than join with with VCF, these 2 should be identical according to annotate_vcf.R
118+
# compared to the original script, SVA_candidates has an additional column (7th) for SV ID
119+
awk '$6 == 1 && $14 > 0.9 && $10 ~ /SVA/ {print $1"_"$2"\t"$1"\t"($2-1)"\t"$2"\t"$8"\t"$10"\t"$3}' vcf_annotation | sort -k7,7 > SVA_candidates
120+
join -17 -25 SVA_candidates <(sort -k5,5 repeatmasker_dir/indels.fa.onecode.out) | \
106121
sed 's/ /\t/g' | \
107-
awk '{if ($15 == "+") {print $16"\t"$18"\t"$19"\t"$1} else {print $16"\t"$20"\t"$19"\t"$1}}' > SVA_candidates.bed
122+
awk '{if ($15 == "+") {print $16"\t"($18-1)"\t"$19"\t"$1} else {print $16"\t"($20-1)"\t"$19"\t"$1}}' > SVA_candidates.bed
108123
# create bed with SVAs' VNTR position according to TRF on DFAM3.6 models (2022)
109124
rm SVA_VNTR.bed &> /dev/null
110125
echo -e "SVA_A\t436\t855\t+" >> SVA_VNTR.bed

0 commit comments

Comments
 (0)