@@ -71,13 +71,17 @@ echo "compute repeat proportion for each SVs..."
71
71
samtools faidx indels.fa
72
72
awk ' {print $1"\t"$2}' indels.fa.fai > indels.length
73
73
awk ' NR > 3 {print $5"\t"$6-1"\t"$7"\t"$10}' ${REPMASK_ONECODE_OUT} | bedtools merge > merge.bed # add -1 to start to meet .bed format
74
- RMQUERIES=$( awk ' NR > 3 {print $5}' ${REPMASK_ONECODE_OUT} | sort | uniq)
75
74
rm -rf span & > /dev/null # clean in case there is a "span" file already
76
75
rm -rf ${ANNOT_FILE} .gz & > /dev/null # clean in case there was a ${ANNOT_FILE}.gz file already
77
- for i in ${RMQUERIES}
78
- do
79
- paste -d " \t" <( echo -e " ${i} " ) <( grep -w " ${i} " merge.bed | awk ' {print ($3-$2)}' | paste -sd+ | bc) <( grep -w " ${i} " indels.length) | awk ' {print $1"\t"$2"\t"$4"\t"($2/$4)}' >> span
80
- done
76
+ # join allele length with TE span
77
+ # input 1: sum of TE length group by SV ID
78
+ # input 2: SV length
79
+ # output: id, sum of TE length, SV length, TE span; sorted by SV ID
80
+ join -11 -21 \
81
+ <( awk ' {sum[$1] += $3-$2} END {for (i in sum) print i"\t"sum[i]}' merge.bed | sort -k1,1) \
82
+ <( sort -k1,1 indels.length) | \
83
+ awk ' {print $1"\t"$2"\t"$3"\t"($2/$3)}' > span
84
+
81
85
# merge with ${ANNOT_FILE}_1
82
86
join -13 -21 -a1 <( sort -k3,3 ${ANNOT_FILE} _1) <( sort -k1,1 span) | sed ' s/ /\t/g' | \
83
87
awk ' {print $2"\t"$3"\t"$1"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12"\t"$13"\t"$15}' | \
@@ -89,22 +93,33 @@ if [[ ${MAM} == "MAM" ]]
89
93
then
90
94
echo " Mammalian filters ON. Filtering..."
91
95
# FILTER 1: L1 Twin Priming and similar
92
- TwP=$( awk ' {if ($6 == 2 && $11 == "C,+" && $10~/LINE/) {names = split($10,n,",",seps); ids = split($12, i,",",seps); if (names n[1] == names n[2] && ids i[1] == ids i[2]) {print $0"\tTwP"}}}' vcf_annotation | awk ' {print $3}' )
93
96
rm TwP.txt & > /dev/null
94
- for i in $TwP
95
- do
96
- grep -w $( grep -w " ${i} " <( grep -v " #" genotypes.vcf | awk ' {print $1"_"$2"\t"$3}' ) | cut -f 2) repeatmasker_dir/indels.fa.onecode.out | \
97
- awk ' getline second {line=split(second, a, "\t", sep); print $12"\t"$13"\t"$14"\t"a[12]"\t"a[13]"\t"a[14]}' | \
98
- awk -v coord=${i} ' {if ($2<$5) {print coord"\t5P_INV:plus"} else {print coord"\t5P_INV:minus"}}' >> TwP.txt
99
- done
97
+ # awk 1: find SVs IDs only matched by TwP, sort by SV ID
98
+ # join: join SV ID with its matched TEs from the OneCode output (sort by SV ID and reverse sort by strand to keep the order of "C,+")
99
+ # awk 2: write SV ID and coordiante of C L1 (odd line) and + L1 (even line) to one line
100
+ # awk 3: compare coordinate and annotate 5P_INV
101
+ awk ' {
102
+ if ($6 == 2 && $11 == "C,+" && $10 ~ /LINE/) {
103
+ names = split($10, n, ",", seps);
104
+ ids = split($12, i, ",", seps);
105
+
106
+ if (n[1] == n[2] && i[1] == i[2]) {
107
+ print $3;
108
+ }
109
+ }
110
+ }' vcf_annotation | sort | \
111
+ join -11 -21 - <( awk ' {print $5"\t"$9"\t"$12"\t"$13"\t"$14}' repeatmasker_dir/indels.fa.onecode.out | sort -k1,1 -k2,2r) | \
112
+ awk ' NR % 2 == 1 {odd_line = $0; getline; print odd_line"\t"$3"\t"$4"\t"$5}' | \
113
+ awk ' {if ($4<$7) {print $1"\t5P_INV:plus"} else {print $1"\t5P_INV:minus"} }' > TwP.txt
100
114
101
115
# FILTER 2: SVA VNTR
102
116
# get coordinates of each putative TE in TE (single TE hits)
103
- awk ' $6 == 1 && $14 > 0.9 {print $1"_"$2"\t"$1"\t"$2"\t"($2+1)"\t"$8"\t"$10}' vcf_annotation | grep ' SVA' | sort -k1,1 > SVA_candidates
104
- join -17 -25 <( join -11 -21 SVA_candidates <( awk ' NR > 3 && !/^#/ {print $1"_"$2"\t"$3}' genotypes.vcf | \
105
- sort -k1,1) | sort -k7,7) <( sort -k5,5 repeatmasker_dir/indels.fa.onecode.out) | \
117
+ # it now extracts SV ID from vcf_annotation rather than join with with VCF, these 2 should be identical according to annotate_vcf.R
118
+ # compared to the original script, SVA_candidates has an additional column (7th) for SV ID
119
+ awk ' $6 == 1 && $14 > 0.9 && $10 ~ /SVA/ {print $1"_"$2"\t"$1"\t"($2-1)"\t"$2"\t"$8"\t"$10"\t"$3}' vcf_annotation | sort -k7,7 > SVA_candidates
120
+ join -17 -25 SVA_candidates <( sort -k5,5 repeatmasker_dir/indels.fa.onecode.out) | \
106
121
sed ' s/ /\t/g' | \
107
- awk ' {if ($15 == "+") {print $16"\t"$18"\t"$19"\t"$1} else {print $16"\t"$20"\t"$19"\t"$1}}' > SVA_candidates.bed
122
+ awk ' {if ($15 == "+") {print $16"\t"( $18-1) "\t"$19"\t"$1} else {print $16"\t"( $20-1) "\t"$19"\t"$1}}' > SVA_candidates.bed
108
123
# create bed with SVAs' VNTR position according to TRF on DFAM3.6 models (2022)
109
124
rm SVA_VNTR.bed & > /dev/null
110
125
echo -e " SVA_A\t436\t855\t+" >> SVA_VNTR.bed
0 commit comments