Skip to content

Commit 6559faa

Browse files
committed
add scripts and update readme
1 parent 2cae05f commit 6559faa

File tree

7 files changed

+448
-2
lines changed

7 files changed

+448
-2
lines changed

README.md

+23-2
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,23 @@
1-
# desiree-genome
2-
Repository containing scripts and supplementary information on potato cv. Desiree genome assembly
1+
# Désirée potato genome
2+
3+
By using PacBio HiFi and Hi-C data only we were able to generate phased chromosome-level assembly of a tetraploid potato cultivar. This repository contains scripts and supplementary information on the assembly and gene annotation process.
4+
5+
## Download
6+
7+
Genome assembly and annotation files are available at [Zenodo](https://doi.org/10.5281/zenodo.14609304) and [desiree.nib.si](https://desiree.nib.si)
8+
9+
## Pipeline overview
10+
11+
![alt text](figures\pipeline.png "Pipeline overview")
12+
13+
Commands used to generate the assembly and annotation are located in [`scripts`](https://github.com/NIB-SI/desiree-genome/scripts).
14+
15+
1. `assembly.txt` - assembly of initial phased sets of contigs with *hifiasm*
16+
2. `scaffolding.txt` - scaffolding to chromosomes with *YaHs*
17+
3. `annotation.txt` - gene annotation pipeline
18+
19+
20+
## Citation
21+
If you are using this genome in your research, please cite:
22+
23+
-- add biorxiv CITATION

figures/pipeline.png

320 KB
Loading

scripts/annotation.txt

+196
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
2+
# Predict repeats [EDTA]
3+
4+
EDTA.pl \
5+
--genome $asm \
6+
--cds $cds \
7+
--sensitive 1 \
8+
--anno 1 \
9+
--evaluate 1 \
10+
--threads $threads
11+
12+
# Transfer gene models from reference [liftoff]
13+
14+
## csv input contains reference genome fasta; gff name; reference gff
15+
while IFS=, read ref gff_name ref_gff extra_params; do
16+
ref_name=$(basename -s .fa $ref)
17+
extra_params_expanded=$(eval "echo ${extra_params}")
18+
liftoff \
19+
-p $threads \
20+
$extra_params_expanded \
21+
-chroms $in_dir/$ref_name.chroms.txt \
22+
$asm \
23+
$ref \
24+
-g $ref_gff \
25+
-o $out_dir/$asm_name."$gff_name"_liftoff.gff3 \
26+
-u $out_dir/$asm_name."$gff_name"_liftoff_unmapped.txt \
27+
-dir $out_dir/"$gff_name"_intermediate_files
28+
# rename polished gff
29+
mv $out_dir/$asm_name."$gff_name"_liftoff.gff3_polished \
30+
$out_dir/$asm_name."$gff_name"_liftoff_polished.gff3
31+
done < $csv
32+
33+
# Map short-read transcriptomes [STAR]
34+
35+
for set in $(basename -a $reads_dir/*.fq.gz | cut -d_ -f1 | uniq)
36+
do
37+
mkdir $out_dir/$set
38+
STAR \
39+
--runMode alignReads \
40+
--runThreadN $threads \
41+
--readFilesIn $reads_dir/"$set"_1.fq.gz $reads_dir/"$set"_2.fq.gz \
42+
--genomeDir $asm_dir \
43+
--readFilesCommand zcat \
44+
--outFileNamePrefix $out_dir/$set/ \
45+
--outSAMstrandField intronMotif \
46+
--outSAMattributes All \
47+
--outSAMattrIHstart 0 \
48+
--outSAMtype BAM SortedByCoordinate \
49+
--limitBAMsortRAM 100000000000
50+
done
51+
52+
# Assembly transcripts from shor-read mappings
53+
54+
for set in $illumina_mappings/*/
55+
do
56+
set_name=$(basename $set)
57+
stringtie \
58+
$set/Aligned.sortedByCoord.out.bam \
59+
-o $out_dir/$set_name.gtf \
60+
-p $threads\
61+
--rf \
62+
-l STRG-$set_name
63+
done
64+
65+
# Map iso-seq reads [minimap2]
66+
67+
for set_dir in $isoseq_data/*/
68+
do
69+
set_name=$(basename $set_dir)
70+
mkdir $out_base/$asm_name/$set_name
71+
for reads in $set_dir/*.fq.gz
72+
do
73+
reads_name=$(basename -s .fq.gz $reads)
74+
minimap2 \
75+
-ax splice:hq -uf \
76+
-G 5k \
77+
--junc-bed $junc \
78+
-t $threads \
79+
$asm \
80+
$reads | \
81+
samtools sort -@ $threads \
82+
-o $out_base/$asm_name/$set_name/$reads_name.bam
83+
done
84+
# merge samples by set
85+
samtools merge \
86+
$out_base/$asm_name/$set_name.merged.bam \
87+
$out_base/$asm_name/$set_name/*.bam
88+
# index merged set bams
89+
samtools index $out_base/$asm_name/$set_name.merged.bam
90+
done
91+
conda deactivate
92+
93+
# Collapse redundant iso-seq transcripts [tama-collapse]
94+
95+
for bam in $isoseq_mappings/*.merged.bam
96+
do
97+
bam_name=$(basename -s .merged.bam $bam)
98+
python tama_collapse.py \
99+
-b BAM -s $bam \
100+
-f $asm \
101+
-p $out_dir/$bam_name \
102+
-x no_cap
103+
done
104+
conda deactivate
105+
106+
# Validate junctions and filter mappings [portcullis]
107+
108+
portcullis full \
109+
-t $threads \
110+
$asm \
111+
--output $out_dir \
112+
--bam_filter \
113+
$illumina_mappings/*/Aligned.sortedByCoord.out.bam
114+
115+
# Predict transcripts with braker [BRAKER3]
116+
117+
braker.pl \
118+
--genome=$masked_asm \
119+
--prot_seq=$prot \
120+
--bam=$bams \
121+
--workingdir=$out_dir \
122+
--threads $threads \
123+
--busco_lineage solanales_odb10 &> \
124+
$out_dir/braker_run.log
125+
126+
# Choose best trabscripts [mikado]
127+
128+
## 1 CONFIGURE
129+
mikado configure \
130+
--list $out_dir/inputs.txt \
131+
--reference $asm \
132+
--mode permissive \
133+
--check-references \
134+
--scoring plant.yaml \
135+
--copy-scoring $out_dir/plant.yaml \
136+
--threads $threads \
137+
--out-dir $out_dir \
138+
--junctions $junc \
139+
--blast_targets $prot \
140+
$out_dir/configuration.yaml
141+
142+
## 2 PREPARE
143+
mikado prepare \
144+
--procs $threads \
145+
--json-conf $out_dir/configuration.yaml
146+
147+
## 3 BLAST
148+
makeblastdb \
149+
-in $out_dir/blast/$prot_name.fa \
150+
-dbtype prot -parse_seqids > \
151+
$out_dir/blast/"$prot_name"_prepare.log
152+
153+
blastx -max_target_seqs 5 \
154+
-outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore ppos btop" \
155+
-num_threads $threads \
156+
-query $out_dir/mikado_prepared.fasta \
157+
-db $out_dir/blast/$prot_name.fa \
158+
-out $out_dir/mikado_prepared.blast.tsv
159+
160+
## 4 ORF predictions [transdecoder]
161+
TransDecoder.LongOrfs \
162+
-t $out_dir/mikado_prepared.fasta \
163+
--output_dir $out_dir
164+
165+
TransDecoder.Predict \
166+
-t $out_dir/mikado_prepared.fasta \
167+
--output_dir $out_dir
168+
169+
## 5 SERIALZE
170+
mikado serialise \
171+
--procs $threads \
172+
--json-conf $out_dir/configuration.yaml \
173+
--orfs $out_dir/mikado_prepared.fasta.transdecoder.bed \
174+
--tsv $out_dir/mikado_prepared.blast.tsv \
175+
--blast_targets $out_dir/blast/uniprot_plants_simple_prefix.fa
176+
177+
## 6 PICK
178+
mikado pick \
179+
--procs $threads \
180+
--configuration $out_dir/configuration_$version.yaml \
181+
--subloci-out $out_dir/mikado.subloci.gff3
182+
183+
# Functional annotation [emapper]
184+
185+
emapper.py \
186+
-m diamond \
187+
--data_dir $data_dir \
188+
-i $prot \
189+
--dbmem \
190+
--decorate_gff $in_gff \
191+
--decorate_gff_ID_field ID \
192+
--cpu $threads \
193+
-o $asm_name.$version \
194+
--output_dir $out_dir \
195+
--tax_scope 33090 > $out_dir/run.log 2>&1
196+

scripts/assembly.txt

+108
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
2+
# Genomscope using illumina reads
3+
4+
## count
5+
jellyfish count |
6+
-C -m 21 -s 3000000000 -t 18 \
7+
$illumina_1 $illumina_2 \
8+
-o $wd/illumina.jf
9+
10+
## export histogram
11+
jellyfish histo -t 18 \
12+
$wd/illumina.jf > \
13+
$wd/illumina.jf.histo
14+
15+
# run genomescope
16+
genomescope2 -k 21 -p 4 \
17+
-i $wd/illumina.jf.histo \
18+
-o $wd
19+
20+
21+
# Create initial phased contigs [hifiasm]
22+
hifiasm \
23+
-t $threads \
24+
-o $out_dir/desiree_$asm_version.asm \
25+
$hifi \
26+
--h1 $hic_1 \
27+
--h2 $hic_2 \
28+
--hom-cov 95 \
29+
--n-hap 4 \
30+
-s 0.45 \
31+
--n-perturb 20000 \
32+
--f-perturb 0.2 \
33+
--n-weight 6 2> \
34+
$out_dir/desiree_$asm_version.log
35+
36+
# Convert gfas to fastas
37+
for file in $out_dir/*.gfa; do
38+
if [[ "$file" != *noseq.gfa ]]; then
39+
file_name=$(basename "$file" .gfa)
40+
gfatools gfa2fa $file > $out_dir/$file_name.fa
41+
fi
42+
done
43+
44+
# Merqury quality control
45+
46+
for file in $out_dir/*.fa
47+
do
48+
assembly1=$file
49+
assembly2=""
50+
out_name=$(basename "$file" .fa | sed "s/\./_/g")
51+
maxcov=200
52+
maxcount=15000000
53+
threads=20
54+
OMP_NUM_THREADS=$threads
55+
export OMP_NUM_THREADS
56+
57+
mkdir $wd
58+
cd $wd
59+
60+
merqury.sh $meryl_db \
61+
$assembly1 \
62+
$assembly2 \
63+
merqury > \
64+
merqury_$out_name.log
65+
66+
for i in $(basename -s .qv "$out_dir"/*.qv)
67+
do
68+
Rscript plot_spectra_cn.R \
69+
-f "$i".spectra-cn.hist \
70+
-o "$i".spectra-cn \
71+
-z "$i".only.hist \
72+
-m $maxcov \
73+
-n $maxcount
74+
done
75+
76+
Rscript plot_spectra_cn.R \
77+
-f merqury.spectra-asm.hist \
78+
-o merqury.spectra-asm \
79+
-z merqury.dist_only.hist \
80+
-m $maxcov \
81+
-n $maxcount
82+
83+
done
84+
85+
# Removal of contaminants contigs [blast]
86+
87+
# run blast against organellar sequences
88+
for db_name in rDNA_solanum_2023-07-17 chloroplasts_solanum_2.1_2023-07-17 chloroplasts_solanum_1.1_2023-07-17 mitochondira_solanum_1.1_2023-07-17
89+
do
90+
blastn -task megablast \
91+
-outfmt "6 qseqid sseqid length qlen" -num_threads $threads \
92+
-query $input_asm \
93+
-db $blast_dbs/$db_name \
94+
-out $blast_outdir/$input_asm_name/"$db_name"_results.txt
95+
done
96+
97+
## mask assembly for blast against bacterial db
98+
windowmasker \
99+
-mk_counts -sformat obinary -genome_size 750000000 -mem 500000 \
100+
-in $input_asm -out $masked_db
101+
102+
## run blast against bacterial sequences
103+
blastn -task megablast \
104+
-outfmt "6 qseqid sseqid length qlen" -num_threads $threads \
105+
-window_masker_db $masked_db \
106+
-query $input_asm \
107+
-db $blast_dbs/$db_name \
108+
-out $blast_outdir/$input_asm_name/"$db_name"_results.txt

scripts/orthofinder.txt

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Prepare protein files:
2+
## csv file contains assembly and associated genes in gff
3+
while IFS=, read asm in_gff; do
4+
gff_name=$(basename -s .gff $in_gff)
5+
conda activate gfftools
6+
gffread -S -y \
7+
$out_dir/$gff_name.faa.tmp \
8+
-g $asm \
9+
$in_gff
10+
seqkit replace -p "\s.+" \
11+
$out_dir/$gff_name.faa.tmp > \
12+
$out_dir/$gff_name.faa
13+
rm $out_dir/$gff_name.faa.tmp
14+
conda deactivate
15+
done < $csv
16+
17+
# Run orthofinder
18+
conda activate orthofinder
19+
orthofinder -t $threads \
20+
-f $out_dir
21+
conda deactivate

0 commit comments

Comments
 (0)