NIB-SI
diff --git a/‎README.md
+23-2 b/‎README.md
+23-2
diff --git a/‎figures/pipeline.png
320 KB b/‎figures/pipeline.png
320 KB
diff --git a/‎scripts/annotation.txt
+196 b/‎scripts/annotation.txt
+196
diff --git a/‎scripts/assembly.txt
+108 b/‎scripts/assembly.txt
+108
diff --git a/‎scripts/orthofinder.txt
+21 b/‎scripts/orthofinder.txt
+21
@@ -1,2 +1,23 @@
-# desiree-genome
-Repository containing scripts and supplementary information on potato cv. Desiree genome assembly
+# Désirée potato genome
+
+Using only PacBio HiFi and Hi-C data, we were able to generate a phased, chromosome-level assembly of a tetraploid potato cultivar. This repository contains scripts and supplementary information on the assembly and gene annotation process.
+
+## Download
+
+ Genome assembly and annotation files are available at [Zenodo](https://doi.org/10.5281/zenodo.14609304) and [desiree.nib.si](https://desiree.nib.si)
+
+## Pipeline overview
+
+ ![Pipeline overview](figures/pipeline.png "Pipeline overview")
+
+ Commands used to generate the assembly and annotation are located in [`scripts`](scripts/).
+
+1. `assembly.txt` - assembly of initial phased sets of contigs with *hifiasm*
+2. `scaffolding.txt` - scaffolding to chromosomes with *YaHS*
+3. `annotation.txt` - gene annotation pipeline
+
+
+## Citation
+If you are using this genome in your research, please cite:
+
+-- add biorxiv CITATION
@@ -0,0 +1,196 @@
+
+# Predict repeats [EDTA]
+
+EDTA.pl \
+--genome $asm \
+--cds $cds \
+--sensitive 1 \
+--anno 1 \
+--evaluate 1 \
+--threads $threads
+
+# Transfer gene models from reference [liftoff]
+
+## csv input contains: reference genome fasta, gff name, reference gff, extra liftoff parameters (may be empty)
+while IFS=, read ref gff_name ref_gff extra_params; do
+ref_name=$(basename -s .fa $ref)
+extra_params_expanded=$(eval "echo ${extra_params}")
+liftoff \
+-p $threads \
+$extra_params_expanded \
+-chroms $in_dir/$ref_name.chroms.txt \
+$asm \
+$ref \
+-g $ref_gff \
+-o $out_dir/$asm_name."$gff_name"_liftoff.gff3 \
+-u $out_dir/$asm_name."$gff_name"_liftoff_unmapped.txt \
+-dir $out_dir/"$gff_name"_intermediate_files
+# rename polished gff
+mv $out_dir/$asm_name."$gff_name"_liftoff.gff3_polished \
+$out_dir/$asm_name."$gff_name"_liftoff_polished.gff3
+done < $csv
+
+# Map short-read transcriptomes [STAR]
+
+for set in $(basename -a $reads_dir/*.fq.gz | cut -d_ -f1 | uniq)
+do
+mkdir $out_dir/$set
+STAR \
+--runMode alignReads \
+--runThreadN $threads \
+--readFilesIn $reads_dir/"$set"_1.fq.gz $reads_dir/"$set"_2.fq.gz \
+--genomeDir $asm_dir \
+--readFilesCommand zcat \
+--outFileNamePrefix $out_dir/$set/ \
+--outSAMstrandField intronMotif \
+--outSAMattributes All \
+--outSAMattrIHstart 0 \
+--outSAMtype BAM SortedByCoordinate \
+--limitBAMsortRAM 100000000000
+done
+
+# Assemble transcripts from short-read mappings [stringtie]
+
+for set in $illumina_mappings/*/
+do
+  set_name=$(basename $set)
+  stringtie \
+  $set/Aligned.sortedByCoord.out.bam \
+  -o $out_dir/$set_name.gtf \
+  -p $threads\
+  --rf \
+  -l STRG-$set_name
+done
+
+# Map iso-seq reads [minimap2]
+
+for set_dir in $isoseq_data/*/
+do
+ set_name=$(basename $set_dir)
+ mkdir $out_base/$asm_name/$set_name
+ for reads in $set_dir/*.fq.gz
+  do 
+   reads_name=$(basename -s .fq.gz $reads)
+   minimap2 \
+   -ax splice:hq -uf \
+   -G 5k \
+   --junc-bed $junc \
+   -t $threads \
+   $asm \
+   $reads | \
+   samtools sort -@ $threads \
+   -o $out_base/$asm_name/$set_name/$reads_name.bam
+ done
+ # merge samples by set
+ samtools merge \
+ $out_base/$asm_name/$set_name.merged.bam \
+ $out_base/$asm_name/$set_name/*.bam
+ # index merged set bams
+ samtools index $out_base/$asm_name/$set_name.merged.bam
+done
+conda deactivate
+
+# Collapse redundant iso-seq transcripts [tama-collapse]
+
+ for bam in $isoseq_mappings/*.merged.bam
+  do 
+   bam_name=$(basename -s .merged.bam $bam)
+   python tama_collapse.py \
+   -b BAM -s $bam \
+   -f $asm \
+   -p $out_dir/$bam_name \
+   -x no_cap
+ done
+conda deactivate
+
+# Validate junctions and filter mappings [portcullis]
+
+portcullis full \
+-t $threads \
+$asm \
+--output $out_dir \
+--bam_filter \
+$illumina_mappings/*/Aligned.sortedByCoord.out.bam
+
+# Predict transcripts with braker [BRAKER3]
+
+braker.pl \
+--genome=$masked_asm \
+--prot_seq=$prot \
+--bam=$bams \
+--workingdir=$out_dir \
+--threads $threads \
+--busco_lineage solanales_odb10 &> \
+$out_dir/braker_run.log
+
+# Choose best transcripts [mikado]
+
+## 1 CONFIGURE
+mikado configure \
+--list $out_dir/inputs.txt \
+--reference $asm \
+--mode permissive \
+--check-references \
+--scoring plant.yaml \
+--copy-scoring $out_dir/plant.yaml \
+--threads $threads \
+--out-dir $out_dir \
+--junctions $junc \
+--blast_targets $prot \
+$out_dir/configuration.yaml
+
+## 2 PREPARE
+mikado prepare \
+--procs $threads \
+--json-conf $out_dir/configuration.yaml
+
+## 3 BLAST
+makeblastdb \
+-in $out_dir/blast/$prot_name.fa \
+-dbtype prot -parse_seqids > \
+$out_dir/blast/"$prot_name"_prepare.log
+
+blastx -max_target_seqs 5 \
+-outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore ppos btop" \
+-num_threads $threads \
+-query $out_dir/mikado_prepared.fasta \
+-db $out_dir/blast/$prot_name.fa \
+-out $out_dir/mikado_prepared.blast.tsv
+
+## 4 ORF predictions [transdecoder]
+TransDecoder.LongOrfs \
+-t $out_dir/mikado_prepared.fasta \
+--output_dir $out_dir
+
+TransDecoder.Predict \
+-t $out_dir/mikado_prepared.fasta \
+--output_dir $out_dir
+
+## 5 SERIALISE
+mikado serialise \
+--procs $threads \
+--json-conf $out_dir/configuration.yaml \
+--orfs $out_dir/mikado_prepared.fasta.transdecoder.bed \
+--tsv $out_dir/mikado_prepared.blast.tsv \
+--blast_targets $out_dir/blast/uniprot_plants_simple_prefix.fa
+
+## 6 PICK
+mikado pick \
+--procs $threads \
+--configuration $out_dir/configuration_$version.yaml \
+--subloci-out $out_dir/mikado.subloci.gff3
+
+# Functional annotation [emapper]
+
+emapper.py \
+-m diamond \
+--data_dir $data_dir \
+-i $prot \
+--dbmem \
+--decorate_gff $in_gff \
+--decorate_gff_ID_field ID \
+--cpu $threads \
+-o $asm_name.$version \
+--output_dir $out_dir \
+--tax_scope 33090 > $out_dir/run.log 2>&1
+
@@ -0,0 +1,108 @@
+
+# GenomeScope using Illumina reads
+
+## count
+jellyfish count |
+-C -m 21 -s 3000000000 -t 18 \
+$illumina_1 $illumina_2 \
+-o $wd/illumina.jf
+
+## export histogram
+jellyfish histo -t 18 \
+$wd/illumina.jf > \
+$wd/illumina.jf.histo
+
+# run genomescope
+genomescope2 -k 21 -p 4 \
+-i $wd/illumina.jf.histo \
+-o $wd
+
+
+# Create initial phased contigs [hifiasm]
+hifiasm \
+-t $threads \
+-o $out_dir/desiree_$asm_version.asm \
+$hifi \
+--h1 $hic_1 \
+--h2 $hic_2 \
+--hom-cov 95 \
+--n-hap 4 \
+-s 0.45 \
+--n-perturb 20000 \
+--f-perturb 0.2 \
+--n-weight 6 2> \
+$out_dir/desiree_$asm_version.log
+
+# Convert gfas to fastas
+for file in $out_dir/*.gfa; do
+  if [[ "$file" != *noseq.gfa ]]; then
+    file_name=$(basename "$file" .gfa)
+    gfatools gfa2fa $file > $out_dir/$file_name.fa
+  fi
+done
+
+# Merqury quality control
+
+for file in $out_dir/*.fa
+do
+assembly1=$file
+assembly2=""
+out_name=$(basename "$file" .fa | sed "s/\./_/g")
+maxcov=200
+maxcount=15000000
+threads=20
+OMP_NUM_THREADS=$threads
+export OMP_NUM_THREADS
+
+mkdir $wd
+cd $wd
+
+merqury.sh $meryl_db \
+$assembly1 \
+$assembly2 \
+merqury > \
+merqury_$out_name.log
+
+for i in $(basename -s .qv "$out_dir"/*.qv)
+ do
+  Rscript plot_spectra_cn.R \
+  -f "$i".spectra-cn.hist \
+  -o "$i".spectra-cn \
+  -z "$i".only.hist \
+  -m $maxcov \
+  -n $maxcount
+ done
+
+Rscript plot_spectra_cn.R \
+-f merqury.spectra-asm.hist \
+-o merqury.spectra-asm \
+-z merqury.dist_only.hist \
+-m $maxcov \
+-n $maxcount
+
+done
+
+# Removal of contaminant contigs [blast]
+
+# run blast against organellar sequences
+for db_name in rDNA_solanum_2023-07-17 chloroplasts_solanum_2.1_2023-07-17 chloroplasts_solanum_1.1_2023-07-17 mitochondira_solanum_1.1_2023-07-17
+ do
+  blastn -task megablast \
+  -outfmt "6 qseqid sseqid length qlen" -num_threads $threads \
+  -query $input_asm \
+  -db $blast_dbs/$db_name \
+  -out $blast_outdir/$input_asm_name/"$db_name"_results.txt
+ done
+
+## mask assembly for blast against bacterial db
+windowmasker \
+-mk_counts -sformat obinary -genome_size 750000000 -mem 500000 \
+-in $input_asm -out $masked_db
+
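+## run blast against bacterial sequences
+## NOTE: $db_name still holds the last value from the organellar loop above; set it to the bacterial database name before running this step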
+blastn -task megablast \
+-outfmt "6 qseqid sseqid length qlen" -num_threads $threads \
+-window_masker_db $masked_db \
+-query $input_asm \
+-db $blast_dbs/$db_name \
+-out $blast_outdir/$input_asm_name/"$db_name"_results.txt
@@ -0,0 +1,21 @@
+# Prepare protein files:
+## csv file contains assembly and associated genes in gff
+while IFS=, read asm in_gff; do
+gff_name=$(basename -s .gff $in_gff)
+    conda activate gfftools
+    gffread -S -y \
+    $out_dir/$gff_name.faa.tmp \
+    -g $asm \
+    $in_gff
+    seqkit replace -p "\s.+" \
+    $out_dir/$gff_name.faa.tmp > \
+    $out_dir/$gff_name.faa
+    rm $out_dir/$gff_name.faa.tmp
+    conda deactivate
+done < $csv
+
+# Run orthofinder
+conda activate orthofinder
+orthofinder -t $threads \
+-f $out_dir
+conda deactivate