Commit c90297d

fixing quite a few typos
1 parent 0c6c561 commit c90297d

21 files changed: +140 −125 lines changed

README.md (+1 −1)

@@ -59,7 +59,7 @@
 10. [Optional] Remove clusters with low read coverage. `bin/extract_clusters.py`
 11. Scaffolding of contigs to centroid ([`Minimap2`](https://github.com/lh3/minimap2), [`iVar-consensus`](https://andersen-lab.github.io/ivar/html/manualpage.html))
 12. [Optional] Annotate 0-depth regions with external reference `bin/lowcov_to_reference.py`.
-13. [Optional] Select best reference from `--mapping_constrains`:
+13. [Optional] Select best reference from `--mapping_constraints`:
     - [`Mash sketch`](https://github.com/marbl/Mash)
     - [`Mash screen`](https://github.com/marbl/Mash)
 14. Mapping filtered reads to supercontig and mapping constrains([`BowTie2`](http://bowtie-bio.sourceforge.net/bowtie2/),[`BWAmem2`](https://github.com/bwa-mem2/bwa-mem2) and [`BWA`](https://github.com/lh3/bwa))

assets/schemas/mapping_constrains.json (+3 −3)

@@ -1,8 +1,8 @@
 {
     "$schema": "https://json-schema.org/draft/2020-12/schema",
-    "$id": "https://raw.githubusercontent.com/Joon-Klaps/viralgenie/dev/assets/schemas/mapping_constrains.json",
-    "title": "Joon-Klaps/viralgenie pipeline - params.mapping_constrains schema",
-    "description": "Schema for the file provided with params.mapping_constrains",
+    "$id": "https://raw.githubusercontent.com/Joon-Klaps/viralgenie/dev/assets/schemas/mapping_constraints.json",
+    "title": "Joon-Klaps/viralgenie pipeline - params.mapping_constraints schema",
+    "description": "Schema for the file provided with params.mapping_constraints",
     "type": "array",
     "items": {
         "type": "object",

bin/custom_multiqc.py (+1 −1)

@@ -101,7 +101,7 @@ def file_choices(choices, fname):
     )

     parser.add_argument(
-        "--mapping_constrains",
+        "--mapping_constraints",
         metavar="MAPPING CONSTRAINS",
         help="Mapping constrains file containing information on the sequences that need to be used for mapping against the samples, supported formats: '.csv', '.tsv', '.yaml', '.yml'",
         type=lambda s: file_choices(("csv", "tsv", "yaml", "yml"), s),
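The hunk header shows this argument sits near `def file_choices(choices, fname)`, whose body is not included in the diff. A minimal sketch of how such an extension validator is typically wired into argparse (a reconstruction under that assumption, not the pipeline's actual code):

```python
import argparse
import os

def file_choices(choices, fname):
    # Sketch only: the real helper in bin/custom_multiqc.py is not shown in this diff.
    # Reject the argument unless the file extension is one of `choices`.
    ext = os.path.splitext(fname)[1][1:].lower()
    if ext not in choices:
        raise argparse.ArgumentTypeError(f"file must end in one of {choices}: {fname}")
    return fname

parser = argparse.ArgumentParser()
parser.add_argument(
    "--mapping_constraints",
    help="Mapping constraints file: '.csv', '.tsv', '.yaml', '.yml'",
    type=lambda s: file_choices(("csv", "tsv", "yaml", "yml"), s),
)
args = parser.parse_args(["--mapping_constraints", "constraints.csv"])
print(args.mapping_constraints)
```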

bin/utils/module_data_processing.py (+3 −3)

@@ -275,7 +275,7 @@ def reformat_constrain_df(df, file_columns, args):
        return df, df

    # Add constrain metadata to the mapping constrain table
-    constrain_meta = filelist_to_df([args.mapping_constrains])
+    constrain_meta = filelist_to_df([args.mapping_constraints])

    # drop unwanted columns & reorder
    constrain_meta = drop_columns(constrain_meta, ["sequence", "samples"])
@@ -295,12 +295,12 @@ def reformat_constrain_df(df, file_columns, args):

    # add mapping summary to sample overview table in ... wide format with species & segment combination
    logger.info("Creating mapping constrain summary (wide) table")
-    mapping_constrains_summary = create_constrain_summary(df, file_columns).set_index("sample")
+    mapping_constraints_summary = create_constrain_summary(df, file_columns).set_index("sample")

    logger.info("Coalescing columns")
    coalesced_constrains = coalesce_constrain(df)
    coalesced_constrains = drop_columns(coalesced_constrains, ["id", "selection", "rank"])
-    return coalesced_constrains, mapping_constrains_summary
+    return coalesced_constrains, mapping_constraints_summary


 def generate_ignore_samples(dataframe: pd.DataFrame) -> pd.Series:
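The helpers used in this hunk (`filelist_to_df`, `drop_columns`, `coalesce_constrain`, `create_constrain_summary`) are defined elsewhere in the module and not shown in the diff. As a hedged sketch, the calls above appear to rely on a tolerant `drop_columns` that ignores absent names; one plausible shape:

```python
import pandas as pd

def drop_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Sketch of a tolerant column drop: skip names that are absent,
    so callers can pass ["sequence", "samples"] unconditionally."""
    present = [c for c in columns if c in df.columns]
    return df.drop(columns=present)

df = pd.DataFrame({"id": [1], "sequence": ["ACGT"], "species": ["x"]})
print(drop_columns(df, ["sequence", "samples"]).columns.tolist())  # ['id', 'species']
```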

conf/modules.config (+1 −1)

@@ -372,7 +372,7 @@ process {
         ext.args =
             [
                 params.spades_mode ? "--${params.spades_mode}" : '' //,
-                // params.mapping_constrains ? "--trusted-contigs ${params.mapping_constrains}" : ''
+                // params.mapping_constraints ? "--trusted-contigs ${params.mapping_constraints}" : ''
             ].join(' ').trim()
         publishDir = [
             [
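The surrounding `ext.args` block builds the SPAdes argument string by joining optional flags and trimming the result; the `--trusted-contigs` option remains commented out after the rename. A hedged Python analogue of that join-and-trim idiom, for illustration only (the pipeline itself expresses this in Nextflow/Groovy):

```python
# Hypothetical values standing in for the Nextflow params above.
spades_mode = "rnaviral"
mapping_constraints = None  # the --trusted-contigs flag is still disabled in the config

parts = [
    f"--{spades_mode}" if spades_mode else "",
    # f"--trusted-contigs {mapping_constraints}" if mapping_constraints else "",
]
ext_args = " ".join(parts).strip()  # mirrors .join(' ').trim()
print(ext_args)  # --rnaviral
```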

conf/tests/test.config (+1 −1)

@@ -39,7 +39,7 @@ params {
     kaiju_db = "https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_viruses_2023-05-26.tgz"

     reference_pool = "https://github.com/Joon-Klaps/nextclade_data/raw/old_datasets/data/nextstrain/sars-cov-2/MN908947/sequences.fasta"
-    mapping_constrains = "${projectDir}/assets/samplesheets/mapping_constrains.csv"
+    mapping_constraints = "${projectDir}/assets/samplesheets/mapping_constraints.csv"

     save_intermediate_polishing = true
     intermediate_mapping_stats = true

conf/tests/test_fail_mapped.config (+1 −1)

@@ -34,7 +34,7 @@ params {
     skip_read_classification = true
     kaiju_db = "https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_viruses_2023-05-26.tgz"
     reference_pool = "https://github.com/Joon-Klaps/nextclade_data/raw/old_datasets/data/nextstrain/sars-cov-2/MN908947/sequences.fasta"
-    mapping_constrains = "${projectDir}/assets/samplesheets/mapping_constrains_fail.tsv"
+    mapping_constraints = "${projectDir}/assets/samplesheets/mapping_constraints_fail.tsv"

     min_mapped_reads = 100
     intermediate_mapping_stats = true

conf/tests/test_full.config (+1 −1)

@@ -40,7 +40,7 @@ params {
     kaiju_db = "https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_viruses_2023-05-26.tgz"
     reference_pool = "https://github.com/Joon-Klaps/nextclade_data/raw/old_datasets/data/nextstrain/sars-cov-2/MN908947/sequences.fasta"

-    mapping_constrains = "${projectDir}/assets/samplesheets/mapping_constrains.csv"
+    mapping_constraints = "${projectDir}/assets/samplesheets/mapping_constraints.csv"
     checkv_db = "https://github.com/nf-core/test-datasets/raw/phageannotator/modules/nfcore/checkv/endtoend/checkv_minimal_db.tar"

     save_intermediate_polishing = true

conf/tests/test_umi.config (+1 −1)

@@ -42,7 +42,7 @@ params {
     kaiju_db = "https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_viruses_2023-05-26.tgz"

     reference_pool = "https://github.com/Joon-Klaps/nextclade_data/raw/old_datasets/data/nextstrain/sars-cov-2/MN908947/sequences.fasta"
-    mapping_constrains = "${projectDir}/assets/samplesheets/mapping_constrains.csv"
+    mapping_constraints = "${projectDir}/assets/samplesheets/mapping_constraints.csv"

     save_intermediate_polishing = true
     save_intermediate_reads = true

docs/images/preprocessing.png (binary file changed: 2.76 KB → 53.5 KB)

docs/parameters.md (+1 −1)

@@ -133,7 +133,7 @@ Parameters relating to the analysis of variants associated to contigs and scaffo
 |-----------|-----------|-----------|
 | `skip_variant_calling` | Skip the analysis of variants for the external reference or contigs | |
 | `mapper` | Define which mapping tool needs to be used when mapping reads to reference | bwamem2 |
-| `mapping_constrains` | Sequence to use as a mapping reference instead of the de novo contigs or scaffolds | |
+| `mapping_constraints` | Sequence to use as a mapping reference instead of the de novo contigs or scaffolds | |
 | `deduplicate` | deduplicate the reads <details><summary>Help</summary><small>If used with umi's, `umi tools` will be used to group and call consensus of each indiual read group. If not used with umi's use `PicardsMarkDuplicates`. </small></details>| True |
 | `variant_caller` | Define the variant caller to use: 'ivar' or 'bcftools' | ivar |
 | `consensus_caller` | consensus tool used for calling new consensus in final iteration | ivar |

docs/workflow/assembly_polishing.md (+17 −23)

@@ -1,4 +1,3 @@
-
 # Assembly & polishing

 Viralgenie offers an elaborate workflow for the assembly and polishing of viral genomes:
@@ -9,17 +8,17 @@ Viralgenie offers an elaborate workflow for the assembly and polishing of viral
 1. [Reference Matching](#4-reference-matching): comparing contigs to a reference sequence pool.
 1. [Taxonomy guided Clustering](#5-taxonomy-guided-clustering): clustering contigs based on taxonomy and nucleotide similarity.
     - [Pre-clustering](#51-pre-clustering-using-taxonomy): separating contigs based on identified taxonomy-id.
-    - [Actual clustering](#52-actual-clustering-on-nucloetide-similarity): clustering contigs based on nucleotide similarity.
-1. [Scaffolding](#scaffolding): scaffolding the contigs to the centroid of each bin.
-1. [Annotation with Reference](#annotation-with-reference): annotating regions with 0-depth coverage with the reference sequence.
+    - [Actual clustering](#52-actual-clustering-on-nucleotide-similarity): clustering contigs based on nucleotide similarity.
+1. [Scaffolding](#7-scaffolding): scaffolding the contigs to the centroid of each bin.
+1. [Annotation with Reference](#8-annotation-with-reference): annotating regions with 0-depth coverage with the reference sequence.

 ![assembly_polishing](../images/assembly_polishing.png)

-> The overal workflow of creating reference assisted assemblies can be skipped with the argument `--skip_assembly`. See the [parameters assembly section](../parameters.md#assembly) for all relevant arguments to control the assembly steps.
+> The overall workflow of creating reference assisted assemblies can be skipped with the argument `--skip_assembly`. See the [parameters assembly section](../parameters.md#assembly) for all relevant arguments to control the assembly steps.

 > The overall refinement of contigs can be skipped with the argument `--skip_polishing`. See the [parameters polishing section](../parameters.md#polishing) for all relevant arguments to control the polishing steps.

-The consensus genome of all clusters are then send to the [variant analysis & iterative refinement](variant_and_refinement.md) step.
+The consensus genome of all clusters are then sent to the [variant analysis & iterative refinement](variant_and_refinement.md) step.

 ## 1. De-novo Assembly

@@ -34,7 +33,7 @@ Low complexity contigs can be filtered out using prinseq++ with the `--skip_cont

 Contigs can be extended using [SSPACE Basic](https://github.com/nsoranzo/sspace_basic) with the `--skip_sspace_basic false` parameter. SSPACE is a tool for scaffolding contigs using paired-end reads. It is modified from SSAKE assembler and has the feature of extending contigs using reads that are unmappable in the contig assembly step. To maximize its efficiency, consider specifying the arguments `--read_distance`, `--read_distance_sd`, and `--read_orientation`. For more information on these arguments, see the [parameters assembly section](../parameters.md#assembly).

-> The extension of contigs is ran by default, to skip this step, use `--skip_sspace_basic`.
+> The extension of contigs is run by default, to skip this step, use `--skip_sspace_basic`.

 ## 3. Coverage calculation

@@ -44,24 +43,23 @@ Processed reads are mapped back against the contigs to determine the number of r

 ## 4. Reference Matching

-The newly assembled contigs are compared to a reference sequence pool (--reference_pool) using a [BLASTn search](https://www.ncbi.nlm.nih.gov/books/NBK153387/). This process not only helps annotate the contigs but also assists in linking together sets of contigs that are distant within a single genome. Essentially, it aids in identifying contigs belonging to the same genomic segment and choosing the right reference for scaffolding purposes.
+The newly assembled contigs are compared to a reference sequence pool (`--reference_pool`) using a [BLASTn search](https://www.ncbi.nlm.nih.gov/books/NBK153387/). This process not only helps annotate the contigs but also assists in linking together sets of contigs that are distant within a single genome. Essentially, it aids in identifying contigs belonging to the same genomic segment and choosing the right reference for scaffolding purposes.

-The top 5 hits for each contig are combined with the denovo contigs and send to the clustering step.
+The top 5 hits for each contig are combined with the de novo contigs and sent to the clustering step.

 > The reference pool can be specified with the `--reference_pool` parameter. The default is the latest clustered [Reference Viral DataBase (RVDB)](https://rvdb.dbi.udel.edu/).

 ## 5. Taxonomy guided Clustering

-The clustering workflow of contigs consists out of 2 steps, the [pre-clustering using taxonomy](#51-pre-clustering-using-taxonomy) and
-[actual clustering on nucleotide similarity](#52-actual-clustering-on-nucloetide-similarity). The taxonomy guided clustering is used to separate contigs based on taxonomy and nucleotide similarity.
+The clustering workflow of contigs consists of 2 steps, the [pre-clustering using taxonomy](#51-pre-clustering-using-taxonomy) and
+[actual clustering on nucleotide similarity](#52-actual-clustering-on-nucleotide-similarity). The taxonomy guided clustering is used to separate contigs based on taxonomy and nucleotide similarity.

 ```mermaid
 graph LR;
     A[Contigs] --> B["`**Pre-clustering**`"];
     B --> C["`**Actual clustering**`"];
 ```

-
 ### 5.1 Pre-clustering using taxonomy

 The contigs along with their references have their taxonomy assigned using [Kraken2](https://ccb.jhu.edu/software/kraken2/) and [Kaiju](https://kaiju.binf.ku.dk/).
@@ -70,7 +68,7 @@ The contigs along with their references have their taxonomy assigned using [Krak
 > - Kraken2: viral refseq database, `--kraken2_db`
 > - Kaiju: clustered [RVDB](https://rvdb.dbi.udel.edu/), `--kaiju_db`

-As Kajiu and Kraken2 can have different taxonomic assignments, an additional step is performed to resolve potential inconsistencies in taxonomy and to identify the taxonomy of the contigs. This is done with a custom script that is based on `KrakenTools extract_kraken_reads.py` and `kaiju-Merge-Outputs`.
+As Kaiju and Kraken2 can have different taxonomic assignments, an additional step is performed to resolve potential inconsistencies in taxonomy and to identify the taxonomy of the contigs. This is done with a custom script that is based on `KrakenTools extract_kraken_reads.py` and `kaiju-Merge-Outputs`.

 ```mermaid
 graph LR;
@@ -94,7 +92,7 @@

 1. Options here are 'species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom' or 'superkingdom'.

-2. `--precluster_include_childeren`__"genus1"__ :
+2. `--precluster_include_children` __"genus1"__ :

 ```mermaid
 graph TD;
@@ -118,9 +116,9 @@
 ```
 Dotted lines represent exclusion of taxa.

-> The pre-clustering step will be run by default but can be skipped with the argument `--skip_preclustering`. Specify which classifier to use with `--precluster_classifiers` parameter. The default is `kaiju,kraken2`. Contig taxon filtering is still enabled despite not having to solve for inconsistencies if only Kaiju or Kraken2 is ran.
+> The pre-clustering step will be run by default but can be skipped with the argument `--skip_preclustering`. Specify which classifier to use with `--precluster_classifiers` parameter. The default is `kaiju,kraken2`. Contig taxon filtering is still enabled despite not having to solve for inconsistencies if only Kaiju or Kraken2 is run.

-### 5.2 Actual clustering on nucloetide similarity
+### 5.2 Actual clustering on nucleotide similarity

 The clustering is performed with one of the following tools:

@@ -131,7 +129,6 @@ The clustering is performed with one of the following tools:
 - [`vRhyme`](https://github.com/AnantharamanLab/vRhyme)
 - [`mash`](https://github.com/marbl/Mash)

-
 These methods all come with their own advantages and disadvantages. For example, cdhitest is very fast but cannot be used for large viruses >10Mb and similarity threshold cannot go below 80% which is not preferable for highly diverse RNA viruses. Vsearch is slower but accurate. Mmseqs-linclust is the fastest but tends to create a large amount of bins. Mmseqs-cluster is slower but can handle larger datasets and is more accurate. vRhyme is a new method that is still under development but has shown promising results but can sometimes not output any bins when segments are small. Mash is a very fast comparison method is linked with a custom script that identifies communities within a network.

 !!! Tip
@@ -143,28 +140,25 @@ These methods all come with their own advantages and disadvantages. For example,

 > The similarity threshold can be specified with the `--similarity_threshold` parameter. The default is `0.85`.

-
 ## 6. Coverage filtering

-The coverage of the contigs is calculated using the same method as in the [coverage calculation step](#3-coverage-calculation). A cumulative sum is taken across the contigs from every assembler. If these cumulative sums is above the specified `--perc_reads_contig` parameter, the contig is kept. If all cumulative sums is below the specified parameter, the contig is removed.
+The coverage of the contigs is calculated using the same method as in the [coverage calculation step](#3-coverage-calculation). A cumulative sum is taken across the contigs from every assembler. If these cumulative sums are above the specified `--perc_reads_contig` parameter, the contig is kept. If all cumulative sums are below the specified parameter, the contig is removed.

 !!! Info annotate "Show me an example how it works"
     If the `--perc_reads_contig` is set to `5`, the cumulative sum of the contigs from every assembler is calculated. For example:

-    - Cluster 1: the cumulative sum of the contigs from SPAdes is 0.6, Megahit is 0.5, the cluster is kept. 
+    - Cluster 1: the cumulative sum of the contigs from SPAdes is 0.6, Megahit is 0.5, the cluster is kept.
     - Cluster 2: the cumulative sum of the contigs from SPAdes is 0.1, Megahit is 0.1, the cluster is removed.
     - Cluster 3: the cumulative sum of the contigs from SPAdes is 0.5, Megahit is 0, the cluster is kept.

 > The default is `5` and can be specified with the `--perc_reads_contig` parameter.

-
 ## 7. Scaffolding

 After classifying all contigs and their top BLAST hits into distinct clusters or bins, the contigs are then scaffolded to the centroid of each bin. Any external references that are not centroids of the cluster are subsequently removed to prevent further bias. All members of the cluster are consequently mapped towards their centroid with [Minimap2](https://github.com/lh3/minimap2) and consensus is called using [iVar-consensus](https://andersen-lab.github.io/ivar/html/manualpage.html).

-
 ## 8. Annotation with Reference

-Regions with 0-depth coverage are annotated with the reference sequence. This is done with a [custom script](https://github.com/Joon-Klaps/viralgenie/blob/dev/bin/lowcov_to_reference.py) that uses the coverage of the denovo contigs towards the reference sequence to identify regions with 0-depth coverage. The reference sequence is then annotated to these regions.
+Regions with 0-depth coverage are annotated with the reference sequence. This is done with a [custom script](https://github.com/Joon-Klaps/viralgenie/blob/dev/bin/lowcov_to_reference.py) that uses the coverage of the de novo contigs towards the reference sequence to identify regions with 0-depth coverage. The reference sequence is then annotated to these regions.

 > This step can be skipped using `--skip_hybrid_consensus` parameter.
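The coverage-filter wording fixed in this hunk keeps a cluster when the cumulative read share from at least one assembler reaches `--perc_reads_contig`, and drops it only when every assembler falls below. A hedged Python sketch of that rule with hypothetical data, assuming the threshold and the per-contig read shares are on the same percentage scale:

```python
from collections import defaultdict

def keep_clusters(contigs, perc_reads_contig=5.0):
    """Sketch of the section-6 rule: keep a cluster if, for at least one
    assembler, the cumulative read percentage across its contigs meets the cutoff.

    contigs: iterable of (cluster, assembler, pct_reads) tuples (hypothetical shape).
    """
    totals = defaultdict(float)
    for cluster, assembler, pct in contigs:
        totals[(cluster, assembler)] += pct
    return {cluster for (cluster, _), total in totals.items() if total >= perc_reads_contig}

contigs = [
    ("cluster1", "spades", 4.0), ("cluster1", "spades", 2.0),   # cumulative 6.0 -> kept
    ("cluster2", "spades", 1.0), ("cluster2", "megahit", 1.0),  # all below 5  -> removed
]
print(keep_clusters(contigs))  # {'cluster1'}
```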
