Commit 35c1e25

Merge pull request #156 from Joon-Klaps/shannon-entropy
Add column in custom mpileup - Shannon entropy
2 parents d7f8178 + 9dd62c1 commit 35c1e25

8 files changed, +95 -44 lines

CHANGELOG.md (+1)

@@ -22,6 +22,7 @@ Initial release of Joon-Klaps/viralgenie, created with the [nf-core](https://nf-
 - Update docs ([#150](https://github.com/Joon-Klaps/viralgenie/pull/150))
 - Make custom-mpileup.py postion 1 index based and not 0 index to follow bcftools ([#153](https://github.com/Joon-Klaps/viralgenie/pull/153))
 - Update docs for more streamlined docs & figures ([#154](https://github.com/Joon-Klaps/viralgenie/pull/154))
+- Add column in custom mpileup - Shannon entropy ([#156](https://github.com/Joon-Klaps/viralgenie/pull/156))

 ### `Fixed`

README.md (+1 -1)

@@ -34,7 +34,7 @@
     - Read UMI deduplication ([`HUMID`](https://humid.readthedocs.io/en/latest/usage.html))
     - Low complexity and quality filtering ([`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus))
     - Host-read removal ([`BowTie2`](http://bowtie-bio.sourceforge.net/bowtie2/))
-3. Metagenomic diveristy mapping
+3. Metagenomic diversity mapping
     - Performs taxonomic classification and/or profiling using one or more of:
         - [`Kraken2`](https://ccb.jhu.edu/software/kraken2/)
         - [`Bracken`](https://ccb.jhu.edu/software/bracken/)(optional)

bin/custom_mpileup.py (+81 -36)

@@ -26,6 +26,7 @@ def parse_args(argv=None):
     parser.add_argument("--alignment", type=Path, help="Input BAM file prefix")
     parser.add_argument("--reference", type=Path, help="Reference FASTA file")
     parser.add_argument("--prefix", type=str, help="Name of the output file")
+    parser.add_argument("--k", type=int, help="Pseudocount to add to the total for shannon entropy calculation", default=50)
     parser.add_argument(
         "-l",
         "--log-level",
@@ -36,51 +37,95 @@ def parse_args(argv=None):
     return parser.parse_args(argv)


-def process_mpileup(filename: Path, reference: Path) -> NDArray:
+def process_mpileup(filename: Path, reference: Path, k: int) -> NDArray:
     """
     Process mpileup data using numpy vectorized operations.
-
-    Args:
-        filename: Path to the alignment file (BAM/CRAM/SAM)
-        reference: Path to the reference FASTA file
-
-    Returns:
-        NDArray: Array with columns [position, A, C, G, T, insertions, deletions, consensus]
     """
-    # Initialize FASTA file properly
     fasta = pysam.FastaFile(str(reference))
-
     alignment_file = pysam.AlignmentFile(filename, "rc" if filename.suffix == ".cram" else "rb", reference_filename=str(reference))

     # Convert generator to structured numpy array
-    data = np.array(
-        [
-            (r["pos"]+1, r["ref"], r["A"], r["C"], r["G"], r["T"], r["insertions"], r["deletions"], "N")
-            for r in pysamstats.stat_variation(alignment_file, fafile=fasta)
-        ],
-        dtype=[
-            ("pos", int),
-            ("ref", "U1"),
-            ("A", int),
-            ("C", int),
-            ("G", int),
-            ("T", int),
-            ("ins", int),
-            ("del", int),
-            ("consensus", "U1"),
-        ],
+    stats = list(pysamstats.stat_variation(alignment_file, fafile=fasta))
+    n_rows = len(stats)
+
+    # Create structured array in one go
+    data = np.zeros(n_rows, dtype=[
+        ("pos", int), ("ref", "U1"), ("A", int), ("C", int),
+        ("G", int), ("T", int), ("ins", int), ("del", int),
+        ("consensus", "U1"), ("entropy", float), ("weighted_entropy", float)
+    ])
+
+    # Fill arrays using vectorized operations
+    data["pos"] = np.array([r["pos"] + 1 for r in stats])
+    data["ref"] = np.array([r["ref"] for r in stats])
+    for base in "ACGT":
+        data[base] = np.array([r[base] for r in stats])
+    data["ins"] = np.array([r["insertions"] for r in stats])
+    data["del"] = np.array([r["deletions"] for r in stats])
+    data["consensus"] = "N"
+
+    # Create nucleotide matrix for vectorized operations
+    nucleotides = np.stack([data[base] for base in "ACGT"], axis=1)
+    total_coverage = np.sum(nucleotides, axis=1)
+
+    # Vectorized consensus calculation
+    max_counts = np.max(nucleotides, axis=1)
+    mask = np.divide(max_counts, total_coverage, where=total_coverage > 0) >= 0.7
+    data["consensus"][mask] = np.array(["A", "C", "G", "T"])[np.argmax(nucleotides[mask], axis=1)]
+
+    # Calculate shannon entropy
+    data["entropy"] = shannon_entropy(nucleotides, total_coverage)
+    data["weighted_entropy"] = weighted_entropy(data["entropy"], total_coverage, k)
+
+    return data
+
+def shannon_entropy(nucleotides: NDArray, total_coverage: NDArray) -> NDArray:
+    """
+    Calculate the Shannon entropy of the nucleotide distribution
+    """
+    # Define epsilon for numerical stability
+    eps = 1e-10
+
+    # Calculate the frequency of each nucleotide
+    # Add epsilon to avoid division by zero and set a minimum threshold
+    frequencies = np.divide(
+        nucleotides,
+        total_coverage[:, np.newaxis],
+        where=total_coverage[:, np.newaxis] > eps
     )

-    # Extract nucleotide counts for consensus calculation
-    nucleotides = np.vstack([data[base] for base in "ACGT"]).T
-    total_coverage = nucleotides.sum(axis=1)
-    max_counts = nucleotides.max(axis=1)
+    # Set very small frequencies to zero to avoid floating point errors
+    frequencies = np.where(frequencies < eps, 0.0, frequencies)

-    # Update consensus column where conditions are met
-    mask = np.divide(max_counts, total_coverage, where=total_coverage > 0) >= 0.7
-    data["consensus"][mask] = np.array(["A", "C", "G", "T"])[nucleotides[mask].argmax(axis=1)]
+    # Normalize frequencies to ensure they sum to 1
+    row_sums = np.sum(frequencies, axis=1, keepdims=True)
+    frequencies = np.divide(frequencies, row_sums, where=row_sums > eps)

-    return data
+    # Calculate the Shannon entropy only for non-zero frequencies
+    with np.errstate(divide='ignore', invalid='ignore'):
+        entropy = -np.sum(
+            np.where(
+                frequencies > eps,
+                frequencies * np.log2(frequencies),
+                0.0
+            ),
+            axis=1
+        )
+
+    # Replace NaN values with 0.0
+    entropy = np.nan_to_num(entropy)
+
+    # Round to 3 decimal places and ensure -0.0 is converted to 0.0
+    entropy = np.where(entropy == -0.0, 0.0, np.round(entropy, 3))
+
+    return entropy
+
+def weighted_entropy(entropy: NDArray, total_coverage: NDArray, k: int) -> NDArray:
+    """
+    Correct the Shannon entropy by multiplying it with N/(N+k)
+    """
+    correction = total_coverage / (total_coverage + k)
+    return np.round(entropy * correction, 3)


 def write_csv(matrix: NDArray, prefix: str) -> None:
@@ -91,7 +136,7 @@ def write_csv(matrix: NDArray, prefix: str) -> None:
         matrix: NumPy array containing the mpileup results
         output: Path to the output file
     """
-    header = ["Position", "Reference", "A", "C", "G", "T", "Insertions", "Deletions", "Consensus"]
+    header = ["Position", "Reference", "A", "C", "G", "T", "Insertions", "Deletions", "Consensus", "Entropy", "Weighted Entropy"]
     with open(f"{prefix}.tsv", "w", newline="", encoding="utf-8") as file:
         writer = csv.writer(file, delimiter="\t")
         writer.writerow(header)
@@ -101,7 +146,7 @@ def write_csv(matrix: NDArray, prefix: str) -> None:
 def main():
     args = parse_args()
     logger.info("Starting mpileup processing")
-    matrix = process_mpileup(args.alignment, args.reference)
+    matrix = process_mpileup(args.alignment, args.reference, args.k)
     write_csv(matrix, args.prefix)
     logger.info("Mpileup processing completed")
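For intuition about the new columns, here is a minimal standalone sketch of the entropy math introduced above (assumes only `numpy`; the helper is a simplified version of the diff's `shannon_entropy`, and the counts are toy values, not pipeline output):

```python
import numpy as np

def shannon_entropy(nucleotides: np.ndarray, total_coverage: np.ndarray) -> np.ndarray:
    """Per-position Shannon entropy (bits) of A/C/G/T counts, simplified from the script."""
    # Per-base frequencies; rows with zero coverage stay all-zero.
    freqs = np.divide(
        nucleotides,
        total_coverage[:, np.newaxis],
        out=np.zeros(nucleotides.shape, dtype=float),
        where=total_coverage[:, np.newaxis] > 0,
    )
    # Sum -p * log2(p) over the four bases, treating p == 0 as contributing 0.
    with np.errstate(divide="ignore", invalid="ignore"):
        h = -np.sum(np.where(freqs > 0, freqs * np.log2(freqs), 0.0), axis=1)
    return np.round(np.nan_to_num(h), 3)

# Toy counts (A, C, G, T) for three positions: skewed, uniform, uncovered.
counts = np.array([
    [90, 10, 0, 0],    # 90% A -> low diversity
    [25, 25, 25, 25],  # uniform -> maximal entropy of 2 bits
    [0, 0, 0, 0],      # no coverage
])
coverage = counts.sum(axis=1)

entropy = shannon_entropy(counts, coverage)
k = 50  # pseudocount, as in the new --k option (default 50)
weighted = np.round(entropy * coverage / (coverage + k), 3)

print(entropy)   # [0.469 2.    0.   ]
print(weighted)  # [0.313 1.333 0.   ] -- N/(N+k) shrinks low-coverage entropy toward 0
```

With the flags added in `parse_args`, an invocation would look something like `python bin/custom_mpileup.py --alignment sample.bam --reference ref.fasta --prefix sample --k 50` (file names hypothetical), producing `sample.tsv` with the two extra columns.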

docs/output.md (+7 -2)

@@ -577,7 +577,12 @@ __Summary statistics__:

 #### Custom - mpileup like file

-To facilitate the intra host analysis, a mpileup like file is generated. This file contains the depth of every nucleotide at each position of the reference.
+To facilitate the intra-host analysis, an mpileup-like file is generated. This file contains the depth of every nucleotide at each position of the reference, as well as the Shannon entropy and a weighted Shannon entropy based on the following formulae.
+
+- Shannon entropy: $$ H = -\sum_{i=1}^4 p_i \log_2 p_i $$
+- Weighted Shannon entropy: $$ w(H) = \frac{N}{N+k} \cdot H $$
+
+Where $N$ is the total number of bases at a position, $k$ is the pseudocount (default 50), and $p_i$ is the frequency of nucleotide $i$.

 ???- abstract "Output files - variants"

@@ -656,7 +661,7 @@ Variant files are visualized in the MultiQC report.

 The consensus sequences are generated by [`BCFTools`](http://samtools.github.io/bcftools/bcftools.html) or [`iVar`](https://andersen-lab.github.io/ivar/html/manualpage.html). The consensus sequences are stored in the directory `consensus/` or in the iterations directory `assembly/polishing/iterations/it#/consensus`.

-`BCFtools` will use the filtered variants file whereas, `iVar` will redetermine the variants to collapse in the consensus using their own workflow, read more about their differences in the [consensus calling section](./workflow/variant_and_refinement.md#consensus-calling).
+`BCFtools` will use the filtered variants file, whereas `iVar` will redetermine the variants to collapse in the consensus using its own workflow; read more about their differences in the [consensus calling section](./workflow/variant_and_refinement.md#4-consensus-calling).

 ???- abstract "Output files - iterations & variants"
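As a worked example of the entropy formulae above (toy numbers, not pipeline output): a position covered by 90 `A` and 10 `C` reads has $N = 100$ and $p = (0.9, 0.1, 0, 0)$, so

$$ H = -(0.9 \log_2 0.9 + 0.1 \log_2 0.1) \approx 0.469, \qquad w(H) = \frac{100}{100 + 50} \cdot 0.469 \approx 0.313 $$

The same base frequencies at only 10x coverage give $w(H) = \frac{10}{60} \cdot 0.469 \approx 0.078$: the pseudocount $k$ shrinks the entropy estimate at positions with little supporting coverage.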

docs/parameters.md (+1 -1)

@@ -51,7 +51,7 @@ Options related to the trimming, low complexity and host removal steps of the re
 | `host_k2_library` | Kraken2 library(s) required to remove host and contamination <details><summary>Help</summary><small>Only used when no host kraken2 database is specified.</small></details>| human |
 | `skip_host_fastqc` | Skip the fastqc step after host & contaminants were removed | |

-## Metagenomic diveristy
+## Metagenomic diversity

 Parameters used to determine the metagenomic diversity of the sample

docs/usage.md (+1 -1)

@@ -90,7 +90,7 @@ An example samplesheet file consisting of both single- and paired-end data may l
 Viralgenie can in addition to constructing de novo consensus genomes map the sample reads to a series of references. These references are provided through the parameter `--mapping_constraints`.

 An example mapping constraint samplesheet file consisting of 5 references, may look something like the one below.
-> This is for 5 references, 2 of them being a multi-fasta file, only one of the multi-fasta needs to undergo [reference selection](./workflow/variant_and_refinement.md#selection-of-reference).
+> This is for 5 references, 2 of them being multi-fasta files; only one of the multi-fastas needs to undergo [reference selection](./workflow/variant_and_refinement.md#1a-selection-of-reference).


 === "TSV"

docs/workflow/metagenomic_diversity.md (+1 -1)

@@ -13,7 +13,7 @@ Viralgenie offers two main tools for the classification of reads and a summary v

 Feel free to reach out and suggest more classifiers. However, if the main goal of your project is to establish the presence of a virus within a sample and are therefore only focused on metagenomic diversity, have a look at [taxprofiler](https://nf-co.re/taxprofiler/)

-> The read classification can be skipped with the argument `--skip_read_classification`, classifiers should be specified with the parameter `--read_classifiers 'kaiju,kraken2'` (no spaces, no caps). See the [parameters classification section](../parameters.md#read-classification) for all relevant arguments to control the classification steps.
+> The read classification can be skipped with the argument `--skip_read_classification`; classifiers should be specified with the parameter `--read_classifiers 'kaiju,kraken2'` (no spaces, no caps). See the [parameters classification section](../parameters.md#metagenomic-diversity) for all relevant arguments to control the classification steps.

 ## Kaiju
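Put together, a run that keeps read classification and uses both classifiers mentioned in the doc text above might look like the following sketch (the `-profile` and `--input` arguments are assumed from standard nf-core conventions and are not part of this diff):

```bash
nextflow run Joon-Klaps/viralgenie \
    -profile docker \
    --input samplesheet.csv \
    --read_classifiers 'kaiju,kraken2'
```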

nextflow_schema.json (+2 -2)

@@ -203,7 +203,7 @@
             },
             "fa_icon": "fas fa-bahai"
         },
-        "metagenomic_diveristy": {
+        "metagenomic_diversity": {
             "title": "Metagenomic diversity",
             "type": "object",
             "description": "Parameters used to determine the metagenomic diversity of the sample",
@@ -903,7 +903,7 @@
            "$ref": "#/$defs/preprocessing_options"
        },
        {
-           "$ref": "#/$defs/metagenomic_diveristy"
+           "$ref": "#/$defs/metagenomic_diversity"
        },
        {
            "$ref": "#/$defs/assembly"