Skip to content

Commit 1f1c6f3

Browse files
authored
Merge pull request #151 from Joon-Klaps/custom-vcf
Add new module to inculde custom mpileup-vcf file for intra-host analyses
2 parents 1052083 + bfa16d7 commit 1f1c6f3

File tree

7 files changed

+220
-46
lines changed

7 files changed

+220
-46
lines changed

CHANGELOG.md

+2
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ Initial release of Joon-Klaps/viralgenie, created with the [nf-core](https://nf-
1818
- Adding mash-screen output to result table ([#140](https://github.com/Joon-Klaps/viralgenie/pull/140))
1919
- Add logic to allow samples with no reference hits to be analysed ([#141](https://github.com/Joon-Klaps/viralgenie/pull/141))
2020
- Add visualisation for hybrid scaffold ([#143](https://github.com/Joon-Klaps/viralgenie/pull/143))
21+
- Add new module to inculde custom mpileup-vcf file for intra-host analyses ([#151](https://github.com/Joon-Klaps/viralgenie/pull/151))
22+
2123

2224
### `Fixed`
2325

bin/custom_mpileup.py

+110
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
#!/usr/bin/env python
2+
3+
"""Provide a command line tool to filter generate a custom tsv -csv file from a bam"""
4+
5+
import pysam
6+
import pysamstats
7+
import csv
8+
import numpy as np
9+
from numpy.typing import NDArray
10+
import argparse
11+
from pathlib import Path
12+
import logging
13+
14+
# Initialize logger
15+
logger = logging.getLogger(__name__)
16+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
17+
18+
19+
def parse_args(argv=None):
20+
"""Define and immediately parse command line arguments."""
21+
parser = argparse.ArgumentParser(
22+
description="Provide a command line tool to create a custom summary of mpileup results from a bam/cram/sam file",
23+
epilog="Example: python custom_mpileup.py --clusters_summary file1,file2,file3,... ",
24+
)
25+
parser = argparse.ArgumentParser(description="Custom mpileup processing script.")
26+
parser.add_argument("--alignment", type=Path, help="Input BAM file prefix")
27+
parser.add_argument("--reference", type=Path, help="Reference FASTA file")
28+
parser.add_argument("--prefix", type=str, help="Name of the output file")
29+
parser.add_argument(
30+
"-l",
31+
"--log-level",
32+
help="The desired log level (default WARNING).",
33+
choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"),
34+
default="INFO",
35+
)
36+
return parser.parse_args(argv)
37+
38+
39+
def process_mpileup(filename: Path, reference: Path) -> NDArray:
40+
"""
41+
Process mpileup data using numpy vectorized operations.
42+
43+
Args:
44+
filename: Path to the alignment file (BAM/CRAM/SAM)
45+
reference: Path to the reference FASTA file
46+
47+
Returns:
48+
NDArray: Array with columns [position, A, C, G, T, insertions, deletions, consensus]
49+
"""
50+
# Initialize FASTA file properly
51+
fasta = pysam.FastaFile(str(reference))
52+
53+
alignment_file = pysam.AlignmentFile(filename, "rc" if filename.suffix == ".cram" else "rb", reference_filename=str(reference))
54+
55+
# Convert generator to structured numpy array
56+
data = np.array(
57+
[
58+
(r["pos"], r["ref"], r["A"], r["C"], r["G"], r["T"], r["insertions"], r["deletions"], "N")
59+
for r in pysamstats.stat_variation(alignment_file, fafile=fasta)
60+
],
61+
dtype=[
62+
("pos", int),
63+
("ref", "U1"),
64+
("A", int),
65+
("C", int),
66+
("G", int),
67+
("T", int),
68+
("ins", int),
69+
("del", int),
70+
("consensus", "U1"),
71+
],
72+
)
73+
74+
# Extract nucleotide counts for consensus calculation
75+
nucleotides = np.vstack([data[base] for base in "ACGT"]).T
76+
total_coverage = nucleotides.sum(axis=1)
77+
max_counts = nucleotides.max(axis=1)
78+
79+
# Update consensus column where conditions are met
80+
mask = np.divide(max_counts, total_coverage, where=total_coverage > 0) >= 0.7
81+
data["consensus"][mask] = np.array(["A", "C", "G", "T"])[nucleotides[mask].argmax(axis=1)]
82+
83+
return data
84+
85+
86+
def write_csv(matrix: NDArray, prefix: str) -> None:
87+
"""
88+
Write the matrix to a csv file
89+
90+
Args:
91+
matrix: NumPy array containing the mpileup results
92+
output: Path to the output file
93+
"""
94+
header = ["Position", "Reference", "A", "C", "G", "T", "Insertions", "Deletions", "Consensus"]
95+
with open(f"{prefix}.tsv", "w", newline="", encoding="utf-8") as file:
96+
writer = csv.writer(file, delimiter="\t")
97+
writer.writerow(header)
98+
writer.writerows(matrix)
99+
100+
101+
def main():
102+
args = parse_args()
103+
logger.info("Starting mpileup processing")
104+
matrix = process_mpileup(args.alignment, args.reference)
105+
write_csv(matrix, args.prefix)
106+
logger.info("Mpileup processing completed")
107+
108+
109+
if __name__ == "__main__":
110+
main()

conf/modules.config

+22-29
Original file line numberDiff line numberDiff line change
@@ -1264,14 +1264,26 @@ process {
12641264
ext.prefix = { "${meta.id}_${meta.previous_step}" } // DON'T CHANGE
12651265
publishDir = [
12661266
path: { "${meta.iteration}" == "variant-calling" ?
1267-
"${params.outdir}/variants/mapping-info/picard/${meta.sample}" :
1267+
"${params.outdir}/variants/mapping-info/metrics/picard/${meta.sample}" :
12681268
"${params.outdir}/assembly/polishing/iterations/${meta.step}/metrics/picard/" },
12691269
mode: params.publish_dir_mode,
12701270
pattern: '*{metrics,pdf}',
12711271
saveAs: { filename -> params.prefix || params.global_prefix ? "${params.global_prefix}-$filename" : filename }
12721272
]
12731273
}
12741274

1275+
withName: CUSTOM_MPILEUP {
1276+
ext.prefix = { "${meta.id}_${meta.previous_step}.vcf" } // DON'T CHANGE
1277+
publishDir = [
1278+
path: { "${meta.iteration}" == "variant-calling" ?
1279+
"${params.outdir}/variants/mapping-info/custom-vcf/${meta.sample}" :
1280+
"${params.outdir}/assembly/polishing/iterations/${meta.step}/custom-vcf/" },
1281+
mode: params.publish_dir_mode,
1282+
pattern: '*tsv',
1283+
saveAs: { filename -> params.prefix || params.global_prefix ? "${params.global_prefix}-$filename" : filename }
1284+
]
1285+
}
1286+
12751287
withName: ".*BAM_STATS_METRICS:BAM_STATS_SAMTOOLS:SAMTOOLS_STATS" {
12761288
ext.prefix = { "${meta.id}_${meta.previous_step}" } // DON'T CHANGE
12771289
publishDir = [
@@ -1471,9 +1483,7 @@ process {
14711483
ext.args2 = 10
14721484
ext.prefix = { "${meta.id}_${meta.previous_step}" } // DON'T CHANGE
14731485
publishDir = [
1474-
path: { "${meta.iteration}" == "variant-calling" ?
1475-
"${params.outdir}/consensus/mask/${meta.sample}" :
1476-
"${params.outdir}/assembly/polishing/iterations/${meta.step}/consensus/mask" },
1486+
path: { "${params.outdir}/consensus/mask/${meta.step}/${meta.sample}"},
14771487
mode: params.publish_dir_mode,
14781488
enabled: params.save_intermediate_polishing,
14791489
pattern: '*.{mpileup}',
@@ -1484,9 +1494,7 @@ process {
14841494
withName: BEDTOOLS_MERGE {
14851495
ext.prefix = { "${meta.id}_${meta.previous_step}.merged" } // DON'T CHANGE
14861496
publishDir = [
1487-
path: { "${meta.iteration}" == "variant-calling" ?
1488-
"${params.outdir}/consensus/mask/${meta.sample}" :
1489-
"${params.outdir}/assembly/polishing/iterations/${meta.step}/consensus/mask" },
1497+
path: { "${params.outdir}/consensus/mask/${meta.step}/${meta.sample}"},
14901498
mode: params.publish_dir_mode,
14911499
enabled: params.save_intermediate_polishing,
14921500
pattern: '*.{bed}',
@@ -1497,9 +1505,7 @@ process {
14971505
withName: BEDTOOLS_MASKFASTA {
14981506
ext.prefix = { "${meta.id}_${meta.previous_step}.masked" } // DON'T CHANGE
14991507
publishDir = [
1500-
path: { "${meta.iteration}" == "variant-calling" ?
1501-
"${params.outdir}/consensus/mask/${meta.sample}" :
1502-
"${params.outdir}/assembly/polishing/iterations/${meta.step}/consensus/mask" },
1508+
path: { "${params.outdir}/consensus/mask/${meta.step}/${meta.sample}"},
15031509
mode: params.publish_dir_mode,
15041510
enabled: false,
15051511
pattern: '*.{fa}',
@@ -1538,24 +1544,11 @@ process {
15381544
].join(' ').trim()
15391545
ext.prefix = { "${meta.id}_it${meta.iteration}" } // DON'T CHANGE
15401546
publishDir = [
1541-
[
1542-
path: { "${meta.iteration}" == "variant-calling" ?
1543-
"${params.outdir}/consensus/mask/${meta.sample}" :
1544-
"${params.outdir}/assembly/polishing/iterations/${meta.step}/consensus/mask/${meta.sample}" },
1545-
mode: params.publish_dir_mode,
1546-
enabled: params.save_intermediate_polishing,
1547-
pattern: "*.txt",
1548-
saveAs: { filename -> params.prefix || params.global_prefix ? "${params.global_prefix}-$filename" : filename }
1549-
],
1550-
[
1551-
path: { "${meta.iteration}" == "variant-calling" ?
1552-
"${params.outdir}/consensus/mask/${meta.sample}" :
1553-
"${params.outdir}/assembly/polishing/iterations/${meta.step}/consensus/mask/${meta.sample}" },
1554-
mode: params.publish_dir_mode,
1555-
enabled: params.save_intermediate_polishing,
1556-
pattern: "*.mpileup",
1557-
saveAs: { filename -> params.prefix || params.global_prefix ? "${params.global_prefix}-$filename" : filename }
1558-
]
1547+
path: { "${params.outdir}/consensus/mask/${meta.step}/${meta.sample}"},
1548+
mode: params.publish_dir_mode,
1549+
enabled: params.save_intermediate_polishing,
1550+
pattern: "*.{txt,mpileup}",
1551+
saveAs: { filename -> params.prefix || params.global_prefix ? "${params.global_prefix}-$filename" : filename }
15591552
]
15601553
}
15611554

@@ -1596,7 +1589,7 @@ process {
15961589
}
15971590

15981591
withName: QUAST_QC {
1599-
ext.args = "--min-contig 0"
1592+
ext.args = "--min-contig 0"
16001593
publishDir = [
16011594
path: { "${params.outdir}/consensus/quality_control/quast/${meta.sample}/${meta.step}/" },
16021595
mode: params.publish_dir_mode,

docs/output.md

+22-17
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,21 @@ __Summary statistics__:
574574
- `*.CollectMultipleMetrics.*`: Alignment QC files from picard CollectMultipleMetrics in `*_metrics` textual format.
575575
- `*.pdf` plots for metrics obtained from CollectMultipleMetrics.
576576

577+
#### Custom - mpileup like file
578+
579+
To facilitate the intra host analysis, a mpileup like file is generated. This file contains the depth of every nucletoride at each position of the reference.
580+
581+
???- abstract "Output files - variants"
582+
583+
- `variants/mapping-info/custom-vcf/<sample-id>`
584+
- `*.tsv`: A custom tsv file containing the depth of every nucleotide at each position of the reference.
585+
586+
587+
???- abstract "Output files - iterations"
588+
589+
- `assembly/polishing/iterations/it#/custom-vcf/<sample-id>`
590+
- `*.tsv`: A custom tsv file containing the depth of every nucleotide at each position of the reference.
591+
577592
#### Mosdepth - Coverage
578593

579594
[mosdepth](https://github.com/brentp/mosdepth) is a fast BAM/CRAM depth calculation for WGS, exome, or targeted sequencing. mosdepth is used in this pipeline to obtain genome-wide coverage values in 200bp windows. The results are rendered in MultiQC (genome-wide coverage).
@@ -645,25 +660,15 @@ The consensus sequences are generated by [`BCFTools`](http://samtools.github.io/
645660

646661
`BCFtools` will use the filtered variants file whereas, `iVar` will redetermine the variants to collapse in the consensus using their own workflow, read more about their differences in the [consensus calling section](./workflow/variant_and_refinement.md#consensus-calling).
647662

648-
???- abstract "Output files - variants"
663+
???- abstract "Output files - iterations & variants"
649664

650665
- `consensus`
651-
- `seq/`
652-
- `<sample-id>/<sample-id>_<constrain-id>-CONSTRAIN_itvariant_calling.fa`: A fasta file containing the consensus sequence.
653-
- `mask/`
654-
- `<sample-id>/<sample-id>_<constrain-id>-CONSTRAIN*.qual.txt`: A log file of the consensus run containing statistics. [`iVar` only]
655-
- `<sample-id>/<sample-id>_<constrain-id>-CONSTRAIN*.bed`: A bed file containing the masked regions. [`BCFtools` only]
656-
- `<sample-id>/<sample-id>_<constrain-id>-CONSTRAIN*.mpileup`: A mpileup file containing information on the depth and the quality of each alinged base.
657-
658-
???- abstract "Output files - iterations"
659-
660-
- `assembly/polishing/iterations/it#/consensus`
661-
- `seq/`
662-
- `<sample-id>/<sample-id>_cl#_itvariant_calling.fa`: A fasta file containing the consensus sequence.
663-
- `mask/`
664-
- `<sample-id>/<sample-id>_cl#_it*.qual.txt`: A log file of the consensus run containing statistics. [`iVar` only]
665-
- `<sample-id>/<sample-id>_cl#_it*.bed`: A bed file containing the masked regions. [`BCFtools` only]
666-
- `<sample-id>/<sample-id>_cl#_it*.mpileup`: A mpileup file containing information on the depth and the quality of each alinged base.
666+
- `seq/<it# | consensus | singleton | constrain>/ `
667+
- `<sample-id>/*.fasta`: A fasta file containing the consensus sequence.
668+
- `mask/<it# | consensus | singleton | constrain>`
669+
- `<sample-id>/*.qual.txt`: A log file of the consensus run containing statistics. [`iVar` only]
670+
- `<sample-id>/*.bed`: A bed file containing the masked regions. [`BCFtools` only]
671+
- `<sample-id>/*.mpileup`: A mpileup file containing information on the depth and the quality of each alinged base.
667672

668673

669674
## Consensus Quality control
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
name: custom_mpileup
2+
channels:
3+
- conda-forge
4+
- bioconda
5+
dependencies:
6+
- bioconda::argparse
7+
- bioconda::pysam
8+
- bioconda::pysamstats
9+
- conda-forge::pandas

modules/local/custom_mpileup/main.nf

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
process CUSTOM_MPILEUP {
2+
tag "$meta.id"
3+
label 'process_single'
4+
5+
conda "${moduleDir}/environment.yml"
6+
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
7+
'https://depot.galaxyproject.org/singularity/pysamstats:1.1.2--py39he47c912_12':
8+
'quay.io/biocontainers/pysamstats:1.1.2--py311h0152c62_12' }"
9+
10+
input:
11+
tuple val(meta), path(bam), path(ref)
12+
13+
output:
14+
tuple val(meta), path("*.tsv"), emit: tsv
15+
path "versions.yml" , emit: versions
16+
17+
when:
18+
task.ext.when == null || task.ext.when
19+
20+
script:
21+
def args = task.ext.args ?: ''
22+
def prefix = task.ext.prefix ?: "${meta.id}"
23+
"""
24+
custom_mpileup.py \\
25+
$args \\
26+
--alignment ${bam} \\
27+
--reference ${ref} \\
28+
--prefix ${prefix}
29+
30+
cat <<-END_VERSIONS > versions.yml
31+
"${task.process}":
32+
python: \$(python --version | sed 's/Python //g')
33+
pandas: \$(pip show pandas | grep Version: | sed 's/Version: //g')
34+
pysam: \$(pip show pysam | grep Version: | sed 's/Version: //g')
35+
pysamstats: \$(pip show pysamstats | grep Version: | sed 's/Version: //g')
36+
END_VERSIONS
37+
"""
38+
39+
stub:
40+
def args = task.ext.args ?: ''
41+
def prefix = task.ext.prefix ?: "${meta.id}"
42+
"""
43+
touch ${prefix}.tsv
44+
45+
cat <<-END_VERSIONS > versions.yml
46+
"${task.process}":
47+
python: \$(python --version | sed 's/Python //g')
48+
pandas: \$(pip show pandas | grep Version: | sed 's/Version: //g')
49+
pysam: \$(pip show pysam | grep Version: | sed 's/Version: //g')
50+
pysamstats: \$(pip show pysamstats | grep Version: | sed 's/Version: //g')
51+
"""
52+
}

subworkflows/local/bam_stats_metrics.nf

+3
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/i
22
include { PICARD_COLLECTMULTIPLEMETRICS } from '../../modules/nf-core/picard/collectmultiplemetrics/main'
33
include { MOSDEPTH } from '../../modules/nf-core/mosdepth/main'
44
include { BAM_STATS_SAMTOOLS } from '../nf-core/bam_stats_samtools/main'
5+
include { CUSTOM_MPILEUP } from '../../modules/local/custom_mpileup/main'
56

67
workflow BAM_STATS_METRICS {
78

@@ -27,6 +28,8 @@ workflow BAM_STATS_METRICS {
2728
bam_bai_bed: [meta, bam, bai, []]
2829
}
2930

31+
CUSTOM_MPILEUP (sort_bam_ref)
32+
3033
PICARD_COLLECTMULTIPLEMETRICS ( input_metrics.bam_bai, input_metrics.ref, [[:], []] )
3134
ch_versions = ch_versions.mix(PICARD_COLLECTMULTIPLEMETRICS.out.versions)
3235

0 commit comments

Comments
 (0)