Skip to content

Commit 268725c

Browse files
authored
Merge pull request #7 from TRON-Bioinformatics/sort-step
Add an initial sorting step + record software versions with output data
2 parents d9a07e9 + 5d20362 commit 268725c

7 files changed

+60
-6
lines changed

modules/01_prepare_bam.nf

+19-2
Original file line numberDiff line numberDiff line change
@@ -16,24 +16,30 @@ process PREPARE_BAM {
1616
cpus "${params.prepare_bam_cpus}"
1717
memory "${params.prepare_bam_memory}"
1818
tag "${name}"
19+
publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*"
1920

20-
conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null)
21+
conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0 bioconda::samtools=1.12" : null)
2122

2223
input:
2324
tuple val(name), val(type), file(bam)
2425

2526
output:
2627
tuple val(name), val(type), file("${name}.prepared.bam"), emit: prepared_bams
28+
file("software_versions.${task.process}.txt")
2729

2830
script:
2931
order = params.skip_deduplication ? "--SORT_ORDER coordinate": "--SORT_ORDER queryname"
3032
"""
3133
mkdir tmp
3234
35+
samtools sort \
36+
--threads ${params.prepare_bam_cpus} \
37+
-o ${name}.sorted.bam ${bam}
38+
3339
gatk AddOrReplaceReadGroups \
3440
--java-options '-Xmx${params.prepare_bam_memory} -Djava.io.tmpdir=./tmp' \
3541
--VALIDATION_STRINGENCY SILENT \
36-
--INPUT ${bam} \
42+
--INPUT ${name}.sorted.bam \
3743
--OUTPUT /dev/stdout \
3844
--REFERENCE_SEQUENCE ${params.reference} \
3945
--RGPU 1 \
@@ -50,13 +56,20 @@ process PREPARE_BAM {
5056
--INPUT /dev/stdin \
5157
--OUTPUT ${name}.prepared.bam \
5258
--SEQUENCE_DICTIONARY ${params.reference}
59+
60+
rm -f ${name}.sorted.bam
61+
62+
echo ${params.manifest} >> software_versions.${task.process}.txt
63+
gatk --version >> software_versions.${task.process}.txt
64+
samtools --version >> software_versions.${task.process}.txt
5365
"""
5466
}
5567

5668
process INDEX_BAM {
5769
cpus "${params.index_cpus}"
5870
memory "${params.index_memory}"
5971
tag "${name}"
72+
publishDir "${params.output}/${name}", mode: "copy", pattern: "software_versions.*"
6073

6174
conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null)
6275

@@ -65,6 +78,7 @@ process INDEX_BAM {
6578

6679
output:
6780
tuple val(name), val(type), file("${bam}"), file("${bam.baseName}.bai"), emit: indexed_bams
81+
file("software_versions.${task.process}.txt")
6882

6983
script:
7084
"""
@@ -73,5 +87,8 @@ process INDEX_BAM {
7387
gatk BuildBamIndex \
7488
--java-options '-Xmx8g -Djava.io.tmpdir=./tmp' \
7589
--INPUT ${bam}
90+
91+
echo ${params.manifest} >> software_versions.${task.process}.txt
92+
gatk --version >> software_versions.${task.process}.txt
7693
"""
7794
}

modules/02_mark_duplicates.nf

+5
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ process MARK_DUPLICATES {
1010
memory "${params.mark_duplicates_memory}"
1111
tag "${name}"
1212
publishDir "${params.output}/${name}/metrics/mark_duplicates", mode: "copy", pattern: "*.dedup_metrics.txt"
13+
publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*"
1314

1415
conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null)
1516

@@ -19,6 +20,7 @@ process MARK_DUPLICATES {
1920
output:
2021
tuple val(name), val(type), file("${name}.dedup.bam"), file("${name}.dedup.bam.bai"), emit: deduplicated_bams
2122
file("${name}.dedup_metrics.txt") optional true
23+
file("software_versions.${task.process}.txt")
2224

2325
script:
2426
dedup_metrics = params.skip_metrics ? "": "--METRICS_FILE ${name}.dedup_metrics.txt"
@@ -41,5 +43,8 @@ process MARK_DUPLICATES {
4143
cp ${name}.dedup.bai ${name}.dedup.bam.bai
4244
4345
rm -f ${name}.sorted.bam
46+
47+
echo ${params.manifest} >> software_versions.${task.process}.txt
48+
gatk --version >> software_versions.${task.process}.txt
4449
"""
4550
}

modules/03_metrics.nf

+15
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ process HS_METRICS {
1212
memory params.metrics_memory
1313
tag "${name}"
1414
publishDir "${params.output}/${name}/metrics/hs_metrics", mode: "copy"
15+
publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*"
1516

1617
conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null)
1718

@@ -22,6 +23,7 @@ process HS_METRICS {
2223
file("*_metrics") optional true
2324
file("*.pdf") optional true
2425
file("${name}.hs_metrics.txt")
26+
file("software_versions.${task.process}.txt")
2527

2628
script:
2729
minimum_base_quality = params.collect_hs_metrics_min_base_quality ?
@@ -43,6 +45,9 @@ process HS_METRICS {
4345
--TARGET_INTERVALS my.intervals \
4446
--BAIT_INTERVALS my.intervals \
4547
${minimum_base_quality} ${minimum_mapping_quality}
48+
49+
echo ${params.manifest} >> software_versions.${task.process}.txt
50+
gatk --version >> software_versions.${task.process}.txt
4651
"""
4752
}
4853

@@ -51,6 +56,7 @@ process METRICS {
5156
memory params.metrics_memory
5257
tag "${name}"
5358
publishDir "${params.output}/${name}/metrics/gatk_multiple_metrics", mode: "copy"
59+
publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*"
5460

5561
// NOTE: the method CollectMultipleMetrics has a hidden dependency on R for making plots
5662
conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0 r::r=3.6.0" : null)
@@ -61,6 +67,7 @@ process METRICS {
6167
output:
6268
file("*_metrics") optional true
6369
file("*.pdf") optional true
70+
file("software_versions.${task.process}.txt")
6471

6572
"""
6673
mkdir tmp
@@ -78,6 +85,9 @@ process METRICS {
7885
--PROGRAM CollectInsertSizeMetrics \
7986
--PROGRAM CollectSequencingArtifactMetrics \
8087
--PROGRAM CollectSequencingArtifactMetrics
88+
89+
echo ${params.manifest} >> software_versions.${task.process}.txt
90+
gatk --version >> software_versions.${task.process}.txt
8191
"""
8292
}
8393

@@ -86,6 +96,7 @@ process COVERAGE_ANALYSIS {
8696
memory params.metrics_memory
8797
tag "${name}"
8898
publishDir "${params.output}/${name}/metrics/coverage", mode: "copy"
99+
publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*"
89100

90101
conda (params.enable_conda ? "bioconda::samtools=1.12" : null)
91102

@@ -95,6 +106,7 @@ process COVERAGE_ANALYSIS {
95106
output:
96107
file("${name}.coverage.tsv")
97108
file("${name}.depth.tsv")
109+
file("software_versions.${task.process}.txt")
98110

99111
script:
100112
minimum_base_quality = params.collect_hs_metrics_min_base_quality ?
@@ -105,5 +117,8 @@ process COVERAGE_ANALYSIS {
105117
"""
106118
samtools coverage ${minimum_base_quality} ${minimum_mapping_quality} ${bam} > ${name}.coverage.tsv
107119
samtools depth -s -d 0 -H ${intervals} ${bam} > ${name}.depth.tsv
120+
121+
echo ${params.manifest} >> software_versions.${task.process}.txt
122+
samtools --version >> software_versions.${task.process}.txt
108123
"""
109124
}

modules/04_realignment_around_indels.nf

+5
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ process REALIGNMENT_AROUND_INDELS {
1111
memory "${params.realignment_around_indels_memory}"
1212
tag "${name}"
1313
publishDir "${params.output}/${name}/metrics/realignment", mode: "copy", pattern: "*.RA.intervals"
14+
publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*"
1415

1516
// NOTE: this dependency is fixed to GATK 3 as the realignment around indels is no longer maintained in GATK 4
1617
// but, for some reason, GATK 3 still needs the dependency on GATK 4.2.0.0 in order to work
@@ -22,6 +23,7 @@ process REALIGNMENT_AROUND_INDELS {
2223
output:
2324
tuple val(name), val(type), file("${name}.realigned.bam"), file("${name}.realigned.bai"), emit: realigned_bams
2425
file("${name}.RA.intervals")
26+
file("software_versions.${task.process}.txt")
2527

2628
script:
2729
known_indels1 = params.known_indels1 ? " --known ${params.known_indels1}" : ""
@@ -44,5 +46,8 @@ process REALIGNMENT_AROUND_INDELS {
4446
--consensusDeterminationModel USE_SW \
4547
--LODThresholdForCleaning 0.4 \
4648
--maxReadsInMemory 600000 ${known_alleles1} ${known_alleles2}
49+
50+
echo ${params.manifest} >> software_versions.${task.process}.txt
51+
gatk3 --version >> software_versions.${task.process}.txt
4752
"""
4853
}

modules/05_bqsr.nf

+6-1
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@ params.output = 'output'
88
process BQSR {
99
cpus "${params.bqsr_cpus}"
1010
memory "${params.bqsr_memory}"
11-
publishDir "${params.output}/${name}", mode: "copy"
1211
tag "${name}"
12+
publishDir "${params.output}/${name}", mode: "copy"
13+
publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*"
1314

1415
conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null)
1516

@@ -21,6 +22,7 @@ process BQSR {
2122
file "${name}.recalibration_report.grp"
2223
file "${name}.preprocessed.bam"
2324
file "${name}.preprocessed.bai"
25+
file("software_versions.${task.process}.txt")
2426

2527
"""
2628
mkdir tmp
@@ -38,6 +40,9 @@ process BQSR {
3840
--output ${name}.preprocessed.bam \
3941
--reference ${params.reference} \
4042
--bqsr-recal-file ${name}.recalibration_report.grp
43+
44+
echo ${params.manifest} >> software_versions.${task.process}.txt
45+
gatk --version >> software_versions.${task.process}.txt
4146
"""
4247
}
4348

nextflow.config

+2-3
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@ profiles {
1515
params.prepare_bam_memory = "3g"
1616
params.mark_duplicates_cpus = 1
1717
params.mark_duplicates_memory = "3g"
18-
params.skip_mark_duplicates_cpus = 1
19-
params.skip_mark_duplicates_memory = "3g"
2018
params.realignment_around_indels_cpus = 1
2119
params.realignment_around_indels_memory = "3g"
2220
params.bqsr_cpus = 1
@@ -46,7 +44,7 @@ process.shell = ['/bin/bash', '-euo', 'pipefail']
4644

4745
cleanup = true
4846

49-
VERSION = '1.8.1'
47+
VERSION = '1.9.0'
5048
DOI = 'https://zenodo.org/badge/latestdoi/358400957'
5149

5250
manifest {
@@ -59,6 +57,7 @@ manifest {
5957
version = VERSION
6058
doi = DOI
6159
}
60+
params.manifest = manifest
6261

6362
params.help_message = """
6463
TronFlow bam preprocessing v${VERSION} ${DOI}

tests/test_01.sh

+8
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,13 @@ nextflow main.nf -profile test,conda --output $output
77

88
test -s $output/sample1/sample1.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
99
test -s $output/sample1/sample1.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }
10+
test -s $output/sample1/software_versions.PREPARE_BAM.txt || { echo "Missing software versions file!"; exit 1; }
11+
test -s $output/sample1/software_versions.MARK_DUPLICATES.txt || { echo "Missing software versions file!"; exit 1; }
12+
test -s $output/sample1/software_versions.HS_METRICS.txt || { echo "Missing software versions file!"; exit 1; }
13+
test -s $output/sample1/software_versions.METRICS.txt || { echo "Missing software versions file!"; exit 1; }
14+
test -s $output/sample1/software_versions.COVERAGE_ANALYSIS.txt || { echo "Missing software versions file!"; exit 1; }
15+
test -s $output/sample1/software_versions.REALIGNMENT_AROUND_INDELS.txt || { echo "Missing software versions file!"; exit 1; }
16+
test -s $output/sample1/software_versions.BQSR.txt || { echo "Missing software versions file!"; exit 1; }
17+
1018
test -s $output/sample2/sample2.preprocessed.bam || { echo "Missing output BAM file!"; exit 1; }
1119
test -s $output/sample2/sample2.preprocessed.bai || { echo "Missing output BAI file!"; exit 1; }

0 commit comments

Comments
 (0)