Skip to content

Commit 6a64388

Browse files
authored
Merge pull request #10 from TRON-Bioinformatics/rna-support
RNA support
2 parents 802fa8c + 7c514be commit 6a64388

11 files changed

+138
-56
lines changed

.github/workflows/automated_tests.yml

+1
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,5 @@ jobs:
2929
key: ${{ runner.os }}-tronflow-bam-preprocessing
3030
- name: Run tests
3131
run: |
32+
export NXF_VER=22.04.5
3233
make

Makefile

+1
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,4 @@ test:
1616
bash tests/test_07.sh
1717
bash tests/test_08.sh
1818
bash tests/test_09.sh
19+
bash tests/test_10.sh

main.nf

+13-6
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
nextflow.enable.dsl = 2
44

55
include { PREPARE_BAM; INDEX_BAM } from './modules/01_prepare_bam'
6-
include { MARK_DUPLICATES } from './modules/02_mark_duplicates'
7-
include { METRICS; HS_METRICS; COVERAGE_ANALYSIS } from './modules/03_metrics'
6+
include { MARK_DUPLICATES; SPLIT_CIGAR_N_READS } from './modules/02_mark_duplicates'
7+
include { METRICS; HS_METRICS; COVERAGE_ANALYSIS; FLAGSTAT } from './modules/03_metrics'
88
include { REALIGNMENT_AROUND_INDELS } from './modules/04_realignment_around_indels'
99
include { BQSR; CREATE_OUTPUT } from './modules/05_bqsr'
1010

@@ -26,6 +26,7 @@ params.output = 'output'
2626
params.platform = "ILLUMINA"
2727
params.collect_hs_metrics_min_base_quality = false
2828
params.collect_hs_metrics_min_mapping_quality = false
29+
params.split_cigarn = false
2930

3031
// computational resources
3132
params.prepare_bam_cpus = 3
@@ -84,7 +85,7 @@ else if (params.input_files) {
8485

8586
workflow {
8687

87-
PREPARE_BAM(input_files)
88+
PREPARE_BAM(input_files, params.reference)
8889

8990
if (!params.skip_deduplication) {
9091
MARK_DUPLICATES(PREPARE_BAM.out.prepared_bams)
@@ -95,24 +96,30 @@ workflow {
9596
deduplicated_bams = INDEX_BAM.out.indexed_bams
9697
}
9798

99+
if (params.split_cigarn) {
100+
SPLIT_CIGAR_N_READS(deduplicated_bams, params.reference)
101+
deduplicated_bams = SPLIT_CIGAR_N_READS.out.split_cigarn_bams
102+
}
103+
98104
if (! params.skip_metrics) {
99105
if (params.intervals) {
100106
HS_METRICS(deduplicated_bams)
101107
}
102-
METRICS(deduplicated_bams)
108+
METRICS(deduplicated_bams, params.reference)
103109
COVERAGE_ANALYSIS(deduplicated_bams)
110+
FLAGSTAT(deduplicated_bams)
104111
}
105112

106113
if (!params.skip_realignment) {
107-
REALIGNMENT_AROUND_INDELS(deduplicated_bams)
114+
REALIGNMENT_AROUND_INDELS(deduplicated_bams, params.reference)
108115
realigned_bams = REALIGNMENT_AROUND_INDELS.out.realigned_bams
109116
}
110117
else {
111118
realigned_bams = deduplicated_bams
112119
}
113120

114121
if (!params.skip_bqsr) {
115-
BQSR(realigned_bams)
122+
BQSR(realigned_bams, params.reference)
116123
preprocessed_bams = BQSR.out.recalibrated_bams
117124
}
118125
else {

modules/01_prepare_bam.nf

+20-21
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@ params.prepare_bam_memory = "8g"
33
params.index_cpus = 1
44
params.index_memory = "8g"
55
params.platform = "ILLUMINA"
6-
params.reference = false
7-
params.skip_deduplication = false
86
params.output = 'output'
97

108
/*
@@ -18,35 +16,31 @@ process PREPARE_BAM {
1816
tag "${name}"
1917
publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*"
2018

21-
conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0 bioconda::samtools=1.12" : null)
19+
conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null)
2220

2321
input:
2422
tuple val(name), val(type), file(bam)
23+
val(reference)
2524

2625
output:
2726
tuple val(name), val(type), file("${name}.prepared.bam"), emit: prepared_bams
2827
file("software_versions.${task.process}.txt")
2928

3029
script:
31-
order = params.skip_deduplication ? "--SORT_ORDER coordinate": "--SORT_ORDER queryname"
3230
"""
3331
mkdir tmp
3432
35-
samtools sort \
36-
--threads ${params.prepare_bam_cpus} \
37-
-o ${name}.sorted.bam ${bam}
38-
3933
gatk AddOrReplaceReadGroups \
4034
--java-options '-Xmx${params.prepare_bam_memory} -Djava.io.tmpdir=./tmp' \
4135
--VALIDATION_STRINGENCY SILENT \
42-
--INPUT ${name}.sorted.bam \
36+
--INPUT ${bam} \
4337
--OUTPUT /dev/stdout \
44-
--REFERENCE_SEQUENCE ${params.reference} \
38+
--REFERENCE_SEQUENCE ${reference} \
4539
--RGPU 1 \
4640
--RGID 1 \
4741
--RGSM ${type} \
4842
--RGLB 1 \
49-
--RGPL ${params.platform} ${order} | \
43+
--RGPL ${params.platform} | \
5044
gatk CleanSam \
5145
--java-options '-Xmx${params.prepare_bam_memory} -Djava.io.tmpdir=./tmp' \
5246
--INPUT /dev/stdin \
@@ -55,13 +49,10 @@ process PREPARE_BAM {
5549
--java-options '-Xmx${params.prepare_bam_memory} -Djava.io.tmpdir=./tmp' \
5650
--INPUT /dev/stdin \
5751
--OUTPUT ${name}.prepared.bam \
58-
--SEQUENCE_DICTIONARY ${params.reference}
59-
60-
rm -f ${name}.sorted.bam
52+
--SEQUENCE_DICTIONARY ${reference}
6153
6254
echo ${params.manifest} >> software_versions.${task.process}.txt
6355
gatk --version >> software_versions.${task.process}.txt
64-
samtools --version >> software_versions.${task.process}.txt
6556
"""
6657
}
6758

@@ -71,24 +62,32 @@ process INDEX_BAM {
7162
tag "${name}"
7263
publishDir "${params.output}/${name}", mode: "copy", pattern: "software_versions.*"
7364

74-
conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null)
65+
conda (params.enable_conda ? "bioconda::sambamba=0.8.2" : null)
7566

7667
input:
7768
tuple val(name), val(type), file(bam)
7869

7970
output:
80-
tuple val(name), val(type), file("${bam}"), file("${bam.baseName}.bai"), emit: indexed_bams
71+
tuple val(name), val(type), file("${name}.sorted.bam"), file("${name}.sorted.bam.bai"), emit: indexed_bams
8172
file("software_versions.${task.process}.txt")
8273

8374
script:
8475
"""
8576
mkdir tmp
8677
87-
gatk BuildBamIndex \
88-
--java-options '-Xmx8g -Djava.io.tmpdir=./tmp' \
89-
--INPUT ${bam}
78+
# sort
79+
sambamba sort \
80+
--nthreads=${task.cpus} \
81+
--tmpdir=./tmp \
82+
--out=${name}.sorted.bam \
83+
${bam}
84+
85+
# indexes the output BAM file
86+
sambamba index \
87+
--nthreads=${task.cpus} \
88+
${name}.sorted.bam ${name}.sorted.bam.bai
9089
9190
echo ${params.manifest} >> software_versions.${task.process}.txt
92-
gatk --version >> software_versions.${task.process}.txt
91+
sambamba --version >> software_versions.${task.process}.txt
9392
"""
9493
}

modules/02_mark_duplicates.nf

+52-19
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,83 @@
11
params.mark_duplicates_cpus = 2
22
params.mark_duplicates_memory = "16g"
33
params.remove_duplicates = true
4-
params.skip_metrics = false
54
params.output = 'output'
65

76

87
process MARK_DUPLICATES {
98
cpus "${params.mark_duplicates_cpus}"
109
memory "${params.mark_duplicates_memory}"
1110
tag "${name}"
12-
publishDir "${params.output}/${name}/metrics/mark_duplicates", mode: "copy", pattern: "*.dedup_metrics.txt"
1311
publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*"
1412

15-
conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null)
13+
conda (params.enable_conda ? "bioconda::sambamba=0.8.2" : null)
1614

1715
input:
1816
tuple val(name), val(type), file(bam)
1917

2018
output:
2119
tuple val(name), val(type), file("${name}.dedup.bam"), file("${name}.dedup.bam.bai"), emit: deduplicated_bams
22-
file("${name}.dedup_metrics.txt") optional true
2320
file("software_versions.${task.process}.txt")
2421

2522
script:
26-
dedup_metrics = params.skip_metrics ? "": "--METRICS_FILE ${name}.dedup_metrics.txt"
27-
remove_duplicates = params.remove_duplicates ? "--REMOVE_DUPLICATES true" : "--REMOVE_DUPLICATES false"
23+
remove_duplicates_param = params.remove_duplicates ? "--remove-duplicates" : ""
2824
"""
2925
mkdir tmp
3026
31-
gatk SortSam \
32-
--java-options '-Xmx${params.mark_duplicates_memory} -Djava.io.tmpdir=./tmp' \
33-
--INPUT ${bam} \
34-
--OUTPUT ${name}.sorted.bam \
35-
--SORT_ORDER coordinate
36-
37-
gatk MarkDuplicates \
38-
--java-options '-Xmx${params.mark_duplicates_memory} -Djava.io.tmpdir=./tmp' \
39-
--INPUT ${name}.sorted.bam \
40-
--OUTPUT ${name}.dedup.bam \
41-
--ASSUME_SORT_ORDER coordinate \
42-
--CREATE_INDEX true ${remove_duplicates} ${dedup_metrics}
27+
# sort
28+
sambamba sort \
29+
--nthreads=${task.cpus} \
30+
--tmpdir=./tmp \
31+
--out=${name}.sorted.bam \
32+
${bam}
4333
44-
cp ${name}.dedup.bai ${name}.dedup.bam.bai
34+
# removes duplicates (sorted from the alignment process)
35+
sambamba markdup ${remove_duplicates_param} \
36+
--nthreads=${task.cpus} \
37+
--tmpdir=./tmp \
38+
${name}.sorted.bam ${name}.dedup.bam
4539
4640
rm -f ${name}.sorted.bam
4741
42+
# indexes the output BAM file
43+
sambamba index \
44+
--nthreads=${task.cpus} \
45+
${name}.dedup.bam ${name}.dedup.bam.bai
46+
47+
echo ${params.manifest} >> software_versions.${task.process}.txt
48+
sambamba --version >> software_versions.${task.process}.txt
49+
"""
50+
}
51+
52+
process SPLIT_CIGAR_N_READS {
53+
cpus "${params.prepare_bam_cpus}"
54+
memory "${params.prepare_bam_memory}"
55+
tag "${name}"
56+
publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*"
57+
58+
conda (params.enable_conda ? "bioconda::gatk4=4.2.5.0" : null)
59+
60+
input:
61+
tuple val(name), val(type), file(bam), file(bai)
62+
val(reference)
63+
64+
output:
65+
tuple val(name), val(type), file("${name}.split_cigarn.bam"), file("${name}.split_cigarn.bam.bai"), emit: split_cigarn_bams
66+
file("software_versions.${task.process}.txt")
67+
68+
script:
69+
"""
70+
mkdir tmp
71+
72+
gatk SplitNCigarReads \
73+
--java-options '-Xmx${params.prepare_bam_memory} -Djava.io.tmpdir=./tmp' \
74+
--input ${bam} \
75+
--output ${name}.split_cigarn.bam \
76+
--create-output-bam-index true \
77+
--reference ${reference}
78+
79+
cp ${name}.split_cigarn.bai ${name}.split_cigarn.bam.bai
80+
4881
echo ${params.manifest} >> software_versions.${task.process}.txt
4982
gatk --version >> software_versions.${task.process}.txt
5083
"""

modules/03_metrics.nf

+30-2
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ params.metrics_cpus = 1
22
params.metrics_memory = "8g"
33
params.collect_hs_metrics_min_base_quality = false
44
params.collect_hs_metrics_min_mapping_quality = false
5-
params.reference = false
65
params.output = 'output'
76
params.intervals = false
87

@@ -63,6 +62,7 @@ process METRICS {
6362

6463
input:
6564
tuple val(name), val(type), file(bam), file(bai)
65+
val(reference)
6666

6767
output:
6868
file("*_metrics") optional true
@@ -76,7 +76,7 @@ process METRICS {
7676
--java-options '-Xmx${params.metrics_memory} -Djava.io.tmpdir=./tmp' \
7777
--INPUT ${bam} \
7878
--OUTPUT ${name} \
79-
--REFERENCE_SEQUENCE ${params.reference} \
79+
--REFERENCE_SEQUENCE ${reference} \
8080
--PROGRAM QualityScoreDistribution \
8181
--PROGRAM MeanQualityByCycle \
8282
--PROGRAM CollectAlignmentSummaryMetrics \
@@ -122,3 +122,31 @@ process COVERAGE_ANALYSIS {
122122
samtools --version >> software_versions.${task.process}.txt
123123
"""
124124
}
125+
126+
process FLAGSTAT {
127+
cpus "${params.metrics_cpus}"
128+
memory "${params.metrics_memory}"
129+
tag "${name}"
130+
publishDir "${params.output}/${name}/metrics/flagstat", mode: "copy", pattern: "*.flagstat.csv"
131+
publishDir "${params.output}/${name}/", mode: "copy", pattern: "software_versions.*"
132+
133+
conda (params.enable_conda ? "bioconda::sambamba=0.8.2" : null)
134+
135+
input:
136+
tuple val(name), val(type), file(bam), file(bai)
137+
138+
output:
139+
file("${name}.flagstat.csv")
140+
file("software_versions.${task.process}.txt")
141+
142+
script:
143+
"""
144+
sambamba flagstat \
145+
--nthreads=${task.cpus} \
146+
--tabular \
147+
${bam} > ${name}.flagstat.csv
148+
149+
echo ${params.manifest} >> software_versions.${task.process}.txt
150+
sambamba --version >> software_versions.${task.process}.txt
151+
"""
152+
}

modules/04_realignment_around_indels.nf

+3-3
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ params.realignment_around_indels_cpus = 2
22
params.realignment_around_indels_memory = "31g"
33
params.known_indels1 = false
44
params.known_indels2 = false
5-
params.reference = false
65
params.output = 'output'
76

87

@@ -19,6 +18,7 @@ process REALIGNMENT_AROUND_INDELS {
1918

2019
input:
2120
tuple val(name), val(type), file(bam), file(bai)
21+
val(reference)
2222

2323
output:
2424
tuple val(name), val(type), file("${name}.realigned.bam"), file("${name}.realigned.bai"), emit: realigned_bams
@@ -36,12 +36,12 @@ process REALIGNMENT_AROUND_INDELS {
3636
gatk3 -Xmx${params.realignment_around_indels_memory} -Djava.io.tmpdir=./tmp -T RealignerTargetCreator \
3737
--input_file ${bam} \
3838
--out ${name}.RA.intervals \
39-
--reference_sequence ${params.reference} ${known_indels1} ${known_indels2}
39+
--reference_sequence ${reference} ${known_indels1} ${known_indels2}
4040
4141
gatk3 -Xmx${params.realignment_around_indels_memory} -Djava.io.tmpdir=./tmp -T IndelRealigner \
4242
--input_file ${bam} \
4343
--out ${name}.realigned.bam \
44-
--reference_sequence ${params.reference} \
44+
--reference_sequence ${reference} \
4545
--targetIntervals ${name}.RA.intervals \
4646
--consensusDeterminationModel USE_SW \
4747
--LODThresholdForCleaning 0.4 \

0 commit comments

Comments
 (0)