Skip to content

Commit 8a52bb9

Browse files
author
Pablo Riesgo-Ferreiro
authoredMar 25, 2024
Merge pull request #1 from TRON-Bioinformatics/support-bams
Support bams
2 parents 099c6d7 + 2c08937 commit 8a52bb9

21 files changed

+1788
-61
lines changed
 

‎.gitignore

+3-2
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
/workspace.xml
44
work
55
.nextflow.log*
6-
report.html
7-
timeline.html
6+
report.html*
7+
timeline.html*
8+
trace.txt*
89
trace.txt
910
dag.dot
1011
.nextflow

‎README.md

+20-7
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,21 @@ Nextflow (Di Tommaso, 2017) pipeline for HLA typing using HLA-HD (Kawaguchi, 201
1414

1515
## How to run it
1616

17-
Prepare an input table with the FASTQs for each sample with three tab-separated columns **without a header**
17+
Prepare an input table with the FASTQs for each sample with three tab-separated columns **without a header** using `--input_fastqs`.
1818

19-
| Sample name | FASTQ1 | FASTQ2 |
20-
|----------------------|---------------------------------|------------------------------|
21-
| sample_1 | /path/to/sample_1.1.fq | /path/to/sample_1.2.fq |
22-
| sample_2 | /path/to/sample_2.1.fq | /path/to/sample_2.2.fq |
19+
| Sample name | FASTQ1 | FASTQ2 |
20+
|----------------------|---------------------------|------------------------------|
21+
| sample_1 | /path/to/sample_1.1.fq.gz | /path/to/sample_1.2.fq.gz |
22+
| sample_2 | /path/to/sample_2.1.fq.gz | /path/to/sample_2.2.fq.gz |
2323

24+
Alternatively, provide a table with BAM files using `--input_bams`.
25+
26+
| Sample name | BAM |
27+
|----------------------|-----------------------|
28+
| sample_1 | /path/to/sample_1.bam |
29+
| sample_2 | /path/to/sample_2.bam |
30+
31+
BAM files should be indexed.
2432

2533
Run as indicated below.
2634

@@ -34,22 +42,27 @@ Usage:
3442
nextflow run main.nf --input_files input_files --output output_folder
3543
3644
Input:
37-
* input_files: the path to a tab-separated values file containing in each row the sample name, FASTQ 1 and FASTQ 2
45+
* input_fastqs: the path to a tab-separated values file containing in each row the sample name, FASTQ 1 and FASTQ 2
3846
The input file does not have header!
3947
Example input file:
4048
name1 fastq1.fq.gz fastq2.fq.gz
4149
name2 fastq1.fq.gz fastq2.fq.gz
50+
* input_bams: the path to a tab-separated values file containing in each row the sample name and BAM
51+
The input file does not have header!
52+
Example input file:
53+
name1 name1.bam
54+
name2 name2.bam
4255
* output: output folder where results will be stored
4356
4457
Optional input:
58+
* reference: the reference genome to use (default: hg38, possible values: hg38 or hg19)
4559
* read_length: the read length (default: 50)
4660
* hlahd_folder: the HLA-HD folder (default: /code/hlahd.1.2.0.1)
4761
* bowtie2_folder: the bowtie2 folder (default: /code/bowtie/2.3.4.3)
4862
* bowtie2_module: the module to load with bowtie2
4963
* ld_library_path: the value to set in LD_LIBRARY_PATH
5064
* cpus: the number of CPUs per sample (default: 15)
5165
* memory: the amount of memory per sample (default: 30g)
52-
5366
```
5467

5568

‎main.nf

+29-47
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
nextflow.enable.dsl = 2
44

5+
include { BAM2FASTQ } from './modules/00_bam2fastq'
6+
include { HLAHD } from './modules/01_hlahd'
7+
58

69
def helpMessage() {
710
log.info params.help_message
@@ -13,63 +16,42 @@ if (params.help) {
1316
}
1417

1518
// checks required inputs
16-
if (params.input_files) {
19+
if (params.input_fastqs) {
1720
Channel
18-
.fromPath(params.input_files)
21+
.fromPath(params.input_fastqs)
1922
.splitCsv(header: ['name', 'fastq1', 'fastq2'], sep: "\t")
2023
.map{ row-> tuple(row.name, file(row.fastq1), file(row.fastq2)) }
21-
.set { input_files }
24+
.set { input_fastqs }
25+
} else if (params.input_bams) {
26+
Channel
27+
.fromPath(params.input_bams)
28+
.splitCsv(header: ['name', 'bam'], sep: "\t")
29+
.map{ row-> tuple(row.name, file(row.bam)) }
30+
.set { input_bams }
2231
} else {
23-
exit 1, "Input file not specified!"
32+
exit 1, "Provide either --input_fastqs or --input_bams"
2433
}
2534

2635

27-
process HLAHD {
28-
cpus params.cpus
29-
memory params.memory
30-
tag "${name}"
31-
publishDir "${params.output}/${name}", mode: "copy", pattern: "*.txt"
32-
module params.bowtie2_module
33-
34-
input:
35-
tuple val(name), val(fastq1), val(fastq2)
36-
37-
output:
38-
tuple val("${name}"), path("*final*")
39-
40-
script:
41-
"""
42-
mkdir temp
43-
44-
# HLA-HD wants its own binaries and bowtie2 in path
45-
export PATH=${params.hlahd_folder}/bin/:${params.bowtie2_folder}:\$PATH
46-
export LD_LIBRARY_PATH=${params.ld_library_path}
36+
if (params.reference == 'hg38') {
37+
contigs = "$baseDir/references/contigs_hla_reads_hg38.bed"
38+
}
39+
else if (params.reference == 'hg19') {
40+
contigs = "$baseDir/references/contigs_hla_reads_hg19.bed"
41+
}
42+
else {
43+
log.error "--reference only supports hg38 or hg19"
44+
exit 1
45+
}
4746

48-
zcat ${fastq1} > input_fastq1.fastq
49-
zcat ${fastq2} > input_fastq2.fastq
5047

51-
# HLA HD does not accept gzipped fastq files, unzip them first
52-
hlahd.sh \
53-
-m ${params.read_length} \
54-
-t ${task.cpus} \
55-
-f ${params.hlahd_folder}/freq_data/ \
56-
input_fastq1.fastq \
57-
input_fastq2.fastq \
58-
${params.hlahd_folder}/HLA_gene.split.txt \
59-
${params.hlahd_folder}/dictionary/ \
60-
${name} \
61-
temp
6248

63-
# moves the final result to base folder
64-
mv temp/${name}/result/* .
49+
workflow {
6550

66-
# deletes temp folder
67-
rm -rf temp
68-
rm -f input_fastq1.fastq
69-
rm -f input_fastq2.fastq
70-
"""
71-
}
51+
if (input_bams) {
52+
BAM2FASTQ(input_bams, contigs)
53+
input_fastqs =BAM2FASTQ.out
54+
}
7255

73-
workflow {
74-
HLAHD(input_files)
56+
HLAHD(input_fastqs)
7557
}

‎modules/00_bam2fastq.nf

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
process BAM2FASTQ {
2+
cpus params.cpus
3+
memory params.memory
4+
tag "${name}"
5+
publishDir "${params.output}/${name}", mode: "copy", pattern: "*.txt"
6+
module params.bowtie2_module
7+
8+
conda (params.enable_conda ? "bioconda::samtools=1.18 bioconda::gatk4=4.2.5.0" : null)
9+
10+
input:
11+
tuple val(name), val(bam)
12+
val(contigs)
13+
14+
output:
15+
tuple val("${name}"), path("${name}.hla.1.fastq.gz"), path("${name}.hla.2.fastq.gz")
16+
17+
script:
18+
"""
19+
samtools index $bam
20+
21+
# gets reads in the provided regions
22+
# only non duplicated reads
23+
samtools view \
24+
-b \
25+
-L ${contigs} \
26+
-F 1024 \
27+
${bam} > ${name}.mhc.bam
28+
29+
# Extract unmap reads
30+
samtools view -b -f 4 $bam > ${name}.unmap.bam
31+
32+
#Merge bam files
33+
samtools merge -o ${name}.merge.bam ${name}.unmap.bam ${name}.mhc.bam
34+
35+
#Create fastq
36+
gatk SamToFastq --VALIDATION_STRINGENCY SILENT -I ${name}.merge.bam -F ${name}.hlatmp.1.fastq -F2 ${name}.hlatmp.2.fastq
37+
38+
#Change fastq ID
39+
cat ${name}.hlatmp.1.fastq |awk '{if(NR%4 == 1){O=\$0;gsub("/1"," 1",O);print O}else{print \$0}}' | gzip > ${name}.hla.1.fastq.gz
40+
cat ${name}.hlatmp.2.fastq |awk '{if(NR%4 == 1){O=\$0;gsub("/2"," 2",O);print O}else{print \$0}}' | gzip > ${name}.hla.2.fastq.gz
41+
"""
42+
}

‎modules/01_hlahd.nf

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
process HLAHD {
2+
cpus params.cpus
3+
memory params.memory
4+
tag "${name}"
5+
publishDir "${params.output}/${name}", mode: "copy", pattern: "*.txt"
6+
module params.bowtie2_module
7+
8+
input:
9+
tuple val(name), val(fastq1), val(fastq2)
10+
11+
output:
12+
tuple val("${name}"), path("*final*")
13+
14+
script:
15+
"""
16+
mkdir temp
17+
18+
# HLA-HD wants its own binaries and bowtie2 in path
19+
export PATH=${params.hlahd_folder}/bin/:${params.bowtie2_folder}:\$PATH
20+
export LD_LIBRARY_PATH=${params.ld_library_path}
21+
22+
zcat ${fastq1} > input_fastq1.fastq
23+
zcat ${fastq2} > input_fastq2.fastq
24+
25+
# HLA HD does not accept gzipped fastq files, unzip them first
26+
hlahd.sh \
27+
-m ${params.read_length} \
28+
-t ${task.cpus} \
29+
-f ${params.hlahd_folder}/freq_data/ \
30+
input_fastq1.fastq \
31+
input_fastq2.fastq \
32+
${params.hlahd_folder}/HLA_gene.split.txt \
33+
${params.hlahd_folder}/dictionary/ \
34+
${name} \
35+
temp
36+
37+
# moves the final result to base folder
38+
mv temp/${name}/result/* .
39+
40+
# deletes temp folder
41+
rm -rf temp
42+
rm -f input_fastq1.fastq
43+
rm -f input_fastq2.fastq
44+
"""
45+
}

‎nextflow.config

+25-5
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,21 @@ params.bowtie2_folder = "/code/bowtie/2.3.4.3"
1414
params.bowtie2_module = "bioinf/bowtie2/2.3.4.3"
1515
params.ld_library_path = "/usr/local/lib64/"
1616
params.read_length = 50
17+
params.reference = 'hg38'
1718

1819
profiles {
20+
conda {
21+
params.enable_conda = true
22+
}
1923
debug { process.beforeScript = 'echo $HOSTNAME' }
24+
test {
25+
params.cpus = 1
26+
params.memory = "3g"
27+
timeline.enabled = false
28+
report.enabled = false
29+
trace.enabled = false
30+
dag.enabled = false
31+
}
2032
}
2133

2234
// Export this variable to prevent local Python libraries from conflicting with those in the container
@@ -27,7 +39,7 @@ env {
2739
// Capture exit codes from upstream processes when piping
2840
process.shell = ['/bin/bash', '-euo', 'pipefail']
2941

30-
VERSION = '0.1.0'
42+
VERSION = '0.2.0'
3143

3244
manifest {
3345
name = 'TRON-Bioinformatics/tronflow-hlahd'
@@ -40,20 +52,28 @@ manifest {
4052
}
4153

4254
params.help_message = """
43-
TronFlow HLA-HD v${VERSION}
55+
nextflow run tron-bioinformatics/tronflow-hla-hd --help
56+
57+
Launching `main.nf` [intergalactic_shannon] - revision: e707c77d7b
4458
4559
Usage:
4660
nextflow run main.nf --input_files input_files --output output_folder
4761
4862
Input:
49-
* input_files: the path to a tab-separated values file containing in each row the sample name, FASTQ 1 and FASTQ 2
63+
* input_fastqs: the path to a tab-separated values file containing in each row the sample name, FASTQ 1 and FASTQ 2
64+
The input file does not have header!
65+
Example input file:
66+
name1 fastq1.fq.gz fastq2.fq.gz
67+
name2 fastq1.fq.gz fastq2.fq.gz
68+
* input_bams: the path to a tab-separated values file containing in each row the sample name and BAM
5069
The input file does not have header!
5170
Example input file:
52-
name1 fastq1.fq.gz fastq2.fq.gz
53-
name2 fastq1.fq.gz fastq2.fq.gz
71+
name1 name1.bam
72+
name2 name2.bam
5473
* output: output folder where results will be stored
5574
5675
Optional input:
76+
* reference: the reference genome to use (default: hg38, possible values: hg38 or hg19)
5777
* read_length: the read length (default: 50)
5878
* hlahd_folder: the HLA-HD folder (default: /code/hlahd.1.2.0.1)
5979
* bowtie2_folder: the bowtie2 folder (default: /code/bowtie/2.3.4.3)

‎references/contigs_hla_reads_hg19.bed

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
chr6 28477797 33448354
2+
chr6_apd_hap1 1 4622290
3+
chr6_cox_hap2 1 4795371
4+
chr6_dbb_hap3 1 4610396
5+
chr6_mann_hap4 1 4683263
6+
chr6_mcf_hap5 1 4833398
7+
chr6_qbl_hap6 1 4611984
8+
chr6_ssto_hap7 1 4928567

0 commit comments

Comments
 (0)