Make quality trimming and filtering optional #18

Merged · 1 commit · Mar 26, 2024
36 changes: 27 additions & 9 deletions README.md
@@ -124,6 +124,25 @@ nextflow run BCCDC-PHL/downsample-reads \

...will add the file `test_downsampling_summary.csv` to the outdir.

### Quality Trimming & Filtering

By default, input fastq files are run through [fastp](https://github.com/OpenGene/fastp) with its default settings. This means
that [quality filtering](https://github.com/OpenGene/fastp?tab=readme-ov-file#quality-filter) is applied to remove poor-quality
reads, but [quality trimming](https://github.com/OpenGene/fastp?tab=readme-ov-file#per-read-cutting-by-quality-score) is not.

To disable quality filtering, use the `--disable_quality_filtering` flag. To enable quality trimming, use the `--enable_quality_trimming`
flag. For example:

```
nextflow run BCCDC-PHL/downsample-reads \
-profile conda \
--cache ~/.conda/envs \
--samplesheet_input samplesheet.csv \
--disable_quality_filtering \
--enable_quality_trimming \
--outdir </path/to/output_dir>
```

## Output

@@ -179,10 +198,10 @@ In the output directory for each sample, a provenance file will be written with
  nextflow_session_id: ceb7cc4c-644b-47bd-9469-5f3a7658119f
  nextflow_run_name: voluminous_jennings
  analysis_start_time: 2024-03-19T15:23:43.570174-07:00
- input_filename: NC000962_R1.fastq.gz
- filename: NC000962_R1.fastq.gz
  file_type: fastq-input
  sha256: 2793587aeb2b87bece4902183c295213a7943ea178c83f8b5432594d4b2e3b84
- input_filename: NC000962_R2.fastq.gz
- filename: NC000962_R2.fastq.gz
  file_type: fastq-input
  sha256: 336e4c42a60f22738c87eb1291270ab4ddfd918f32fa1fc662421d4f9605ea59
- process_name: fastp
@@ -201,13 +220,12 @@ In the output directory for each sample, a provenance file will be written with
      value: 10
    - parameter: --genome-size
      value: 4.4m
- process_name: fastp
  tools:
  - tool_name: fastp
    tool_version: 0.23.2
    parameters:
    - parameter: --cut_tail
      value: null
- filename: NC000962-downsample-10x_R1.fastq.gz
  file_type: fastq-output
  sha256: 2fe74753d889d1b6f02832a09b10a1cab51b1fb2e16a2af20577277aded07a83
- filename: NC000962-downsample-10x_R2.fastq.gz
  file_type: fastq-output
  sha256: b6041ce11ccad3522b3f0ae4117967839ccad78a90e90f106ac399e2e23a8000
```

If multiple coverage levels are specified for a sample, then multiple provenance files will be created (one for each coverage level).
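
For reference, the fastp provenance entry is now assembled conditionally (see the modules/downsample_reads.nf diff below), so a run with both `--enable_quality_trimming` and `--disable_quality_filtering` should produce an entry along these lines. This is a sketch based on the printf logic in that module; the tool version is taken from the example above:

```
- process_name: fastp
  tools:
  - tool_name: fastp
    tool_version: 0.23.2
    parameters:
    - parameter: --cut_tail
      value: null
    - parameter: --disable_quality_filtering
      value: null
```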
21 changes: 12 additions & 9 deletions main.nf
@@ -2,12 +2,13 @@

nextflow.enable.dsl = 2

include { hash_files } from './modules/hash_files.nf'
include { fastp as fastp_input } from './modules/downsample_reads.nf'
include { downsample } from './modules/downsample_reads.nf'
include { fastp as fastp_output } from './modules/downsample_reads.nf'
include { pipeline_provenance } from './modules/provenance.nf'
include { collect_provenance } from './modules/provenance.nf'
include { hash_files as hash_fastq_input } from './modules/hash_files.nf'
include { hash_files as hash_fastq_output } from './modules/hash_files.nf'
include { fastp as fastp_input } from './modules/downsample_reads.nf'
include { downsample } from './modules/downsample_reads.nf'
include { fastp as fastp_output } from './modules/downsample_reads.nf'
include { pipeline_provenance } from './modules/provenance.nf'
include { collect_provenance } from './modules/provenance.nf'

workflow {

@@ -29,14 +30,16 @@ workflow {

main:

hash_files(ch_fastq.map{ it -> [it[0], it[1]] }.combine(Channel.of("fastq-input")))
hash_fastq_input(ch_fastq.join(ch_coverages).map({ it -> [it[0], it[2], it[1]] }).combine(Channel.of("fastq-input")))

ch_fastp_input = ch_fastq.join(ch_coverages.map({ it -> [it[0], it[2]] }))

fastp_input(ch_fastp_input.combine(Channel.of("original")))

downsample(ch_fastq.join(ch_coverages))

hash_fastq_output(downsample.out.reads.map{ it -> [it[0], it[3], it[1]] }.combine(Channel.of("fastq-output")))

fastp_output(downsample.out.reads)

fastp_input.out.csv.concat(fastp_output.out.csv).map{ it -> it[1] }.collectFile(name: params.collected_outputs_prefix + "_downsampling_summary.csv", storeDir: params.outdir, keepHeader: true, skip: 1, sort: { it -> it[0] })
@@ -50,10 +53,10 @@
ch_provenance = ch_sample_ids_with_coverages
ch_pipeline_provenance = pipeline_provenance(ch_workflow_metadata)
ch_provenance = ch_provenance.combine(ch_pipeline_provenance).map({ it -> [it[0], it[1], [it[2]]] })
ch_provenance = ch_provenance.join(hash_files.out.provenance).map{ it -> [it[0], it[1], it[2] << it[3]] }
ch_provenance = ch_provenance.join(hash_fastq_input.out.provenance, by: [0, 1]).map{ it -> [it[0], it[1], it[2] << it[3]] }
ch_provenance = ch_provenance.join(fastp_input.out.provenance).map{ it -> [it[0], it[1], it[2] << it[4]] }
ch_provenance = ch_provenance.join(downsample.out.provenance, by: [0, 1]).map{ it -> [it[0], it[1], it[2] << it[3]] }
ch_provenance = ch_provenance.join(fastp_output.out.provenance, by: [0, 1]).map{ it -> [it[0], it[1], it[2] << it[3]] }
ch_provenance = ch_provenance.join(hash_fastq_output.out.provenance, by: [0, 1]).map{ it -> [it[0], it[1], it[2] << it[3]] }

collect_provenance(ch_provenance.map{ it -> [it[0], it[1], it[2].minus(null)] })
}
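
Two Nextflow idioms do the heavy lifting in main.nf. `include { hash_files as hash_fastq_input }` imports one process definition under two names, which is needed because a DSL2 process can only be invoked once per name within a workflow. And `join(..., by: [0, 1])` matches channel items on the composite (sample_id, coverage) key, so provenance records for different coverage levels of the same sample never cross. A minimal runnable sketch of the join pattern, with made-up channel contents for illustration:

```
// join_sketch.nf -- illustrative only; sample IDs and file names are hypothetical
nextflow.enable.dsl = 2

workflow {
    // (sample_id, coverage, accumulated_provenance_list)
    ch_provenance = Channel.of(
        ['sampleA', '10', ['pipeline_provenance.yml']],
        ['sampleA', '30', ['pipeline_provenance.yml']]
    )
    // (sample_id, coverage, provenance_file_from_one_process)
    ch_hashes = Channel.of(
        ['sampleA', '10', 'hash_10x_provenance.yml'],
        ['sampleA', '30', 'hash_30x_provenance.yml']
    )
    // by: [0, 1] joins on the (sample_id, coverage) pair;
    // `<<` appends the joined file to that item's provenance list
    ch_provenance
        .join(ch_hashes, by: [0, 1])
        .map{ it -> [it[0], it[1], it[2] << it[3]] }
        .view()
}
```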
41 changes: 30 additions & 11 deletions modules/downsample_reads.nf
@@ -10,31 +10,50 @@ process fastp {
output:
tuple val(sample_id), path("${sample_id}_${target_coverage_filename}_fastp.json"), emit: json
tuple val(sample_id), path("${sample_id}_${target_coverage_filename}_downsampling_summary.csv"), emit: csv
tuple val(sample_id), val(target_coverage), path("${sample_id}_${target_coverage_filename}_fastp_provenance.yml"), emit: provenance
tuple val(sample_id), val(target_coverage), path("${sample_id}_original_fastp_provenance.yml"), emit: provenance, optional: true

script:
if (target_coverage == 'original') {
target_coverage_filename = 'original'
} else {
target_coverage_filename = target_coverage + 'x'
}
if (target_coverage == 'original' && params.enable_quality_trimming) {
quality_trimming = '--cut_tail'
} else {
quality_trimming = ''
}
if (target_coverage == 'original' && params.disable_quality_filtering) {
quality_filtering = '--disable_quality_filtering'
} else {
quality_filtering = ''
}
"""
printf -- "- process_name: fastp\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " tools:\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " - tool_name: fastp\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " tool_version: \$(fastp --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " parameters:\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " - parameter: --cut_tail\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " value: null\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
if [[ "${quality_trimming}" != "" || "${quality_filtering}" != "" ]]; then
printf -- " parameters:\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
fi
if [[ "${quality_trimming}" != "" ]]; then
printf -- " - parameter: --cut_tail\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " value: null\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
fi
if [[ "${quality_filtering}" != "" ]]; then
printf -- " - parameter: --disable_quality_filtering\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " value: null\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
fi

fastp \
-t ${task.cpus} \
-i ${reads[0]} \
-I ${reads[1]} \
--cut_tail \
-o ${sample_id}_R1.trim.fastq.gz \
-O ${sample_id}_R2.trim.fastq.gz \
-j ${sample_id}_${target_coverage_filename}_fastp.json
-t ${task.cpus} \
-i ${reads[0]} \
-I ${reads[1]} \
${quality_trimming} \
${quality_filtering} \
-o ${sample_id}_R1.trim.fastq.gz \
-O ${sample_id}_R2.trim.fastq.gz \
-j ${sample_id}_${target_coverage_filename}_fastp.json

echo "target_coverage" >> coverage_field.csv
echo ${target_coverage} >> coverage_field.csv
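
The technique above builds optional command-line flags as plain Groovy strings before the script body, so a disabled option interpolates to an empty string and the provenance YAML only records parameters that were actually passed. Note that both conditions also require `target_coverage == 'original'`: only the pre-downsampling fastp pass trims or skips filtering, while downsampled reads pass through with fastp defaults. A stripped-down sketch of the flag-building pattern, with the process inputs and outputs simplified for illustration:

```
// fastp_sketch.nf -- simplified illustration of conditional flag construction
process fastp_sketch {
    input:
    tuple val(sample_id), path(reads), val(target_coverage)

    output:
    path("${sample_id}_fastp.json")

    script:
    // disabled options become empty strings and vanish from the command
    def quality_trimming  = (target_coverage == 'original' && params.enable_quality_trimming)   ? '--cut_tail' : ''
    def quality_filtering = (target_coverage == 'original' && params.disable_quality_filtering) ? '--disable_quality_filtering' : ''
    """
    fastp \
        -i ${reads[0]} \
        -I ${reads[1]} \
        ${quality_trimming} \
        ${quality_filtering} \
        -o ${sample_id}_R1.trim.fastq.gz \
        -O ${sample_id}_R2.trim.fastq.gz \
        -j ${sample_id}_fastp.json
    """
}
```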
22 changes: 14 additions & 8 deletions modules/hash_files.nf
@@ -3,20 +3,26 @@ process hash_files {
tag { sample_id + " / " + file_type }

input:
tuple val(sample_id), path(files_to_hash), val(file_type)
tuple val(sample_id), val(coverage), path(files_to_hash), val(file_type)

output:
tuple val(sample_id), path("${sample_id}_${file_type}.sha256.csv"), emit: csv
tuple val(sample_id), path("${sample_id}_${file_type}_provenance.yml"), emit: provenance
tuple val(sample_id), val(coverage), path("${sample_id}_${coverage_filename}_${file_type}.sha256.csv"), emit: csv
tuple val(sample_id), val(coverage), path("${sample_id}_${coverage_filename}_${file_type}_provenance.yml"), emit: provenance

script:
if (coverage == "original") {
coverage_filename = "original"
} else {
coverage_filename = coverage + "x"
}

"""
shasum -a 256 ${files_to_hash} | tr -s ' ' ',' > ${sample_id}_${file_type}.sha256.csv
shasum -a 256 ${files_to_hash} | tr -s ' ' ',' >> ${sample_id}_${coverage_filename}_${file_type}.sha256.csv
while IFS=',' read -r hash filename; do
printf -- "- input_filename: \$filename\\n" >> ${sample_id}_${file_type}_provenance.yml;
printf -- " file_type: ${file_type}\\n" >> ${sample_id}_${file_type}_provenance.yml;
printf -- " sha256: \$hash\\n" >> ${sample_id}_${file_type}_provenance.yml;
done < ${sample_id}_${file_type}.sha256.csv
printf -- "- filename: \$filename\\n" >> ${sample_id}_${coverage_filename}_${file_type}_provenance.yml;
printf -- " file_type: ${file_type}\\n" >> ${sample_id}_${coverage_filename}_${file_type}_provenance.yml;
printf -- " sha256: \$hash\\n" >> ${sample_id}_${coverage_filename}_${file_type}_provenance.yml;
done < ${sample_id}_${coverage_filename}_${file_type}.sha256.csv
"""

}
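
Two shell details in this process are easy to miss. `shasum -a 256` separates hash and filename with two spaces, and `tr -s ' ' ','` squeezes them into a single comma so the output parses as CSV; the `while IFS=',' read -r hash filename` loop then re-reads that CSV to emit one YAML record per hashed file. Also, `\$hash` and `\$filename` are escaped so that bash, not Nextflow, expands them. A condensed sketch of the same pattern, with file names shortened for illustration:

```
// hash_sketch.nf -- condensed illustration of the shasum-to-YAML pattern
process hash_sketch {
    input:
    tuple val(sample_id), path(files_to_hash)

    output:
    path("${sample_id}_provenance.yml")

    script:
    """
    # "<hash>  <file>" becomes "<hash>,<file>"
    shasum -a 256 ${files_to_hash} | tr -s ' ' ',' > hashes.csv
    # one YAML record per file; \$hash and \$filename are bash variables
    while IFS=',' read -r hash filename; do
        printf -- "- filename: \$filename\\n" >> ${sample_id}_provenance.yml
        printf -- "  sha256: \$hash\\n" >> ${sample_id}_provenance.yml
    done < hashes.csv
    """
}
```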
2 changes: 2 additions & 0 deletions nextflow.config
@@ -17,6 +17,8 @@ params {
coverages = 'NO_FILE'
coverage = 30
genome_size = '5m'
enable_quality_trimming = false
disable_quality_filtering = false
collect_outputs = false
collected_outputs_prefix = 'collected'
}
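
Both new params default to `false`, so out of the box the pipeline behaves as the README describes: fastp's own quality filtering stays on and quality trimming stays off. As with any Nextflow param, a bare boolean flag on the command line (`--enable_quality_trimming`) sets the value to `true`; the same override can live in a custom config passed with `-c` (the file name below is hypothetical):

```
// my_overrides.config -- pass with: nextflow run ... -c my_overrides.config
params {
    enable_quality_trimming   = true   // same effect as --enable_quality_trimming
    disable_quality_filtering = true   // same effect as --disable_quality_filtering
}
```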