From 20ecf58d524ac979ae4f5f87b1f1c6c79af27da1 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Tue, 27 Dec 2022 11:00:31 +0100 Subject: [PATCH 01/10] feat!: 237 revive cnvkit #339 --- .../workflows/panel_of_normals/Snakefile | 70 ++++ .../workflows/panel_of_normals/__init__.py | 305 +++++++++++++++--- .../wrappers/cnvkit/access/wrapper.py | 5 + .../wrappers/cnvkit/autobin/environment.yaml | 1 + .../wrappers/cnvkit/autobin/wrapper.py | 60 ++++ .../wrappers/cnvkit/coverage/wrapper.py | 47 ++- .../wrappers/cnvkit/reference/wrapper.py | 54 ++-- .../test_workflows_panel_of_normals.py | 228 ++++++++++++- 8 files changed, 680 insertions(+), 90 deletions(-) create mode 120000 snappy_wrappers/wrappers/cnvkit/autobin/environment.yaml create mode 100644 snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py diff --git a/snappy_pipeline/workflows/panel_of_normals/Snakefile b/snappy_pipeline/workflows/panel_of_normals/Snakefile index c944deec5..3bfe00d32 100644 --- a/snappy_pipeline/workflows/panel_of_normals/Snakefile +++ b/snappy_pipeline/workflows/panel_of_normals/Snakefile @@ -97,3 +97,73 @@ rule panel_of_normals_mutect2_create_panel: **wf.get_log_file("mutect2", "create_panel"), wrapper: wf.wrapper_path("mutect2/create_panel") + + +# Panel of normals (cnvkit) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# Write out the tumor-only results for the normals ---------------------------- + + +rule panel_of_normals_cnvkit_access: + output: + **wf.get_output_files("cnvkit", "access"), + threads: wf.get_resource("cnvkit", "access", "threads") + resources: + time=wf.get_resource("cnvkit", "access", "time"), + memory=wf.get_resource("cnvkit", "access", "memory"), + partition=wf.get_resource("cnvkit", "access", "partition"), + log: + **wf.get_log_file("cnvkit", "access"), + wrapper: + wf.wrapper_path("cnvkit/access") + + +rule panel_of_normals_cnvkit_autobin: + input: + unpack(wf.get_input_files("cnvkit", "autobin")), + output: + **wf.get_output_files("cnvkit", "autobin"), + threads: wf.get_resource("cnvkit", "autobin", "threads") + resources: + time=wf.get_resource("cnvkit", "autobin", "time"), + memory=wf.get_resource("cnvkit", "autobin", "memory"), + partition=wf.get_resource("cnvkit", "autobin", "partition"), + log: + **wf.get_log_file("cnvkit", "autobin"), + wrapper: + wf.wrapper_path("cnvkit/autobin") + + +rule panel_of_normals_cnvkit_coverage: + input: + unpack(wf.get_input_files("cnvkit", "coverage")), + output: + **wf.get_output_files("cnvkit", "coverage"), + threads: wf.get_resource("cnvkit", "coverage", "threads") + resources: + time=wf.get_resource("cnvkit", "coverage", "time"), + memory=wf.get_resource("cnvkit", "coverage", "memory"), + partition=wf.get_resource("cnvkit", "coverage", "partition"), + log: + **wf.get_log_file("cnvkit", "coverage"), + wrapper: + wf.wrapper_path("cnvkit/coverage") + + +# Create the panel of normals ------------------------------------------------- + + +rule panel_of_normals_cnvkit_create_panel: + input: + unpack(wf.get_input_files("cnvkit", "create_panel")), + output: + **wf.get_output_files("cnvkit", "create_panel"), + threads: wf.get_resource("cnvkit", "create_panel", "threads") + resources: + time=wf.get_resource("cnvkit", "create_panel", "time"), + memory=wf.get_resource("cnvkit", "create_panel", "memory"), + partition=wf.get_resource("cnvkit", "create_panel", "partition"), + log: + **wf.get_log_file("cnvkit", "create_panel"), + wrapper: + wf.wrapper_path("cnvkit/reference") diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py b/snappy_pipeline/workflows/panel_of_normals/__init__.py index a06ffa832..ce9a7dfc6 100644 --- a/snappy_pipeline/workflows/panel_of_normals/__init__.py +++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py @@ -52,7 +52,6 @@ import os from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions -from snakemake.io import expand from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( @@ -65,14 +64,8 @@ __author__ = "Manuel Holtgrewe " -#: Extensions of files to create as main payload -EXT_VALUES = (".vcf.gz", ".vcf.gz.tbi", ".vcf.gz.md5", ".vcf.gz.tbi.md5") - -#: Names of the files to create for the extension -EXT_NAMES = ("vcf", "vcf_tbi", "vcf_md5", "vcf_tbi_md5") - #: Names of the tools that might use panel of normals -TOOLS = ["mutect2"] +TOOLS = ["mutect2", "cnvkit"] #: Default configuration for the somatic_variant_calling schema DEFAULT_CONFIG = r""" @@ -81,7 +74,7 @@ panel_of_normals: tools: ['mutect2'] # REQUIRED - available: 'mutect2' path_ngs_mapping: ../ngs_mapping # REQUIRED - ignore_chroms: # patterns of chromosome names to ignore + ignore_chroms: # patterns of chromosome names to ignore # TODO maybe remove? - NC_007605 # herpes virus - hs37d5 # GRCh37 decoy - chrEBV # Eppstein-Barr Virus @@ -108,6 +101,35 @@ job_mult_time: 1 # running time multiplier merge_mult_memory: 1 # memory multiplier for merging merge_mult_time: 1 # running time multiplier for merging + ignore_chroms: # patterns of chromosome names to ignore + - NC_007605 # herpes virus + - hs37d5 # GRCh37 decoy + - chrEBV # Eppstein-Barr Virus + - '*_decoy' # decoy contig + - 'HLA-*' # HLA genes + - 'GL000220.*' # Contig with problematic, repetitive DNA in GRCh37 + cnvkit: + path_target_regions: REQUIRED # Bed files of targetted regions + exclude: [] # [access] Bed file of regions to exclude (mappability, blacklisted, ...) + min_gap_size: 5000 # [access] Minimum gap size between accessible sequence regions + annotate: "" # [autobin] Optional targets annotations + bp_per_bin: 100000 # [autobin] Desired average number of sequencing read bases mapped to each bin + target_min_size: 20 # [autobin] Min size of target bins + target_max_size: 20000 # [autobin] Max size of target bins + antitarget_min_size: 500 # [autobin] Min size of antitarget bins + antitarget_max_size: 500000 # [autobin] Max size of antitarget bins + min_mapq: 0 # [coverage] Mininum mapping quality score to count a read for coverage depth + count: no # [coverage] Alternative couting algorithm + min_cluster_size: 0 # [reference] Minimum cluster size to keep in reference profiles. 0 for no clustering + gender: "" # [reference] Specify the chromosomal sex of all given samples as male or female. Guess when missing + male_reference: no # [reference] Create male reference + gc_correction: yes # [reference] Use GC correction + edge_correction: yes # [reference] Use edge correction + rmask_correction: yes # [reference] Use rmask correction + split_targets: False # Split targets into equal size bins + targets_average_bin_size: 266.67 # Average size of split target bins + antitargets_average_bin_size: 150000 # Average size of split antitarget bins + min_antitargets_bin_size: 0 # Minimum bin size for antitargets (0: computed on the fly) """ @@ -250,6 +272,202 @@ def get_log_file(self, action): yield key, tpls[action].format(tool=self.name, ext=ext) +class CnvkitStepPart(PanelOfNormalsStepPart): + """Somatic variant calling with MuTect 2""" + + #: Step name + name = "cnvkit" + + #: Class available actions + actions = ("access", "autobin", "coverage", "create_panel") + + #: Class resource usage dictionary. Key: action type (string); Value: resource (ResourceUsage). + resource_usage_dict = { + "access": ResourceUsage( + threads=2, + time="02:00:00", # 2 hours + memory="8G", + ), + "autobin": ResourceUsage( + threads=2, + time="02:00:00", # 2 hours + memory="16G", + ), + "coverage": ResourceUsage( + threads=2, + time="02:00:00", # 2 hours + memory="16G", + ), + "create_panel": ResourceUsage( + threads=2, + time="02:00:00", # 2 hours + memory="16G", + ), + } + + def __init__(self, parent): + super().__init__(parent) + if self.config["mutect2"]["path_normals_list"]: + self.normal_libraries = [] + with open(self.config["mutect2"]["path_normals_list"], "rt") as f: + for line in f: + self.normal_libraries.append(line.strip()) + + def check_config(self): + if self.name not in self.config["tools"]: + return # cnvkit not enabled, skip + self.parent.ensure_w_config( + ("static_data_config", "reference", "path"), + "Path to reference FASTA not configured but required for %s" % (self.name,), + ) + + def get_resource_usage(self, action): + """Get Resource Usage + + :param action: Action (i.e., step) in the workflow, example: 'run'. + :type action: str + + :return: Returns ResourceUsage for step. + """ + # Validate action + self._validate_action(action) + return self.resource_usage_dict.get(action) + + def get_input_files(self, action): + """Return input files for mutect2 variant calling""" + # Validate action + self._validate_action(action) + mapping = { + "access": None, + "autobin": self._get_input_files_autobin, + "coverage": self._get_input_files_coverage, + "create_panel": self._get_input_files_create_panel, + } + return mapping[action] + + def _get_input_files_autobin(self, wildcards): + """Helper wrapper function for creating targets & anttargets files""" + ngs_mapping = self.parent.sub_workflows["ngs_mapping"] + tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" + bams = [ + ngs_mapping(tpl.format(mapper=wildcards["mapper"], normal_library=x)) + for x in self.normal_libraries + ] + bais = [x + ".bai" for x in bams] + return { + "bams": bams, + "bais": bais, + "access": "work/cnvkit.access/out/cnvkit.access.bed", + } + + def _get_input_files_coverage(self, wildcards): + """Helper wrapper function for computing coverage""" + ngs_mapping = self.parent.sub_workflows["ngs_mapping"] + tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" + bam = ngs_mapping(tpl.format(**wildcards)) + return { + "target": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.target.bed".format( + **wildcards + ), + "antitarget": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.antitarget.bed".format( + **wildcards + ), + "bam": bam, + "bai": bam + ".bai", + } + + def _get_input_files_create_panel(self, wildcards): + """Helper wrapper function for computing panel of normals""" + normal_libraries = list(self.normal_libraries) + tpl = ( + "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.target.cnn" + ) + targets = [ + tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in normal_libraries + ] + tpl = "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.antitarget.cnn" + antitargets = [ + tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in normal_libraries + ] + return { + "target": targets, + "antitarget": antitargets, + } + + def get_output_files(self, action): + """Return panel of normal files""" + if action == "access": + return self._get_output_files_access() + elif action == "autobin": + return self._get_output_files_autobin() + elif action == "coverage": + return self._get_output_files_coverage() + elif action == "create_panel": + return self._get_output_files_create_panel() + else: + self._validate_action(action) + + def _get_output_files_access(self): + return { + "access": "work/cnvkit.access/out/cnvkit.access.bed", + "access_md5": "work/cnvkit.access/out/cnvkit.access.bed.md5", + } + + def _get_output_files_autobin(self): + return { + "target": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.target.bed", + "target_md5": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.target.bed.md5", + "antitarget": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.antitarget.bed", + "antitarget_md5": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.antitarget.bed.md5", + } + + def _get_output_files_coverage(self): + return { + "target": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.target.cnn", + "target_md5": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.target.cnn.md5", + "antitarget": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.antitarget.cnn", + "antitarget_md5": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.antitarget.cnn.md5", + } + + def _get_output_files_create_panel(self): + return { + "panel": "work/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.cnn", + "panel_md5": "work/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", + } + + @dictify + def get_log_file(self, action): + """Return panel of normal files""" + ext_dict = { + "conda_list": "conda_list.txt", + "conda_list_md5": "conda_list.txt.md5", + "conda_info": "conda_info.txt", + "conda_info_md5": "conda_info.txt.md5", + "log": "log", + "log_md5": "log.md5", + } + if action == "access": + tpl = "work/cnvkit.access/log/cnvkit.access.{ext}" + for key, ext in ext_dict.items(): + print("DEBUG- log for access: {}".format(tpl.format(ext=ext))) + yield key, tpl.format(ext=ext) + elif action == "autobin": + tpl = "work/{{mapper}}.cnvkit.autobin/log/{{mapper}}.cnvkit.autobin.{ext}" + for key, ext in ext_dict.items(): + print("DEBUG- log for autobin: {}".format(tpl.format(ext=ext))) + yield key, tpl.format(ext=ext) + elif action == "coverage": + tpl = "work/{{mapper}}.cnvkit.coverage/log/{{mapper}}.cnvkit.coverage.{{normal_library}}.{ext}" + for key, ext in ext_dict.items(): + yield key, tpl.format(ext=ext) + elif action == "create_panel": + tpl = "work/{{mapper}}.cnvkit.create_panel/log/{{mapper}}.cnvkit.panel_of_normals.{ext}" + for key, ext in ext_dict.items(): + yield key, tpl.format(ext=ext) + else: + self._validate_action(action) + + class PanelOfNormalsWorkflow(BaseStep): """Creates a panel of normals""" @@ -281,6 +499,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) self.register_sub_step_classes( ( Mutect2StepPart, + CnvkitStepPart, LinkOutStepPart, ) ) @@ -293,45 +512,39 @@ def get_result_files(self): We will process all NGS libraries of all bio samples in all sample sheets. """ - # Panel of normals - files = [] - - # yield from expand( - files.extend( - expand( - os.path.join( - "output", - "{mapper}.{caller}.create_panel", - "out", - "{mapper}.{caller}.panel_of_normals" + "{ext}", - ), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], - caller=set(self.config["tools"]) & set(TOOLS), - ext=EXT_VALUES, - ) + ext_list = { + "mutect2": ["vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5"], + "cnvkit": ["cnn", "cnn.md5"], + } + tpl = os.path.join( + "output", + "{mapper}.{caller}.create_panel", + "out", + "{mapper}.{caller}.panel_of_normals.{ext}", ) - # yield from expand( - files.extend( - expand( - os.path.join( - "output", - "{mapper}.{caller}.create_panel", - "log", - "{mapper}.{caller}.panel_of_normals" + "{ext}", - ), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], - caller=set(self.config["tools"]) & set(TOOLS), - ext=( - ".log", - ".log.md5", - ".conda_info.txt", - ".conda_info.txt.md5", - ".conda_list.txt", - ".conda_list.txt.md5", - ), - ) + for mapper in self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"]: + for caller in set(self.config["tools"]) & set(TOOLS): + for ext in ext_list[caller]: + yield tpl.format(mapper=mapper, caller=caller, ext=ext) + + ext_list = [ + "log", + "log.md5", + "conda_list.txt", + "conda_list.txt.md5", + "conda_info.txt", + "conda_info.txt.md5", + ] + tpl = os.path.join( + "output", + "{mapper}.{caller}.create_panel", + "log", + "{mapper}.{caller}.panel_of_normals.{ext}", ) - return files + for mapper in self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"]: + for caller in set(self.config["tools"]) & set(TOOLS): + for ext in ext_list: + yield tpl.format(mapper=mapper, caller=caller, ext=ext) def check_config(self): """Check that the path to the NGS mapping is present""" diff --git a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py index 522beecc4..ebf471400 100644 --- a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py @@ -7,6 +7,10 @@ __author__ = "Manuel Holtgrewe" __email__ = "manuel.holtgrewe@bih-charite.de" +config = snakemake.config["step_config"][snakemake.config["pipeline_step"]["name"]]["cnvkit"] + +exclude = " -- exclude " + " -x ".join(config["exclude"]) if config["exclude"] else "" + shell( r""" # Also pipe everything to log file @@ -30,6 +34,7 @@ cnvkit.py access \ -o {snakemake.output} \ + --min-gap-size config[min_gap_size] {exclude} \ {snakemake.config[static_data_config][reference][path]} diff --git a/snappy_wrappers/wrappers/cnvkit/autobin/environment.yaml b/snappy_wrappers/wrappers/cnvkit/autobin/environment.yaml new file mode 120000 index 000000000..2e107ac86 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/autobin/environment.yaml @@ -0,0 +1 @@ +../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py new file mode 100644 index 000000000..ce9c638df --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +"""Wrapper vor cnvkit.py genome2access +""" + +from snakemake.shell import shell + +__author__ = "Manuel Holtgrewe" +__email__ = "manuel.holtgrewe@bih-charite.de" + +shell( + r""" +# Also pipe everything to log file +if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec &> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi +fi + +# Write out information about conda installation. +conda list >{snakemake.log.conda_list} +conda info >{snakemake.log.conda_info} +md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} +md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} + +# Function definitions --------------------------------------------------------- +md5() +{{ + set -x + + fn=$1 + f=$(basename $fn) + d=$(dirname $fn) + pushd $d + md5sum $f > $f.md5 + popd +}} + +# ----------------------------------------------------------------------------- + +cnvkit.py autobin --method "hybrid" \ + --fasta {snakemake.config[static_data_config][reference][path] \ + --access {snakemake.input.access} \ + --targets {config[targets]} \ + --bp-per-bin {config[bp_per_bin]} \ + --target-min-size {config[target_min_size]} --target-max-size {config[target_max_size]} \ + --antitarget-min-size {config[antitarget_min_size]} --antitarget-max-size {config[antitarget_max_size]} \ + $(if [[ -n "{config[annotate]}" ]] ; then \ + echo --annotate {config[annotate]} --short-names + fi) \ + --target-output-bed {snakemake.output.target} --antitarger-output-bed {snakemake.output.antitarget} \ + {snakemake.input.bams} + +md5 {snakemake.output.target} +md5 {snakemake.output.antitarget} +""" +) diff --git a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py index fc294b99b..de170ca30 100644 --- a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py @@ -7,6 +7,9 @@ __author__ = "Manuel Holtgrewe" __email__ = "manuel.holtgrewe@bih-charite.de" +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step]["cnvkit"] + shell( r""" # Also pipe everything to log file @@ -26,17 +29,43 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +# Function definitions --------------------------------------------------------- + +coverage() +{{ + cnvkit.py coverage \ + --fasta {snakemake.config[static_data_config][reference][path]} \ + --min-mapq {config[min_mapq]} \ + --processes {snakemake.threads} \ + {snakemake.input.bam} \ + --output $2 $1 +}} + +md5() +{{ + set -x + + fn=$1 + f=$(basename $fn) + d=$(dirname $fn) + pushd $d + md5sum $f > $f.md5 + popd +}} + # ----------------------------------------------------------------------------- -cnvkit.py coverage {snakemake.input.bam} {snakemake.input.target} -o {snakemake.output.target} -cnvkit.py coverage {snakemake.input.bam} {snakemake.input.antitarget} -o {snakemake.output.antitarget} +coverage {snakemake.input.target} {snakemake.output.target} +md5 {snakemake.output.target} -d=$(dirname "{snakemake.output.target}") -pushd $d -fn=$(basename "{snakemake.output.target}") -md5sum $fn > $fn.md5 -fn=$(basename "{snakemake.output.antitarget}") -md5sum $fn > $fn.md5 -popd +coverage {snakemake.input.antitarget} {snakemake.output.antitarget} +md5 {snakemake.output.antitarget} +""" +) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} """ ) diff --git a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py index 9ee0f97c8..2580dcf33 100644 --- a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py @@ -10,6 +10,17 @@ step = snakemake.config["pipeline_step"]["name"] config = snakemake.config["step_config"][step]["cnvkit"] +cluster = ( + " --cluster --min-cluster-size {}".format(config["min_cluster_size"]) + if config["min_cluster_size"] > 0 + else "" +) +gender = " --gender {}".format(config["gender"]) if config["gender"] else "" +male = " --male-reference" if config["male_reference"] else "" +no_gc = " --no-gc" if not config["gc_correction"] else "" +no_edge = " --no-edge" if not config["edge_correction"] else "" +no_rmask = " --no-rmask" if not config["rmask_correction"] else "" + shell( r""" # Also pipe everything to log file @@ -31,38 +42,23 @@ # ----------------------------------------------------------------------------- -if [[ "{config[normals_reference]}" == "" ]] -then - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" EXIT - mkdir $TMPDIR/out - - cnvkit.py coverage \ - -o $TMPDIR/out/target.cnn \ - {snakemake.input.bam} \ - {snakemake.input.target} - - cnvkit.py coverage \ - -o $TMPDIR/out/antitarget.cnn \ - {snakemake.input.bam} \ - {snakemake.input.antitarget} - - cnvkit.py reference \ - -o {snakemake.output} \ - -f {snakemake.config[static_data_config][reference][path]} \ - -t $TMPDIR/out/target.cnn \ - -a $TMPDIR/out/antitarget.cnn - - rm -rf $TMPDIR -else - r=$(realpath --relative-to=$d {config[normals_reference]}) - ln -s $r {snakemake.output} -fi +cnvkit reference \ + --output {snakemake.output.panel} \ + --fasta {snakemake.config[static_data_config][reference][path]} \ + {cluster} {gender} {male} {no_gc} {no_edge} {no_rmask} \ + {snakemake.input.target} {snakemake.input.antitarget} -fn=$(basename "{snakemake.output}") -d=$(dirname "{snakemake.output}") +fn=$(basename "{snakemake.output.panel}") +d=$(dirname "{snakemake.output.panel}") pushd $d md5sum $fn > $fn.md5 popd """ ) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py index ebb50ebe0..a5a92b925 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py +++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py @@ -38,9 +38,14 @@ def minimal_config(): path_index: /path/to/bwa/index.fa panel_of_normals: - tools: ['mutect2'] + tools: ['mutect2', 'cnvkit'] mutect2: germline_resource: /path/to/germline_resource.vcf + path_normals_list: "" + cnvkit: + path_excluded_regions: "" + path_target_regions: /path/to/target_regions.bed + path_normals_list: "" data_sets: first_batch: @@ -119,8 +124,6 @@ def test_mutect2_step_part_get_input_files_create_panel(panel_of_normals_workflo def test_mutect2_step_part_get_output_files_prepare_panel(panel_of_normals_workflow): """Tests Mutect2StepPart._get_output_files_prepare_panel()""" - # TODO: Potential extension error in output files, `vcf.tbi.gz` instead of `vcf.gz.tbi`. - # TODO: If confirmed, create expected using `get_expected_output_vcf_files_dict()` expected = { "vcf": "work/{mapper}.mutect2.prepare_panel/out/{normal_library}.vcf.gz", "vcf_md5": "work/{mapper}.mutect2.prepare_panel/out/{normal_library}.vcf.gz.md5", @@ -170,7 +173,6 @@ def test_mutect2_step_part_get_resource_usage(panel_of_normals_workflow): "memory": "8G", "partition": "medium", } - run_expected_dict = {"threads": 1, "time": "01:00:00", "memory": "2G", "partition": "medium"} # Evaluate action `create_panel` for resource, expected in create_panel_expected_dict.items(): @@ -185,19 +187,214 @@ def test_mutect2_step_part_get_resource_usage(panel_of_normals_workflow): assert actual == expected, msg_error +# Tests for CnvkitStepPart ------------------------------------------------------------------------ + + +def test_cnvkit_step_part_get_input_files_autobin(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_autobin()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + expected = { + "access": "work/cnvkit.access/out/cnvkit.access.bed", + "bams": [ + "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", + "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam", + ], + "bais": [ + "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", + "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam.bai", + ], + } + actual = panel_of_normals_workflow.get_input_files("cnvkit", "autobin")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_coverage(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_coverage()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "normal_library": "P001-N1-DNA1-WGS1", + } + ) + expected = { + "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", + "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", + "target": "work/bwa.cnvkit.autobin/out/bwa.cnvkit.autobin.target.bed", + "antitarget": "work/bwa.cnvkit.autobin/out/bwa.cnvkit.autobin.antitarget.bed", + } + actual = panel_of_normals_workflow.get_input_files("cnvkit", "coverage")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_create_panel(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_input_files_create_panel()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + expected = { + "target": [ + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P001-N1-DNA1-WGS1.target.cnn", + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P002-N1-DNA1-WGS1.target.cnn", + ], + "antitarget": [ + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P001-N1-DNA1-WGS1.antitarget.cnn", + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P002-N1-DNA1-WGS1.antitarget.cnn", + ], + } + actual = panel_of_normals_workflow.get_input_files("cnvkit", "create_panel")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_output_files_access(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_output_files_access()""" + expected = { + "access": "work/cnvkit.access/out/cnvkit.access.bed", + "access_md5": "work/cnvkit.access/out/cnvkit.access.bed.md5", + } + actual = panel_of_normals_workflow.get_output_files("cnvkit", "access") + assert actual == expected + + +def test_cnvkit_step_part_get_output_files_autobin(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_output_files_autobin()""" + expected = { + "target": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.target.bed", + "target_md5": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.target.bed.md5", + "antitarget": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.antitarget.bed", + "antitarget_md5": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.antitarget.bed.md5", + } + actual = panel_of_normals_workflow.get_output_files("cnvkit", "autobin") + assert actual == expected + + +def test_cnvkit_step_part_get_output_files_coverage(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_output_files_coverage()""" + expected = { + "target": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.target.cnn", + "target_md5": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.target.cnn.md5", + "antitarget": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.antitarget.cnn", + "antitarget_md5": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.antitarget.cnn.md5", + } + actual = panel_of_normals_workflow.get_output_files("cnvkit", "coverage") + assert actual == expected + + +def test_cnvkit_step_part_get_output_files_create_panel(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_output_files_create_panel()""" + base_name_out = "work/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals" + expected = { + "panel": "work/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.cnn", + "panel_md5": "work/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", + } + actual = panel_of_normals_workflow.get_output_files("cnvkit", "create_panel") + assert actual == expected + + +def test_cnvkit_step_part_get_log_file_access(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_log_files_access()""" + base_name_out = "work/cnvkit.access/log/cnvkit.access" + expected = get_expected_log_files_dict(base_out=base_name_out) + actual = panel_of_normals_workflow.get_log_file("cnvkit", "access") + assert actual == expected + + +def test_cnvkit_step_part_get_log_file_autobin(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_log_files_autobin()""" + base_name_out = "work/{mapper}.cnvkit.autobin/log/{mapper}.cnvkit.autobin" + expected = get_expected_log_files_dict(base_out=base_name_out) + actual = panel_of_normals_workflow.get_log_file("cnvkit", "autobin") + assert actual == expected + + +def test_cnvkit_step_part_get_log_file_coverage(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_log_files_coverage()""" + base_name_out = "work/{mapper}.cnvkit.coverage/log/{mapper}.cnvkit.coverage.{normal_library}" + expected = get_expected_log_files_dict(base_out=base_name_out) + actual = panel_of_normals_workflow.get_log_file("cnvkit", "coverage") + assert actual == expected + + +def test_cnvkit_step_part_get_log_file_create_panel(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_log_files_create_panel()""" + base_name_out = "work/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.panel_of_normals" + expected = get_expected_log_files_dict(base_out=base_name_out) + actual = panel_of_normals_workflow.get_log_file("cnvkit", "create_panel") + assert actual == expected + + +def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): + """Tests CvnkitStepPart.get_resource_usage()""" + # Define expected: default defined workflow.abstract + access_expected_dict = { + "threads": 2, + "time": "02:00:00", + "memory": "8G", + "partition": "medium", + } + autobin_expected_dict = { + "threads": 2, + "time": "02:00:00", + "memory": "16G", + "partition": "medium", + } + coverage_expected_dict = { + "threads": 2, + "time": "02:00:00", + "memory": "16G", + "partition": "medium", + } + reference_expected_dict = { + "threads": 2, + "time": "02:00:00", + "memory": "16G", + "partition": "medium", + } + + # Evaluate action `access` + for resource, expected in access_expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}' for action 'access'." + actual = panel_of_normals_workflow.get_resource("cnvkit", "access", resource) + assert actual == expected, msg_error + + # Evaluate action `autobin` + for resource, expected in autobin_expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}' for action 'autobin'." + actual = panel_of_normals_workflow.get_resource("cnvkit", "autobin", resource) + assert actual == expected, msg_error + + # Evaluate action `coverage` + for resource, expected in coverage_expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}' for action 'coverage'." + actual = panel_of_normals_workflow.get_resource("cnvkit", "coverage", resource) + assert actual == expected, msg_error + + # Evaluate action `create_panel` + for resource, expected in reference_expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}' for action 'create_panel'." + actual = panel_of_normals_workflow.get_resource("cnvkit", "create_panel", resource) + assert actual == expected, msg_error + + # PanelOfNormalsWorkflow -------------------------------------------------------------------------- def test_panel_of_normals_workflow(panel_of_normals_workflow): """Test simple functionality of the workflow""" # Check created sub steps - expected = ["link_out", "mutect2"] + expected = ["cnvkit", "link_out", "mutect2"] actual = list(sorted(panel_of_normals_workflow.sub_steps.keys())) assert actual == expected # Check result file construction + expected = [] tpl = "output/{mapper}.mutect2.create_panel/out/{mapper}.mutect2.panel_of_normals.{ext}" - expected = [ + expected += [ tpl.format(mapper=mapper, ext=ext) for ext in ("vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5") for mapper in ("bwa",) @@ -216,6 +413,25 @@ def test_panel_of_normals_workflow(panel_of_normals_workflow): ) for mapper in ("bwa",) ] + # Now for basic cnvkit files (panel of normal only) + tpl = "output/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.{ext}" + expected += [ + tpl.format(mapper=mapper, ext=ext) for ext in ("cnn", "cnn.md5") for mapper in ("bwa",) + ] + # add log files + tpl = "output/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.panel_of_normals.{ext}" + expected += [ + tpl.format(mapper=mapper, ext=ext) + for ext in ( + "conda_info.txt", + "conda_list.txt", + "log", + "conda_info.txt.md5", + "conda_list.txt.md5", + "log.md5", + ) + for mapper in ("bwa",) + ] expected = list(sorted(expected)) actual = list(sorted(panel_of_normals_workflow.get_result_files())) assert actual == expected From b9ddd9253dd5a4cc0763c1eb8b4cadbbee62b00f Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Thu, 29 Dec 2022 16:10:19 +0100 Subject: [PATCH 02/10] Added list of samples, symlink to access file, access, target & antitarget in output, fixed typos --- .../workflows/panel_of_normals/__init__.py | 26 +++++++++++-- .../wrappers/cnvkit/access/wrapper.py | 18 +++++---- .../wrappers/cnvkit/autobin/wrapper.py | 8 ++-- .../wrappers/cnvkit/environment.yaml | 2 +- .../wrappers/cnvkit/reference/wrapper.py | 2 +- .../test_workflows_panel_of_normals.py | 38 ++++++++----------- 6 files changed, 55 insertions(+), 39 deletions(-) diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py b/snappy_pipeline/workflows/panel_of_normals/__init__.py index ce9a7dfc6..ca8f616ed 100644 --- a/snappy_pipeline/workflows/panel_of_normals/__init__.py +++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py @@ -109,7 +109,9 @@ - 'HLA-*' # HLA genes - 'GL000220.*' # Contig with problematic, repetitive DNA in GRCh37 cnvkit: + path_normals_list: "" # Optional file listing libraries to include in panel path_target_regions: REQUIRED # Bed files of targetted regions + access: "" # Access bed file (when absent, created by cnvkit.py access exclude: [] # [access] Bed file of regions to exclude (mappability, blacklisted, ...) min_gap_size: 5000 # [access] Minimum gap size between accessible sequence regions annotate: "" # [autobin] Optional targets annotations @@ -307,9 +309,9 @@ class CnvkitStepPart(PanelOfNormalsStepPart): def __init__(self, parent): super().__init__(parent) - if self.config["mutect2"]["path_normals_list"]: + if self.config["cnvkit"]["path_normals_list"]: self.normal_libraries = [] - with open(self.config["mutect2"]["path_normals_list"], "rt") as f: + with open(self.config["cnvkit"]["path_normals_list"], "rt") as f: for line in f: self.normal_libraries.append(line.strip()) @@ -449,12 +451,10 @@ def get_log_file(self, action): if action == "access": tpl = "work/cnvkit.access/log/cnvkit.access.{ext}" for key, ext in ext_dict.items(): - print("DEBUG- log for access: {}".format(tpl.format(ext=ext))) yield key, tpl.format(ext=ext) elif action == "autobin": tpl = "work/{{mapper}}.cnvkit.autobin/log/{{mapper}}.cnvkit.autobin.{ext}" for key, ext in ext_dict.items(): - print("DEBUG- log for autobin: {}".format(tpl.format(ext=ext))) yield key, tpl.format(ext=ext) elif action == "coverage": tpl = "work/{{mapper}}.cnvkit.coverage/log/{{mapper}}.cnvkit.coverage.{{normal_library}}.{ext}" @@ -527,6 +527,15 @@ def get_result_files(self): for ext in ext_list[caller]: yield tpl.format(mapper=mapper, caller=caller, ext=ext) + if "cnvkit" in set(self.config["tools"]) & set(TOOLS): + tpl = "output/cnvkit.access/out/cnvkit.access.{ext}" + for ext in ("bed", "bed.md5"): + yield tpl.format(ext=ext) + tpl = "output/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.{ext}" + for mapper in self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"]: + for ext in ("target.bed", "target.bed.md5", "antitarget.bed", "antitarget.bed.md5"): + yield tpl.format(mapper=mapper, ext=ext) + ext_list = [ "log", "log.md5", @@ -546,6 +555,15 @@ def get_result_files(self): for ext in ext_list: yield tpl.format(mapper=mapper, caller=caller, ext=ext) + if "cnvkit" in set(self.config["tools"]) & set(TOOLS): + tpl = "output/cnvkit.access/log/cnvkit.access.{ext}" + for ext in ext_list: + yield tpl.format(ext=ext) + tpl = "output/{mapper}.cnvkit.autobin/log/{mapper}.cnvkit.autobin.{ext}" + for mapper in self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"]: + for ext in ext_list: + yield tpl.format(mapper=mapper, ext=ext) + def check_config(self): """Check that the path to the NGS mapping is present""" self.ensure_w_config( diff --git a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py index ebf471400..6c0de112c 100644 --- a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py @@ -32,14 +32,18 @@ # ----------------------------------------------------------------------------- -cnvkit.py access \ - -o {snakemake.output} \ - --min-gap-size config[min_gap_size] {exclude} \ - {snakemake.config[static_data_config][reference][path]} - +if [[ -n "{config[access]}" ]] +then + ln -sr {config[access]} {snakemake.output.access} +else + cnvkit.py access \ + -o {snakemake.output.access} \ + --min-gap-size {config[min_gap_size]} {exclude} \ + {snakemake.config[static_data_config][reference][path]} +fi -fn=$(basename "{snakemake.output}") -d=$(dirname "{snakemake.output}") +fn=$(basename "{snakemake.output.access}") +d=$(dirname "{snakemake.output.access}") pushd $d md5sum $fn > $fn.md5 popd diff --git a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py index ce9c638df..198a007d8 100644 --- a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py @@ -7,6 +7,8 @@ __author__ = "Manuel Holtgrewe" __email__ = "manuel.holtgrewe@bih-charite.de" +config = snakemake.config["step_config"][snakemake.config["pipeline_step"]["name"]]["cnvkit"] + shell( r""" # Also pipe everything to log file @@ -42,16 +44,16 @@ # ----------------------------------------------------------------------------- cnvkit.py autobin --method "hybrid" \ - --fasta {snakemake.config[static_data_config][reference][path] \ + --fasta {snakemake.config[static_data_config][reference][path]} \ --access {snakemake.input.access} \ - --targets {config[targets]} \ + --targets {config[path_target_regions]} \ --bp-per-bin {config[bp_per_bin]} \ --target-min-size {config[target_min_size]} --target-max-size {config[target_max_size]} \ --antitarget-min-size {config[antitarget_min_size]} --antitarget-max-size {config[antitarget_max_size]} \ $(if [[ -n "{config[annotate]}" ]] ; then \ echo --annotate {config[annotate]} --short-names fi) \ - --target-output-bed {snakemake.output.target} --antitarger-output-bed {snakemake.output.antitarget} \ + --target-output-bed {snakemake.output.target} --antitarget-output-bed {snakemake.output.antitarget} \ {snakemake.input.bams} md5 {snakemake.output.target} diff --git a/snappy_wrappers/wrappers/cnvkit/environment.yaml b/snappy_wrappers/wrappers/cnvkit/environment.yaml index 3e2333136..5b6fd5393 100644 --- a/snappy_wrappers/wrappers/cnvkit/environment.yaml +++ b/snappy_wrappers/wrappers/cnvkit/environment.yaml @@ -3,6 +3,6 @@ channels: - bioconda - r dependencies: -- python =3.6 +- python >3.7 - cnvkit - htslib diff --git a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py index 2580dcf33..f984daaee 100644 --- a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py @@ -42,7 +42,7 @@ # ----------------------------------------------------------------------------- -cnvkit reference \ +cnvkit.py reference \ --output {snakemake.output.panel} \ --fasta {snakemake.config[static_data_config][reference][path]} \ {cluster} {gender} {male} {no_gc} {no_edge} {no_rmask} \ diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py index a5a92b925..fd643d45b 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py +++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py @@ -400,38 +400,30 @@ def test_panel_of_normals_workflow(panel_of_normals_workflow): for mapper in ("bwa",) ] # add log files - tpl = "output/{mapper}.mutect2.create_panel/log/{mapper}.mutect2.panel_of_normals.{ext}" - expected += [ - tpl.format(mapper=mapper, ext=ext) - for ext in ( - "conda_info.txt", - "conda_list.txt", - "log", - "conda_info.txt.md5", - "conda_list.txt.md5", - "log.md5", - ) - for mapper in ("bwa",) - ] + tpl = "output/{mapper}.mutect2.create_panel/log/{mapper}.mutect2.panel_of_normals" + for mapper in ("bwa",): + expected += get_expected_log_files_dict(tpl.format(mapper=mapper)).values() # Now for basic cnvkit files (panel of normal only) tpl = "output/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.{ext}" expected += [ tpl.format(mapper=mapper, ext=ext) for ext in ("cnn", "cnn.md5") for mapper in ("bwa",) ] - # add log files - tpl = "output/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.panel_of_normals.{ext}" + tpl = "output/cnvkit.access/out/cnvkit.access.{ext}" + expected += [tpl.format(ext=ext) for ext in ("bed", "bed.md5")] + tpl = "output/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.{ext}" expected += [ tpl.format(mapper=mapper, ext=ext) - for ext in ( - "conda_info.txt", - "conda_list.txt", - "log", - "conda_info.txt.md5", - "conda_list.txt.md5", - "log.md5", - ) + for ext in ("target.bed", "target.bed.md5", "antitarget.bed", "antitarget.bed.md5") for mapper in ("bwa",) ] + # add log files + tpl = "output/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.panel_of_normals" + for mapper in ("bwa",): + expected += get_expected_log_files_dict(tpl.format(mapper=mapper)).values() + expected += get_expected_log_files_dict("output/cnvkit.access/log/cnvkit.access").values() + tpl = "output/{mapper}.cnvkit.autobin/log/{mapper}.cnvkit.autobin" + for mapper in ("bwa",): + expected += get_expected_log_files_dict(tpl.format(mapper=mapper)).values() expected = list(sorted(expected)) actual = list(sorted(panel_of_normals_workflow.get_result_files())) assert actual == expected From 8f6b5ff286a63f3e669b1ba88c9182ca03bbdc8f Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 2 Jan 2023 12:05:59 +0100 Subject: [PATCH 03/10] Discard autobin for hybrid method, add sex & metrics --- .../workflows/panel_of_normals/Snakefile | 82 +++++- .../workflows/panel_of_normals/__init__.py | 266 +++++++++++------ .../wrappers/cnvkit/metrics/environment.yaml | 1 + .../wrappers/cnvkit/metrics/wrapper.py | 62 ++++ .../wrappers/cnvkit/sex/environment.yaml | 1 + .../wrappers/cnvkit/sex/wrapper.py | 56 ++++ .../test_workflows_panel_of_normals.py | 269 +++++++++++++++--- 7 files changed, 611 insertions(+), 126 deletions(-) create mode 120000 snappy_wrappers/wrappers/cnvkit/metrics/environment.yaml create mode 100644 snappy_wrappers/wrappers/cnvkit/metrics/wrapper.py create mode 120000 snappy_wrappers/wrappers/cnvkit/sex/environment.yaml create mode 100644 snappy_wrappers/wrappers/cnvkit/sex/wrapper.py diff --git a/snappy_pipeline/workflows/panel_of_normals/Snakefile b/snappy_pipeline/workflows/panel_of_normals/Snakefile index 3bfe00d32..8f783150b 100644 --- a/snappy_pipeline/workflows/panel_of_normals/Snakefile +++ b/snappy_pipeline/workflows/panel_of_normals/Snakefile @@ -118,20 +118,52 @@ rule panel_of_normals_cnvkit_access: wf.wrapper_path("cnvkit/access") -rule panel_of_normals_cnvkit_autobin: +# rule panel_of_normals_cnvkit_autobin: +# input: +# unpack(wf.get_input_files("cnvkit", "autobin")), +# output: +# **wf.get_output_files("cnvkit", "autobin"), +# threads: wf.get_resource("cnvkit", "autobin", "threads") +# resources: +# time=wf.get_resource("cnvkit", "autobin", "time"), +# memory=wf.get_resource("cnvkit", "autobin", "memory"), +# partition=wf.get_resource("cnvkit", "autobin", "partition"), +# log: +# **wf.get_log_file("cnvkit", "autobin"), +# wrapper: +# wf.wrapper_path("cnvkit/autobin") + + +rule panel_of_normals_cnvkit_target: input: - unpack(wf.get_input_files("cnvkit", "autobin")), + unpack(wf.get_input_files("cnvkit", "target")), output: - **wf.get_output_files("cnvkit", "autobin"), - threads: wf.get_resource("cnvkit", "autobin", "threads") + **wf.get_output_files("cnvkit", "target"), + threads: wf.get_resource("cnvkit", "target", "threads") resources: - time=wf.get_resource("cnvkit", "autobin", "time"), - memory=wf.get_resource("cnvkit", "autobin", "memory"), - partition=wf.get_resource("cnvkit", "autobin", "partition"), + time=wf.get_resource("cnvkit", "target", "time"), + memory=wf.get_resource("cnvkit", "target", "memory"), + partition=wf.get_resource("cnvkit", "target", "partition"), log: - **wf.get_log_file("cnvkit", "autobin"), + **wf.get_log_file("cnvkit", "target"), wrapper: - wf.wrapper_path("cnvkit/autobin") + wf.wrapper_path("cnvkit/target") + + +rule panel_of_normals_cnvkit_antitarget: + input: + unpack(wf.get_input_files("cnvkit", "antitarget")), + output: + **wf.get_output_files("cnvkit", "antitarget"), + threads: wf.get_resource("cnvkit", "antitarget", "threads") + resources: + time=wf.get_resource("cnvkit", "antitarget", "time"), + memory=wf.get_resource("cnvkit", "antitarget", "memory"), + partition=wf.get_resource("cnvkit", "antitarget", "partition"), + log: + **wf.get_log_file("cnvkit", "antitarget"), + wrapper: + wf.wrapper_path("cnvkit/antitarget") rule panel_of_normals_cnvkit_coverage: @@ -167,3 +199,35 @@ rule panel_of_normals_cnvkit_create_panel: **wf.get_log_file("cnvkit", "create_panel"), wrapper: wf.wrapper_path("cnvkit/reference") + + +rule panel_of_normals_cnvkit_sex: + input: + unpack(wf.get_input_files("cnvkit", "sex")), + output: + **wf.get_output_files("cnvkit", "sex"), + threads: wf.get_resource("cnvkit", "sex", "threads") + resources: + time=wf.get_resource("cnvkit", "sex", "time"), + memory=wf.get_resource("cnvkit", "sex", "memory"), + partition=wf.get_resource("cnvkit", "sex", "partition"), + log: + **wf.get_log_file("cnvkit", "sex"), + wrapper: + wf.wrapper_path("cnvkit/sex") + + +rule panel_of_normals_cnvkit_metrics: + input: + unpack(wf.get_input_files("cnvkit", "metrics")), + output: + **wf.get_output_files("cnvkit", "metrics"), + threads: wf.get_resource("cnvkit", "metrics", "threads") + resources: + time=wf.get_resource("cnvkit", "metrics", "time"), + memory=wf.get_resource("cnvkit", "metrics", "memory"), + partition=wf.get_resource("cnvkit", "metrics", "partition"), + log: + **wf.get_log_file("cnvkit", "metrics"), + wrapper: + wf.wrapper_path("cnvkit/metrics") diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py b/snappy_pipeline/workflows/panel_of_normals/__init__.py index ca8f616ed..fda24ddc1 100644 --- a/snappy_pipeline/workflows/panel_of_normals/__init__.py +++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py @@ -49,8 +49,6 @@ Currently, no reports are generated. """ -import os - from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions from snappy_pipeline.utils import dictify, listify @@ -114,20 +112,25 @@ access: "" # Access bed file (when absent, created by cnvkit.py access exclude: [] # [access] Bed file of regions to exclude (mappability, blacklisted, ...) min_gap_size: 5000 # [access] Minimum gap size between accessible sequence regions - annotate: "" # [autobin] Optional targets annotations + annotate: "" # [autobin, target] Optional targets annotations bp_per_bin: 100000 # [autobin] Desired average number of sequencing read bases mapped to each bin target_min_size: 20 # [autobin] Min size of target bins target_max_size: 20000 # [autobin] Max size of target bins antitarget_min_size: 500 # [autobin] Min size of antitarget bins antitarget_max_size: 500000 # [autobin] Max size of antitarget bins + target_avg_size: 266.666666666666667 # [target] Average size of split target bins (set to average exons size) + split: False # [target] Split large intervals into smaller ones + antitarget_avg_size: 150000 # [antitarget] Average size of antitarget bins + min_size: 25000 # [antitarget] Min size of antitarget bins (should be set to 1/6 of antitarget_avg_size) min_mapq: 0 # [coverage] Mininum mapping quality score to count a read for coverage depth - count: no # [coverage] Alternative couting algorithm + count: False # [coverage] Alternative couting algorithm min_cluster_size: 0 # [reference] Minimum cluster size to keep in reference profiles. 0 for no clustering gender: "" # [reference] Specify the chromosomal sex of all given samples as male or female. Guess when missing - male_reference: no # [reference] Create male reference - gc_correction: yes # [reference] Use GC correction - edge_correction: yes # [reference] Use edge correction - rmask_correction: yes # [reference] Use rmask correction + male_reference: False # [reference & sex] Create male reference + gc_correction: True # [reference] Use GC correction + edge_correction: True # [reference] Use edge correction + rmask_correction: True # [reference] Use rmask correction + drop_low_coverage: False # [metrics] Drop very-low-coverage bins before calculations split_targets: False # Split targets into equal size bins targets_average_bin_size: 266.67 # Average size of split target bins antitargets_average_bin_size: 150000 # Average size of split antitarget bins @@ -148,7 +151,7 @@ class PanelOfNormalsStepPart(BaseStepPart): def __init__(self, parent): super().__init__(parent) # Build shortcut from cancer bio sample name to matched cancer sample - self.normal_libraries = self._get_normal_libraries() + self.normal_libraries = list(self._get_normal_libraries()) def _get_normal_libraries(self): for sheet in self.parent.shortcut_sheets: @@ -281,7 +284,15 @@ class CnvkitStepPart(PanelOfNormalsStepPart): name = "cnvkit" #: Class available actions - actions = ("access", "autobin", "coverage", "create_panel") + actions = ( + "access", + "target", + "antitarget", + "coverage", + "create_panel", + "sex", + "metrics", + ) # , "autobin") #: Class resource usage dictionary. Key: action type (string); Value: resource (ResourceUsage). resource_usage_dict = { @@ -290,11 +301,21 @@ class CnvkitStepPart(PanelOfNormalsStepPart): time="02:00:00", # 2 hours memory="8G", ), - "autobin": ResourceUsage( + "target": ResourceUsage( threads=2, time="02:00:00", # 2 hours - memory="16G", + memory="8G", + ), + "antitarget": ResourceUsage( + threads=2, + time="02:00:00", # 2 hours + memory="8G", ), + # "autobin": ResourceUsage( + # threads=2, + # time="02:00:00", # 2 hours + # memory="16G", + # ), "coverage": ResourceUsage( threads=2, time="02:00:00", # 2 hours @@ -305,6 +326,16 @@ class CnvkitStepPart(PanelOfNormalsStepPart): time="02:00:00", # 2 hours memory="16G", ), + "sex": ResourceUsage( + threads=2, + time="02:00:00", # 2 hours + memory="16G", + ), + "metrics": ResourceUsage( + threads=2, + time="02:00:00", # 2 hours + memory="16G", + ), } def __init__(self, parent): @@ -341,25 +372,36 @@ def get_input_files(self, action): self._validate_action(action) mapping = { "access": None, - "autobin": self._get_input_files_autobin, + # "autobin": self._get_input_files_autobin, + "target": None, + "antitarget": self._get_input_files_antitarget, "coverage": self._get_input_files_coverage, "create_panel": self._get_input_files_create_panel, + "sex": self._get_input_files_create_panel, + "metrics": self._get_input_files_create_panel, } return mapping[action] - def _get_input_files_autobin(self, wildcards): - """Helper wrapper function for creating targets & anttargets files""" - ngs_mapping = self.parent.sub_workflows["ngs_mapping"] - tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" - bams = [ - ngs_mapping(tpl.format(mapper=wildcards["mapper"], normal_library=x)) - for x in self.normal_libraries - ] - bais = [x + ".bai" for x in bams] + # def _get_input_files_autobin(self, wildcards): + # """Helper wrapper function for creating targets & anttargets files""" + # ngs_mapping = self.parent.sub_workflows["ngs_mapping"] + # tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" + # bams = [ + # ngs_mapping(tpl.format(mapper=wildcards["mapper"], normal_library=x)) + # for x in self.normal_libraries + # ] + # bais = [x + ".bai" for x in bams] + # return { + # "bams": bams, + # "bais": bais, + # "access": "work/cnvkit.access/out/cnvkit.access.bed", + # } + + def _get_input_files_antitarget(self, wildcards): + """Helper wrapper function for computing antitarget locations""" return { - "bams": bams, - "bais": bais, "access": "work/cnvkit.access/out/cnvkit.access.bed", + "target": "work/cnvkit.target/out/cnvkit.target.bed", } def _get_input_files_coverage(self, wildcards): @@ -368,44 +410,55 @@ def _get_input_files_coverage(self, wildcards): tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" bam = ngs_mapping(tpl.format(**wildcards)) return { - "target": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.target.bed".format( - **wildcards - ), - "antitarget": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.antitarget.bed".format( - **wildcards - ), + "target": "work/cnvkit.target/out/cnvkit.target.bed".format(**wildcards), + "antitarget": "work/cnvkit.antitarget/out/cnvkit.antitarget.bed".format(**wildcards), "bam": bam, "bai": bam + ".bai", } def _get_input_files_create_panel(self, wildcards): """Helper wrapper function for computing panel of normals""" - normal_libraries = list(self.normal_libraries) tpl = ( "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.target.cnn" ) targets = [ - tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in normal_libraries + tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries ] tpl = "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.antitarget.cnn" antitargets = [ - tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in normal_libraries + tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries ] return { "target": targets, "antitarget": antitargets, } + def _get_input_files_sex(self, wildcards): + """Helper wrapper function for report of samples gender""" + return self._get_input_files_create_panel(wildcards) + + def _get_input_files_metrics(self, wildcards): + """Helper wrapper function for report of samples coverage noise""" + return self._get_input_files_create_panel(wildcards) + def get_output_files(self, action): """Return panel of normal files""" if action == "access": return self._get_output_files_access() - elif action == "autobin": - return self._get_output_files_autobin() + # elif action == "autobin": + # return self._get_output_files_autobin() + elif action == "target": + return self._get_output_files_target() + elif action == "antitarget": + return self._get_output_files_antitarget() elif action == "coverage": return self._get_output_files_coverage() elif action == "create_panel": return self._get_output_files_create_panel() + elif action == "sex": + return self._get_output_files_sex() + elif action == "metrics": + return self._get_output_files_metrics() else: self._validate_action(action) @@ -415,12 +468,24 @@ def _get_output_files_access(self): "access_md5": "work/cnvkit.access/out/cnvkit.access.bed.md5", } - def _get_output_files_autobin(self): + # def _get_output_files_autobin(self): + # return { + # "target": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.target.bed", + # "target_md5": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.target.bed.md5", + # "antitarget": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.antitarget.bed", + # "antitarget_md5": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.antitarget.bed.md5", + # } + + def _get_output_files_target(self): + return { + "target": "work/cnvkit.target/out/cnvkit.target.bed", + "target_md5": "work/cnvkit.target/out/cnvkit.target.bed.md5", + } + + def _get_output_files_antitarget(self): return { - "target": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.target.bed", - "target_md5": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.target.bed.md5", - "antitarget": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.antitarget.bed", - "antitarget_md5": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.antitarget.bed.md5", + "antitarget": "work/cnvkit.antitarget/out/cnvkit.antitarget.bed", + "antitarget_md5": "work/cnvkit.antitarget/out/cnvkit.antitarget.bed.md5", } def _get_output_files_coverage(self): @@ -437,6 +502,18 @@ def _get_output_files_create_panel(self): "panel_md5": "work/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", } + def _get_output_files_sex(self): + return { + "table": "work/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.sex.tsv", + "table_md5": "work/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.sex.tsv.md5", + } + + def _get_output_files_metrics(self): + return { + "table": "work/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.metrics.tsv", + "table_md5": "work/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.metrics.tsv.md5", + } + @dictify def get_log_file(self, action): """Return panel of normal files""" @@ -452,8 +529,16 @@ def get_log_file(self, action): tpl = "work/cnvkit.access/log/cnvkit.access.{ext}" for key, ext in ext_dict.items(): yield key, tpl.format(ext=ext) - elif action == "autobin": - tpl = "work/{{mapper}}.cnvkit.autobin/log/{{mapper}}.cnvkit.autobin.{ext}" + # elif action == "autobin": + # tpl = "work/{{mapper}}.cnvkit.autobin/log/{{mapper}}.cnvkit.autobin.{ext}" + # for key, ext in ext_dict.items(): + # yield key, tpl.format(ext=ext) + elif action == "target": + tpl = "work/cnvkit.target/log/cnvkit.target.{ext}" + for key, ext in ext_dict.items(): + yield key, tpl.format(ext=ext) + elif action == "antitarget": + tpl = "work/cnvkit.antitarget/log/cnvkit.antitarget.{ext}" for key, ext in ext_dict.items(): yield key, tpl.format(ext=ext) elif action == "coverage": @@ -464,6 +549,14 @@ def get_log_file(self, action): tpl = "work/{{mapper}}.cnvkit.create_panel/log/{{mapper}}.cnvkit.panel_of_normals.{ext}" for key, ext in ext_dict.items(): yield key, tpl.format(ext=ext) + elif action == "sex": + tpl = "work/{{mapper}}.cnvkit.create_panel/log/{{mapper}}.cnvkit.sex.{ext}" + for key, ext in ext_dict.items(): + yield key, tpl.format(ext=ext) + elif action == "metrics": + tpl = "work/{{mapper}}.cnvkit.create_panel/log/{{mapper}}.cnvkit.metrics.{ext}" + for key, ext in ext_dict.items(): + yield key, tpl.format(ext=ext) else: self._validate_action(action) @@ -512,31 +605,9 @@ def get_result_files(self): We will process all NGS libraries of all bio samples in all sample sheets. """ - ext_list = { - "mutect2": ["vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5"], - "cnvkit": ["cnn", "cnn.md5"], - } - tpl = os.path.join( - "output", - "{mapper}.{caller}.create_panel", - "out", - "{mapper}.{caller}.panel_of_normals.{ext}", - ) - for mapper in self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"]: - for caller in set(self.config["tools"]) & set(TOOLS): - for ext in ext_list[caller]: - yield tpl.format(mapper=mapper, caller=caller, ext=ext) + result_files = [] - if "cnvkit" in set(self.config["tools"]) & set(TOOLS): - tpl = "output/cnvkit.access/out/cnvkit.access.{ext}" - for ext in ("bed", "bed.md5"): - yield tpl.format(ext=ext) - tpl = "output/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.{ext}" - for mapper in self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"]: - for ext in ("target.bed", "target.bed.md5", "antitarget.bed", "antitarget.bed.md5"): - yield tpl.format(mapper=mapper, ext=ext) - - ext_list = [ + log_ext_list = [ "log", "log.md5", "conda_list.txt", @@ -544,25 +615,56 @@ def get_result_files(self): "conda_info.txt", "conda_info.txt.md5", ] - tpl = os.path.join( - "output", - "{mapper}.{caller}.create_panel", - "log", - "{mapper}.{caller}.panel_of_normals.{ext}", - ) - for mapper in self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"]: - for caller in set(self.config["tools"]) & set(TOOLS): - for ext in ext_list: - yield tpl.format(mapper=mapper, caller=caller, ext=ext) + + if "mutect2" in set(self.config["tools"]) & set(TOOLS): + tpl = "output/{mapper}.mutect2.create_panel/out/{mapper}.mutect2.panel_of_normals.{ext}" + ext_list = ("vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5") + result_files.extend(self._expand_result_files(tpl, ext_list)) + tpl = "output/{mapper}.mutect2.create_panel/log/{mapper}.mutect2.panel_of_normals.{ext}" + result_files.extend(self._expand_result_files(tpl, log_ext_list)) if "cnvkit" in set(self.config["tools"]) & set(TOOLS): - tpl = "output/cnvkit.access/log/cnvkit.access.{ext}" + tpls = ( + ("output/cnvkit.access/out/cnvkit.access.{ext}", ("bed", "bed.md5")), + # ( + # "output/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.{ext}", + # ("target.bed", "target.bed.md5", "antitarget.bed", "antitarget.bed.md5"), + # ), + ("output/cnvkit.target/out/cnvkit.target.{ext}", ("bed", "bed.md5")), + ("output/cnvkit.antitarget/out/cnvkit.antitarget.{ext}", ("bed", "bed.md5")), + ( + "output/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.{ext}", + ("cnn", "cnn.md5"), + ), + ( + "output/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.sex.{ext}", + ("tsv", "tsv.md5"), + ), + ( + "output/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.metrics.{ext}", + ("tsv", "tsv.md5"), + ), + ) + for tpl, ext_list in tpls: + result_files.extend(self._expand_result_files(tpl, ext_list)) + tpls = ( + "output/cnvkit.access/log/cnvkit.access.{ext}", + # "output/{mapper}.cnvkit.autobin/log/{mapper}.cnvkit.autobin.{ext}", + "output/cnvkit.target/log/cnvkit.target.{ext}", + "output/cnvkit.antitarget/log/cnvkit.antitarget.{ext}", + "output/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.panel_of_normals.{ext}", + "output/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.sex.{ext}", + "output/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.metrics.{ext}", + ) + for tpl in tpls: + result_files.extend(self._expand_result_files(tpl, log_ext_list)) + + return result_files + + def _expand_result_files(self, tpl, ext_list): + for mapper in self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"]: for ext in ext_list: - yield tpl.format(ext=ext) - tpl = "output/{mapper}.cnvkit.autobin/log/{mapper}.cnvkit.autobin.{ext}" - for mapper in self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"]: - for ext in ext_list: - yield tpl.format(mapper=mapper, ext=ext) + yield tpl.format(mapper=mapper, ext=ext) def check_config(self): """Check that the path to the NGS mapping is present""" diff --git a/snappy_wrappers/wrappers/cnvkit/metrics/environment.yaml b/snappy_wrappers/wrappers/cnvkit/metrics/environment.yaml new file mode 120000 index 000000000..2e107ac86 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/metrics/environment.yaml @@ -0,0 +1 @@ +../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/cnvkit/metrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/metrics/wrapper.py new file mode 100644 index 000000000..6667df5f5 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/metrics/wrapper.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- +"""Wrapper for cnvkit.py metrics +""" + +from snakemake.shell import shell + +__author__ = "Manuel Holtgrewe" +__email__ = "manuel.holtgrewe@bih-charite.de" + +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step]["cnvkit"] + +try: + segments = " --segments " + " -s ".join(snakemake.input.segments) +except AttributeError: + segments = "" + +shell( + r""" +# Also pipe everything to log file +if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec &> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi +fi + +# Write out information about conda installation. +conda list >{snakemake.log.conda_list} +conda info >{snakemake.log.conda_info} +md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} +md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} + +set -x + +# ----------------------------------------------------------------------------- + +cnvkit.py metrics \ + --output {snakemake.output.table} \ + {segments} \ + $(if [[ "config[drop_low_coverage]" = "True" ]]; then \ + echo --drop-low-coverage + fi) \ + {snakemake.input.target} {snakemake.input.antitarget} + +fn=$(basename "{snakemake.output.table}") +d=$(dirname "{snakemake.output.table}") +pushd $d +md5sum $fn > $fn.md5 +popd +""" +) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/snappy_wrappers/wrappers/cnvkit/sex/environment.yaml b/snappy_wrappers/wrappers/cnvkit/sex/environment.yaml new file mode 120000 index 000000000..2e107ac86 --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/sex/environment.yaml @@ -0,0 +1 @@ +../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py b/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py new file mode 100644 index 000000000..7171083ad --- /dev/null +++ b/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- +"""Wrapper for cnvkit.py sex +""" + +from snakemake.shell import shell + +__author__ = "Manuel Holtgrewe" +__email__ = "manuel.holtgrewe@bih-charite.de" + +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step]["cnvkit"] + +shell( + r""" +# Also pipe everything to log file +if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec &> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi +fi + +# Write out information about conda installation. +conda list >{snakemake.log.conda_list} +conda info >{snakemake.log.conda_info} +md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} +md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} + +set -x + +# ----------------------------------------------------------------------------- + +cnvkit.py sex \ + --output {snakemake.output.table} \ + $(if [[ "{config[male_reference]}" = "True" ]]; then \ + echo --male-reference + fi) \ + {snakemake.input.target} {snakemake.input.antitarget} + +fn=$(basename "{snakemake.output.table}") +d=$(dirname "{snakemake.output.table}") +pushd $d +md5sum $fn > $fn.md5 +popd +""" +) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py index fd643d45b..a46416865 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py +++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py @@ -190,25 +190,41 @@ def test_mutect2_step_part_get_resource_usage(panel_of_normals_workflow): # Tests for CnvkitStepPart ------------------------------------------------------------------------ -def test_cnvkit_step_part_get_input_files_autobin(panel_of_normals_workflow): - """Tests CnvkitStepPart._get_input_files_autobin()""" +# def test_cnvkit_step_part_get_input_files_autobin(panel_of_normals_workflow): +# """Tests CnvkitStepPart._get_input_files_autobin()""" +# wildcards = Wildcards( +# fromdict={ +# "mapper": "bwa", +# } +# ) +# expected = { +# "access": "work/cnvkit.access/out/cnvkit.access.bed", +# "bams": [ +# "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", +# "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam", +# ], +# "bais": [ +# "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", +# "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam.bai", +# ], +# } +# actual = panel_of_normals_workflow.get_input_files("cnvkit", "autobin")(wildcards) +# assert actual == expected + + +def test_cnvkit_step_part_get_input_files_antitarget(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_antitarget()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", + "normal_library": "P001-N1-DNA1-WGS1", } ) expected = { "access": "work/cnvkit.access/out/cnvkit.access.bed", - "bams": [ - "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", - "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam", - ], - "bais": [ - "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", - "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam.bai", - ], + "target": "work/cnvkit.target/out/cnvkit.target.bed", } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "autobin")(wildcards) + actual = panel_of_normals_workflow.get_input_files("cnvkit", "antitarget")(wildcards) assert actual == expected @@ -223,8 +239,8 @@ def test_cnvkit_step_part_get_input_files_coverage(panel_of_normals_workflow): expected = { "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", - "target": "work/bwa.cnvkit.autobin/out/bwa.cnvkit.autobin.target.bed", - "antitarget": "work/bwa.cnvkit.autobin/out/bwa.cnvkit.autobin.antitarget.bed", + "target": "work/cnvkit.target/out/cnvkit.target.bed", + "antitarget": "work/cnvkit.antitarget/out/cnvkit.antitarget.bed", } actual = panel_of_normals_workflow.get_input_files("cnvkit", "coverage")(wildcards) assert actual == expected @@ -251,6 +267,48 @@ def test_cnvkit_step_part_get_input_files_create_panel(panel_of_normals_workflow assert actual == expected +def test_cnvkit_step_part_get_input_files_sex(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_input_files_sex()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + expected = { + "target": [ + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P001-N1-DNA1-WGS1.target.cnn", + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P002-N1-DNA1-WGS1.target.cnn", + ], + "antitarget": [ + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P001-N1-DNA1-WGS1.antitarget.cnn", + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P002-N1-DNA1-WGS1.antitarget.cnn", + ], + } + actual = panel_of_normals_workflow.get_input_files("cnvkit", "sex")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_metrics(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_input_files_metrics()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + expected = { + "target": [ + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P001-N1-DNA1-WGS1.target.cnn", + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P002-N1-DNA1-WGS1.target.cnn", + ], + "antitarget": [ + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P001-N1-DNA1-WGS1.antitarget.cnn", + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P002-N1-DNA1-WGS1.antitarget.cnn", + ], + } + actual = panel_of_normals_workflow.get_input_files("cnvkit", "metrics")(wildcards) + assert actual == expected + + def test_cnvkit_step_part_get_output_files_access(panel_of_normals_workflow): """Tests CvnkitStepPart._get_output_files_access()""" expected = { @@ -261,15 +319,35 @@ def test_cnvkit_step_part_get_output_files_access(panel_of_normals_workflow): assert actual == expected -def test_cnvkit_step_part_get_output_files_autobin(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_autobin()""" +# def test_cnvkit_step_part_get_output_files_autobin(panel_of_normals_workflow): +# """Tests CvnkitStepPart._get_output_files_autobin()""" +# expected = { +# "target": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.target.bed", +# "target_md5": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.target.bed.md5", +# "antitarget": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.antitarget.bed", +# "antitarget_md5": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.antitarget.bed.md5", +# } +# actual = panel_of_normals_workflow.get_output_files("cnvkit", "autobin") +# assert actual == expected + + +def test_cnvkit_step_part_get_output_files_target(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_output_files_target()""" + expected = { + "target": "work/cnvkit.target/out/cnvkit.target.bed", + "target_md5": "work/cnvkit.target/out/cnvkit.target.bed.md5", + } + actual = panel_of_normals_workflow.get_output_files("cnvkit", "target") + assert actual == expected + + +def test_cnvkit_step_part_get_output_files_antitarget(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_output_files_antitarget()""" expected = { - "target": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.target.bed", - "target_md5": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.target.bed.md5", - "antitarget": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.antitarget.bed", - "antitarget_md5": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.antitarget.bed.md5", + "antitarget": "work/cnvkit.antitarget/out/cnvkit.antitarget.bed", + "antitarget_md5": "work/cnvkit.antitarget/out/cnvkit.antitarget.bed.md5", } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "autobin") + actual = panel_of_normals_workflow.get_output_files("cnvkit", "antitarget") assert actual == expected @@ -287,7 +365,6 @@ def test_cnvkit_step_part_get_output_files_coverage(panel_of_normals_workflow): def test_cnvkit_step_part_get_output_files_create_panel(panel_of_normals_workflow): """Tests CvnkitStepPart._get_output_files_create_panel()""" - base_name_out = "work/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals" expected = { "panel": "work/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.cnn", "panel_md5": "work/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", @@ -296,6 +373,26 @@ def test_cnvkit_step_part_get_output_files_create_panel(panel_of_normals_workflo assert actual == expected +def test_cnvkit_step_part_get_output_files_sex(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_output_files_sex()""" + expected = { + "table": "work/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.sex.tsv", + "table_md5": "work/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.sex.tsv.md5", + } + actual = panel_of_normals_workflow.get_output_files("cnvkit", "sex") + assert actual == expected + + +def test_cnvkit_step_part_get_output_files_metrics(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_output_files_metrics()""" + expected = { + "table": "work/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.metrics.tsv", + "table_md5": "work/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.metrics.tsv.md5", + } + actual = panel_of_normals_workflow.get_output_files("cnvkit", "metrics") + assert actual == expected + + def test_cnvkit_step_part_get_log_file_access(panel_of_normals_workflow): """Tests CvnkitStepPart._get_log_files_access()""" base_name_out = "work/cnvkit.access/log/cnvkit.access" @@ -304,11 +401,27 @@ def test_cnvkit_step_part_get_log_file_access(panel_of_normals_workflow): assert actual == expected -def test_cnvkit_step_part_get_log_file_autobin(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_autobin()""" - base_name_out = "work/{mapper}.cnvkit.autobin/log/{mapper}.cnvkit.autobin" +# def test_cnvkit_step_part_get_log_file_autobin(panel_of_normals_workflow): +# """Tests CvnkitStepPart._get_log_files_autobin()""" +# base_name_out = "work/{mapper}.cnvkit.autobin/log/{mapper}.cnvkit.autobin" +# expected = get_expected_log_files_dict(base_out=base_name_out) +# actual = panel_of_normals_workflow.get_log_file("cnvkit", "autobin") +# assert actual == expected + + +def test_cnvkit_step_part_get_log_file_target(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_log_files_target()""" + base_name_out = "work/cnvkit.target/log/cnvkit.target" expected = get_expected_log_files_dict(base_out=base_name_out) - actual = panel_of_normals_workflow.get_log_file("cnvkit", "autobin") + actual = panel_of_normals_workflow.get_log_file("cnvkit", "target") + assert actual == expected + + +def test_cnvkit_step_part_get_log_file_antitarget(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_log_files_antitarget()""" + base_name_out = "work/cnvkit.antitarget/log/cnvkit.antitarget" + expected = get_expected_log_files_dict(base_out=base_name_out) + actual = panel_of_normals_workflow.get_log_file("cnvkit", "antitarget") assert actual == expected @@ -328,6 +441,22 @@ def test_cnvkit_step_part_get_log_file_create_panel(panel_of_normals_workflow): assert actual == expected +def test_cnvkit_step_part_get_log_file_sex(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_log_files_sex()""" + base_name_out = "work/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.sex" + expected = get_expected_log_files_dict(base_out=base_name_out) + actual = panel_of_normals_workflow.get_log_file("cnvkit", "sex") + assert actual == expected + + +def test_cnvkit_step_part_get_log_file_metrics(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_log_files_metrics()""" + base_name_out = "work/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.metrics" + expected = get_expected_log_files_dict(base_out=base_name_out) + actual = panel_of_normals_workflow.get_log_file("cnvkit", "metrics") + assert actual == expected + + def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): """Tests CvnkitStepPart.get_resource_usage()""" # Define expected: default defined workflow.abstract @@ -337,10 +466,22 @@ def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): "memory": "8G", "partition": "medium", } - autobin_expected_dict = { + # autobin_expected_dict = { + # "threads": 2, + # "time": "02:00:00", + # "memory": "16G", + # "partition": "medium", + # } + target_expected_dict = { "threads": 2, "time": "02:00:00", - "memory": "16G", + "memory": "8G", + "partition": "medium", + } + antitarget_expected_dict = { + "threads": 2, + "time": "02:00:00", + "memory": "8G", "partition": "medium", } coverage_expected_dict = { @@ -355,6 +496,18 @@ def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): "memory": "16G", "partition": "medium", } + sex_expected_dict = { + "threads": 2, + "time": "02:00:00", + "memory": "16G", + "partition": "medium", + } + metrics_expected_dict = { + "threads": 2, + "time": "02:00:00", + "memory": "16G", + "partition": "medium", + } # Evaluate action `access` for resource, expected in access_expected_dict.items(): @@ -363,9 +516,21 @@ def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): assert actual == expected, msg_error # Evaluate action `autobin` - for resource, expected in autobin_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'autobin'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "autobin", resource) + # for resource, expected in autobin_expected_dict.items(): + # msg_error = f"Assertion error for resource '{resource}' for action 'autobin'." + # actual = panel_of_normals_workflow.get_resource("cnvkit", "autobin", resource) + # assert actual == expected, msg_error + + # Evaluate action `target` + for resource, expected in target_expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}' for action 'target'." + actual = panel_of_normals_workflow.get_resource("cnvkit", "target", resource) + assert actual == expected, msg_error + + # Evaluate action `antitarget` + for resource, expected in antitarget_expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}' for action 'antitarget'." + actual = panel_of_normals_workflow.get_resource("cnvkit", "antitarget", resource) assert actual == expected, msg_error # Evaluate action `coverage` @@ -380,6 +545,18 @@ def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): actual = panel_of_normals_workflow.get_resource("cnvkit", "create_panel", resource) assert actual == expected, msg_error + # Evaluate action `sex` + for resource, expected in reference_expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}' for action 'sex'." + actual = panel_of_normals_workflow.get_resource("cnvkit", "sex", resource) + assert actual == expected, msg_error + + # Evaluate action `metrics` + for resource, expected in reference_expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}' for action 'metrics'." + actual = panel_of_normals_workflow.get_resource("cnvkit", "metrics", resource) + assert actual == expected, msg_error + # PanelOfNormalsWorkflow -------------------------------------------------------------------------- @@ -410,18 +587,40 @@ def test_panel_of_normals_workflow(panel_of_normals_workflow): ] tpl = "output/cnvkit.access/out/cnvkit.access.{ext}" expected += [tpl.format(ext=ext) for ext in ("bed", "bed.md5")] - tpl = "output/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.{ext}" + # tpl = "output/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.{ext}" + # expected += [ + # tpl.format(mapper=mapper, ext=ext) + # for ext in ("target.bed", "target.bed.md5", "antitarget.bed", "antitarget.bed.md5") + # for mapper in ("bwa",) + # ] + tpl = "output/cnvkit.target/out/cnvkit.target.{ext}" + expected += [tpl.format(ext=ext) for ext in ("bed", "bed.md5")] + tpl = "output/cnvkit.antitarget/out/cnvkit.antitarget.{ext}" + expected += [tpl.format(ext=ext) for ext in ("bed", "bed.md5")] + tpl = "output/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.sex.{ext}" expected += [ - tpl.format(mapper=mapper, ext=ext) - for ext in ("target.bed", "target.bed.md5", "antitarget.bed", "antitarget.bed.md5") - for mapper in ("bwa",) + tpl.format(mapper=mapper, ext=ext) for ext in ("tsv", "tsv.md5") for mapper in ("bwa",) + ] + tpl = "output/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.metrics.{ext}" + expected += [ + tpl.format(mapper=mapper, ext=ext) for ext in ("tsv", "tsv.md5") for mapper in ("bwa",) ] # add log files tpl = "output/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.panel_of_normals" for mapper in ("bwa",): expected += get_expected_log_files_dict(tpl.format(mapper=mapper)).values() expected += get_expected_log_files_dict("output/cnvkit.access/log/cnvkit.access").values() - tpl = "output/{mapper}.cnvkit.autobin/log/{mapper}.cnvkit.autobin" + # tpl = "output/{mapper}.cnvkit.autobin/log/{mapper}.cnvkit.autobin" + # for mapper in ("bwa",): + # expected += get_expected_log_files_dict(tpl.format(mapper=mapper)).values() + expected += get_expected_log_files_dict("output/cnvkit.target/log/cnvkit.target").values() + expected += get_expected_log_files_dict( + "output/cnvkit.antitarget/log/cnvkit.antitarget" + ).values() + tpl = "output/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.sex" + for mapper in ("bwa",): + expected += get_expected_log_files_dict(tpl.format(mapper=mapper)).values() + tpl = "output/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.metrics" for mapper in ("bwa",): expected += get_expected_log_files_dict(tpl.format(mapper=mapper)).values() expected = list(sorted(expected)) From 23382439d94dc489ecc21708bb8e6210b9eccbd3 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 2 Jan 2023 12:09:44 +0100 Subject: [PATCH 04/10] Fix wrapper logs --- .../wrappers/cnvkit/access/wrapper.py | 12 ++++++- .../wrappers/cnvkit/antitarget/wrapper.py | 24 ++++++++++--- .../wrappers/cnvkit/autobin/wrapper.py | 12 ++++++- .../wrappers/cnvkit/coverage/wrapper.py | 4 ++- .../wrappers/cnvkit/reference/wrapper.py | 4 ++- .../wrappers/cnvkit/target/wrapper.py | 34 ++++++++++++------- 6 files changed, 69 insertions(+), 21 deletions(-) diff --git a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py index 6c0de112c..c637ab74c 100644 --- a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Wrapper vor cnvkit.py genome2access +"""Wrapper for cnvkit.py access """ from snakemake.shell import shell @@ -30,6 +30,8 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +set -x + # ----------------------------------------------------------------------------- if [[ -n "{config[access]}" ]] @@ -49,3 +51,11 @@ popd """ ) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) + diff --git a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py index 126364ad6..41d6c4ec4 100644 --- a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Wrapper vor cnvkit.py genome2access +"""Wrapper for cnvkit.py antitarget """ from snakemake.shell import shell @@ -7,6 +7,9 @@ __author__ = "Manuel Holtgrewe" __email__ = "manuel.holtgrewe@bih-charite.de" +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step]["cnvkit"] + shell( r""" # Also pipe everything to log file @@ -26,17 +29,28 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +set -x + # ----------------------------------------------------------------------------- cnvkit.py antitarget \ - -o {snakemake.output} \ - -g {snakemake.input.access} \ + --output {snakemake.output.antitarget} \ + --access {snakemake.input.access} \ + --avg-size {config[antitarget_avg_size]} --min-size {config[min_size]} \ {snakemake.input.target} -fn=$(basename "{snakemake.output}") -d=$(dirname "{snakemake.output}") +fn=$(basename "{snakemake.output.antitarget}") +d=$(dirname "{snakemake.output.antitarget}") pushd $d md5sum $fn > $fn.md5 popd """ ) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) + diff --git a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py index 198a007d8..46d63b64c 100644 --- a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Wrapper vor cnvkit.py genome2access +"""Wrapper for cnvkit.py autobin """ from snakemake.shell import shell @@ -28,6 +28,8 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +set -x + # Function definitions --------------------------------------------------------- md5() {{ @@ -60,3 +62,11 @@ md5 {snakemake.output.antitarget} """ ) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) + diff --git a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py index de170ca30..f7d79c9b6 100644 --- a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Wrapper vor cnvkit.py coverage +"""Wrapper for cnvkit.py coverage """ from snakemake.shell import shell @@ -29,6 +29,8 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +set -x + # Function definitions --------------------------------------------------------- coverage() diff --git a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py index f984daaee..96d48c5d4 100644 --- a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Wrapper vor cnvkit.py genome2access +"""Wrapper for cnvkit.py reference """ from snakemake.shell import shell @@ -40,6 +40,8 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +set -x + # ----------------------------------------------------------------------------- cnvkit.py reference \ diff --git a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py index 95ab0d6e9..c989d27df 100644 --- a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Wrapper vor cnvkit.py genome2access +"""Wrapper for cnvkit.py target """ from snakemake.shell import shell @@ -29,23 +29,33 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} -export TMPDIR=$(mktemp -d) -trap "rm -rf $TMPDIR" EXIT +set -x # ----------------------------------------------------------------------------- -zcat {config[path_target_regions]} > $TMPDIR/regions.bed - cnvkit.py target \ - --short-names \ - --split \ - -o {snakemake.output} \ - $TMPDIR/regions.bed - -fn=$(basename "{snakemake.output}") -d=$(dirname "{snakemake.output}") + --output {snakemake.output.target} \ + $(if [[ -n "{config[annotate]}" ]]; then \ + echo --short-names --annotate {config[annotate]} + fi) \ + $(if [[ "config{split}" = "True" ]]; then \ + echo --split + fi) \ + --avg-size {config[target_avg_size]} \ + {config[path_target_regions]} + +fn=$(basename "{snakemake.output.target}") +d=$(dirname "{snakemake.output.target}") pushd $d md5sum $fn > $fn.md5 popd """ ) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) + From c3010541f996022b8c30563986386eb4e95fb876 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Tue, 3 Jan 2023 16:27:16 +0100 Subject: [PATCH 05/10] After panel_of_normals successful test on Becnel dataset --- .../workflows/panel_of_normals/Snakefile | 2 -- .../workflows/panel_of_normals/__init__.py | 34 +++++++++---------- .../wrappers/cnvkit/access/wrapper.py | 6 ++-- .../wrappers/cnvkit/antitarget/wrapper.py | 8 +++-- .../wrappers/cnvkit/autobin/wrapper.py | 21 +++++++++--- .../wrappers/cnvkit/target/wrapper.py | 7 ++-- .../test_workflows_panel_of_normals.py | 34 +++++++++---------- 7 files changed, 64 insertions(+), 48 deletions(-) diff --git a/snappy_pipeline/workflows/panel_of_normals/Snakefile b/snappy_pipeline/workflows/panel_of_normals/Snakefile index 8f783150b..ea694afe5 100644 --- a/snappy_pipeline/workflows/panel_of_normals/Snakefile +++ b/snappy_pipeline/workflows/panel_of_normals/Snakefile @@ -135,8 +135,6 @@ rule panel_of_normals_cnvkit_access: rule panel_of_normals_cnvkit_target: - input: - unpack(wf.get_input_files("cnvkit", "target")), output: **wf.get_output_files("cnvkit", "target"), threads: wf.get_resource("cnvkit", "target", "threads") diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py b/snappy_pipeline/workflows/panel_of_normals/__init__.py index fda24ddc1..fd95f7ad6 100644 --- a/snappy_pipeline/workflows/panel_of_normals/__init__.py +++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py @@ -111,17 +111,17 @@ path_target_regions: REQUIRED # Bed files of targetted regions access: "" # Access bed file (when absent, created by cnvkit.py access exclude: [] # [access] Bed file of regions to exclude (mappability, blacklisted, ...) - min_gap_size: 5000 # [access] Minimum gap size between accessible sequence regions + min_gap_size: 0 # [access] Minimum gap size between accessible sequence regions (0: use default value) annotate: "" # [autobin, target] Optional targets annotations - bp_per_bin: 100000 # [autobin] Desired average number of sequencing read bases mapped to each bin - target_min_size: 20 # [autobin] Min size of target bins - target_max_size: 20000 # [autobin] Max size of target bins - antitarget_min_size: 500 # [autobin] Min size of antitarget bins - antitarget_max_size: 500000 # [autobin] Max size of antitarget bins - target_avg_size: 266.666666666666667 # [target] Average size of split target bins (set to average exons size) - split: False # [target] Split large intervals into smaller ones - antitarget_avg_size: 150000 # [antitarget] Average size of antitarget bins - min_size: 25000 # [antitarget] Min size of antitarget bins (should be set to 1/6 of antitarget_avg_size) + bp_per_bin: 0 # [autobin] Desired average number of sequencing read bases mapped to each bin (0: use default value) + target_min_size: 0 # [autobin] Min size of target bins (0: use default value) + target_max_size: 0 # [autobin] Max size of target bins (0: use default value) + antitarget_min_size: 0 # [autobin] Min size of antitarget bins (0: use default value) + antitarget_max_size: 0 # [autobin] Max size of antitarget bins (0: use default value) + target_avg_size: 0 # [target] Average size of split target bins (0: use default value) + split: True # [target] Split large intervals into smaller ones + antitarget_avg_size: 0 # [antitarget] Average size of antitarget bins (0: use default value) + min_size: 0 # [antitarget] Min size of antitarget bins (0: use default value) min_mapq: 0 # [coverage] Mininum mapping quality score to count a read for coverage depth count: False # [coverage] Alternative couting algorithm min_cluster_size: 0 # [reference] Minimum cluster size to keep in reference profiles. 0 for no clustering @@ -317,7 +317,7 @@ class CnvkitStepPart(PanelOfNormalsStepPart): # memory="16G", # ), "coverage": ResourceUsage( - threads=2, + threads=8, time="02:00:00", # 2 hours memory="16G", ), @@ -419,12 +419,12 @@ def _get_input_files_coverage(self, wildcards): def _get_input_files_create_panel(self, wildcards): """Helper wrapper function for computing panel of normals""" tpl = ( - "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.target.cnn" + "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn" ) targets = [ tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries ] - tpl = "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.antitarget.cnn" + tpl = "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn" antitargets = [ tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries ] @@ -490,10 +490,10 @@ def _get_output_files_antitarget(self): def _get_output_files_coverage(self): return { - "target": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.target.cnn", - "target_md5": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.target.cnn.md5", - "antitarget": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.antitarget.cnn", - "antitarget_md5": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.antitarget.cnn.md5", + "target": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn", + "target_md5": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn.md5", + "antitarget": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn", + "antitarget_md5": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn.md5", } def _get_output_files_create_panel(self): diff --git a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py index c637ab74c..951792609 100644 --- a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py @@ -40,7 +40,10 @@ else cnvkit.py access \ -o {snakemake.output.access} \ - --min-gap-size {config[min_gap_size]} {exclude} \ + $(if [[ {config[min_gap_size]} -gt 0 ]]; then \ + echo --min-gap-size {config[min_gap_size]} + fi) \ + {exclude} \ {snakemake.config[static_data_config][reference][path]} fi @@ -58,4 +61,3 @@ md5sum {snakemake.log.log} >{snakemake.log.log_md5} """ ) - diff --git a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py index 41d6c4ec4..1c47b4b80 100644 --- a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py @@ -36,7 +36,12 @@ cnvkit.py antitarget \ --output {snakemake.output.antitarget} \ --access {snakemake.input.access} \ - --avg-size {config[antitarget_avg_size]} --min-size {config[min_size]} \ + $(if [[ {config[antitarget_avg_size]} -gt 0 ]]; then \ + echo --avg-size {config[antitarget_avg_size]} + fi) \ + $(if [[ {config[min_size]} -gt 0 ]]; then \ + echo --min-size {config[min_size]} + fi) \ {snakemake.input.target} fn=$(basename "{snakemake.output.antitarget}") @@ -53,4 +58,3 @@ md5sum {snakemake.log.log} >{snakemake.log.log_md5} """ ) - diff --git a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py index 46d63b64c..32eac2126 100644 --- a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py @@ -49,9 +49,21 @@ --fasta {snakemake.config[static_data_config][reference][path]} \ --access {snakemake.input.access} \ --targets {config[path_target_regions]} \ - --bp-per-bin {config[bp_per_bin]} \ - --target-min-size {config[target_min_size]} --target-max-size {config[target_max_size]} \ - --antitarget-min-size {config[antitarget_min_size]} --antitarget-max-size {config[antitarget_max_size]} \ + $(if [[ {config[bp_per_bin]} -gt 0 ]]; then \ + echo --bp-per-bin {config[bp_per_bin]} + fi) \ + $(if [[ {config[target_min_size]} -gt 0 ]]; then \ + echo --target-min-size {config[target_min_size]} + fi) \ + $(if [[ {config[target_max_size]} -gt 0 ]]; then \ + echo --target-max-size {config[target_max_size]} + fi) \ + $(if [[ {config[antitarget_min_size]} -gt 0 ]]; then \ + echo --antitarget-min-size {config[antitarget_min_size]} + fi) + $(if [[ {config[antitarget_max_size]} -gt 0 ]]; then \ + echo --antitarget-max-size {config[antitarget_max_size]} + fi) \ $(if [[ -n "{config[annotate]}" ]] ; then \ echo --annotate {config[annotate]} --short-names fi) \ @@ -66,7 +78,6 @@ # Compute MD5 sums of logs. shell( r""" -md5sum {snakemake.log.log} >{snakemake.log.log_md5} +md5sum {snakemake.log.log} >{snakemake.log.log_md5}") """ ) - diff --git a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py index c989d27df..29cfb5880 100644 --- a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py @@ -38,10 +38,12 @@ $(if [[ -n "{config[annotate]}" ]]; then \ echo --short-names --annotate {config[annotate]} fi) \ - $(if [[ "config{split}" = "True" ]]; then \ + $(if [[ "{config[split]}" = "True" ]]; then \ echo --split fi) \ - --avg-size {config[target_avg_size]} \ + $(if [[ {config[target_avg_size]} -gt 0 ]]; then \ + echo --avg-size {config[target_avg_size]} + fi) \ {config[path_target_regions]} fn=$(basename "{snakemake.output.target}") @@ -58,4 +60,3 @@ md5sum {snakemake.log.log} >{snakemake.log.log_md5} """ ) - diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py index a46416865..2ffcc8d1a 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py +++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py @@ -255,12 +255,12 @@ def test_cnvkit_step_part_get_input_files_create_panel(panel_of_normals_workflow ) expected = { "target": [ - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P001-N1-DNA1-WGS1.target.cnn", - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P002-N1-DNA1-WGS1.target.cnn", + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", ], "antitarget": [ - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P001-N1-DNA1-WGS1.antitarget.cnn", - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P002-N1-DNA1-WGS1.antitarget.cnn", + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn", + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn", ], } actual = panel_of_normals_workflow.get_input_files("cnvkit", "create_panel")(wildcards) @@ -276,12 +276,12 @@ def test_cnvkit_step_part_get_input_files_sex(panel_of_normals_workflow): ) expected = { "target": [ - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P001-N1-DNA1-WGS1.target.cnn", - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P002-N1-DNA1-WGS1.target.cnn", + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", ], "antitarget": [ - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P001-N1-DNA1-WGS1.antitarget.cnn", - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P002-N1-DNA1-WGS1.antitarget.cnn", + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn", + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn", ], } actual = panel_of_normals_workflow.get_input_files("cnvkit", "sex")(wildcards) @@ -297,12 +297,12 @@ def test_cnvkit_step_part_get_input_files_metrics(panel_of_normals_workflow): ) expected = { "target": [ - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P001-N1-DNA1-WGS1.target.cnn", - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P002-N1-DNA1-WGS1.target.cnn", + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", ], "antitarget": [ - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P001-N1-DNA1-WGS1.antitarget.cnn", - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.coverage.P002-N1-DNA1-WGS1.antitarget.cnn", + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn", + "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn", ], } actual = panel_of_normals_workflow.get_input_files("cnvkit", "metrics")(wildcards) @@ -354,10 +354,10 @@ def test_cnvkit_step_part_get_output_files_antitarget(panel_of_normals_workflow) def test_cnvkit_step_part_get_output_files_coverage(panel_of_normals_workflow): """Tests CvnkitStepPart._get_output_files_coverage()""" expected = { - "target": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.target.cnn", - "target_md5": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.target.cnn.md5", - "antitarget": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.antitarget.cnn", - "antitarget_md5": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.coverage.{normal_library}.antitarget.cnn.md5", + "target": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn", + "target_md5": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn.md5", + "antitarget": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn", + "antitarget_md5": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn.md5", } actual = panel_of_normals_workflow.get_output_files("cnvkit", "coverage") assert actual == expected @@ -485,7 +485,7 @@ def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): "partition": "medium", } coverage_expected_dict = { - "threads": 2, + "threads": 8, "time": "02:00:00", "memory": "16G", "partition": "medium", From adc2bed9c8589a7a3fe006953e266774b721b278 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 9 Jan 2023 16:21:46 +0100 Subject: [PATCH 06/10] Access separate from cnvkit --- docs/step/panel_of_normals.rst | 7 + .../workflows/panel_of_normals/Snakefile | 78 ++-- .../workflows/panel_of_normals/__init__.py | 405 ++++++++++-------- .../wrappers/cnvkit/access/wrapper.py | 21 +- .../wrappers/cnvkit/antitarget/wrapper.py | 27 +- .../wrappers/cnvkit/autobin/environment.yaml | 1 - .../wrappers/cnvkit/autobin/wrapper.py | 83 ---- .../wrappers/cnvkit/reference/wrapper.py | 23 +- .../wrappers/cnvkit/target/wrapper.py | 61 ++- .../test_workflows_panel_of_normals.py | 356 ++++++--------- .../test_workflows_panel_of_normals_wgs.py | 380 ++++++++++++++++ 11 files changed, 867 insertions(+), 575 deletions(-) create mode 100644 docs/step/panel_of_normals.rst delete mode 120000 snappy_wrappers/wrappers/cnvkit/autobin/environment.yaml delete mode 100644 snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py create mode 100644 tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py diff --git a/docs/step/panel_of_normals.rst b/docs/step/panel_of_normals.rst new file mode 100644 index 000000000..9e95a62e8 --- /dev/null +++ b/docs/step/panel_of_normals.rst @@ -0,0 +1,7 @@ +.. _step_panel_of_normals:: + +========================================================== +Creation of panel of normals for somatic SNV & CNV calling +========================================================== + +.. automodule:: snappy_pipeline.workflows.panel_of_normals diff --git a/snappy_pipeline/workflows/panel_of_normals/Snakefile b/snappy_pipeline/workflows/panel_of_normals/Snakefile index ea694afe5..d3c1c5e52 100644 --- a/snappy_pipeline/workflows/panel_of_normals/Snakefile +++ b/snappy_pipeline/workflows/panel_of_normals/Snakefile @@ -101,40 +101,26 @@ rule panel_of_normals_mutect2_create_panel: # Panel of normals (cnvkit) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Write out the tumor-only results for the normals ---------------------------- - +# Write out access file (if required, must be run prior to the cnvkit panel of normals) -rule panel_of_normals_cnvkit_access: +rule panel_of_normals_access_run: output: - **wf.get_output_files("cnvkit", "access"), - threads: wf.get_resource("cnvkit", "access", "threads") + **wf.get_output_files("access", "run"), resources: - time=wf.get_resource("cnvkit", "access", "time"), - memory=wf.get_resource("cnvkit", "access", "memory"), - partition=wf.get_resource("cnvkit", "access", "partition"), + time=wf.get_resource("access", "run", "time"), + memory=wf.get_resource("access", "run", "memory"), + partition=wf.get_resource("access", "run", "partition"), log: - **wf.get_log_file("cnvkit", "access"), + **wf.get_log_file("access", "run"), wrapper: wf.wrapper_path("cnvkit/access") -# rule panel_of_normals_cnvkit_autobin: -# input: -# unpack(wf.get_input_files("cnvkit", "autobin")), -# output: -# **wf.get_output_files("cnvkit", "autobin"), -# threads: wf.get_resource("cnvkit", "autobin", "threads") -# resources: -# time=wf.get_resource("cnvkit", "autobin", "time"), -# memory=wf.get_resource("cnvkit", "autobin", "memory"), -# partition=wf.get_resource("cnvkit", "autobin", "partition"), -# log: -# **wf.get_log_file("cnvkit", "autobin"), -# wrapper: -# wf.wrapper_path("cnvkit/autobin") - +# Write out the normals-only results for the normals -------------------------- rule panel_of_normals_cnvkit_target: + input: + unpack(wf.get_input_files("cnvkit", "target")), output: **wf.get_output_files("cnvkit", "target"), threads: wf.get_resource("cnvkit", "target", "threads") @@ -144,6 +130,8 @@ rule panel_of_normals_cnvkit_target: partition=wf.get_resource("cnvkit", "target", "partition"), log: **wf.get_log_file("cnvkit", "target"), + params: + args=wf.substep_dispatch("cnvkit", "get_args", "target"), wrapper: wf.wrapper_path("cnvkit/target") @@ -160,6 +148,8 @@ rule panel_of_normals_cnvkit_antitarget: partition=wf.get_resource("cnvkit", "antitarget", "partition"), log: **wf.get_log_file("cnvkit", "antitarget"), + params: + args=wf.substep_dispatch("cnvkit", "get_args", "antitarget"), wrapper: wf.wrapper_path("cnvkit/antitarget") @@ -176,6 +166,8 @@ rule panel_of_normals_cnvkit_coverage: partition=wf.get_resource("cnvkit", "coverage", "partition"), log: **wf.get_log_file("cnvkit", "coverage"), + params: + args=wf.substep_dispatch("cnvkit", "get_args", "coverage"), wrapper: wf.wrapper_path("cnvkit/coverage") @@ -195,37 +187,25 @@ rule panel_of_normals_cnvkit_create_panel: partition=wf.get_resource("cnvkit", "create_panel", "partition"), log: **wf.get_log_file("cnvkit", "create_panel"), + params: + args=wf.substep_dispatch("cnvkit", "get_args", "create_panel"), wrapper: wf.wrapper_path("cnvkit/reference") -rule panel_of_normals_cnvkit_sex: - input: - unpack(wf.get_input_files("cnvkit", "sex")), - output: - **wf.get_output_files("cnvkit", "sex"), - threads: wf.get_resource("cnvkit", "sex", "threads") - resources: - time=wf.get_resource("cnvkit", "sex", "time"), - memory=wf.get_resource("cnvkit", "sex", "memory"), - partition=wf.get_resource("cnvkit", "sex", "partition"), - log: - **wf.get_log_file("cnvkit", "sex"), - wrapper: - wf.wrapper_path("cnvkit/sex") - - -rule panel_of_normals_cnvkit_metrics: +rule panel_of_normals_cnvkit_report: input: - unpack(wf.get_input_files("cnvkit", "metrics")), + unpack(wf.get_input_files("cnvkit", "report")), output: - **wf.get_output_files("cnvkit", "metrics"), - threads: wf.get_resource("cnvkit", "metrics", "threads") + **wf.get_output_files("cnvkit", "report"), + threads: wf.get_resource("cnvkit", "report", "threads") resources: - time=wf.get_resource("cnvkit", "metrics", "time"), - memory=wf.get_resource("cnvkit", "metrics", "memory"), - partition=wf.get_resource("cnvkit", "metrics", "partition"), + time=wf.get_resource("cnvkit", "report", "time"), + memory=wf.get_resource("cnvkit", "report", "memory"), + partition=wf.get_resource("cnvkit", "report", "partition"), log: - **wf.get_log_file("cnvkit", "metrics"), + **wf.get_log_file("cnvkit", "report"), + params: + args=wf.substep_dispatch("cnvkit", "get_args", "report"), wrapper: - wf.wrapper_path("cnvkit/metrics") + wf.wrapper_path("cnvkit/report") diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py b/snappy_pipeline/workflows/panel_of_normals/__init__.py index fd95f7ad6..37bb5483c 100644 --- a/snappy_pipeline/workflows/panel_of_normals/__init__.py +++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py @@ -16,6 +16,11 @@ The somatic variant calling step uses Snakemake sub workflows for using the result of the ``ngs_mapping`` step. +By default, all normal DNA samples in the ``ngs_mapping`` step are using to create the panel of normals. +However, the user can select of subset of those using the ``path_normals_list`` configuration option +(which can be different for the different tools). +In this case, the libraries listed in the file will be used, **even if they are not flagged as corresponding to normal samples**. + =========== Step Output =========== @@ -28,6 +33,26 @@ vcf files for each normal) are kept in the ``work`` directory. This enables the augmentation of the panel by new files when they become available. +================================ +Notes on the ``cnvkit`` workflow +================================ + +``cnvkit`` is a set of tools originally designed to call somatic copy number alterations from exome data. +Its design is modular, which enables its use for whole genome and amplicon data. + +``cnvkit`` provides a tool to encapsulate common practice workflows (``batch``), depending on the type of data, and on the availability of optional inputs. +The current implementation recapitulates the common practice, while still dispaching computations on multiple cluster nodes. + +For exome and whole genome data, the ``cnvkit`` `documentation `_ +recommends the creation of a panel of normal (called ``reference``). +The actual workflow to generate this reference is slightly different between exome and whole genome data, +and also changes depending whether an accessibility file is provided by the user or not. + +Therefore, the ``cnvkit`` tool to generate such accessibility file is implemented as a separate tool. +If a user wants to create this accessibility file with ``cnvkit`` tools, then she must first run the ``access`` tool. +Only after it has been created can she use it to generate the panel of normals. +For that, she will need to modify the configuration file, adding ``cnvkit`` in the list of tools, and setting the ``access`` parameter to the output of the ``access`` tool. + ===================== Default Configuration ===================== @@ -41,12 +66,21 @@ ===================================== - Panel of normal for ``mutect2`` somatic variant caller +- Panel of normal for ``cvnkit`` somatic Copy Number Alterations caller ======= Reports ======= -Currently, no reports are generated. +Report tables and plots can be found in the ``output/{mapper}.cnvkit/report`` directory. +Two tables are produced, grouping results for all normal samples together: + +- ``metrics.txt``: coverage metrics over target and antitarget regions. +- ``sex.txt``: prediction of the donor's gender based on the coverage of chromosome X & Y target and antitarget regions. + +Coverage plots are also produced, for the whole genome or by chromosome. + +The cnvkit authors recommend to check these reports to ensure that all data is suitable for panel of normal creation. """ from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions @@ -63,7 +97,7 @@ __author__ = "Manuel Holtgrewe " #: Names of the tools that might use panel of normals -TOOLS = ["mutect2", "cnvkit"] +TOOLS = ["mutect2", "cnvkit", "access"] #: Default configuration for the somatic_variant_calling schema DEFAULT_CONFIG = r""" @@ -72,13 +106,6 @@ panel_of_normals: tools: ['mutect2'] # REQUIRED - available: 'mutect2' path_ngs_mapping: ../ngs_mapping # REQUIRED - ignore_chroms: # patterns of chromosome names to ignore # TODO maybe remove? - - NC_007605 # herpes virus - - hs37d5 # GRCh37 decoy - - chrEBV # Eppstein-Barr Virus - - '*_decoy' # decoy contig - - 'HLA-*' # HLA genes - - 'GL000220.*' # Contig with problematic, repetitive DNA in GRCh37 # Configuration for mutect2 mutect2: path_normals_list: null # Optional file listing libraries to include in panel @@ -107,34 +134,26 @@ - 'HLA-*' # HLA genes - 'GL000220.*' # Contig with problematic, repetitive DNA in GRCh37 cnvkit: - path_normals_list: "" # Optional file listing libraries to include in panel - path_target_regions: REQUIRED # Bed files of targetted regions - access: "" # Access bed file (when absent, created by cnvkit.py access - exclude: [] # [access] Bed file of regions to exclude (mappability, blacklisted, ...) - min_gap_size: 0 # [access] Minimum gap size between accessible sequence regions (0: use default value) - annotate: "" # [autobin, target] Optional targets annotations - bp_per_bin: 0 # [autobin] Desired average number of sequencing read bases mapped to each bin (0: use default value) - target_min_size: 0 # [autobin] Min size of target bins (0: use default value) - target_max_size: 0 # [autobin] Max size of target bins (0: use default value) - antitarget_min_size: 0 # [autobin] Min size of antitarget bins (0: use default value) - antitarget_max_size: 0 # [autobin] Max size of antitarget bins (0: use default value) - target_avg_size: 0 # [target] Average size of split target bins (0: use default value) - split: True # [target] Split large intervals into smaller ones - antitarget_avg_size: 0 # [antitarget] Average size of antitarget bins (0: use default value) - min_size: 0 # [antitarget] Min size of antitarget bins (0: use default value) - min_mapq: 0 # [coverage] Mininum mapping quality score to count a read for coverage depth - count: False # [coverage] Alternative couting algorithm - min_cluster_size: 0 # [reference] Minimum cluster size to keep in reference profiles. 0 for no clustering - gender: "" # [reference] Specify the chromosomal sex of all given samples as male or female. Guess when missing - male_reference: False # [reference & sex] Create male reference - gc_correction: True # [reference] Use GC correction - edge_correction: True # [reference] Use edge correction - rmask_correction: True # [reference] Use rmask correction - drop_low_coverage: False # [metrics] Drop very-low-coverage bins before calculations - split_targets: False # Split targets into equal size bins - targets_average_bin_size: 266.67 # Average size of split target bins - antitargets_average_bin_size: 150000 # Average size of split antitarget bins - min_antitargets_bin_size: 0 # Minimum bin size for antitargets (0: computed on the fly) + path_normals_list: "" # Optional file listing libraries to include in panel + path_target_regions: "" # Bed files of targetted regions (Missing when creating a panel of normals for WGS data) + access: "" # Access bed file (output/cnvkit.access/out/cnvkit.access.bed when create_cnvkit_acces was run) + annotate: "" # [target] Optional targets annotations + target_avg_size: 0 # [target] Average size of split target bins (0: use default value) + split: True # [target] Split large intervals into smaller ones + antitarget_avg_size: 0 # [antitarget] Average size of antitarget bins (0: use default value) + min_size: 0 # [antitarget] Min size of antitarget bins (0: use default value) + min_mapq: 0 # [coverage] Mininum mapping quality score to count a read for coverage depth + count: False # [coverage] Alternative couting algorithm + min_cluster_size: 0 # [reference] Minimum cluster size to keep in reference profiles. 0 for no clustering + gender: "" # [reference] Specify the chromosomal sex of all given samples as male or female. Guess when missing + male_reference: False # [reference & sex] Create male reference + gc_correction: True # [reference] Use GC correction + edge_correction: True # [reference] Use edge correction + rmask_correction: True # [reference] Use rmask correction + drop_low_coverage: False # [metrics] Drop very-low-coverage bins before calculations + access: # Creates access file for cnvkit, based on genomic sequence & excluded regions (optionally) + exclude: [] # [access] Bed file of regions to exclude (mappability, blacklisted, ...) + min_gap_size: 0 # [access] Minimum gap size between accessible sequence regions (0: use default value) """ @@ -235,7 +254,7 @@ def _get_input_files_prepare_panel(self, wildcards): def _get_input_files_create_panel(self, wildcards): """Helper wrapper function for merging individual results & panel creation""" paths = [] - tpl = "work/{mapper}.{tool}.prepare_panel/out/{normal_library}.vcf.gz" + tpl = "work/{mapper}.{tool}/out/{mapper}.{tool}.{normal_library}.prepare.vcf.gz" for normal in self.normal_libraries: paths.append(tpl.format(normal_library=normal, tool=self.name, **wildcards)) return {"normals": paths} @@ -251,8 +270,8 @@ def get_output_files(self, action): "vcf_tbi_md5": "vcf.gz.tbi.md5", } tpls = { - "prepare_panel": "work/{{mapper}}.{tool}.prepare_panel/out/{{normal_library}}.{ext}", - "create_panel": "work/{{mapper}}.{tool}.create_panel/out/{{mapper}}.{tool}.panel_of_normals.{ext}", + "prepare_panel": "work/{{mapper}}.{tool}/out/{{mapper}}.{tool}.{{normal_library}}.prepare.{ext}", + "create_panel": "work/{{mapper}}.{tool}/out/{{mapper}}.{tool}.panel_of_normals.{ext}", } for key, ext in ext_dict.items(): yield key, tpls[action].format(tool=self.name, ext=ext) @@ -270,8 +289,8 @@ def get_log_file(self, action): "log_md5": "log.md5", } tpls = { - "prepare_panel": "work/{{mapper}}.{tool}.prepare_panel/log/{{normal_library}}.{ext}", - "create_panel": "work/{{mapper}}.{tool}.create_panel/log/{{mapper}}.{tool}.panel_of_normals.{ext}", + "prepare_panel": "work/{{mapper}}.{tool}/log/{{mapper}}.{tool}.{{normal_library}}.prepare.{ext}", + "create_panel": "work/{{mapper}}.{tool}/log/{{mapper}}.{tool}.panel_of_normals.{ext}", } for key, ext in ext_dict.items(): yield key, tpls[action].format(tool=self.name, ext=ext) @@ -285,22 +304,15 @@ class CnvkitStepPart(PanelOfNormalsStepPart): #: Class available actions actions = ( - "access", "target", "antitarget", "coverage", "create_panel", - "sex", - "metrics", - ) # , "autobin") + "report", + ) #: Class resource usage dictionary. Key: action type (string); Value: resource (ResourceUsage). resource_usage_dict = { - "access": ResourceUsage( - threads=2, - time="02:00:00", # 2 hours - memory="8G", - ), "target": ResourceUsage( threads=2, time="02:00:00", # 2 hours @@ -311,11 +323,6 @@ class CnvkitStepPart(PanelOfNormalsStepPart): time="02:00:00", # 2 hours memory="8G", ), - # "autobin": ResourceUsage( - # threads=2, - # time="02:00:00", # 2 hours - # memory="16G", - # ), "coverage": ResourceUsage( threads=8, time="02:00:00", # 2 hours @@ -326,12 +333,7 @@ class CnvkitStepPart(PanelOfNormalsStepPart): time="02:00:00", # 2 hours memory="16G", ), - "sex": ResourceUsage( - threads=2, - time="02:00:00", # 2 hours - memory="16G", - ), - "metrics": ResourceUsage( + "report": ResourceUsage( threads=2, time="02:00:00", # 2 hours memory="16G", @@ -345,6 +347,7 @@ def __init__(self, parent): with open(self.config["cnvkit"]["path_normals_list"], "rt") as f: for line in f: self.normal_libraries.append(line.strip()) + self.is_wgs = self.config["cnvkit"]["path_target_regions"] == "" def check_config(self): if self.name not in self.config["tools"]: @@ -354,6 +357,14 @@ def check_config(self): "Path to reference FASTA not configured but required for %s" % (self.name,), ) + def get_args(self, action): + self._validate_action(action) + if self.is_wgs: + method = "wgs" + else: + method = "hybrid" + return {"method": method, "flat": (len(self.normal_libraries) == 0)} + def get_resource_usage(self, action): """Get Resource Usage @@ -371,37 +382,35 @@ def get_input_files(self, action): # Validate action self._validate_action(action) mapping = { - "access": None, - # "autobin": self._get_input_files_autobin, - "target": None, + "target": self._get_input_files_target, "antitarget": self._get_input_files_antitarget, "coverage": self._get_input_files_coverage, "create_panel": self._get_input_files_create_panel, - "sex": self._get_input_files_create_panel, - "metrics": self._get_input_files_create_panel, + "report": self._get_input_files_report, + "access": None, } return mapping[action] - # def _get_input_files_autobin(self, wildcards): - # """Helper wrapper function for creating targets & anttargets files""" - # ngs_mapping = self.parent.sub_workflows["ngs_mapping"] - # tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" - # bams = [ - # ngs_mapping(tpl.format(mapper=wildcards["mapper"], normal_library=x)) - # for x in self.normal_libraries - # ] - # bais = [x + ".bai" for x in bams] - # return { - # "bams": bams, - # "bais": bais, - # "access": "work/cnvkit.access/out/cnvkit.access.bed", - # } + def _get_input_files_target(self, wildcards): + """Helper wrapper function to estimate target average size in wgs mode""" + if not self.is_wgs: + return {} + ngs_mapping = self.parent.sub_workflows["ngs_mapping"] + tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" + bams = [ + ngs_mapping(tpl.format(mapper=wildcards["mapper"], normal_library=x)) + for x in self.normal_libraries + ] + bais = [x + ".bai" for x in bams] + input_files = {"bams": bams, "bais": bais} + return input_files def _get_input_files_antitarget(self, wildcards): """Helper wrapper function for computing antitarget locations""" + if self.is_wgs: + return {} return { - "access": "work/cnvkit.access/out/cnvkit.access.bed", - "target": "work/cnvkit.target/out/cnvkit.target.bed", + "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed".format(**wildcards), } def _get_input_files_coverage(self, wildcards): @@ -410,21 +419,40 @@ def _get_input_files_coverage(self, wildcards): tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam" bam = ngs_mapping(tpl.format(**wildcards)) return { - "target": "work/cnvkit.target/out/cnvkit.target.bed".format(**wildcards), - "antitarget": "work/cnvkit.antitarget/out/cnvkit.antitarget.bed".format(**wildcards), + "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed".format(**wildcards), + "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed".format( + **wildcards + ), "bam": bam, "bai": bam + ".bai", } def _get_input_files_create_panel(self, wildcards): """Helper wrapper function for computing panel of normals""" - tpl = ( - "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn" - ) + tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn" + targets = [ + tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries + ] + tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn" + antitargets = [ + tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries + ] + return { + "target": targets + if targets + else "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed".format(**wildcards), + "antitarget": antitargets + if antitargets + else "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed".format(**wildcards), + } + + def _get_input_files_report(self, wildcards): + """Helper wrapper function for the panel of normals report""" + tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn" targets = [ tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries ] - tpl = "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn" + tpl = "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn" antitargets = [ tpl.format(mapper=wildcards["mapper"], normal_library=x) for x in self.normal_libraries ] @@ -433,21 +461,9 @@ def _get_input_files_create_panel(self, wildcards): "antitarget": antitargets, } - def _get_input_files_sex(self, wildcards): - """Helper wrapper function for report of samples gender""" - return self._get_input_files_create_panel(wildcards) - - def _get_input_files_metrics(self, wildcards): - """Helper wrapper function for report of samples coverage noise""" - return self._get_input_files_create_panel(wildcards) - def get_output_files(self, action): """Return panel of normal files""" - if action == "access": - return self._get_output_files_access() - # elif action == "autobin": - # return self._get_output_files_autobin() - elif action == "target": + if action == "target": return self._get_output_files_target() elif action == "antitarget": return self._get_output_files_antitarget() @@ -455,63 +471,51 @@ def get_output_files(self, action): return self._get_output_files_coverage() elif action == "create_panel": return self._get_output_files_create_panel() - elif action == "sex": - return self._get_output_files_sex() - elif action == "metrics": - return self._get_output_files_metrics() + elif action == "report": + return self._get_output_files_report() + elif action == "access": + return self._get_output_files_access() else: self._validate_action(action) - def _get_output_files_access(self): - return { - "access": "work/cnvkit.access/out/cnvkit.access.bed", - "access_md5": "work/cnvkit.access/out/cnvkit.access.bed.md5", - } - - # def _get_output_files_autobin(self): - # return { - # "target": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.target.bed", - # "target_md5": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.target.bed.md5", - # "antitarget": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.antitarget.bed", - # "antitarget_md5": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.antitarget.bed.md5", - # } - def _get_output_files_target(self): return { - "target": "work/cnvkit.target/out/cnvkit.target.bed", - "target_md5": "work/cnvkit.target/out/cnvkit.target.bed.md5", + "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed", + "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed.md5", } def _get_output_files_antitarget(self): return { - "antitarget": "work/cnvkit.antitarget/out/cnvkit.antitarget.bed", - "antitarget_md5": "work/cnvkit.antitarget/out/cnvkit.antitarget.bed.md5", + "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed", + "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed.md5", } def _get_output_files_coverage(self): return { - "target": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn", - "target_md5": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn.md5", - "antitarget": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn", - "antitarget_md5": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn.md5", + "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn", + "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn.md5", + "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn", + "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn.md5", } def _get_output_files_create_panel(self): return { - "panel": "work/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.cnn", - "panel_md5": "work/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", + "panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn", + "panel_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", } - def _get_output_files_sex(self): + def _get_output_files_report(self): return { - "table": "work/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.sex.tsv", - "table_md5": "work/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.sex.tsv.md5", + "sex": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv", + "sex_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv.md5", + "metrics": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv", + "metrics_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv.md5", } - def _get_output_files_metrics(self): + def _get_output_files_access(self): return { - "table": "work/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.metrics.tsv", - "table_md5": "work/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.metrics.tsv.md5", + "access": "work/cnvkit.access/out/cnvkit.access.bed", + "access_md5": "work/cnvkit.access/out/cnvkit.access.bed.md5", } @dictify @@ -525,40 +529,66 @@ def get_log_file(self, action): "log": "log", "log_md5": "log.md5", } - if action == "access": - tpl = "work/cnvkit.access/log/cnvkit.access.{ext}" - for key, ext in ext_dict.items(): - yield key, tpl.format(ext=ext) - # elif action == "autobin": - # tpl = "work/{{mapper}}.cnvkit.autobin/log/{{mapper}}.cnvkit.autobin.{ext}" - # for key, ext in ext_dict.items(): - # yield key, tpl.format(ext=ext) - elif action == "target": - tpl = "work/cnvkit.target/log/cnvkit.target.{ext}" - for key, ext in ext_dict.items(): - yield key, tpl.format(ext=ext) + if action == "target": + tpl = "work/{{mapper}}.cnvkit/log/{{mapper}}.cnvkit.target.{ext}" elif action == "antitarget": - tpl = "work/cnvkit.antitarget/log/cnvkit.antitarget.{ext}" - for key, ext in ext_dict.items(): - yield key, tpl.format(ext=ext) + tpl = "work/{{mapper}}.cnvkit/log/{{mapper}}.cnvkit.antitarget.{ext}" elif action == "coverage": - tpl = "work/{{mapper}}.cnvkit.coverage/log/{{mapper}}.cnvkit.coverage.{{normal_library}}.{ext}" - for key, ext in ext_dict.items(): - yield key, tpl.format(ext=ext) + tpl = "work/{{mapper}}.cnvkit/log/{{mapper}}.cnvkit.{{normal_library}}.coverage.{ext}" elif action == "create_panel": - tpl = "work/{{mapper}}.cnvkit.create_panel/log/{{mapper}}.cnvkit.panel_of_normals.{ext}" - for key, ext in ext_dict.items(): - yield key, tpl.format(ext=ext) - elif action == "sex": - tpl = "work/{{mapper}}.cnvkit.create_panel/log/{{mapper}}.cnvkit.sex.{ext}" - for key, ext in ext_dict.items(): - yield key, tpl.format(ext=ext) - elif action == "metrics": - tpl = "work/{{mapper}}.cnvkit.create_panel/log/{{mapper}}.cnvkit.metrics.{ext}" - for key, ext in ext_dict.items(): - yield key, tpl.format(ext=ext) + tpl = "work/{{mapper}}.cnvkit/log/{{mapper}}.cnvkit.panel_of_normals.{ext}" + elif action == "report": + tpl = "work/{{mapper}}.cnvkit/log/{{mapper}}.cnvkit.report.{ext}" + elif action == "access": + tpl = "work/cnvkit.access/log/cnvkit.access.{ext}" else: self._validate_action(action) + for key, ext in ext_dict.items(): + yield key, tpl.format(ext=ext) + + +class AccessStepPart(PanelOfNormalsStepPart): + """Utility to create access file for cnvkit""" + + name = "access" + actions = ("run",) + + def get_resource_usage(self, action): + # Validate action + self._validate_action(action) + return ResourceUsage( + threads=2, + time="02:00:00", # 2 hours + memory="8G", + ) + + def get_input_files(self, action): + # Validate action + self._validate_action(action) + return None + + def get_output_files(self, action): + # Validate action + self._validate_action(action) + tpl = "work/cnvkit.access/out/cnvkit.access.bed" + return {"access": tpl, "access_md5": tpl + ".md5"} + + @dictify + def get_log_file(self, action): + """Return log files""" + # Validate action + self._validate_action(action) + ext_dict = { + "conda_list": "conda_list.txt", + "conda_list_md5": "conda_list.txt.md5", + "conda_info": "conda_info.txt", + "conda_info_md5": "conda_info.txt.md5", + "log": "log", + "log_md5": "log.md5", + } + tpl = "work/cnvkit.access/log/cnvkit.access.{ext}" + for key, ext in ext_dict.items(): + yield key, tpl.format(ext=ext) class PanelOfNormalsWorkflow(BaseStep): @@ -593,6 +623,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ( Mutect2StepPart, CnvkitStepPart, + AccessStepPart, LinkOutStepPart, ) ) @@ -617,48 +648,46 @@ def get_result_files(self): ] if "mutect2" in set(self.config["tools"]) & set(TOOLS): - tpl = "output/{mapper}.mutect2.create_panel/out/{mapper}.mutect2.panel_of_normals.{ext}" + tpl = "output/{mapper}.mutect2/out/{mapper}.mutect2.panel_of_normals.{ext}" ext_list = ("vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5") result_files.extend(self._expand_result_files(tpl, ext_list)) - tpl = "output/{mapper}.mutect2.create_panel/log/{mapper}.mutect2.panel_of_normals.{ext}" + tpl = "output/{mapper}.mutect2/log/{mapper}.mutect2.panel_of_normals.{ext}" result_files.extend(self._expand_result_files(tpl, log_ext_list)) if "cnvkit" in set(self.config["tools"]) & set(TOOLS): - tpls = ( - ("output/cnvkit.access/out/cnvkit.access.{ext}", ("bed", "bed.md5")), - # ( - # "output/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.{ext}", - # ("target.bed", "target.bed.md5", "antitarget.bed", "antitarget.bed.md5"), - # ), - ("output/cnvkit.target/out/cnvkit.target.{ext}", ("bed", "bed.md5")), - ("output/cnvkit.antitarget/out/cnvkit.antitarget.{ext}", ("bed", "bed.md5")), + tpls = [ + ("output/{mapper}.cnvkit/out/{mapper}.cnvkit.target.{ext}", ("bed", "bed.md5")), + ("output/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.{ext}", ("bed", "bed.md5")), ( - "output/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.{ext}", + "output/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.{ext}", ("cnn", "cnn.md5"), ), ( - "output/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.sex.{ext}", + "output/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.{ext}", ("tsv", "tsv.md5"), ), ( - "output/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.metrics.{ext}", + "output/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.{ext}", ("tsv", "tsv.md5"), ), - ) + ] for tpl, ext_list in tpls: result_files.extend(self._expand_result_files(tpl, ext_list)) - tpls = ( - "output/cnvkit.access/log/cnvkit.access.{ext}", - # "output/{mapper}.cnvkit.autobin/log/{mapper}.cnvkit.autobin.{ext}", - "output/cnvkit.target/log/cnvkit.target.{ext}", - "output/cnvkit.antitarget/log/cnvkit.antitarget.{ext}", - "output/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.panel_of_normals.{ext}", - "output/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.sex.{ext}", - "output/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.metrics.{ext}", - ) + tpls = [ + "output/{mapper}.cnvkit/log/{mapper}.cnvkit.target.{ext}", + "output/{mapper}.cnvkit/log/{mapper}.cnvkit.antitarget.{ext}", + "output/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals.{ext}", + "output/{mapper}.cnvkit/log/{mapper}.cnvkit.report.{ext}", + ] for tpl in tpls: result_files.extend(self._expand_result_files(tpl, log_ext_list)) + if "access" in set(self.config["tools"]) & set(TOOLS): + tpl = "output/cnvkit.access/out/cnvkit.access.bed" + result_files.extend([tpl + md5 for md5 in ("", ".md5")]) + tpl = "output/cnvkit.access/log/cnvkit.access.{ext}" + result_files.extend(self._expand_result_files(tpl, log_ext_list)) + return result_files def _expand_result_files(self, tpl, ext_list): diff --git a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py index 951792609..84799f6f4 100644 --- a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py @@ -9,7 +9,7 @@ config = snakemake.config["step_config"][snakemake.config["pipeline_step"]["name"]]["cnvkit"] -exclude = " -- exclude " + " -x ".join(config["exclude"]) if config["exclude"] else "" +exclude = " --exclude " + " -x ".join(config["exclude"]) if config["exclude"] else "" shell( r""" @@ -34,18 +34,13 @@ # ----------------------------------------------------------------------------- -if [[ -n "{config[access]}" ]] -then - ln -sr {config[access]} {snakemake.output.access} -else - cnvkit.py access \ - -o {snakemake.output.access} \ - $(if [[ {config[min_gap_size]} -gt 0 ]]; then \ - echo --min-gap-size {config[min_gap_size]} - fi) \ - {exclude} \ - {snakemake.config[static_data_config][reference][path]} -fi +cnvkit.py access \ + -o {snakemake.output.access} \ + $(if [[ {config[min_gap_size]} -gt 0 ]]; then \ + echo --min-gap-size {config[min_gap_size]} + fi) \ + {exclude} \ + {snakemake.config[static_data_config][reference][path]} fn=$(basename "{snakemake.output.access}") d=$(dirname "{snakemake.output.access}") diff --git a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py index 1c47b4b80..83a153c97 100644 --- a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py @@ -33,16 +33,23 @@ # ----------------------------------------------------------------------------- -cnvkit.py antitarget \ - --output {snakemake.output.antitarget} \ - --access {snakemake.input.access} \ - $(if [[ {config[antitarget_avg_size]} -gt 0 ]]; then \ - echo --avg-size {config[antitarget_avg_size]} - fi) \ - $(if [[ {config[min_size]} -gt 0 ]]; then \ - echo --min-size {config[min_size]} - fi) \ - {snakemake.input.target} +if [[ -n "{config[path_target_regions]}" ]] +then + cnvkit.py antitarget \ + --output {snakemake.output.antitarget} \ + $(if [[ -n "{config[access]}" ]]; then \ + echo --access {config[access]} + fi) \ + $(if [[ {config[antitarget_avg_size]} -gt 0 ]]; then \ + echo --avg-size {config[antitarget_avg_size]} + fi) \ + $(if [[ {config[min_size]} -gt 0 ]]; then \ + echo --min-size {config[min_size]} + fi) \ + {snakemake.input.target} +else + touch {snakemake.output.antitarget} +fi fn=$(basename "{snakemake.output.antitarget}") d=$(dirname "{snakemake.output.antitarget}") diff --git a/snappy_wrappers/wrappers/cnvkit/autobin/environment.yaml b/snappy_wrappers/wrappers/cnvkit/autobin/environment.yaml deleted file mode 120000 index 2e107ac86..000000000 --- a/snappy_wrappers/wrappers/cnvkit/autobin/environment.yaml +++ /dev/null @@ -1 +0,0 @@ -../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py b/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py deleted file mode 100644 index 32eac2126..000000000 --- a/snappy_wrappers/wrappers/cnvkit/autobin/wrapper.py +++ /dev/null @@ -1,83 +0,0 @@ -# -*- coding: utf-8 -*- -"""Wrapper for cnvkit.py autobin -""" - -from snakemake.shell import shell - -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" - -config = snakemake.config["step_config"][snakemake.config["pipeline_step"]["name"]]["cnvkit"] - -shell( - r""" -# Also pipe everything to log file -if [[ -n "{snakemake.log.log}" ]]; then - if [[ "$(set +e; tty; set -e)" != "" ]]; then - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - exec &> >(tee -a "{snakemake.log.log}" >&2) - else - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - echo "No tty, logging disabled" >"{snakemake.log.log}" - fi -fi - -# Write out information about conda installation. -conda list >{snakemake.log.conda_list} -conda info >{snakemake.log.conda_info} -md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} -md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} - -set -x - -# Function definitions --------------------------------------------------------- -md5() -{{ - set -x - - fn=$1 - f=$(basename $fn) - d=$(dirname $fn) - pushd $d - md5sum $f > $f.md5 - popd -}} - -# ----------------------------------------------------------------------------- - -cnvkit.py autobin --method "hybrid" \ - --fasta {snakemake.config[static_data_config][reference][path]} \ - --access {snakemake.input.access} \ - --targets {config[path_target_regions]} \ - $(if [[ {config[bp_per_bin]} -gt 0 ]]; then \ - echo --bp-per-bin {config[bp_per_bin]} - fi) \ - $(if [[ {config[target_min_size]} -gt 0 ]]; then \ - echo --target-min-size {config[target_min_size]} - fi) \ - $(if [[ {config[target_max_size]} -gt 0 ]]; then \ - echo --target-max-size {config[target_max_size]} - fi) \ - $(if [[ {config[antitarget_min_size]} -gt 0 ]]; then \ - echo --antitarget-min-size {config[antitarget_min_size]} - fi) - $(if [[ {config[antitarget_max_size]} -gt 0 ]]; then \ - echo --antitarget-max-size {config[antitarget_max_size]} - fi) \ - $(if [[ -n "{config[annotate]}" ]] ; then \ - echo --annotate {config[annotate]} --short-names - fi) \ - --target-output-bed {snakemake.output.target} --antitarget-output-bed {snakemake.output.antitarget} \ - {snakemake.input.bams} - -md5 {snakemake.output.target} -md5 {snakemake.output.antitarget} -""" -) - -# Compute MD5 sums of logs. -shell( - r""" -md5sum {snakemake.log.log} >{snakemake.log.log_md5}") -""" -) diff --git a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py index 96d48c5d4..73ef5e176 100644 --- a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py @@ -10,6 +10,10 @@ step = snakemake.config["pipeline_step"]["name"] config = snakemake.config["step_config"][step]["cnvkit"] +# NOTE: snakemake.input.target and snakemake.input.antitarget contain +# the output of target & antitarget substeps when there is no bam files +# the bam files lists when the list of normals is not empty + cluster = ( " --cluster --min-cluster-size {}".format(config["min_cluster_size"]) if config["min_cluster_size"] > 0 @@ -44,11 +48,20 @@ # ----------------------------------------------------------------------------- -cnvkit.py reference \ - --output {snakemake.output.panel} \ - --fasta {snakemake.config[static_data_config][reference][path]} \ - {cluster} {gender} {male} {no_gc} {no_edge} {no_rmask} \ - {snakemake.input.target} {snakemake.input.antitarget} +if [[ "{snakemake.params.args[flat]}" = "True" ]] +then + cnvkit.py reference \ + --output {snakemake.output.panel} \ + --fasta {snakemake.config[static_data_config][reference][path]} \ + {cluster} {gender} {male} {no_gc} {no_edge} {no_rmask} \ + --targets {snakemake.input.target} --antitargets {snakemake.input.antitarget} +else + cnvkit.py reference \ + --output {snakemake.output.panel} \ + --fasta {snakemake.config[static_data_config][reference][path]} \ + {cluster} {gender} {male} {no_gc} {no_edge} {no_rmask} \ + {snakemake.input.target} {snakemake.input.antitarget} +fi fn=$(basename "{snakemake.output.panel}") d=$(dirname "{snakemake.output.panel}") diff --git a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py index 29cfb5880..ccded6235 100644 --- a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py @@ -10,6 +10,8 @@ step = snakemake.config["pipeline_step"]["name"] config = snakemake.config["step_config"][step]["cnvkit"] +bams = " ".join(snakemake.input.get("bams", [""])) + shell( r""" # Also pipe everything to log file @@ -33,6 +35,63 @@ # ----------------------------------------------------------------------------- +access() +{{ + cnvkit.py access \ + -o $tmpdir/access.bed \ + {snakemake.config[static_data_config][reference][path]} +}} + +autobin() +{{ + cnvkit.py autobin --method $1 \ + --fasta {snakemake.config[static_data_config][reference][path]} \ + --access $2 \ + $(if [[ {config[bp_per_bin]} -gt 0 ]]; then \ + echo --bp-per-bin {config[bp_per_bin]} + fi) \ + $(if [[ {config[target_min_size]} -gt 0 ]]; then \ + echo --target-min-size {config[target_min_size]} + fi) \ + $(if [[ {config[target_max_size]} -gt 0 ]]; then \ + echo --target-max-size {config[target_max_size]} + fi) \ + --target-output-bed $tmpdir/target.bed --antitarget-output-bed $tmpdir/antitarget.bed \ + {bams} > $tmpdir/autobin.txt +}} + +# ----------------------------------------------------------------------------- + +target="{config[path_target_regions]}" +target_avg_size={config[target_avg_size]} + +if [[ -z "$target" ]] && [[ $target_avg_size -eq 0 ]] +then + tmpdir=$(mktemp -d $TMPDIR) + + if [[ -n "{bams}" ]] + then + if [[ -z "{config[access]}" ]] + then + access + autobin wgs $tmpdir/access.bed + + target=$tmpdir/access.bed + target_avg_size=$(cat $tmpdir/autobin.txt | grep "Target:" | cut -f 3) + else + autobin amplicon "{config[access]}" + + target="{config[access]}" + target_avg_size=$(cat $tmpdir/autobin.txt | grep "Target:" | cut -f 3) + fi + else + access + + target=$tmpdir/access.bed + target_avg_size=50000 + fi +fi + cnvkit.py target \ --output {snakemake.output.target} \ $(if [[ -n "{config[annotate]}" ]]; then \ @@ -44,7 +103,7 @@ $(if [[ {config[target_avg_size]} -gt 0 ]]; then \ echo --avg-size {config[target_avg_size]} fi) \ - {config[path_target_regions]} + $target fn=$(basename "{snakemake.output.target}") d=$(dirname "{snakemake.output.target}") diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py index 2ffcc8d1a..38b8645b1 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py +++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py @@ -38,13 +38,13 @@ def minimal_config(): path_index: /path/to/bwa/index.fa panel_of_normals: - tools: ['mutect2', 'cnvkit'] + tools: ['mutect2', 'cnvkit', 'access'] mutect2: germline_resource: /path/to/germline_resource.vcf path_normals_list: "" cnvkit: path_excluded_regions: "" - path_target_regions: /path/to/target_regions.bed + path_target_regions: /path/to/regions.bed # WGS mode path_normals_list: "" data_sets: @@ -114,8 +114,8 @@ def test_mutect2_step_part_get_input_files_create_panel(panel_of_normals_workflo ) expected = { "normals": [ - "work/bwa.mutect2.prepare_panel/out/P001-N1-DNA1-WGS1.vcf.gz", - "work/bwa.mutect2.prepare_panel/out/P002-N1-DNA1-WGS1.vcf.gz", + "work/bwa.mutect2/out/bwa.mutect2.P001-N1-DNA1-WGS1.prepare.vcf.gz", + "work/bwa.mutect2/out/bwa.mutect2.P002-N1-DNA1-WGS1.prepare.vcf.gz", ], } actual = panel_of_normals_workflow.get_input_files("mutect2", "create_panel")(wildcards) @@ -125,10 +125,10 @@ def test_mutect2_step_part_get_input_files_create_panel(panel_of_normals_workflo def test_mutect2_step_part_get_output_files_prepare_panel(panel_of_normals_workflow): """Tests Mutect2StepPart._get_output_files_prepare_panel()""" expected = { - "vcf": "work/{mapper}.mutect2.prepare_panel/out/{normal_library}.vcf.gz", - "vcf_md5": "work/{mapper}.mutect2.prepare_panel/out/{normal_library}.vcf.gz.md5", - "vcf_tbi": "work/{mapper}.mutect2.prepare_panel/out/{normal_library}.vcf.gz.tbi", - "vcf_tbi_md5": "work/{mapper}.mutect2.prepare_panel/out/{normal_library}.vcf.gz.tbi.md5", + "vcf": "work/{mapper}.mutect2/out/{mapper}.mutect2.{normal_library}.prepare.vcf.gz", + "vcf_md5": "work/{mapper}.mutect2/out/{mapper}.mutect2.{normal_library}.prepare.vcf.gz.md5", + "tbi": "work/{mapper}.mutect2/out/{mapper}.mutect2.{normal_library}.prepare.vcf.gz.tbi", + "tbi_md5": "work/{mapper}.mutect2/out/{mapper}.mutect2.{normal_library}.prepare.vcf.gz.tbi.md5", } actual = panel_of_normals_workflow.get_output_files("mutect2", "prepare_panel") assert actual == expected @@ -136,7 +136,7 @@ def test_mutect2_step_part_get_output_files_prepare_panel(panel_of_normals_workf def test_mutect2_step_part_get_output_files_create_panel(panel_of_normals_workflow): """Tests Mutect2StepPart._get_output_files_create_panel()""" - base_name_out = "work/{mapper}.mutect2.create_panel/out/{mapper}.mutect2.panel_of_normals" + base_name_out = "work/{mapper}.mutect2/out/{mapper}.mutect2.panel_of_normals" expected = get_expected_output_vcf_files_dict(base_out=base_name_out) actual = panel_of_normals_workflow.get_output_files("mutect2", "create_panel") assert actual == expected @@ -144,7 +144,7 @@ def test_mutect2_step_part_get_output_files_create_panel(panel_of_normals_workfl def test_mutect2_step_part_get_log_file_prepare_panel(panel_of_normals_workflow): """Tests Mutect2StepPart._get_log_files_prepare_panel()""" - base_name_out = "work/{mapper}.mutect2.prepare_panel/log/{normal_library}" + base_name_out = "work/{mapper}.mutect2/log/{mapper}.mutect2.{normal_library}.prepare" expected = get_expected_log_files_dict(base_out=base_name_out) actual = panel_of_normals_workflow.get_log_file("mutect2", "prepare_panel") assert actual == expected @@ -152,7 +152,7 @@ def test_mutect2_step_part_get_log_file_prepare_panel(panel_of_normals_workflow) def test_mutect2_step_part_get_log_file_create_panel(panel_of_normals_workflow): """Tests Mutect2StepPart._get_log_files_create_panel()""" - base_name_out = "work/{mapper}.mutect2.create_panel/log/{mapper}.mutect2.panel_of_normals" + base_name_out = "work/{mapper}.mutect2/log/{mapper}.mutect2.panel_of_normals" expected = get_expected_log_files_dict(base_out=base_name_out) actual = panel_of_normals_workflow.get_log_file("mutect2", "create_panel") assert actual == expected @@ -190,26 +190,15 @@ def test_mutect2_step_part_get_resource_usage(panel_of_normals_workflow): # Tests for CnvkitStepPart ------------------------------------------------------------------------ -# def test_cnvkit_step_part_get_input_files_autobin(panel_of_normals_workflow): -# """Tests CnvkitStepPart._get_input_files_autobin()""" -# wildcards = Wildcards( -# fromdict={ -# "mapper": "bwa", -# } -# ) -# expected = { -# "access": "work/cnvkit.access/out/cnvkit.access.bed", -# "bams": [ -# "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", -# "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam", -# ], -# "bais": [ -# "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", -# "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam.bai", -# ], -# } -# actual = panel_of_normals_workflow.get_input_files("cnvkit", "autobin")(wildcards) -# assert actual == expected +def test_cnvkit_step_part_get_input_files_target(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_target()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + actual = panel_of_normals_workflow.get_input_files("cnvkit", "target")(wildcards) + assert actual == {} def test_cnvkit_step_part_get_input_files_antitarget(panel_of_normals_workflow): @@ -217,12 +206,10 @@ def test_cnvkit_step_part_get_input_files_antitarget(panel_of_normals_workflow): wildcards = Wildcards( fromdict={ "mapper": "bwa", - "normal_library": "P001-N1-DNA1-WGS1", } ) expected = { - "access": "work/cnvkit.access/out/cnvkit.access.bed", - "target": "work/cnvkit.target/out/cnvkit.target.bed", + "target": "work/bwa.cnvkit/out/bwa.cnvkit.target.bed", } actual = panel_of_normals_workflow.get_input_files("cnvkit", "antitarget")(wildcards) assert actual == expected @@ -239,8 +226,8 @@ def test_cnvkit_step_part_get_input_files_coverage(panel_of_normals_workflow): expected = { "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", - "target": "work/cnvkit.target/out/cnvkit.target.bed", - "antitarget": "work/cnvkit.antitarget/out/cnvkit.antitarget.bed", + "target": "work/bwa.cnvkit/out/bwa.cnvkit.target.bed", + "antitarget": "work/bwa.cnvkit/out/bwa.cnvkit.antitarget.bed", } actual = panel_of_normals_workflow.get_input_files("cnvkit", "coverage")(wildcards) assert actual == expected @@ -255,20 +242,20 @@ def test_cnvkit_step_part_get_input_files_create_panel(panel_of_normals_workflow ) expected = { "target": [ - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", + "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", + "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", ], "antitarget": [ - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn", - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn", + "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn", + "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn", ], } actual = panel_of_normals_workflow.get_input_files("cnvkit", "create_panel")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_input_files_sex(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_input_files_sex()""" +def test_cnvkit_step_part_get_input_files_report(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_input_files_report()""" wildcards = Wildcards( fromdict={ "mapper": "bwa", @@ -276,66 +263,23 @@ def test_cnvkit_step_part_get_input_files_sex(panel_of_normals_workflow): ) expected = { "target": [ - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", + "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", + "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", ], "antitarget": [ - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn", - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn", + "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn", + "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn", ], } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "sex")(wildcards) + actual = panel_of_normals_workflow.get_input_files("cnvkit", "report")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_input_files_metrics(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_input_files_metrics()""" - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - } - ) - expected = { - "target": [ - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", - ], - "antitarget": [ - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn", - "work/bwa.cnvkit.coverage/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn", - ], - } - actual = panel_of_normals_workflow.get_input_files("cnvkit", "metrics")(wildcards) - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_access(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_access()""" - expected = { - "access": "work/cnvkit.access/out/cnvkit.access.bed", - "access_md5": "work/cnvkit.access/out/cnvkit.access.bed.md5", - } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "access") - assert actual == expected - - -# def test_cnvkit_step_part_get_output_files_autobin(panel_of_normals_workflow): -# """Tests CvnkitStepPart._get_output_files_autobin()""" -# expected = { -# "target": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.target.bed", -# "target_md5": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.target.bed.md5", -# "antitarget": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.antitarget.bed", -# "antitarget_md5": "work/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.antitarget.bed.md5", -# } -# actual = panel_of_normals_workflow.get_output_files("cnvkit", "autobin") -# assert actual == expected - - def test_cnvkit_step_part_get_output_files_target(panel_of_normals_workflow): """Tests CvnkitStepPart._get_output_files_target()""" expected = { - "target": "work/cnvkit.target/out/cnvkit.target.bed", - "target_md5": "work/cnvkit.target/out/cnvkit.target.bed.md5", + "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed", + "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed.md5", } actual = panel_of_normals_workflow.get_output_files("cnvkit", "target") assert actual == expected @@ -344,8 +288,8 @@ def test_cnvkit_step_part_get_output_files_target(panel_of_normals_workflow): def test_cnvkit_step_part_get_output_files_antitarget(panel_of_normals_workflow): """Tests CvnkitStepPart._get_output_files_antitarget()""" expected = { - "antitarget": "work/cnvkit.antitarget/out/cnvkit.antitarget.bed", - "antitarget_md5": "work/cnvkit.antitarget/out/cnvkit.antitarget.bed.md5", + "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed", + "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed.md5", } actual = panel_of_normals_workflow.get_output_files("cnvkit", "antitarget") assert actual == expected @@ -354,10 +298,10 @@ def test_cnvkit_step_part_get_output_files_antitarget(panel_of_normals_workflow) def test_cnvkit_step_part_get_output_files_coverage(panel_of_normals_workflow): """Tests CvnkitStepPart._get_output_files_coverage()""" expected = { - "target": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn", - "target_md5": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn.md5", - "antitarget": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn", - "antitarget_md5": "work/{mapper}.cnvkit.coverage/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn.md5", + "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn", + "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn.md5", + "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn", + "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn.md5", } actual = panel_of_normals_workflow.get_output_files("cnvkit", "coverage") assert actual == expected @@ -366,52 +310,28 @@ def test_cnvkit_step_part_get_output_files_coverage(panel_of_normals_workflow): def test_cnvkit_step_part_get_output_files_create_panel(panel_of_normals_workflow): """Tests CvnkitStepPart._get_output_files_create_panel()""" expected = { - "panel": "work/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.cnn", - "panel_md5": "work/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", + "panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn", + "panel_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", } actual = panel_of_normals_workflow.get_output_files("cnvkit", "create_panel") assert actual == expected -def test_cnvkit_step_part_get_output_files_sex(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_sex()""" - expected = { - "table": "work/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.sex.tsv", - "table_md5": "work/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.sex.tsv.md5", - } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "sex") - assert actual == expected - - -def test_cnvkit_step_part_get_output_files_metrics(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_output_files_metrics()""" +def test_cnvkit_step_part_get_output_files_report(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_output_files_report()""" expected = { - "table": "work/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.metrics.tsv", - "table_md5": "work/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.metrics.tsv.md5", + "sex": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv", + "sex_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv.md5", + "metrics": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv", + "metrics_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv.md5", } - actual = panel_of_normals_workflow.get_output_files("cnvkit", "metrics") - assert actual == expected - - -def test_cnvkit_step_part_get_log_file_access(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_access()""" - base_name_out = "work/cnvkit.access/log/cnvkit.access" - expected = get_expected_log_files_dict(base_out=base_name_out) - actual = panel_of_normals_workflow.get_log_file("cnvkit", "access") + actual = panel_of_normals_workflow.get_output_files("cnvkit", "report") assert actual == expected -# def test_cnvkit_step_part_get_log_file_autobin(panel_of_normals_workflow): -# """Tests CvnkitStepPart._get_log_files_autobin()""" -# base_name_out = "work/{mapper}.cnvkit.autobin/log/{mapper}.cnvkit.autobin" -# expected = get_expected_log_files_dict(base_out=base_name_out) -# actual = panel_of_normals_workflow.get_log_file("cnvkit", "autobin") -# assert actual == expected - - def test_cnvkit_step_part_get_log_file_target(panel_of_normals_workflow): """Tests CvnkitStepPart._get_log_files_target()""" - base_name_out = "work/cnvkit.target/log/cnvkit.target" + base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.target" expected = get_expected_log_files_dict(base_out=base_name_out) actual = panel_of_normals_workflow.get_log_file("cnvkit", "target") assert actual == expected @@ -419,7 +339,7 @@ def test_cnvkit_step_part_get_log_file_target(panel_of_normals_workflow): def test_cnvkit_step_part_get_log_file_antitarget(panel_of_normals_workflow): """Tests CvnkitStepPart._get_log_files_antitarget()""" - base_name_out = "work/cnvkit.antitarget/log/cnvkit.antitarget" + base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.antitarget" expected = get_expected_log_files_dict(base_out=base_name_out) actual = panel_of_normals_workflow.get_log_file("cnvkit", "antitarget") assert actual == expected @@ -427,7 +347,7 @@ def test_cnvkit_step_part_get_log_file_antitarget(panel_of_normals_workflow): def test_cnvkit_step_part_get_log_file_coverage(panel_of_normals_workflow): """Tests CvnkitStepPart._get_log_files_coverage()""" - base_name_out = "work/{mapper}.cnvkit.coverage/log/{mapper}.cnvkit.coverage.{normal_library}" + base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.coverage" expected = get_expected_log_files_dict(base_out=base_name_out) actual = panel_of_normals_workflow.get_log_file("cnvkit", "coverage") assert actual == expected @@ -435,43 +355,23 @@ def test_cnvkit_step_part_get_log_file_coverage(panel_of_normals_workflow): def test_cnvkit_step_part_get_log_file_create_panel(panel_of_normals_workflow): """Tests CvnkitStepPart._get_log_files_create_panel()""" - base_name_out = "work/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.panel_of_normals" + base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals" expected = get_expected_log_files_dict(base_out=base_name_out) actual = panel_of_normals_workflow.get_log_file("cnvkit", "create_panel") assert actual == expected -def test_cnvkit_step_part_get_log_file_sex(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_sex()""" - base_name_out = "work/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.sex" - expected = get_expected_log_files_dict(base_out=base_name_out) - actual = panel_of_normals_workflow.get_log_file("cnvkit", "sex") - assert actual == expected - - -def test_cnvkit_step_part_get_log_file_metrics(panel_of_normals_workflow): - """Tests CvnkitStepPart._get_log_files_metrics()""" - base_name_out = "work/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.metrics" +def test_cnvkit_step_part_get_log_file_report(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_log_files_report()""" + base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.report" expected = get_expected_log_files_dict(base_out=base_name_out) - actual = panel_of_normals_workflow.get_log_file("cnvkit", "metrics") + actual = panel_of_normals_workflow.get_log_file("cnvkit", "report") assert actual == expected def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): """Tests CvnkitStepPart.get_resource_usage()""" # Define expected: default defined workflow.abstract - access_expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "8G", - "partition": "medium", - } - # autobin_expected_dict = { - # "threads": 2, - # "time": "02:00:00", - # "memory": "16G", - # "partition": "medium", - # } target_expected_dict = { "threads": 2, "time": "02:00:00", @@ -496,31 +396,13 @@ def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): "memory": "16G", "partition": "medium", } - sex_expected_dict = { - "threads": 2, - "time": "02:00:00", - "memory": "16G", - "partition": "medium", - } - metrics_expected_dict = { + report_expected_dict = { "threads": 2, "time": "02:00:00", "memory": "16G", "partition": "medium", } - # Evaluate action `access` - for resource, expected in access_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'access'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "access", resource) - assert actual == expected, msg_error - - # Evaluate action `autobin` - # for resource, expected in autobin_expected_dict.items(): - # msg_error = f"Assertion error for resource '{resource}' for action 'autobin'." - # actual = panel_of_normals_workflow.get_resource("cnvkit", "autobin", resource) - # assert actual == expected, msg_error - # Evaluate action `target` for resource, expected in target_expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' for action 'target'." @@ -545,16 +427,50 @@ def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): actual = panel_of_normals_workflow.get_resource("cnvkit", "create_panel", resource) assert actual == expected, msg_error - # Evaluate action `sex` - for resource, expected in reference_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'sex'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "sex", resource) + # Evaluate action `report` + for resource, expected in report_expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}' for action 'report'." + actual = panel_of_normals_workflow.get_resource("cnvkit", "report", resource) assert actual == expected, msg_error - # Evaluate action `metrics` - for resource, expected in reference_expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' for action 'metrics'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "metrics", resource) + +# Tests for AccessStepPart ------------------------------------------------------------------------- + + +def test_access_step_part_get_input_files_run(panel_of_normals_workflow): + """Tests AccessStepPart._get_input_files_run()""" + assert panel_of_normals_workflow.get_input_files("access", "run") == None + + +def test_access_step_part_get_output_files_run(panel_of_normals_workflow): + """Tests AccessStepPart._get_output_files_run()""" + expected = { + "access": "work/cnvkit.access/out/cnvkit.access.bed", + "access_md5": "work/cnvkit.access/out/cnvkit.access.bed.md5", + } + actual = panel_of_normals_workflow.get_output_files("access", "run") + assert actual == expected + + +def test_access_step_part_get_log_file_run(panel_of_normals_workflow): + """Tests AccessStepPart._get_log_file_run()""" + expected = get_expected_log_files_dict(base_out="work/cnvkit.access/log/cnvkit.access") + actual = panel_of_normals_workflow.get_log_file("access", "run") + assert actual == expected + + +def test_access_step_part_get_resource_usage(panel_of_normals_workflow): + """Tests AccessStepPart.get_resource_usage()""" + # Define expected: default defined workflow.abstract + expected_dict = { + "threads": 2, + "time": "02:00:00", + "memory": "8G", + "partition": "medium", + } + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}' for action 'run'." + actual = panel_of_normals_workflow.get_resource("access", "run", resource) assert actual == expected, msg_error @@ -564,65 +480,55 @@ def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): def test_panel_of_normals_workflow(panel_of_normals_workflow): """Test simple functionality of the workflow""" # Check created sub steps - expected = ["cnvkit", "link_out", "mutect2"] + expected = ["access", "cnvkit", "link_out", "mutect2"] actual = list(sorted(panel_of_normals_workflow.sub_steps.keys())) assert actual == expected # Check result file construction expected = [] - tpl = "output/{mapper}.mutect2.create_panel/out/{mapper}.mutect2.panel_of_normals.{ext}" + tpl = "output/{mapper}.mutect2/out/{mapper}.mutect2.panel_of_normals.{ext}" expected += [ tpl.format(mapper=mapper, ext=ext) for ext in ("vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5") for mapper in ("bwa",) ] # add log files - tpl = "output/{mapper}.mutect2.create_panel/log/{mapper}.mutect2.panel_of_normals" + tpl = "output/{mapper}.mutect2/log/{mapper}.mutect2.panel_of_normals" for mapper in ("bwa",): expected += get_expected_log_files_dict(tpl.format(mapper=mapper)).values() + # Now for basic cnvkit files (panel of normal only) - tpl = "output/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.{ext}" + tpl = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.{ext}" expected += [ tpl.format(mapper=mapper, ext=ext) for ext in ("cnn", "cnn.md5") for mapper in ("bwa",) ] + tpl = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.{substep}.{ext}" + for substep in ("target", "antitarget"): + expected += [ + tpl.format(substep=substep, mapper=mapper, ext=ext) + for ext in ("bed", "bed.md5") + for mapper in ("bwa",) + ] + tpl = "output/{mapper}.cnvkit/report/{mapper}.cnvkit.{substep}.{ext}" + for substep in ("sex", "metrics"): + expected += [ + tpl.format(substep=substep, mapper=mapper, ext=ext) + for ext in ("tsv", "tsv.md5") + for mapper in ("bwa",) + ] + # add log files + tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.{substep}" + for substep in ("target", "antitarget", "panel_of_normals", "report"): + for mapper in ("bwa",): + expected += get_expected_log_files_dict( + tpl.format(mapper=mapper, substep=substep) + ).values() + + # Access tpl = "output/cnvkit.access/out/cnvkit.access.{ext}" expected += [tpl.format(ext=ext) for ext in ("bed", "bed.md5")] - # tpl = "output/{mapper}.cnvkit.autobin/out/{mapper}.cnvkit.autobin.{ext}" - # expected += [ - # tpl.format(mapper=mapper, ext=ext) - # for ext in ("target.bed", "target.bed.md5", "antitarget.bed", "antitarget.bed.md5") - # for mapper in ("bwa",) - # ] - tpl = "output/cnvkit.target/out/cnvkit.target.{ext}" - expected += [tpl.format(ext=ext) for ext in ("bed", "bed.md5")] - tpl = "output/cnvkit.antitarget/out/cnvkit.antitarget.{ext}" - expected += [tpl.format(ext=ext) for ext in ("bed", "bed.md5")] - tpl = "output/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.sex.{ext}" - expected += [ - tpl.format(mapper=mapper, ext=ext) for ext in ("tsv", "tsv.md5") for mapper in ("bwa",) - ] - tpl = "output/{mapper}.cnvkit.create_panel/report/{mapper}.cnvkit.metrics.{ext}" - expected += [ - tpl.format(mapper=mapper, ext=ext) for ext in ("tsv", "tsv.md5") for mapper in ("bwa",) - ] - # add log files - tpl = "output/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.panel_of_normals" - for mapper in ("bwa",): - expected += get_expected_log_files_dict(tpl.format(mapper=mapper)).values() expected += get_expected_log_files_dict("output/cnvkit.access/log/cnvkit.access").values() - # tpl = "output/{mapper}.cnvkit.autobin/log/{mapper}.cnvkit.autobin" - # for mapper in ("bwa",): - # expected += get_expected_log_files_dict(tpl.format(mapper=mapper)).values() - expected += get_expected_log_files_dict("output/cnvkit.target/log/cnvkit.target").values() - expected += get_expected_log_files_dict( - "output/cnvkit.antitarget/log/cnvkit.antitarget" - ).values() - tpl = "output/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.sex" - for mapper in ("bwa",): - expected += get_expected_log_files_dict(tpl.format(mapper=mapper)).values() - tpl = "output/{mapper}.cnvkit.create_panel/log/{mapper}.cnvkit.metrics" - for mapper in ("bwa",): - expected += get_expected_log_files_dict(tpl.format(mapper=mapper)).values() + expected = list(sorted(expected)) actual = list(sorted(panel_of_normals_workflow.get_result_files())) assert actual == expected diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py new file mode 100644 index 000000000..d54974619 --- /dev/null +++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py @@ -0,0 +1,380 @@ +# -*- coding: utf-8 -*- +"""Tests for the panel_of_normals workflow module code""" + +import textwrap + +import pytest +import ruamel.yaml as ruamel_yaml +from snakemake.io import Wildcards + +from snappy_pipeline.workflows.panel_of_normals import PanelOfNormalsWorkflow + +from .common import get_expected_log_files_dict, get_expected_output_vcf_files_dict +from .conftest import patch_module_fs + + +@pytest.fixture(scope="module") # otherwise: performance issues +def minimal_config(): + """Return YAML parsing result for (cancer) configuration""" + yaml = ruamel_yaml.YAML() + return yaml.load( + textwrap.dedent( + r""" + static_data_config: + reference: + path: /path/to/ref.fa + cosmic: + path: /path/to/cosmic.vcf.gz + dbsnp: + path: /path/to/dbsnp.vcf.gz + + step_config: + ngs_mapping: + tools: + dna: ['bwa'] + compute_coverage_bed: true + path_target_regions: /path/to/regions.bed + bwa: + path_index: /path/to/bwa/index.fa + + panel_of_normals: + tools: ['cnvkit'] + cnvkit: + path_excluded_regions: "" + path_target_regions: "" # WGS mode + path_normals_list: "" + + data_sets: + first_batch: + file: sheet.tsv + search_patterns: + - {'left': '*/*/*_R1.fastq.gz', 'right': '*/*/*_R2.fastq.gz'} + search_paths: ['/path'] + type: matched_cancer + naming_scheme: only_secondary_id + """ + ).lstrip() + ) + + +@pytest.fixture +def panel_of_normals_workflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + work_dir, + config_paths, + cancer_sheet_fake_fs, + mocker, +): + """Return PanelOfNormalsWorkflow object pre-configured with germline sheet""" + # Patch out file-system related things in abstract (the crawling link in step is defined there) + patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we + # can obtain paths from the function as if we really had a NGSMappingPipelineStep there + dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} + # Construct the workflow object + return PanelOfNormalsWorkflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + config_paths, + work_dir, + ) + + +# Tests for CnvkitStepPart ------------------------------------------------------------------------ + + +def test_cnvkit_step_part_get_input_files_target(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_target()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + expected = { + "bams": [ + "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", + "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam", + ], + "bais": [ + "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", + "NGS_MAPPING/output/bwa.P002-N1-DNA1-WGS1/out/bwa.P002-N1-DNA1-WGS1.bam.bai", + ], + } + actual = panel_of_normals_workflow.get_input_files("cnvkit", "target")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_antitarget(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_antitarget()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + actual = panel_of_normals_workflow.get_input_files("cnvkit", "antitarget")(wildcards) + assert actual == {} + + +def test_cnvkit_step_part_get_input_files_coverage(panel_of_normals_workflow): + """Tests CnvkitStepPart._get_input_files_coverage()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "normal_library": "P001-N1-DNA1-WGS1", + } + ) + expected = { + "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", + "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", + "target": "work/bwa.cnvkit/out/bwa.cnvkit.target.bed", + "antitarget": "work/bwa.cnvkit/out/bwa.cnvkit.antitarget.bed", + } + actual = panel_of_normals_workflow.get_input_files("cnvkit", "coverage")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_create_panel(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_input_files_create_panel()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + expected = { + "target": [ + "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", + "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", + ], + "antitarget": [ + "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn", + "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn", + ], + } + actual = panel_of_normals_workflow.get_input_files("cnvkit", "create_panel")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_input_files_report(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_input_files_report()""" + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + } + ) + expected = { + "target": [ + "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.targetcoverage.cnn", + "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.targetcoverage.cnn", + ], + "antitarget": [ + "work/bwa.cnvkit/out/bwa.cnvkit.P001-N1-DNA1-WGS1.antitargetcoverage.cnn", + "work/bwa.cnvkit/out/bwa.cnvkit.P002-N1-DNA1-WGS1.antitargetcoverage.cnn", + ], + } + actual = panel_of_normals_workflow.get_input_files("cnvkit", "report")(wildcards) + assert actual == expected + + +def test_cnvkit_step_part_get_output_files_target(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_output_files_target()""" + expected = { + "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed", + "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed.md5", + } + actual = panel_of_normals_workflow.get_output_files("cnvkit", "target") + assert actual == expected + + +def test_cnvkit_step_part_get_output_files_antitarget(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_output_files_antitarget()""" + expected = { + "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed", + "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed.md5", + } + actual = panel_of_normals_workflow.get_output_files("cnvkit", "antitarget") + assert actual == expected + + +def test_cnvkit_step_part_get_output_files_coverage(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_output_files_coverage()""" + expected = { + "target": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn", + "target_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.targetcoverage.cnn.md5", + "antitarget": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn", + "antitarget_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.{normal_library}.antitargetcoverage.cnn.md5", + } + actual = panel_of_normals_workflow.get_output_files("cnvkit", "coverage") + assert actual == expected + + +def test_cnvkit_step_part_get_output_files_create_panel(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_output_files_create_panel()""" + expected = { + "panel": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn", + "panel_md5": "work/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.cnn.md5", + } + actual = panel_of_normals_workflow.get_output_files("cnvkit", "create_panel") + assert actual == expected + + +def test_cnvkit_step_part_get_output_files_report(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_output_files_report()""" + expected = { + "sex": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv", + "sex_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.sex.tsv.md5", + "metrics": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv", + "metrics_md5": "work/{mapper}.cnvkit/report/{mapper}.cnvkit.metrics.tsv.md5", + } + actual = panel_of_normals_workflow.get_output_files("cnvkit", "report") + assert actual == expected + + +def test_cnvkit_step_part_get_log_file_target(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_log_files_target()""" + base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.target" + expected = get_expected_log_files_dict(base_out=base_name_out) + actual = panel_of_normals_workflow.get_log_file("cnvkit", "target") + assert actual == expected + + +def test_cnvkit_step_part_get_log_file_antitarget(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_log_files_antitarget()""" + base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.antitarget" + expected = get_expected_log_files_dict(base_out=base_name_out) + actual = panel_of_normals_workflow.get_log_file("cnvkit", "antitarget") + assert actual == expected + + +def test_cnvkit_step_part_get_log_file_coverage(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_log_files_coverage()""" + base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.{normal_library}.coverage" + expected = get_expected_log_files_dict(base_out=base_name_out) + actual = panel_of_normals_workflow.get_log_file("cnvkit", "coverage") + assert actual == expected + + +def test_cnvkit_step_part_get_log_file_create_panel(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_log_files_create_panel()""" + base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.panel_of_normals" + expected = get_expected_log_files_dict(base_out=base_name_out) + actual = panel_of_normals_workflow.get_log_file("cnvkit", "create_panel") + assert actual == expected + + +def test_cnvkit_step_part_get_log_file_report(panel_of_normals_workflow): + """Tests CvnkitStepPart._get_log_files_report()""" + base_name_out = "work/{mapper}.cnvkit/log/{mapper}.cnvkit.report" + expected = get_expected_log_files_dict(base_out=base_name_out) + actual = panel_of_normals_workflow.get_log_file("cnvkit", "report") + assert actual == expected + + +def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): + """Tests CvnkitStepPart.get_resource_usage()""" + # Define expected: default defined workflow.abstract + target_expected_dict = { + "threads": 2, + "time": "02:00:00", + "memory": "8G", + "partition": "medium", + } + antitarget_expected_dict = { + "threads": 2, + "time": "02:00:00", + "memory": "8G", + "partition": "medium", + } + coverage_expected_dict = { + "threads": 8, + "time": "02:00:00", + "memory": "16G", + "partition": "medium", + } + reference_expected_dict = { + "threads": 2, + "time": "02:00:00", + "memory": "16G", + "partition": "medium", + } + report_expected_dict = { + "threads": 2, + "time": "02:00:00", + "memory": "16G", + "partition": "medium", + } + + # Evaluate action `target` + for resource, expected in target_expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}' for action 'target'." + actual = panel_of_normals_workflow.get_resource("cnvkit", "target", resource) + assert actual == expected, msg_error + + # Evaluate action `antitarget` + for resource, expected in antitarget_expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}' for action 'antitarget'." + actual = panel_of_normals_workflow.get_resource("cnvkit", "antitarget", resource) + assert actual == expected, msg_error + + # Evaluate action `coverage` + for resource, expected in coverage_expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}' for action 'coverage'." + actual = panel_of_normals_workflow.get_resource("cnvkit", "coverage", resource) + assert actual == expected, msg_error + + # Evaluate action `create_panel` + for resource, expected in reference_expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}' for action 'create_panel'." + actual = panel_of_normals_workflow.get_resource("cnvkit", "create_panel", resource) + assert actual == expected, msg_error + + # Evaluate action `report` + for resource, expected in report_expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}' for action 'report'." + actual = panel_of_normals_workflow.get_resource("cnvkit", "report", resource) + assert actual == expected, msg_error + + +# PanelOfNormalsWorkflow -------------------------------------------------------------------------- + + +def test_panel_of_normals_workflow(panel_of_normals_workflow): + """Test simple functionality of the workflow""" + # Check created sub steps + expected = ["access", "cnvkit", "link_out", "mutect2"] + actual = list(sorted(panel_of_normals_workflow.sub_steps.keys())) + assert actual == expected + expected = [] + + # Now for basic cnvkit files (panel of normal only) + tpl = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.{ext}" + expected += [ + tpl.format(mapper=mapper, ext=ext) for ext in ("cnn", "cnn.md5") for mapper in ("bwa",) + ] + tpl = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.{substep}.{ext}" + for substep in ("target", "antitarget"): + expected += [ + tpl.format(substep=substep, mapper=mapper, ext=ext) + for ext in ("bed", "bed.md5") + for mapper in ("bwa",) + ] + tpl = "output/{mapper}.cnvkit/report/{mapper}.cnvkit.{substep}.{ext}" + for substep in ("sex", "metrics"): + expected += [ + tpl.format(substep=substep, mapper=mapper, ext=ext) + for ext in ("tsv", "tsv.md5") + for mapper in ("bwa",) + ] + # add log files + tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.{substep}" + for substep in ("target", "antitarget", "panel_of_normals", "report"): + for mapper in ("bwa",): + expected += get_expected_log_files_dict( + tpl.format(mapper=mapper, substep=substep) + ).values() + + expected = list(sorted(expected)) + actual = list(sorted(panel_of_normals_workflow.get_result_files())) + assert actual == expected From 1511006c813dda6bbacaf2eee4619d5ba3ffd064 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Wed, 11 Jan 2023 09:13:48 +0100 Subject: [PATCH 07/10] Changes to panel of normals --- .../workflows/panel_of_normals/__init__.py | 17 ++++-- .../wrappers/cnvkit/access/wrapper.py | 2 +- .../wrappers/cnvkit/antitarget/wrapper.py | 4 +- .../wrappers/cnvkit/coverage/wrapper.py | 21 +++++++- .../wrappers/cnvkit/reference/wrapper.py | 2 +- .../wrappers/cnvkit/target/wrapper.py | 52 +++++++------------ 6 files changed, 57 insertions(+), 41 deletions(-) diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py b/snappy_pipeline/workflows/panel_of_normals/__init__.py index 37bb5483c..714d28d3e 100644 --- a/snappy_pipeline/workflows/panel_of_normals/__init__.py +++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py @@ -68,18 +68,26 @@ - Panel of normal for ``mutect2`` somatic variant caller - Panel of normal for ``cvnkit`` somatic Copy Number Alterations caller +``access`` is used to create a genome accessibility file that can be used for ``cnvkit`` panel of normals creation. +Its output (``output/cnvkit.access/out/cnvkit.access.bed``) is optional, but its presence impacts of the way the target and antitarget regions are computed in whole genome mode. + +In a nutshell, for exome data, the accessibility file is only used to create antitarget regions. +For genome data, it is used by the ``autobin`` tool to compute the average target size used during target regions creation. +If it is present, the target size is computed in amplicon mode, and when it is absent, +an accessibility file is created with default settings, which value is used by ``autobin`` is whole genome mode. + +This follows the internal ``batch`` code of ``cnvkit``. + ======= Reports ======= -Report tables and plots can be found in the ``output/{mapper}.cnvkit/report`` directory. +Report tables can be found in the ``output/{mapper}.cnvkit/report`` directory. Two tables are produced, grouping results for all normal samples together: - ``metrics.txt``: coverage metrics over target and antitarget regions. - ``sex.txt``: prediction of the donor's gender based on the coverage of chromosome X & Y target and antitarget regions. -Coverage plots are also produced, for the whole genome or by chromosome. - The cnvkit authors recommend to check these reports to ensure that all data is suitable for panel of normal creation. """ @@ -139,6 +147,7 @@ access: "" # Access bed file (output/cnvkit.access/out/cnvkit.access.bed when create_cnvkit_acces was run) annotate: "" # [target] Optional targets annotations target_avg_size: 0 # [target] Average size of split target bins (0: use default value) + bp_per_bin: 50000 # [autobin] Expected base per bin split: True # [target] Split large intervals into smaller ones antitarget_avg_size: 0 # [antitarget] Average size of antitarget bins (0: use default value) min_size: 0 # [antitarget] Min size of antitarget bins (0: use default value) @@ -378,7 +387,7 @@ def get_resource_usage(self, action): return self.resource_usage_dict.get(action) def get_input_files(self, action): - """Return input files for mutect2 variant calling""" + """Return input files for cnvkit panel of normals creation""" # Validate action self._validate_action(action) mapping = { diff --git a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py index 84799f6f4..c72460394 100644 --- a/snappy_wrappers/wrappers/cnvkit/access/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/access/wrapper.py @@ -7,7 +7,7 @@ __author__ = "Manuel Holtgrewe" __email__ = "manuel.holtgrewe@bih-charite.de" -config = snakemake.config["step_config"][snakemake.config["pipeline_step"]["name"]]["cnvkit"] +config = snakemake.config["step_config"][snakemake.config["pipeline_step"]["name"]]["access"] exclude = " --exclude " + " -x ".join(config["exclude"]) if config["exclude"] else "" diff --git a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py index 83a153c97..2154efe06 100644 --- a/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/antitarget/wrapper.py @@ -10,6 +10,8 @@ step = snakemake.config["pipeline_step"]["name"] config = snakemake.config["step_config"][step]["cnvkit"] +target = snakemake.input.get("target", "") + shell( r""" # Also pipe everything to log file @@ -46,7 +48,7 @@ $(if [[ {config[min_size]} -gt 0 ]]; then \ echo --min-size {config[min_size]} fi) \ - {snakemake.input.target} + {target} else touch {snakemake.output.antitarget} fi diff --git a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py index f7d79c9b6..47fa87535 100644 --- a/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/coverage/wrapper.py @@ -10,6 +10,23 @@ step = snakemake.config["pipeline_step"]["name"] config = snakemake.config["step_config"][step]["cnvkit"] +# During panel_of_normals step, the target regions are created by the target substep. +# During somatic CNV calling (both exome & wgs), the target regions are obtained from the configuration +if "target" in snakemake.input.keys(): + target = snakemake.input.target +elif "path_target" in config.keys(): + target = config["path_target"] +else: + raise Exception("Unsupported naming") + +# Same for antitarget regions +if "antitarget" in snakemake.input.keys(): + antitarget = snakemake.input.antitarget +elif "path_antitarget" in config.keys(): + antitarget = config["path_antitarget"] +else: + raise Exception("Unsupported naming") + shell( r""" # Also pipe everything to log file @@ -57,10 +74,10 @@ # ----------------------------------------------------------------------------- -coverage {snakemake.input.target} {snakemake.output.target} +coverage {target} {snakemake.output.target} md5 {snakemake.output.target} -coverage {snakemake.input.antitarget} {snakemake.output.antitarget} +coverage {antitarget} {snakemake.output.antitarget} md5 {snakemake.output.antitarget} """ ) diff --git a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py index 73ef5e176..987987df2 100644 --- a/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/reference/wrapper.py @@ -22,7 +22,7 @@ gender = " --gender {}".format(config["gender"]) if config["gender"] else "" male = " --male-reference" if config["male_reference"] else "" no_gc = " --no-gc" if not config["gc_correction"] else "" -no_edge = " --no-edge" if not config["edge_correction"] else "" +no_edge = " --no-edge" if not config["edge_correction"] or not config["path_target_regions"] else "" no_rmask = " --no-rmask" if not config["rmask_correction"] else "" shell( diff --git a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py index ccded6235..f15c0bcf0 100644 --- a/snappy_wrappers/wrappers/cnvkit/target/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/target/wrapper.py @@ -42,24 +42,6 @@ {snakemake.config[static_data_config][reference][path]} }} -autobin() -{{ - cnvkit.py autobin --method $1 \ - --fasta {snakemake.config[static_data_config][reference][path]} \ - --access $2 \ - $(if [[ {config[bp_per_bin]} -gt 0 ]]; then \ - echo --bp-per-bin {config[bp_per_bin]} - fi) \ - $(if [[ {config[target_min_size]} -gt 0 ]]; then \ - echo --target-min-size {config[target_min_size]} - fi) \ - $(if [[ {config[target_max_size]} -gt 0 ]]; then \ - echo --target-max-size {config[target_max_size]} - fi) \ - --target-output-bed $tmpdir/target.bed --antitarget-output-bed $tmpdir/antitarget.bed \ - {bams} > $tmpdir/autobin.txt -}} - # ----------------------------------------------------------------------------- target="{config[path_target_regions]}" @@ -67,28 +49,34 @@ if [[ -z "$target" ]] && [[ $target_avg_size -eq 0 ]] then - tmpdir=$(mktemp -d $TMPDIR) + tmpdir=$(mktemp -d) if [[ -n "{bams}" ]] then + access + cnvkit.py autobin --method wgs \ + --fasta {snakemake.config[static_data_config][reference][path]} \ + --access $tmpdir/access.bed \ + --bp-per-bin {config[bp_per_bin]} \ + --target-output-bed $tmpdir/target.bed --antitarget-output-bed $tmpdir/antitarget.bed \ + {bams} > $tmpdir/autobin.txt + target_avg_size=$(cat $tmpdir/autobin.txt | grep "Target:" | cut -f 3) + if [[ -z "{config[access]}" ]] then - access - autobin wgs $tmpdir/access.bed - target=$tmpdir/access.bed - target_avg_size=$(cat $tmpdir/autobin.txt | grep "Target:" | cut -f 3) else - autobin amplicon "{config[access]}" - target="{config[access]}" - target_avg_size=$(cat $tmpdir/autobin.txt | grep "Target:" | cut -f 3) fi else - access - - target=$tmpdir/access.bed - target_avg_size=50000 + if [[ -z "{config[access]}" ]] + then + access + target=$tmpdir/access.bed + else + target="{config[access]}" + fi + target_avg_size=5000 fi fi @@ -100,8 +88,8 @@ $(if [[ "{config[split]}" = "True" ]]; then \ echo --split fi) \ - $(if [[ {config[target_avg_size]} -gt 0 ]]; then \ - echo --avg-size {config[target_avg_size]} + $(if [[ $target_avg_size -gt 0 ]]; then \ + echo --avg-size $target_avg_size fi) \ $target From 405d5e715abcb16f2ac5c9425a8f201c9440e797 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Wed, 11 Jan 2023 13:31:13 +0100 Subject: [PATCH 08/10] Updated CNVkit usage for exome & wgs --- .../__init__.py | 377 ++++++++--------- .../cnvkit.rules | 72 +--- .../somatic_wgs_cnv_calling/Snakefile | 17 +- .../somatic_wgs_cnv_calling/__init__.py | 367 ++++++++++++++++- .../somatic_wgs_cnv_calling/cnvkit.rules | 110 +++++ .../wrappers/cnvkit/call/wrapper.py | 40 +- .../wrappers/cnvkit/export/wrapper.py | 12 +- .../wrappers/cnvkit/fix/wrapper.py | 36 +- .../wrappers/cnvkit/metrics/environment.yaml | 1 - .../wrappers/cnvkit/metrics/wrapper.py | 62 --- .../wrappers/cnvkit/plot/wrapper.py | 141 +++++-- .../wrappers/cnvkit/report/wrapper.py | 204 ++++++++-- .../wrappers/cnvkit/segment/wrapper.py | 32 +- .../wrappers/cnvkit/sex/environment.yaml | 1 - .../wrappers/cnvkit/sex/wrapper.py | 56 --- ...kflows_somatic_targeted_seq_cnv_calling.py | 372 +++++------------ .../test_workflows_somatic_wgs_cnv_calling.py | 381 ++++++++++++++++-- 17 files changed, 1452 insertions(+), 829 deletions(-) create mode 100644 snappy_pipeline/workflows/somatic_wgs_cnv_calling/cnvkit.rules delete mode 120000 snappy_wrappers/wrappers/cnvkit/metrics/environment.yaml delete mode 100644 snappy_wrappers/wrappers/cnvkit/metrics/wrapper.py delete mode 120000 snappy_wrappers/wrappers/cnvkit/sex/environment.yaml delete mode 100644 snappy_wrappers/wrappers/cnvkit/sex/wrapper.py diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py index 3245f07be..4e017c21f 100644 --- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py @@ -29,52 +29,56 @@ of this tool. In the future, this section will contain "common" output and tool-specific output sub sections. -- ``{mapper}.cnvkit.export.{lib_name}-{lib_pk}/out/`` - - ``{mapper}.cnvkit.export.{lib_name}-{lib_pk}.bed`` - - ``{mapper}.cnvkit.export.{lib_name}-{lib_pk}.seg`` - - ``{mapper}.cnvkit.export.{lib_name}-{lib_pk}.vcf.gz`` - - ``{mapper}.cnvkit.export.{lib_name}-{lib_pk}.vcf.gz.tbi`` -- ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}/out`` - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.diagram.pdf`` - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.scatter.pdf`` - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.heatmap.pdf`` - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.heatmap.chr1.pdf`` +- ``{mapper}.cnvkit.{lib_name}-{lib_pk}/out/`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.bed`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.seg`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.vcf.gz`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.vcf.gz.tbi`` +- ``{mapper}.cnvkit.{lib_name}-{lib_pk}/report`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.diagram.pdf`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.scatter.pdf`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.heatmap.pdf`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.heatmap.chr1.pdf`` - ... - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.scatter.chrX.pdf`` -- ``{mapper}.cnvkit.report.{lib_name}-{lib_pk}/out`` - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.breaks.txt`` - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.gainloss.txt`` - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.gender.txt`` - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.metrics.txt`` - - ``{mapper}.cnvkit.plot.{lib_name}-{lib_pk}.segmetrics.txt`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.scatter.chrX.pdf`` +- ``{mapper}.cnvkit.{lib_name}-{lib_pk}/report`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.breaks.txt`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.genemetrics.txt`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.gender.txt`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.metrics.txt`` + - ``{mapper}.cnvkit.{lib_name}-{lib_pk}.segmetrics.txt`` For example: :: output/ - |-- bwa.cnvkit.export.P001-T1-DNA1-WES1-000007 + |-- bwa.cnvkit.P001-T1-DNA1-WES1-000007 | `-- out - | |-- bwa.cnvkit.export.P001-T1-DNA1-WES1-000007.bed - | |-- bwa.cnvkit.export.P001-T1-DNA1-WES1-000007.seg - | `-- bwa.cnvkit.export.P001-T1-DNA1-WES1-000007.vcf - |-- bwa.cnvkit.plot.P002-T1-DNA1-WES1-000016 - | `-- out - | |-- bwa.cnvkit.plot.P002-T1-DNA1-WES1-000016.diagram.pdf - | |-- bwa.cnvkit.plot.P002-T1-DNA1-WES1-000016.heatmap.pdf - | |-- bwa.cnvkit.plot.P002-T1-DNA1-WES1-000016.scatter.pdf - | |-- bwa.cnvkit.plot.P002-T1-DNA1-WES1-000016.heatmap.chr1.pdf + | |-- bwa.cnvkit.P001-T1-DNA1-WES1-000007.bed + | |-- bwa.cnvkit.P001-T1-DNA1-WES1-000007.seg + | `-- bwa.cnvkit.P001-T1-DNA1-WES1-000007.vcf + |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016 + | `-- report + | |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.diagram.pdf + | |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.heatmap.pdf + | |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.scatter.pdf + | |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.heatmap.chr1.pdf | |-- ... - | `-- bwa.cnvkit.plot.P002-T1-DNA1-WES1-000016.scatter.chrX.pdf - |-- bwa.cnvkit.report.P002-T1-DNA1-WES1-000016 - | `-- out - | |-- bwa.cnvkit.report.P002-T1-DNA1-WES1-000016.breaks.txt - | |-- bwa.cnvkit.report.P002-T1-DNA1-WES1-000016.gainloss.txt - | |-- bwa.cnvkit.report.P002-T1-DNA1-WES1-000016.gender.txt - | |-- bwa.cnvkit.report.P002-T1-DNA1-WES1-000016.metrics.txt - | `-- bwa.cnvkit.report.P002-T1-DNA1-WES1-000016.segmetrics.txt + | `-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.scatter.chrX.pdf + |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016 + | `-- report + | |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.breaks.txt + | |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.genemetrics.txt + | |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.gender.txt + | |-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.metrics.txt + | `-- bwa.cnvkit.P002-T1-DNA1-WES1-000016.segmetrics.txt [...] +Note that tool ``cnvetti`` doesn't follow the snappy convention above: +the tool name is followed by an underscore & the action, where the action is one of ``coverage``, ``segment`` and ``postprocess``. +For example, the output directory would contain a directory named ``bwa.cnvetti_coverage.P002-T1-DNA1-WES1-000016``. + ===================== Default Configuration ===================== @@ -92,7 +96,7 @@ """ from collections import OrderedDict -import itertools +from itertools import chain import os import os.path import sys @@ -119,15 +123,44 @@ tools: ['cnvkit'] # REQUIRED - available: 'cnvkit', 'copywriter', 'cnvetti_on_target' and 'cnvetti_off_target' path_ngs_mapping: ../ngs_mapping # REQUIRED cnvkit: - path_target_regions: REQUIRED # REQUIRED - seg_method: haar - seg_threshold: 0.0001 + path_target: REQUIRED # Usually ../panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed + path_antitarget: REQUIRED # Usually ../panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed + path_panel_of_normals: REQUIRED # Usually ../panel_of_normals/output/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.cnn + min_mapq: 0 # [coverage] Mininum mapping quality score to count a read for coverage depth + count: False # [coverage] Alternative couting algorithm + gc_correction: True # [fix] Use GC correction + edge_correction: True # [fix] Use edge correction + rmask_correction: True # [fix] Use rmask correction # BCBIO uses # seg_method: haar # seg_threshold: 0.0001 # -- OR # seg_method: cbs # seg_threshold: 0.000001 + segmentation_method: cbs # [segment] One of cbs, flasso, haar, hmm, hmm-tumor, hmm-germline, none + segmentation_threshold: 0.000001 # [segment] Significance threshold (hmm methods: smoothing window size) + drop_low_coverage: False # [segment, call, genemetrics] Drop very low coverage bins + drop_outliers: 10 # [segment] Drop outlier bins (0 for no outlier filtering) + smooth_cbs: True # [segment] Additional smoothing of CBS segmentation (WARNING- not the default value) + center: "" # [call] Either one of mean, median, mode, biweight, or a constant log2 ratio value. + filter: ampdel # [call] One of ampdel, cn, ci, sem (merging segments flagged with the specified filter), "" for no filtering + calling_method: threshold # [call] One of threshold, clonal, none + call_thresholds: "-1.1,-0.25,0.2,0.7" # [call] Thresholds for calling integer copy number + ploidy: 2 # [call] Ploidy of sample cells + purity: 0 # [call] Estimated tumor cell fraction (0 for discarding tumor cell purity) + gender: "" # [call, diagram] Specify the chromosomal sex of all given samples as male or female. Guess when missing + male_reference: False # [call, diagram] Create male reference + diagram_threshold: 0.5 # [diagram] Copy number change threshold to label genes + diagram_min_probes: 3 # [diagram] Min number of covered probes to label genes + shift_xy: True # [diagram] Shift X & Y chromosomes according to sample sex + breaks_min_probes: 1 # [breaks] Min number of covered probes for a break inside the gene + genemetrics_min_probes: 3 # [genemetrics] Min number of covered probes to consider a gene + genemetrics_threshold: 0.2 # [genemetrics] Min abs log2 change to consider a gene + genemetrics_alpha: 0.05 # [genemetrics] Significance cutoff + genemetrics_bootstrap: 100 # [genemetrics] Number of bootstraps + segmetrics_alpha: 0.05 # [segmetrics] Significance cutoff + segmetrics_bootstrap: 100 # [segmetrics] Number of bootstraps + smooth_bootstrap: False # [segmetrics] Smooth bootstrap results copywriter: path_target_regions: REQUIRED # REQUIRED bin_size: 20000 # TODO: make actually configurable @@ -394,14 +427,10 @@ class CnvKitStepPart(SomaticTargetedSeqCnvCallingStepPart): #: Class available actions actions = ( - "access", - "target", - "antitarget", "coverage", - "reference", "fix", - "call", "segment", + "call", "export", "plot", "report", @@ -411,12 +440,17 @@ class CnvKitStepPart(SomaticTargetedSeqCnvCallingStepPart): resource_usage_dict = { "plot": ResourceUsage( threads=1, - time="1-00:00:00", # 1 day + time="08:00:00", # 1 day memory=f"{30 * 1024}M", ), + "coverage": ResourceUsage( + threads=8, + time="08:00:00", # 8 hours + memory=f"{16 * 1024}M", + ), "default": ResourceUsage( threads=1, - time="1-00:00:00", # 1 day + time="03:59:59", # 1 day memory=f"{int(7.5 * 1024)}M", ), } @@ -429,20 +463,24 @@ def check_config(self): if "cnvkit" not in self.config["tools"]: return # cnvkit not enabled, skip self.parent.ensure_w_config( - ("step_config", "somatic_targeted_seq_cnv_calling", "cnvkit", "path_target_regions"), + ("step_config", "somatic_targeted_seq_cnv_calling", "cnvkit", "path_target"), "Path to target regions is missing for cnvkit", ) + self.parent.ensure_w_config( + ("step_config", "somatic_targeted_seq_cnv_calling", "cnvkit", "path_antitarget"), + "Path to antitarget regions is missing for cnvkit", + ) + self.parent.ensure_w_config( + ("step_config", "somatic_targeted_seq_cnv_calling", "cnvkit", "path_panel_of_normals"), + "Path to panel of normals (reference) is missing for cnvkit", + ) def get_input_files(self, action): """Return input paths input function, dependent on rule""" # Validate action self._validate_action(action) method_mapping = { - "access": None, - "target": self._get_input_files_target, - "antitarget": self._get_input_files_antitarget, "coverage": self._get_input_files_coverage, - "reference": self._get_input_files_reference, "call": self._get_input_files_call, "fix": self._get_input_files_fix, "segment": self._get_input_files_segment, @@ -452,232 +490,173 @@ def get_input_files(self, action): } return method_mapping[action] - @staticmethod - def _get_input_files_target(_): - input_files = {"access": "work/cnvkit.access/out/access.bed"} - return input_files - - @staticmethod - def _get_input_files_antitarget(_): - input_files = { - "access": "work/cnvkit.access/out/access.bed", - "target": "work/cnvkit.target/out/target.bed", - } - return input_files - def _get_input_files_coverage(self, wildcards): - input_files = { - "target": "work/cnvkit.target/out/target.bed", - "antitarget": "work/cnvkit.antitarget/out/antitarget.bed", - } # BAM/BAI file ngs_mapping = self.parent.sub_workflows["ngs_mapping"] base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format(**wildcards) - input_files["bam"] = ngs_mapping(base_path + ".bam") - input_files["bai"] = ngs_mapping(base_path + ".bam.bai") - return input_files - - def _get_input_files_reference(self, wildcards): input_files = { - "target": "work/cnvkit.target/out/target.bed", - "antitarget": "work/cnvkit.antitarget/out/antitarget.bed", + "bam": ngs_mapping(base_path + ".bam"), + "bai": ngs_mapping(base_path + ".bam.bai"), } - # BAM/BAI file - ngs_mapping = self.parent.sub_workflows["ngs_mapping"] - base_path = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format( - normal_library=self.get_normal_lib_name(wildcards), **wildcards - ) - input_files["bam"] = ngs_mapping(base_path + ".bam") - input_files["bai"] = ngs_mapping(base_path + ".bam.bai") return input_files @staticmethod def _get_input_files_fix(wildcards): - tpl_base = "{mapper}.cnvkit.{substep}.{library_name}" - tpl = "work/" + tpl_base + "/out/" + tpl_base + ".cnn" - input_files = {"ref": tpl.format(substep="reference", **wildcards)} + tpl_base = "{mapper}.cnvkit.{library_name}" tpl = "work/" + tpl_base + "/out/" + tpl_base + ".{target}coverage.cnn" - for target in ("target", "antitarget"): - input_files[target] = tpl.format(target=target, substep="coverage", **wildcards) + input_files = { + "target": tpl.format(target="target", **wildcards), + "antitarget": tpl.format(target="antitarget", **wildcards), + } return input_files @staticmethod def _get_input_files_segment(wildcards): - cnr_pattern = ( - "work/{mapper}.cnvkit.fix.{library_name}/out/{mapper}.cnvkit.fix.{library_name}.cnr" - ) + cnr_pattern = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cnr" input_files = {"cnr": cnr_pattern.format(**wildcards)} return input_files @staticmethod def _get_input_files_call(wildcards): segment_pattern = ( - "work/{mapper}.cnvkit.segment.{library_name}/out/" - "{mapper}.cnvkit.segment.{library_name}.cns" + "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segment.cns" ) input_files = {"segment": segment_pattern.format(**wildcards)} return input_files @staticmethod def _get_input_files_export(wildcards): - cns_pattern = ( - "work/{mapper}.cnvkit.call.{library_name}/out/{mapper}.cnvkit.call.{library_name}.cns" - ) + cns_pattern = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cns" input_files = {"cns": cns_pattern.format(**wildcards)} return input_files @staticmethod def _get_input_files_plot(wildcards): - tpl = ( - "work/{mapper}.cnvkit.{substep}.{library_name}/out/" - "{mapper}.cnvkit.{substep}.{library_name}.{ext}" - ) + tpl = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{ext}" input_files = { - "cnr": tpl.format(substep="fix", ext="cnr", **wildcards), - "cns": tpl.format(substep="segment", ext="cns", **wildcards), + "cnr": tpl.format(ext="cnr", **wildcards), + "cns": tpl.format(ext="cns", **wildcards), } return input_files def _get_input_files_report(self, wildcards): - return self._get_input_files_plot(wildcards) + tpl = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{ext}" + input_files = { + "target": tpl.format(ext="targetcoverage.cnn", **wildcards), + "antitarget": tpl.format(ext="antitargetcoverage.cnn", **wildcards), + "cnr": tpl.format(ext="cnr", **wildcards), + "cns": tpl.format(ext="cns", **wildcards), + } + return input_files def get_output_files(self, action): """Return output files for the given action""" - # Validate action - self._validate_action(action) - method_mapping = { - "access": self._get_output_files_access, - "target": self._get_output_files_target, - "antitarget": self._get_output_files_antitarget, - "coverage": self._get_output_files_coverage, - "reference": self._get_output_files_reference, - "fix": self._get_output_files_fix, - "call": self._get_output_files_call, - "segment": self._get_output_files_segment, - "export": self._get_output_files_export, - "plot": self._get_output_files_plot, - "report": self._get_output_files_report, - } - return method_mapping[action]() - - @staticmethod - def _get_output_files_access(): - return "work/cnvkit.access/out/access.bed" - - @staticmethod - def _get_output_files_target(): - return "work/cnvkit.target/out/target.bed" - - @staticmethod - def _get_output_files_antitarget(): - return "work/cnvkit.antitarget/out/antitarget.bed" + if action == "coverage": + return self._get_output_files_coverage() + elif action == "fix": + return self._get_output_files_fix() + elif action == "segment": + return self._get_output_files_segment() + elif action == "call": + return self._get_output_files_call() + elif action == "export": + return self._get_output_files_export() + elif action == "plot": + return self._get_output_files_plot() + elif action == "report": + return self._get_output_files_report() + else: + self._validate_action(action) @staticmethod def _get_output_files_coverage(): - name_pattern = "{mapper}.cnvkit.coverage.{library_name}" + name_pattern = "{mapper}.cnvkit.{library_name}" output_files = {} for target in ("target", "antitarget"): output_files[target] = os.path.join( "work", name_pattern, "out", name_pattern + ".{}coverage.cnn".format(target) ) + output_files[target + "_md5"] = output_files[target] + ".md5" return output_files - @staticmethod - def _get_output_files_reference(): - name_pattern = "{mapper}.cnvkit.reference.{library_name}" - tpl = os.path.join("work", name_pattern, "out", name_pattern + ".cnn") - return tpl - @staticmethod def _get_output_files_fix(): - name_pattern = "{mapper}.cnvkit.fix.{library_name}" + name_pattern = "{mapper}.cnvkit.{library_name}" tpl = os.path.join("work", name_pattern, "out", name_pattern + ".cnr") - return tpl + return {"ratios": tpl, "ratios_md5": tpl + ".md5"} @staticmethod def _get_output_files_segment(): - name_pattern = "{mapper}.cnvkit.segment.{library_name}" - tpl = os.path.join("work", name_pattern, "out", name_pattern + ".cns") - return tpl + name_pattern = "{mapper}.cnvkit.{library_name}" + tpl = os.path.join("work", name_pattern, "out", name_pattern + ".segment.cns") + return {"segments": tpl, "segments_md5": tpl + ".md5"} @staticmethod def _get_output_files_call(): - name_pattern = "{mapper}.cnvkit.call.{library_name}" + name_pattern = "{mapper}.cnvkit.{library_name}" tpl = os.path.join("work", name_pattern, "out", name_pattern + ".cns") - return tpl + return {"calls": tpl, "calls_md5": tpl + ".md5"} @dictify def _get_output_files_plot(self): - plots = ("scatter", "diagram", "heatmap") - chroms = list(itertools.chain(range(1, 23), ["X", "Y"])) + plots = (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")) + chrom_plots = (("heatmap", "pdf"), ("scatter", "png")) + chroms = list(chain(range(1, 23), ["X", "Y"])) + output_files = {} # Yield file name pairs for global plots tpl = ( - "work/{mapper}.cnvkit.plot.{library_name}/out/" - "{mapper}.cnvkit.plot.{library_name}.{plot}.pdf" - ) - yield from ( - (plot, tpl.format(plot=plot, **format_id("mapper", "library_name"))) for plot in plots + "work/{{mapper}}.cnvkit.{{library_name}}/report/" + "{{mapper}}.cnvkit.{{library_name}}.{plot}.{ext}" ) + for plot, ext in plots: + output_files[plot] = tpl.format(plot=plot, ext=ext) + output_files[plot + "_md5"] = output_files[plot] + ".md5" # Yield file name pairs for the chromosome-wise plots - chrom_plots = ("scatter", "heatmap") tpl_chrom = ( - "work/{mapper}.cnvkit.plot.{library_name}/out/" - "{mapper}.cnvkit.plot.{library_name}.{plot}.chr{chrom}.pdf" - ) - yield from ( - ( - "{plot}_chr{chrom}".format(plot=plot, chrom=chrom), - tpl_chrom.format(plot=plot, chrom=chrom, **format_id("mapper", "library_name")), - ) - for plot in chrom_plots - for chrom in chroms + "work/{{mapper}}.cnvkit.{{library_name}}/report/" + "{{mapper}}.cnvkit.{{library_name}}.{plot}.chr{chrom}.{ext}" ) + for plot, ext in chrom_plots: + for chrom in chroms: + key = "{plot}_chr{chrom}".format(plot=plot, chrom=chrom) + output_files[key] = tpl_chrom.format(plot=plot, ext=ext, chrom=chrom) + output_files[key + "_md5"] = output_files[key] + ".md5" + return output_files @staticmethod def _get_output_files_export(): - keys = ("bed", "seg", "vcf", "vcf_tbi") - exts = ("bed", "seg", "vcf.gz", "vcf.gz.tbi") - name_pattern = "{mapper}.cnvkit.export.{library_name}" - tpl = os.path.join("work", name_pattern, "out", name_pattern + ".{ext}") + exports = (("bed", "bed"), ("seg", "seg"), ("vcf", "vcf.gz"), ("tbi", "vcf.gz.tbi")) output_files = {} - for key, ext in zip(keys, exts): - output_files[key] = tpl.format(ext=ext, **format_id("mapper", "library_name")) + tpl = ( + "work/{{mapper}}.cnvkit.{{library_name}}/out/" + "{{mapper}}.cnvkit.{{library_name}}.{ext}" + ) + for export, ext in exports: + output_files[export] = tpl.format(export=export, ext=ext) + output_files[export + "_md5"] = output_files[export] + ".md5" return output_files @dictify def _get_output_files_report(self): - reports = ("breaks", "gainloss", "gender", "metrics", "segmetrics") + reports = ("breaks", "genemetrics", "segmetrics", "sex", "metrics") + output_files = {} tpl = ( - "work/{mapper}.cnvkit.report.{library_name}/out/" - "{mapper}.cnvkit.report.{library_name}.{report}.txt" - ) - yield from ( - (report, tpl.format(report=report, **format_id("mapper", "library_name"))) - for report in reports + "work/{{mapper}}.cnvkit.{{library_name}}/report/" + "{{mapper}}.cnvkit.{{library_name}}.{report}.txt" ) + for report in reports: + output_files[report] = tpl.format(report=report) + output_files[report + "_md5"] = output_files[report] + ".md5" + return output_files - @staticmethod - def get_log_file(action): + def get_log_file(self, action): """Return path to log file for the given action""" - if action in ("access", "target", "antitarget"): - prefix = "work/cnvkit.{action}/log/cnvkit.{action}" - elif action in ( - "coverage", - "reference", - "fix", - "call", - "segment", - "export", - "plot", - "report", - ): - prefix = ( - "work/{{mapper}}.cnvkit.{action}.{{library_name}}/log/" - "{{mapper}}.cnvkit.{action}.{{library_name}}" - ) - else: - raise ValueError("Unknown action {}".format(action)) - prefix = prefix.format(action=action) + # Validate action + self._validate_action(action) + prefix = ( + "work/{{mapper}}.cnvkit.{{library_name}}/log/" + "{{mapper}}.cnvkit.{action}.{{library_name}}" + ).format(action=action) key_ext = ( ("log", ".log"), ("conda_info", ".conda_info.txt"), @@ -701,6 +680,8 @@ def get_resource_usage(self, action): self._validate_action(action) if action == "plot": return self.resource_usage_dict.get("plot") + elif action == "coverage": + return self.resource_usage_dict.get("coverage") else: return self.resource_usage_dict.get("default") @@ -908,7 +889,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) def get_result_files(self): """Return list of result files for the somatic targeted sequencing CNV calling step""" tool_actions = { - "cnvkit": ("call", "report", "export", "plot"), # ("report", "export", "plot"), + "cnvkit": ("fix", "call", "report", "export", "plot"), "copywriter": ("call",), "cnvetti_on_target": ("coverage", "segment", "postprocess"), "cnvetti_off_target": ("coverage", "segment", "postprocess"), @@ -946,7 +927,7 @@ def get_result_files(self): yield f.replace("work/", "output/") def check_config(self): - """Check that the necessary globalc onfiguration is present""" + """Check that the necessary global configuration is present""" self.ensure_w_config( ("step_config", "somatic_targeted_seq_cnv_calling", "path_ngs_mapping"), ( diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/cnvkit.rules b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/cnvkit.rules index c01f4c198..4014660c1 100644 --- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/cnvkit.rules +++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/cnvkit.rules @@ -1,52 +1,3 @@ -rule somatic_accessed_seq_cnv_calling_cnvkit_access: - output: - wf.get_output_files("cnvkit", "access"), - threads: wf.get_resource("cnvkit", "access", "threads") - resources: - time=wf.get_resource("cnvkit", "access", "time"), - memory=wf.get_resource("cnvkit", "access", "memory"), - partition=wf.get_resource("cnvkit", "access", "partition"), - tmpdir=wf.get_resource("cnvkit", "access", "tmpdir"), - log: - **wf.get_log_file("cnvkit", "access"), - wrapper: - wf.wrapper_path("cnvkit/access") - - -rule somatic_targeted_seq_cnv_calling_cnvkit_target: - input: - unpack(wf.get_input_files("cnvkit", "target")), - output: - wf.get_output_files("cnvkit", "target"), - threads: wf.get_resource("cnvkit", "target", "threads") - resources: - time=wf.get_resource("cnvkit", "target", "time"), - memory=wf.get_resource("cnvkit", "target", "memory"), - partition=wf.get_resource("cnvkit", "target", "partition"), - tmpdir=wf.get_resource("cnvkit", "target", "tmpdir"), - log: - **wf.get_log_file("cnvkit", "target"), - wrapper: - wf.wrapper_path("cnvkit/target") - - -rule somatic_targeted_seq_cnv_calling_cnvkit_antitarget: - input: - unpack(wf.get_input_files("cnvkit", "antitarget")), - output: - wf.get_output_files("cnvkit", "antitarget"), - threads: wf.get_resource("cnvkit", "antitarget", "threads") - resources: - time=wf.get_resource("cnvkit", "antitarget", "time"), - memory=wf.get_resource("cnvkit", "antitarget", "memory"), - partition=wf.get_resource("cnvkit", "antitarget", "partition"), - tmpdir=wf.get_resource("cnvkit", "antitarget", "tmpdir"), - log: - **wf.get_log_file("cnvkit", "antitarget"), - wrapper: - wf.wrapper_path("cnvkit/antitarget") - - rule somatic_targeted_seq_cnv_calling_cnvkit_coverage: input: unpack(wf.get_input_files("cnvkit", "coverage")), @@ -64,28 +15,11 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_coverage: wf.wrapper_path("cnvkit/coverage") -rule somatic_targeted_seq_cnv_calling_cnvkit_reference: - input: - unpack(wf.get_input_files("cnvkit", "reference")), - output: - wf.get_output_files("cnvkit", "reference"), - threads: wf.get_resource("cnvkit", "reference", "threads") - resources: - time=wf.get_resource("cnvkit", "reference", "time"), - memory=wf.get_resource("cnvkit", "reference", "memory"), - partition=wf.get_resource("cnvkit", "reference", "partition"), - tmpdir=wf.get_resource("cnvkit", "reference", "tmpdir"), - log: - **wf.get_log_file("cnvkit", "reference"), - wrapper: - wf.wrapper_path("cnvkit/reference") - - rule somatic_targeted_seq_cnv_calling_cnvkit_fix: input: unpack(wf.get_input_files("cnvkit", "fix")), output: - wf.get_output_files("cnvkit", "fix"), + **wf.get_output_files("cnvkit", "fix"), threads: wf.get_resource("cnvkit", "fix", "threads") resources: time=wf.get_resource("cnvkit", "fix", "time"), @@ -102,7 +36,7 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_segment: input: unpack(wf.get_input_files("cnvkit", "segment")), output: - wf.get_output_files("cnvkit", "segment"), + **wf.get_output_files("cnvkit", "segment"), threads: wf.get_resource("cnvkit", "segment", "threads") resources: time=wf.get_resource("cnvkit", "segment", "time"), @@ -119,7 +53,7 @@ rule somatic_targeted_seq_cnv_calling_cnvkit_call: input: unpack(wf.get_input_files("cnvkit", "call")), output: - wf.get_output_files("cnvkit", "call"), + **wf.get_output_files("cnvkit", "call"), threads: wf.get_resource("cnvkit", "call", "threads") resources: time=wf.get_resource("cnvkit", "call", "time"), diff --git a/snappy_pipeline/workflows/somatic_wgs_cnv_calling/Snakefile b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/Snakefile index a4e9d2dae..d24816811 100644 --- a/snappy_pipeline/workflows/somatic_wgs_cnv_calling/Snakefile +++ b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/Snakefile @@ -193,19 +193,4 @@ rule somatic_wgs_cnv_calling_control_freec_plot: # Run cnvkit ------------------------------------------------ - -rule somatic_wgs_cnv_calling_cnvkit_run: - input: - unpack(wf.get_input_files("cnvkit", "run")), - output: - **wf.get_output_files("cnvkit", "run"), - threads: wf.get_resource("cnvkit", "run", "threads") - resources: - time=wf.get_resource("cnvkit", "run", "time"), - memory=wf.get_resource("cnvkit", "run", "memory"), - partition=wf.get_resource("cnvkit", "run", "partition"), - tmpdir=wf.get_resource("cnvkit", "run", "tmpdir"), - log: - **wf.get_log_file("cnvkit", "run"), - wrapper: - wf.wrapper_path("cnvkit/wgs") +include: "cnvkit.rules" diff --git a/snappy_pipeline/workflows/somatic_wgs_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/__init__.py index 9793a8e55..6cbfdf4e2 100644 --- a/snappy_pipeline/workflows/somatic_wgs_cnv_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/__init__.py @@ -137,7 +137,44 @@ tx_obj: TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene bs_obj: BSgenome.Hsapiens.1000genomes.hs37d5::hs37d5 cnvkit: - path_annotate_refflat: REQUIRED #REQUIRED + path_target: REQUIRED # Usually ../panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed + path_antitarget: REQUIRED # Usually ../panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed + path_panel_of_normals: REQUIRED # Usually ../panel_of_normals/output/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.cnn + min_mapq: 0 # [coverage] Mininum mapping quality score to count a read for coverage depth + count: False # [coverage] Alternative couting algorithm + gc_correction: True # [fix] Use GC correction + edge_correction: True # [fix] Use edge correction + rmask_correction: True # [fix] Use rmask correction + # BCBIO uses + # seg_method: haar + # seg_threshold: 0.0001 + # -- OR + # seg_method: cbs + # seg_threshold: 0.000001 + segmentation_method: cbs # [segment] One of cbs, flasso, haar, hmm, hmm-tumor, hmm-germline, none + segmentation_threshold: 0.000001 # [segment] Significance threshold (hmm methods: smoothing window size) + drop_low_coverage: False # [segment, call, genemetrics] Drop very low coverage bins + drop_outliers: 10 # [segment] Drop outlier bins (0 for no outlier filtering) + smooth_cbs: True # [segment] Additional smoothing of CBS segmentation (WARNING- not the default value) + center: "" # [call] Either one of mean, median, mode, biweight, or a constant log2 ratio value. + filter: ampdel # [call] One of ampdel, cn, ci, sem (merging segments flagged with the specified filter), "" for no filtering + calling_method: threshold # [call] One of threshold, clonal, none + call_thresholds: "-1.1,-0.25,0.2,0.7" # [call] Thresholds for calling integer copy number + ploidy: 2 # [call] Ploidy of sample cells + purity: 0 # [call] Estimated tumor cell fraction (0 for discarding tumor cell purity) + gender: "" # [call, diagram] Specify the chromosomal sex of all given samples as male or female. Guess when missing + male_reference: False # [call, diagram] Create male reference + diagram_threshold: 0.5 # [diagram] Copy number change threshold to label genes + diagram_min_probes: 3 # [diagram] Min number of covered probes to label genes + shift_xy: True # [diagram] Shift X & Y chromosomes according to sample sex + breaks_min_probes: 1 # [breaks] Min number of covered probes for a break inside the gene + genemetrics_min_probes: 3 # [genemetrics] Min number of covered probes to consider a gene + genemetrics_threshold: 0.2 # [genemetrics] Min abs log2 change to consider a gene + genemetrics_alpha: 0.05 # [genemetrics] Significance cutoff + genemetrics_bootstrap: 100 # [genemetrics] Number of bootstraps + segmetrics_alpha: 0.05 # [segmetrics] Significance cutoff + segmetrics_bootstrap: 100 # [segmetrics] Number of bootstraps + smooth_bootstrap: False # [segmetrics] Smooth bootstrap results """ @@ -406,19 +443,243 @@ class CnvkitSomaticWgsStepPart(SomaticWgsCnvCallingStepPart): name = "cnvkit" #: Class available actions - actions = ("run",) + actions = ( + "coverage", + "fix", + "segment", + "call", + "export", + "plot", + "report", + ) + + #: Class resource usage dictionary. Key: action type (string); Value: resource (ResourceUsage). + resource_usage_dict = { + "plot": ResourceUsage( + threads=1, + time="1-00:00:00", # 1 day + memory=f"{30 * 1024}M", + ), + "coverage": ResourceUsage( + threads=8, + time="1-00:00:00", # 1 day + memory=f"{16 * 1024}M", + ), + "default": ResourceUsage( + threads=1, + time="1-00:00:00", # 1 day + memory=f"{int(7.5 * 1024)}M", + ), + } + + def __init__(self, parent): + super().__init__(parent) + + def check_config(self): + """Check configuration for cnvkit""" + if "cnvkit" not in self.config["tools"]: + return # cnvkit not enabled, skip + self.parent.ensure_w_config( + ("step_config", "somatic_wgs_cnv_calling", "cnvkit", "path_target"), + "Path to target regions is missing for cnvkit", + ) + self.parent.ensure_w_config( + ("step_config", "somatic_wgs_cnv_calling", "cnvkit", "path_antitarget"), + "Path to antitarget regions is missing for cnvkit", + ) + self.parent.ensure_w_config( + ("step_config", "somatic_wgs_cnv_calling", "cnvkit", "path_panel_of_normals"), + "Path to panel of normals (reference) is missing for cnvkit", + ) + + def get_input_files(self, action): + """Return input paths input function, dependent on rule""" + # Validate action + self._validate_action(action) + method_mapping = { + "coverage": self._get_input_files_coverage, + "call": self._get_input_files_call, + "fix": self._get_input_files_fix, + "segment": self._get_input_files_segment, + "export": self._get_input_files_export, + "plot": self._get_input_files_plot, + "report": self._get_input_files_report, + } + return method_mapping[action] + + def _get_input_files_coverage(self, wildcards): + # BAM/BAI file + ngs_mapping = self.parent.sub_workflows["ngs_mapping"] + base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format(**wildcards) + input_files = { + "bam": ngs_mapping(base_path + ".bam"), + "bai": ngs_mapping(base_path + ".bam.bai"), + } + return input_files + + @staticmethod + def _get_input_files_fix(wildcards): + tpl_base = "{mapper}.cnvkit.{library_name}" + tpl = "work/" + tpl_base + "/out/" + tpl_base + ".{target}coverage.cnn" + input_files = { + "target": tpl.format(target="target", **wildcards), + "antitarget": tpl.format(target="antitarget", **wildcards), + } + return input_files + + @staticmethod + def _get_input_files_segment(wildcards): + cnr_pattern = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cnr" + input_files = {"cnr": cnr_pattern.format(**wildcards)} + return input_files + + @staticmethod + def _get_input_files_call(wildcards): + segment_pattern = ( + "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segment.cns" + ) + input_files = {"segment": segment_pattern.format(**wildcards)} + return input_files + + @staticmethod + def _get_input_files_export(wildcards): + cns_pattern = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cns" + input_files = {"cns": cns_pattern.format(**wildcards)} + return input_files + + @staticmethod + def _get_input_files_plot(wildcards): + tpl = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{ext}" + input_files = { + "cnr": tpl.format(ext="cnr", **wildcards), + "cns": tpl.format(ext="cns", **wildcards), + } + return input_files + + def _get_input_files_report(self, wildcards): + tpl = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.{ext}" + input_files = { + "target": tpl.format(ext="targetcoverage.cnn", **wildcards), + "antitarget": tpl.format(ext="antitargetcoverage.cnn", **wildcards), + "cnr": tpl.format(ext="cnr", **wildcards), + "cns": tpl.format(ext="cns", **wildcards), + } + return input_files def get_output_files(self, action): + """Return output files for the given action""" # Validate action self._validate_action(action) - return { - "segment": self.base_path_out.format(var_caller=self.name, ext=".cns"), - "segment_md5": self.base_path_out.format(var_caller=self.name, ext=".cns.md5"), - "bins": self.base_path_out.format(var_caller=self.name, ext=".cnr"), - "bins_md5": self.base_path_out.format(var_caller=self.name, ext=".cnr.md5"), - "scatter": self.base_path_out.format(var_caller=self.name, ext=".scatter.png"), - "scatter_md5": self.base_path_out.format(var_caller=self.name, ext=".scatter.png.md5"), + method_mapping = { + "coverage": self._get_output_files_coverage, + "fix": self._get_output_files_fix, + "call": self._get_output_files_call, + "segment": self._get_output_files_segment, + "export": self._get_output_files_export, + "plot": self._get_output_files_plot, + "report": self._get_output_files_report, } + return method_mapping[action]() + + @staticmethod + def _get_output_files_coverage(): + name_pattern = "{mapper}.cnvkit.{library_name}" + output_files = {} + for target in ("target", "antitarget"): + output_files[target] = os.path.join( + "work", name_pattern, "out", name_pattern + ".{}coverage.cnn".format(target) + ) + output_files[target + "_md5"] = output_files[target] + ".md5" + return output_files + + @staticmethod + def _get_output_files_fix(): + name_pattern = "{mapper}.cnvkit.{library_name}" + tpl = os.path.join("work", name_pattern, "out", name_pattern + ".cnr") + return {"ratios": tpl, "ratios_md5": tpl + ".md5"} + + @staticmethod + def _get_output_files_segment(): + name_pattern = "{mapper}.cnvkit.{library_name}" + tpl = os.path.join("work", name_pattern, "out", name_pattern + ".segment.cns") + return {"segments": tpl, "segments_md5": tpl + ".md5"} + + @staticmethod + def _get_output_files_call(): + name_pattern = "{mapper}.cnvkit.{library_name}" + tpl = os.path.join("work", name_pattern, "out", name_pattern + ".cns") + return {"calls": tpl, "calls_md5": tpl + ".md5"} + + @dictify + def _get_output_files_plot(self): + plots = (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")) + chrom_plots = (("heatmap", "pdf"), ("scatter", "png")) + chroms = list(chain(range(1, 23), ["X", "Y"])) + output_files = {} + # Yield file name pairs for global plots + tpl = ( + "work/{{mapper}}.cnvkit.{{library_name}}/report/" + "{{mapper}}.cnvkit.{{library_name}}.{plot}.{ext}" + ) + for plot, ext in plots: + output_files[plot] = tpl.format(plot=plot, ext=ext) + output_files[plot + "_md5"] = output_files[plot] + ".md5" + # Yield file name pairs for the chromosome-wise plots + tpl_chrom = ( + "work/{{mapper}}.cnvkit.{{library_name}}/report/" + "{{mapper}}.cnvkit.{{library_name}}.{plot}.chr{chrom}.{ext}" + ) + for plot, ext in chrom_plots: + for chrom in chroms: + key = "{plot}_chr{chrom}".format(plot=plot, chrom=chrom) + output_files[key] = tpl_chrom.format(plot=plot, ext=ext, chrom=chrom) + output_files[key + "_md5"] = output_files[key] + ".md5" + return output_files + + @staticmethod + def _get_output_files_export(): + exports = (("bed", "bed"), ("seg", "seg"), ("vcf", "vcf.gz"), ("tbi", "vcf.gz.tbi")) + output_files = {} + tpl = ( + "work/{{mapper}}.cnvkit.{{library_name}}/out/" + "{{mapper}}.cnvkit.{{library_name}}.{ext}" + ) + for export, ext in exports: + output_files[export] = tpl.format(export=export, ext=ext) + output_files[export + "_md5"] = output_files[export] + ".md5" + return output_files + + @dictify + def _get_output_files_report(self): + reports = ("breaks", "genemetrics", "segmetrics", "sex", "metrics") + output_files = {} + tpl = ( + "work/{{mapper}}.cnvkit.{{library_name}}/report/" + "{{mapper}}.cnvkit.{{library_name}}.{report}.txt" + ) + for report in reports: + output_files[report] = tpl.format(report=report) + output_files[report + "_md5"] = output_files[report] + ".md5" + return output_files + + def get_log_file(self, action): + """Return path to log file for the given action""" + # Validate action + self._validate_action(action) + prefix = ( + "work/{{mapper}}.cnvkit.{{library_name}}/log/" + "{{mapper}}.cnvkit.{action}.{{library_name}}" + ).format(action=action) + key_ext = ( + ("log", ".log"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), + ) + log_files = {} + for key, ext in key_ext: + log_files[key] = prefix + ext + log_files[key + "_md5"] = prefix + ext + ".md5" + return log_files def get_resource_usage(self, action): """Get Resource Usage @@ -430,11 +691,12 @@ def get_resource_usage(self, action): """ # Validate action self._validate_action(action) - return ResourceUsage( - threads=4, - time="1-16:00:00", # 1 day and 16 hours - memory=f"{10 * 1024 * 4}M", - ) + if action == "plot": + return self.resource_usage_dict.get("plot") + elif action == "coverage": + return self.resource_usage_dict.get("coverage") + else: + return self.resource_usage_dict.get("default") class ControlFreecSomaticWgsStepPart(SomaticWgsCnvCallingStepPart): @@ -624,11 +886,38 @@ def get_result_files(self): ) # Plots for cnvetti if "cnvkit" in self.config["tools"]: + exts = (".cnr", ".cns", ".bed", ".seg", ".vcf.gz", ".vcf.gz.tbi") yield from self._yield_result_files( tpl, mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], caller="cnvkit", - ext=[".cnr", ".cnr.md5", ".cns", ".cns.md5", ".scatter.png", ".scatter.png.md5"], + ext=exts, + ) + yield from self._yield_result_files( + tpl, + mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + caller="cnvkit", + ext=[ext + ".md5" for ext in exts], + ) + reports = ("breaks", "genemetrics", "segmetrics", "sex", "metrics") + yield from self._yield_report_files( + ( + "output/{mapper}.{caller}.{cancer_library.name}/report/" + "{mapper}.{caller}.{cancer_library.name}.{ext}" + ), + [(report, "txt", False) for report in reports], + mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + caller="cnvkit", + ) + plots = (("diagram", "pdf", False), ("heatmap", "pdf", True), ("scatter", "png", True)) + yield from self._yield_report_files( + ( + "output/{mapper}.{caller}.{cancer_library.name}/report/" + "{mapper}.{caller}.{cancer_library.name}.{ext}" + ), + plots, + mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + caller="cnvkit", ) if "cnvetti" in bcf_tools: for sheet in filter(is_not_background, self.shortcut_sheets): @@ -675,6 +964,50 @@ def _yield_result_files(self, tpl, **kwargs): tpl, cancer_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs ) + def _yield_report_files(self, tpl, exts, **kwargs): + """Build report paths from path template and extension list""" + for sheet in filter(is_not_background, self.shortcut_sheets): + for sample_pair in sheet.all_sample_pairs: + if ( + not sample_pair.tumor_sample.dna_ngs_library + or not sample_pair.normal_sample.dna_ngs_library + ): + msg = ( + "INFO: sample pair for cancer bio sample {} has is missing primary" + "normal or primary cancer NGS library" + ) # pragma: no cover + print( + msg.format(sample_pair.tumor_sample.name), file=sys.stderr + ) # pragma: no cover + continue # pragma: no cover + for plot, ext, chrom in exts: + yield from expand( + tpl, + cancer_library=[sample_pair.tumor_sample.dna_ngs_library], + ext=plot + "." + ext, + **kwargs, + ) + yield from expand( + tpl, + cancer_library=[sample_pair.tumor_sample.dna_ngs_library], + ext=plot + "." + ext + ".md5", + **kwargs, + ) + if chrom: + for c in map(str, chain(range(1, 23), ("X", "Y"))): + yield from expand( + tpl, + cancer_library=[sample_pair.tumor_sample.dna_ngs_library], + ext=plot + ".chr" + c + "." + ext, + **kwargs, + ) + yield from expand( + tpl, + cancer_library=[sample_pair.tumor_sample.dna_ngs_library], + ext=plot + ".chr" + c + "." + ext + ".md5", + **kwargs, + ) + def check_config(self): """Check that the necessary configuration is available for the step""" self.ensure_w_config( @@ -688,7 +1021,3 @@ def check_config(self): "WGS CNV calling" ), ) - self.ensure_w_config( - ("step_config", "somatic_wgs_cnv_calling", "path_somatic_variant_calling"), - "Somatic (small) variant calling tool not configured for somatic WGS CNV calling", - ) diff --git a/snappy_pipeline/workflows/somatic_wgs_cnv_calling/cnvkit.rules b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/cnvkit.rules new file mode 100644 index 000000000..5e91bd261 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/cnvkit.rules @@ -0,0 +1,110 @@ +rule somatic_wgs_cnv_calling_cnvkit_coverage: + input: + unpack(wf.get_input_files("cnvkit", "coverage")), + output: + **wf.get_output_files("cnvkit", "coverage"), + threads: wf.get_resource("cnvkit", "coverage", "threads") + resources: + time=wf.get_resource("cnvkit", "coverage", "time"), + memory=wf.get_resource("cnvkit", "coverage", "memory"), + partition=wf.get_resource("cnvkit", "coverage", "partition"), + log: + **wf.get_log_file("cnvkit", "coverage"), + wrapper: + wf.wrapper_path("cnvkit/coverage") + + +rule somatic_wgs_cnv_calling_cnvkit_fix: + input: + unpack(wf.get_input_files("cnvkit", "fix")), + output: + **wf.get_output_files("cnvkit", "fix"), + threads: wf.get_resource("cnvkit", "fix", "threads") + resources: + time=wf.get_resource("cnvkit", "fix", "time"), + memory=wf.get_resource("cnvkit", "fix", "memory"), + partition=wf.get_resource("cnvkit", "fix", "partition"), + log: + **wf.get_log_file("cnvkit", "fix"), + wrapper: + wf.wrapper_path("cnvkit/fix") + + +rule somatic_wgs_cnv_calling_cnvkit_segment: + input: + unpack(wf.get_input_files("cnvkit", "segment")), + output: + **wf.get_output_files("cnvkit", "segment"), + threads: wf.get_resource("cnvkit", "segment", "threads") + resources: + time=wf.get_resource("cnvkit", "segment", "time"), + memory=wf.get_resource("cnvkit", "segment", "memory"), + partition=wf.get_resource("cnvkit", "segment", "partition"), + log: + **wf.get_log_file("cnvkit", "segment"), + wrapper: + wf.wrapper_path("cnvkit/segment") + + +rule somatic_wgs_cnv_calling_cnvkit_call: + input: + unpack(wf.get_input_files("cnvkit", "call")), + output: + **wf.get_output_files("cnvkit", "call"), + threads: wf.get_resource("cnvkit", "call", "threads") + resources: + time=wf.get_resource("cnvkit", "call", "time"), + memory=wf.get_resource("cnvkit", "call", "memory"), + partition=wf.get_resource("cnvkit", "call", "partition"), + log: + **wf.get_log_file("cnvkit", "call"), + wrapper: + wf.wrapper_path("cnvkit/call") + + +rule somatic_wgs_cnv_calling_cnvkit_plot: + input: + unpack(wf.get_input_files("cnvkit", "plot")), + output: + **wf.get_output_files("cnvkit", "plot"), + threads: wf.get_resource("cnvkit", "plot", "threads") + resources: + time=wf.get_resource("cnvkit", "plot", "time"), + memory=wf.get_resource("cnvkit", "plot", "memory"), + partition=wf.get_resource("cnvkit", "plot", "partition"), + log: + **wf.get_log_file("cnvkit", "plot"), + wrapper: + wf.wrapper_path("cnvkit/plot") + + +rule somatic_wgs_cnv_calling_cnvkit_export: + input: + unpack(wf.get_input_files("cnvkit", "export")), + output: + **wf.get_output_files("cnvkit", "export"), + threads: wf.get_resource("cnvkit", "export", "threads") + resources: + time=wf.get_resource("cnvkit", "export", "time"), + memory=wf.get_resource("cnvkit", "export", "memory"), + partition=wf.get_resource("cnvkit", "export", "partition"), + log: + **wf.get_log_file("cnvkit", "export"), + wrapper: + wf.wrapper_path("cnvkit/export") + + +rule somatic_wgs_cnv_calling_cnvkit_report: + input: + unpack(wf.get_input_files("cnvkit", "report")), + output: + **wf.get_output_files("cnvkit", "report"), + threads: wf.get_resource("cnvkit", "report", "threads") + resources: + time=wf.get_resource("cnvkit", "report", "time"), + memory=wf.get_resource("cnvkit", "report", "memory"), + partition=wf.get_resource("cnvkit", "report", "partition"), + log: + **wf.get_log_file("cnvkit", "report"), + wrapper: + wf.wrapper_path("cnvkit/report") diff --git a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py index c935ebff2..1630d2422 100644 --- a/snappy_wrappers/wrappers/cnvkit/call/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/call/wrapper.py @@ -7,6 +7,19 @@ __author__ = "Manuel Holtgrewe" __email__ = "manuel.holtgrewe@bih-charite.de" +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step]["cnvkit"] + +center = config["center"] +if center: + if center in set("mean", "median", "mode", "biweight"): + center = " --center " + center + else: + center = " --center-at" + center + +gender = " --gender {}".format(config["gender"]) if config["gender"] else "" +male = " --male-reference" if config["male_reference"] else "" + shell( r""" # Also pipe everything to log file @@ -26,14 +39,35 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +set -x + # ----------------------------------------------------------------------------- -cnvkit.py call --filter ampdel {snakemake.input} -o {snakemake.output} +cnvkit.py call \ + --output {snakemake.output.calls} \ + --method {config[calling_method]} \ + --thresholds={config[call_thresholds]} \ + $(if [[ "{config[filter]}" ]]; then \ + echo --filter {config[filter]} + fi) \ + {center} {gender} {male} \ + --ploidy {config[ploidy]} \ + $(if [[ {config[purity]} -gt 0 ]]; then \ + echo --purity {config[purity]} + fi) \ + {snakemake.input} -d=$(dirname "{snakemake.output}") +d=$(dirname "{snakemake.output.calls}") pushd $d -fn=$(basename "{snakemake.output}") +fn=$(basename "{snakemake.output.calls}") md5sum $fn > $fn.md5 popd """ ) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/snappy_wrappers/wrappers/cnvkit/export/wrapper.py b/snappy_wrappers/wrappers/cnvkit/export/wrapper.py index 8a6cc0deb..38b513d12 100644 --- a/snappy_wrappers/wrappers/cnvkit/export/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/export/wrapper.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Wrapper vor cnvkit.py export +"""Wrapper for cnvkit.py export """ from snakemake.shell import shell @@ -26,6 +26,8 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +set -x + export TMPDIR=$(mktemp -d) cnvkit.py export bed {snakemake.input} -o {snakemake.output.bed} @@ -46,6 +48,14 @@ md5sum $fn > $fn.md5 fn=$(basename "{snakemake.output.vcf}") md5sum $fn > $fn.md5 +md5sum $fn.tbi > $fn.tbi.md5 popd """ ) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py b/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py index 3b4391d24..f3e3cde0f 100644 --- a/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/fix/wrapper.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Wrapper vor cnvkit.py fix +"""Wrapper for cnvkit.py fix """ from snakemake.shell import shell @@ -7,6 +7,22 @@ __author__ = "Manuel Holtgrewe" __email__ = "manuel.holtgrewe@bih-charite.de" +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step]["cnvkit"] + +if "ref" in snakemake.input.keys(): + ref = snakemake.input.target +elif "path_panel_of_normals" in config.keys(): + ref = config["path_panel_of_normals"] +else: + raise Exception("Unsupported naming") + +gender = " --gender {}".format(config["gender"]) if config["gender"] else "" +male = " --male-reference" if config["male_reference"] else "" +no_gc = " --no-gc" if not config["gc_correction"] else "" +no_edge = " --no-edge" if not config["edge_correction"] else "" +no_rmask = " --no-rmask" if not config["rmask_correction"] else "" + shell( r""" # Also pipe everything to log file @@ -26,18 +42,28 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +set -x + # ----------------------------------------------------------------------------- cnvkit.py fix \ + --output {snakemake.output.ratios} \ + {gender} {male} {no_gc} {no_edge} {no_rmask} \ {snakemake.input.target} \ {snakemake.input.antitarget} \ - {snakemake.input.ref} \ - -o {snakemake.output} + {ref} -d=$(dirname "{snakemake.output}") +d=$(dirname "{snakemake.output.ratios}") pushd $d -fn=$(basename "{snakemake.output}") +fn=$(basename "{snakemake.output.ratios}") md5sum $fn > $fn.md5 popd """ ) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/snappy_wrappers/wrappers/cnvkit/metrics/environment.yaml b/snappy_wrappers/wrappers/cnvkit/metrics/environment.yaml deleted file mode 120000 index 2e107ac86..000000000 --- a/snappy_wrappers/wrappers/cnvkit/metrics/environment.yaml +++ /dev/null @@ -1 +0,0 @@ -../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/cnvkit/metrics/wrapper.py b/snappy_wrappers/wrappers/cnvkit/metrics/wrapper.py deleted file mode 100644 index 6667df5f5..000000000 --- a/snappy_wrappers/wrappers/cnvkit/metrics/wrapper.py +++ /dev/null @@ -1,62 +0,0 @@ -# -*- coding: utf-8 -*- -"""Wrapper for cnvkit.py metrics -""" - -from snakemake.shell import shell - -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" - -step = snakemake.config["pipeline_step"]["name"] -config = snakemake.config["step_config"][step]["cnvkit"] - -try: - segments = " --segments " + " -s ".join(snakemake.input.segments) -except AttributeError: - segments = "" - -shell( - r""" -# Also pipe everything to log file -if [[ -n "{snakemake.log.log}" ]]; then - if [[ "$(set +e; tty; set -e)" != "" ]]; then - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - exec &> >(tee -a "{snakemake.log.log}" >&2) - else - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - echo "No tty, logging disabled" >"{snakemake.log.log}" - fi -fi - -# Write out information about conda installation. -conda list >{snakemake.log.conda_list} -conda info >{snakemake.log.conda_info} -md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} -md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} - -set -x - -# ----------------------------------------------------------------------------- - -cnvkit.py metrics \ - --output {snakemake.output.table} \ - {segments} \ - $(if [[ "config[drop_low_coverage]" = "True" ]]; then \ - echo --drop-low-coverage - fi) \ - {snakemake.input.target} {snakemake.input.antitarget} - -fn=$(basename "{snakemake.output.table}") -d=$(dirname "{snakemake.output.table}") -pushd $d -md5sum $fn > $fn.md5 -popd -""" -) - -# Compute MD5 sums of logs. -shell( - r""" -md5sum {snakemake.log.log} >{snakemake.log.log_md5} -""" -) diff --git a/snappy_wrappers/wrappers/cnvkit/plot/wrapper.py b/snappy_wrappers/wrappers/cnvkit/plot/wrapper.py index fffa95c46..83f52375c 100644 --- a/snappy_wrappers/wrappers/cnvkit/plot/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/plot/wrapper.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Wrapper vor cnvkit.py plot +"""Wrapper for cnvkit.py plot """ from snakemake.shell import shell @@ -7,19 +7,24 @@ __author__ = "Manuel Holtgrewe" __email__ = "manuel.holtgrewe@bih-charite.de" -mapper = snakemake.wildcards.mapper -if "library_name" in snakemake.wildcards.keys(): - libname = snakemake.wildcards.library_name - tool_and_mapper = mapper + ".cnvkit.plot" - make_per_chr_plots = "true" - opt_filetype = "pdf" -elif "cancer_library" in snakemake.wildcards.keys(): - libname = snakemake.wildcards.cancer_library - tool_and_mapper = mapper + ".control_freec" - make_per_chr_plots = "false" - opt_filetype = "png" -else: - raise Exception("Unsupported naming") +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step]["cnvkit"] + +gender = " --gender {}".format(config["gender"]) if config["gender"] else "" +male = " --male-reference" if config["male_reference"] else "" + +heatmaps = [ + snakemake.output.get(x) + for x in filter( + lambda x: x.startswith("heatmap_chr") and not x.endswith("_md5"), snakemake.output.keys() + ) +] +scatters = [ + snakemake.output.get(x) + for x in filter( + lambda x: x.startswith("scatter_chr") and not x.endswith("_md5"), snakemake.output.keys() + ) +] shell( r""" @@ -40,42 +45,94 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} -# ----------------------------------------------------------------------------- - -unset DISPLAY +set -x -d="work/{tool_and_mapper}.{libname}/out" +# ----------------------------------------------------------------------------- -fn="{tool_and_mapper}.{libname}.diagram.pdf" -cnvkit.py diagram -s {snakemake.input.cns} {snakemake.input.cnr} -o $d/$fn -pushd $d ; md5sum $fn > $fn.md5 ; popd +md5() +{{ + d=$(dirname $1) + f=$(basename $1) + pushd $d + md5sum $f > $f.md5 + popd +}} -fn="{tool_and_mapper}.{libname}.scatter.{opt_filetype}" -cnvkit.py scatter -s {snakemake.input.cns} {snakemake.input.cnr} -o $d/$fn -pushd $d ; md5sum $fn > $fn.md5 ; popd +# ----------------------------------------------------------------------------- -fn="{tool_and_mapper}.{libname}.heatmap.{opt_filetype}" -cnvkit.py heatmap {snakemake.input.cnr} -o $d/$fn -pushd $d ; md5sum $fn > $fn.md5 ; popd +unset DISPLAY -if [ "{make_per_chr_plots}" = true ] ; then - chroms=$(tail -n +2 "{snakemake.input.cnr}" | cut -f 1 | sort | uniq | grep -E "^(chr)?([0-9]+|X|Y)") +if [[ -n "{snakemake.output.diagram}" ]] +then + cnvkit.py diagram \ + --output {snakemake.output.diagram} \ + --segment {snakemake.input.cns} \ + {gender} {male} \ + --threshold {config[diagram_threshold]} --min-probes {config[diagram_min_probes]} \ + $(if [[ "{config[shift_xy]}" = "False" ]]; then \ + echo --no-shift-xy + fi) \ + {snakemake.input.cnr} +else + touch {snakemake.output.diagram} +fi +md5 {snakemake.output.diagram} - for chr in $chroms ; do - c=$(echo "$chr" | sed -e "s/^chr//") - fn="{tool_and_mapper}.{libname}.scatter.chr${{c}}.{opt_filetype}" - cnvkit.py scatter -s {snakemake.input.cns} --chromosome $chr {snakemake.input.cnr} -o $d/$fn - pushd $d ; md5sum $fn > $fn.md5 ; popd - done +if [[ -n "{snakemake.output.scatter}" ]] +then + cnvkit.py scatter \ + --output {snakemake.output.scatter} \ + --segment {snakemake.input.cns} \ + {gender} {male} \ + {snakemake.input.cnr} +else + touch {snakemake.output.scatter} +fi +md5 {snakemake.output.scatter} - for chr in $chroms ; do - c=$(echo "$chr" | sed -e "s/^chr//") - fn="{tool_and_mapper}.{libname}.heatmap.chr${{c}}.{opt_filetype}" - cnvkit.py heatmap --chromosome $chr {snakemake.input.cnr} -o $d/$fn - pushd $d ; md5sum $fn > $fn.md5 ; popd - done +if [[ -n "{snakemake.output.heatmap}" ]] +then + cnvkit.py heatmap \ + --output {snakemake.output.heatmap} \ + {snakemake.input.cnr} else - echo "Not creating per-chromosome heatmap or scatter plots." + touch {snakemake.output.heatmap} fi +md5 {snakemake.output.heatmap} + +for heatmap in {heatmaps} +do + if [[ -n "$heatmap" ]] + then + cnvkit.py heatmap \ + --output $heatmap \ + {snakemake.input.cnr} + else + touch $heatmap + fi + md5 $heatmap +done + +for scatter in {scatters} +do + if [[ -n "$scatter" ]] + then + cnvkit.py scatter \ + --output $scatter \ + --segment {snakemake.input.cns} \ + {gender} {male} \ + {snakemake.input.cnr} + else + touch $scatter + fi + md5 $scatter +done +""" +) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} """ ) diff --git a/snappy_wrappers/wrappers/cnvkit/report/wrapper.py b/snappy_wrappers/wrappers/cnvkit/report/wrapper.py index 29d619e50..010b588b7 100644 --- a/snappy_wrappers/wrappers/cnvkit/report/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/report/wrapper.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Wrapper vor cnvkit.py report +"""Wrapper for cnvkit.py report """ from snakemake.shell import shell @@ -7,6 +7,38 @@ __author__ = "Manuel Holtgrewe" __email__ = "manuel.holtgrewe@bih-charite.de" +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step]["cnvkit"] + +for param in ( + "breaks_min_probes", + "genemetrics_min_probes", + "genemetrics_threshold", + "drop_low_coverage", + "genemetrics_alpha", + "genemetrics_bootstrap", + "segmetrics_alpha", + "segmetrics_bootstrap", + "smooth_bootstrap", +): + if not param in config.keys(): + config[param] = "" + +gender = " --gender {}".format(config["gender"]) if config["gender"] else "" +male = " --male-reference" if config["male_reference"] else "" + +input_target = snakemake.input.get("target", "") +input_antitarget = snakemake.input.get("antitarget", "") +input_cnr = snakemake.input.get("cnr", "") +input_cns = snakemake.input.get("cns", "") + +output_breaks = snakemake.output.get("breaks", "") +output_genemetrics = snakemake.output.get("genemetrics", "") + +output_segmetrics = snakemake.output.get("segmetrics", "") +output_sex = snakemake.output.get("sex", "") +output_metrics = snakemake.output.get("metrics", "") + shell( r""" # Also pipe everything to log file @@ -26,47 +58,137 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +set -x + # ----------------------------------------------------------------------------- -cnvkit.py breaks {snakemake.input.cnr} {snakemake.input.cns} \ -> {snakemake.output.breaks} - -cnvkit.py gainloss {snakemake.input.cnr} -s {snakemake.input.cns} \ -> {snakemake.output.gainloss} - -cnvkit.py gender {snakemake.input.cnr} \ -> {snakemake.output.gender} - -cnvkit.py metrics {snakemake.input.cnr} -s {snakemake.input.cns} \ -> {snakemake.output.metrics} - -cnvkit.py segmetrics {snakemake.input.cnr} \ - --mean \ - --median \ - --mode \ - --stdev \ - --sem \ - --mad \ - --mse \ - --iqr \ - --bivar \ - --ci \ - --pi \ - -s {snakemake.input.cns} \ - -o {snakemake.output.segmetrics} - -d=$(dirname "{snakemake.output.breaks}") -pushd $d -fn=$(basename "{snakemake.output.breaks}") -md5sum $fn > $fn.md5 -fn=$(basename "{snakemake.output.gainloss}") -md5sum $fn > $fn.md5 -fn=$(basename "{snakemake.output.gender}") -md5sum $fn > $fn.md5 -fn=$(basename "{snakemake.output.metrics}") -md5sum $fn > $fn.md5 -fn=$(basename "{snakemake.output.segmetrics}") -md5sum $fn > $fn.md5 -popd +md5() +{{ + d=$(dirname $1) + f=$(basename $1) + pushd $d + md5sum $f > $f.md5 + popd +}} + +# ----------------------------------------------------------------------------- + +if [[ -n "{output_breaks}" ]] +then + if [[ -n "{input_cnr}" ]] && [[ -n "{input_cns}" ]] + then + cnvkit.py breaks \ + --output {output_breaks} \ + --min-probes {config[breaks_min_probes]} \ + {input_cnr} {input_cns} + else + touch {output_breaks} + fi + md5 {output_breaks} +fi + +if [[ -n "{output_genemetrics}" ]] +then + if [[ -n "{input_cnr}" ]] && [[ -n "{input_cns}" ]] + then + cnvkit.py genemetrics \ + --output {output_genemetrics} \ + --segment {input_cns} \ + --min-probes {config[genemetrics_min_probes]} --threshold {config[genemetrics_threshold]} \ + $(if [[ "{config[drop_low_coverage]}" = "True" ]]; then \ + echo --drop-low-coverage + fi) \ + {gender} {male} \ + --mean --median --mode --ttest --stdev --sem --mad --mse --iqr --bivar --ci --pi \ + --alpha {config[genemetrics_alpha]} --bootstrap {config[genemetrics_bootstrap]} \ + {input_cnr} + else + touch {output_genemetrics} + fi + md5 {output_genemetrics} +fi + +if [[ -n "{output_segmetrics}" ]] +then + if [[ -n "{input_cnr}" ]] && [[ -n "{input_cns}" ]] + then + cnvkit.py segmetrics \ + --output {output_segmetrics} \ + --segment {input_cns} \ + --mean --median --mode --t-test --stdev --sem --mad --mse --iqr --bivar --ci --pi \ + --alpha {config[segmetrics_alpha]} --bootstrap {config[segmetrics_bootstrap]} \ + $(if [[ "{config[smooth_bootstrap]}" = "True" ]]; then \ + echo --smooth-bootstrap + fi) \ + $(if [[ "{config[drop_low_coverage]}" = "True" ]]; then \ + echo --drop-low-coverage + fi) \ + {input_cnr} + else + touch {output_segmetrics} + fi + md5 {output_segmetrics} +fi + +if [[ -n "{output_sex}" ]] +then + if [[ -n "{input_target}" || -n "{input_antitarget}" || -n "{input_cnr}" || -n "{input_cns}" ]] + then + cnvkit.py sex \ + --output {output_sex} \ + $(if [[ "{male}" = "True" ]]; then \ + echo --male-reference + fi) \ + $(if [[ -n "{input_target}" ]]; then \ + echo {input_target} + fi) \ + $(if [[ -n "{input_antitarget}" ]]; then \ + echo {input_antitarget} + fi) \ + $(if [[ -n "{input_cnr}" ]]; then \ + echo {input_cnr} + fi) \ + $(if [[ -n "{input_cns}" ]]; then \ + echo {input_cns} + fi) + else + touch {output_sex} + fi + md5 {output_sex} +fi + +if [[ -n "{output_metrics}" ]] +then + if [[ -n "{input_target}" || -n "{input_antitarget}" || -n "{input_cnr}" || -n "{input_cns}" ]] + then + cnvkit.py metrics \ + --output {output_metrics} \ + $(if [[ "config[drop_low_coverage]" = "True" ]]; then \ + echo --drop-low-coverage + fi) \ + $(if [[ -n "{input_target}" ]]; then \ + echo {input_target} + fi) \ + $(if [[ -n "{input_antitarget}" ]]; then \ + echo {input_antitarget} + fi) \ + $(if [[ -n "{input_cnr}" ]]; then \ + echo {input_cnr} + fi) \ + $(if [[ -n "{input_cns}" ]]; then \ + echo --segments {input_cns} + fi) + else + touch {output_metrics} + fi + md5 {output_metrics} +fi +""" +) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} """ ) diff --git a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py index 99b1acdce..201ebb2dd 100644 --- a/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/segment/wrapper.py @@ -10,6 +10,15 @@ step = snakemake.config["pipeline_step"]["name"] config = snakemake.config["step_config"][step]["cnvkit"] +method = config["segmentation_method"] +if method == "cbs" and config["smooth_cbs"]: + method += " --smooth-cbs" + +if float(config["segmentation_threshold"]) > 0: + threshold = " --threshold " + str(config["segmentation_threshold"]) +else: + threshold = "" + shell( r""" # Also pipe everything to log file @@ -29,16 +38,31 @@ md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +set -x + # ----------------------------------------------------------------------------- cnvkit.py segment \ - -m {config[seg_method]} \ - {snakemake.input} -o {snakemake.output} + --output {snakemake.output.segments} \ + --method {method} \ + $(if [[ "{config[drop_low_coverage]}" = "True" ]]; then \ + echo --drop-low-coverage + fi) \ + {threshold} \ + --drop-outliers {config[drop_outliers]} \ + {snakemake.input} -d=$(dirname "{snakemake.output}") +d=$(dirname "{snakemake.output.segments}") pushd $d -fn=$(basename "{snakemake.output}") +fn=$(basename "{snakemake.output.segments}") md5sum $fn > $fn.md5 popd """ ) + +# Compute MD5 sums of logs. +shell( + r""" +md5sum {snakemake.log.log} >{snakemake.log.log_md5} +""" +) diff --git a/snappy_wrappers/wrappers/cnvkit/sex/environment.yaml b/snappy_wrappers/wrappers/cnvkit/sex/environment.yaml deleted file mode 120000 index 2e107ac86..000000000 --- a/snappy_wrappers/wrappers/cnvkit/sex/environment.yaml +++ /dev/null @@ -1 +0,0 @@ -../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py b/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py deleted file mode 100644 index 7171083ad..000000000 --- a/snappy_wrappers/wrappers/cnvkit/sex/wrapper.py +++ /dev/null @@ -1,56 +0,0 @@ -# -*- coding: utf-8 -*- -"""Wrapper for cnvkit.py sex -""" - -from snakemake.shell import shell - -__author__ = "Manuel Holtgrewe" -__email__ = "manuel.holtgrewe@bih-charite.de" - -step = snakemake.config["pipeline_step"]["name"] -config = snakemake.config["step_config"][step]["cnvkit"] - -shell( - r""" -# Also pipe everything to log file -if [[ -n "{snakemake.log.log}" ]]; then - if [[ "$(set +e; tty; set -e)" != "" ]]; then - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - exec &> >(tee -a "{snakemake.log.log}" >&2) - else - rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) - echo "No tty, logging disabled" >"{snakemake.log.log}" - fi -fi - -# Write out information about conda installation. -conda list >{snakemake.log.conda_list} -conda info >{snakemake.log.conda_info} -md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} -md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} - -set -x - -# ----------------------------------------------------------------------------- - -cnvkit.py sex \ - --output {snakemake.output.table} \ - $(if [[ "{config[male_reference]}" = "True" ]]; then \ - echo --male-reference - fi) \ - {snakemake.input.target} {snakemake.input.antitarget} - -fn=$(basename "{snakemake.output.table}") -d=$(dirname "{snakemake.output.table}") -pushd $d -md5sum $fn > $fn.md5 -popd -""" -) - -# Compute MD5 sums of logs. -shell( - r""" -md5sum {snakemake.log.log} >{snakemake.log.log_md5} -""" -) diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py index 4fae436e5..dd66c9deb 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py @@ -46,6 +46,10 @@ def minimal_config(): - cnvetti_on_target - cnvkit - copywriter + cnvkit: + path_target: /path/to/panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed + path_antitarget: /path/to/panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed + path_panel_of_normals: /path/to/panel_of_normals/output/bwa.cnvkit.create_panel/out/bwa.cnvkit.panel_of_normals.cnn data_sets: first_batch: @@ -270,119 +274,6 @@ def test_cnvetti_on_target_step_part_get_resource_usage(somatic_targeted_seq_cnv assert actual == expected, msg_error -# Tests for CnvKitStepPart (access) --------------------------------------------------------------- - - -def test_cnvkit_access_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "access") - assert actual is None - - -def test_cnvkit_access_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - expected = "work/cnvkit.access/out/access.bed" - assert ( - somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "access") == expected - ) - - -def test_cnvkit_access_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - expected = get_expected_log_files_dict(base_out="work/cnvkit.access/log/cnvkit.access") - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "access") - assert actual == expected - - -def test_cnvkit_access_step_part_get_resource_usage(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'access'""" - # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "access", resource - ) - assert actual == expected, msg_error - - -# Tests for CnvKitStepPart (target) --------------------------------------------------------------- - - -def test_cnvkit_target_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "target")( - wildcards - ) - assert actual == {"access": "work/cnvkit.access/out/access.bed"} - - -def test_cnvkit_target_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - expected = "work/cnvkit.target/out/target.bed" - assert ( - somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "target") == expected - ) - - -def test_cnvkit_target_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - expected = get_expected_log_files_dict(base_out="work/cnvkit.target/log/cnvkit.target") - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "target") - assert actual == expected - - -def test_cnvkit_target_step_part_get_resource_usage(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'target'""" - # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "target", resource - ) - assert actual == expected, msg_error - - -# Tests for CnvKitStepPart (antitarget) ----------------------------------------------------------- - - -def test_cnvkit_antitarget_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - expected = { - "access": "work/cnvkit.access/out/access.bed", - "target": "work/cnvkit.target/out/target.bed", - } - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "antitarget")( - wildcards - ) - assert actual == expected - - -def test_cnvkit_antitarget_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - expected = "work/cnvkit.antitarget/out/antitarget.bed" - assert ( - somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "antitarget") - == expected - ) - - -def test_cnvkit_antitarget_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - expected = get_expected_log_files_dict(base_out="work/cnvkit.antitarget/log/cnvkit.antitarget") - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "antitarget") - assert actual == expected - - -def test_cnvkit_antitarget_step_part_get_resource_usage(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'antitarget'""" - # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "antitarget", resource - ) - assert actual == expected, msg_error - - # Tests for CnvKitStepPart (coverage) ------------------------------------------------------------- @@ -391,10 +282,8 @@ def test_cnvkit_coverage_step_part_get_input_files(somatic_targeted_seq_cnv_call fromdict={"mapper": "bwa", "target": "target", "library_name": "P001-T1-DNA1-WGS1"} ) expected = { - "antitarget": "work/cnvkit.antitarget/out/antitarget.bed", "bai": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam.bai", "bam": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam", - "target": "work/cnvkit.target/out/target.bed", } actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "coverage")( wildcards @@ -404,12 +293,12 @@ def test_cnvkit_coverage_step_part_get_input_files(somatic_targeted_seq_cnv_call def test_cnvkit_coverage_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): # Define expected - base_name_out = ( - "work/{mapper}.cnvkit.coverage.{library_name}/out/{mapper}.cnvkit.coverage.{library_name}" - ) + base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}" expected = { "target": base_name_out + ".targetcoverage.cnn", + "target_md5": base_name_out + ".targetcoverage.cnn.md5", "antitarget": base_name_out + ".antitargetcoverage.cnn", + "antitarget_md5": base_name_out + ".antitargetcoverage.cnn.md5", } # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "coverage") @@ -419,7 +308,7 @@ def test_cnvkit_coverage_step_part_get_output_files(somatic_targeted_seq_cnv_cal def test_cnvkit_coverage_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): base_file_name = ( - "work/{mapper}.cnvkit.coverage.{library_name}/log/{mapper}.cnvkit.coverage.{library_name}" + "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.coverage.{library_name}" ) expected = get_expected_log_files_dict(base_out=base_file_name) actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "coverage") @@ -429,7 +318,7 @@ def test_cnvkit_coverage_step_part_get_log_file(somatic_targeted_seq_cnv_calling def test_cnvkit_coverage_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): """Tests CnvKitStepPart.get_resource_usage() - action 'coverage'""" # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} + expected_dict = {"threads": 8, "time": "08:00:00", "memory": "16384M", "partition": "medium"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." @@ -439,70 +328,14 @@ def test_cnvkit_coverage_step_part_get_resource(somatic_targeted_seq_cnv_calling assert actual == expected, msg_error -# Tests for CnvKitStepPart (reference) ------------------------------------------------------------ - - -def test_cnvkit_reference_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): - wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "reference")( - wildcards - ) - expected = { - "antitarget": "work/cnvkit.antitarget/out/antitarget.bed", - "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", - "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", - "target": "work/cnvkit.target/out/target.bed", - } - assert actual == expected - - -def test_cnvkit_reference_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - expected = ( - "work/{mapper}.cnvkit.reference.{library_name}/out/" - "{mapper}.cnvkit.reference.{library_name}.cnn" - ) - actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "reference") - assert actual == expected - - -def test_cnvkit_reference_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): - # Define expected - base_name_out = ( - "work/{mapper}.cnvkit.reference.{library_name}/log/{mapper}.cnvkit.reference.{library_name}" - ) - expected = get_expected_log_files_dict(base_out=base_name_out) - # Get actual - actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "reference") - assert actual == expected - - -def test_cnvkit_reference_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): - """Tests CnvKitStepPart.get_resource_usage() - action 'reference'""" - # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} - # Evaluate - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( - "cnvkit", "reference", resource - ) - assert actual == expected, msg_error - - # Tests for CnvKitStepPart (fix) ------------------------------------------------------------------ def test_cnvkit_fix_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): # Define expected - coverage_base_out = ( - "work/bwa.cnvkit.coverage.P001-T1-DNA1-WGS1/out/bwa.cnvkit.coverage.P001-T1-DNA1-WGS1" - ) - reference_base_out = ( - "work/bwa.cnvkit.reference.P001-T1-DNA1-WGS1/out/bwa.cnvkit.reference.P001-T1-DNA1-WGS1" - ) + coverage_base_out = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1" expected = { "antitarget": coverage_base_out + ".antitargetcoverage.cnn", - "ref": reference_base_out + ".cnn", "target": coverage_base_out + ".targetcoverage.cnn", } # Get actual @@ -512,13 +345,14 @@ def test_cnvkit_fix_step_part_get_input_files(somatic_targeted_seq_cnv_calling_w def test_cnvkit_fix_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - expected = "work/{mapper}.cnvkit.fix.{library_name}/out/{mapper}.cnvkit.fix.{library_name}.cnr" + base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cnr" + expected = {"ratios": base_name_out, "ratios_md5": base_name_out + ".md5"} assert somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "fix") == expected def test_cnvkit_fix_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): # Define expected - base_name_out = "work/{mapper}.cnvkit.fix.{library_name}/log/{mapper}.cnvkit.fix.{library_name}" + base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.fix.{library_name}" expected = get_expected_log_files_dict(base_out=base_name_out) # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "fix") @@ -528,7 +362,7 @@ def test_cnvkit_fix_step_part_get_log_file(somatic_targeted_seq_cnv_calling_work def test_cnvkit_fix_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): """Tests CnvKitStepPart.get_resource_usage() - action 'fix'""" # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} + expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." @@ -541,9 +375,7 @@ def test_cnvkit_fix_step_part_get_resource(somatic_targeted_seq_cnv_calling_work def test_cnvkit_segment_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - expected = { - "cnr": "work/bwa.cnvkit.fix.P001-T1-DNA1-WGS1/out/bwa.cnvkit.fix.P001-T1-DNA1-WGS1.cnr" - } + expected = {"cnr": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr"} actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "segment")( wildcards ) @@ -551,30 +383,27 @@ def test_cnvkit_segment_step_part_get_input_files(somatic_targeted_seq_cnv_calli def test_cnvkit_segment_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - expected = ( - "work/{mapper}.cnvkit.segment.{library_name}/out/{mapper}.cnvkit.segment.{library_name}.cns" - ) - assert ( - somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "segment") == expected + base_name_out = ( + "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segment.cns" ) + expected = {"segments": base_name_out, "segments_md5": base_name_out + ".md5"} + actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "segment") + assert actual == expected def test_cnvkit_segment_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): # Define expected - base_name_out = ( - "work/{mapper}.cnvkit.segment.{library_name}/log/{mapper}.cnvkit.segment.{library_name}" - ) + base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.segment.{library_name}" expected = get_expected_log_files_dict(base_out=base_name_out) # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "segment") - assert actual == expected def test_cnvkit_segment_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): """Tests CnvKitStepPart.get_resource_usage() - action 'fix'""" # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} + expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." @@ -589,9 +418,7 @@ def test_cnvkit_segment_step_part_get_resource(somatic_targeted_seq_cnv_calling_ def test_cnvkit_call_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): # Define expected - segment_file = ( - "work/bwa.cnvkit.segment.P001-T1-DNA1-WGS1/out/bwa.cnvkit.segment.P001-T1-DNA1-WGS1.cns" - ) + segment_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segment.cns" expected = {"segment": segment_file} # Get actual wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) @@ -600,17 +427,15 @@ def test_cnvkit_call_step_part_get_input_files(somatic_targeted_seq_cnv_calling_ def test_cnvkit_call_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - expected = ( - "work/{mapper}.cnvkit.call.{library_name}/out/{mapper}.cnvkit.call.{library_name}.cns" - ) - assert somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "call") == expected + base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cns" + expected = {"calls": base_name_out, "calls_md5": base_name_out + ".md5"} + actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "call") + assert actual == expected def test_cnvkit_call_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): # Define expected - base_name_out = ( - "work/{mapper}.cnvkit.call.{library_name}/log/{mapper}.cnvkit.call.{library_name}" - ) + base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.call.{library_name}" expected = get_expected_log_files_dict(base_out=base_name_out) # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "call") @@ -620,7 +445,7 @@ def test_cnvkit_call_step_part_get_log_file(somatic_targeted_seq_cnv_calling_wor def test_cnvkit_call_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} + expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." @@ -633,10 +458,8 @@ def test_cnvkit_call_step_part_get_resource(somatic_targeted_seq_cnv_calling_wor def test_cnvkit_plot_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): # Define expected - cnr_file = "work/bwa.cnvkit.fix.P001-T1-DNA1-WGS1/out/bwa.cnvkit.fix.P001-T1-DNA1-WGS1.cnr" - cns_file = ( - "work/bwa.cnvkit.segment.P001-T1-DNA1-WGS1/out/bwa.cnvkit.segment.P001-T1-DNA1-WGS1.cns" - ) + cnr_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr" + cns_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns" expected = { "cnr": cnr_file, "cns": cns_file, @@ -649,17 +472,23 @@ def test_cnvkit_plot_step_part_get_input_files(somatic_targeted_seq_cnv_calling_ def test_cnvkit_plot_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): # Define expected - base_out = "work/{mapper}.cnvkit.plot.{library_name}/out/{mapper}.cnvkit.plot.{library_name}" - expected = { - "diagram": base_out + ".diagram.pdf", - "heatmap": base_out + ".heatmap.pdf", - "scatter": base_out + ".scatter.pdf", - } - for chrom in chain(range(1, 23), ("X", "Y")): - for diagram in ("heatmap", "scatter"): - tpl = base_out + ".%s.chr%s.pdf" - expected["{}_chr{}".format(diagram, chrom)] = tpl % (diagram, chrom) - + expected = {} + tpl = ( + "work/{{mapper}}.cnvkit.{{library_name}}/report/" + "{{mapper}}.cnvkit.{{library_name}}.{plot}.{ext}" + ) + for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")): + expected[plot] = tpl.format(plot=plot, ext=ext) + expected[plot + "_md5"] = expected[plot] + ".md5" + tpl = ( + "work/{{mapper}}.cnvkit.{{library_name}}/report/" + "{{mapper}}.cnvkit.{{library_name}}.{plot}.chr{chrom}.{ext}" + ) + for plot, ext in (("heatmap", "pdf"), ("scatter", "png")): + for chrom in chain(range(1, 23), ("X", "Y")): + key = "{plot}_chr{chrom}".format(plot=plot, chrom=str(chrom)) + expected[key] = tpl.format(plot=plot, ext=ext, chrom=str(chrom)) + expected[key + "_md5"] = expected[key] + ".md5" # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "plot") assert actual == expected @@ -668,7 +497,7 @@ def test_cnvkit_plot_step_part_get_output_files(somatic_targeted_seq_cnv_calling def test_cnvkit_plot_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): # Define expected expected = get_expected_log_files_dict( - base_out="work/{mapper}.cnvkit.plot.{library_name}/log/{mapper}.cnvkit.plot.{library_name}" + base_out="work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.plot.{library_name}" ) # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "plot") @@ -678,7 +507,7 @@ def test_cnvkit_plot_step_part_get_log_file(somatic_targeted_seq_cnv_calling_wor def test_cnvkit_plot_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "30720M", "partition": "medium"} + expected_dict = {"threads": 1, "time": "08:00:00", "memory": "30720M", "partition": "medium"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." @@ -691,9 +520,7 @@ def test_cnvkit_plot_step_part_get_resource(somatic_targeted_seq_cnv_calling_wor def test_cnvkit_export_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) - expected = { - "cns": "work/bwa.cnvkit.call.P001-T1-DNA1-WGS1/out/bwa.cnvkit.call.P001-T1-DNA1-WGS1.cns" - } + expected = {"cns": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns"} actual = somatic_targeted_seq_cnv_calling_workflow.get_input_files("cnvkit", "export")( wildcards ) @@ -702,15 +529,11 @@ def test_cnvkit_export_step_part_get_input_files(somatic_targeted_seq_cnv_callin def test_cnvkit_export_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): # Define expected - base_name_out = ( - "work/{mapper}.cnvkit.export.{library_name}/out/{mapper}.cnvkit.export.{library_name}" - ) - expected = { - "bed": base_name_out + ".bed", - "seg": base_name_out + ".seg", - "vcf_tbi": base_name_out + ".vcf.gz.tbi", - "vcf": base_name_out + ".vcf.gz", - } + expected = {} + base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}" + for key, ext in (("bed", "bed"), ("seg", "seg"), ("vcf", "vcf.gz"), ("tbi", "vcf.gz.tbi")): + expected[key] = base_name_out + "." + ext + expected[key + "_md5"] = expected[key] + ".md5" # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "export") assert actual == expected @@ -718,9 +541,7 @@ def test_cnvkit_export_step_part_get_output_files(somatic_targeted_seq_cnv_calli def test_cnvkit_export_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): # Define expected - base_name_out = ( - "work/{mapper}.cnvkit.export.{library_name}/log/{mapper}.cnvkit.export.{library_name}" - ) + base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.export.{library_name}" expected = get_expected_log_files_dict(base_out=base_name_out) # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "export") @@ -730,7 +551,7 @@ def test_cnvkit_export_step_part_get_log_file(somatic_targeted_seq_cnv_calling_w def test_cnvkit_export_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} + expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." @@ -745,13 +566,19 @@ def test_cnvkit_export_step_part_get_resource(somatic_targeted_seq_cnv_calling_w def test_cnvkit_report_step_part_get_input_files(somatic_targeted_seq_cnv_calling_workflow): # Define expected - cnr_file = "work/bwa.cnvkit.fix.P001-T1-DNA1-WGS1/out/bwa.cnvkit.fix.P001-T1-DNA1-WGS1.cnr" - cns_file = ( - "work/bwa.cnvkit.segment.P001-T1-DNA1-WGS1/out/bwa.cnvkit.segment.P001-T1-DNA1-WGS1.cns" + cnr_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr" + cns_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns" + target_file = ( + "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.targetcoverage.cnn" + ) + antitarget_file = ( + "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.antitargetcoverage.cnn" ) expected = { "cnr": cnr_file, "cns": cns_file, + "target": target_file, + "antitarget": antitarget_file, } # Get actual wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) @@ -763,16 +590,11 @@ def test_cnvkit_report_step_part_get_input_files(somatic_targeted_seq_cnv_callin def test_cnvkit_report_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): # Define expected - base_name_out = ( - "work/{mapper}.cnvkit.report.{library_name}/out/{mapper}.cnvkit.report.{library_name}" - ) - expected = { - "breaks": base_name_out + ".breaks.txt", - "gainloss": base_name_out + ".gainloss.txt", - "gender": base_name_out + ".gender.txt", - "metrics": base_name_out + ".metrics.txt", - "segmetrics": base_name_out + ".segmetrics.txt", - } + expected = {} + base_name_out = "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}" + for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics"): + expected[report] = base_name_out + "." + report + ".txt" + expected[report + "_md5"] = expected[report] + ".md5" # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "report") assert actual == expected @@ -780,9 +602,7 @@ def test_cnvkit_report_step_part_get_output_files(somatic_targeted_seq_cnv_calli def test_cnvkit_report_step_part_get_log_file(somatic_targeted_seq_cnv_calling_workflow): # Define expected - base_name_out = ( - "work/{mapper}.cnvkit.report.{library_name}/log/{mapper}.cnvkit.report.{library_name}" - ) + base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.report.{library_name}" expected = get_expected_log_files_dict(base_out=base_name_out) # Get actual actual = somatic_targeted_seq_cnv_calling_workflow.get_log_file("cnvkit", "report") @@ -792,7 +612,7 @@ def test_cnvkit_report_step_part_get_log_file(somatic_targeted_seq_cnv_calling_w def test_cnvkit_report_step_part_get_resource(somatic_targeted_seq_cnv_calling_workflow): """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" # Define expected - expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} + expected_dict = {"threads": 1, "time": "03:59:59", "memory": "7680M", "partition": "medium"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." @@ -995,49 +815,43 @@ def test_somatic_targeted_seq_cnv_calling_workflow(somatic_targeted_seq_cnv_call ) ] # cnvkit - tpl = ( - "output/bwa.cnvkit.call.P00{i}-T{t}-DNA1-WGS1/out/" - "bwa.cnvkit.call.P00{i}-T{t}-DNA1-WGS1.{ext}" - ) + tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/out/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}{md5}" expected += [ - tpl.format(i=i, t=t, ext=ext) for i, t in ((1, 1), (2, 1), (2, 2)) for ext in ("cns",) - ] - tpl = ( - "output/bwa.cnvkit.export.P00{i}-T{t}-DNA1-WGS1/out/" - "bwa.cnvkit.export.P00{i}-T{t}-DNA1-WGS1.{ext}" - ) - expected += [ - tpl.format(i=i, t=t, ext=ext) + tpl.format(i=i, t=t, ext=ext, md5=md5) for i, t in ((1, 1), (2, 1), (2, 2)) - for ext in ("bed", "seg", "vcf.gz", "vcf.gz.tbi") + for ext in ("cnr", "cns", "bed", "seg", "vcf.gz", "vcf.gz.tbi") + for md5 in ("", ".md5") ] tpl = ( - "output/bwa.cnvkit.plot.P00{i}-T{t}-DNA1-WGS1/out/" - "bwa.cnvkit.plot.P00{i}-T{t}-DNA1-WGS1.{ext}" + "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" + "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{plot}.{ext}{md5}" ) expected += [ - tpl.format(i=i, t=t, ext=ext) + tpl.format(i=i, t=t, plot=plot, ext=ext, md5=md5) for i, t in ((1, 1), (2, 1), (2, 2)) - for ext in ("diagram.pdf", "heatmap.pdf", "scatter.pdf") + for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")) + for md5 in ("", ".md5") ] tpl = ( - "output/bwa.cnvkit.plot.P00{i}-T{t}-DNA1-WGS1/out/" - "bwa.cnvkit.plot.P00{i}-T{t}-DNA1-WGS1.{diagram}.chr{chrom}.pdf" + "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" + "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{plot}.chr{chrom}.{ext}{md5}" ) expected += [ - tpl.format(i=i, t=t, diagram=diagram, chrom=chrom) + tpl.format(i=i, t=t, plot=plot, ext=ext, chrom=str(chrom), md5=md5) for i, t in ((1, 1), (2, 1), (2, 2)) - for diagram in ("heatmap", "scatter") + for plot, ext in (("heatmap", "pdf"), ("scatter", "png")) for chrom in chain(range(1, 23), ("X", "Y")) + for md5 in ("", ".md5") ] tpl = ( - "output/bwa.cnvkit.report.P00{i}-T{t}-DNA1-WGS1/out/" - "bwa.cnvkit.report.P00{i}-T{t}-DNA1-WGS1.{ext}" + "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" + "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{report}.txt{md5}" ) expected += [ - tpl.format(i=i, t=t, ext=ext) + tpl.format(i=i, t=t, report=report, md5=md5) for i, t in ((1, 1), (2, 1), (2, 2)) - for ext in ("breaks.txt", "gainloss.txt", "gender.txt", "metrics.txt", "segmetrics.txt") + for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics") + for md5 in ("", ".md5") ] # copywriter tpl = ( diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py index 5e00fcf45..7bec82eaa 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py @@ -2,6 +2,7 @@ """Tests for the somatic_wgs_cnv_calling workflow module code""" +from itertools import chain import textwrap import pytest @@ -55,6 +56,11 @@ def minimal_config(): reference: /path/to/reference.fasta filter_bed: /path/to/filter.bed genome_folder: /path/to/genome/folder + cnvkit: + path_target: /path/to/panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed + path_antitarget: /path/to/panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed + path_panel_of_normals: /path/to/panel_of_normals/output/bwa.cnvkit.create_panel/out/bwa.cnvkit.panel_of_normals.cnn + data_sets: first_batch: @@ -230,53 +236,333 @@ def test_control_freec_step_part_get_resource(somatic_wgs_cnv_calling_workflow): assert actual == expected, msg_error -# Tests for CnvkitSomaticWgsStepPart --------------------------------------------------------------- +# Tests for CnvKitStepPart (coverage) ------------------------------------------------------------- -def test_cnvkit_somatic_wgs_step_part_get_input_files(somatic_wgs_cnv_calling_workflow): - """Tests CnvkitSomaticWgsStepPart.get_input_files()""" - wildcards = Wildcards(fromdict={"mapper": "bwa", "cancer_library": "P001-T1-DNA1-WGS1"}) +def test_cnvkit_coverage_step_part_get_input_files(somatic_wgs_cnv_calling_workflow): + wildcards = Wildcards( + fromdict={"mapper": "bwa", "target": "target", "library_name": "P001-T1-DNA1-WGS1"} + ) expected = { - "normal_bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", - "normal_bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", - "tumor_bai": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam.bai", - "tumor_bam": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam", + "bai": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam.bai", + "bam": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam", } - actual = somatic_wgs_cnv_calling_workflow.get_input_files("cnvkit", "run")(wildcards) + actual = somatic_wgs_cnv_calling_workflow.get_input_files("cnvkit", "coverage")(wildcards) assert actual == expected -def test_cnvkit_somatic_wgs_step_part_get_output_files(somatic_wgs_cnv_calling_workflow): - """Tests CnvkitSomaticWgsStepPart.get_output_files()""" - base_name = "work/{mapper}.cnvkit.{cancer_library}/out/" +def test_cnvkit_coverage_step_part_get_output_files(somatic_wgs_cnv_calling_workflow): + # Define expected + base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}" expected = { - "segment": base_name + "{mapper}.cnvkit.{cancer_library}.cns", - "segment_md5": base_name + "{mapper}.cnvkit.{cancer_library}.cns.md5", - "bins": base_name + "{mapper}.cnvkit.{cancer_library}.cnr", - "bins_md5": base_name + "{mapper}.cnvkit.{cancer_library}.cnr.md5", - "scatter": base_name + "{mapper}.cnvkit.{cancer_library}.scatter.png", - "scatter_md5": base_name + "{mapper}.cnvkit.{cancer_library}.scatter.png.md5", + "target": base_name_out + ".targetcoverage.cnn", + "target_md5": base_name_out + ".targetcoverage.cnn.md5", + "antitarget": base_name_out + ".antitargetcoverage.cnn", + "antitarget_md5": base_name_out + ".antitargetcoverage.cnn.md5", } - actual = somatic_wgs_cnv_calling_workflow.get_output_files("cnvkit", "run") + # Get actual + actual = somatic_wgs_cnv_calling_workflow.get_output_files("cnvkit", "coverage") + assert actual == expected -def test_cnvkit_somatic_wgs_step_part_get_log_file(somatic_wgs_cnv_calling_workflow): - """Tests CnvkitSomaticWgsStepPart.get_log_file()""" - base_name = "work/{mapper}.cnvkit.{cancer_library}/log/{mapper}.cnvkit.{cancer_library}" - expected = get_expected_log_files_dict(base_out=base_name) - actual = somatic_wgs_cnv_calling_workflow.get_log_file("cnvkit", "run") +def test_cnvkit_coverage_step_part_get_log_file(somatic_wgs_cnv_calling_workflow): + base_file_name = ( + "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.coverage.{library_name}" + ) + expected = get_expected_log_files_dict(base_out=base_file_name) + actual = somatic_wgs_cnv_calling_workflow.get_log_file("cnvkit", "coverage") + assert actual == expected + + +def test_cnvkit_coverage_step_part_get_resource(somatic_wgs_cnv_calling_workflow): + """Tests CnvKitStepPart.get_resource_usage() - action 'coverage'""" + # Define expected + expected_dict = {"threads": 8, "time": "1-00:00:00", "memory": "16384M", "partition": "medium"} + # Evaluate + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}'." + actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "coverage", resource) + assert actual == expected, msg_error + + +# Tests for CnvKitStepPart (fix) ------------------------------------------------------------------ + + +def test_cnvkit_fix_step_part_get_input_files(somatic_wgs_cnv_calling_workflow): + # Define expected + coverage_base_out = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1" + expected = { + "antitarget": coverage_base_out + ".antitargetcoverage.cnn", + "target": coverage_base_out + ".targetcoverage.cnn", + } + # Get actual + wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) + actual = somatic_wgs_cnv_calling_workflow.get_input_files("cnvkit", "fix")(wildcards) + assert actual == expected + + +def test_cnvkit_fix_step_part_get_output_files(somatic_wgs_cnv_calling_workflow): + base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cnr" + expected = {"ratios": base_name_out, "ratios_md5": base_name_out + ".md5"} + assert somatic_wgs_cnv_calling_workflow.get_output_files("cnvkit", "fix") == expected + + +def test_cnvkit_fix_step_part_get_log_file(somatic_wgs_cnv_calling_workflow): + # Define expected + base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.fix.{library_name}" + expected = get_expected_log_files_dict(base_out=base_name_out) + # Get actual + actual = somatic_wgs_cnv_calling_workflow.get_log_file("cnvkit", "fix") + assert actual == expected + + +def test_cnvkit_fix_step_part_get_resource(somatic_wgs_cnv_calling_workflow): + """Tests CnvKitStepPart.get_resource_usage() - action 'fix'""" + # Define expected + expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} + # Evaluate + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}'." + actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "fix", resource) + assert actual == expected, msg_error + + +# Tests for CnvKitStepPart (segment) -------------------------------------------------------------- + + +def test_cnvkit_segment_step_part_get_input_files(somatic_wgs_cnv_calling_workflow): + wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) + expected = {"cnr": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr"} + actual = somatic_wgs_cnv_calling_workflow.get_input_files("cnvkit", "segment")(wildcards) assert actual == expected -def test_cnvkit_step_part_get_resource(somatic_wgs_cnv_calling_workflow): - """Tests CnvkitSomaticWgsStepPart.get_resource()""" +def test_cnvkit_segment_step_part_get_output_files(somatic_wgs_cnv_calling_workflow): + base_name_out = ( + "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.segment.cns" + ) + expected = {"segments": base_name_out, "segments_md5": base_name_out + ".md5"} + assert somatic_wgs_cnv_calling_workflow.get_output_files("cnvkit", "segment") == expected + + +def test_cnvkit_segment_step_part_get_log_file(somatic_wgs_cnv_calling_workflow): # Define expected - expected_dict = {"threads": 4, "time": "1-16:00:00", "memory": "40960M", "partition": "medium"} + base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.segment.{library_name}" + expected = get_expected_log_files_dict(base_out=base_name_out) + # Get actual + actual = somatic_wgs_cnv_calling_workflow.get_log_file("cnvkit", "segment") + assert actual == expected + + +def test_cnvkit_segment_step_part_get_resource(somatic_wgs_cnv_calling_workflow): + """Tests CnvKitStepPart.get_resource_usage() - action 'fix'""" + # Define expected + expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "run", resource) + actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "segment", resource) + assert actual == expected, msg_error + + +# Tests for CnvKitStepPart (call) ----------------------------------------------------------------- + + +def test_cnvkit_call_step_part_get_input_files(somatic_wgs_cnv_calling_workflow): + # Define expected + segment_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.segment.cns" + expected = {"segment": segment_file} + # Get actual + wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) + actual = somatic_wgs_cnv_calling_workflow.get_input_files("cnvkit", "call")(wildcards) + assert actual == expected + + +def test_cnvkit_call_step_part_get_output_files(somatic_wgs_cnv_calling_workflow): + base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cns" + expected = {"calls": base_name_out, "calls_md5": base_name_out + ".md5"} + assert somatic_wgs_cnv_calling_workflow.get_output_files("cnvkit", "call") == expected + + +def test_cnvkit_call_step_part_get_log_file(somatic_wgs_cnv_calling_workflow): + # Define expected + base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.call.{library_name}" + expected = get_expected_log_files_dict(base_out=base_name_out) + # Get actual + actual = somatic_wgs_cnv_calling_workflow.get_log_file("cnvkit", "call") + assert actual == expected + + +def test_cnvkit_call_step_part_get_resource(somatic_wgs_cnv_calling_workflow): + """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" + # Define expected + expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} + # Evaluate + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}'." + actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "call", resource) + assert actual == expected, msg_error + + +# Tests for CnvKitStepPart (plot) ----------------------------------------------------------------- + + +def test_cnvkit_plot_step_part_get_input_files(somatic_wgs_cnv_calling_workflow): + # Define expected + cnr_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr" + cns_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns" + expected = { + "cnr": cnr_file, + "cns": cns_file, + } + # Get actual + wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) + actual = somatic_wgs_cnv_calling_workflow.get_input_files("cnvkit", "plot")(wildcards) + assert actual == expected + + +def test_cnvkit_plot_step_part_get_output_files(somatic_wgs_cnv_calling_workflow): + # Define expected + expected = {} + tpl = ( + "work/{{mapper}}.cnvkit.{{library_name}}/report/" + "{{mapper}}.cnvkit.{{library_name}}.{plot}.{ext}" + ) + for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")): + expected[plot] = tpl.format(plot=plot, ext=ext) + expected[plot + "_md5"] = expected[plot] + ".md5" + tpl = ( + "work/{{mapper}}.cnvkit.{{library_name}}/report/" + "{{mapper}}.cnvkit.{{library_name}}.{plot}.chr{chrom}.{ext}" + ) + for plot, ext in (("heatmap", "pdf"), ("scatter", "png")): + for chrom in chain(range(1, 23), ("X", "Y")): + key = "{plot}_chr{chrom}".format(plot=plot, chrom=str(chrom)) + expected[key] = tpl.format(plot=plot, ext=ext, chrom=str(chrom)) + expected[key + "_md5"] = expected[key] + ".md5" + # Get actual + actual = somatic_wgs_cnv_calling_workflow.get_output_files("cnvkit", "plot") + assert actual == expected + + +def test_cnvkit_plot_step_part_get_log_file(somatic_wgs_cnv_calling_workflow): + # Define expected + expected = get_expected_log_files_dict( + base_out="work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.plot.{library_name}" + ) + # Get actual + actual = somatic_wgs_cnv_calling_workflow.get_log_file("cnvkit", "plot") + assert actual == expected + + +def test_cnvkit_plot_step_part_get_resource(somatic_wgs_cnv_calling_workflow): + """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" + # Define expected + expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "30720M", "partition": "medium"} + # Evaluate + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}'." + actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "plot", resource) + assert actual == expected, msg_error + + +# Tests for CnvKitStepPart (export) --------------------------------------------------------------- + + +def test_cnvkit_export_step_part_get_input_files(somatic_wgs_cnv_calling_workflow): + wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) + expected = {"cns": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns"} + actual = somatic_wgs_cnv_calling_workflow.get_input_files("cnvkit", "export")(wildcards) + assert actual == expected + + +def test_cnvkit_export_step_part_get_output_files(somatic_wgs_cnv_calling_workflow): + # Define expected + expected = {} + base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}" + for key, ext in (("bed", "bed"), ("seg", "seg"), ("vcf", "vcf.gz"), ("tbi", "vcf.gz.tbi")): + expected[key] = base_name_out + "." + ext + expected[key + "_md5"] = expected[key] + ".md5" + # Get actual + actual = somatic_wgs_cnv_calling_workflow.get_output_files("cnvkit", "export") + assert actual == expected + + +def test_cnvkit_export_step_part_get_log_file(somatic_wgs_cnv_calling_workflow): + # Define expected + base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.export.{library_name}" + expected = get_expected_log_files_dict(base_out=base_name_out) + # Get actual + actual = somatic_wgs_cnv_calling_workflow.get_log_file("cnvkit", "export") + assert actual == expected + + +def test_cnvkit_export_step_part_get_resource(somatic_wgs_cnv_calling_workflow): + """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" + # Define expected + expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} + # Evaluate + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}'." + actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "export", resource) + assert actual == expected, msg_error + + +# Tests for CnvKitStepPart (report) --------------------------------------------------------------- + + +def test_cnvkit_report_step_part_get_input_files(somatic_wgs_cnv_calling_workflow): + # Define expected + cnr_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cnr" + cns_file = "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.cns" + target_file = ( + "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.targetcoverage.cnn" + ) + antitarget_file = ( + "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.antitargetcoverage.cnn" + ) + expected = { + "cnr": cnr_file, + "cns": cns_file, + "target": target_file, + "antitarget": antitarget_file, + } + # Get actual + wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) + actual = somatic_wgs_cnv_calling_workflow.get_input_files("cnvkit", "report")(wildcards) + assert actual == expected + + +def test_cnvkit_report_step_part_get_output_files(somatic_wgs_cnv_calling_workflow): + # Define expected + expected = {} + base_name_out = "work/{mapper}.cnvkit.{library_name}/report/{mapper}.cnvkit.{library_name}" + for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics"): + expected[report] = base_name_out + "." + report + ".txt" + expected[report + "_md5"] = expected[report] + ".md5" + # Get actual + actual = somatic_wgs_cnv_calling_workflow.get_output_files("cnvkit", "report") + assert actual == expected + + +def test_cnvkit_report_step_part_get_log_file(somatic_wgs_cnv_calling_workflow): + # Define expected + base_name_out = "work/{mapper}.cnvkit.{library_name}/log/{mapper}.cnvkit.report.{library_name}" + expected = get_expected_log_files_dict(base_out=base_name_out) + # Get actual + actual = somatic_wgs_cnv_calling_workflow.get_log_file("cnvkit", "report") + assert actual == expected + + +def test_cnvkit_report_step_part_get_resource(somatic_wgs_cnv_calling_workflow): + """Tests CnvKitStepPart.get_resource_usage() - action 'call'""" + # Define expected + expected_dict = {"threads": 1, "time": "1-00:00:00", "memory": "7680M", "partition": "medium"} + # Evaluate + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}'." + actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "report", resource) assert actual == expected, msg_error @@ -471,12 +757,43 @@ def test_somatic_cnv_calling_workflow(somatic_wgs_cnv_calling_workflow): for cnv_caller in ("control_freec",) ] # -- add files from cnvkit + tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/out/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}{md5}" expected += [ - tpl.format(mapper=mapper, cnv_caller=cnv_caller, i=i, t=t, ext=ext) + tpl.format(i=i, t=t, ext=ext, md5=md5) for i, t in ((1, 1), (2, 1), (2, 2)) - for ext in ("cnr", "cnr.md5", "cns", "cns.md5", "scatter.png", "scatter.png.md5") - for mapper in ("bwa",) - for cnv_caller in ("cnvkit",) + for ext in ("cnr", "cns", "bed", "seg", "vcf.gz", "vcf.gz.tbi") + for md5 in ("", ".md5") + ] + tpl = ( + "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" + "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{plot}.{ext}{md5}" + ) + expected += [ + tpl.format(i=i, t=t, plot=plot, ext=ext, md5=md5) + for i, t in ((1, 1), (2, 1), (2, 2)) + for plot, ext in (("diagram", "pdf"), ("heatmap", "pdf"), ("scatter", "png")) + for md5 in ("", ".md5") + ] + tpl = ( + "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" + "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{plot}.chr{chrom}.{ext}{md5}" + ) + expected += [ + tpl.format(i=i, t=t, plot=plot, ext=ext, chrom=str(chrom), md5=md5) + for i, t in ((1, 1), (2, 1), (2, 2)) + for plot, ext in (("heatmap", "pdf"), ("scatter", "png")) + for chrom in chain(range(1, 23), ("X", "Y")) + for md5 in ("", ".md5") + ] + tpl = ( + "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/" + "bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{report}.txt{md5}" + ) + expected += [ + tpl.format(i=i, t=t, report=report, md5=md5) + for i, t in ((1, 1), (2, 1), (2, 2)) + for report in ("breaks", "genemetrics", "segmetrics", "sex", "metrics") + for md5 in ("", ".md5") ] # Perform the comparison expected = list(sorted(expected)) From f7c14f9d4949c0a2b0fd7aaaabc5b31b400b3981 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Wed, 11 Jan 2023 23:41:35 +0100 Subject: [PATCH 09/10] Make snakefmt happy --- snappy_pipeline/workflows/panel_of_normals/Snakefile | 2 ++ snappy_pipeline/workflows/somatic_wgs_cnv_calling/Snakefile | 1 + 2 files changed, 3 insertions(+) diff --git a/snappy_pipeline/workflows/panel_of_normals/Snakefile b/snappy_pipeline/workflows/panel_of_normals/Snakefile index d3c1c5e52..427ed25d0 100644 --- a/snappy_pipeline/workflows/panel_of_normals/Snakefile +++ b/snappy_pipeline/workflows/panel_of_normals/Snakefile @@ -103,6 +103,7 @@ rule panel_of_normals_mutect2_create_panel: # Write out access file (if required, must be run prior to the cnvkit panel of normals) + rule panel_of_normals_access_run: output: **wf.get_output_files("access", "run"), @@ -118,6 +119,7 @@ rule panel_of_normals_access_run: # Write out the normals-only results for the normals -------------------------- + rule panel_of_normals_cnvkit_target: input: unpack(wf.get_input_files("cnvkit", "target")), diff --git a/snappy_pipeline/workflows/somatic_wgs_cnv_calling/Snakefile b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/Snakefile index d24816811..ea996d6ee 100644 --- a/snappy_pipeline/workflows/somatic_wgs_cnv_calling/Snakefile +++ b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/Snakefile @@ -193,4 +193,5 @@ rule somatic_wgs_cnv_calling_control_freec_plot: # Run cnvkit ------------------------------------------------ + include: "cnvkit.rules" From cae427d2035d8a301bdbba7609fb0ef9b6ece885 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Fri, 20 Jan 2023 16:26:19 +0100 Subject: [PATCH 10/10] fix: tbi to vcf_tbi in tests to conform to new convention --- .../workflows/test_workflows_panel_of_normals.py | 12 +++++++----- .../workflows/test_workflows_panel_of_normals_wgs.py | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py index 38b8645b1..76835beb4 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py +++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py @@ -127,8 +127,8 @@ def test_mutect2_step_part_get_output_files_prepare_panel(panel_of_normals_workf expected = { "vcf": "work/{mapper}.mutect2/out/{mapper}.mutect2.{normal_library}.prepare.vcf.gz", "vcf_md5": "work/{mapper}.mutect2/out/{mapper}.mutect2.{normal_library}.prepare.vcf.gz.md5", - "tbi": "work/{mapper}.mutect2/out/{mapper}.mutect2.{normal_library}.prepare.vcf.gz.tbi", - "tbi_md5": "work/{mapper}.mutect2/out/{mapper}.mutect2.{normal_library}.prepare.vcf.gz.tbi.md5", + "vcf_tbi": "work/{mapper}.mutect2/out/{mapper}.mutect2.{normal_library}.prepare.vcf.gz.tbi", + "vcf_tbi_md5": "work/{mapper}.mutect2/out/{mapper}.mutect2.{normal_library}.prepare.vcf.gz.tbi.md5", } actual = panel_of_normals_workflow.get_output_files("mutect2", "prepare_panel") assert actual == expected @@ -495,7 +495,7 @@ def test_panel_of_normals_workflow(panel_of_normals_workflow): # add log files tpl = "output/{mapper}.mutect2/log/{mapper}.mutect2.panel_of_normals" for mapper in ("bwa",): - expected += get_expected_log_files_dict(tpl.format(mapper=mapper)).values() + expected += get_expected_log_files_dict(base_out=tpl.format(mapper=mapper)).values() # Now for basic cnvkit files (panel of normal only) tpl = "output/{mapper}.cnvkit/out/{mapper}.cnvkit.panel_of_normals.{ext}" @@ -521,13 +521,15 @@ def test_panel_of_normals_workflow(panel_of_normals_workflow): for substep in ("target", "antitarget", "panel_of_normals", "report"): for mapper in ("bwa",): expected += get_expected_log_files_dict( - tpl.format(mapper=mapper, substep=substep) + base_out=tpl.format(mapper=mapper, substep=substep) ).values() # Access tpl = "output/cnvkit.access/out/cnvkit.access.{ext}" expected += [tpl.format(ext=ext) for ext in ("bed", "bed.md5")] - expected += get_expected_log_files_dict("output/cnvkit.access/log/cnvkit.access").values() + expected += get_expected_log_files_dict( + base_out="output/cnvkit.access/log/cnvkit.access" + ).values() expected = list(sorted(expected)) actual = list(sorted(panel_of_normals_workflow.get_result_files())) diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py index d54974619..2d785cb67 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py +++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py @@ -372,7 +372,7 @@ def test_panel_of_normals_workflow(panel_of_normals_workflow): for substep in ("target", "antitarget", "panel_of_normals", "report"): for mapper in ("bwa",): expected += get_expected_log_files_dict( - tpl.format(mapper=mapper, substep=substep) + base_out=tpl.format(mapper=mapper, substep=substep) ).values() expected = list(sorted(expected))