Skip to content

Commit

Permalink
feat: make runnable with Slurm (#38) (#39)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Jun 7, 2023
1 parent 9ce1cab commit 2f6deac
Show file tree
Hide file tree
Showing 11 changed files with 78 additions and 14 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ Run them all with `all`:

Note that this will take a long time, use a lot of disk space, and download a lot of data.

To run on a Slurm cluster, you can use the Snakemake `--slurm` option.
See `run-slurm.sh` for an example.

## Development Setup

### Prerequisites: Install `mamba` for Conda Package Management
Expand Down
4 changes: 2 additions & 2 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ rule all:
f"work/download/annos/grch37/seqvars/dbnsfp/{DV.dbnsfp}c/LICENSE.txt",
f"work/download/annos/grch37/seqvars/dbscsnv/{DV.dbscsnv}/dbscSNV{DV.dbscsnv}.chr1",
f"work/download/annos/grch37/seqvars/dbsnp/{DV.dbsnp}/dbsnp.vcf.gz",
"work/annos/grch37/seqvars/helixmtdb/20200327/helixmtdb.vcf.gz"
"work/annos/grch37/seqvars/helixmtdb/20200327/helixmtdb.vcf.gz",
f"work/annos/grch37/seqvars/gnomad_mtdna/{DV.gnomad_mtdna}/gnomad_mtdna.vcf.gz",
f"work/annos/grch37/seqvars/gnomad_exomes/{DV.gnomad_v2}/.done",
f"work/annos/grch37/seqvars/gnomad_genomes/{DV.gnomad_v2}/.done",
Expand All @@ -86,7 +86,7 @@ rule all:
# NB: dbNSFP is dual reference (for download)
# NB: dbscSNV is dual reference (for download)
f"work/download/annos/grch37/seqvars/dbsnp/{DV.dbsnp}/dbsnp.vcf.gz",
"work/annos/grch38/seqvars/helixmtdb/20200327/helixmtdb.vcf.gz"
"work/annos/grch38/seqvars/helixmtdb/20200327/helixmtdb.vcf.gz",
f"work/annos/grch38/seqvars/gnomad_mtdna/{DV.gnomad_mtdna}/gnomad_mtdna.vcf.gz",
f"work/annos/grch38/seqvars/gnomad_exomes/{DV.gnomad_v2}/.done",
f"work/annos/grch38/seqvars/gnomad_genomes/{DV.gnomad_v3}/.done",
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ dependencies:
# JSON transformation tool used in many rules.
- jq
# Tools for file downloads.
- aria2
- aria2 >=1.36.0
- wget
# Tool for processing BED files.
- bedops =2
Expand Down
1 change: 1 addition & 0 deletions rules/annos/seqvars/dbnsfp.smk
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ rule annos_seqvars_dbnsfp_download: # -- download dbNSFP ZIP file
zip="work/download/annos/grch37/seqvars/dbnsfp/{version}{variant}/dbNSFP{version}{variant}.zip",
wildcard_constraints:
version=r"\d\.\d",
threads: 8
shell:
r"""
if [[ "{wildcards.variant}" == a ]]; then
Expand Down
25 changes: 18 additions & 7 deletions rules/annos/seqvars/gnomad_nuclear.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@


# URL prefix of gnomAD downloads.
GNOMAD_PREFIX = "https://gnomad-public-us-east-1.s3.amazonaws.com/release"
# GNOMAD_PREFIX = "https://gnomad-public-us-east-1.s3.amazonaws.com/release"
# GNOMAD_PREFIX = "https://datasetgnomad.blob.core.windows.net/dataset/release"
GNOMAD_PREFIX = "https://storage.googleapis.com/gcp-public-data--gnomad/release"


rule annos_gnomad_nuclear_download_grch37: # -- download gnomAD v2 exomes/genomes for GRCh37
Expand All @@ -11,24 +13,27 @@ rule annos_gnomad_nuclear_download_grch37: # -- download gnomAD v2 exomes/genom
vcf_tbi="work/download/annos/grch37/seqvars/gnomad_{kind}/{version}/gnomad.{kind}.r{version}.sites.{chrom}.vcf.bgz.tbi",
wildcard_constraints:
kind=r"[a-z]+",
threads: 8
resources:
runtime="48h",
shell:
r"""
aria2c \
--check-certificate=false \
--file-allocation=trunc \
--out={output.vcf} \
--split=8 \
--max-concurrent-downloads=8 \
--max-connection-per-server=8 \
--out={output.vcf} \
{GNOMAD_PREFIX}/{wildcards.version}/vcf/{wildcards.kind}/gnomad.{wildcards.kind}.r{wildcards.version}.sites.{wildcards.chrom}.vcf.bgz
aria2c \
--check-certificate=false \
--file-allocation=trunc \
--out={output.vcf_tbi} \
--split=8 \
--max-concurrent-downloads=8 \
--max-connection-per-server=8 \
--out={output.vcf_tbi} \
{GNOMAD_PREFIX}/{wildcards.version}/vcf/{wildcards.kind}/gnomad.{wildcards.kind}.r{wildcards.version}.sites.{wildcards.chrom}.vcf.bgz.tbi
"""

Expand All @@ -39,24 +44,27 @@ rule annos_gnomad_nuclear_download_grch38_liftover_v2: # -- download gnomAD v2
vcf_tbi="work/download/annos/grch38/seqvars/gnomad_{kind}/{version}/gnomad.{kind}.r{version}.sites.{chrom}.liftover_grch38.vcf.bgz.tbi",
wildcard_constraints:
kind=r"[a-z]+",
threads: 8
resources:
runtime="48h",
shell:
r"""
aria2c \
--check-certificate=false \
--file-allocation=trunc \
--out={output.vcf} \
--split=8 \
--max-concurrent-downloads=8 \
--max-connection-per-server=8 \
--out={output.vcf} \
{GNOMAD_PREFIX}/{wildcards.version}/liftover_grch38/vcf/{wildcards.kind}/gnomad.{wildcards.kind}.r{wildcards.version}.sites.{wildcards.chrom}.liftover_grch38.vcf.bgz
aria2c \
--check-certificate=false \
--file-allocation=trunc \
--out={output.vcf_tbi} \
--split=8 \
--max-concurrent-downloads=8 \
--max-connection-per-server=8 \
--out={output.vcf_tbi} \
{GNOMAD_PREFIX}/{wildcards.version}/liftover_grch38/vcf/{wildcards.kind}/gnomad.{wildcards.kind}.r{wildcards.version}.sites.{wildcards.chrom}.liftover_grch38.vcf.bgz.tbi
"""

Expand All @@ -67,24 +75,27 @@ rule annos_gnomad_nuclear_download_grch38_v3: # -- download gnomAD genomes v3
vcf_tbi="work/download/annos/grch38/seqvars/gnomad_{kind}/{version}/gnomad.{kind}.v{version}.sites.chr{chrom}.vcf.bgz.tbi",
wildcard_constraints:
kind=r"[a-z]+",
threads: 8
resources:
runtime="48h",
shell:
r"""
aria2c \
--check-certificate=false \
--file-allocation=trunc \
--out={output.vcf} \
--split=8 \
--max-concurrent-downloads=8 \
--max-connection-per-server=8 \
--out={output.vcf} \
{GNOMAD_PREFIX}/{wildcards.version}/vcf/{wildcards.kind}/gnomad.{wildcards.kind}.v{wildcards.version}.sites.chr{wildcards.chrom}.vcf.bgz
aria2c \
--check-certificate=false \
--file-allocation=trunc \
--out={output.vcf_tbi} \
--split=8 \
--max-concurrent-downloads=8 \
--max-connection-per-server=8 \
--out={output.vcf_tbi} \
{GNOMAD_PREFIX}/{wildcards.version}/vcf/{wildcards.kind}/gnomad.{wildcards.kind}.v{wildcards.version}.sites.chr{wildcards.chrom}.vcf.bgz.tbi
"""

Expand Down
1 change: 1 addition & 0 deletions rules/reference/human.smk
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ rule reference_download: # -- download reference genome sequence
r"""
aria2c \
--check-certificate=false \
--file-allocation=trunc \
--out={output.download} \
--split=8 \
--max-concurrent-downloads=8 \
Expand Down
42 changes: 42 additions & 0 deletions run-slurm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env bash

# Example call to run the workflow on a Slurm cluster.
#
# The script carries `#SBATCH` directives, so it can be submitted with
# `sbatch`.  Positional arguments are handed to Snakemake as targets; when
# none are given, the `all` target is requested.  The defaults below can be
# overridden through the environment variables JOBS, PART, RELAXED_RERUNS,
# and KEEP_GOING.

#SBATCH --job-name=varfish-db-downloader
#SBATCH --output=slurm-varfish-db-downloader-%j.out
#
#SBATCH --nodes=1
#SBATCH --ntasks=2
#SBATCH --time=2-00:00:00
# NB: the sbatch memory option is spelled `--mem`, not `--memory`.
#SBATCH --mem=2G

# Echo every command and abort on errors, unset variables, and failures
# inside pipelines.
set -x
set -euo pipefail

# Number of jobs to run at the same time.
JOBS=${JOBS-500}
# Default partition.
PART=${PART-critical}
# Be relaxed with reruns: when "true", restrict the rerun triggers to
# mtime/params/input.
RELAXED_RERUNS=${RELAXED_RERUNS-true}
# Whether to add --keep-going
KEEP_GOING=${KEEP_GOING-false}

# Optional Snakemake arguments derived from the switches above.  The array is
# seeded with one element so that expanding it is safe under `set -u`, also on
# older bash versions that treat an empty array as unset.
args=(--rerun-incomplete)
if [[ "$RELAXED_RERUNS" == true ]]; then
    # `--rerun-triggers` accepts a list of values.  NOTE(review): repeating
    # the option presumably keeps only the last occurrence, so all triggers
    # are passed in a single occurrence -- confirm against the Snakemake
    # version in use.
    args+=(--rerun-triggers mtime params input)
fi
if [[ "$KEEP_GOING" == true ]]; then
    args+=(--keep-going)
fi

# `--` ends the value list of --default-resources so that the targets that
# follow are not consumed as resource definitions.
snakemake \
    "${args[@]}" \
    --jobs "$JOBS" \
    --slurm \
    --default-resources \
        slurm_partition="$PART" \
        'runtime="4h"' \
        mem_mb=2000 \
    -- \
    "${@-all}"
2 changes: 1 addition & 1 deletion scripts/vardbs-strucvar-dgv.awk
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ BEGIN {
print "#chromosome", "begin", "end", "sv_type", "observed_gains", "observed_losses";
}
($1 !~ /^variantaccession/) {
if ($16 + $17 > 0) {
if (($2 != "") && ($16 + $17 > 0)) {
print $2, $3 - 1, $4, $6, $16, $17;
}
}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2a941b5967fdbdafe91c53ea789679d5 work/annos/grch37/strucvars/dbvar/2023-05-16/dbvar.bed.gz.tbi
a49371b85557ddbdfcc6af54fbbcefeb work/annos/grch37/strucvars/dbvar/2023-05-16/dbvar.bed.gz.tbi
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3c0a5195f3ecab7b49628be7e122a5c9 work/annos/grch38/strucvars/dbvar/2023-05-16/dbvar.bed.gz.tbi
9f2f08207dd74c16954f5b92676a7ea0 work/annos/grch38/strucvars/dbvar/2023-05-16/dbvar.bed.gz.tbi
8 changes: 7 additions & 1 deletion varfish_db_downloader/data_versions.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
"""Declaration of data versions."""

import os
from datetime import datetime

import attrs

#: Value to use for "today", formatted as ``YYYY-MM-DD``.  The ``TODAY``
#: environment variable takes precedence when it is set; otherwise the
#: current local date at import time is used.
_current_date = datetime.today().strftime("%Y-%m-%d")
TODAY = os.environ.get("TODAY", _current_date)


@attrs.frozen()
class DataVersions:
Expand Down Expand Up @@ -72,7 +78,7 @@ class DataVersions:
ensembl_37="87",
ensembl_38="109",
ensembl="109",
today="2023-06-05",
today=TODAY,
dbnsfp="4.4",
dbscsnv="1.1",
cadd="1.6",
Expand Down

0 comments on commit 2f6deac

Please sign in to comment.