Skip to content

Commit

Permalink
feat: restructuring, cleanup, documentation (#34)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Jun 1, 2023
1 parent 5430e2e commit bb128c6
Show file tree
Hide file tree
Showing 32 changed files with 7,796 additions and 1,663 deletions.
58 changes: 11 additions & 47 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,55 +1,19 @@
**/download
**/rocksdb*
**/.done

annos/**/*.vcf*
annos/**/*.bed*
annos/**/*.tsv*

features/**/*.bed*
features/**/*.tsv*
features/**/*.md5

genes/**/*.bed*
genes/**/*.tsv*
genes/**/*.jsonl*
genes/**/*.md5

vardbs/**/*.tsv*
vardbs/**/*.bed*
vardbs/**/*.md5

reference/**/*.fa*

tracks/**/*.bed*

/stats-*/
/*.xlsx
.~*
core.*
/report.NEW
# Ignore the workflow directories.
work/
output/

# Python
*.egg-info
src/
*.pyc
*.pyo

# Snakemake
.snakemake

# Text Editors / IDEs
*~
.*.sw?
.idea
logs/
run.sh
.snakemake
/slurm_log
/old
/tmp
/noref
/GRCh37
/GRCh38
src/
*.pyc
*.pyo
*.bak*
varfish-server-background-db-*
jannovar-db-*
varfish-annotator-*
*.tar.gz
*.sha256
.vscode/
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ check-black:
.PHONY: check-snakefmt
snakefmt:
snakefmt --check --diff --line-length 100 Snakefile
snakefmt --check --diff --line-length 100 snakefiles/*.smk
snakefmt --check --diff --line-length 100 rules/*/*.smk

# Run Python linting with flake8.
.PHONY: flake8
Expand Down Expand Up @@ -86,4 +86,4 @@ black:
.PHONY: run-snakefmt
run-snakefmt:
snakefmt --line-length 100 Snakefile
snakefmt --line-length 100 snakefiles/*.smk
snakefmt --line-length 100 rules/*/*.smk
17 changes: 16 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,22 @@ This repository contains a Snakemake workflow with supporting code for downloadi
- License: MIT
- Programming Language: Python / Snakemake

## Running

Use the utility rule `help` to get a list of all available rules:

```
# snakemake --cores=1 help
```

Run them all with `all`:

```
# snakemake --cores=1 all
```

Note that this will take a long time, use a lot of disk space, and download a lot of data.

## Development Setup

### Prerequisites: Install `mamba` for Conda Package Management
Expand Down Expand Up @@ -48,7 +64,6 @@ This will install the `varfish-db-downloader` tools:
# pip install -e .
```


## Developer Rules

### Download Commands
Expand Down
139 changes: 91 additions & 48 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@
# ``varfish-server-worker`` and is used in the backend for filtering and/or exposed to the
# user via a REST API.

from varfish_db_downloader.data_versions import DATA_VERSIONS as DV

# The prefix to use for all shell commands.
SHELL_PREFIX = "export LC_ALL=C; set -x -euo pipefail;"
# Set up the shell prefix by default.
shell.prefix(SHELL_PREFIX)


# ===============================================================================================
# Test Mode
# ===============================================================================================
Expand All @@ -20,62 +28,97 @@ if os.environ.get("CI", "false").lower() == "true":


# ===============================================================================================
# Default Rule
# Top-Level Rules
# ===============================================================================================


rule default:
## help -- print this help
rule help:
input:
"annos/grch37/cadd/.done",
"annos/grch37/dbnsfp-4.4a/.done",
"annos/grch37/dbnsfp-4.4c/.done",
"annos/grch37/dbscsnv/.done",
"annos/grch37/helixmtdb/helixmtdb.vcf.gz",
"annos/grch37/gnomad_mtdna/gnomad_mtdna.vcf.gz",
"annos/grch37/ucsc_conservation/ucsc_conservation.tsv",
"annos/grch37/dbsnp/dbsnp.vcf.gz",
"annos/grch37/gnomad_exomes/.done",
"annos/grch37/gnomad_genomes/.done",
"annos/grch38/cadd/.done",
"annos/grch38/dbnsfp-4.4a/.done",
"annos/grch38/dbnsfp-4.4c/.done",
"annos/grch38/gnomad_exomes/.done",
"annos/grch38/gnomad_genomes/.done",
"annos/grch38/gnomad_mtdna/gnomad_mtdna.vcf.gz",
"annos/grch38/helixmtdb/helixmtdb.vcf.gz",
"features/grch37/tads/imr90.bed",
"features/grch37/tads/hesc.bed",
"features/grch37/gene_regions/refseq.bed.gz",
"features/grch37/gene_regions/ensembl.bed.gz",
"features/grch37/masked/repeat.bed.gz",
"features/grch37/masked/segdup.bed.gz",
"genes/hgnc/hgnc_info.jsonl",
"genes/ncbi/gene_info.jsonl",
"genes/dbnsfp/genes.tsv.gz",
"genes/xlink/ensembl.tsv",
"genes/xlink/hgnc.tsv",
"genes/mim2gene/mim2gene.tsv",
"tracks/grch37/ucsc_genomicSuperDups.bed.gz",
"tracks/grch37/ucsc_rmsk.bed.gz",
"tracks/grch37/ucsc_fixSeqLiftOverPsl.bed.gz",
"tracks/grch37/ucsc_altSeqLiftOverPsl.bed.gz",
# "vardbs/grch37/strucvar/clinvar.bed.gz",
"vardbs/grch37/strucvar/dbvar.bed.gz",
"vardbs/grch37/strucvar/dgv.bed.gz",
"vardbs/grch37/strucvar/dgv_gs.bed.gz",
"vardbs/grch37/strucvar/g1k.bed.gz",
"vardbs/grch37/strucvar/gnomad_sv.bed.gz",
"vardbs/grch37/strucvar/exac.bed.gz",
"Snakefile",
run:
shell.prefix("") # no ``set -x`` for this rule
shell(
r"""
echo
echo "=== Available Rules ==="
echo
for f in Snakefile $(find rules/* -name '*.smk' | sort); do
echo "--- $f ---"
echo
grep '^##' $f
echo
grep -e '^rule' $f
echo
done
"""
)


## all -- run all rules
# rule all:
# input:
# Gene-Related Information
# "work/genes/hgnc/hgnc_info.jsonl",
# "genes/ncbi/gene_info.jsonl",
# "genes/dbnsfp/genes.tsv.gz",
# "genes/xlink/ensembl.tsv",
# "genes/xlink/hgnc.tsv",
# "genes/mim2gene/mim2gene.tsv",
# # Per-Reference Variant Annotations
# "annos/grch37/cadd/.done",
# "annos/grch37/dbnsfp-4.4a/.done",
# "annos/grch37/dbnsfp-4.4c/.done",
# "annos/grch37/dbscsnv/.done",
# "annos/grch37/helixmtdb/helixmtdb.vcf.gz",
# "annos/grch37/gnomad_mtdna/gnomad_mtdna.vcf.gz",
# "annos/grch37/ucsc_conservation/ucsc_conservation.tsv",
# "annos/grch37/dbsnp/dbsnp.vcf.gz",
# "annos/grch37/gnomad_exomes/.done",
# "annos/grch37/gnomad_genomes/.done",
# "annos/grch38/cadd/.done",
# "annos/grch38/dbnsfp-4.4a/.done",
# "annos/grch38/dbnsfp-4.4c/.done",
# "annos/grch38/gnomad_exomes/.done",
# "annos/grch38/gnomad_genomes/.done",
# "annos/grch38/gnomad_mtdna/gnomad_mtdna.vcf.gz",
# "annos/grch38/helixmtdb/helixmtdb.vcf.gz",
# # Per-Reference "Features"
# "features/grch37/tads/imr90.bed",
# "features/grch37/tads/hesc.bed",
# "features/grch37/gene_regions/refseq.bed.gz",
# "features/grch37/gene_regions/ensembl.bed.gz",
# "features/grch37/masked/repeat.bed.gz",
# "features/grch37/masked/segdup.bed.gz",
# "tracks/grch37/ucsc_genomicSuperDups.bed.gz",
# "tracks/grch37/ucsc_rmsk.bed.gz",
# "tracks/grch37/ucsc_fixSeqLiftOverPsl.bed.gz",
# "tracks/grch37/ucsc_altSeqLiftOverPsl.bed.gz",
# # "vardbs/grch37/strucvar/clinvar.bed.gz",
# "vardbs/grch37/strucvar/dbvar.bed.gz",
# "vardbs/grch37/strucvar/dgv.bed.gz",
# "vardbs/grch37/strucvar/dgv_gs.bed.gz",
# "vardbs/grch37/strucvar/g1k.bed.gz",
# "vardbs/grch37/strucvar/gnomad_sv.bed.gz",
# "vardbs/grch37/strucvar/exac.bed.gz",


# ===============================================================================================
# Modular Snakefile Includes
# ===============================================================================================


include: "snakefiles/annos.smk"
include: "snakefiles/genes.smk"
include: "snakefiles/features.smk"
include: "snakefiles/vardbs-grch37-strucvars.smk"
include: "snakefiles/tracks-grch37.smk"
include: "snakefiles/reference.smk"
# Gene-related information.
include: "rules/genes/dbnsfp.smk"
include: "rules/genes/ensembl.smk"
include: "rules/genes/gnomad.smk"
include: "rules/genes/hgnc.smk"
include: "rules/genes/ncbi.smk"
# Reference sequence-related information.
include: "rules/reference/human.smk"


# include: "rules/annos.smk"
# include: "rules/features.smk"
# include: "rules/vardbs-grch37-strucvars.smk"
# include: "rules/tracks-grch37.smk"
7 changes: 5 additions & 2 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@ dependencies:
- vcfpy
- tqdm
- prov =2
- jq
- aria2
- pigz
# Elementary Python dependencies
- python =3.10
Expand All @@ -37,3 +35,8 @@ dependencies:
# Shell formatting and linting.
- beautysh >=6.0,<7.0
- shellcheck >=0.9,<0.10
# JSON transformation tool used in many rules.
- jq
# Tools for file downloads.
- aria2
- wget
6 changes: 0 additions & 6 deletions genes/README.md

This file was deleted.

21 changes: 0 additions & 21 deletions genes/acmg/acmg.spec.json

This file was deleted.

Loading

0 comments on commit bb128c6

Please sign in to comment.