Skip to content

Commit

Permalink
feat: add reduced development dataset (#44) (#47)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Jun 23, 2023
1 parent 3f70b57 commit dd8170e
Show file tree
Hide file tree
Showing 34 changed files with 430 additions and 151 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Ignore the workflow directories.
/work/
/output/
/reduced-*/

# Python
__pycache__
Expand Down
184 changes: 132 additions & 52 deletions Snakefile

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ dependencies:
# Parallel (de)compression.
- pigz
# Varfish related
- annonars =0.10.0
- viguno =0.1.1
- mehari =0.5.0
- varfish-server-worker
- annonars =0.12.7
- viguno =0.1.6
- mehari =0.5.7
- varfish-server-worker =0.7.0
4 changes: 2 additions & 2 deletions rules/output/annonars/cadd.smk
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ rule output_annonars_cadd: # -- build CADD RocksDB with annonars
unpack(input_output_annonars_cadd),
output:
rocksdb_identity=(
"output/annonars/cadd-{genome_release}-{v_cadd}+{v_annonars}/rocksdb/IDENTITY"
"output/full/annonars/cadd-{genome_release}-{v_cadd}+{v_annonars}/rocksdb/IDENTITY"
),
spec_yaml=("output/annonars/cadd-{genome_release}-{v_cadd}+{v_annonars}/spec.yaml"),
spec_yaml=("output/full/annonars/cadd-{genome_release}-{v_cadd}+{v_annonars}/spec.yaml"),
threads: int(os.environ.get("THREADS_ANNONARS_IMPORT", "96"))
resources:
runtime=os.environ.get("RUNTIME_ANNONARS_IMPORT", "48h"),
Expand Down
4 changes: 2 additions & 2 deletions rules/output/annonars/cons.smk
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ rule output_annonars_cons: # -- build UCSC conservation track RocksDB with anno
tsv="work/annos/{genome_release}/features/cons/{v_cons}/ucsc_conservation.tsv",
output:
rocksdb_identity=(
"output/annonars/cons-{genome_release}-{v_cons}+{v_annonars}/rocksdb/IDENTITY"
"output/full/annonars/cons-{genome_release}-{v_cons}+{v_annonars}/rocksdb/IDENTITY"
),
spec_yaml=("output/annonars/cons-{genome_release}-{v_cons}+{v_annonars}/spec.yaml"),
spec_yaml=("output/full/annonars/cons-{genome_release}-{v_cons}+{v_annonars}/spec.yaml"),
threads: int(os.environ.get("THREADS_ANNONARS_IMPORT", "96"))
resources:
runtime=os.environ.get("RUNTIME_ANNONARS_IMPORT", "48h"),
Expand Down
4 changes: 2 additions & 2 deletions rules/output/annonars/dbnsfp.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ rule output_annonars_dbnsfp: # -- build dbNSFP RocksDB with annonars
input_output_annonars_dbnsfp,
output:
rocksdb_identity=(
"output/annonars/dbnsfp-{genome_release}-{v_dbnsfp}+{v_annonars}/rocksdb/IDENTITY"
"output/full/annonars/dbnsfp-{genome_release}-{v_dbnsfp}+{v_annonars}/rocksdb/IDENTITY"
),
spec_yaml=("output/annonars/dbnsfp-{genome_release}-{v_dbnsfp}+{v_annonars}/spec.yaml"),
spec_yaml=("output/full/annonars/dbnsfp-{genome_release}-{v_dbnsfp}+{v_annonars}/spec.yaml"),
threads: int(os.environ.get("THREADS_ANNONARS_IMPORT", "96"))
resources:
runtime=os.environ.get("RUNTIME_ANNONARS_IMPORT", "48h"),
Expand Down
6 changes: 4 additions & 2 deletions rules/output/annonars/dbscsnv.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@ rule output_annonars_dbscsnv: # -- build dbscSNV RocksDB with annonars
input_output_annonars_dbscsnv,
output:
rocksdb_identity=(
"output/annonars/dbscsnv-{genome_release}-{v_dbscsnv}+{v_annonars}/rocksdb/IDENTITY"
"output/full/annonars/dbscsnv-{genome_release}-{v_dbscsnv}+{v_annonars}/rocksdb/IDENTITY"
),
spec_yaml=(
"output/full/annonars/dbscsnv-{genome_release}-{v_dbscsnv}+{v_annonars}/spec.yaml"
),
spec_yaml=("output/annonars/dbscsnv-{genome_release}-{v_dbscsnv}+{v_annonars}/spec.yaml"),
threads: int(os.environ.get("THREADS_ANNONARS_IMPORT", "96"))
resources:
runtime=os.environ.get("RUNTIME_ANNONARS_IMPORT", "48h"),
Expand Down
4 changes: 2 additions & 2 deletions rules/output/annonars/dbsnp.smk
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ rule output_annonars_dbsnp: # -- build dbSNP RocksDB with annonars
vcf="work/download/annos/{genome_release}/seqvars/dbsnp/{v_dbsnp}/dbsnp.vcf.gz",
output:
rocksdb_identity=(
"output/annonars/dbsnp-{genome_release}-{v_dbsnp}+{v_annonars}/rocksdb/IDENTITY"
"output/full/annonars/dbsnp-{genome_release}-{v_dbsnp}+{v_annonars}/rocksdb/IDENTITY"
),
spec_yaml=("output/annonars/dbsnp-{genome_release}-{v_dbsnp}+{v_annonars}/spec.yaml"),
spec_yaml=("output/full/annonars/dbsnp-{genome_release}-{v_dbsnp}+{v_annonars}/spec.yaml"),
threads: int(os.environ.get("THREADS_ANNONARS_IMPORT", "96"))
resources:
runtime=os.environ.get("RUNTIME_ANNONARS_IMPORT", "48h"),
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
## Rules to create build worker genes database..
## Rules to build the annonars genes database.


rule output_worker_genes: # -- build genes protobuf file
rule output_annonars_genes: # -- build annonars genes RocksDB file
input:
acmg_sf="data/acmg/{v_acmg_sf}/acmg.tsv",
gnomad_constraints="work/genes/gnomad/{v_gnomad_constraints}/gnomad_constraints.tsv",
Expand All @@ -10,22 +10,22 @@ rule output_worker_genes: # -- build genes protobuf file
ncbi="work/genes/entrez/{date}/gene_info.jsonl",
output:
rocksdb_identity=(
"output/worker/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{date}+{v_worker}/"
"output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{date}+{v_annonars}/"
"rocksdb/IDENTITY"
),
spec_yaml=(
"output/worker/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{date}+{v_worker}/"
"output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{date}+{v_annonars}/"
"spec.yaml"
),
wildcard_constraints:
v_acmg_sf=RE_VERSION,
v_gnomad_constraints=RE_VERSION,
v_dbnsfp=RE_VERSION,
date=RE_VERSION,
v_worker=RE_VERSION,
v_annonars=RE_VERSION,
shell:
r"""
varfish-server-worker db genes build \
annonars gene import \
--path-out-rocksdb $(dirname {output.rocksdb_identity}) \
--path-in-acmg {input.acmg_sf} \
--path-in-gnomad-constraints {input.gnomad_constraints} \
Expand All @@ -34,16 +34,16 @@ rule output_worker_genes: # -- build genes protobuf file
--path-in-ncbi {input.ncbi}
varfish-db-downloader tpl \
--template rules/output/worker/genes.spec.yaml \
--template rules/output/annonars/genes.spec.yaml \
--value today={TODAY} \
\
--value version={wildcards.v_acmg_sf}+{wildcards.v_gnomad_constraints}+{wildcards.v_dbnsfp}+{wildcards.date}+{wildcards.v_worker} \
--value version={wildcards.v_acmg_sf}+{wildcards.v_gnomad_constraints}+{wildcards.v_dbnsfp}+{wildcards.date}+{wildcards.v_annonars} \
--value v_acmg_sf={wildcards.v_acmg_sf} \
--value v_gnomad_constraints={wildcards.v_gnomad_constraints} \
--value v_dbnsfp={wildcards.v_dbnsfp} \
--value date={wildcards.date} \
\
--value v_worker={wildcards.v_worker} \
--value v_annonars={wildcards.v_annonars} \
--value v_downloader={PV.downloader} \
> {output.spec_yaml}
"""
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
dc.identifier: worker/genes:{{ version }}
dc.title: VarFish Worker genes database
dc.identifier: annonars/genes:{{ version }}
dc.title: annonars genes database
dc.creator: VarFish Development Team
dc.format: application/x-rocksdb
dc.date: {{ today }}
x-version: {{ version }}
dc.description: |
Gene information from the following databases, aggregated using VarFish worker
v{{ v_worker }} in varfish-downloader v{{ v_downloader }}:
Gene information from the following databases, aggregated using annonars
v{{ v_annonars }} in varfish-downloader v{{ v_downloader }}:
- ACMG Supplementary Findings Gene List {{ v_acmg_sf }}
- gnomAD constraints v{{ v_gnomad_constraints }}
Expand Down
4 changes: 2 additions & 2 deletions rules/output/annonars/gnomad_exomes.smk
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ rule output_annonars_gnomad_exomes: # -- build gnomAD-exomes RocksDB with annon
vcf="work/download/annos/{genome_release}/seqvars/gnomad_exomes/{v_gnomad}/.done",
output:
rocksdb_identity=(
"output/annonars/gnomad-exomes-{genome_release}-{v_gnomad}+{v_annonars}/rocksdb/IDENTITY"
"output/full/annonars/gnomad-exomes-{genome_release}-{v_gnomad}+{v_annonars}/rocksdb/IDENTITY"
),
spec_yaml=(
"output/annonars/gnomad-exomes-{genome_release}-{v_gnomad}+{v_annonars}/spec.yaml"
"output/full/annonars/gnomad-exomes-{genome_release}-{v_gnomad}+{v_annonars}/spec.yaml"
),
threads: int(os.environ.get("THREADS_ANNONARS_IMPORT", "96"))
resources:
Expand Down
4 changes: 2 additions & 2 deletions rules/output/annonars/gnomad_genomes.smk
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ rule output_annonars_gnomad_genomes: # -- build gnomAD-genomes RocksDB with ann
vcf="work/download/annos/{genome_release}/seqvars/gnomad_genomes/{v_gnomad}/.done",
output:
rocksdb_identity=(
"output/annonars/gnomad-genomes-{genome_release}-{v_gnomad}+{v_annonars}/rocksdb/IDENTITY"
"output/full/annonars/gnomad-genomes-{genome_release}-{v_gnomad}+{v_annonars}/rocksdb/IDENTITY"
),
spec_yaml=(
"output/annonars/gnomad-genomes-{genome_release}-{v_gnomad}+{v_annonars}/spec.yaml"
"output/full/annonars/gnomad-genomes-{genome_release}-{v_gnomad}+{v_annonars}/spec.yaml"
),
threads: int(os.environ.get("THREADS_ANNONARS_IMPORT", "96"))
resources:
Expand Down
4 changes: 2 additions & 2 deletions rules/output/annonars/gnomad_mtdna.smk
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ rule output_annonars_gnomad_mtdna: # -- build gnomAD-mtDNA RocksDB with annonar
vcf="work/annos/{genome_release}/seqvars/gnomad_mtdna/{v_gnomad}/gnomad_mtdna.vcf.gz",
output:
rocksdb_identity=(
"output/annonars/gnomad-mtdna-{genome_release}-{v_gnomad}+{v_annonars}/rocksdb/IDENTITY"
"output/full/annonars/gnomad-mtdna-{genome_release}-{v_gnomad}+{v_annonars}/rocksdb/IDENTITY"
),
spec_yaml=(
"output/annonars/gnomad-mtdna-{genome_release}-{v_gnomad}+{v_annonars}/spec.yaml"
"output/full/annonars/gnomad-mtdna-{genome_release}-{v_gnomad}+{v_annonars}/spec.yaml"
),
threads: int(os.environ.get("THREADS_ANNONARS_IMPORT", "96"))
resources:
Expand Down
4 changes: 2 additions & 2 deletions rules/output/annonars/helix.smk
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ rule output_annonars_helixmtdb: # -- build HelixMtDb RocksDB with annonars
vcf="work/annos/{genome_release}/seqvars/helixmtdb/{v_helixmtdb}/helixmtdb.vcf.gz",
output:
rocksdb_identity=(
"output/annonars/helixmtdb-{genome_release}-{v_helixmtdb}+{v_annonars}/rocksdb/IDENTITY",
"output/full/annonars/helixmtdb-{genome_release}-{v_helixmtdb}+{v_annonars}/rocksdb/IDENTITY",
),
spec_yaml=(
"output/annonars/helixmtdb-{genome_release}-{v_helixmtdb}+{v_annonars}/spec.yaml",
"output/full/annonars/helixmtdb-{genome_release}-{v_helixmtdb}+{v_annonars}/spec.yaml",
),
threads: int(os.environ.get("THREADS_ANNONARS_IMPORT", "96"))
resources:
Expand Down
4 changes: 2 additions & 2 deletions rules/output/mehari/freqs.smk
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ rule output_mehari_freqs_build: # -- build frequency tables for mehari
helixmtdb="work/annos/{genome_release}/seqvars/helixmtdb/{v_helixmtdb}/helixmtdb.vcf.gz",
output:
rocksdb_identity=(
"output/mehari/freqs-{genome_release}-{v_gnomad_genomes}+{v_gnomad_exomes}+"
"output/full/mehari/freqs-{genome_release}-{v_gnomad_genomes}+{v_gnomad_exomes}+"
"{v_gnomad_mtdna}+{v_helixmtdb}+{v_annonars}/rocksdb/IDENTITY"
),
spec_yaml=(
"output/mehari/freqs-{genome_release}-{v_gnomad_genomes}+{v_gnomad_exomes}+"
"output/full/mehari/freqs-{genome_release}-{v_gnomad_genomes}+{v_gnomad_exomes}+"
"{v_gnomad_mtdna}+{v_helixmtdb}+{v_annonars}/spec.yaml"
),
threads: int(os.environ.get("THREADS_ANNONARS_IMPORT", "96"))
Expand Down
18 changes: 9 additions & 9 deletions rules/output/viguno/hpo.smk
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## Rules to create build worker phenotypes database.
## Rules to build the viguno phenotypes database.

import os

Expand All @@ -7,16 +7,16 @@ import os
VIGUNO_SIMULATE_THREADS = int(os.environ.get("VIGUNO_SIMULATE_THREADS", 96))


rule output_worker_pheno: # -- copy HPO and simulate
rule output_viguno_pheno: # -- copy HPO and simulate
input:
obo="work/download/hpo/{v_hpo}/hp.obo",
hpoa="work/download/hpo/{v_hpo}/phenotype.hpoa",
genes_to_phenotype="work/download/hpo/{v_hpo}/phenotype_to_genes.txt",
output:
obo="output/viguno/hpo-{v_hpo}+{v_viguno}/hp.obo",
hpoa="output/viguno/hpo-{v_hpo}+{v_viguno}/phenotype.hpoa",
phenotype_to_genes="output/viguno/hpo-{v_hpo}+{v_viguno}/phenotype_to_genes.txt",
rocksdb_identity="output/viguno/hpo-{v_hpo}+{v_viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY",
obo="output/full/viguno/hpo-{v_hpo}+{v_viguno}/hp.obo",
hpoa="output/full/viguno/hpo-{v_hpo}+{v_viguno}/phenotype.hpoa",
phenotype_to_genes="output/full/viguno/hpo-{v_hpo}+{v_viguno}/phenotype_to_genes.txt",
rocksdb_identity="output/full/viguno/hpo-{v_hpo}+{v_viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY",
wildcard_constraints:
v_hpo=RE_VERSION,
v_viguno=RE_VERSION,
Expand Down Expand Up @@ -55,10 +55,10 @@ rule global_hpo_to_bin: # -- convert to .bin
obo="work/download/hpo/{v_hpo}/hp.obo",
hpoa="work/download/hpo/{v_hpo}/phenotype.hpoa",
genes_to_phenotype="work/download/hpo/{v_hpo}/phenotype_to_genes.txt",
rocksdb_identity="output/viguno/hpo-{v_hpo}+{v_viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY",
rocksdb_identity="output/full/viguno/hpo-{v_hpo}+{v_viguno}/scores-fun-sim-avg-resnik-gene/IDENTITY",
output:
bin="output/viguno/hpo-{v_hpo}+{v_viguno}/hpo.bin",
spec_yaml=("output/viguno/hpo-{v_hpo}-{v_viguno}/spec.yaml"),
bin="output/full/viguno/hpo-{v_hpo}+{v_viguno}/hpo.bin",
spec_yaml=("output/full/viguno/hpo-{v_hpo}+{v_viguno}/spec.yaml"),
wildcard_constraints:
v_hpo=RE_VERSION,
v_viguno=RE_VERSION,
Expand Down
6 changes: 3 additions & 3 deletions rules/output/worker/patho_mms.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@

rule output_worker_patho_mms:
output:
bed="output/worker/annos/strucvars/patho-mms-{genome_release}-{v_patho_mms}/patho-mms.bed",
bed_md5="output/worker/annos/strucvars/patho-mms-{genome_release}-{v_patho_mms}/patho-mms.bed.md5",
spec="output/worker/annos/strucvars/patho-mms-{genome_release}-{v_patho_mms}/patho-mms.spec.yaml",
bed="output/full/worker/annos/strucvars/patho-mms-{genome_release}-{v_patho_mms}/patho-mms.bed",
bed_md5="output/full/worker/annos/strucvars/patho-mms-{genome_release}-{v_patho_mms}/patho-mms.bed.md5",
spec="output/full/worker/annos/strucvars/patho-mms-{genome_release}-{v_patho_mms}/patho-mms.spec.yaml",
wildcard_constraints:
genome_release=RE_GENOME,
v_patho_mms=RE_VERSION,
Expand Down
45 changes: 45 additions & 0 deletions rules/reduced/annonars.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
## Rules to build annonars database subsets (dev/exomes).
#
# We copy each full annonars RocksDB, passing the target exons BED file of the reduced set via ``--path-beds``.


def input_subset_annonars(wildcards):
    """Resolve the input files of ``rule subset_annonars`` from its wildcards.

    Returns a dict with two entries:

    - ``bed``: path of the RefSeq target exons BED file below
      ``output/reduced-{set_name}/targets``; the RefSeq version is
      ``DV.refseq_37`` for ``grch37`` and ``DV.refseq_38`` for any other
      genome release.
    - ``rocksdb_identity``: path of the ``IDENTITY`` file of the matching
      RocksDB below ``output/full/annonars``.
    """
    # Every release other than "grch37" falls back to the GRCh38 RefSeq version.
    refseq_version = DV.refseq_37 if wildcards.genome_release == "grch37" else DV.refseq_38
    bed_path = (
        f"output/reduced-{wildcards.set_name}/targets/{wildcards.genome_release}/"
        f"refseq/{refseq_version}/refseq_target_exons.bed"
    )
    identity_path = (
        f"output/full/annonars/{wildcards.name}-{wildcards.genome_release}-"
        f"{wildcards.version_multi}/rocksdb/IDENTITY"
    )
    return {"bed": bed_path, "rocksdb_identity": identity_path}


rule subset_annonars:  # -- create exomes subset
    # Copies the full annonars RocksDB given by ``input.rocksdb_identity`` into the
    # reduced dataset ``set_name``, handing the BED file from ``input.bed`` to
    # ``annonars db-utils copy`` via ``--path-beds``.
    input:
        unpack(input_subset_annonars),
    output:
        rocksdb_identity="output/reduced-{set_name}/annonars/{name}-{genome_release}-{version_multi}/rocksdb/IDENTITY",
    wildcard_constraints:
        name=RE_NAME,
        genome_release=RE_GENOME,
        # Constrain the wildcard that the output pattern actually uses.  The previous
        # entries (``v_hpo`` and ``versions``) did not correspond to any wildcard of
        # this rule, so ``version_multi`` was left unconstrained.
        version_multi=RE_VERSION_MULTI,
    # Thread count and runtime can be overridden through the environment.
    threads: int(os.environ.get("THREADS_ANNONARS_IMPORT", "96"))
    resources:
        runtime=os.environ.get("RUNTIME_ANNONARS_IMPORT", "48h"),
        mem_mb_per_cpu=2000,
    shell:
        r"""
        annonars db-utils copy \
            --path-in $(dirname {input.rocksdb_identity}) \
            --path-out $(dirname {output.rocksdb_identity}) \
            --path-beds {input.bed}
        """
Loading

0 comments on commit dd8170e

Please sign in to comment.