Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: adding annonars functional #68

Merged
merged 13 commits into from
Nov 22, 2023
7 changes: 7 additions & 0 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,11 @@ rule all:
f"output/full/annonars/gnomad-sv-exomes-grch38-{DV.gnomad_cnv4}+{PV.annonars}/rocksdb/IDENTITY",
f"output/full/annonars/gnomad-sv-genomes-grch37-{DV.gnomad_sv}+{PV.annonars}/rocksdb/IDENTITY",
f"output/full/annonars/gnomad-sv-genomes-grch38-{DV.gnomad_sv4}+{PV.annonars}/rocksdb/IDENTITY",
# ----- sequence annotation
f"output/full/annonars/functional-grch37-{DV.refseq_fe_37}+{PV.annonars}/rocksdb/IDENTITY",
f"output/full/annonars/functional-grch38-{DV.refseq_fe_38}+{PV.annonars}/rocksdb/IDENTITY",
f"output/full/annonars/regions-grch37-{DV.today}+{PV.annonars}/rocksdb/IDENTITY",
f"output/full/annonars/regions-grch38-{DV.today}+{PV.annonars}/rocksdb/IDENTITY",
# ----- conservation
f"output/full/annonars/cons-grch37-{DV.ucsc_cons_37}+{PV.annonars}/rocksdb/IDENTITY",
f"output/full/annonars/cons-grch38-{DV.ucsc_cons_38}+{PV.annonars}/rocksdb/IDENTITY",
Expand Down Expand Up @@ -386,6 +391,8 @@ include: "rules/output/annonars/gnomad_mtdna.smk"
include: "rules/output/annonars/gnomad_sv.smk"
include: "rules/output/annonars/helix.smk"
include: "rules/output/annonars/genes.smk"
include: "rules/output/annonars/functional.smk"
include: "rules/output/annonars/regions.smk"
# ---- worker
include: "rules/output/worker/patho_mms.smk"
include: "rules/output/worker/clinvar.smk"
Expand Down
9 changes: 9 additions & 0 deletions download_urls.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
# RefSeq genomic GFF annotation for GRCh37.p13 (NCBI annotation release 105.20201022).
# NOTE(review): `gz-head` with `count: 1000` presumably keeps only the first 1000
# lines of the decompressed file as the test excerpt -- confirm against the excerpt tooling.
- url: https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20201022/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz
excerpt_strategy:
strategy: gz-head
count: 1000
# RefSeq genomic GFF annotation for GRCh38.p14 (NCBI annotation release 110).
- url: https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz
excerpt_strategy:
strategy: gz-head
count: 1000

- url: https://storage.googleapis.com/gcp-public-data--gnomad/legacy/exac_browser/ExAC.r1.sites.vep.vcf.gz
excerpt_strategy:
strategy: gz-head
Expand Down
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/98935d27cc8f0dc0/url.txt
Git LFS file not shown
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/f0ed4b0862f1b46b/url.txt
Git LFS file not shown
67 changes: 67 additions & 0 deletions rules/output/annonars/functional.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
## Rules to build the annonars functional annotation database.


# Download the RefSeq genomic GFF annotation file for GRCh37 from the NCBI FTP
# server via wget.  The `version` and `assembly` wildcards are taken from the
# requested output path and substituted verbatim into the NCBI
# `annotation_releases/9606/...` URL.
rule work_annonars_functional_download_37: # -- download functional data for GRCh37
output:
"work/download/refseq/grch37/{version}/{assembly}_genomic.gff.gz",
shell:
r"""
wget -O {output} \
https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/{wildcards.version}/{wildcards.assembly}/{wildcards.assembly}_genomic.gff.gz
"""


# Download the RefSeq genomic GFF annotation file for GRCh38 from the NCBI FTP
# server via wget.  The `version` and `assembly` wildcards are taken from the
# requested output path and substituted verbatim into the NCBI
# `annotation_releases/9606/...` URL.
rule work_annonars_functional_download_38: # -- download functional data for GRCh38
output:
"work/download/refseq/grch38/{version}/{assembly}_genomic.gff.gz",
shell:
r"""
wget -O {output} \
https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/{wildcards.version}/{wildcards.assembly}/{wildcards.assembly}_genomic.gff.gz
"""


def output_annonars_functional_input(wildcards):
    """Input function: path of the downloaded RefSeq GFF for the requested release.

    Returns the GRCh37.p13 GFF path (versioned by ``DV.refseq_fe_37``) when
    ``wildcards.genome_release`` is ``"grch37"``; any other value yields the
    GRCh38.p14 GFF path (versioned by ``DV.refseq_fe_38``).
    """
    # Guard clause: everything that is not GRCh37 is treated as GRCh38.
    if wildcards.genome_release != "grch37":
        return f"work/download/refseq/grch38/{DV.refseq_fe_38}/GCF_000001405.40_GRCh38.p14_genomic.gff.gz"
    return f"work/download/refseq/grch37/{DV.refseq_fe_37}/GCF_000001405.25_GRCh37.p13_genomic.gff.gz"


# Build the annonars "functional" RocksDB database and its spec.yaml.
#
# Steps performed by the shell block:
#   1. Create a private temporary directory that is removed on exit.
#   2. Filter the downloaded RefSeq GFF down to header lines (starting with `#`)
#      and lines mentioning `RefSeqFE`, writing the result to a temporary GFF.
#   3. Import that GFF with `annonars functional import` into the RocksDB
#      directory that contains the `IDENTITY` output file.
#   4. Render `functional.spec.yaml` via `varfish-db-downloader tpl`.
#
# The spec template references `{{ genome_release }}`, so the genome release is
# passed to the template explicitly alongside the version values.
rule output_annonars_functional:  # -- build annonars functional RocksDB file
    input:
        output_annonars_functional_input,
    output:
        rocksdb_identity=(
            "output/full/annonars/functional-{genome_release}-{v_refseq}+{v_annonars}/"
            "rocksdb/IDENTITY"
        ),
        spec_yaml=(
            "output/full/annonars/functional-{genome_release}-{v_refseq}+{v_annonars}/spec.yaml"
        ),
    wildcard_constraints:
        v_refseq=RE_VERSION,
        v_annonars=RE_VERSION,
    shell:
        r"""
        export TMPDIR=$(mktemp -d)
        trap "rm -rf $TMPDIR" EXIT

        zgrep '^#\|RefSeqFE' {input} > $TMPDIR/tmp.gff

        annonars functional import -vvv \
            --genome-release {wildcards.genome_release} \
            --path-in-gff $TMPDIR/tmp.gff \
            --path-out-rocksdb $(dirname {output.rocksdb_identity})

        varfish-db-downloader tpl \
            --template rules/output/annonars/functional.spec.yaml \
            --value today={TODAY} \
            --value genome_release={wildcards.genome_release} \
            \
            --value version={wildcards.v_refseq}+{wildcards.v_annonars} \
            --value v_refseq={wildcards.v_refseq} \
            \
            --value v_annonars={wildcards.v_annonars} \
            --value v_downloader={PV.downloader} \
        > {output.spec_yaml}
        """
16 changes: 16 additions & 0 deletions rules/output/annonars/functional.spec.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
dc.identifier: annonars/functional:{{ version }}-{{ genome_release }}
dc.title: annonars functional elements RocksDB
dc.creator: VarFish Developer Teams
dc.format: application/x-rocksdb
dc.date: {{ today }}
x-version: {{ version }}
x-genome-release: {{ genome_release }}
dc.description: |
RocksDB built from RefSeq Functional Elements (and other sources in
the future).
dc.source:
- PMID:34876495
- https://www.ncbi.nlm.nih.gov/refseq/
x-created-from:
- name: RefSeq Functional Elements
version: {{ v_refseq }}
52 changes: 52 additions & 0 deletions rules/output/annonars/regions.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
## Rules to build the annonars regions annotation database.


# Download the ClinGen region curation list for one genome release.
#
# The lower-case `genome_release` wildcard from the output path is mapped to the
# capitalisation used in the ClinGen FTP file name: `grch37` -> `GRCh37`, and
# anything else -> `GRCh38`.  The `today` wildcard only versions the output
# path; it is not part of the download URL.
rule work_annonars_regions_download:  # -- download clingen regions
    output:
        "work/download/clingen/{genome_release}/{today}/ClinGen_region_curation_list_{genome_release}.tsv",
    shell:
        r"""
        if [[ "{wildcards.genome_release}" == "grch37" ]]; then
            GENOME=GRCh37
        else
            GENOME=GRCh38
        fi

        wget -O {output} \
            ftp://ftp.clinicalgenome.org/ClinGen_region_curation_list_$GENOME.tsv
        """


# Build the annonars "regions" RocksDB database and its spec.yaml from the
# downloaded ClinGen region curation list.
#
# Steps performed by the shell block:
#   1. Refuse to run (exit 1) when the `date` wildcard is not the current date
#      (`date +%Y%m%d`), unless `FORCE_TODAY` renders as "True".
#   2. Import the ClinGen TSV with `annonars regions import` into the RocksDB
#      directory that contains the `IDENTITY` output file.
#   3. Render `regions.spec.yaml` via `varfish-db-downloader tpl`.
#
# The spec template references `{{ genome_release }}`, so the genome release is
# passed to the template explicitly.  Only `v_annonars` is constrained: this
# rule's paths contain no `v_refseq` wildcard.
rule output_annonars_regions:  # -- build annonars regions RocksDB file
    input:
        "work/download/clingen/{genome_release}/{date}/ClinGen_region_curation_list_{genome_release}.tsv",
    output:
        rocksdb_identity=(
            "output/full/annonars/regions-{genome_release}-{date}+{v_annonars}/" "rocksdb/IDENTITY"
        ),
        spec_yaml=("output/full/annonars/regions-{genome_release}-{date}+{v_annonars}/spec.yaml"),
    wildcard_constraints:
        v_annonars=RE_VERSION,
    shell:
        r"""
        if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then
            >&2 echo "{wildcards.date} is not today"
            exit 1
        fi

        annonars regions import -vvv \
            --genome-release {wildcards.genome_release} \
            --path-in-clingen {input} \
            --path-out-rocksdb $(dirname {output.rocksdb_identity})

        varfish-db-downloader tpl \
            --template rules/output/annonars/regions.spec.yaml \
            --value today={TODAY} \
            --value genome_release={wildcards.genome_release} \
            \
            --value version={wildcards.date}+{wildcards.v_annonars} \
            \
            --value v_annonars={wildcards.v_annonars} \
            --value v_downloader={PV.downloader} \
        > {output.spec_yaml}
        """
14 changes: 14 additions & 0 deletions rules/output/annonars/regions.spec.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
dc.identifier: annonars/regions:{{ version }}-{{ genome_release }}
dc.title: annonars regions annotation RocksDB
dc.creator: VarFish Developer Teams
dc.format: application/x-rocksdb
dc.date: {{ today }}
x-version: {{ version }}
x-genome-release: {{ genome_release }}
dc.description: |
RocksDB with region annotation.
dc.source:
- https://search.clinicalgenome.org/kb/gene-dosage
x-created-from:
- name: ClinGen Region Dosage Pathogenicity
version: {{ today }}
6 changes: 6 additions & 0 deletions varfish_db_downloader/versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,10 @@ class DataVersions:
#: Marker file for the tracks version. This allows us to update the
#: tracks BED files later on.
tracks: str
#: RefSeq functional elements for GRCh37.
refseq_fe_37: str
#: RefSeq functional elements for GRCh38.
refseq_fe_38: str


#: The data versions to use.
Expand Down Expand Up @@ -158,6 +162,8 @@ class DataVersions:
clinvar_release=CLINVAR_RELEASE,
clinvar_version=CLINVAR_VERSION,
tracks="0",
refseq_fe_37="105.20201022",
refseq_fe_38="110",
)


Expand Down