Skip to content

Commit

Permalink
feat: adding clingen curation data (#55)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Aug 25, 2023
1 parent 2a5e76a commit 51435df
Show file tree
Hide file tree
Showing 13 changed files with 60 additions and 5 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
environment-file: environment.yml
init-shell: bash
cache-environment: true
post-cleanup: 'all'
post-cleanup: none # breaks otherwise

- name: Check formatting
run: |
Expand Down Expand Up @@ -65,7 +65,7 @@ jobs:
environment-file: environment.yml
init-shell: bash
cache-environment: true
post-cleanup: 'all'
post-cleanup: none # breaks otherwise

- name: Install python package
run: |
Expand Down
2 changes: 2 additions & 0 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ rule all:
# == work directory =====================================================================
#
# genes
f"work/download/genes/clingen/{DV.clingen_gene}/clingen.csv",
f"work/genes/dbnsfp/{DV.dbnsfp}/genes.tsv.gz",
f"work/genes/ensembl/{DV.ensembl}/ensembl_xlink.tsv",
f"work/genes/enst_ensg/grch37/{DV.ensembl_37}/enst_ensg.tsv",
Expand Down Expand Up @@ -317,6 +318,7 @@ rule all:
include: "rules/work/misc/hpo.smk"
# Gene-related rules.
include: "rules/work/genes/dbnsfp.smk"
include: "rules/work/genes/clingen.smk"
include: "rules/work/genes/ensembl.smk"
include: "rules/work/genes/gnomad.smk"
include: "rules/work/genes/hgnc.smk"
Expand Down
6 changes: 6 additions & 0 deletions download_urls.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
- comment: The curation activity summary report is built in real-time.
url: https://search.clinicalgenome.org/kb/reports/curation-activity-summary-report

- comment: ClinGen variant summary is built in real-time
url: http://erepo.clinicalgenome.org/evrepo/api/classifications/all?format=tabbed

- url: https://github.com/bihealth/annonars-data-clinvar/releases/download/clinvar-weekly-20230625/clinvar-strucvar-grch37-2023-0625+0.6.3.tar.gz
excerpt_strategy:
strategy: no-excerpt
Expand Down
3 changes: 3 additions & 0 deletions excerpt-data/0a27656c7f2ba08a/all
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/0a27656c7f2ba08a/url.txt
Git LFS file not shown
4 changes: 2 additions & 2 deletions excerpt-data/e847ec57405dca9c/hgnc_complete_set.json
Git LFS file not shown
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/f668bbde1014a975/url.txt
Git LFS file not shown
7 changes: 7 additions & 0 deletions rules/output/annonars/genes.smk
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
rule output_annonars_genes: # -- build annonars genes RocksDB file
input:
acmg_sf="data/acmg_sf/{v_acmg_sf}/acmg_sf.tsv",
clingen="work/download/genes/clingen/{date}/clingen.csv",
gnomad_constraints="work/genes/gnomad/{v_gnomad_constraints}/gnomad_constraints.tsv",
dbnsfp="work/genes/dbnsfp/{v_dbnsfp}/genes.tsv.gz",
hgnc="work/genes/hgnc/{date}/hgnc_info.jsonl",
Expand All @@ -25,9 +26,15 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file
v_annonars=RE_VERSION,
shell:
r"""
export TMPDIR=$(mktemp -d)
trap "rm -rf $TMPDIR" EXIT
tail -n +4 {input.clingen} > $TMPDIR/clingen.csv
annonars gene import \
--path-out-rocksdb $(dirname {output.rocksdb_identity}) \
--path-in-acmg {input.acmg_sf} \
--path-in-clingen $TMPDIR/clingen.csv \
--path-in-gnomad-constraints {input.gnomad_constraints} \
--path-in-dbnsfp {input.dbnsfp} \
--path-in-hgnc {input.hgnc} \
Expand Down
2 changes: 2 additions & 0 deletions rules/output/annonars/genes.spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ dc.source:
x-created-from:
- name: ACMG SF Gene List
version: {{ v_acmg_sf }}
- name: ClinGen Gene Curation
version: {{ today }}
- name: gnomAD constraints
version: {{ v_gnomad_constraints }}
- name: dbNSFP
Expand Down
20 changes: 20 additions & 0 deletions rules/work/genes/clingen.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
## Rules related to ClinGen curation download.


# Download the ClinGen curation activity summary report and write its MD5
# checksum next to it.
#
# The `date` wildcard serves as the version of the download: the shell command
# exits with status 1 when `{wildcards.date}` differs from the current date
# (`date +%Y%m%d`), unless `{FORCE_TODAY}` renders as "True".
# NOTE(review): FORCE_TODAY is not defined in this file -- presumably a global
# supplied by the including Snakefile; confirm.
rule genes_clingen_download: # -- download ClinGen curations
output:
# The report as written by `wget -O`.
csv="work/download/genes/clingen/{date}/clingen.csv",
# Output of `md5sum` run on the downloaded report.
csv_md5="work/download/genes/clingen/{date}/clingen.csv.md5",
# NOTE(review): wget is invoked with --no-check-certificate, so the server's
# TLS certificate is not validated -- confirm this is still required.
shell:
r"""
if [[ "$(date +%Y%m%d)" != "{wildcards.date}" ]] && [[ "{FORCE_TODAY}" != "True" ]]; then
>&2 echo "{wildcards.date} is not today"
exit 1
fi
wget --no-check-certificate \
-O {output.csv} \
https://search.clinicalgenome.org/kb/reports/curation-activity-summary-report
md5sum {output.csv} > {output.csv_md5}
"""
6 changes: 6 additions & 0 deletions varfish_db_downloader/versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@
class DataVersions:
"""Container with data versions."""

#: String to use for ClinGen gene curation version.
clingen_gene: str
#: String to use for ClinGen variant curation version.
clingen_variant: str
#: String to use for GRCh37 ENSEMBL version.
ensembl_37: str
#: String to use for GRCh38 ENSEMBL version.
Expand Down Expand Up @@ -105,6 +109,8 @@ class DataVersions:

#: The data versions to use.
DATA_VERSIONS = DataVersions(
clingen_gene=TODAY,
clingen_variant=TODAY,
ensembl_37="87",
ensembl_38="109",
ensembl="109",
Expand Down
2 changes: 1 addition & 1 deletion varfish_db_downloader/wget.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def download_excerpt(url: UrlEntry, data_dir: str, force: bool):

excerpt_fun = STRATEGY_MAP[url.excerpt_strategy.strategy]
parsed = urllib.parse.urlparse(url.url)
basename = parsed.path.split("/")[-1]
basename = parsed.path.split("/")[-1] or "__index__"
out_path_data = str(out_path / basename)
logger.info(" getting excerpt to {}", out_path_data)
excerpt_fun(url.url, str(out_path_data), url.excerpt_strategy.count)
Expand Down

0 comments on commit 51435df

Please sign in to comment.