Skip to content

Commit

Permalink
feat: integrating PanelApp download for annonars (#79) (#80)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Jan 5, 2024
1 parent 4fe6866 commit 3a332a0
Show file tree
Hide file tree
Showing 19 changed files with 92 additions and 12 deletions.
1 change: 1 addition & 0 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,7 @@ include: "rules/work/genes/hgnc.smk"
include: "rules/work/genes/mehari_data_tx.smk"
include: "rules/work/genes/ncbi.smk"
include: "rules/work/genes/omim.smk"
include: "rules/work/genes/panelapp.smk"
include: "rules/work/genes/orphapacket.smk"
include: "rules/work/genes/rcnv.smk"
include: "rules/work/genes/shet.smk"
Expand Down
17 changes: 15 additions & 2 deletions download_urls.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
- url: https://panelapp.genomicsengland.co.uk/api/v1/entities/
excerpt_strategy:
strategy: no-excerpt
count: null
- url: https://panelapp.genomicsengland.co.uk/api/v1/entities/?page=2
excerpt_strategy:
strategy: no-excerpt
count: null
- url: https://panelapp.genomicsengland.co.uk/api/v1/entities/?page=3
excerpt_strategy:
strategy: no-excerpt
count: null

# dbNSFP v4.5a
- url: https://dbnsfp.s3.amazonaws.com/dbNSFP4.5a.zip
excerpt_strategy:
Expand Down Expand Up @@ -78,12 +91,12 @@
- url: ftp://ftp.clinicalgenome.org/ClinGen_gene_curation_list_GRCh37.tsv
- url: ftp://ftp.clinicalgenome.org/ClinGen_gene_curation_list_GRCh38.tsv

- url: https://storage.cloud.google.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
- url: https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
excerpt_strategy:
strategy: no-excerpt
count: null

- url: https://storage.cloud.google.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
- url: https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
excerpt_strategy:
strategy: manual
count: null
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ dependencies:
# Parallel (de)compression.
- pigz
# Varfish related
- annonars =0.32.0
- annonars =0.33.0
- viguno =0.2.0
- mehari =0.21.1
- varfish-server-worker =0.10.2
Expand Down
3 changes: 3 additions & 0 deletions excerpt-data/4c4ffa6ddc180f40/url.txt
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/4efb53cbe56f8290/url.txt
Git LFS file not shown
3 changes: 0 additions & 3 deletions excerpt-data/95e8d788836873e9/url.txt

This file was deleted.

3 changes: 3 additions & 0 deletions excerpt-data/c9c7d6df0e24b954/__index__
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/c9c7d6df0e24b954/url.txt
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/cdaaf7a3f7595d3d/__index__
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/cdaaf7a3f7595d3d/url.txt
Git LFS file not shown
3 changes: 0 additions & 3 deletions excerpt-data/d0a5951ccb4cd824/url.txt

This file was deleted.

3 changes: 3 additions & 0 deletions excerpt-data/ebc07f725c64907d/__index__
Git LFS file not shown
3 changes: 3 additions & 0 deletions excerpt-data/ebc07f725c64907d/url.txt
Git LFS file not shown
2 changes: 2 additions & 0 deletions rules/output/annonars/genes.smk
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file
ncbi="work/genes/entrez/{date}/gene_info.jsonl",
omim="work/genes/omim/{v_hpo}+{date}/omim_diseases.tsv",
orpha="work/genes/orphapacket/{v_orpha}+{date}/orpha_diseases.tsv",
panelapp="work/download/genes/panelapp/{date}/panelapp.jsonl",
rcnv="work/genes/rcnv/2022/rcnv_collins_2022.tsv",
shet="work/genes/shet/2019/shet_weghorn_2019.tsv",
gtex="work/genes/annonars/gtex_v8/genes_tpm.jsonl.gz",
Expand Down Expand Up @@ -44,6 +45,7 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file
--path-in-hgnc {input.hgnc} \
--path-in-omim {input.omim} \
--path-in-orpha {input.orpha} \
--path-in-panelapp {input.panelapp} \
--path-in-ncbi {input.ncbi} \
--path-in-rcnv {input.rcnv} \
--path-in-shet {input.shet} \
Expand Down
4 changes: 2 additions & 2 deletions rules/work/genes/gtex.smk
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ rule genes_gtex_v8_download: # -- download GTex v8 gene expression data
r"""
wget --no-check-certificate \
-O {output.attributes} \
https://storage.cloud.google.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
wget --no-check-certificate \
-O {output.genes_tpm} \
https://storage.cloud.google.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
md5sum {output.attributes} > {output.attributes_md5}
md5sum {output.genes_tpm} > {output.genes_tpm_md5}
Expand Down
43 changes: 43 additions & 0 deletions rules/work/genes/panelapp.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
## Rules related to Genomics England PanelApp entities


import os
import subprocess
import sys
import tempfile


rule genes_panelapp_download: # -- download PanelApp entities
    output:
        jsonl="work/download/genes/panelapp/{date}/panelapp.jsonl",
    run:
        # Download every page of the PanelApp ``/entities/`` listing and write
        # the entries of each page's ``results`` array to ``output.jsonl``,
        # one JSON document per line.
        #
        # Imported locally so the rule does not rely on ``json`` being made
        # available by whatever includes this file.
        import json

        base_url = "https://panelapp.genomicsengland.co.uk/api/v1"

        pages = []
        page_no = 1
        # Total number of pages; unknown until the first page has been read.
        page_count = None
        url = f"{base_url}/entities/"
        with tempfile.TemporaryDirectory() as tmpdir:
            page_path = os.path.join(tmpdir, "page.json")
            # Each page carries the URL of its successor in ``next``; the loop
            # ends once that value is missing or empty.
            while url:
                print(
                    f"downloading page {page_no}/{page_count if page_count else '?'}...",
                    file=sys.stderr,
                )
                subprocess.check_call(["wget", "-O", page_path, url])
                with open(page_path, "rt", encoding="utf-8") as f:
                    page = json.load(f)
                pages.append(page)
                url = page.get("next")
                page_no += 1
                if page_count is None:
                    # Estimate the page count for the progress message only.
                    # Skip the estimate when the page is empty or carries no
                    # ``count`` so that a degenerate response cannot cause a
                    # division by zero or a ``None`` arithmetic error.
                    per_page = len(page.get("results", []))
                    total = page.get("count")
                    if per_page and total is not None:
                        page_count = (total + per_page - 1) // per_page
                # ``page_no`` has already been advanced, so ``> 2`` means two
                # pages have been fetched.
                if os.environ.get("CI", None) == "true" and page_no > 2:
                    print("CI mode: only downloading first 2 pages", file=sys.stderr)
                    break

        # Derive the directory from the declared output instead of rebuilding
        # the path by hand, so the two cannot drift apart.
        os.makedirs(os.path.dirname(output.jsonl), exist_ok=True)
        with open(output.jsonl, "wt") as f:
            for page in pages:
                for result in page.get("results", []):
                    print(json.dumps(result), file=f)
5 changes: 4 additions & 1 deletion varfish_db_downloader/wget.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,4 +250,7 @@ def copy_excerpt(url: UrlEntry, data_dir: str, output_document: str):
basename = parsed.path.split("/")[-1]
excerpt_path = in_path / basename
click.echo(err=True, message="copying {} => {}".format(excerpt_path, output_document))
shutil.copy(excerpt_path, output_document)
if os.path.isdir(excerpt_path):
shutil.copy(f"{excerpt_path}/__index__", output_document)
else:
shutil.copy(excerpt_path, output_document)

0 comments on commit 3a332a0

Please sign in to comment.