Skip to content

Commit

Permalink
feat: annotation of genes with OMIM diseases (#57)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Aug 25, 2023
1 parent b0d6884 commit 6c19f59
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 4 deletions.
5 changes: 3 additions & 2 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,13 @@ rule all:
# genes
f"work/download/genes/clingen/{DV.clingen_gene}/clingen.csv",
f"work/download/genes/rcnv/2022/Collins_rCNV_2022.dosage_sensitivity_scores.tsv.gz",
f"work/download/genes/shet/2019/Collins_rCNV_2022.dosage_sensitivity_scores.tsv.gz",
f"work/genes/dbnsfp/{DV.dbnsfp}/genes.tsv.gz",
f"work/genes/ensembl/{DV.ensembl}/ensembl_xlink.tsv",
f"work/genes/enst_ensg/grch37/{DV.ensembl_37}/enst_ensg.tsv",
f"work/genes/entrez/{DV.today}/gene_info.jsonl",
f"work/genes/gnomad/{DV.gnomad_constraints}/gnomad_constraints.tsv",
f"work/genes/hgnc/{DV.today}/hgnc_info.jsonl",
f"work/genes/omim/{DV.hpo}+{DV.today}/omim_diseases.tsv",
"work/genes/rcnv/2022/rcnv_collins_2022.tsv",
"work/genes/shet/2019/shet_weghorn_2019.tsv",
# reference-specific annotations
Expand Down Expand Up @@ -160,7 +160,7 @@ rule all:
f"output/full/annonars/cons-grch37-{DV.ucsc_cons_37}+{PV.annonars}/rocksdb/IDENTITY",
f"output/full/annonars/cons-grch38-{DV.ucsc_cons_38}+{PV.annonars}/rocksdb/IDENTITY",
# ----- genes
f"output/full/annonars/genes-{DV.acmg_sf}+{DV.gnomad_constraints}+{DV.dbnsfp}+{DV.today}+{PV.annonars}/rocksdb/IDENTITY",
f"output/full/annonars/genes-{DV.acmg_sf}+{DV.gnomad_constraints}+{DV.dbnsfp}+{DV.hpo}+{DV.today}+{PV.annonars}/rocksdb/IDENTITY",
# -- worker data
f"output/full/worker/genes-regions-grch37-{DV.refseq_37}+{PV.worker}/refseq_genes.bin",
f"output/full/worker/genes-regions-grch37-{DV.ensembl_37}+{PV.worker}/ensembl_genes.bin",
Expand Down Expand Up @@ -328,6 +328,7 @@ include: "rules/work/genes/gnomad.smk"
include: "rules/work/genes/hgnc.smk"
include: "rules/work/genes/mehari_data_tx.smk"
include: "rules/work/genes/ncbi.smk"
include: "rules/work/genes/omim.smk"
include: "rules/work/genes/rcnv.smk"
include: "rules/work/genes/shet.smk"
# Reference sequence--related rules.
Expand Down
6 changes: 4 additions & 2 deletions rules/output/annonars/genes.smk
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,16 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file
dbnsfp="work/genes/dbnsfp/{v_dbnsfp}/genes.tsv.gz",
hgnc="work/genes/hgnc/{date}/hgnc_info.jsonl",
ncbi="work/genes/entrez/{date}/gene_info.jsonl",
omim="work/genes/omim/{v_hpo}+{date}/omim_diseases.tsv",
rcnv="work/genes/rcnv/2022/rcnv_collins_2022.tsv",
shet="work/genes/shet/2019/shet_weghorn_2019.tsv",
output:
rocksdb_identity=(
"output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{date}+{v_annonars}/"
"output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{date}+{v_annonars}/"
"rocksdb/IDENTITY"
),
spec_yaml=(
"output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{date}+{v_annonars}/"
"output/full/annonars/genes-{v_acmg_sf}+{v_gnomad_constraints}+{v_dbnsfp}+{v_hpo}+{date}+{v_annonars}/"
"spec.yaml"
),
wildcard_constraints:
Expand All @@ -40,6 +41,7 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file
--path-in-gnomad-constraints {input.gnomad_constraints} \
--path-in-dbnsfp {input.dbnsfp} \
--path-in-hgnc {input.hgnc} \
--path-in-omim {input.omim} \
--path-in-ncbi {input.ncbi} \
--path-in-rcnv {input.rcnv} \
--path-in-shet {input.shet}
Expand Down
6 changes: 6 additions & 0 deletions rules/output/annonars/genes.spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,9 @@ x-created-from:
version: {{ today }}
- name: HGNC
version: {{ today }}
- name: OMIM
version: {{ today }}
- name: rCNV pHaplo/pTriplo scores
version: 2022-Collins-et-al
- name: sHet scores
version: 2019-Weghorn-et-al
51 changes: 51 additions & 0 deletions rules/work/genes/omim.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
## Rules related to OMIM disease to HGNC ID annotation.


rule genes_omim:  # -- prepare HGNC to OMIM disease mapping
    # Builds a TSV with the columns hgnc_id, omim_id, disease_name by
    #
    #   1. normalizing the mim2gene_medgen header and prefixing each data row
    #      with "OMIM:",
    #   2. joining that table with the gene cross-link table (entrez_id == GeneID)
    #      to obtain (hgnc_id, omim_id) pairs, and
    #   3. joining those pairs with the HPO annotation file (omim_id == database_id)
    #      to pick up the disease name.
    #
    # An MD5 checksum file is written next to the output TSV.
    input:
        mim2gene="work/download/genes/ncbi/{date}/mim2gene_medgen",
        xlink="output/full/mehari/genes-xlink-{date}/genes-xlink.tsv",
        hpoa="work/download/hpo/{v_hpo}/phenotype.hpoa",
    output:
        tsv="work/genes/omim/{v_hpo}+{date}/omim_diseases.tsv",
    shell:
        """
        set -x

        # Work in a private scratch directory and always remove it again,
        # both on success and on failure, so runs do not leak temporary files.
        export TMPDIR=$(mktemp -d)
        trap "rm -rf $TMPDIR" ERR EXIT

        # Header line: replace spaces by underscores and drop the hash sign so
        # that the column names can be used as qsv selectors.
        head -n 1 {input.mim2gene} | sed -e 's/ /_/g' -e 's/#//g' \
        > $TMPDIR/mim2gene.tsv
        # Data lines: prefix the first column with OMIM: so that it can be
        # matched against the database_id column of the HPO annotation file.
        tail -n +2 {input.mim2gene} \
        | sed -e 's/^/OMIM:/g' \
        >> $TMPDIR/mim2gene.tsv

        # Strip the comment lines from the HPO annotation file.
        grep -v '^#' {input.hpoa} \
        > $TMPDIR/phenotype.hpoa

        # Map gene IDs to HGNC IDs; the result is converted back to
        # tab-separated form because the next join reads it with -d TAB.
        qsv join -d '\t' \
            entrez_id {input.xlink} \
            GeneID $TMPDIR/mim2gene.tsv \
        | qsv select 'hgnc_id,MIM_number' \
        | qsv rename 'hgnc_id,omim_id' \
        | tr ',' '\t' \
        | qsv sort \
        > $TMPDIR/mim2gene_hgnc.tsv

        # Write the output header first; the joined records follow without
        # their own header (tail -n +2).  The sort is keyed on the first two
        # comma-separated fields and -u keeps one row per key.
        echo hgnc_id,omim_id,disease_name \
        > $TMPDIR/output.csv
        qsv join -d '\t' \
            omim_id $TMPDIR/mim2gene_hgnc.tsv \
            database_id $TMPDIR/phenotype.hpoa \
        | qsv select 'hgnc_id,omim_id,disease_name' \
        | tail -n +2 \
        | sort -t , -k1,2V -u \
        >> $TMPDIR/output.csv

        # Convert the CSV into the final tab-separated file and checksum it.
        qsv fmt -t '\t' $TMPDIR/output.csv \
        > {output.tsv}

        md5sum {output.tsv} > {output.tsv}.md5
        """

0 comments on commit 6c19f59

Please sign in to comment.