From bb128c648b8cbba049a70bb3226824ee6775025e Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Thu, 1 Jun 2023 11:16:09 +0200 Subject: [PATCH] feat: restructuring, cleanup, documnetation (#34) --- .gitignore | 58 +- Makefile | 4 +- README.md | 17 +- Snakefile | 139 +- environment.yml | 7 +- genes/README.md | 6 - genes/acmg/acmg.spec.json | 21 - genes/acmg/acmg.tsv | 91 - genes/acmg/acmg.tsv.md5 | 1 - genes/acmg/acmg.tsv.sha256 | 1 - genes/dbnsfp/genes.spec.json | 118 - ...GCF_000001405.25_GRCh37.p13_genomic.gtf.gz | Bin 0 -> 9211 bytes .../grch37/download/knowntoEnsembl.txt.gz | Bin 0 -> 690 bytes .../gnomad_constraints.spec.json | 42 - genes/mim2gene/mim2gene.spec.json | 18 - genes/mim2gene/mim2gene.tsv | 7228 +++++++++++++++++ genes/mim2gene/mim2gene.tsv.md5 | 1 + genes/xlink/ensembl.spec.json | 20 - genes/xlink/hgnc.spec.json | 21 - rules/genes/dbnsfp.smk | 95 + rules/genes/ensembl.smk | 67 + rules/genes/gnomad.smk | 85 + rules/genes/hgnc.smk | 56 + rules/genes/ncbi.smk | 74 + rules/reference/human.smk | 50 + snakefiles/annos.smk | 451 - snakefiles/features.smk | 149 - snakefiles/genes.smk | 274 - snakefiles/reference.smk | 35 - snakefiles/tracks-grch37.smk | 89 - snakefiles/vardbs-grch37-strucvars.smk | 226 - varfish_db_downloader/data_versions.py | 15 + 32 files changed, 7796 insertions(+), 1663 deletions(-) delete mode 100644 genes/README.md delete mode 100644 genes/acmg/acmg.spec.json delete mode 100644 genes/acmg/acmg.tsv delete mode 100644 genes/acmg/acmg.tsv.md5 delete mode 100644 genes/acmg/acmg.tsv.sha256 delete mode 100644 genes/dbnsfp/genes.spec.json create mode 100644 genes/enst_ensg/grch37/download/GCF_000001405.25_GRCh37.p13_genomic.gtf.gz create mode 100644 genes/enst_ensg/grch37/download/knowntoEnsembl.txt.gz delete mode 100644 genes/gnomad_constraints/gnomad_constraints.spec.json delete mode 100644 genes/mim2gene/mim2gene.spec.json create mode 100644 genes/mim2gene/mim2gene.tsv create mode 100644 genes/mim2gene/mim2gene.tsv.md5 delete mode 
100644 genes/xlink/ensembl.spec.json delete mode 100644 genes/xlink/hgnc.spec.json create mode 100644 rules/genes/dbnsfp.smk create mode 100644 rules/genes/ensembl.smk create mode 100644 rules/genes/gnomad.smk create mode 100644 rules/genes/hgnc.smk create mode 100644 rules/genes/ncbi.smk create mode 100644 rules/reference/human.smk delete mode 100644 snakefiles/annos.smk delete mode 100644 snakefiles/features.smk delete mode 100644 snakefiles/genes.smk delete mode 100644 snakefiles/reference.smk delete mode 100644 snakefiles/tracks-grch37.smk delete mode 100644 snakefiles/vardbs-grch37-strucvars.smk create mode 100644 varfish_db_downloader/data_versions.py diff --git a/.gitignore b/.gitignore index cb37c1e..fe88cf6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,55 +1,19 @@ -**/download -**/rocksdb* -**/.done - -annos/**/*.vcf* -annos/**/*.bed* -annos/**/*.tsv* - -features/**/*.bed* -features/**/*.tsv* -features/**/*.md5 - -genes/**/*.bed* -genes/**/*.tsv* -genes/**/*.jsonl* -genes/**/*.md5 - -vardbs/**/*.tsv* -vardbs/**/*.bed* -vardbs/**/*.md5 - -reference/**/*.fa* - -tracks/**/*.bed* - -/stats-*/ -/*.xlsx -.~* -core.* -/report.NEW +# Ignore the workflow directories. +work/ +output/ +# Python *.egg-info +src/ +*.pyc +*.pyo + +# Snakemake +.snakemake +# Text Editors / IDEs *~ .*.sw? .idea -logs/ -run.sh -.snakemake -/slurm_log -/old -/tmp -/noref -/GRCh37 -/GRCh38 -src/ -*.pyc -*.pyo *.bak* -varfish-server-background-db-* -jannovar-db-* -varfish-annotator-* -*.tar.gz -*.sha256 .vscode/ diff --git a/Makefile b/Makefile index 854f18d..1a9a638 100644 --- a/Makefile +++ b/Makefile @@ -50,7 +50,7 @@ check-black: .PHONY: check-snakefmt snakefmt: snakefmt --check --diff --line-length 100 Snakefile - snakefmt --check --diff --line-length 100 snakefiles/*.smk + snakefmt --check --diff --line-length 100 rules/*/*.smk # Run Python linting with flake8. 
.PHONY: flake8 @@ -86,4 +86,4 @@ black: .PHONY: run-snakefmt run-snakefmt: snakefmt --line-length 100 Snakefile - snakefmt --line-length 100 snakefiles/*.smk + snakefmt --line-length 100 rules/*/*.smk diff --git a/README.md b/README.md index 6825a0e..addf8f8 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,22 @@ This repository contains a Snakemake workflow with supporting code for downloadi - License: MIT - Programming Language: Python / Snakemake +## Running + +Use the utility rule `help` to get a list of all available rules: + +``` +# snakemake --cores=1 help +``` + +Run them all with `all`: + +``` +# snakemake --cores=1 all +``` + +Note that this will take a long time, use a lot of disk space, and download a lot of data. + ## Development Setup ### Prerequisites: Install `mamba` for Conda Package Management @@ -48,7 +64,6 @@ This will install the `varfish-db-downloader` tools: # pip install -e . ``` - ## Developer Rules ### Download Commands diff --git a/Snakefile b/Snakefile index c8820a1..a475a01 100644 --- a/Snakefile +++ b/Snakefile @@ -6,6 +6,14 @@ # ``varfish-server-worker`` and is used in the backend for filtering and/or exposed to the # user via a REST API. +from varfish_db_downloader.data_versions import DATA_VERSIONS as DV + +# The prefix to use for all shell commands. +SHELL_PREFIX = "export LC_ALL=C; set -x -euo pipefail;" +# Setup the shell prefix by default. 
+shell.prefix(SHELL_PREFIX) + + # =============================================================================================== # Test Mode # =============================================================================================== @@ -20,52 +28,79 @@ if os.environ.get("CI", "false").lower() == "true": # =============================================================================================== -# Default Rule +# Top-Level Rules # =============================================================================================== -rule default: +## help -- print this help +rule help: input: - "annos/grch37/cadd/.done", - "annos/grch37/dbnsfp-4.4a/.done", - "annos/grch37/dbnsfp-4.4c/.done", - "annos/grch37/dbscsnv/.done", - "annos/grch37/helixmtdb/helixmtdb.vcf.gz", - "annos/grch37/gnomad_mtdna/gnomad_mtdna.vcf.gz", - "annos/grch37/ucsc_conservation/ucsc_conservation.tsv", - "annos/grch37/dbsnp/dbsnp.vcf.gz", - "annos/grch37/gnomad_exomes/.done", - "annos/grch37/gnomad_genomes/.done", - "annos/grch38/cadd/.done", - "annos/grch38/dbnsfp-4.4a/.done", - "annos/grch38/dbnsfp-4.4c/.done", - "annos/grch38/gnomad_exomes/.done", - "annos/grch38/gnomad_genomes/.done", - "annos/grch38/gnomad_mtdna/gnomad_mtdna.vcf.gz", - "annos/grch38/helixmtdb/helixmtdb.vcf.gz", - "features/grch37/tads/imr90.bed", - "features/grch37/tads/hesc.bed", - "features/grch37/gene_regions/refseq.bed.gz", - "features/grch37/gene_regions/ensembl.bed.gz", - "features/grch37/masked/repeat.bed.gz", - "features/grch37/masked/segdup.bed.gz", - "genes/hgnc/hgnc_info.jsonl", - "genes/ncbi/gene_info.jsonl", - "genes/dbnsfp/genes.tsv.gz", - "genes/xlink/ensembl.tsv", - "genes/xlink/hgnc.tsv", - "genes/mim2gene/mim2gene.tsv", - "tracks/grch37/ucsc_genomicSuperDups.bed.gz", - "tracks/grch37/ucsc_rmsk.bed.gz", - "tracks/grch37/ucsc_fixSeqLiftOverPsl.bed.gz", - "tracks/grch37/ucsc_altSeqLiftOverPsl.bed.gz", - # "vardbs/grch37/strucvar/clinvar.bed.gz", - "vardbs/grch37/strucvar/dbvar.bed.gz", - 
"vardbs/grch37/strucvar/dgv.bed.gz", - "vardbs/grch37/strucvar/dgv_gs.bed.gz", - "vardbs/grch37/strucvar/g1k.bed.gz", - "vardbs/grch37/strucvar/gnomad_sv.bed.gz", - "vardbs/grch37/strucvar/exac.bed.gz", + "Snakefile", + run: + shell.prefix("") # no ``set -x`` for this rule + shell( + r""" + echo + echo "=== Available Rules ===" + echo + for f in Snakefile $(find rules/* -name '*.smk' | sort); do + echo "--- $f ---" + echo + grep '^##' $f + echo + grep -e '^rule' $f + echo + done + """ + ) + + +## all -- run all rules +# rule all: +# input: +# Gene-Related Information +# "work/genes/hgnc/hgnc_info.jsonl", +# "genes/ncbi/gene_info.jsonl", +# "genes/dbnsfp/genes.tsv.gz", +# "genes/xlink/ensembl.tsv", +# "genes/xlink/hgnc.tsv", +# "genes/mim2gene/mim2gene.tsv", +# # Per-Reference Variant Annotations +# "annos/grch37/cadd/.done", +# "annos/grch37/dbnsfp-4.4a/.done", +# "annos/grch37/dbnsfp-4.4c/.done", +# "annos/grch37/dbscsnv/.done", +# "annos/grch37/helixmtdb/helixmtdb.vcf.gz", +# "annos/grch37/gnomad_mtdna/gnomad_mtdna.vcf.gz", +# "annos/grch37/ucsc_conservation/ucsc_conservation.tsv", +# "annos/grch37/dbsnp/dbsnp.vcf.gz", +# "annos/grch37/gnomad_exomes/.done", +# "annos/grch37/gnomad_genomes/.done", +# "annos/grch38/cadd/.done", +# "annos/grch38/dbnsfp-4.4a/.done", +# "annos/grch38/dbnsfp-4.4c/.done", +# "annos/grch38/gnomad_exomes/.done", +# "annos/grch38/gnomad_genomes/.done", +# "annos/grch38/gnomad_mtdna/gnomad_mtdna.vcf.gz", +# "annos/grch38/helixmtdb/helixmtdb.vcf.gz", +# # Per-Reference "Features" +# "features/grch37/tads/imr90.bed", +# "features/grch37/tads/hesc.bed", +# "features/grch37/gene_regions/refseq.bed.gz", +# "features/grch37/gene_regions/ensembl.bed.gz", +# "features/grch37/masked/repeat.bed.gz", +# "features/grch37/masked/segdup.bed.gz", +# "tracks/grch37/ucsc_genomicSuperDups.bed.gz", +# "tracks/grch37/ucsc_rmsk.bed.gz", +# "tracks/grch37/ucsc_fixSeqLiftOverPsl.bed.gz", +# "tracks/grch37/ucsc_altSeqLiftOverPsl.bed.gz", +# # 
"vardbs/grch37/strucvar/clinvar.bed.gz", +# "vardbs/grch37/strucvar/dbvar.bed.gz", +# "vardbs/grch37/strucvar/dgv.bed.gz", +# "vardbs/grch37/strucvar/dgv_gs.bed.gz", +# "vardbs/grch37/strucvar/g1k.bed.gz", +# "vardbs/grch37/strucvar/gnomad_sv.bed.gz", +# "vardbs/grch37/strucvar/exac.bed.gz", # =============================================================================================== @@ -73,9 +108,17 @@ rule default: # =============================================================================================== -include: "snakefiles/annos.smk" -include: "snakefiles/genes.smk" -include: "snakefiles/features.smk" -include: "snakefiles/vardbs-grch37-strucvars.smk" -include: "snakefiles/tracks-grch37.smk" -include: "snakefiles/reference.smk" +# Gene-related information. +include: "rules/genes/dbnsfp.smk" +include: "rules/genes/ensembl.smk" +include: "rules/genes/gnomad.smk" +include: "rules/genes/hgnc.smk" +include: "rules/genes/ncbi.smk" +# Reference sequence--related information. +include: "rules/reference/human.smk" + + +# include: "rules/annos.smk" +# include: "rules/features.smk" +# include: "rules/vardbs-grch37-strucvars.smk" +# include: "rules/tracks-grch37.smk" diff --git a/environment.yml b/environment.yml index 4920985..462b20d 100644 --- a/environment.yml +++ b/environment.yml @@ -16,8 +16,6 @@ dependencies: - vcfpy - tqdm - prov =2 - - jq - - aria2 - pigz # Elementary Python dependencies - python =3.10 @@ -37,3 +35,8 @@ dependencies: # Shell formatting and linting. - beautysh >=6.0,<7.0 - shellcheck >=0.9,<0.10 + # JSON transformation tool used in many rules. + - jq + # Tools for file downloads.
+ - aria2 + - wget diff --git a/genes/README.md b/genes/README.md deleted file mode 100644 index ddd82d7..0000000 --- a/genes/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# Gene-Centric Information for VarFish - -- acmg -- ACMG supplemental findings -- dbnsfp -- dbNSFP gene information (complete with full interactions) -- gnomad-constraints -- gnomAD gene constraint scores -- xlink -- interlinks between identifier maps diff --git a/genes/acmg/acmg.spec.json b/genes/acmg/acmg.spec.json deleted file mode 100644 index a590551..0000000 --- a/genes/acmg/acmg.spec.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "dc:format": "text/tsv", - "dc:identifier": "gene-centric/acmg/acmg-sf-genes.tsv:3.1", - "dc:title": "ACMG Secondary Findings Gene List (v3.1)", - "dc:description": "This is version 3.1 of the ACMG gene list for reporting incidental findings. The file was curated from PMID:35802134 as gene symbols and then translated to ENSEMBL and Entrez/NCBI gene ID with the HGNC BioMart", - "dc:created": "2022-02-03", - "dc:creator": "American College of Medical Genetics", - "dc:contributor": [ - "VarFish Developer Team" - ], - "dc:source": [ - "PMID:35802134", - "https://www.ncbi.nlm.nih.gov/clinvar/docs/acmg/", - "https://biomart.genenames.org/" - ], - "tsv:columns": { - "gene_symbol": "HGNC approve gene symbol", - "ensembl_gene_id": "ENSEMBL gene ID", - "entrez_gene_id": "Entrez/NCBI gene ID" - } -} \ No newline at end of file diff --git a/genes/acmg/acmg.tsv b/genes/acmg/acmg.tsv deleted file mode 100644 index b324eff..0000000 --- a/genes/acmg/acmg.tsv +++ /dev/null @@ -1,91 +0,0 @@ -hgnc_id ensembl_gene_id ncbi_gene_id gene_symbol mim_gene_id disease_phenotype disorder_mim phenotype_category inheritance sf_list_version variants_to_report -HGNC:130 ENSG00000107796 59 ACTA2 102620 Familial thoracic aortic aneurysm 611788 Cardiovascular AD 1.0 All P and LP -HGNC:143 ENSG00000159251 70 ACTC1 102540 Hypertrophic cardiomyopathy 612098 Cardiovascular AD 1.0 All P and LP -HGNC:175 
ENSG00000139567 94 ACVRL1 601284 Hereditary hemorrhagic telangiectasia 600376 Miscellaneous AD 3.0 All P and LP -HGNC:583 ENSG00000134982 324 APC 611731 Familial adenomatous polyposis 175100 Cancer AD 1.0 All P and LP -HGNC:603 ENSG00000084674 338 APOB 107730 Familial hypercholesterolemia 144010 Cardiovascular AD 1.0 All P and LP -HGNC:870 ENSG00000123191 540 ATP7B 606882 Wilson disease 277900 Miscellaneous AR 2.0 P and LP (2 variants) -HGNC:939 ENSG00000151929 9531 BAG3 603883 Dilated cardiomyopathy 613881 Cardiovascular AD 3.1 All P and LP -HGNC:939 ENSG00000151929 9531 BAG3 603883 Myofibrillar myopathy 612954 Cardiovascular AD 3.1 All P and LP -HGNC:1076 ENSG00000107779 657 BMPR1A 601299 Juvenile polyposis syndrome 174900 Cancer AD 1.0 All P and LP -HGNC:1100 ENSG00000012048 672 BRCA1 113705 Hereditary breast and ovarian cancer 604370 Cancer AD 1.0 All P and LP -HGNC:1101 ENSG00000139618 675 BRCA2 600185 Hereditary breast and ovarian cancer 612555 Cancer AD 1.0 All P and LP -HGNC:1122 ENSG00000169814 686 BTD 609019 Biotinidase deficiency 253260 Metabolic AR 3.0 P and LP (2 variants) -HGNC:1397 ENSG00000081248 779 CACNA1S 114208 Malignant hyperthermia 601887 Miscellaneous AD 1.0 All P and LP -HGNC:1513 ENSG00000118729 845 CASQ2 114251 Catecholaminergic polymorphic ventricular tachycardia 611938 Cardiovascular AR 3.0 P and LP (2 variants) -HGNC:2201 ENSG00000168542 1281 COL3A1 120180 Ehlers-Danlos syndrome, vascular type 130050 Cardiovascular AD 1.0 All P and LP -HGNC:2770 ENSG00000175084 1674 DES 125660 Dliated cardiomyopathy 604765 Cardiovascular AD 3.1 All P and LP -HGNC:2770 ENSG00000175084 1674 DES 125660 Myofibrillar myopathy 601419 Cardiovascular AD 3.1 All P and LP -HGNC:3036 ENSG00000134755 1824 DSC2 125645 Arrhythmogenic right ventricular cardiomyopathy 610476 Cardiovascular AD 1.0 All P and LP -HGNC:3049 ENSG00000046604 1829 DSG2 125671 Arrhythmogenic right ventricular cardiomyopathy 610193 Cardiovascular AD 1.0 All P and LP -HGNC:3052 ENSG00000096696 
1832 DSP 125647 Arrhythmogenic right ventricular cardiomyopathy 607450 Cardiovascular AD 1.0 All P and LP -HGNC:3052 ENSG00000096696 1832 DSP 125647 Dilated cardiomyopathy 615821 Cardiovascular AD 1.0 All P and LP -HGNC:3349 ENSG00000106991 2022 ENG 131195 Hereditary hemorrhagic telangiectasia 187300 Miscellaneous AD 3.0 All P and LP -HGNC:3603 ENSG00000166147 2200 FBN1 134797 Marfan syndrome 154700 Cardiovascular AD 1.0 All P and LP -HGNC:3756 ENSG00000128591 2318 FLNC 102565 Dilated cardiomyopathy 617047 Cardiovascular AD 3.0 All P and LP -HGNC:3756 ENSG00000128591 2318 FLNC 102565 Myofibrillar myopathy 609524 Cardiovascular AD 3.0 All P and LP -HGNC:4065 ENSG00000171298 2548 GAA 606800 Pompe disease 232300 Metabolic AR 3.0 P and LP (2 variants) -HGNC:4296 ENSG00000102393 2717 GLA 300644 Fabry disease 301500 Cardiovascular Metabolic XL 1.0 All hemi, het, homozygous P and LP -HGNC:4886 ENSG00000010704 3077 HFE 613609 Hereditary hemochromatosis (c.845G>A; p.C282Y homozygotes only) 235200 Miscellaneous AR 3.0 p.C282Y homozygotes only -HGNC:11621 ENSG00000135100 6927 HNF1A 142410 Maturity-Onset of Diabetes of the Young 600496 Miscellaneous AD 3.0 All P and LP -HGNC:6251 ENSG00000055118 3757 KCNH2 152427 Long-QT syndrome type 2 613688 Cardiovascular AD 1.0 All P and LP -HGNC:6294 ENSG00000053918 3784 KCNQ1 607542 Long-QT syndrome type 1 192500 Cardiovascular AD 1.0 All P and LP -HGNC:6547 ENSG00000130164 3949 LDLR 606945 Familial hypercholesterolemia 143890 Cardiovascular AD 1.0 All P and LP -HGNC:6636 ENSG00000160789 4000 LMNA 150330 Dilated cardiomyopathy 115200 Cardiovascular AD 1.0 All P and LP -HGNC:6913 ENSG00000125952 4149 MAX 154950 Hereditary paraganglioma-pheochromocytoma syndrome 171300 Cancer AD 3.0 All P and LP -HGNC:7010 ENSG00000133895 4221 MEN1 613733 Multiple endocrine neoplasia type 1 131100 Cancer AD 1.0 All P and LP -HGNC:7127 ENSG00000076242 4292 MLH1 120436 Lynch syndrome 609310 Cancer AD 1.0 All P and LP -HGNC:7325 ENSG00000095002 4436 MSH2 
609309 Lynch syndrome 120435 Cancer AD 1.0 All P and LP -HGNC:7329 ENSG00000116062 2956 MSH6 600678 Lynch syndrome 614350 Cancer AD 1.0 All P and LP -HGNC:7527 ENSG00000132781 4595 MUTYH 604933 MUTYH-associated polyposis 608456 Cancer AR 1.0 P and LP (2 variants) -HGNC:7551 ENSG00000134571 4607 MYBPC3 600958 Hypertrophic cardiomyopathy 115197 Cardiovascular AD 1.0 All P and LP -HGNC:7569 ENSG00000133392 4629 MYH11 160745 Familial thoracic aortic aneurysm 132900 Cardiovascular AD 1.0 All P and LP -HGNC:7577 ENSG00000092054 4625 MYH7 160760 Hypertrophic cardiomyopathy 192600 Cardiovascular AD 1.0 All P and LP -HGNC:7577 ENSG00000092054 4625 MYH7 160760 Dilated cardiomyopathy 613426 Cardiovascular AD 1.0 All P and LP -HGNC:7583 ENSG00000111245 4633 MYL2 160781 Hypertrophic cardiomyopathy 608758 Cardiovascular AD 1.0 All P and LP -HGNC:7584 ENSG00000160808 4634 MYL3 160790 Hypertrophic cardiomyopathy 608751 Cardiovascular AD 1.0 All P and LP -HGNC:7773 ENSG00000186575 4771 NF2 607379 Neurofibromatosis type 2 101000 Cancer AD 1.0 All P and LP -HGNC:8512 ENSG00000036473 5009 OTC 300461 Ornithine transcarbamylase deficiency 311250 Metabolic XL 2.0 All hemi, het, homozygous P and LP -HGNC:26144 ENSG00000083093 79728 PALB2 610355 Hereditary breast cancer 114480 Cancer AD 3.0 All P and LP -HGNC:20001 ENSG00000169174 255738 PCSK9 607786 Familial hypercholesterolemia 603776 Cardiovascular AD 1.0 All P and LP -HGNC:9024 ENSG00000057294 5318 PKP2 602861 Arrhythmogenic right ventricular cardiomyopathy 609040 Cardiovascular AD 1.0 All P and LP -HGNC:9122 ENSG00000122512 5395 PMS2 600259 Lynch syndrome 614337 Cancer AD 1.0 All P and LP -HGNC:9386 ENSG00000106617 51422 PRKAG2 602743 Hypertrophic cardiomyopathy 600858 Cardiovascular Metabolic AD 1.0 All P and LP -HGNC:9588 ENSG00000171862 5728 PTEN 601728 PTEN hamartoma tumor syndrome 158350 Cancer AD 1.0 All P and LP -HGNC:9884 ENSG00000139687 5925 RB1 614041 Retinoblastoma 180200 Cancer AD 1.0 All P and LP -HGNC:27424 
ENSG00000203867 282996 RBM20 613171 Dliated cardiomyopathy 613172 Cardiovascular AD 3.1 All P and LP -HGNC:9967 ENSG00000165731 5979 RET 164761 Familial medullary thyroid cancer 155240 Cancer AD 1.0 All P and LP -HGNC:9967 ENSG00000165731 5979 RET 164761 Multiple endocrine neoplasia type 2A 171400 Cancer AD 1.0 All P and LP -HGNC:9967 ENSG00000165731 5979 RET 164761 Multiple endocrine neoplasia type 2B 162300 Cancer AD 1.0 All P and LP -HGNC:10294 ENSG00000116745 6121 RPE65 180069 RPE65-related retinopathy 204100, 613794 Miscellaneous AR 3.0 P and LP (2 variants) -HGNC:10483 ENSG00000196218 6261 RYR1 180901 Malignant hyperthermia 145600 Miscellaneous AD 1.0 All P and LP -HGNC:10484 ENSG00000198626 6262 RYR2 180902 Catecholaminergic polymorphic ventricular tachycardia 604772 Cardiovascular AD 1.0 All P and LP -HGNC:10593 ENSG00000183873 6331 SCN5A 600163 Long QT syndrome type 3 603830 Cardiovascular AD 1.0 All P and LP -HGNC:10593 ENSG00000183873 6331 SCN5A 600163 Brugada syndrome 601144 Cardiovascular AD 1.0 All P and LP -HGNC:10593 ENSG00000183873 6331 SCN5A 600163 Dilated cardiomyopathy 601154 Cardiovascular AD 1.0 All P and LP -HGNC:26034 ENSG00000167985 54949 SDHAF2 613019 Hereditary paraganglioma-pheochromocytoma syndrome 601650 Cancer AD 1.0 All P and LP -HGNC:10681 ENSG00000117118 6390 SDHB 185470 Hereditary paraganglioma-pheochromocytoma syndrome 115310, 171300 Cancer AD 1.0 All P and LP -HGNC:10682 ENSG00000143252 6391 SDHC 602413 Hereditary paraganglioma-pheochromocytoma syndrome 605373 Cancer AD 1.0 All P and LP -HGNC:10683 ENSG00000204370 6392 SDHD 602690 Hereditary paraganglioma-pheochromocytoma syndrome 168000, 171300 Cancer AD 1.0 All P and LP -HGNC:6769 ENSG00000166949 4088 SMAD3 603109 Loeys-Dietz syndrome 613795 Cardiovascular AD 1.0 All P and LP -HGNC:6770 ENSG00000141646 4089 SMAD4 600993 Juvenile polyposis syndrome 174900 Cancer AD 1.0 All P and LP -HGNC:6770 ENSG00000141646 4089 SMAD4 600993 Hereditary hemorrhagic telangiectasia 175050 
Miscellaneous AD 1.0 All P and LP -HGNC:11389 ENSG00000118046 6794 STK11 602216 Peutz-Jeghers syndrome 175200 Cancer AD 1.0 All P and LP -HGNC:11772 ENSG00000106799 7046 TGFBR1 190181 Loeys-Dietz syndrome 609192 Cardiovascular AD 1.0 All P and LP -HGNC:11773 ENSG00000163513 7048 TGFBR2 190182 Loeys-Dietz syndrome 610168 Cardiovascular AD 1.0 All P and LP -HGNC:26038 ENSG00000135956 55654 TMEM127 613403 Hereditary paraganglioma-pheochromocytoma syndrome 171300 Cancer AD 3.0 All P and LP -HGNC:28472 ENSG00000170876 79188 TMEM43 612048 Arrhythmogenic right ventricular cardiomyopathy 604400 Cardiovascular AD 1.0 All P and LP -HGNC:11943 ENSG00000114854 7134 TNNC1 191040 Dilated cardiomyopathy 611879 Cardiovascular AD 3.1 All P and LP -HGNC:11947 ENSG00000129991 7137 TNNI3 191044 Hypertrophic cardiomyopathy 613690 Cardiovascular AD 1.0 All P and LP -HGNC:11949 ENSG00000118194 7139 TNNT2 191045 Dilated cardiomyopathy 601494 Cardiovascular AD 1.0 All P and LP -HGNC:11949 ENSG00000118194 7139 TNNT2 191045 Hypertrophic cardiomyopathy 115195 Cardiovascular AD 1.0 All P and LP -HGNC:11998 ENSG00000141510 7157 TP53 191170 Li-Fraumeni syndrome 151623 Cancer AD 1.0 All P and LP -HGNC:12010 ENSG00000140416 7168 TPM1 191010 Hypertrophic cardiomyopathy 115196 Cardiovascular AD 1.0 All P and LP -HGNC:12261 ENSG00000186439 10345 TRDN 603283 Catecholaminergic polymorphic ventricular tachycardia 615441 Cardiovascular AR 3.0 All P and LP -HGNC:12261 ENSG00000186439 10345 TRDN 603283 Long QT syndrome n/a Cardiovascular AR 3.0 All P and LP -HGNC:12362 ENSG00000165699 7248 TSC1 605284 Tuberous sclerosis complex 191100 Cancer AD 1.0 All P and LP -HGNC:12363 ENSG00000103197 7249 TSC2 191092 Tuberous sclerosis complex 613254 Cancer AD 1.0 All P and LP -HGNC:12403 ENSG00000155657 7273 TTN 188840 Dilated cardiomyopathy (truncating variants only) 604145 Cardiovascular AD 3.0 P and LP (truncating variants only) -HGNC:12405 ENSG00000118271 7276 TTR 176300 Hereditary transthyretin-related 
amyloidosis 105210 Miscellaneous AD 3.1 All P and LP -HGNC:12687 ENSG00000134086 7428 VHL 608537 Von Hippel-Lindau syndrome 193300 Cancer AD 1.0 All P and LP -HGNC:12796 ENSG00000184937 7490 WT1 607102 WT1-related Wilms tumor 194070 Cancer AD 1.0 All P and LP diff --git a/genes/acmg/acmg.tsv.md5 b/genes/acmg/acmg.tsv.md5 deleted file mode 100644 index f36696a..0000000 --- a/genes/acmg/acmg.tsv.md5 +++ /dev/null @@ -1 +0,0 @@ -42412ebe3a8dff0d7b6df78349fa68d6 genes/acmg/acmg.tsv diff --git a/genes/acmg/acmg.tsv.sha256 b/genes/acmg/acmg.tsv.sha256 deleted file mode 100644 index c42753a..0000000 --- a/genes/acmg/acmg.tsv.sha256 +++ /dev/null @@ -1 +0,0 @@ -f7dcad914f1d4e86134cea70431761246544acfff9b182a243258f177aec2bdf genes/acmg/acmg.tsv diff --git a/genes/dbnsfp/genes.spec.json b/genes/dbnsfp/genes.spec.json deleted file mode 100644 index 3370e3d..0000000 --- a/genes/dbnsfp/genes.spec.json +++ /dev/null @@ -1,118 +0,0 @@ -{ - "dc:format": "text/tsv", - "dc:identifier": "gene-centric/dbnsfp/genes.tsv:4.4", - "dc:title": "dbNSFP gene infromation (v4.4)", - "dc:description": "This is v4.4 of dbNSFP gene information", - "dc:created": "2023-05-06", - "dc:creator": "Xiaoming Liu, Ph.D.", - "dc:contributor": [ - "VarFish Developer Team" - ], - "dc:source": [ - "PMID:21520341", - "PMID:33261662", - "http://database.liulab.science/dbNSFP", - ], - "tsv:columns": { - "Gene_name": "Gene symbol from HGNC", - "Ensembl_gene": "Ensembl gene id (from HGNC)", - "chr": "Chromosome number (from HGNC)", - "Gene_old_names": "Old gene symbol (from HGNC)", - "Gene_other_names": "Other gene names (from HGNC)", - "Uniprot_acc(HGNC/Uniprot)": "Uniprot acc number (from HGNC and Uniprot)", - "Uniprot_id(HGNC/Uniprot)": "Uniprot id (from HGNC and Uniprot)", - "Entrez_gene_id": "Entrez gene id (from HGNC)", - "CCDS_id": "CCDS id (from HGNC)", - "Refseq_id": "Refseq gene id (from HGNC)", - "ucsc_id": "UCSC gene id (from HGNC)", - "MIM_id": "MIM gene id (from HGNC)", - "OMIM_id": "MIM gene id from 
OMIM", - "Gene_full_name": "Gene full name (from HGNC)", - "Pathway(Uniprot)": "Pathway description from Uniprot", - "Pathway(BioCarta)_short": "Short name of the Pathway(s) the gene belongs to (from BioCarta)", - "Pathway(BioCarta)_full": "Full name(s) of the Pathway(s) the gene belongs to (from BioCarta)", - "Pathway(ConsensusPathDB)": "Pathway(s) the gene belongs to (from ConsensusPathDB)", - "Pathway(KEGG)_id": "ID(s) of the Pathway(s) the gene belongs to (from KEGG)", - "Pathway(KEGG)_full": "Full name(s) of the Pathway(s) the gene belongs to (from KEGG)", - "Function_description": "Function description of the gene (from Uniprot)", - "Disease_description": "Disease(s) the gene caused or associated with (from Uniprot)", - "MIM_phenotype_id": "MIM id(s) of the phenotype the gene caused or associated with (from Uniprot)", - "MIM_disease": "MIM disease name(s) with MIM id(s) in \"[]\" (from Uniprot)", - "Orphanet_disorder_id": "Orphanet Number of the disorder the gene caused or associated with", - "Orphanet_disorder": "Disorder name from Orphanet", - "Orphanet_association_type": "the type of association beteen the gene and the disorder", - "Trait_association(GWAS)": "Trait(s) the gene associated with (from GWAS catalog)", - "HPO_id": "ID of the mapped Human Phenotype Ontology. Multiple IDs are separated by \";\"", - "HPO_name": "Name of the mapped Human Phenotype Ontology. Multiple names are separated by \";\"", - "GO_biological_process": "GO terms for biological process", - "GO_cellular_component": "GO terms for cellular component", - "GO_molecular_function": "GO terms for molecular function", - "Tissue_specificity(Uniprot)": "Tissue specificity description from Uniprot", - "Expression(egenetics)": "Tissues/organs the gene expressed in (egenetics data from BioMart)", - "Expression(GNF/Atlas)": "Tissues/organs the gene expressed in (GNF/Atlas data from BioMart)", - "Interactions(IntAct)": "The number of other genes this gene interacting with (from IntAct). 
Full information (gene name followed by Pubmed id in \"[]\") can be found in the \".complete\" table", - "Interactions(BioGRID)": "The number of other genes this gene interacting with (from BioGRID) Full information (gene name followed by Pubmed id in \"[]\") can be found in the \".complete\" table", - "Interactions(ConsensusPathDB)": "The number of other genes this gene interacting with (from ConsensusPathDB). Full information (gene name followed by interaction confidence in \"[]\") can be found in the \".complete\" table", - "P(HI)": "Estimated probability of haploinsufficiency of the gene (from doi:10.1371/journal.pgen.1001154)", - "HIPred_score": "Estimated probability of haploinsufficiency of the gene (from doi:10.1093/bioinformatics/btx028)", - "HIPred": "HIPred prediction of haploinsufficiency of the gene. Y(es) or N(o). (from doi:10.1093/bioinformatics/btx028)", - "GHIS": "A score predicting the gene haploinsufficiency. The higher the score the more likely the gene is haploinsufficient. (from doi: 10.1093/nar/gkv474) ", - "P(rec)": "Estimated probability that gene is a recessive disease gene (from DOI:10.1126/science.1215040)", - "Known_rec_info": "Known recessive status of the gene (from DOI:10.1126/science.1215040) \"lof-tolerant = seen in homozygous state in at least one 1000G individual\" \"recessive = known OMIM recessive disease\" (original annotations from DOI:10.1126/science.1215040)", - "RVIS_EVS": "Residual Variation Intolerance Score, a measure of intolerance of mutational burden, the higher the score the more tolerant to mutational burden the gene is. Based on EVS (ESP6500) data. from doi:10.1371/journal.pgen.1003709", - "RVIS_percentile_EVS": "The percentile rank of the gene based on RVIS, the higher the percentile the more tolerant to mutational burden the gene is. Based on EVS (ESP6500) data.", - "LoF-FDR_ExAC": "\"A gene's corresponding FDR p-value for preferential LoF depletion among the ExAC population. 
Lower FDR corresponds with genes that are increasingly depleted of LoF variants.\" cited from RVIS document.", - "RVIS_ExAC": "\"ExAC-based RVIS; setting 'common' MAF filter at 0.05% in at least one of the six individual ethnic strata from ExAC.\" cited from RVIS document.", - "RVIS_percentile_ExAC": "\"Genome-Wide percentile for the new ExAC-based RVIS; setting 'common' MAF filter at 0.05% in at least one of the six individual ethnic strata from ExAC.\" cited from RVIS document.", - "ExAC_pLI": "\"the probability of being loss-of-function intolerant (intolerant of both heterozygous and homozygous lof variants)\" based on ExAC r0.3 data", - "ExAC_pRec": "\"the probability of being intolerant of homozygous, but not heterozygous lof variants\" based on ExAC r0.3 data", - "ExAC_pNull": "\"the probability of being tolerant of both heterozygous and homozygous lof variants\" based on ExAC r0.3 data", - "ExAC_nonTCGA_pLI": "\"the probability of being loss-of-function intolerant (intolerant of both heterozygous and homozygous lof variants)\" based on ExAC r0.3 nonTCGA subset", - "ExAC_nonTCGA_pRec": "\"the probability of being intolerant of homozygous, but not heterozygous lof variants\" based on ExAC r0.3 nonTCGA subset", - "ExAC_nonTCGA_pNull": "\"the probability of being tolerant of both heterozygous and homozygous lof variants\" based on ExAC r0.3 nonTCGA subset", - "ExAC_nonpsych_pLI": "\"the probability of being loss-of-function intolerant (intolerant of both heterozygous and homozygous lof variants)\" based on ExAC r0.3 nonpsych subset", - "ExAC_nonpsych_pRec": "\"the probability of being intolerant of homozygous, but not heterozygous lof variants\" based on ExAC r0.3 nonpsych subset", - "ExAC_nonpsych_pNull": "\"the probability of being tolerant of both heterozygous and homozygous lof variants\" based on ExAC r0.3 nonpsych subset", - "gnomAD_pLI": "\"the probability of being loss-of-function intolerant (intolerant of both heterozygous and homozygous lof variants)\" 
based on gnomAD 2.1 data", - "gnomAD_pRec": "\"the probability of being intolerant of homozygous, but not heterozygous lof variants\" based on gnomAD 2.1 data", - "gnomAD_pNull": "\"the probability of being tolerant of both heterozygous and homozygous lof variants\" based on gnomAD 2.1 data", - "ExAC_del.score": "\"Winsorised deletion intolerance z-score\" based on ExAC r0.3.1 CNV data", - "ExAC_dup.score": "\"Winsorised duplication intolerance z-score\" based on ExAC r0.3.1 CNV data", - "ExAC_cnv.score": "\"Winsorised cnv intolerance z-score\" based on ExAC r0.3.1 CNV data", - "ExAC_cnv_flag": "\"Gene is in a known region of recurrent CNVs mediated by tandem segmental duplications and intolerance scores are more likely to be biased or noisy.\" from ExAC r0.3.1 CNV release", - "GDI": "gene damage index score, \"a genome-wide, gene-level metric of the mutational damage that has accumulated in the general population\" from doi: 10.1073/pnas.1518646112. The higher the score the less likely the gene is to be responsible for monogenic diseases.", - "GDI-Phred": "Phred-scaled GDI scores", - "Gene damage prediction (all disease-causing genes)": "gene damage prediction (low/medium/high) by GDI for all diseases", - "Gene damage prediction (all Mendelian disease-causing genes)": "gene damage prediction (low/medium/high) by GDI for all Mendelian diseases", - "Gene damage prediction (Mendelian AD disease-causing genes)": "gene damage prediction (low/medium/high) by GDI for Mendelian autosomal dominant diseases", - "Gene damage prediction (Mendelian AR disease-causing genes)": "gene damage prediction (low/medium/high) by GDI for Mendelian autosomal recessive diseases", - "Gene damage prediction (all PID disease-causing genes)": "gene damage prediction (low/medium/high) by GDI for all primary immunodeficiency diseases", - "Gene damage prediction (PID AD disease-causing genes)": "gene damage prediction (low/medium/high) by GDI for primary immunodeficiency autosomal dominant 
diseases", - "Gene damage prediction (PID AR disease-causing genes)": "gene damage prediction (low/medium/high) by GDI for primary immunodeficiency autosomal recessive diseases", - "Gene damage prediction (all cancer disease-causing genes)": "gene damage prediction (low/medium/high) by GDI for all cancer disease", - "Gene damage prediction (cancer recessive disease-causing genes)": "gene damage prediction (low/medium/high) by GDI for cancer recessive disease", - "Gene damage prediction (cancer dominant disease-causing genes)": "gene damage prediction (low/medium/high) by GDI for cancer dominant disease", - "LoFtool_score": "a percentile score for gene intolerance to functional change. The lower the score the higher gene intolerance to functional change. For details see doi: 10.1093/bioinformatics/btv602.", - "SORVA_LOF_MAF0.005_HetOrHom": "the fraction of individuals in the 1000 Genomes Project data (N=2504) who are either Heterozygote or Homozygote of LOF SNVs whose MAF<0.005. This fraction is from a method for ranking genes based on mutational burden called SORVA (Significance Of Rare VAriants). Please see doi: 10.1101/103218 for details.", - "SORVA_LOF_MAF0.005_HomOrCompoundHet": "the fraction of individuals in the 1000 Genomes Project data (N=2504) who are either Compound Heterozygote or Homozygote of LOF SNVs whose MAF<0.005. This fraction is from a method for ranking genes based on mutational burden called SORVA (Significance Of Rare VAriants). Please see doi: 10.1101/103218 for details.", - "SORVA_LOF_MAF0.001_HetOrHom": "the fraction of individuals in the 1000 Genomes Project data (N=2504) who are either Heterozygote or Homozygote of LOF SNVs whose MAF<0.001. This fraction is from a method for ranking genes based on mutational burden called SORVA (Significance Of Rare VAriants). 
Please see doi: 10.1101/103218 for details.", - "SORVA_LOF_MAF0.001_HomOrCompoundHet": "the fraction of individuals in the 1000 Genomes Project data (N=2504) who are either Compound Heterozygote or Homozygote of LOF SNVs whose MAF<0.001. This fraction is from a method for ranking genes based on mutational burden called SORVA (Significance Of Rare VAriants). Please see doi: 10.1101/103218 for details.", - "SORVA_LOForMissense_MAF0.005_HetOrHom": "the fraction of individuals in the 1000 Genomes Project data (N=2504) who are either Heterozygote or Homozygote of LOF or missense SNVs whose MAF<0.005. This fraction is from a method for ranking genes based on mutational burden called SORVA (Significance Of Rare VAriants). Please see doi: 10.1101/103218 for details.", - "SORVA_LOForMissense_MAF0.005_HomOrCompoundHet": "the fraction of individuals in the 1000 Genomes Project data (N=2504) who are either Compound Heterozygote or Homozygote of LOF or missense SNVs whose MAF<0.005. This fraction is from a method for ranking genes based on mutational burden called SORVA (Significance Of Rare VAriants). Please see doi: 10.1101/103218 for details.", - "SORVA_LOForMissense_MAF0.001_HetOrHom": "the fraction of individuals in the 1000 Genomes Project data (N=2504) who are either Heterozygote or Homozygote of LOF or missense SNVs whose MAF<0.001. This fraction is from a method for ranking genes based on mutational burden called SORVA (Significance Of Rare VAriants). Please see doi: 10.1101/103218 for details.", - "SORVA_LOForMissense_MAF0.001_HomOrCompoundHet": "the fraction of individuals in the 1000 Genomes Project data (N=2504) who are either Compound Heterozygote or Homozygote of LOF or missense SNVs whose MAF<0.001. This fraction is from a method for ranking genes based on mutational burden called SORVA (Significance Of Rare VAriants). 
Please see doi: 10.1101/103218 for details.", - "Essential_gene": "Essential (\"E\") or Non-essential phenotype-changing (\"N\") based on Mouse Genome Informatics database. from doi:10.1371/journal.pgen.1003484", - "Essential_gene_CRISPR": "Essential (\"E\") or Non-essential phenotype-changing (\"N\") based on large scale CRISPR experiments. from doi: 10.1126/science.aac7041", - "Essential_gene_CRISPR2": "Essential (\"E\"), context-Specific essential (\"S\"), or Non-essential phenotype-changing (\"N\") based on large scale CRISPR experiments. from http://dx.doi.org/10.1016/j.cell.2015.11.015", - "Essential_gene_gene-trap": "Essential (\"E\"), HAP1-Specific essential (\"H\"), KBM7-Specific essential (\"K\"), or Non-essential phenotype-changing (\"N\"), based on large scale mutagenesis experiments. from doi: 10.1126/science.aac7557", - "Gene_indispensability_score": "A probability prediction of the gene being essential. From doi:10.1371/journal.pcbi.1002886", - "Gene_indispensability_pred": "Essential (\"E\") or loss-of-function tolerant (\"N\") based on Gene_indispensability_score.", - "MGI_mouse_gene": "Homolog mouse gene name from MGI", - "MGI_mouse_phenotype": "Phenotype description for the homolog mouse gene from MGI", - "ZFIN_zebrafish_gene": "Homolog zebrafish gene name from ZFIN", - "ZFIN_zebrafish_structure": "Affected structure of the homolog zebrafish gene from ZFIN", - "ZFIN_zebrafish_phenotype_quality": "Phenotype description for the homolog zebrafish gene from ZFIN", - "ZFIN_zebrafish_phenotype_tag": "Phenotype tag for the homolog zebrafish gene from ZFIN" - } -} - diff --git a/genes/enst_ensg/grch37/download/GCF_000001405.25_GRCh37.p13_genomic.gtf.gz b/genes/enst_ensg/grch37/download/GCF_000001405.25_GRCh37.p13_genomic.gtf.gz new file mode 100644 index 0000000000000000000000000000000000000000..89dc5a41517fb5aeea3822b98f94bf335b8b9043 GIT binary patch literal 9211 zcmYj#Wl&sQuq`sUy96glfFQvJcPDrV?(Xh7xLbe(8wdn<5AFna&*08rK?djXy?bBP 
z`?u>EJ~9HE8SywXuoSSpUY}D9AsQ`hv0p0R zjX{WLSoG`5bq{uaU(9c9QNQjiWw+y;JF^Ghdpw?d8$aC)FLf6L-OOxLzqCG%ob$tQ zTBy&O#r$7h+WC|CjYGIzW_zx@`C-2Uu5M2!`uAIce4Y=SUr%akj19BzzzqGyy5%qT zpKqLBpZB*e|IH2iw0my_5pyKGhIO|7_HXrl9qX ze6qe13wnIGg^(D(>_vfJ+U`-|uF;KOj^ZYgyuw~ny*7UPU%l+dadf{t1Xn&!o^{<{ zY|TBdU7YsgK%TEN*Km?vVO&plJN*+S-qhQ$t|v{>Z7-W7nDZIc`uavtKmb+#Wgzlh zKnbMgspFo=Ou{AVb@PbC6w0*r`Yp`gD6*IO#m5^~(th>~yZ#%U5dOttSU8q)U ztURv2h4p*@e!WdPudtSdx73H7=TVEi++9x87zf;sWk;>GJSscA!1^JH$Z&_1Jlj#m z8-aeWV-tbAL)=3RDTCSp!7J!F!ZT-bxw;@7Ok*MkfsEREVUrlBxQJDQG%DdiODYPR ztEF#gCaD(YZ;Y`jH6x&C#7o|9v=92opnr)R;S4u`7H?wJ#w=v-lM z6;?$zhmr-B5e*f4$(cnVu*v&l#(TUdTM88_6regb^ANlp6Lznz3;dYIsP1?Lj?F-_ zk{cFEV=3qhXY9R%~<58z^bkCvWjWwi{8%uH5o6d@kU zmENdV#x`@$ey?Tr1Cv*w6bqZPvSFJ!Q_TPfHpgnnqu`fqjN+_PNu`_&{mr6}UtCJ$ zA;($3B|_2H<;Lk6@dH%Gr5~Sb;)g}+n=Je-Oo4U$z4Ez;7f6@k5wG8_g;d$dl+iSm zMuF-3fDR3nNXQ4)aYAJ#n~tJPzo;)n;UJRD+!VO$diZjLX6-LeR|~4aa}}mCM4k%6 zr*N*^C-o@Hz?12^0;h`nUR5vQ;d1?3!(hJ2ib!wIcQ;scn9V*x62*%g;3NxsXt5{b zdAe;xD52rqamU9fcok$s^kmbtgkC{hlCGZ*Zlo6;jS{HjZk@K6$BxmrVXzOSSRRtA zA(KMKp%R$kHHvE3h~Q*CAC5^_4;(3-gG_-N>25e@E$Lc6f3!I|>G-~*7A>ZqWJ>CQ zSuYjofX!XG25y)T?FOzHRn0x=^1E1Lg)!5Ml0vLfOl?0CHI8?rNc6uFxI~DsC&}je z<p#BA(&PHKD1G-R$f1UalVBb92jKZGgFRNha1NEK(# zG@qakjyCo@jf##sJ~a~V7X`8??Z&3gM|K9K%qR&J>T4`0%256*dJ-iIQx~Q%cuey8#_yTex};BNWDL#)sO`R?%6>?~wY`Y46% zt#kvn8sb4)(Ay%_2x^T)hH5tB4nlQH^LkyCZyaN03an#=9!VaoQeqYYpJzYDG1Rab zA2VRwnh1Q}XOH_;$!7eZWCC)?Ao%rKuX8+F?~H$$yvHLjz?R`;xx!E2+vE0A+JacA zyB?Z7DQsbjshT`7K1TGNXg|&>tasc_bo?{m3)l~FNa z(ECvSS3oDpq|W&lOPK>=i!wisoPe4-_W{pNh~7!3$5Xf!6*7OC2W8QmXGBS;jxa~KpuDNiZKq`X zA7|@a%s26so1^{KF#`<=%$IP^Z~*DQ(0OWmQT;Mo5M+jy~= z>DgOZep@6Q=i*Lbwkz#>oz|!bbuD^ z@_9~f_3Bek+lORUD=i0FrO~m!8#`MK{&_{9lUrh&~lG{Nxp(Ux;7B&f~jz3zd43%sx zY*GUm3v2tymX-`FEa~PktYeNi?X^b8r%Q;mHVm$fwXub%gi1dLpCQ*NV*AA&V1w0h z1diR=4DtNQC*6`sFW|Os1$Ci+6`O&}S!)h{=mc=!hk5TUQktBt(OC?)$tv-?;MbHF zMMNA|QNH%IOIp1{?ymhk8>sHm1Qa`{cE?u>; 
z-fjv@K0Y!a7;#!>b5o(T6*RHp;>Rk_Q^xZO0I?Omo8xEWW^uS`*Nub&d=9^Y#J$*M2jnVsUd5usQK^^y=v$;ht8o_QaxqCTE#IhC5Xx!~Qq>Z}>g%-H zV2!I)5NqrhRglL21#HwEyMohfa1l3OxCykRUH9=X*pjVIDfToA2u$Cl4RZ!AX{-nP zbo}mY4|?2!oI+|oo_|EfZnE7rbwJzLrFMC;uFQ4axagtyds!lVTVovjta*2})^bk8 zheP?A7|DNDI~G2mZsht6Vabe3JJm5)Y5JkVoyU8*fAhBVDzZ(tj3edYEUZPcYZd*+ z^v=2}&ifpz*NeOT7cSA^wEf|l*2&PRg-9{spu5&RvL@YIw)@d5_s!qb=K?r+91h7o z<=p@$Z&8N1Au%Ie{}>O;OV(3T-x6;VhhgMiJ#layZ+8r?l|LtG`y~#L9e-sd^uP&&N5avTizqE`495j9SutiGk5sx9 z(3~z$7l%y8$xtfgt69S;qaMsl?Ga>ZH36}@e@tYo7P7MTQGw_3nEpdC(vEH9L)GCdvvCh%z* zu6kMcbliOrgIejDXe0}-9GavfDc%jh3~Gq|c98)pHsKty)1z$KiqYL>7QUm&jQ@5s zXhEd6HO0sVn_SPm?7jOEy=0`YrrU9=E`HCaQmut|E_&iGloRJM2xT!G33e}QwNqdv;EfPF6m0n}_WqL>4n_H64;PV5%#7p}WVIfwsyiguImI$h8vT=txl#fWwH^zn_wNPP01 z?o#6WMIq8}P9KMh%=>N}XcoFSnun|$m{+nwO_*qn0*))CF4tEmxM_BNgoYwIoRRmh zKR~_NI`h~>`Yra!A`cv_lft-R2Q5wKs!BWwMQMuTSfKk>ET7l65*bqaTuA8D#{l$Lc+y1HYyl(o zuKa~n3pt`QGI|!B%oCday7TDCJkR(oP`g5FkvM1_tAdYV8hc9q$?HLkAKvLIS2geS zULkuN=2g18U;{4TQvu2)`U711^SDG7YOP1UoN2vVMxzNQRZ*llvKyHSSD#Q+!L)RA zh9E$WNy+WU;3J4^5;B{os|@XOj{w#NT3@4f%x;EYF##pT(cEg^3W_yR6=`&1j|@-? z1?P^+wR?UPyPsf0BygY*YUdA&la1U3B_-c}Xs6y*nXC9{PiX z5gHZVUnf#~CV3iCj$=mwP(8k|-auHB;UsUPD;PX9wjA@F4b+Ao@cZ4yFa*O8RVBlB zgziuqlzYW~!+LdyWiEZJH58b=TScN{l;O8s%tikG2P9rPM@JLXtD(KG6r!P@x>vRE zvjW5hs>P>hWVz6+M6XBPR=i+t*=AFExY%>l#i|&JptSdVF+`Mv3%yOi6MK?kcbq<| zs9uw%I`f*aI_49^q14-f))%j8uMdvh1{P=izVfK(5;Dqj%3Zb6QMOa)Zk6U*JWQC0 zT=1yp`HZQKdNTOQWgAmrY~A`B!K1n#9~PBCLjtkeJw*Sx)Y0k*(;l5QNC$o}QGA-3 zn>92szK484@YcjfvZDzpG+LF4FeZRuUQX z3%Ytjm*tvlFQ82+6qQoq!K!aISL%G3(rT!H9U0BFh<&9QogfD&(39Z#)VB|QASe~u z4Hdq)#D)_cYFfZ?Dc8Uz$>C>5>tk9I5F*bXe4Y(STuH$A_;-nj&yG`oew^Pfv67Yc zf=<>E4RE}UP$$= z4%|5x$bqZeNdV5G%o7kgSauO@A2m;)CbFt4ys|2R&uxnj`t-dSzC? 
zvtz>7Jiz?C{B@iDPDhqq%7H1mCfJnmi79kyToBv0YoLRCmbIyxL%%$^C`VjKcWFwx zj~87&n)tyH4V!cN0lsZHASYYB=$G<8-zieGjCpL93ud`rmBnm$nuhW;}brxx4DIWPDX`Hhuzjof6dUL-S)9Z*WL&>$72JH%#doRr=KP%RxO^M z)39RffX0gH^P!r9CjD=+>D)Ux9AUjdW6=v8KoU6?dc7BM`**RE&Q|<3&9`Ul!g8Yh zPZGj*JO-7fWD8-mJ_QUvx@|X>Wuktq`1gPZoGlEn?1)|w77dj))ia$WxaLQmLS1`p zBjV(~VpCOK+`7&6a;ABKtg(o>0x)#EKN9A}omgd_w?|w2E&6gt=@R$PkW+xGn_^>u z@dfFwn&E8;arAmMOJP(zs+y8C$qB~EwVQ6$p7e_|1c8E+6Tz`iZ`#RM$ zg}@AzBIL$X(=V#xt)&GRC2x%u*vqFQnV6785Rz;IzE<3k;QFwI4$F7hyC+J~`qcKV zZ6G;+&zCwm?qn(Zfwh5vkYw4{uM@SFQ7w33Xu?AiUlNg!-C$ijloLqic|}kn&@m#a zXvg$&d4jn%&<(fI1fcRHCtQp29wnoTAo|+T(^}w9C9Jb`dVm>YP~sp0@5gLtkNPg7{0=p9$}(vCuWUxpQtxhoToPWA8^B5uOe5qfP{{u#lx?Pw=cyG2>av5%_vjJI5??$Zw#1+2e?G2u{hJ+`U0=HV4At{etI=wRY)g!Rj zHfU~?&}EYI%j*BK)FiUA5IV*kOf~kO1iQ`=I1B|%-dw2Gz{STmaEYk&Q6Ni(1B`hU zoku8w?0qhByE6CeIl`-)D_!n=8XF)IP~RMYN!} z5)a-!^Zrq}i>#YLPhpS!!Fwjp8Yz^-8rR?qhQ6mn(m*OUfjEPyA${=K0~K&Ho+~V| zH^2IXPoLC!vqFF8>O2UDoZIWvW^T)sLy!;@$0Kv_%vj|DDd)sy<~0TLY?ql&jI zp{VFLs0`iSO-oIUhj5svi%AT29Qv~3u+j#TV4g;{(1f2^JQC5)THICI@tH_AKBflp z6Gi^KBPqsKL(SURtfWh!%qEhuz+30#AqCNx{NbCfo0Z%~(j?jx|1x*8dl}{*lpxzM zCpEm3GiNP#&qOo5j0bm+Y}A)tQ*^`rmcK?&piM+q0!M*R!?wfZ8Hofe#lCH%np>V%>p{XQo%nm**0{D+<|D-HYl0gGG7ufRczU zAKl2hMhyQ$&(;%Uvd8H7z=-k6KeDpTj|o)))hzd<)IF!qJcxh$cd{Rcfgx3sj&S7; z{vT`L4tN7`lBh*Rdr|EM7G%DXBAGm$+`_U?l3*?8&edWf>tg;z`R|<)5Q{TkcbsAT zQE(uZR$km}>>i2a>h(xooz{VWk3$ymJ~B2Kv#<9DtdEfBmBj)n`$>W_{=be%Zm-sY z#oe7tf2}?I&m%iOu@2Gh{hUM^cb&vG>xbkVcFtbw6t@WQLI61vq`J^zM^BA8<3JLf zLO!thM)4`{M>OQ5noAKXvf0G;gGiC(%W=OhM}0F*enSH@UgChWevnZ@dm z;C(MtQ4ePMu^d@8#j!e{f`wu7^Du!`B7f&3l0Zh|N)kQ>)r~d4i+sRBX7n;F9W2;% zlYVqMVTvGw(l#TxrT#$=Hw9@~LP($yihLlkO6p()UqHeg_+}gRSQM^i(Qr8c`-{f-S)ZR!Cu{1EI%kNQw9I~W&Z43YphH40ge*bwr>|#`~Aql2Sr#L z0$6Q_U=E9Y^(}uP;t)cHc1G!v6`fujNtK~VE`-cMX|yU6j)>~&+#R{zv>U(HE*x?$ z!Fg-IVS+Lz^gNZC4=SCSyehpUM=c+;0MnM8L7L9y6K{s>F7C2O%CRx`y%#9k<7aN6 z>l@uJ>Yh5sC8R0U+3r(_>ZhjMYH8L{jYuZ7%%TRDlPF zsExy(OmiIb;e0}(4^b-;*Sk%C&}!3L+y5p+54(EqPKu=Z#!LLEdX6C+?I6kW$^EM} 
zy`Dea6k-PVG2H7G&QSEox$iwVJ6n5Yt3pb%R3>60XtHiKh$u@2q|wJ$5Y~9XOe>!g z`c*G>lXiayE-Hz&HN*5wXC8BE+im2cJ4g&% zQxe$vuvbS46r3_^mw z4JM8p)U3T*{LIoQ{TOADSwC6A?M*U~JNMkfFSejj)C)Qy40F3>1@Ug8bXKGPB>5mzh2j(Ku4q89dj^#krpi(@VdY*%_`0Nmco z)ov7~boYgm(I28(ILVOJpRfw;#{xBuNTL;ZH~uM6DCi>pTB`)lOEAw?uLpc5~Vq}^7?81-$p(03kFsPczpF0x!rZOlccm^NA*yp_gch_SD3 ztL3s-Sb}u-{zpa{1|9Wgnia7h7q}6C&b!)|7uuE+_`VLfK7>Ke9EGWEu`M$~j749c zycJuiL|S*ACck}IPf(QN06pu3152k3V*?KtMm=IW_z(su=IN4m-~UT}Y9{9iZ5iD{ zVHNZItlN3JI#?QUh}>afYot|nz%Z~c%*9i(!pkGEswaCmsJ!vEUpX6f`64|>et3Fq z@wB8O@K%0|XH@u=aH$C-Tsn!=l}|jkmmc>|&BV9`@89H%loT7kqOZx& zjGaJY&E;xk5bddt!*cg>K;6~+HN^ZTkW9FFzp2cp5z=uI5+#_Y(4IXJH0z?Xn_Upp zj2rh2EW!ljC0?tL0M}eP^n}NeDx_9!Uuh>qvAXQoKK0b``V;I`GsR>3ls+gBIcu9} z=<=)WDAn1_-MNQqX43OO{cJjLHCszzsmpa_Pz8JHg$=58gRbt*ZbHhrDI5x8Iuxwc zy7bo`6rntWM}8<|)?%7)$28k2R$bxf)z_^2>KVGSs}I8M99+n9=bI@*5XQHBKj$H= z#OV?+k<0UqMhsj`--H*?Wg3IVh`1A?p)6>%Jc+E~hyqvmw;%m(xmThiLb{taR&J-c z2F~a?V6Pe)O>DBfOm3~umbP2X8nTY_g~UReKRZyP44$(5K0>GHE8GF%fzD2@CJwvt zPN#Yd$(}TuT%v@KZAX0BCF1a+&p{>VTseS7z&Ai zaf;i%LhBj@TOL8nwSr+jc#1qHBupU4Bwc1*e9vY9FF5oR^}_+18`;1b4?F-|D_A?A%Pw1kM-F$H$4A;m0)? z{}Pz$oV96_7yJ9?m!xjU1_i=$BtU}R2Z<9tB#b6Eod<}>B>@~+Dra#N!7~nie6VY@ z%jrdY^YNM}SaT*tOoEKvzvn8)L@`(#DRoyDcl42=>Eo4{8k0wF0~TexNfib4{k*K2 zay%=#jk=xjm@d9_;%6W;3@bJo3rxcPC)p+sbys(FBVWfKO}4ASs>9qB1*i7XG8eC4 zGQmDdMAy(R=sfYNWIy+vHXo%6X)(Umth3rAp60}u7RvYFm+iD-c$RPG=&W;k#Vi&L zN@mgvejT{OhnlDHWNUHTusy!X0$`Y2rVmbB;+5kgGvyh~iN9ZT^jIeyKI3mh*&qX{ zN7HTz`mgHg^a@Ciua|#?%rVbOfu-4h4V6i5pnN#~E44oNsXE(GoSwodN6Tb$))Dm! 
zZRVn1@Itb&ApT=NWK#MZxyiXO3cJeJYXw))qj4q>TxLZl0%1`u@yM$*B^QCTQQ7=T z8!XCbELS2l|9n%U9s(74**16H#lTWBO#{B<#_MDf9LLW|SzjD#L}fQj;duHOWREv2 zTaXeS1Ge{Fs#B2auh_bZ%FU;QZOi^t;n%jii*lkTRb7$bN zqPOD**#6n&Ev;a&tdPiC4L`z#_U^46QQ>6dX#`Fst-OY?r;TFkXPg01=_f`DDyl>+ zIXw-Be*D>6vu}bb+Gu|D4>ThDA_@2HIZ>V;3zvOf<8F7rY>3w8v3GUqCrF*{q_@yM z&pZ)pPv3!*AMkz0=JuH4Cwxki3?55wenpe$7iQ0ceWauxkOGYa>*h;3v;Q`5CKpYY z%YGwFhsg{^e{2Ont@!aJgrruPs+`FfdE>R0YKb&e07MA_Ve$h4X&fi_OPxfT8UVlH z?!2aGZ^*hcyT)|vs3PZ`1S?#!YONkl_*4W4;K@MR;zqx^E+pLV)>r>q%}j9tr7%@2 zC??~=Otj&m%kud*CBuLhf1vD2N5yyXQv0|lJSq;6i6=|X)s^r$HXTTYR79!Ok#5We z{)ab5RIM~caGMeNk5Z*m&3b9Bg`bIzfJqjOq+NH(c%d5J5|1uN!HT#a_%`o>A@&z7?xaXZ` z>3Y8zXkv6#RB?vkr-CYB(tqDkS9}kiQ^&!*A7bzX7)L~I1DRfCCX6fY#l48{2y=^iGeR_(@j&OmT4@RL;D!4w4vE2cib*NNL0#epk>EmSkZ%)#v zCTB}JRE|p6U<(UXcps*>hnO^&^|^f`vO#H6f|*1S*=K$^rKe#_#!|RAm}xzzc!m0f OdN1TzV=@^5?*9PLsUlqf literal 0 HcmV?d00001 diff --git a/genes/enst_ensg/grch37/download/knowntoEnsembl.txt.gz b/genes/enst_ensg/grch37/download/knowntoEnsembl.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..993885a57deb29914667e0fbcb040f00e6d5af24 GIT binary patch literal 690 zcmV;j0!{rNiwFqAL3d;V|7&h+YbBt6dc#Dw+oJPAwu9`L$&3~tnQ?=6?X~x+y zD1M$#8$ndY=%)>scQvu$=A;awCXd=Q@@HMhy=7rFfBihFWU-#wFA&q>?9o-G9 zKq2Zgh}F}5jodV>QH%L;Bq}Mb#=gcOkLGB+?y%Qc_r8kHl6Fx|)p~oMD!DkEcOL~M zs=D>|J`qP$=TVbJ{;c}rte|T11$Q4vV8JFm4S$u}XnU7w_(9d_LHUAhPF2rc8hM9( ze)5M8l}7Y0MjV%|`Hl+eC2hpM?DA-zMlJy^v@cZ%8Bmu--o}yaAvcyUmvU$iB-sj3 zCKM!-7gDo#+MJ0)a7h&2R~GxGeK_r9?szbB@QomMI^p9H{ z=1UMcbkCuExs6|?SZ=vGPHDv9Hji=3i^@G%vh%KAiIH;ed(v`10xsW2(ieA)DOBy# z$md2*1W~*|`7S@cE_4A5hXPe}4+HY5A+I9w<@T^pIH=)7RPcU8%|QNrDdnOg6rA(u YnnyQco+}7fIhG04e?1V44~z-`0EV {output.tsv} + + md5sum {output.tsv} >{output.tsv_md5} + """ diff --git a/rules/genes/ensembl.smk b/rules/genes/ensembl.smk new file mode 100644 index 0000000..bbba433 --- /dev/null +++ b/rules/genes/ensembl.smk @@ -0,0 +1,67 @@ +## Rules related to ENSEMBL gene information. 
+ + +rule genes_ensembl_create_xlink: # -- create ENSEMBL gene information xlink table + output: + tsv="work/genes/ensembl/ensembl_xlink.tsv", + tsv_md5="work/genes/ensembl/ensembl_xlink.tsv.md5", + shell: + r""" + echo -e "ensembl_gene_id\tensembl_transcript_id\tentrez_id\tgene_symbol" \ + >{output.tsv} + + wget --no-check-certificate \ + -O- \ + 'https://ensembl.org/biomart/martservice?query=' \ + | sort -u \ + >> {output.tsv} + + md5sum {output.tsv} >{output.tsv_md5} + """ + + +rule genes_ensembl_download_maps: # -- download files for ENST-ENSG mapping + output: + download_txt="genes/enst_ensg/grch37/download/knowntoEnsembl.txt.gz", + download_gtf="genes/enst_ensg/grch37/download/GCF_000001405.25_GRCh37.p13_genomic.gtf.gz", + shell: + r""" + wget --no-check-certificate \ + -O {output.download_gtf} \ + 'https://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz' + wget --no-check-certificate \ + -O {output.download_txt} \ + 'https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/knownToEnsembl.txt.gz' + """ + + +rule genes_ensembl_process_maps: # -- process ENST-ENSG mapping + input: + download_txt="genes/enst_ensg/grch37/download/knowntoEnsembl.txt.gz", + download_gtf="genes/enst_ensg/grch37/download/GCF_000001405.25_GRCh37.p13_genomic.gtf.gz", + output: + tsv="genes/enst_ensg/grch37/enst_ensg.tsv", + tsv_md5="genes/enst_ensg/grch37/enst_ensg.tsv.md5", + shell: + r""" + export TMPDIR=$(mktemp -d) + trap "rm -rf $TMPDIR" EXIT + + awk \ + -F $'\t' \ + -f scripts/genes-enst-ensg.awk \ + <(zcat {input.download_gtf}) \ + | sort \ + > $TMPDIR/tmp1.txt + + zcat {input.download_txt} \ + | sed -e 's/\..//g' \ + | sort -k2,2 \ + >> $TMPDIR/tmp2.txt + + echo -e "real_enst\tenst\tensg" > {output.tsv} + join -t $'\t' -1 2 -2 1 $TMPDIR/tmp2.txt $TMPDIR/tmp1.txt \ + >> {output.tsv} + + md5sum {output.tsv} >{output.tsv_md5} + """ diff --git a/rules/genes/gnomad.smk b/rules/genes/gnomad.smk new file mode 100644 index 0000000..9fcb598 --- /dev/null
+++ b/rules/genes/gnomad.smk @@ -0,0 +1,85 @@ +## Rules related to gnomAD gene constraints. + + +rule genes_gnomad_download: # -- download gnomAD gene constraints + output: + bgz="work/download/genes/gnomad/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz", + bgz_md5="work/download/genes/gnomad/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz.md5", + shell: + r""" + wget --no-check-certificate \ + -O {output.bgz} \ + https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz + + md5sum {output.bgz} >{output.bgz_md5} + """ + + +def run_genes_gnomad_constraints_v2_1_1_to_tsv(input, output, wildcards): + """Extra function because of snakefmt issues.""" + columns_src = [ + "transcript", + "exp_lof", + "exp_mis", + "exp_syn", + "mis_z", + "obs_lof", + "obs_mis", + "obs_syn", + "oe_lof", + "oe_lof_lower", + "oe_lof_upper", + "oe_mis", + "oe_mis_lower", + "oe_mis_upper", + "oe_syn", + "oe_syn_lower", + "oe_syn_upper", + "pLI", + "syn_z", + "exac_pLI", + "exac_obs_lof", + "exac_exp_lof", + "exac_oe_lof", + ] + columns_src_str = ",".join(columns_src) + columns_tmp = ["ensembl_transcript_id"] + columns_src[1:] + columns_tmp_str = ",".join(columns_tmp) + columns_dst = ["ensembl_gene_id", "entrez_id", "gene_symbol"] + columns_src[1:] + columns_dst_str = ",".join(columns_dst) + shell( + r""" + export TMPDIR=$(mktemp -d) + trap "rm -rf $TMPDIR" EXIT + + zcat {input.bgz} \ + | tr '\t' ',' \ + > $TMPDIR/tmp.txt + + qsv select {columns_src_str} $TMPDIR/tmp.txt \ + | qsv rename {columns_tmp_str} \ + | qsv sort -u \ + | tr ',' '\t' \ + > $TMPDIR/tmp.tsv + + qsv join -d '\t' \ + ensembl_transcript_id $TMPDIR/tmp.tsv \ + ensembl_transcript_id {input.xlink_ensembl} \ + | qsv select {columns_dst_str} \ + | tr ',' '\t' \ + > {output.tsv} + + md5sum {output.tsv} >{output.tsv_md5} + """ + ) + + +rule genes_gnomad_convert: # -- create gnomAD gene constraints TSV + input: + 
bgz="work/download/genes/gnomad/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz", + xlink_ensembl="work/genes/ensembl/ensembl_xlink.tsv", + output: + tsv="work/genes/gnomad/gnomad_constraints.tsv", + tsv_md5="work/genes/gnomad/gnomad_constraints.tsv.md5", + run: + run_genes_gnomad_constraints_v2_1_1_to_tsv(input, output, wildcards) diff --git a/rules/genes/hgnc.smk b/rules/genes/hgnc.smk new file mode 100644 index 0000000..a6ee01a --- /dev/null +++ b/rules/genes/hgnc.smk @@ -0,0 +1,56 @@ +## Rules related to the HGNC data. + + +rule genes_hgnc_download: # -- Download the HGNC data + output: + json="work/download/hgnc/hgnc_complete_set.json", + json_md5="work/download/hgnc/hgnc_complete_set.json.md5", + date_txt="work/download/hgnc/date.txt", + shell: + r""" + date > {output.date_txt} + + wget --no-check-certificate \ + -O {output.json} \ + https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json + + md5sum {output.json} > {output.json_md5} + """ + + +rule genes_hgnc_xlink: # -- Build HGNC xlink table. + input: + json="work/download/hgnc/hgnc_complete_set.json", + output: + tsv="work/genes/hgnc/hgnc_xlink.tsv", + tsv_md5="work/genes/hgnc/hgnc_xlink.tsv.md5", + shell: + r""" + jq \ + --raw-output \ + --from-file scripts/genes-xlink-hgnc.jq \ + {input.json} \ + > {output.tsv} + + + md5sum {output.tsv} > {output.tsv_md5} + """ + + +rule genes_hgnc_gene_info: # -- Build HGNC gene_info JSONL file. + input: + json="work/download/hgnc/hgnc_complete_set.json", + output: + jsonl="work/genes/hgnc/hgnc_info.jsonl", + jsonl_md5="work/genes/hgnc/hgnc_info.jsonl.md5", + shell: + r""" + jq \ + --compact-output \ + --raw-output \ + --from-file scripts/genes-hgnc-info.jq \ + {input.json} \ + > {output.jsonl} + + md5sum {output.jsonl} > {output.jsonl_md5} + """ diff --git a/rules/genes/ncbi.smk b/rules/genes/ncbi.smk new file mode 100644 index 0000000..c523b3e --- /dev/null +++ b/rules/genes/ncbi.smk @@ -0,0 +1,74 @@ +## Rules related to gene data from NCBI. 
+ + +rule genes_ncbi_download_mim2gene: # -- download NCBI MedGen mim2gene + output: + download="work/download/genes/ncbi/mim2gene_medgen", + shell: + r""" + wget --no-check-certificate \ + -O {output.download} \ + https://ftp.ncbi.nih.gov/gene/DATA/mim2gene_medgen + """ + + +rule genes_ncbi_process_mim2gene: # -- process NCBI MedGen mim2gene + input: + download="work/download/genes/ncbi/mim2gene_medgen", + output: + tsv="work/genes/mim2gene/mim2gene.tsv", + tsv_md5="work/genes/mim2gene/mim2gene.tsv.md5", + shell: + r""" + awk -f scripts/genes-mim2gene.awk \ + -F $'\t' \ + {input.download} \ + > {output.tsv} + + md5sum {output.tsv} >{output.tsv_md5} + """ + + +rule genes_ncbi_entrez_download: # -- download NCBI Entrez files + output: + ags="work/download/genes/ncbi/Homo_sapiens.ags.gz", + ags_md5="work/download/genes/ncbi/Homo_sapiens.ags.gz.md5", + gene2xml="work/download/genes/ncbi/linux64.gene2xml", + gene2xml_md5="work/download/genes/ncbi/linux64.gene2xml.md5", + shell: + r""" + export TMPDIR=$(mktemp -d) + trap "rm -rf $TMPDIR" EXIT + + wget --no-check-certificate \ + -O $(dirname {output.ags})/Homo_sapiens.ags.gz \ + https://ftp.ncbi.nih.gov/gene/DATA/ASN_BINARY/Mammalia/Homo_sapiens.ags.gz + + wget --no-check-certificate \ + -O $TMPDIR/linux64.gene2xml.gz \ + https://ftp.ncbi.nlm.nih.gov/asn1-converters/by_program/gene2xml/linux64.gene2xml.gz + + gzip -d -c $TMPDIR/linux64.gene2xml.gz \ + > {output.gene2xml} + chmod u+x {output.gene2xml} + + md5sum {output.ags} >{output.ags_md5} + md5sum {output.gene2xml} >{output.gene2xml_md5} + """ + + +rule genes_ncbi_entrez_process: # -- process NCBI Entrez files + input: + ags="work/download/genes/ncbi/Homo_sapiens.ags.gz", + gene2xml="work/download/genes/ncbi/linux64.gene2xml", + output: + jsonl="work/genes/entrez/gene_info.jsonl", + jsonl_md5="work/genes/entrez/gene_info.jsonl.md5", + shell: + r""" + ./{input.gene2xml} -b T -c T -i {input.ags} \ + | python3 scripts/refseq_xml_to_json.py \ + > {output.jsonl} + + md5sum 
{output.jsonl} >{output.jsonl_md5} + """ diff --git a/rules/reference/human.smk b/rules/reference/human.smk new file mode 100644 index 0000000..d1874b9 --- /dev/null +++ b/rules/reference/human.smk @@ -0,0 +1,50 @@ +## Rules related to human reference genome sequence. + +#: Download URLs +REFERENCE_URLS = { + "grch37": ( + "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/" + "phase2_reference_assembly_sequence/hs37d5.fa.gz" + ), + "grch38": ( + "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/" + "seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz" + ), +} + + +rule reference_download: # -- download reference genome sequence + output: + download="work/download/reference/{genome_build}/reference.fa.gz", + run: + ref_url = REFERENCE_URLS[wildcards.genome_build] + shell( + r""" + aria2c \ + --check-certificate=false \ + --out={output.download} \ + --split=8 \ + --max-concurrent-downloads=8 \ + --max-connection-per-server=8 \ + {ref_url} + """ + ) + + +rule reference_process: # -- post-process reference sequence after download + input: + download="work/download/reference/{genome_build}/reference.fa.gz", + output: + fasta="work/reference/{genome_build}/reference.fa", + fasta_md5="work/reference/{genome_build}/reference.fa.md5", + fasta_fai="work/reference/{genome_build}/reference.fa.fai", + fasta_fai_md5="work/reference/{genome_build}/reference.fa.fai.md5", + shell: + r""" + pigz -d -c {input.download} >{output.fasta} + + samtools faidx {output.fasta} + + md5sum {output.fasta} >{output.fasta_md5} + md5sum {output.fasta_fai} >{output.fasta_fai_md5} + """ diff --git a/snakefiles/annos.smk b/snakefiles/annos.smk deleted file mode 100644 index 00e12fb..0000000 --- a/snakefiles/annos.smk +++ /dev/null @@ -1,451 +0,0 @@ -#: Maximal distance (in bp) from exon to be considered "near coding". -NEAR_CODING_DIST = 1000 - - -# Create BED file with "near coding regions", based on RefSeq. 
-rule annos_near_coding_regions: - input: - bed="features/{genome_build}/gene_regions/refseq.bed.gz", - output: - bed="annos/{genome_build}/near_coding/near_coding.bed", - bed_gz="annos/{genome_build}/near_coding/near_coding.bed.bgz", - bed_gz_tbi="annos/{genome_build}/near_coding/near_coding.bed.bgz.tbi", - shell: - r""" - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" ERR EXIT - - zcat {input.bed} \ - | tail -n +2 \ - | bedops --range {NEAR_CODING_DIST} --everything - \ - > /tmp/regions.bed - - bedops --merge /tmp/regions.bed \ - | sort-bed - \ - > {output.bed} - - bgzip -c {output.bed} >{output.bed_gz} - tabix -p bed -f {output.bed_gz} - """ - - -rule annos_helixmtdb_download: - output: - tsv="annos/{genome_build}/helixmtdb/download/helixmtdb.tsv", - shell: - r""" - wget \ - --no-check-certificate \ - -O {output} \ - https://helix-research-public.s3.amazonaws.com/mito/HelixMTdb_20200327.tsv - """ - - -rule annos_helixmtdb_convert: - input: - tsv="annos/{genome_build}/helixmtdb/download/helixmtdb.tsv", - output: - vcf="annos/{genome_build}/helixmtdb/helixmtdb.vcf.gz", - vcf_tbi="annos/{genome_build}/helixmtdb/helixmtdb.vcf.gz.tbi", - shell: - r""" - cat {input.tsv} \ - | python3 scripts/helix-to-vcf.py \ - > {output.vcf}.tmp - - if [[ {wildcards.genome_build} == GRCh37 ]]; then - sed -e 's/chrM/MT/g' {output.vcf}.tmp \ - | bgzip -c \ - > {output.vcf} - else - bgzip -c {output.vcf}.tmp >{output.vcf} - fi - - tabix -f {output.vcf} - - rm -f {output.vcf}.tmp - """ - - -rule annos_gnomad_mtdna: - output: - dl="annos/{genome_build}/gnomad_mtdna/gnomad.genomes.v3.1.sites.chrM.vcf.bgz", - vcf="annos/{genome_build}/gnomad_mtdna/gnomad_mtdna.vcf.gz", - vcf_tbi="annos/{genome_build}/gnomad_mtdna/gnomad_mtdna.vcf.gz.tbi", - shell: - r""" - wget --no-check-certificate \ - -O {output.dl} \ - https://datasetgnomad.blob.core.windows.net/dataset/release/3.1/vcf/genomes/gnomad.genomes.v3.1.sites.chrM.vcf.bgz - - if [[ {wildcards.genome_build} == grch37 ]]; then - zcat 
{output.dl} \ - | sed \ - -e 's/chrM/MT/g' \ - -e 's/GRCh38_MT/GRCh37/g' \ - -e 's/,GERP_DIST/\&GERP_DIST/g' \ - -e 's/,BP_DIST/\&BP_DIST/g' \ - -e 's/,DIST_FROM_LAST_EXON/\&DIST_FROM_LAST_EXON/g' \ - -e 's/,50_BP_RULE/\&50_BP_RULE/g' \ - -e 's/,PHYLOCSF_TOO_SHORT/\&PHYLOCSF_TOO_SHORT/g' \ - | bgzip -c \ - > {output.vcf} - else - cp {output.dl} {output.vcf} - fi - - tabix -f {output.vcf} - """ - - -# GNOMAD_PREFIX = "https://datasetgnomad.blob.core.windows.net/dataset/release" -GNOMAD_PREFIX = "https://gnomad-public-us-east-1.s3.amazonaws.com/release" -GNOMAD_V3 = "3.1.2" -GNOMAD_V2 = "2.1.1" - - -rule annos_gnomad_nuclear_download_2: - output: - vcf="annos/grch37/gnomad_{kind}/download/gnomad.{kind}.r{version}.sites.{chrom}.vcf.bgz", - vcf_tbi="annos/grch37/gnomad_{kind}/download/gnomad.{kind}.r{version}.sites.{chrom}.vcf.bgz.tbi", - shell: - r""" - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --out={output.vcf} \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - {GNOMAD_PREFIX}/{wildcards.version}/vcf/{wildcards.kind}/gnomad.{wildcards.kind}.r{wildcards.version}.sites.{wildcards.chrom}.vcf.bgz - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --out={output.vcf_tbi} \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - {GNOMAD_PREFIX}/{wildcards.version}/vcf/{wildcards.kind}/gnomad.{wildcards.kind}.r{wildcards.version}.sites.{wildcards.chrom}.vcf.bgz.tbi - """ - - -rule annos_gnomad_nuclear_download_liftover_2: - output: - vcf="annos/grch38/gnomad_{kind}/download/gnomad.{kind}.r{version}.sites.{chrom}.liftover_grch38.vcf.bgz", - vcf_tbi="annos/grch38/gnomad_{kind}/download/gnomad.{kind}.r{version}.sites.{chrom}.liftover_grch38.vcf.bgz.tbi", - shell: - r""" - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --out={output.vcf} \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - 
{GNOMAD_PREFIX}/{wildcards.version}/liftover_grch38/vcf/{wildcards.kind}/gnomad.{wildcards.kind}.r{wildcards.version}.sites.{wildcards.chrom}.liftover_grch38.vcf.bgz - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --out={output.vcf_tbi} \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - {GNOMAD_PREFIX}/{wildcards.version}/liftover_grch38/vcf/{wildcards.kind}/gnomad.{wildcards.kind}.r{wildcards.version}.sites.{wildcards.chrom}.liftover_grch38.vcf.bgz.tbi - """ - - -rule annos_gnomad_nuclear_download_3: - output: - vcf="annos/grch38/gnomad_{kind}/download/gnomad.{kind}.v{version}.sites.chr{chrom}.vcf.bgz", - vcf_tbi="annos/grch38/gnomad_{kind}/download/gnomad.{kind}.v{version}.sites.chr{chrom}.vcf.bgz.tbi", - shell: - r""" - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --out={output.vcf} \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - {GNOMAD_PREFIX}/{wildcards.version}/vcf/{wildcards.kind}/gnomad.{wildcards.kind}.v{wildcards.version}.sites.chr{wildcards.chrom}.vcf.bgz - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --out={output.vcf_tbi} \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - {GNOMAD_PREFIX}/{wildcards.version}/vcf/{wildcards.kind}/gnomad.{wildcards.kind}.v{wildcards.version}.sites.chr{wildcards.chrom}.vcf.bgz.tbi - """ - - -def input_annos_gnomad_grch37(wildcards): - chroms = list(range(1, 23)) + ["X"] - # chrY is only available for GRCh37 genomes - if wildcards.kind == "exomes": - chroms.append("Y") - tpl = "annos/grch37/gnomad_{kind}/download/gnomad.{kind}.r{version}.sites.{chrom}.vcf.bgz" - return [tpl.format(kind=wildcards.kind, version=GNOMAD_V2, chrom=chrom) for chrom in chroms] - - -rule annos_gnomad_grch37: - input: - input_annos_gnomad_grch37, - output: - touch("annos/grch37/gnomad_{kind}/.done"), - - -def input_annos_gnomad_grch38(wildcards): - chroms = list(range(1, 23)) 
+ ["X", "Y"] - if wildcards.kind == "exomes": - tpl = "annos/grch38/gnomad_{kind}/download/gnomad.{kind}.r{version}.sites.{chrom}.liftover_grch38.vcf.bgz" - return [tpl.format(kind=wildcards.kind, version=GNOMAD_V2, chrom=chrom) for chrom in chroms] - else: - tpl = ( - "annos/grch38/gnomad_{kind}/download/gnomad.{kind}.v{version}.sites.chr{chrom}.vcf.bgz" - ) - return [tpl.format(kind=wildcards.kind, version=GNOMAD_V3, chrom=chrom) for chrom in chroms] - - -rule annos_gnomad_grch38: - input: - input_annos_gnomad_grch38, - output: - touch("annos/grch38/gnomad_{kind}/.done"), - - -rule annos_ucsc_conservation_download: - output: - fa="annos/{genome_build}/ucsc_conservation/download/knownGene.exonAA.fa.gz", - shell: - r""" - if [[ {wildcards.genome_build} == grch37 ]]; then - ucsc_name=hg19 - else - ucsc_name=hg38 - fi - - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --out={output.fa} \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - "https://hgdownload.cse.ucsc.edu/goldenpath/${{ucsc_name}}/multiz100way/alignments/knownGene.exonAA.fa.gz" - """ - - -rule annos_ucsc_conservation_to_vcf: - input: - hgnc="genes/hgnc/hgnc_info.jsonl", - enst_ensg="genes/enst_ensg/{genome_build}/enst_ensg.tsv", - reference="reference/{genome_build}/reference/reference.fa", - fa="annos/{genome_build}/ucsc_conservation/download/knownGene.exonAA.fa.gz", - output: - vcf="annos/{genome_build}/ucsc_conservation/ucsc_conservation.vcf.gz", - tbi="annos/{genome_build}/ucsc_conservation/ucsc_conservation.vcf.gz.tbi", - shell: - r""" - python scripts/knowngeneaa.py \ - {input.hgnc} \ - {input.enst_ensg} \ - {input.reference} \ - {input.fa} \ - --output /dev/stdout \ - | bcftools sort \ - -O z \ - -o {output.vcf} - tabix -f {output.vcf} - """ - - -rule annos_ucsc_conservation_to_tsv: - input: - header="header/knowngeneaa.txt", - vcf="annos/{genome_build}/ucsc_conservation/ucsc_conservation.vcf.gz", - output: - 
tsv="annos/{genome_build}/ucsc_conservation/ucsc_conservation.tsv", - shell: - r""" - ( - cat {input.header} | tr '\n' '\t' | sed -e 's/\t*$/\n/g'; - bcftools query \ - -f "%CHROM\t%POS\t%END\t%HGNC_ID\t%ENST_ID\t%EXON\t%EXON_COUNT\t%ALIGNMENT\n" \ - {input.vcf} \ - | uniq - ) \ - > {output.tsv} - """ - - -rule annos_dbsnp_download: - output: - vcf="annos/{genome_build}/dbsnp/dbsnp.vcf.gz", - vcf_tbi="annos/{genome_build}/dbsnp/dbsnp.vcf.gz.tbi", - shell: - r""" - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --out={output.vcf} \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh37p13/VCF/00-All.vcf.gz - tabix -f {output.vcf} - """ - - -CADD_VERSION = "1.6" -CADD_PREFIX = f"https://kircherlab.bihealth.org/download/CADD/v{CADD_VERSION}" - - -rule annos_cadd_download: - output: - tsv="annos/{genome_release}/cadd/download/{filename}.tsv.gz", - tsv_tbi="annos/{genome_release}/cadd/download/{filename}.tsv.gz.tbi", - shell: - r""" - for path in {output}; - do - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --out=$path \ - --split=16 \ - --max-concurrent-downloads=16 \ - --max-connection-per-server=16 \ - {CADD_PREFIX}/$(echo {wildcards.genome_release} | sed -e 's/grch/GRCh/')/$(basename $path) - done - """ - - -rule annos_cadd_process_37: - input: - "annos/grch37/cadd/download/whole_genome_SNVs_inclAnno.tsv.gz", - "annos/grch37/cadd/download/whole_genome_SNVs_inclAnno.tsv.gz.tbi", - "annos/grch37/cadd/download/InDels_inclAnno.tsv.gz", - "annos/grch37/cadd/download/InDels_inclAnno.tsv.gz.tbi", - output: - touch("annos/grch37/cadd/.done"), - - -rule annos_cadd_process_38: - input: - "annos/grch38/cadd/download/whole_genome_SNVs_inclAnno.tsv.gz", - "annos/grch38/cadd/download/whole_genome_SNVs_inclAnno.tsv.gz.tbi", - "annos/grch38/cadd/download/gnomad.genomes.r3.0.indel_inclAnno.tsv.gz", - 
"annos/grch38/cadd/download/gnomad.genomes.r3.0.indel_inclAnno.tsv.gz.tbi", - output: - touch("annos/grch38/cadd/.done"), - - -DBNSFP_VERSION = "4.4" -DBNSFP_ACADEMIC_URL = "https://usf.box.com/shared/static/bvfzmkpgtphvbmmrvb2iyl2jl21o49kc" -DBNSFP_COMMMERCIAL_URL = "https://usf.box.com/shared/static/a84zcdlkx2asq2nxh6xr2gdb4csmyvhk" - - -def files_dbnsfp(): - lst = [ - "dbNSFP{version}{variant}.readme.txt", - "dbNSFP{version}{variant}_variant.chr1.gz", - "dbNSFP{version}{variant}_variant.chr10.gz", - "dbNSFP{version}{variant}_variant.chr11.gz", - "dbNSFP{version}{variant}_variant.chr12.gz", - "dbNSFP{version}{variant}_variant.chr13.gz", - "dbNSFP{version}{variant}_variant.chr14.gz", - "dbNSFP{version}{variant}_variant.chr15.gz", - "dbNSFP{version}{variant}_variant.chr16.gz", - "dbNSFP{version}{variant}_variant.chr17.gz", - "dbNSFP{version}{variant}_variant.chr18.gz", - "dbNSFP{version}{variant}_variant.chr19.gz", - "dbNSFP{version}{variant}_variant.chr2.gz", - "dbNSFP{version}{variant}_variant.chr20.gz", - "dbNSFP{version}{variant}_variant.chr21.gz", - "dbNSFP{version}{variant}_variant.chr22.gz", - "dbNSFP{version}{variant}_variant.chr3.gz", - "dbNSFP{version}{variant}_variant.chr4.gz", - "dbNSFP{version}{variant}_variant.chr5.gz", - "dbNSFP{version}{variant}_variant.chr6.gz", - "dbNSFP{version}{variant}_variant.chr7.gz", - "dbNSFP{version}{variant}_variant.chr8.gz", - "dbNSFP{version}{variant}_variant.chr9.gz", - "dbNSFP{version}{variant}_variant.chrM.gz", - "dbNSFP{version}{variant}_variant.chrX.gz", - "dbNSFP{version}{variant}_variant.chrY.gz", - "dbNSFP{version}_gene.complete.gz", - "dbNSFP{version}_gene.gz", - "LICENSE.txt", - "try.vcf", - "tryhg18.in", - "tryhg19.in", - "tryhg38.in", - ] - return ["annos/grch37/dbnsfp-{version}{variant}/download/%s" % e for e in lst] - - -rule annos_dbnsfp_download: - output: - files_dbnsfp(), - zip="annos/grch37/dbnsfp-{version}{variant}/download/dbNSFP{version}{variant}.zip", - wildcard_constraints: - version=r"\d\.\d", - 
shell: - r""" - if [[ "{wildcards.variant}" == a ]]; then - url={DBNSFP_ACADEMIC_URL} - else - url={DBNSFP_COMMMERCIAL_URL} - fi - - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --out={output.zip} \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - $url - unzip -d $(dirname {output.zip}) {output.zip} - """ - - -rule annos_dbnsfp_process: - input: - zip="annos/grch37/dbnsfp-{version}{variant}/download/dbNSFP{version}{variant}.zip", - wildcard_constraints: - version=r"\d\.\d", - output: - touch("annos/{genome_release}/dbnsfp-{version}{variant}/.done"), - - -def files_dbscsnv(version: str = "1.1"): - """Files contained in the dbscSNV ZIP file.""" - chroms = [str(i) for i in range(1, 23)] + ["X", "Y"] - return [f"annos/grch37/dbscsnv/download/dbscSNV{version}.chr{chrom}" for chrom in chroms] - - -rule annos_dbscsnv_download: - output: - files_dbscsnv(), - zip="annos/grch37/dbscsnv/download/dbscSNV1.1.zip", - shell: - r""" - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --out={output.zip} \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbscSNV1.1.zip - unzip -d $(dirname {output.zip}) {output.zip} - """ - - -rule annos_dbscsnv_process: - input: - zip="annos/grch37/dbscsnv/download/dbscSNV1.1.zip", - output: - touch("annos/{genome_release}/dbscsnv/.done"), diff --git a/snakefiles/features.smk b/snakefiles/features.smk deleted file mode 100644 index ecd0ea7..0000000 --- a/snakefiles/features.smk +++ /dev/null @@ -1,149 +0,0 @@ -rule features_grch37_tad_domains: - output: - download_imr90="features/grch37/tads/download/IMR90_domains_hg19.bed", - download_hesc="features/grch37/tads/download/hESC_domains_hg19.bed", - bed_imr90="features/grch37/tads/imr90.bed", - bed_imr90_md5="features/grch37/tads/imr90.bed.md5", - bed_hesc="features/grch37/tads/hesc.bed", - bed_hesc_md5="features/grch37/tads/hesc.bed.md5", - shell: 
- r""" - set -x - - wget --no-check-certificate \ - -O {output.download_imr90} \ - https://compbio.med.harvard.edu/modencode/webpage/hic/IMR90_domains_hg19.bed - wget --no-check-certificate \ - -O {output.download_hesc} \ - https://compbio.med.harvard.edu/modencode/webpage/hic/hESC_domains_hg19.bed - - echo -e "#chrom\tbegin\tend" >{output.bed_imr90} - sed -e 's/^chr//' {output.download_imr90} >>{output.bed_imr90} - - echo -e "#chrom\tbegin\tend" >{output.bed_hesc} - sed -e 's/^chr//' {output.download_hesc} >>{output.bed_hesc} - - md5sum {output.bed_imr90} >{output.bed_imr90_md5} - md5sum {output.bed_hesc} >{output.bed_hesc_md5} - """ - - -rule features_grch37_refseq_gene_regions: - output: - download_acc="features/grch37/gene_regions/download/chr_accessions_GRCh37.p13", - download_gtf="features/grch37/gene_regions/download/GCF_000001405.25_GRCh37.p13_genomic.gtf.gz", - tsv="features/grch37/gene_regions/refseq.bed.gz", - tsv_md5="features/grch37/gene_regions/refseq.bed.gz.md5", - tsv_tbi="features/grch37/gene_regions/refseq.bed.gz.tbi", - tsv_tbi_md5="features/grch37/gene_regions/refseq.bed.gz.tbi.md5", - shell: - r""" - set -x - - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" EXIT ERR - - wget --no-check-certificate \ - -O {output.download_acc} \ - 'https://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/H_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/Assembled_chromosomes/chr_accessions_GRCh37.p13' - - wget --no-check-certificate \ - -O {output.download_gtf} \ - 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/annotation_releases/105.20220307/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gtf.gz' - - awk \ - -F $'\t' \ - -f scripts/features-refseq-gene-regions.awk \ - {output.download_acc} \ - <(zcat {output.download_gtf}) \ - | egrep '^#|^X|^Y|^M|^[1-9]' \ - | awk -F $'\t' \ - -f scripts/sort-bed.awk \ - | bgzip -c \ - > {output.tsv} - - tabix -f {output.tsv} - - md5sum {output.tsv} >{output.tsv_md5} - md5sum 
{output.tsv_tbi} >{output.tsv_tbi_md5} - """ - - -rule features_grch37_ensembl_gene_regions: - output: - download_gtf="features/grch37/gene_regions/download/Homo_sapiens.GRCh37.87.gtf.gz", - tsv="features/grch37/gene_regions/ensembl.bed.gz", - tsv_md5="features/grch37/gene_regions/ensembl.bed.gz.md5", - tsv_tbi="features/grch37/gene_regions/ensembl.bed.gz.tbi", - tsv_tbi_md5="features/grch37/gene_regions/ensembl.bed.gz.tbi.md5", - shell: - r""" - set -x - - wget --no-check-certificate \ - -O {output.download_gtf} \ - 'https://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz' - - awk \ - -F $'\t' \ - -f scripts/features-ensembl-gene-regions.awk \ - <(zcat {output.download_gtf}) \ - | egrep '^#|^X|^Y|^M|^[1-9]' \ - | awk -F $'\t' \ - -f scripts/sort-bed.awk \ - | bgzip -c \ - > {output.tsv} - - tabix -f {output.tsv} - - md5sum {output.tsv} >{output.tsv_md5} - md5sum {output.tsv_tbi} >{output.tsv_tbi_md5} - """ - - -rule features_grch37_masked_repeat: - input: - bed="tracks/grch37/ucsc_rmsk.bed.gz", - output: - bed="features/grch37/masked/repeat.bed.gz", - bed_md5="features/grch37/masked/repeat.bed.gz.md5", - bed_tbi="features/grch37/masked/repeat.bed.gz.tbi", - bed_tbi_md5="features/grch37/masked/repeat.bed.gz.tbi.md5", - shell: - r""" - set -x - - zcat {input.bed} \ - | egrep '^#|^X|^Y|^M|^[1-9]' \ - | egrep -v '^Un|_random|_fix|_alt|_hap' \ - | bgzip -c \ - > {output.bed} - tabix -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ - - -rule features_grch37_masked_segdup: - input: - bed="tracks/grch37/ucsc_genomicSuperDups.bed.gz", - output: - bed="features/grch37/masked/segdup.bed.gz", - bed_md5="features/grch37/masked/segdup.bed.gz.md5", - bed_tbi="features/grch37/masked/segdup.bed.gz.tbi", - bed_tbi_md5="features/grch37/masked/segdup.bed.gz.tbi.md5", - shell: - r""" - set -x - - zcat {input.bed} \ - | egrep '^#|^X|^Y|^M|^[1-9]' \ - | egrep -v '^Un|_random|_fix|_alt|_hap' 
\ - | bgzip -c \ - > {output.bed} - tabix -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ diff --git a/snakefiles/genes.smk b/snakefiles/genes.smk deleted file mode 100644 index e76e83d..0000000 --- a/snakefiles/genes.smk +++ /dev/null @@ -1,274 +0,0 @@ -rule genes_xlink_ensembl: - output: - tsv="genes/xlink/ensembl.tsv", - tsv_md5="genes/xlink/ensembl.tsv.md5", - shell: - r""" - echo -e "ensembl_gene_id\tensembl_transcript_id\tentrez_id\tgene_symbol" >{output.tsv} - - wget --no-check-certificate \ - -O- \ - 'https://ensembl.org/biomart/martservice?query=' \ - | sort -u \ - >> {output.tsv} - - md5sum {output.tsv} >{output.tsv_md5} - """ - - -rule genes_xlink_hgnc: - output: - download_json="genes/xlink/download/hgnc/hgnc_complete_set.json", - tsv="genes/xlink/hgnc.tsv", - tsv_md5="genes/xlink/hgnc.tsv.md5", - shell: - r""" - set -x - - wget --no-check-certificate \ - -O {output.download_json} \ - https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json - - jq \ - --raw-output \ - --from-file scripts/genes-xlink-hgnc.jq \ - {output.download_json} \ - > {output.tsv} - - md5sum {output.tsv} > {output.tsv_md5} - """ - - -rule genes_hgnc_info: - input: - download_json="genes/xlink/download/hgnc/hgnc_complete_set.json", - output: - tsv="genes/hgnc/hgnc_info.jsonl", - tsv_md5="genes/hgnc/hgnc_info.jsonl.md5", - shell: - r""" - set -x - - jq \ - --compact-output \ - --raw-output \ - --from-file scripts/genes-hgnc-info.jq \ - {input.download_json} \ - > {output.tsv} - - md5sum {output.tsv} > {output.tsv_md5} - """ - - -rule genes_gnomad_constraints_v2_1_1_download: - output: - bgz="genes/gnomad_constraints/download/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz", - bgz_md5="genes/gnomad_constraints/download/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz.md5", - shell: - r""" - set -x - - wget --no-check-certificate \ - -O {output.bgz} \ - 
https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz - - md5sum {output.bgz} >{output.bgz_md5} - """ - - -def run_genes_gnomad_constraints_v2_1_1_to_tsv(wildcards): - """Extra function because of snakefmt issues.""" - columns_src = [ - "transcript", - "exp_lof", - "exp_mis", - "exp_syn", - "mis_z", - "obs_lof", - "obs_mis", - "obs_syn", - "oe_lof", - "oe_lof_lower", - "oe_lof_upper", - "oe_mis", - "oe_mis_lower", - "oe_mis_upper", - "oe_syn", - "oe_syn_lower", - "oe_syn_upper", - "pLI", - "syn_z", - "exac_pLI", - "exac_obs_lof", - "exac_exp_lof", - "exac_oe_lof", - ] - columns_src_str = ",".join(columns_src) - columns_tmp = ["ensembl_transcript_id"] + columns_src[1:] - columns_tmp_str = ",".join(columns_tmp) - columns_dst = ["ensembl_gene_id", "entrez_id", "gene_symbol"] + columns_src[1:] - columns_dst_str = ",".join(columns_dst) - shell( - r""" - set -x - - zcat {input.bgz} \ - | tr '\t' ',' \ - > {output.txt_tmp} - - qsv select {columns_src_str} {output.txt_tmp} \ - | qsv rename {columns_tmp_str} \ - | qsv sort -u \ - | tr ',' '\t' \ - > {output.tsv_tmp} - - qsv join -d '\t' ensembl_transcript_id {output.tsv_tmp} ensembl_transcript_id {input.xlink_ensembl} \ - | qsv select {columns_dst_str} \ - | tr ',' '\t' \ - > {output.tsv} - - md5sum {output.tsv} >{output.tsv_md5} - """ - ) - - -rule genes_gnomad_constraints_v2_1_1_to_tsv: - input: - bgz="genes/gnomad_constraints/download/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz", - xlink_ensembl="genes/xlink/ensembl.tsv", - output: - txt_tmp="genes/gnomad_constraints/download/gnomad.v2.1.1.lof_metrics.by_gene.txt", - tsv_tmp="genes/gnomad_constraints/download/gnomad_constraints-subset.tsv", - tsv="genes/gnomad_constraints/gnomad_constraints.tsv", - tsv_md5="genes/gnomad_constraints/gnomad_constraints.tsv.md5", - run: - run_genes_gnomad_constraints_v2_1_1_to_tsv(wildcards) - - -rule genes_mim2gene: - output: - 
download="genes/mim2gene/download/mim2gene_medgen", - tsv="genes/mim2gene/mim2gene.tsv", - tsv_md5="genes/mim2gene/mim2gene.tsv.md5", - shell: - r""" - wget --no-check-certificate \ - -O {output.download} \ - https://ftp.ncbi.nih.gov/gene/DATA/mim2gene_medgen - - awk -f scripts/genes-mim2gene.awk \ - -F $'\t' \ - {output.download} \ - > {output.tsv} - - md5sum {output.tsv} >{output.tsv_md5} - """ - - -rule genes_gene_download: - output: - ags="genes/ncbi/download/Homo_sapiens.ags.gz", - ags_md5="genes/ncbi/download/Homo_sapiens.ags.gz.md5", - gene2xml="genes/ncbi/download/linux64.gene2xml", - gene2xml_md5="genes/ncbi/download/linux64.gene2xml.md5", - shell: - r""" - set -x - - if [[ ! -e $(dirname {output.ags})/Homo_sapiens.ags.gz.md5 ]]; then - wget --no-check-certificate \ - -O $(dirname {output.ags})/Homo_sapiens.ags.gz \ - https://ftp.ncbi.nih.gov/gene/DATA/ASN_BINARY/Mammalia/Homo_sapiens.ags.gz - pushd $(dirname {output.ags}) - md5sum Homo_sapiens.ags.gz > Homo_sapiens.ags.gz.md5 - popd - fi - - if [[ ! -e $(dirname {output.ags})/linux64.gene2xml.md5 ]]; then - wget --no-check-certificate \ - -O $(dirname {output.ags})/linux64.gene2xml.gz \ - https://ftp.ncbi.nlm.nih.gov/asn1-converters/by_program/gene2xml/linux64.gene2xml.gz - pushd $(dirname {output.ags}) - gzip -d -c linux64.gene2xml.gz > linux64.gene2xml - chmod +x linux64.gene2xml - md5sum linux64.gene2xml > linux64.gene2xml.md5 - popd - fi - """ - - -rule result_noref_ncbi_gene_process: - input: - ags="genes/ncbi/download/Homo_sapiens.ags.gz", - gene2xml="genes/ncbi/download/linux64.gene2xml", - output: - jsonl="genes/ncbi/gene_info.jsonl", - jsonl_md5="genes/ncbi/gene_info.jsonl.md5", - shell: - r""" - ./{input.gene2xml} -b T -c T -i {input.ags} \ - | python3 scripts/refseq_xml_to_json.py \ - > {output.jsonl} - md5sum {output.jsonl} >{output.jsonl_md5} - """ - - -# For GRCh37, we use ucsc transcript ID instead of enst as the conservation -# file from UCSC uses these IDs. 
-rule genes_enst_ensg_grch37: - output: - download_txt="genes/enst_ensg/grch37/download/knowntoEnsembl.txt.gz", - download_gtf="genes/enst_ensg/grch37/download/GCF_000001405.25_GRCh37.p13_genomic.gtf.gz", - tmp1="genes/enst_ensg/grch37/download/tmp1.txt", - tmp2="genes/enst_ensg/grch37/download/tmp2.txt", - tsv="genes/enst_ensg/grch37/enst_ensg.tsv", - tsv_md5="genes/enst_ensg/grch37/enst_ensg.tsv.md5", - shell: - r""" - set -x - export LC_ALL=C - - wget --no-check-certificate \ - -O {output.download_gtf} \ - 'https://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz' - wget --no-check-certificate \ - -O {output.download_txt} \ - 'https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/knownToEnsembl.txt.gz' - - awk \ - -F $'\t' \ - -f scripts/genes-enst-ensg.awk \ - <(zcat {output.download_gtf}) \ - | sort \ - > {output.tmp1} - zcat {output.download_txt} \ - | sed -e 's/\..//g' \ - | sort -k2,2 \ - >> {output.tmp2} - - echo -e "real_enst\tenst\tensg" > {output.tsv} - join -t $'\t' -1 2 -2 1 {output.tmp2} {output.tmp1} \ - >> {output.tsv} - - md5sum {output.tsv} >{output.tsv_md5} - """ - - -# We use the full dbNSFP genes information file. -rule genes_dbnsfp_genes_copy: - input: - tsv=f"annos/grch37/dbnsfp-{DBNSFP_VERSION}a/download/dbNSFP{DBNSFP_VERSION}_gene.complete.gz", - output: - tsv="genes/dbnsfp/genes.tsv.gz", - tsv_md5="genes/dbnsfp/genes.tsv.gz.md5", - shell: - r""" - set -x - export LC_ALL=C - - zcat {input.tsv} \ - | pigz -c \ - > {output.tsv} - - md5sum {output.tsv} >{output.tsv_md5} - """ diff --git a/snakefiles/reference.smk b/snakefiles/reference.smk deleted file mode 100644 index 5751a1e..0000000 --- a/snakefiles/reference.smk +++ /dev/null @@ -1,35 +0,0 @@ -# Download of reference FASTA files. 
- -REFERENCE_URLS = { - "grch37": ( - "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/" - "phase2_reference_assembly_sequence/hs37d5.fa.gz" - ), - "GRCh38": ( - "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/" - "seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz" - ), -} - - -rule reference_download: - output: - download="reference/{genome_build}/reference/download/reference.fa.gz", - fasta="reference/{genome_build}/reference/reference.fa", - fasta_fai="reference/{genome_build}/reference/reference.fa.fai", - run: - ref_url = REFERENCE_URLS[wildcards.genome_build] - shell( - r""" - aria2c \ - --check-certificate=false \ - --out={output.download} \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - {ref_url} - - pigz -d -c {output.download} >{output.fasta} - samtools faidx {output.fasta} - """ - ) diff --git a/snakefiles/tracks-grch37.smk b/snakefiles/tracks-grch37.smk deleted file mode 100644 index b0d6ba4..0000000 --- a/snakefiles/tracks-grch37.smk +++ /dev/null @@ -1,89 +0,0 @@ -rule tracks_grch37_ucsc_genomic_super_dups: - output: - bed="tracks/grch37/ucsc_genomicSuperDups.bed.gz", - bed_md5="tracks/grch37/ucsc_genomicSuperDups.bed.gz.md5", - bed_tbi="tracks/grch37/ucsc_genomicSuperDups.bed.gz.tbi", - bed_tbi_md5="tracks/grch37/ucsc_genomicSuperDups.bed.gz.tbi.md5", - txt="tracks/grch37/download/genomicSuperDups.txt.gz", - shell: - r""" - set -x - - mkdir -p $(dirname {output.txt}) - wget -O {output.txt} https://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/genomicSuperDups.txt.gz - - ( - echo -e "#chrom\tbegin\tend\tlabel" - zcat {output.txt} \ - | cut -f 2,3,4,5 \ - | sed -e 's/^chr//g' \ - ) \ - | bgzip -c \ - > {output.bed} - - tabix -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ - - -rule tracks_grch37_ucsc_rmsk: - output: - bed="tracks/grch37/ucsc_rmsk.bed.gz", - 
bed_md5="tracks/grch37/ucsc_rmsk.bed.gz.md5", - bed_tbi="tracks/grch37/ucsc_rmsk.bed.gz.tbi", - bed_tbi_md5="tracks/grch37/ucsc_rmsk.bed.gz.tbi.md5", - txt="tracks/grch37/download/rmsk.txt.gz", - shell: - r""" - set -x - - mkdir -p $(dirname {output.txt}) - wget -O {output.txt} https://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/rmsk.txt.gz - - ( - echo -e "#chrom\tbegin\tend\tlabel" - zcat {output.txt} \ - | awk -F $'\t' 'BEGIN {{ OFS=FS }} {{ if ($12 == $13) {{ label = $13 "/" $11 }} else {{ label = $12 "/" $13 "/" $11 }} print $6, $7, $8, label }}' \ - | sed -e 's/^chr//g' \ - ) \ - | bgzip -c \ - > {output.bed} - - tabix -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ - - -rule tracks_grch37_ucsc_x_seq_lift_over_psl: - output: - bed="tracks/grch37/ucsc_{prefix}SeqLiftOverPsl.bed.gz", - bed_md5="tracks/grch37/ucsc_{prefix}SeqLiftOverPsl.bed.gz.md5", - bed_tbi="tracks/grch37/ucsc_{prefix}SeqLiftOverPsl.bed.gz.tbi", - bed_tbi_md5="tracks/grch37/ucsc_{prefix}SeqLiftOverPsl.bed.gz.tbi.md5", - txt="tracks/grch37/download/{prefix}SeqLiftOverPsl.txt.gz", - shell: - r""" - set -x - - mkdir -p $(dirname {output.txt}) - wget -O {output.txt} https://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/{wildcards.prefix}SeqLiftOverPsl.txt.gz - - ( - echo -e "#chrom\tbegin\tend\tlabel" - zcat {output.txt} \ - | awk -F $'\t' 'BEGIN {{ OFS=FS }} {{ if ($11 ~ /{wildcards.prefix}/ && $15 !~ /random/ && $15 !~ /hap/) {{ print $15, $17, $18, $11 }} }}' \ - | awk -F $'\t' -f scripts/sort-bed.awk \ - | sed -e 's/^chr//g' \ - ) \ - | bgzip -c \ - > {output.bed} - - tabix -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ diff --git a/snakefiles/vardbs-grch37-strucvars.smk b/snakefiles/vardbs-grch37-strucvars.smk deleted file mode 100644 index 38eb765..0000000 --- a/snakefiles/vardbs-grch37-strucvars.smk +++ /dev/null @@ -1,226 +0,0 @@ -rule 
vardbs_grch37_struc_vars_clinvar: - output: - bed="vardbs/grch37/strucvar/clinvar.bed.gz", - bed_md5="vardbs/grch37/strucvar/clinvar.bed.gz.md5", - bed_tbi="vardbs/grch37/strucvar/clinvar.bed.gz.tbi", - bed_tbi_md5="vardbs/grch37/strucvar/clinvar.bed.gz.tbi.md5", - shell: - r""" - set -x - export LC_ALL=C - - awk \ - -F $'\t' \ - -f scripts/vardbs-grch37-strucvar-clinvar.awk \ - vardbs/grch37/strucvar/clinvar.tsv \ - | awk -F $'\t' \ - -f scripts/sort-bed.awk \ - | bgzip -c \ - > {output.bed} - - tabix -p bed -S 1 -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ - - -rule vardbs_grch37_struc_vars_dbvar: - output: - download=expand( - "vardbs/grch37/strucvar/download/GRCh37.nr_{type}.tsv.gz", - type=["deletions", "duplications", "insertions"], - ), - bed="vardbs/grch37/strucvar/dbvar.bed.gz", - bed_md5="vardbs/grch37/strucvar/dbvar.bed.gz.md5", - bed_tbi="vardbs/grch37/strucvar/dbvar.bed.gz.tbi", - bed_tbi_md5="vardbs/grch37/strucvar/dbvar.bed.gz.tbi.md5", - shell: - r""" - set -x - export LC_ALL=C - - for dst in {output.download}; do - type=$(basename $dst | cut -d _ -f 2 | cut -d . 
-f 1) - wget --no-check-certificate \ - -O $dst \ - https://ftp.ncbi.nlm.nih.gov/pub/dbVar/sandbox/sv_datasets/nonredundant/$type/GRCh37.nr_$type.tsv.gz - done - - awk \ - -F $'\t' \ - -f scripts/vardbs-grch37-strucvar-dbvar.awk \ - <(zcat {output.download}) \ - | awk -F $'\t' \ - -f scripts/sort-bed.awk \ - | bgzip -c \ - > {output.bed} - - tabix -p bed -S 1 -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ - - -rule vardbs_grch37_struc_vars_dgv: - output: - download_txt="vardbs/grch37/strucvar/download/GRCh37_hg19_variants_2020-02-25.txt", - bed="vardbs/grch37/strucvar/dgv.bed.gz", - bed_md5="vardbs/grch37/strucvar/dgv.bed.gz.md5", - bed_tbi="vardbs/grch37/strucvar/dgv.bed.gz.tbi", - bed_tbi_md5="vardbs/grch37/strucvar/dgv.bed.gz.tbi.md5", - shell: - r""" - set -x - export LC_ALL=C - - wget --no-check-certificate \ - -O {output.download_txt} \ - http://dgv.tcag.ca/dgv/docs/GRCh37_hg19_variants_2020-02-25.txt - - awk \ - -F $'\t' \ - -f scripts/vardbs-grch37-strucvar-dgv.awk \ - {output.download_txt} \ - | grep -v _gl \ - | awk -F $'\t' \ - -f scripts/sort-bed.awk \ - | bgzip -c \ - > {output.bed} - - tabix -p bed -S 1 -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ - - -rule vardbs_grch37_struc_vars_dgv_gs: - output: - download_gff3="vardbs/grch37/strucvar/download/DGV.GS.March2016.50percent.GainLossSep.Final.hg19.gff3", - bed="vardbs/grch37/strucvar/dgv_gs.bed.gz", - bed_md5="vardbs/grch37/strucvar/dgv_gs.bed.gz.md5", - bed_tbi="vardbs/grch37/strucvar/dgv_gs.bed.gz.tbi", - bed_tbi_md5="vardbs/grch37/strucvar/dgv_gs.bed.gz.tbi.md5", - shell: - r""" - set -x - export LC_ALL=C - - wget --no-check-certificate \ - -O {output.download_gff3} \ - http://dgv.tcag.ca/dgv/docs/DGV.GS.March2016.50percent.GainLossSep.Final.hg19.gff3 - - awk \ - -F $'\t' \ - -f scripts/vardbs-grch37-strucvar-dgv_gs.awk \ - {output.download_gff3} \ - | grep -v _gl \ - | 
awk -F $'\t' \ - -f scripts/sort-bed.awk \ - | bgzip -c \ - > {output.bed} - - tabix -p bed -S 1 -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ - - -rule vardbs_grch37_struc_vars_exac: - output: - download_bed="vardbs/grch37/strucvar/download/exac-final.autosome-1pct-sq60-qc-prot-coding.cnv.bed", - bed="vardbs/grch37/strucvar/exac.bed.gz", - bed_md5="vardbs/grch37/strucvar/exac.bed.gz.md5", - bed_tbi="vardbs/grch37/strucvar/exac.bed.gz.tbi", - bed_tbi_md5="vardbs/grch37/strucvar/exac.bed.gz.tbi.md5", - shell: - r""" - set -x - export LC_ALL=C - - wget --no-check-certificate \ - -O {output.download_bed} \ - ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3.1/cnv/exac-final.autosome-1pct-sq60-qc-prot-coding.cnv.bed - - awk \ - -f scripts/vardbs-grch37-strucvar-exac.awk \ - {output.download_bed} \ - | awk -F $'\t' \ - -f scripts/sort-bed.awk \ - | bgzip -c \ - > {output.bed} - - tabix -p bed -S 1 -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ - - -rule vardbs_grch37_struc_vars_g1k: - output: - vcf="vardbs/grch37/strucvar/download/ALL.wgs.mergedSV.v8.20130502.svs.genotypes.vcf.gz", - bed="vardbs/grch37/strucvar/g1k.bed.gz", - bed_md5="vardbs/grch37/strucvar/g1k.bed.gz.md5", - bed_tbi="vardbs/grch37/strucvar/g1k.bed.gz.tbi", - bed_tbi_md5="vardbs/grch37/strucvar/g1k.bed.gz.tbi.md5", - shell: - r""" - set -x - - wget --no-check-certificate \ - -O {output.vcf} \ - https://ftp-trace.ncbi.nih.gov/1000genomes/ftp/phase3/integrated_sv_map/ALL.wgs.integrated_sv_map_v2.20130502.svs.genotypes.vcf.gz - - zcat {output.vcf} \ - | awk \ - -F $'\t' \ - -f scripts/vardbs-grch37-strucvar-g1k.awk \ - | awk -F $'\t' \ - -f scripts/sort-bed.awk \ - | bgzip -c \ - > {output.bed} - - tabix -p bed -S 1 -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ - - -rule vardbs_grch37_struc_vars_gnomad_sv: 
- output: - vcf="vardbs/grch37/strucvar/download/gnomad_v2.1_sv.sites.vcf.gz", - bed="vardbs/grch37/strucvar/gnomad_sv.bed.gz", - bed_md5="vardbs/grch37/strucvar/gnomad_sv.bed.gz.md5", - bed_tbi="vardbs/grch37/strucvar/gnomad_sv.bed.gz.tbi", - bed_tbi_md5="vardbs/grch37/strucvar/gnomad_sv.bed.gz.tbi.md5", - shell: - r""" - set -x - - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" ERR EXIT - - wget --no-check-certificate \ - -O {output.vcf} \ - https://storage.googleapis.com/gcp-public-data--gnomad/papers/2019-sv/gnomad_v2.1_sv.sites.vcf.gz - - echo -e "#chromosome\tbegin\tend\tsv_type\tn_homalt\tn_het" \ - > $TMPDIR/tmp.bed - - bcftools query \ - -e 'SVTYPE="MCNV"' \ - -f "%CHROM\t%POS0\t%INFO/END\t%INFO/SVTYPE\t%INFO/N_HOMALT\t%INFO/N_HET\n" \ - {output.vcf} \ - >> $TMPDIR/tmp.bed - bgzip -c $TMPDIR/tmp.bed >{output.bed} - - tabix -p bed -S 1 -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ diff --git a/varfish_db_downloader/data_versions.py b/varfish_db_downloader/data_versions.py new file mode 100644 index 0000000..1a134b1 --- /dev/null +++ b/varfish_db_downloader/data_versions.py @@ -0,0 +1,15 @@ +"""Declaration of data versions.""" + +import attrs + + +@attrs.frozen() +class DataVersions: + #: Version of dbNSFP. + dbnsfp: str + + +#: The data versions to use. +DATA_VERSIONS = DataVersions( + dbnsfp="4.4", +)