From d731bf5f3b8f0bd69d95c78d5765ed176d2cc81b Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Wed, 7 Jun 2023 09:48:19 +0200 Subject: [PATCH] feat: binary conversion with annonars and worker (#35) --- Snakefile | 102 +++++++++---- rules/annos/features/cons.smk | 105 ------------- rules/annos/features/ensembl.smk | 63 -------- rules/annos/features/refseq.smk | 82 ---------- rules/annos/features/tads.smk | 54 ------- rules/annos/features/ucsc.smk | 113 -------------- rules/annos/seqvars/cadd.smk | 42 ----- rules/annos/seqvars/dbnsfp.smk | 85 ----------- rules/annos/seqvars/dbscsnv.smk | 37 ----- rules/annos/seqvars/dbsnp.smk | 30 ---- rules/annos/seqvars/gnomad_mtdna.smk | 45 ------ rules/annos/seqvars/gnomad_nuclear.smk | 143 ------------------ rules/annos/seqvars/helix.smk | 39 ----- rules/annos/strucvars/dbvar.smk | 74 --------- rules/annos/strucvars/dgv.smk | 112 -------------- rules/annos/strucvars/exac.smk | 36 ----- rules/annos/strucvars/g1k.smk | 37 ----- rules/annos/strucvars/gnomad.smk | 43 ------ rules/genes/dbnsfp.smk | 17 --- rules/genes/ensembl.smk | 112 -------------- rules/genes/gnomad.smk | 85 ----------- rules/genes/hgnc.smk | 68 --------- rules/genes/ncbi.smk | 94 ------------ rules/reference/human.smk | 51 ------- .../{data_versions.py => versions.py} | 30 ++++ varfish_db_downloader/wget.py | 29 +++- 26 files changed, 129 insertions(+), 1599 deletions(-) delete mode 100644 rules/annos/features/cons.smk delete mode 100644 rules/annos/features/ensembl.smk delete mode 100644 rules/annos/features/refseq.smk delete mode 100644 rules/annos/features/tads.smk delete mode 100644 rules/annos/features/ucsc.smk delete mode 100644 rules/annos/seqvars/cadd.smk delete mode 100644 rules/annos/seqvars/dbnsfp.smk delete mode 100644 rules/annos/seqvars/dbscsnv.smk delete mode 100644 rules/annos/seqvars/dbsnp.smk delete mode 100644 rules/annos/seqvars/gnomad_mtdna.smk delete mode 100644 rules/annos/seqvars/gnomad_nuclear.smk delete mode 100644 rules/annos/seqvars/helix.smk delete mode 100644 rules/annos/strucvars/dbvar.smk delete mode 100644 rules/annos/strucvars/dgv.smk delete mode 100644 rules/annos/strucvars/exac.smk delete mode 100644 rules/annos/strucvars/g1k.smk delete mode 100644 rules/annos/strucvars/gnomad.smk delete mode 100644 rules/genes/dbnsfp.smk delete mode 100644 rules/genes/ensembl.smk delete mode 100644 rules/genes/gnomad.smk delete mode 100644 rules/genes/hgnc.smk delete mode 100644 rules/genes/ncbi.smk delete mode 100644 rules/reference/human.smk rename varfish_db_downloader/{data_versions.py => versions.py} (79%) diff --git a/Snakefile b/Snakefile index 0f26db9..6fe1aad 100644 --- a/Snakefile +++ b/Snakefile @@ -6,13 +6,17 @@ # ``varfish-server-worker`` and is used in the backend for filtering and/or exposed to the # user via a REST API. -from varfish_db_downloader.data_versions import DATA_VERSIONS as DV +from varfish_db_downloader.versions import DATA_VERSIONS as DV, PACKAGE_VERSIONS as PV # The prefix to use for all shell commands. SHELL_PREFIX = "export LC_ALL=C; set -x -euo pipefail;" # Setup the shell prefix by default. shell.prefix(SHELL_PREFIX) +# Regular expression for genome release. +RE_GENOME = r"grch(37|38)" +# Regular expression for versions. +RE_VERSION = r"\d+(\.\d+)*" # =============================================================================================== # Test Mode @@ -58,6 +62,8 @@ rule help: ## all -- run all rules rule all: input: + # == work directory ===================================================================== + # # genes f"work/genes/dbnsfp/{DV.dbnsfp}/genes.tsv.gz", f"work/genes/ensembl/{DV.ensembl}/ensembl_xlink.tsv", @@ -76,7 +82,7 @@ rule all: f"work/download/annos/grch37/seqvars/dbnsfp/{DV.dbnsfp}c/LICENSE.txt", f"work/download/annos/grch37/seqvars/dbscsnv/{DV.dbscsnv}/dbscSNV{DV.dbscsnv}.chr1", f"work/download/annos/grch37/seqvars/dbsnp/{DV.dbsnp}/dbsnp.vcf.gz", - "work/annos/grch37/seqvars/helixmtdb/20200327/helixmtdb.vcf.gz", + f"work/annos/grch37/seqvars/helixmtdb/{DV.helixmtdb}/helixmtdb.vcf.gz", f"work/annos/grch37/seqvars/gnomad_mtdna/{DV.gnomad_mtdna}/gnomad_mtdna.vcf.gz", f"work/annos/grch37/seqvars/gnomad_exomes/{DV.gnomad_v2}/.done", f"work/annos/grch37/seqvars/gnomad_genomes/{DV.gnomad_v2}/.done", @@ -86,10 +92,10 @@ rule all: # NB: dbNSFP is dual reference (for download) # NB: dbscSNV is dual reference (for download) f"work/download/annos/grch37/seqvars/dbsnp/{DV.dbsnp}/dbsnp.vcf.gz", - "work/annos/grch38/seqvars/helixmtdb/20200327/helixmtdb.vcf.gz", - f"work/annos/grch38/seqvars/gnomad_mtdna/{DV.gnomad_mtdna}/gnomad_mtdna.vcf.gz", - f"work/annos/grch38/seqvars/gnomad_exomes/{DV.gnomad_v2}/.done", - f"work/annos/grch38/seqvars/gnomad_genomes/{DV.gnomad_v3}/.done", + f"work/download/annos/grch38/seqvars/helixmtdb/{DV.helixmtdb}/helixmtdb.vcf.gz", + f"work/download/annos/grch38/seqvars/gnomad_mtdna/{DV.gnomad_mtdna}/gnomad_mtdna.vcf.gz", + f"work/download/annos/grch38/seqvars/gnomad_exomes/{DV.gnomad_v2}/.done", + f"work/download/annos/grch38/seqvars/gnomad_genomes/{DV.gnomad_v3}/.done", # -- background/population structural variants and annoations thereof # ---- GRCh37 f"work/annos/grch37/strucvars/dbvar/{DV.dbvar}/dbvar.bed.gz", @@ -122,6 +128,41 @@ rule all: f"work/annos/grch38/features/ucsc/{DV.ucsc_rmsk_38}/rmsk.bed.gz", f"work/annos/grch38/features/ucsc/{DV.ucsc_alt_seq_liftover_38}/altSeqLiftOverPsl.bed.gz", f"work/annos/grch38/features/ucsc/{DV.ucsc_fix_seq_liftover_38}/fixSeqLiftOverPsl.bed.gz", + # + # == output directory =================================================================== + # + # -- mehari data + # ---- frequencies (via annonars) + f"output/mehari/freqs-grch37-{DV.gnomad_v2}+{DV.gnomad_v2}+{DV.gnomad_mtdna}+{DV.helixmtdb}+{PV.annonars}/rocksdb/IDENTITY", + f"output/mehari/freqs-grch38-{DV.gnomad_v3}+{DV.gnomad_v2}+{DV.gnomad_mtdna}+{DV.helixmtdb}+{PV.annonars}/rocksdb/IDENTITY", + # # -- varfish-server-worker data + # # ---- CADD + # f"output/worker/annos/seqvars/cadd-grch37-{DV.cadd}+{PV.annonars}/rocksdb/IDENTITY", + # f"output/worker/annos/seqvars/cadd-grch38-{DV.cadd}+{PV.annonars}/rocksdb/IDENTITY", + # # ---- dbSNP + # f"output/worker/annos/seqvars/dbsnp-grch37-{DV.dbsnp}+{PV.annonars}/rocksdb/IDENTITY", + # f"output/worker/annos/seqvars/dbsnp-grch38-{DV.dbsnp}+{PV.annonars}/rocksdb/IDENTITY", + # # ---- dbNSFP + # f"output/worker/annos/seqvars/dbnsfp-grch37-{DV.dbnsfp}+{PV.annonars}/rocksdb/IDENTITY", + # f"output/worker/annos/seqvars/dbnsfp-grch38-{DV.dbnsfp}+{PV.annonars}/rocksdb/IDENTITY", + # # ---- dbscSNV + # f"output/worker/annos/seqvars/dbscsnv-grch37-{DV.dbscsnv}+{PV.annonars}/rocksdb/IDENTITY", + # f"output/worker/annos/seqvars/dbscsnv-grch38-{DV.dbscsnv}+{PV.annonars}/rocksdb/IDENTITY", + # # ---- gnomAD mtDNA + # f"output/worker/annos/seqvars/gnomad-mtdna-grch37-{DV.gnomad_mtdna}+{PV.annonars}/rocksdb/IDENTITY", + # f"output/worker/annos/seqvars/gnomad-mtdna-grch38-{DV.gnomad_mtdna}+{PV.annonars}/rocksdb/IDENTITY", + # # ---- gnomAD exomes + # f"output/worker/annos/seqvars/gnomad-exomes-grch37-{DV.gnomad_v2}+{PV.annonars}/rocksdb/IDENTITY", + # f"output/worker/annos/seqvars/gnomad-exomes-grch38-{DV.gnomad_v2}+{PV.annonars}/rocksdb/IDENTITY", + # # ---- gnomAD genomes + # f"output/worker/annos/seqvars/gnomad-genomes-grch37-{DV.gnomad_v2}+{PV.annonars}/rocksdb/IDENTITY", + # f"output/worker/annos/seqvars/gnomad-genomes-grch38-{DV.gnomad_v3}+{PV.annonars}/rocksdb/IDENTITY", + # # ---- HelixMtDb + # f"output/worker/annos/seqvars/helixmtdb-grch37-{DV.helixmtdb}+{PV.annonars}/rocksdb/IDENTITY", + # f"output/worker/annos/seqvars/helixmtdb-grch38-{DV.helixmtdb}+{PV.annonars}/rocksdb/IDENTITY", + # # ---- UCSC conservation + # f"output/worker/annos/seqvars/cons-grch37-{DV.ucsc_cons_37}+{PV.annonars}/rocksdb/IDENTITY", + # f"output/worker/annos/seqvars/cons-grch38-{DV.ucsc_cons_38}+{PV.annonars}/rocksdb/IDENTITY", # =============================================================================================== @@ -129,31 +170,34 @@ rule all: # =============================================================================================== +# -- work directory ----------------------------------------------------------------------------- # Gene-related rules. -include: "rules/genes/dbnsfp.smk" -include: "rules/genes/ensembl.smk" -include: "rules/genes/gnomad.smk" -include: "rules/genes/hgnc.smk" -include: "rules/genes/ncbi.smk" +include: "rules/work/genes/dbnsfp.smk" +include: "rules/work/genes/ensembl.smk" +include: "rules/work/genes/gnomad.smk" +include: "rules/work/genes/hgnc.smk" +include: "rules/work/genes/ncbi.smk" # Reference sequence--related rules. -include: "rules/reference/human.smk" +include: "rules/work/reference/human.smk" # Features (position and not variant specific). -include: "rules/annos/features/cons.smk" -include: "rules/annos/features/ensembl.smk" -include: "rules/annos/features/refseq.smk" -include: "rules/annos/features/tads.smk" -include: "rules/annos/features/ucsc.smk" +include: "rules/work/annos/features/cons.smk" +include: "rules/work/annos/features/ensembl.smk" +include: "rules/work/annos/features/refseq.smk" +include: "rules/work/annos/features/tads.smk" +include: "rules/work/annos/features/ucsc.smk" # Sequence variants and annotations. -include: "rules/annos/seqvars/cadd.smk" -include: "rules/annos/seqvars/dbnsfp.smk" -include: "rules/annos/seqvars/dbscsnv.smk" -include: "rules/annos/seqvars/dbsnp.smk" -include: "rules/annos/seqvars/gnomad_mtdna.smk" -include: "rules/annos/seqvars/gnomad_nuclear.smk" -include: "rules/annos/seqvars/helix.smk" +include: "rules/work/annos/seqvars/cadd.smk" +include: "rules/work/annos/seqvars/dbnsfp.smk" +include: "rules/work/annos/seqvars/dbscsnv.smk" +include: "rules/work/annos/seqvars/dbsnp.smk" +include: "rules/work/annos/seqvars/gnomad_mtdna.smk" +include: "rules/work/annos/seqvars/gnomad_nuclear.smk" +include: "rules/work/annos/seqvars/helix.smk" # Structural variant related. -include: "rules/annos/strucvars/dbvar.smk" -include: "rules/annos/strucvars/dgv.smk" -include: "rules/annos/strucvars/exac.smk" -include: "rules/annos/strucvars/g1k.smk" -include: "rules/annos/strucvars/gnomad.smk" +include: "rules/work/annos/strucvars/dbvar.smk" +include: "rules/work/annos/strucvars/dgv.smk" +include: "rules/work/annos/strucvars/exac.smk" +include: "rules/work/annos/strucvars/g1k.smk" +include: "rules/work/annos/strucvars/gnomad.smk" +# -- output directory --------------------------------------------------------------------------- +include: "rules/output/mehari/freqs.smk" diff --git a/rules/annos/features/cons.smk b/rules/annos/features/cons.smk deleted file mode 100644 index 1220110..0000000 --- a/rules/annos/features/cons.smk +++ /dev/null @@ -1,105 +0,0 @@ -## Rules related to UCSC conservation track. - - -rule annos_features_cons_download: # -- download UCSC conservation track - output: - fa="work/download/annos/{genome_release}/features/cons/{version}/knownGene.exonAA.fa.gz", - shell: - r""" - if [[ {wildcards.genome_release} == grch37 ]]; then - ucsc_name=hg19 - else - ucsc_name=hg38 - fi - - # Check version. - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" EXIT - - wget -O $TMPDIR/alignments.html \ - https://hgdownload.cse.ucsc.edu/goldenpath/$ucsc_name/multiz100way/alignments - version=$(grep knownGene.exonAA.fa.gz $TMPDIR/alignments.html | awk '{{ print $3 }}') - if [[ "$version" != "{wildcards.version}" ]]; then - >&2 echo "Version mismatch for knownGene.exonAA.fa.gz: expected {wildcards.version}, got $version" - exit 1 - fi - - # Actually download the file. - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --out={output.fa} \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - "https://hgdownload.cse.ucsc.edu/goldenpath/$ucsc_name/multiz100way/alignments/knownGene.exonAA.fa.gz" - """ - - -def input_annos_features_cons_to_vcf(wildcards): - """Input function for ``rule annos_features_cons_to_vcf``.""" - ensembl_version = { - "grch37": DV.ensembl_37, - "grch38": DV.ensembl_38, - }[wildcards.genome_release] - return { - "hgnc": f"work/genes/hgnc/{DV.today}/hgnc_info.jsonl", - "enst_ensg": ( - f"work/genes/enst_ensg/{wildcards.genome_release}/{ensembl_version}/enst_ensg.tsv" - ), - "reference": f"work/reference/{wildcards.genome_release}/reference.fa", - "fa": ( - f"work/download/annos/{wildcards.genome_release}/features/cons/" - f"{wildcards.version}/knownGene.exonAA.fa.gz" - ), - } - - -rule annos_features_cons_to_vcf: # -- build conservation VCF file - input: - unpack(input_annos_features_cons_to_vcf), - output: - vcf="work/download/annos/{genome_release}/features/cons/{version}/ucsc_conservation.vcf.gz", - tbi="work/download/annos/{genome_release}/features/cons/{version}/ucsc_conservation.vcf.gz.tbi", - shell: - r""" - python scripts/knowngeneaa.py \ - {input.hgnc} \ - {input.enst_ensg} \ - {input.reference} \ - {input.fa} \ - --output /dev/stdout \ - | bcftools sort \ - -O z \ - -o {output.vcf} - tabix -f {output.vcf} - """ - - -rule annos_features_cons_to_tsv: # -- convert conservation VCF to BED - input: - vcf="work/download/annos/{genome_release}/features/cons/{version}/ucsc_conservation.vcf.gz", - output: - tsv="work/annos/{genome_release}/features/cons/{version}/ucsc_conservation.tsv", - tsv_md5="work/annos/{genome_release}/features/cons/{version}/ucsc_conservation.tsv.md5", - shell: - r""" - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" EXIT - - echo -e "chromosome\tstart\tstop\thgnc_id\tenst_id\texon_num\texon_count\talignment" >$TMPDIR/header.txt - - cat $TMPDIR/header.txt \ - | grep -v '^$' \ - | tr '\n' '\t' \ - | sed -e 's/\t*$/\n/g' \ - > {output.tsv} - - bcftools query \ - -f "%CHROM\t%POS\t%END\t%HGNC_ID\t%ENST_ID\t%EXON\t%EXON_COUNT\t%ALIGNMENT\n" \ - {input.vcf} \ - | uniq - >> {output.tsv} - - md5sum {output.tsv} > {output.tsv_md5} - """ diff --git a/rules/annos/features/ensembl.smk b/rules/annos/features/ensembl.smk deleted file mode 100644 index 418ab36..0000000 --- a/rules/annos/features/ensembl.smk +++ /dev/null @@ -1,63 +0,0 @@ -## Rules related to ENSEMBL features. - - -rule annos_features_ensembl_gene_regions_grch37_download: # -- download ENSEMBL gene regions files (GRCh37) - output: - gtf="work/download/annos/grch37/ensembl_genes/{version}/Homo_sapiens.GRCh37.{version}.gtf.gz", - shell: - r""" - wget --no-check-certificate \ - -O {output.gtf} \ - 'https://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/Homo_sapiens.GRCh37.{wildcards.version}.gtf.gz' - """ - - -rule annos_features_ensembl_gene_regions_grch38_download: # -- download ENSEMBL gene regions files (GRCh38) - output: - gtf="work/download/annos/grch38/ensembl_genes/{version}/Homo_sapiens.GRCh38.{ensembl_version}.gtf.gz", - shell: - r""" - wget --no-check-certificate \ - -O {output.gtf} \ - 'https://ftp.ensembl.org/pub/release-{wildcards.ensembl_version}/gtf/homo_sapiens/Homo_sapiens.GRCh38.{wildcards.ensembl_version}.gtf.gz' - """ - - -def input_annos_features_ensembl_gene_regions_process(wildcards): - """Input function for ``rule annos_features_ensembl_gene_regions_process``.""" - genome_release_nolower = { - "grch37": "GRCh37", - "grch38": "GRCh38", - }[wildcards.genome_release] - return { - "gtf": ( - f"work/download/annos/{wildcards.genome_release}/ensembl_genes/{wildcards.version}/" - f"Homo_sapiens.{genome_release_nolower}.{wildcards.version}.gtf.gz" - ) - } - - -rule annos_features_ensembl_gene_regions_process: # -- process ENSEMBL gene regions files - input: - unpack(input_annos_features_ensembl_gene_regions_process), - output: - tsv="work/annos/{genome_release}/features/ensembl/{version}/ensembl_genes.bed.gz", - tsv_md5="work/annos/{genome_release}/features/ensembl/{version}/ensembl_genes.bed.gz.md5", - tsv_tbi="work/annos/{genome_release}/features/ensembl/{version}/ensembl_genes.bed.gz.tbi", - tsv_tbi_md5="work/annos/{genome_release}/features/ensembl/{version}/ensembl_genes.bed.gz.tbi.md5", - shell: - r""" - awk \ - -F $'\t' \ - -f scripts/features-ensembl-gene-regions.awk \ - <(zcat {input.gtf}) \ - | (set +e; egrep '^#|^X|^Y|^M|^[1-9]|^chrX|^chrY|^chrM|^chr[1-9]'; set -e) \ - | sort-bed - \ - | bgzip -c \ - > {output.tsv} - - tabix -f {output.tsv} - - md5sum {output.tsv} >{output.tsv_md5} - md5sum {output.tsv_tbi} >{output.tsv_tbi_md5} - """ diff --git a/rules/annos/features/refseq.smk b/rules/annos/features/refseq.smk deleted file mode 100644 index 58b6824..0000000 --- a/rules/annos/features/refseq.smk +++ /dev/null @@ -1,82 +0,0 @@ -## Rules related to RefSeq features. - - -rule annos_features_refseq_gene_regions_download_grch37: # -- download ENSEMBL gene regions files (GRCh37) - output: - acc="work/download/annos/grch37/refseq/{version}/chr_accessions_GRCh37.p13", - gtf="work/download/annos/grch37/refseq/{version}/GCF_000001405.25_GRCh37.p13_genomic.gtf.gz", - shell: - r""" - wget --no-check-certificate \ - -O {output.acc} \ - 'https://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/H_sapiens/ARCHIVE/ANNOTATION_RELEASE.{wildcards.version}/Assembled_chromosomes/chr_accessions_GRCh37.p13' - - wget --no-check-certificate \ - -O {output.gtf} \ - 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/annotation_releases/{wildcards.version}.20220307/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gtf.gz' - """ - - -rule annos_features_refseq_gene_regions_download_grch38: # -- download ENSEMBL gene regions files (GRCh38) - output: - report="work/download/annos/grch38/refseq/{version}/GCF_000001405.40_GRCh38.p14_assembly_report.txt", - acc="work/download/annos/grch38/refseq/{version}/chr_accessions_GRCh38.p14", - gtf="work/download/annos/grch38/refseq/{version}/GCF_000001405.40_GRCh38.p14_genomic.gtf.gz", - shell: - r""" - prefix=https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.40_GRCh38.p14 - - wget --no-check-certificate \ - -O {output.report} \ - "$prefix/GCF_000001405.40_GRCh38.p14_assembly_report.txt" - - echo -e "#Chromosome\tRefSeq Accession.version\tRefSeq\tgi\tGenBank Accession.version\tGenBank gi" \ - > {output.acc} - awk -F $'\t' 'BEGIN {{ OFS=fS }} {{ print $10, $7, $9, $5, "." }}' {output.report} \ - >> {output.acc} - - wget --no-check-certificate \ - -O {output.gtf} \ - "$prefix/GCF_000001405.40_GRCh38.p14_genomic.gtf.gz" - """ - - -def input_annos_features_refseq_gene_regions_process_grch38(wildcards): - """Input function for ``rule annos_features_refseq_gene_regions_process_grch38``.""" - if wildcards.genome_build == "grch37": - return { - "acc": f"work/download/annos/grch37/refseq/{wildcards.version}/chr_accessions_GRCh37.p13", - "gtf": f"work/download/annos/grch37/refseq/{wildcards.version}/GCF_000001405.25_GRCh37.p13_genomic.gtf.gz", - } - else: - return { - "acc": f"work/download/annos/grch38/refseq/{wildcards.version}/chr_accessions_GRCh38.p14", - "gtf": f"work/download/annos/grch38/refseq/{wildcards.version}/GCF_000001405.40_GRCh38.p14_genomic.gtf.gz", - } - - -rule annos_features_refseq_gene_regions_process_grch38: # -- process ENSEMBL gene regions files - input: - unpack(input_annos_features_refseq_gene_regions_process_grch38), - output: - tsv="work/annos/{genome_build}/features/refseq/{version}/refseq_genes.bed.gz", - tsv_md5="work/annos/{genome_build}/features/refseq/{version}/refseq_genes.bed.gz.md5", - tsv_tbi="work/annos/{genome_build}/features/refseq/{version}/refseq_genes.bed.gz.tbi", - tsv_tbi_md5="work/annos/{genome_build}/features/refseq/{version}/refseq_genes.bed.gz.tbi.md5", - shell: - r""" - awk \ - -F $'\t' \ - -f scripts/features-refseq-gene-regions.awk \ - {input.acc} \ - <(zcat {input.gtf}) \ - | egrep '^#|^X|^Y|^M|^[1-9]' \ - | sort-bed - \ - | bgzip -c \ - > {output.tsv} - - tabix -f {output.tsv} - - md5sum {output.tsv} >{output.tsv_md5} - md5sum {output.tsv_tbi} >{output.tsv_tbi_md5} - """ diff --git a/rules/annos/features/tads.smk b/rules/annos/features/tads.smk deleted file mode 100644 index ab38450..0000000 --- a/rules/annos/features/tads.smk +++ /dev/null @@ -1,54 +0,0 @@ -## Rules related to TAD. - - -rule annos_features_tads_download: # -- download TAD ZIP files from 3dgenome.org - output: - zip="work/download/annos/{genome_release}/tads/{genome_release}.TADs.zip", - shell: - r""" - if [[ "{wildcards.genome_release}" == grch37 ]]; then - name=hg19 - else - name=hg38 - fi - - wget --no-check-certificate \ - -O {output.zip} \ - http://3dgenome.fsm.northwestern.edu/downloads/$name.TADs.zip - """ - - -rule annos_features_tads_unzip: # -- unzip TAD ZIP files - input: - zip="work/download/annos/{genome_release}/tads/{genome_release}.TADs.zip", - output: - hesc="work/download/annos/{genome_release}/tads/H1-ESC_Dixon2015-raw_TADs.txt", - shell: - r""" - unzip -o -d $(dirname {input.zip}) -j {input.zip} - - cd $(dirname {input.zip}) - - if $(set +e; ls | grep Dixon_2015 >/dev/null); then - for f in *Dixon_2015*; do - mv $f ${{f/Dixon_/Dixon}} - done - fi - """ - - -rule annos_features_tads_process: # -- process TAD files - input: - hesc="work/download/annos/{genome_release}/tads/H1-ESC_Dixon2015-raw_TADs.txt", - output: - bed_hesc="work/annos/{genome_release}/features/tads/dixon2015/hesc.bed", - bed_hesc_md5="work/annos/{genome_release}/features/tads/dixon2015/hesc.bed.md5", - shell: - r""" - echo -e "#chrom\tbegin\tend" >{output.bed_hesc} - cat {input.hesc} \ - | {{ if [[ "{wildcards.genome_release}" == "grch37" ]]; then sed -e 's/^chr//g'; else cat; fi }} \ - >>{output.bed_hesc} - - md5sum {output.bed_hesc} >{output.bed_hesc_md5} - """ diff --git a/rules/annos/features/ucsc.smk b/rules/annos/features/ucsc.smk deleted file mode 100644 index 905eb43..0000000 --- a/rules/annos/features/ucsc.smk +++ /dev/null @@ -1,113 +0,0 @@ -## Rules related to features from UCSC Genome Browser. - - -rule features_ucsc_download: # -- download of UCSC tracks - output: - txt="work/download/annos/{genome_release}/features/ucsc/{version}/{filename}", - shell: - r""" - # Check version. - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" EXIT - - if [[ "{wildcards.genome_release}" == "grch37" ]]; then - ucsc_name=hg19 - else - ucsc_name=hg38 - fi - - wget -O $TMPDIR/listing.html https://hgdownload.cse.ucsc.edu/goldenpath/$ucsc_name/database - version=$(grep {wildcards.filename} $TMPDIR/listing.html | awk '{{ print $3 }}') - if [[ "$version" != "{wildcards.version}" ]]; then - >&2 echo "Version mismatch for {wildcards.filename}: expected {version}, got $version" - exit 1 - fi - - # Actually perform the download. - wget -O {output.txt} https://hgdownload.cse.ucsc.edu/goldenpath/$ucsc_name/database/{wildcards.filename} - """ - - -rule features_ucsc_genomic_super_dups_process: # -- processing of UCSC genomicSuperDups - input: - txt="work/download/annos/{genome_release}/features/ucsc/{version}/genomicSuperDups.txt.gz", - output: - bed="work/annos/{genome_release}/features/ucsc/{version}/genomicSuperDups.bed.gz", - bed_md5="work/annos/{genome_release}/features/ucsc/{version}/genomicSuperDups.bed.gz.md5", - bed_tbi="work/annos/{genome_release}/features/ucsc/{version}/genomicSuperDups.bed.gz.tbi", - bed_tbi_md5="work/annos/{genome_release}/features/ucsc/{version}/genomicSuperDups.bed.gz.tbi.md5", - shell: - r""" - ( - echo -e "#chrom\tbegin\tend\tlabel" - zcat {input.txt} \ - | cut -f 2,3,4,5 \ - | {{ if [[ "{wildcards.genome_release}" == "grch37" ]]; then sed -e 's/^chr//g'; else cat; fi }} \ - | (set +e; egrep '^#|^X|^Y|^M|^[1-9]|^chrX|^chrY|^chrM|^chr[1-9]'; set -e) \ - | (set +e; egrep -v '^Un|^chrU|_random|_fix|_alt|_hap'; set -e) \ - ) \ - | bgzip -c \ - > {output.bed} - - tabix -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ - - -rule features_ucsc_rmsk_process: # -- processing of UCSC rmsk - input: - txt="work/download/annos/{genome_release}/features/ucsc/{version}/rmsk.txt.gz", - output: - bed="work/annos/{genome_release}/features/ucsc/{version}/rmsk.bed.gz", - bed_md5="work/annos/{genome_release}/features/ucsc/{version}/rmsk.bed.gz.md5", - bed_tbi="work/annos/{genome_release}/features/ucsc/{version}/rmsk.bed.gz.tbi", - bed_tbi_md5="work/annos/{genome_release}/features/ucsc/{version}/rmsk.bed.gz.tbi.md5", - shell: - r""" - ( - echo -e "#chrom\tbegin\tend\tlabel" - zcat {input.txt} \ - | awk -F $'\t' 'BEGIN {{ OFS=FS }} {{ if ($12 == $13) {{ label = $13 "/" $11 }} else {{ label = $12 "/" $13 "/" $11 }} print $6, $7, $8, label }}' \ - | {{ if [[ "{wildcards.genome_release}" == "grch37" ]]; then sed -e 's/^chr//g'; else cat; fi }} \ - | (set +e; egrep '^#|^X|^Y|^M|^[1-9]|^chrX|^chrY|^chrM|^chr[1-9]'; set -e) \ - | (set +e; egrep -v '^Un|^chrU|_random|_fix|_alt|_hap'; set -e) \ - ) \ - | bgzip -c \ - > {output.bed} - - tabix -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ - - -rule features_ucsc_liftover_process: # -- process of UCSC *SeqLiftOverPsl - input: - txt="work/download/annos/{genome_release}/features/ucsc/{version}/{prefix}SeqLiftOverPsl.txt.gz", - output: - bed="work/annos/{genome_release}/features/ucsc/{version}/{prefix}SeqLiftOverPsl.bed.gz", - bed_md5="work/annos/{genome_release}/features/ucsc/{version}/{prefix}SeqLiftOverPsl.bed.gz.md5", - bed_tbi="work/annos/{genome_release}/features/ucsc/{version}/{prefix}SeqLiftOverPsl.bed.gz.tbi", - bed_tbi_md5="work/annos/{genome_release}/features/ucsc/{version}/{prefix}SeqLiftOverPsl.bed.gz.tbi.md5", - shell: - r""" - ( - echo -e "#chrom\tbegin\tend\tlabel" - zcat {input.txt} \ - | awk -F $'\t' 'BEGIN {{ OFS=FS }} {{ if ($11 ~ /{wildcards.prefix}/ && $15 !~ /random/ && $15 !~ /hap/) {{ print $15, $17, $18, $11 }} }}' \ - | sort-bed - \ - | {{ if [[ "{wildcards.genome_release}" == "grch37" ]]; then sed -e 's/^chr//g'; else cat; fi }} \ - | (set +e; egrep '^#|^X|^Y|^M|^[1-9]|^chrX|^chrY|^chrM|^chr[1-9]'; set -e) \ - | (set +e; egrep -v '^Un|^chrU|_random|_fix|_alt|_hap'; set -e) \ - ) \ - | bgzip -c \ - > {output.bed} - - tabix -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ diff --git a/rules/annos/seqvars/cadd.smk b/rules/annos/seqvars/cadd.smk deleted file mode 100644 index 668c091..0000000 --- a/rules/annos/seqvars/cadd.smk +++ /dev/null @@ -1,42 +0,0 @@ -## Rules related to the CADD score and its feature annotations. - -#: Prefix to the CADD download URL. -CADD_PREFIX = f"https://kircherlab.bihealth.org/download/CADD" - - -rule annos_seqvars_cadd_download: # -- download CADD data - output: - tsv="work/download/annos/{genome_release}/seqvars/cadd/{version}/{filename}.tsv.gz", - tsv_tbi="work/download/annos/{genome_release}/seqvars/cadd/{version}/{filename}.tsv.gz.tbi", - shell: - r""" - for path in {output}; - do - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --out=$path \ - --split=16 \ - --max-concurrent-downloads=16 \ - --max-connection-per-server=16 \ - {CADD_PREFIX}/v{wildcards.version}/$(echo {wildcards.genome_release} | sed -e 's/grch/GRCh/')/$(basename $path) - done - """ - - -# rule annos_cadd_process_37: # -- process CADD data for GRCh37 -# input: -# "work/download/annos/grch37/seqvars/cadd/v{version}/whole_genome_SNVs_inclAnno.tsv.gz", -# "work/download/annos/grch37/seqvars/cadd/v{version}/whole_genome_SNVs_inclAnno.tsv.gz.tbi", -# "work/download/annos/grch37/seqvars/cadd/v{version}/InDels_inclAnno.tsv.gz", -# "work/download/annos/grch37/seqvars/cadd/v{version}/InDels_inclAnno.tsv.gz.tbi", -# output: -# touch("annos/grch37/cadd/.done"), -# rule annos_cadd_process_38: # -- process CADD data for GRCh38 -# input: -# "work/download/annos/grch38/seqvars/cadd/v{version}/whole_genome_SNVs_inclAnno.tsv.gz", -# "work/download/annos/grch38/seqvars/cadd/v{version}/whole_genome_SNVs_inclAnno.tsv.gz.tbi", -# "work/download/annos/grch38/seqvars/cadd/v{version}/gnomad.genomes.r3.0.indel_inclAnno.tsv.gz", -# "work/download/annos/grch38/seqvars/cadd/v{version}/gnomad.genomes.r3.0.indel_inclAnno.tsv.gz.tbi", -# output: -# touch("annos/grch38/cadd/.done"), diff --git a/rules/annos/seqvars/dbnsfp.smk b/rules/annos/seqvars/dbnsfp.smk deleted file mode 100644 index 55e733f..0000000 --- a/rules/annos/seqvars/dbnsfp.smk +++ /dev/null @@ -1,85 +0,0 @@ -## Rules related to dbNSFP. - - -#: Download URL for dbNSFP 4.4a -DBNSFP_ACADEMIC_URL = "https://usf.box.com/shared/static/bvfzmkpgtphvbmmrvb2iyl2jl21o49kc" -#: Download URL for dbNSFP 4.4c -DBNSFP_COMMMERCIAL_URL = "https://usf.box.com/shared/static/a84zcdlkx2asq2nxh6xr2gdb4csmyvhk" - - -def files_dbnsfp(): - """Helper that returns the files within the dbNSFP archive.""" - lst = [ - "dbNSFP{version}{variant}.readme.txt", - "dbNSFP{version}{variant}_variant.chr1.gz", - "dbNSFP{version}{variant}_variant.chr10.gz", - "dbNSFP{version}{variant}_variant.chr11.gz", - "dbNSFP{version}{variant}_variant.chr12.gz", - "dbNSFP{version}{variant}_variant.chr13.gz", - "dbNSFP{version}{variant}_variant.chr14.gz", - "dbNSFP{version}{variant}_variant.chr15.gz", - "dbNSFP{version}{variant}_variant.chr16.gz", - "dbNSFP{version}{variant}_variant.chr17.gz", - "dbNSFP{version}{variant}_variant.chr18.gz", - "dbNSFP{version}{variant}_variant.chr19.gz", - "dbNSFP{version}{variant}_variant.chr2.gz", - "dbNSFP{version}{variant}_variant.chr20.gz", - "dbNSFP{version}{variant}_variant.chr21.gz", - "dbNSFP{version}{variant}_variant.chr22.gz", - "dbNSFP{version}{variant}_variant.chr3.gz", - "dbNSFP{version}{variant}_variant.chr4.gz", - "dbNSFP{version}{variant}_variant.chr5.gz", - "dbNSFP{version}{variant}_variant.chr6.gz", - "dbNSFP{version}{variant}_variant.chr7.gz", - "dbNSFP{version}{variant}_variant.chr8.gz", - "dbNSFP{version}{variant}_variant.chr9.gz", - "dbNSFP{version}{variant}_variant.chrM.gz", - "dbNSFP{version}{variant}_variant.chrX.gz", - "dbNSFP{version}{variant}_variant.chrY.gz", - "dbNSFP{version}_gene.complete.gz", - "dbNSFP{version}_gene.gz", - "LICENSE.txt", - "try.vcf", - "tryhg18.in", - "tryhg19.in", - "tryhg38.in", - ] - return ["work/download/annos/grch37/seqvars/dbnsfp/{version}{variant}/%s" % e for e in lst] - - -rule annos_seqvars_dbnsfp_download: # -- download dbNSFP ZIP file - output: - zip="work/download/annos/grch37/seqvars/dbnsfp/{version}{variant}/dbNSFP{version}{variant}.zip", - wildcard_constraints: - version=r"\d\.\d", - threads: 8 - shell: - r""" - if [[ "{wildcards.variant}" == a ]]; then - url={DBNSFP_ACADEMIC_URL} - else - url={DBNSFP_COMMMERCIAL_URL} - fi - - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --out={output.zip} \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - $url - """ - - -rule annos_seqvars_dbnsfp_extract: # -- extract dbNSFP ZIP file - input: - zip="work/download/annos/grch37/seqvars/dbnsfp/{version}{variant}/dbNSFP{version}{variant}.zip", - output: - files_dbnsfp(), - wildcard_constraints: - version=r"\d\.\d", - shell: - r""" - unzip -d $(dirname {input.zip}) {input.zip} - """ diff --git a/rules/annos/seqvars/dbscsnv.smk b/rules/annos/seqvars/dbscsnv.smk deleted file mode 100644 index 9ec53e9..0000000 --- a/rules/annos/seqvars/dbscsnv.smk +++ /dev/null @@ -1,37 +0,0 @@ -## Rules related to dbscSNV. - - -def files_dbscsnv(version: str = "1.1"): - """Files contained in the dbscSNV ZIP file.""" - chroms = [str(i) for i in range(1, 23)] + ["X", "Y"] - return [ - f"work/download/annos/grch37/seqvars/dbscsnv/{version}/dbscSNV{version}.chr{chrom}" - for chrom in chroms - ] - - -rule annos_seqvars_dbscsnv_download: # -- download dbscSNV ZIP file - output: - zip="work/download/annos/grch37/seqvars/dbscsnv/1.1/dbscSNV1.1.zip", - shell: - r""" - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --out={output.zip} \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbscSNV1.1.zip - """ - - -rule annos_seqvars_dbscsnv_extract: # -- extract dbscSNV ZIP file - input: - zip="work/download/annos/grch37/seqvars/dbscsnv/1.1/dbscSNV1.1.zip", - output: - files_dbscsnv(), - shell: - r""" - unzip -d $(dirname {input.zip}) {input.zip} - """ diff --git a/rules/annos/seqvars/dbsnp.smk b/rules/annos/seqvars/dbsnp.smk deleted file mode 100644 index 94ecea1..0000000 --- a/rules/annos/seqvars/dbsnp.smk +++ /dev/null @@ -1,30 +0,0 @@ -## Rules related to dbSNP. - - -rule annos_dbsnp_download: # -- download dbSNP data - output: - vcf="work/download/annos/{genome_release}/seqvars/dbsnp/{version}/dbsnp.vcf.gz", - vcf_tbi="work/download/annos/{genome_release}/seqvars/dbsnp/{version}/dbsnp.vcf.gz.tbi", - shell: - r""" - # Check the version. - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" EXIT - - if [[ "{wildcards.genome_release}" == grch37 ]]; then - reference=human_9606_b151_GRCh37p13 - else - reference=human_9606_b151_GRCh38p7 - fi - - # Perform the actual download. - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --out={output.vcf} \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - https://ftp.ncbi.nih.gov/snp/organisms/$reference/VCF/00-All.vcf.gz - tabix -f {output.vcf} - """ diff --git a/rules/annos/seqvars/gnomad_mtdna.smk b/rules/annos/seqvars/gnomad_mtdna.smk deleted file mode 100644 index 07e2ac0..0000000 --- a/rules/annos/seqvars/gnomad_mtdna.smk +++ /dev/null @@ -1,45 +0,0 @@ -## Rules related to gnomAD mtDNA. - - -rule annos_gnomad_mtdna_download: # -- download gnomAD mtDNA - output: - dl="work/download/annos/{genome_build}/seqvars/gnomad_mtdna/{version}/gnomad.genomes.v{version}.sites.chrM.vcf.bgz", - shell: - r""" - wget --no-check-certificate \ - -O {output.dl} \ - https://datasetgnomad.blob.core.windows.net/dataset/release/{wildcards.version}/vcf/genomes/gnomad.genomes.v{wildcards.version}.sites.chrM.vcf.bgz - """ - - -rule annos_gnomad_mtdna_process: # -- process gnomAD mtDNA - input: - dl="work/download/annos/{genome_build}/seqvars/gnomad_mtdna/{version}/gnomad.genomes.v{version}.sites.chrM.vcf.bgz", - output: - vcf="work/annos/{genome_build}/seqvars/gnomad_mtdna/{version}/gnomad_mtdna.vcf.gz", - vcf_md5="work/annos/{genome_build}/seqvars/gnomad_mtdna/{version}/gnomad_mtdna.vcf.gz.md5", - vcf_tbi="work/annos/{genome_build}/seqvars/gnomad_mtdna/{version}/gnomad_mtdna.vcf.gz.tbi", - vcf_tbi_md5="work/annos/{genome_build}/seqvars/gnomad_mtdna/{version}/gnomad_mtdna.vcf.gz.tbi.md5", - shell: - r""" - if [[ {wildcards.genome_build} == grch37 ]]; then - zcat {input.dl} \ - | sed \ - -e 's/chrM/MT/g' \ - -e 's/GRCh38_MT/GRCh37/g' \ - -e 's/,GERP_DIST/\&GERP_DIST/g' \ - -e 's/,BP_DIST/\&BP_DIST/g' \ - -e 's/,DIST_FROM_LAST_EXON/\&DIST_FROM_LAST_EXON/g' \ - -e 's/,50_BP_RULE/\&50_BP_RULE/g' \ - -e 's/,PHYLOCSF_TOO_SHORT/\&PHYLOCSF_TOO_SHORT/g' \ - | bgzip -c \ - > {output.vcf} - else - cp {input.dl} {output.vcf} - fi - - tabix -f {output.vcf} - - md5sum {output.vcf} > {output.vcf_md5} - md5sum {output.vcf_tbi} > {output.vcf_tbi_md5} - """ diff --git a/rules/annos/seqvars/gnomad_nuclear.smk b/rules/annos/seqvars/gnomad_nuclear.smk deleted file mode 100644 index 01826ac..0000000 --- a/rules/annos/seqvars/gnomad_nuclear.smk +++ /dev/null @@ -1,143 +0,0 @@ -## Rules related to gnomAD exomes/genomes sequence variants. - - -# URL prefix of gnomAD downloads. -# GNOMAD_PREFIX = "https://gnomad-public-us-east-1.s3.amazonaws.com/release" -# GNOMAD_PREFIX = "https://datasetgnomad.blob.core.windows.net/dataset/release" -GNOMAD_PREFIX = "https://storage.googleapis.com/gcp-public-data--gnomad/release" - - -rule annos_gnomad_nuclear_download_grch37: # -- download gnomAD v2 exomes/genomes for GRCh37 - output: - vcf="work/download/annos/grch37/seqvars/gnomad_{kind}/{version}/gnomad.{kind}.r{version}.sites.{chrom}.vcf.bgz", - vcf_tbi="work/download/annos/grch37/seqvars/gnomad_{kind}/{version}/gnomad.{kind}.r{version}.sites.{chrom}.vcf.bgz.tbi", - wildcard_constraints: - kind=r"[a-z]+", - threads: 8 - resources: - runtime="48h", - shell: - r""" - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - --out={output.vcf} \ - {GNOMAD_PREFIX}/{wildcards.version}/vcf/{wildcards.kind}/gnomad.{wildcards.kind}.r{wildcards.version}.sites.{wildcards.chrom}.vcf.bgz - - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - --out={output.vcf_tbi} \ - {GNOMAD_PREFIX}/{wildcards.version}/vcf/{wildcards.kind}/gnomad.{wildcards.kind}.r{wildcards.version}.sites.{wildcards.chrom}.vcf.bgz.tbi - """ - - -rule annos_gnomad_nuclear_download_grch38_liftover_v2: # -- download gnomAD v2 exomes lift-over for GRCh38 - output: - vcf="work/download/annos/grch38/seqvars/gnomad_{kind}/{version}/gnomad.{kind}.r{version}.sites.{chrom}.liftover_grch38.vcf.bgz", - vcf_tbi="work/download/annos/grch38/seqvars/gnomad_{kind}/{version}/gnomad.{kind}.r{version}.sites.{chrom}.liftover_grch38.vcf.bgz.tbi", - wildcard_constraints: - kind=r"[a-z]+", - threads: 8 - resources: - runtime="48h", - shell: - r""" - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - --out={output.vcf} \ - {GNOMAD_PREFIX}/{wildcards.version}/liftover_grch38/vcf/{wildcards.kind}/gnomad.{wildcards.kind}.r{wildcards.version}.sites.{wildcards.chrom}.liftover_grch38.vcf.bgz - - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - --out={output.vcf_tbi} \ - {GNOMAD_PREFIX}/{wildcards.version}/liftover_grch38/vcf/{wildcards.kind}/gnomad.{wildcards.kind}.r{wildcards.version}.sites.{wildcards.chrom}.liftover_grch38.vcf.bgz.tbi - """ - - -rule annos_gnomad_nuclear_download_grch38_v3: # -- download gnomAD genomes v3 - output: - vcf="work/download/annos/grch38/seqvars/gnomad_{kind}/{version}/gnomad.{kind}.v{version}.sites.chr{chrom}.vcf.bgz", - vcf_tbi="work/download/annos/grch38/seqvars/gnomad_{kind}/{version}/gnomad.{kind}.v{version}.sites.chr{chrom}.vcf.bgz.tbi", - wildcard_constraints: - kind=r"[a-z]+", - threads: 8 - resources: - runtime="48h", - shell: - r""" - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - --out={output.vcf} \ - {GNOMAD_PREFIX}/{wildcards.version}/vcf/{wildcards.kind}/gnomad.{wildcards.kind}.v{wildcards.version}.sites.chr{wildcards.chrom}.vcf.bgz - - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - --out={output.vcf_tbi} \ - {GNOMAD_PREFIX}/{wildcards.version}/vcf/{wildcards.kind}/gnomad.{wildcards.kind}.v{wildcards.version}.sites.chr{wildcards.chrom}.vcf.bgz.tbi - """ - - -def input_annos_gnomad_nuclear_grch37(wildcards): - """Input files for gnomAD exomes/genomes GRCh37.""" - chroms = list(range(1, 23)) + ["X"] - # chrY is only available for GRCh37 genomes - if wildcards.kind == "exomes": - chroms.append("Y") - tpl = "work/download/annos/grch37/seqvars/gnomad_{kind}/{version}/gnomad.{kind}.r{version}.sites.{chrom}.vcf.bgz" - return [tpl.format(kind=wildcards.kind, version=DV.gnomad_v2, chrom=chrom) for chrom in chroms] - - -def input_annos_gnomad_nuclear_grch38(wildcards): - """Input files for gnomAD exomes/genomes GRCh38.""" - chroms = list(range(1, 23)) + ["X", "Y"] - if wildcards.kind == "exomes": - tpl = "work/download/annos/grch38/seqvars/gnomad_{kind}/{version}/gnomad.{kind}.r{version}.sites.{chrom}.liftover_grch38.vcf.bgz" - return [ - tpl.format(kind=wildcards.kind, version=DV.gnomad_v2, chrom=chrom) for chrom in chroms - ] - else: - tpl = "work/download/annos/grch38/seqvars/gnomad_{kind}/{version}/gnomad.{kind}.v{version}.sites.chr{chrom}.vcf.bgz" - return [ - tpl.format(kind=wildcards.kind, version=DV.gnomad_v3, chrom=chrom) for chrom in chroms - ] - - -rule annos_seqvars_gnomad_nuclear_grch37: # -- collect gnomAD exomes/genomes for GRCh37 - input: - input_annos_gnomad_nuclear_grch37, - output: - touch("work/annos/grch37/seqvars/gnomad_{kind}/{version}/.done"), - wildcard_constraints: - kind=r"[a-z]+", - - -rule annos_seqvars_gnomad_nuclear_grch38: # -- collect gnomAD exomes/genomes for GRCh38 - input: - input_annos_gnomad_nuclear_grch38, - output: - touch("work/annos/grch38/seqvars/gnomad_{kind}/{version}/.done"), - wildcard_constraints: - kind=r"[a-z]+", diff --git a/rules/annos/seqvars/helix.smk b/rules/annos/seqvars/helix.smk deleted file mode 100644 index 965ed19..0000000 --- a/rules/annos/seqvars/helix.smk +++ /dev/null @@ -1,39 +0,0 @@ -## Rules related to HelixMtDb. - - -rule annos_seqvars_helixmtdb_download: # -- download HelixMtDb data - output: - tsv="work/download/annos/{genome_build}/seqvars/helixmtdb/20200327/helixmtdb.tsv", - shell: - r""" - wget \ - --no-check-certificate \ - -O {output} \ - https://helix-research-public.s3.amazonaws.com/mito/HelixMTdb_20200327.tsv - """ - - -rule annos_seqvars_helixmtdb_convert: # -- process HelixMtDb data - input: - tsv="work/download/annos/{genome_build}/seqvars/helixmtdb/20200327/helixmtdb.tsv", - output: - vcf="work/annos/{genome_build}/seqvars/helixmtdb/20200327/helixmtdb.vcf.gz", - vcf_tbi="work/annos/{genome_build}/seqvars/helixmtdb/20200327/helixmtdb.vcf.gz.tbi", - shell: - r""" - cat {input.tsv} \ - | python3 scripts/helix-to-vcf.py \ - > {output.vcf}.tmp - - if [[ {wildcards.genome_build} == GRCh37 ]]; then - sed -e 's/chrM/MT/g' {output.vcf}.tmp \ - | bgzip -c \ - > {output.vcf} - else - bgzip -c {output.vcf}.tmp >{output.vcf} - fi - - tabix -f {output.vcf} - - rm -f {output.vcf}.tmp - """ diff --git a/rules/annos/strucvars/dbvar.smk b/rules/annos/strucvars/dbvar.smk deleted file mode 100644 index bf427cd..0000000 --- a/rules/annos/strucvars/dbvar.smk +++ /dev/null @@ -1,74 +0,0 @@ -## Rules related to structural variants in dbVar. - - -rule annos_strucvars_dbvar_download: # -- download dbVar files - output: - "work/download/annos/{genome_release}/strucvars/dbvar/{version}/{genome_release_nolower}.nr_{sv_type}.tsv.gz", - shell: - r""" - # Ensure that the version is the latest in the release notes. - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" EXIT - - wget --no-check-certificate \ - -O $TMPDIR/release_notes_listing \ - https://ftp.ncbi.nlm.nih.gov/pub/dbVar/sandbox/sv_datasets/nonredundant/release_notes - - grep NR_stats.2 $TMPDIR/release_notes_listing \ - | tail -n 1 \ - | grep "$(echo {wildcards.version} | sed 's/-//g')" \ - || {{ >&2 echo "Version {wildcards.version} is not latest in dbVar release notes"; exit 1; }} - - # Actual download. - wget --no-check-certificate \ - -O {output} \ - https://ftp.ncbi.nlm.nih.gov/pub/dbVar/sandbox/sv_datasets/nonredundant/{wildcards.sv_type}/{wildcards.genome_release_nolower}.nr_{wildcards.sv_type}.tsv.gz - """ - - -def input_annos_strucvars_dbvar_process(wildcards): - """Input functions for rule ``rule annos_strucvars_dbvar_process``.""" - mapping = { - "grch37": "GRCh37", - "grch38": "GRCh38", - } - tpl = ( - "work/download/annos/{genome_release}/strucvars/dbvar/{version}/" - "{genome_release_nolower}.nr_{typ}.tsv.gz" - ) - return { - "download": [ - tpl.format( - genome_release=wildcards.genome_release, - version=wildcards.version, - genome_release_nolower=mapping[wildcards.genome_release], - typ=typ, - ) - for typ in ("deletions", "duplications", "insertions") - ] - } - - -rule annos_strucvars_dbvar_process: # -- process dbVar files - input: - unpack(input_annos_strucvars_dbvar_process), - output: - bed="work/annos/{genome_release}/strucvars/dbvar/{version}/dbvar.bed.gz", - bed_md5="work/annos/{genome_release}/strucvars/dbvar/{version}/dbvar.bed.gz.md5", - bed_tbi="work/annos/{genome_release}/strucvars/dbvar/{version}/dbvar.bed.gz.tbi", - bed_tbi_md5="vardbs/{genome_release}/strucvars/dbvar/{version}/dbvar.bed.gz.tbi.md5", - shell: - r""" - awk \ - -F $'\t' \ - -f scripts/vardbs-strucvar-dbvar.awk \ - <(zcat {input.download}) \ - | sort-bed - \ - | bgzip -c \ - > {output.bed} - - tabix -p bed -S 1 -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ diff --git a/rules/annos/strucvars/dgv.smk b/rules/annos/strucvars/dgv.smk deleted file mode 100644 index 98cd1c0..0000000 --- a/rules/annos/strucvars/dgv.smk +++ /dev/null @@ -1,112 +0,0 @@ -## Rules related to structural variants in DGV and DGV-GS. - - -rule annos_strucvars_dgv_download: # -- download DGV files - output: - txt="work/download/annos/{genome_release}/strucvars/dgv/{version}/{filename}", - shell: - r""" - wget --no-check-certificate \ - -O {output.txt} \ - http://dgv.tcag.ca/dgv/docs/{wildcards.filename} - """ - - -def input_annos_strucvars_dgv_process(wildcards): - """Input function for ``rule annos_strucvars_dgv_process``.""" - mapping = { - "grch37": { - "genome_release_nolower": "GRCh37", - "genome_release_ucsc": "hg19", - }, - "grch38": { - "genome_release_nolower": "GRCh38", - "genome_release_ucsc": "hg38", - }, - } - tpl = ( - "work/download/annos/{genome_release}/strucvars/dgv/{version}/" - "{genome_release_nolower}_{genome_release_ucsc}_variants_{version}.txt" - ) - return { - "txt": tpl.format( - **mapping[wildcards.genome_release], - **wildcards, - ) - } - - -rule annos_strucvars_dgv_process: # -- download DGV files - input: - unpack(input_annos_strucvars_dgv_process), - output: - bed="work/annos/{genome_release}/strucvars/dgv/{version}/dgv.bed.gz", - bed_md5="work/annos/{genome_release}/strucvars/dgv/{version}/dgv.bed.gz.md5", - bed_tbi="work/annos/{genome_release}/strucvars/dgv/{version}/dgv.bed.gz.tbi", - bed_tbi_md5="work/annos/{genome_release}/strucvars/dgv/{version}/dgv.bed.gz.tbi.md5", - shell: - r""" - awk \ - -F $'\t' \ - -f scripts/vardbs-strucvar-dgv.awk \ - {input.txt} \ - | grep -v _gl \ - | sort-bed - \ - | bgzip -c \ - > {output.bed} - - tabix -p bed -S 1 -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ - - -rule annos_strucvars_dgv_gs_download: # -- download DGV GS files - output: - gff3="work/download/annos/{genome_release}/strucvars/dgv_gs/{version}/{filename}", - shell: - r""" - wget --no-check-certificate \ - -O {output.gff3} \ - http://dgv.tcag.ca/dgv/docs/{wildcards.filename} - """ - - -def input_annos_strucvars_dgv_gs_process(wildcards): - """Input function for ``rule annos_strucvars_dgv_gs_process``.""" - mapping = { - "grch37": "DGV.GS.March2016.50percent.GainLossSep.Final.hg19.gff3", - "grch38": "DGV.GS.hg38.gff3", - } - return { - "gff3": ( - "work/download/annos/{genome_release}/strucvars/dgv_gs/{version}/{filename}" - ).format(filename=mapping[wildcards.genome_release], **wildcards) - } - - -rule annos_strucvars_dgv_gs_process: # -- download DGV GS files - input: - unpack(input_annos_strucvars_dgv_gs_process), - output: - bed="work/annos/{genome_release}/strucvars/dgv_gs/{version}/dgv_gs.bed.gz", - bed_md5="work/annos/{genome_release}/strucvars/dgv_gs/{version}/dgv_gs.bed.gz.md5", - bed_tbi="work/annos/{genome_release}/strucvars/dgv_gs/{version}/dgv_gs.bed.gz.tbi", - bed_tbi_md5="work/annos/{genome_release}/strucvars/dgv_gs/{version}/dgv_gs.bed.gz.tbi.md5", - shell: - r""" - awk \ - -F $'\t' \ - -f scripts/vardbs-strucvar-dgv_gs.awk \ - {input.gff3} \ - | grep -v _gl \ - | sort-bed - \ - | bgzip -c \ - > {output.bed} - - tabix -p bed -S 1 -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ diff --git a/rules/annos/strucvars/exac.smk b/rules/annos/strucvars/exac.smk deleted file mode 100644 index bff543d..0000000 --- a/rules/annos/strucvars/exac.smk +++ /dev/null @@ -1,36 +0,0 @@ -## Rules related to ExAC-CNV. - - -rule annos_strucvars_exac_grch37_download: # -- process ExAC-CNV files - output: - bed="work/download/annos/grch37/strucvars/exac/0.3.1/exac-final.autosome-1pct-sq60-qc-prot-coding.cnv.bed", - shell: - r""" - wget --no-check-certificate \ - -O {output.bed} \ - ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3.1/cnv/exac-final.autosome-1pct-sq60-qc-prot-coding.cnv.bed - """ - - -rule annos_strucvars_exac_grch37_process: # -- process ExAC-CNV files - input: - bed="work/download/annos/grch37/strucvars/exac/0.3.1/exac-final.autosome-1pct-sq60-qc-prot-coding.cnv.bed", - output: - bed="work/annos/grch37/strucvars/exac/0.3.1/exac.bed.gz", - bed_md5="work/annos/grch37/strucvars/exac/0.3.1/exac.bed.gz.md5", - bed_tbi="work/annos/grch37/strucvars/exac/0.3.1/exac.bed.gz.tbi", - bed_tbi_md5="work/annos/grch37/strucvars/exac/0.3.1/exac.bed.gz.tbi.md5", - shell: - r""" - awk \ - -f scripts/vardbs-grch37-strucvar-exac.awk \ - {input.bed} \ - | sort-bed - \ - | bgzip -c \ - > {output.bed} - - tabix -p bed -S 1 -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ diff --git a/rules/annos/strucvars/g1k.smk b/rules/annos/strucvars/g1k.smk deleted file mode 100644 index 791d61e..0000000 --- a/rules/annos/strucvars/g1k.smk +++ /dev/null @@ -1,37 +0,0 @@ -## Rules related to Thousand Genomes SVs. - - -rule annos_strucvars_g1k_grch37_download: # -- download Thousand Genomes SVs - output: - vcf="work/download/annos/grch37/strucvars/g1k/phase3-v2/ALL.wgs.mergedSV.v8.20130502.svs.genotypes.vcf.gz", - shell: - r""" - wget --no-check-certificate \ - -O {output.vcf} \ - https://ftp-trace.ncbi.nih.gov/1000genomes/ftp/phase3/integrated_sv_map/ALL.wgs.integrated_sv_map_v2.20130502.svs.genotypes.vcf.gz - """ - - -rule annos_strucvars_g1k_grch37_process: # -- process Thousand Genomes SVs - input: - vcf="work/download/annos/grch37/strucvars/g1k/phase3-v2/ALL.wgs.mergedSV.v8.20130502.svs.genotypes.vcf.gz", - output: - bed="work/annos/grch37/strucvars/g1k/phase3-v2/g1k.bed.gz", - bed_md5="work/annos/grch37/strucvars/g1k/phase3-v2/g1k.bed.gz.md5", - bed_tbi="work/annos/grch37/strucvars/g1k/phase3-v2/g1k.bed.gz.tbi", - bed_tbi_md5="work/annos/grch37/strucvars/g1k/phase3-v2/g1k.bed.gz.tbi.md5", - shell: - r""" - zcat {input.vcf} \ - | awk \ - -F $'\t' \ - -f scripts/vardbs-grch37-strucvar-g1k.awk \ - | sort-bed - \ - | bgzip -c \ - > {output.bed} - - tabix -p bed -S 1 -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ diff --git a/rules/annos/strucvars/gnomad.smk b/rules/annos/strucvars/gnomad.smk deleted file mode 100644 index 45418a9..0000000 --- a/rules/annos/strucvars/gnomad.smk +++ /dev/null @@ -1,43 +0,0 @@ -## Rules related to gnomAD-SV. - - -rule annos_strucvars_gnomad_grch37_download: # -- download gnomAD-SV files - output: - vcf="work/download/annos/grch37/strucvars/gnomad/2.1.1/gnomad_v2.1_sv.sites.vcf.gz", - shell: - r""" - wget --no-check-certificate \ - -O {output.vcf} \ - https://storage.googleapis.com/gcp-public-data--gnomad/papers/2019-sv/gnomad_v2.1_sv.sites.vcf.gz - """ - - -rule annos_strucvars_gnomad_grch37_process: # -- process gnomAD-SV files - input: - vcf="work/download/annos/grch37/strucvars/gnomad/2.1.1/gnomad_v2.1_sv.sites.vcf.gz", - output: - bed="work/annos/grch37/strucvars/gnomad/2.1.1/gnomad_sv.bed.gz", - bed_md5="work/annos/grch37/strucvars/gnomad/2.1.1/gnomad_sv.bed.gz.md5", - bed_tbi="work/annos/grch37/strucvars/gnomad/2.1.1/gnomad_sv.bed.gz.tbi", - bed_tbi_md5="work/annos/grch37/strucvars/gnomad/2.1.1/gnomad_sv.bed.gz.tbi.md5", - shell: - r""" - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" ERR EXIT - - echo -e "#chromosome\tbegin\tend\tsv_type\tn_homalt\tn_het" \ - > $TMPDIR/tmp.bed - - bcftools query \ - -e 'SVTYPE="MCNV"' \ - -f "%CHROM\t%POS0\t%INFO/END\t%INFO/SVTYPE\t%INFO/N_HOMALT\t%INFO/N_HET\n" \ - {input.vcf} \ - >> $TMPDIR/tmp.bed - - bgzip -c $TMPDIR/tmp.bed >{output.bed} - - tabix -p bed -S 1 -f {output.bed} - - md5sum {output.bed} >{output.bed_md5} - md5sum {output.bed_tbi} >{output.bed_tbi_md5} - """ diff --git a/rules/genes/dbnsfp.smk b/rules/genes/dbnsfp.smk deleted file mode 100644 index ee5fe0b..0000000 --- a/rules/genes/dbnsfp.smk +++ /dev/null @@ -1,17 +0,0 @@ -## Rules related to dbNSFP gene information. - - -rule genes_dbnsfp_genes_copy: # -- copy over dbNSFP genes file - input: - tsv="work/download/annos/grch37/seqvars/dbnsfp/{version}a/dbNSFP{version}_gene.complete.gz", - output: - tsv="work/genes/dbnsfp/{version}/genes.tsv.gz", - tsv_md5="work/genes/dbnsfp/{version}/genes.tsv.gz.md5", - shell: - r""" - zcat {input.tsv} \ - | pigz -c \ - > {output.tsv} - - md5sum {output.tsv} >{output.tsv_md5} - """ diff --git a/rules/genes/ensembl.smk b/rules/genes/ensembl.smk deleted file mode 100644 index e62e8f8..0000000 --- a/rules/genes/ensembl.smk +++ /dev/null @@ -1,112 +0,0 @@ -## Rules related to ENSEMBL gene information. - - -rule genes_ensembl_create_xlink: # -- create ENSEMBL gene information xlink table - output: - tsv=f"work/genes/ensembl/{DV.ensembl}/ensembl_xlink.tsv", - tsv_md5=f"work/genes/ensembl/{DV.ensembl}/ensembl_xlink.tsv.md5", - shell: - r""" - # Check wehther ensembl version is correct. - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" EXIT - wget --no-check-certificate \ - -O $TMPDIR/current_README \ - https://ftp.ensembl.org/pub/current_README - grep "Ensembl Release {DV.ensembl} Databases" $TMPDIR/current_README \ - || (echo "Ensembl version is not {DV.ensembl}." && exit 1) - - echo -e "ensembl_gene_id\tensembl_transcript_id\tentrez_id\tgene_symbol" \ - >{output.tsv} - - wget --no-check-certificate \ - -O $TMPDIR/tmp \ - 'https://ensembl.org/biomart/martservice?query=' \ - - sort -u $TMPDIR/tmp \ - >> {output.tsv} - - md5sum {output.tsv} >{output.tsv_md5} - """ - - -rule genes_ensembl_download_maps_grch37: # -- download files for ENST-ENSG mapping (GRCh37) - output: - download_txt="work/genes/ensembl/grch37/87/download/knowntoEnsembl.txt.gz", - download_gtf="work/genes/ensembl/grch37/87/download/GCF_000001405.25_GRCh37.p13_genomic.gtf.gz", - shell: - r""" - wget --no-check-certificate \ - -O {output.download_gtf} \ - 'https://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz' - wget --no-check-certificate \ - -O {output.download_txt} \ - 'https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/knownToEnsembl.txt.gz' - """ - - -rule genes_ensembl_process_maps_grch37: # -- process ENST-ENSG mapping (GRCh37) - input: - download_txt="work/genes/ensembl/grch37/87/download/knowntoEnsembl.txt.gz", - download_gtf="work/genes/ensembl/grch37/87/download/GCF_000001405.25_GRCh37.p13_genomic.gtf.gz", - output: - tsv="work/genes/enst_ensg/grch37/87/enst_ensg.tsv", - tsv_md5="work/genes/enst_ensg/grch37/87/enst_ensg.tsv.md5", - shell: - r""" - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" EXIT - - awk \ - -F $'\t' \ - -f scripts/genes-enst-ensg.awk \ - <(zcat {input.download_gtf}) \ - | sort \ - > $TMPDIR/tmp1.txt - - zcat {input.download_txt} \ - | sed -e 's/\..//g' \ - | sort -k2,2 \ - >> $TMPDIR/tmp2.txt - - echo -e "real_enst\tenst\tensg" > {output.tsv} - join -t $'\t' -1 2 -2 1 $TMPDIR/tmp2.txt $TMPDIR/tmp1.txt \ - >> {output.tsv} - - md5sum {output.tsv} >{output.tsv_md5} - """ - - -rule genes_ensembl_download_maps_grch38: # -- download files for ENST-ENSG mapping (GRCh38) - output: - download_gtf="work/genes/ensembl/grch38/{ensembl_version}/download/Homo_sapiens.GRCh38.{ensembl_version}.gtf.gz", - shell: - r""" - wget --no-check-certificate \ - -O {output.download_gtf} \ - https://ftp.ensembl.org/pub/release-{wildcards.ensembl_version}/gtf/homo_sapiens/Homo_sapiens.GRCh38.{wildcards.ensembl_version}.gtf.gz - """ - - -rule genes_ensembl_process_maps_grch38: # -- process ENST-ENSG mapping (GRCh38) - input: - download_gtf="work/genes/ensembl/grch38/{ensembl_version}/download/Homo_sapiens.GRCh38.{ensembl_version}.gtf.gz", - output: - tsv="work/genes/enst_ensg/grch38/{ensembl_version}/enst_ensg.tsv", - tsv_md5="work/genes/enst_ensg/grch38/{ensembl_version}/enst_ensg.tsv.md5", - shell: - r""" - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" EXIT - - echo -e "enst\tensg" > {output.tsv} - awk \ - -F $'\t' \ - -f scripts/genes-enst-ensg.awk \ - <(zcat {input.download_gtf}) \ - | sort \ - | sed -e 's/^/chr/' \ - >> {output.tsv} - - md5sum {output.tsv} >{output.tsv_md5} - """ diff --git a/rules/genes/gnomad.smk b/rules/genes/gnomad.smk deleted file mode 100644 index 63545f1..0000000 --- a/rules/genes/gnomad.smk +++ /dev/null @@ -1,85 +0,0 @@ -## Rules related to gnomAD gene constraints. - - -rule genes_gnomad_download: # -- download gnomAD gene constraints - output: - bgz="work/download/genes/gnomad/2.1.1/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz", - bgz_md5="work/download/genes/gnomad/2.1.1/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz.md5", - shell: - r""" - wget --no-check-certificate \ - -O {output.bgz} \ - https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz - - md5sum {output.bgz} >{output.bgz_md5} - """ - - -def run_genes_gnomad_constraints_v2_1_1_to_tsv(input, output, wildcards): - """Extra function because of snakefmt issues.""" - columns_src = [ - "transcript", - "exp_lof", - "exp_mis", - "exp_syn", - "mis_z", - "obs_lof", - "obs_mis", - "obs_syn", - "oe_lof", - "oe_lof_lower", - "oe_lof_upper", - "oe_mis", - "oe_mis_lower", - "oe_mis_upper", - "oe_syn", - "oe_syn_lower", - "oe_syn_upper", - "pLI", - "syn_z", - "exac_pLI", - "exac_obs_lof", - "exac_exp_lof", - "exac_oe_lof", - ] - columns_src_str = ",".join(columns_src) - columns_tmp = ["ensembl_transcript_id"] + columns_src[1:] - columns_tmp_str = ",".join(columns_tmp) - columns_dst = ["ensembl_gene_id", "entrez_id", "gene_symbol"] + columns_src[1:] - columns_dst_str = ",".join(columns_dst) - shell( - r""" - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" EXIT - - zcat {input.bgz} \ - | tr '\t' ',' \ - > $TMPDIR/tmp.txt - - qsv select {columns_src_str} $TMPDIR/tmp.txt \ - | qsv rename {columns_tmp_str} \ - | qsv sort -u \ - | tr ',' '\t' \ - > $TMPDIR/tmp.tsv - - qsv join -d '\t' \ - ensembl_transcript_id $TMPDIR/tmp.tsv \ - ensembl_transcript_id {input.xlink_ensembl} \ - | qsv select {columns_dst_str} \ - | tr ',' '\t' \ - > {output.tsv} - - md5sum {output.tsv} >{output.tsv_md5} - """ - ) - - -rule genes_gnomad_convert: # -- create gnomAD gene constraints TSV - input: - bgz="work/download/genes/gnomad/2.1.1/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz", - xlink_ensembl=f"work/genes/ensembl/{DV.ensembl}/ensembl_xlink.tsv", - output: - tsv="work/genes/gnomad/2.1.1/gnomad_constraints.tsv", - tsv_md5="work/genes/gnomad/2.1.1/gnomad_constraints.tsv.md5", - run: - run_genes_gnomad_constraints_v2_1_1_to_tsv(input, output, wildcards) diff --git a/rules/genes/hgnc.smk b/rules/genes/hgnc.smk deleted file mode 100644 index d56fde1..0000000 --- a/rules/genes/hgnc.smk +++ /dev/null @@ -1,68 +0,0 @@ -## Rules related to the HGNC data. - - -rule genes_hgnc_download: # -- Download the HGNC data - output: - json="work/download/hgnc/{date}/hgnc_complete_set.json", - json_md5="work/download/hgnc/{date}/hgnc_complete_set.json.md5", - shell: - r""" - if [[ "$(date +%Y-%m-%d)" != "{wildcards.date}" ]]; then - >&2 echo "{wildcards.date} is not today" - exit 1 - fi - - wget --no-check-certificate \ - -O {output.json} \ - https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json - - md5sum {output.json} > {output.json_md5} - """ - - -rule genes_hgnc_xlink: # -- Build HGNC xlink table. - input: - json="work/download/hgnc/{date}/hgnc_complete_set.json", - output: - tsv="work/genes/hgnc/{date}/hgnc_xlink.tsv", - tsv_md5="work/genes/hgnc/{date}/hgnc_xlink.tsv.md5", - shell: - r""" - if [[ "$(date +%Y-%m-%d)" != "{wildcards.date}" ]]; then - >&2 echo "{wildcards.date} is not today" - exit 1 - fi - - jq \ - --raw-output \ - --from-file scripts/genes-xlink-hgnc.jq \ - {input.json} \ - > {output.tsv} - - - md5sum {output.tsv} > {output.tsv_md5} - """ - - -rule genes_hgnc_gene_info: # -- Build HGNC gene_info JSONL file. - input: - json="work/download/hgnc/{date}/hgnc_complete_set.json", - output: - jsonl="work/genes/hgnc/{date}/hgnc_info.jsonl", - jsonl_md5="work/genes/hgnc/{date}/hgnc_info.jsonl.md5", - shell: - r""" - if [[ "$(date +%Y-%m-%d)" != "{wildcards.date}" ]]; then - >&2 echo "{wildcards.date} is not today" - exit 1 - fi - - jq \ - --compact-output \ - --raw-output \ - --from-file scripts/genes-hgnc-info.jq \ - {input.json} \ - > {output.jsonl} - - md5sum {output.jsonl} > {output.jsonl_md5} - """ diff --git a/rules/genes/ncbi.smk b/rules/genes/ncbi.smk deleted file mode 100644 index d5c2b95..0000000 --- a/rules/genes/ncbi.smk +++ /dev/null @@ -1,94 +0,0 @@ -## Rules related to gene data from NCBI. - - -rule genes_ncbi_download_mim2gene: # -- download NCBI MedGen mim2gene - output: - download="work/download/genes/ncbi/{date}/mim2gene_medgen", - shell: - r""" - if [[ "$(date +%Y-%m-%d)" != "{wildcards.date}" ]]; then - >&2 echo "{wildcards.date} is not today" - exit 1 - fi - - wget --no-check-certificate \ - -O {output.download} \ - https://ftp.ncbi.nih.gov/gene/DATA/mim2gene_medgen - """ - - -rule genes_ncbi_process_mim2gene: # -- process NCBI MedGen mim2gene - input: - download="work/download/genes/ncbi/{date}/mim2gene_medgen", - output: - tsv="work/genes/mim2gene/{date}/mim2gene.tsv", - tsv_md5="work/genes/mim2gene/{date}/mim2gene.tsv.md5", - shell: - r""" - if [[ "$(date +%Y-%m-%d)" != "{wildcards.date}" ]]; then - >&2 echo "{wildcards.date} is not today" - exit 1 - fi - - awk -f scripts/genes-mim2gene.awk \ - -F $'\t' \ - {input.download} \ - > {output.tsv} - - md5sum {output.tsv} >{output.tsv_md5} - """ - - -rule genes_ncbi_entrez_download: # -- download NCBI Entrez files - output: - ags="work/download/genes/ncbi/{date}/Homo_sapiens.ags.gz", - ags_md5="work/download/genes/ncbi/{date}/Homo_sapiens.ags.gz.md5", - gene2xml="work/download/genes/ncbi/{date}/linux64.gene2xml", - gene2xml_md5="work/download/genes/ncbi/{date}/linux64.gene2xml.md5", - shell: - r""" - export TMPDIR=$(mktemp -d) - trap "rm -rf $TMPDIR" EXIT - - if [[ "$(date +%Y-%m-%d)" != "{wildcards.date}" ]]; then - >&2 echo "{wildcards.date} is not today" - exit 1 - fi - - wget --no-check-certificate \ - -O $(dirname {output.ags})/Homo_sapiens.ags.gz \ - https://ftp.ncbi.nih.gov/gene/DATA/ASN_BINARY/Mammalia/Homo_sapiens.ags.gz - - wget --no-check-certificate \ - -O $TMPDIR/linux64.gene2xml.gz \ - https://ftp.ncbi.nlm.nih.gov/asn1-converters/by_program/gene2xml/linux64.gene2xml.gz - - gzip -d -c $TMPDIR/linux64.gene2xml.gz \ - > {output.gene2xml} - chmod u+x {output.gene2xml} - - md5sum {output.ags} >{output.ags_md5} - md5sum {output.gene2xml} >{output.gene2xml_md5} - """ - - -rule genes_ncbi_entrez_process: # -- process NCBI Entrez files - input: - ags="work/download/genes/ncbi/{date}/Homo_sapiens.ags.gz", - gene2xml="work/download/genes/ncbi/{date}/linux64.gene2xml", - output: - jsonl="work/genes/entrez/{date}/gene_info.jsonl", - jsonl_md5="work/genes/entrez/{date}/gene_info.jsonl.md5", - shell: - r""" - if [[ "$(date +%Y-%m-%d)" != "{wildcards.date}" ]]; then - >&2 echo "{wildcards.date} is not today" - exit 1 - fi - - ./{input.gene2xml} -b T -c T -i {input.ags} \ - | python3 scripts/refseq_xml_to_json.py \ - > {output.jsonl} - - md5sum {output.jsonl} >{output.jsonl_md5} - """ diff --git a/rules/reference/human.smk b/rules/reference/human.smk deleted file mode 100644 index ce5e53e..0000000 --- a/rules/reference/human.smk +++ /dev/null @@ -1,51 +0,0 @@ -## Rules related to human reference genome sequence. - -#: Download URLs -REFERENCE_URLS = { - "grch37": ( - "https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/" - "phase2_reference_assembly_sequence/hs37d5.fa.gz" - ), - "grch38": ( - "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/" - "seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz" - ), -} - - -rule reference_download: # -- download reference genome sequence - output: - download="work/download/reference/{genome_build}/reference.fa.gz", - run: - ref_url = REFERENCE_URLS[wildcards.genome_build] - shell( - r""" - aria2c \ - --check-certificate=false \ - --file-allocation=trunc \ - --out={output.download} \ - --split=8 \ - --max-concurrent-downloads=8 \ - --max-connection-per-server=8 \ - {ref_url} - """ - ) - - -rule reference_process: # -- post-process reference sequence after download - input: - download="work/download/reference/{genome_build}/reference.fa.gz", - output: - fasta="work/reference/{genome_build}/reference.fa", - fasta_md5="work/reference/{genome_build}/reference.fa.md5", - fasta_fai="work/reference/{genome_build}/reference.fa.fai", - fasta_fai_md5="work/reference/{genome_build}/reference.fa.fai.md5", - shell: - r""" - pigz -d -c {input.download} >{output.fasta} - - samtools faidx {output.fasta} - - md5sum {output.fasta} >{output.fasta_md5} - md5sum {output.fasta_fai} >{output.fasta_fai_md5} - """ diff --git a/varfish_db_downloader/data_versions.py b/varfish_db_downloader/versions.py similarity index 79% rename from varfish_db_downloader/data_versions.py rename to varfish_db_downloader/versions.py index ae620eb..07cf4be 100644 --- a/varfish_db_downloader/data_versions.py +++ b/varfish_db_downloader/versions.py @@ -1,6 +1,7 @@ """Declaration of data versions.""" import os +import subprocess from datetime import datetime import attrs @@ -11,6 +12,8 @@ @attrs.frozen() class DataVersions: + """Container with data versions.""" + #: String to use for GRCh37 ENSEMBL version. ensembl_37: str #: String to use for GRCh38 ENSEMBL version. @@ -45,6 +48,8 @@ class DataVersions: exac_cnv: str #: Thousand Genomes SVs. g1k_svs: str + #: HelixMtDb + helixmtdb: str #: UCSC conservation (GRCh37). ucsc_cons_37: str #: UCSC conservation (GRCh38). @@ -92,6 +97,7 @@ class DataVersions: dgv_gs="2016-05-15", exac_cnv="0.3.1", g1k_svs="phase3-v2", + helixmtdb="20200327", ucsc_cons_37="2016-10-07", ucsc_cons_38="2019-09-06", ucsc_rmsk_37="2020-03-22", @@ -106,3 +112,27 @@ class DataVersions: refseq_38="GCF_000001405.40-RS_2023_03", dbsnp="b151", ) + + +@attrs.frozen() +class PackageVersions: + """Container with package versions.""" + + #: Version of ``annona-rs`` executable. + annonars: str + #: Version of ``varfish-server-worker`` executable. + worker: str + + +def get_version(executable: str) -> str: + """Return version of ``executable``.""" + tmp: str = subprocess.check_output([executable, "--version"], text=True) + _, version = tmp.strip().split(" ", 1) + return version + + +#: The package versions from environment. +PACKAGE_VERSIONS = PackageVersions( + annonars=get_version("annonars"), + worker=get_version("varfish-server-worker"), +) diff --git a/varfish_db_downloader/wget.py b/varfish_db_downloader/wget.py index af6096b..ac8f5e3 100644 --- a/varfish_db_downloader/wget.py +++ b/varfish_db_downloader/wget.py @@ -89,6 +89,21 @@ def excerpt_head(url: str, path_out: str, count: int): f_out.write(b"\n") +def excerpt_copy_tbi(url: str, path_out: str, count: int): + """Copy ``.tbi`` file from the (previously downloaded) ``.vcf.gz`` file.)""" + _ = count + vcf_url = url[:-4] + vcf_file = vcf_url.split("/")[-1] + vcf_url_hash = hash_url(vcf_url) + base_path = os.path.dirname(os.path.dirname(path_out)) + vcf_tbi_path = f"{base_path}/{vcf_url_hash}/{vcf_file}.tbi" + logger.info(" (strategy COPY_TBI from {} to {})", vcf_tbi_path, path_out) + if not os.path.exists(vcf_tbi_path): + raise RuntimeError(f"File {vcf_tbi_path} does not exist (strategy COPY-TBI)") + shutil.copy(vcf_tbi_path, path_out) + + + def excerpt_vcf_head(url: str, path_out: str, count: int): """Excerpt a VCF file by applying the following steps: @@ -132,6 +147,7 @@ def excerpt_vcf_head(url: str, path_out: str, count: int): "head": excerpt_head, "gz-head": excerpt_head, "vcf-head": excerpt_vcf_head, + "copy-tbi": excerpt_copy_tbi, "manual": excerpt_manual, "no-excerpt": no_excerpt, } @@ -153,12 +169,19 @@ def default_for(url: str) -> "ExcerptStrategy": return ExcerptStrategy(strategy="vcf-head", count=100) elif url.endswith(".gz") or url.endswith(".bgz"): return ExcerptStrategy(strategy="gz-head", count=100) - elif url.endswith(".tbi") or "." not in url.split("/")[-1]: - return ExcerptStrategy(strategy="no-excerpt", count=100) + elif url.endswith(".tbi"): + return ExcerptStrategy(strategy="copy-tbi", count=None) + elif "." not in url.split("/")[-1]: + return ExcerptStrategy(strategy="no-excerpt", count=None) else: return ExcerptStrategy(strategy="head", count=100) +def hash_url(url: str) -> str: + """Return hashed URL string.""" + return hashlib.sha256(url.encode("utf-8")).hexdigest()[:HASH_LENGTH] + + @attrs.frozen() class UrlEntry: """Entry in the ``downloads_urls.yml`` file.""" @@ -174,7 +197,7 @@ class UrlEntry: def __attrs_post_init__(self): object.__setattr__( - self, "hash", hashlib.sha256(self.url.encode("utf-8")).hexdigest()[:HASH_LENGTH] + self, "hash", hash_url(self.url) )