From 6a76d0b2f42eacb907b6c8344cf4fc93b4031164 Mon Sep 17 00:00:00 2001 From: Jover Date: Thu, 14 Sep 2023 11:03:20 -0700 Subject: [PATCH 1/3] Remove NCBI Virus API related files Due to https://github.com/nextstrain/ingest/issues/18, the NCBI Virus API is more of a hassle to use. The data from NCBI Datasets CLI covers the standard fields that we use in pathogen ingests, so we can drop the use of the undocumented NCBI Virus API. If any pathogen needs additional custom fields that are not available through NCBI Datasets, the pipeline can use fetch-from-ncbi-entrez and parse the GenBank file. --- README.md | 7 +- csv-to-ndjson | 15 - docs/ncbi-virus-all-fields-example.json | 292 ------------------ fetch-from-ncbi-virus | 33 -- ncbi-virus-url | 103 ------ .../fetch-from-ncbi-virus/filter-and-fields.t | 18 -- .../fetch-from-ncbi-virus/invalid-taxon-id.t | 4 - 7 files changed, 3 insertions(+), 469 deletions(-) delete mode 100755 csv-to-ndjson delete mode 100644 docs/ncbi-virus-all-fields-example.json delete mode 100755 fetch-from-ncbi-virus delete mode 100755 ncbi-virus-url delete mode 100644 tests/fetch-from-ncbi-virus/filter-and-fields.t delete mode 100644 tests/fetch-from-ncbi-virus/invalid-taxon-id.t diff --git a/README.md b/README.md index 533b39a..995fff4 100644 --- a/README.md +++ b/README.md @@ -72,10 +72,9 @@ Scripts for supporting ingest workflow automation that don’t really belong in NCBI interaction scripts that are useful for fetching public metadata and sequences. - [fetch-from-ncbi-entrez](fetch-from-ncbi-entrez) - Fetch metadata and nucleotide sequences from [NCBI Entrez](https://www.ncbi.nlm.nih.gov/books/NBK25501/) and output to a GenBank file. - Useful for pathogens with metadata and annotations in custom fields that are not part of the standard [NCBI Virus](https://www.ncbi.nlm.nih.gov/labs/virus/vssi/) or [NCBI Datasets](https://www.ncbi.nlm.nih.gov/datasets/) outputs. 
-- [fetch-from-ncbi-virus](fetch-from-ncbi-virus) - Fetch metadata and nucleotide sequences from [NCBI Virus](https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/) and output NDJSON records to stdout. -- [ncbi-virus-url](ncbi-virus-url) - Generates the URL to download metadata and sequences from NCBI Virus as a single CSV file. -- [csv-to-ndjson](csv-to-ndjson) - Converts CSV file to NDJSON file with a hard-coded 200MiB field size limit to accommodate sequences in the NCBI Virus download. + Useful for pathogens with metadata and annotations in custom fields that are not part of the standard [NCBI Datasets](https://www.ncbi.nlm.nih.gov/datasets/) outputs. + +Historically, some pathogen repos used the undocumented NCBI Virus API through [fetch-from-ncbi-virus](https://github.com/nextstrain/ingest/blob/c97df238518171c2b1574bec0349a55855d1e7a7/fetch-from-ncbi-virus) to fetch data. However, we've opted to drop the NCBI Virus scripts due to https://github.com/nextstrain/ingest/issues/18. Potential Nextstrain CLI scripts diff --git a/csv-to-ndjson b/csv-to-ndjson deleted file mode 100755 index 84befe0..0000000 --- a/csv-to-ndjson +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env python3 -""" -Convert CSV on stdin to NDJSON on stdout. 
-usage: `cat dummy.csv | ./csv-to-ndjson > dummy.ndjson` -""" -import csv -import json -from sys import stdin, stdout - -# 200 MiB; default is 128 KiB -csv.field_size_limit(200 * 1024 * 1024) - -for row in csv.DictReader(stdin): - json.dump(row, stdout, allow_nan = False, indent = None, separators = ',:') - print() diff --git a/docs/ncbi-virus-all-fields-example.json b/docs/ncbi-virus-all-fields-example.json deleted file mode 100644 index bbf9a7f..0000000 --- a/docs/ncbi-virus-all-fields-example.json +++ /dev/null @@ -1,292 +0,0 @@ -{ - "ExportDate_dt": "2023-08-08T21:02:01.475Z", - "QualNum_i": 0, - "QualPct_d": 0.0, - "IncompleteCdsCnt_i": 0, - "gi_l": 1798174254, - "Host_s": "Homo sapiens", - "HostSpecies_s": "Homo sapiens (human), taxid:9606|", - "HostLineage_ss": [ - "cellular organisms, taxid:131567| biota", - "Eukaryota (eucaryotes), taxid:2759| eukaryotes Eucarya Eucaryotae Eukarya Eukaryotae", - "Opisthokonta, taxid:33154| Fungi/Metazoa group opisthokonts", - "Metazoa (metazoans), taxid:33208| multicellular animals Animalia animals", - "Eumetazoa, taxid:6072|", - "Bilateria, taxid:33213|", - "Deuterostomia (deuterostomes), taxid:33511|", - "Chordata (chordates), taxid:7711|", - "Craniata, taxid:89593|", - "Vertebrata (vertebrates), taxid:7742|", - "Gnathostomata (jawed vertebrates), taxid:7776|", - "Teleostomi, taxid:117570|", - "Euteleostomi (bony vertebrates), taxid:117571|", - "Sarcopterygii, taxid:8287|", - "Dipnotetrapodomorpha, taxid:1338369|", - "Tetrapoda (tetrapods), taxid:32523|", - "Amniota (amniotes), taxid:32524|", - "Mammalia (mammals), taxid:40674|", - "Theria, taxid:32525|", - "Eutheria (placentals), taxid:9347| eutherian mammals placental mammals Placentalia", - "Boreoeutheria, taxid:1437010| Boreotheria", - "Euarchontoglires, taxid:314146|", - "Primates, taxid:9443| Primata primates", - "Haplorrhini, taxid:376913|", - "Simiiformes, taxid:314293| Anthropoidea", - "Catarrhini, taxid:9526|", - "Hominoidea (apes), taxid:314295| ape", - 
"Hominidae (great apes), taxid:9604| Pongidae", - "Homininae, taxid:207598| Homo/Pan/Gorilla group", - "Homo (humans), taxid:9605|", - "Homo sapiens (human), taxid:9606|" - ], - "HostLineageId_ss": [ - "131567", - "2759", - "33154", - "33208", - "6072", - "33213", - "33511", - "7711", - "89593", - "7742", - "7776", - "117570", - "117571", - "8287", - "1338369", - "32523", - "32524", - "40674", - "32525", - "9347", - "1437010", - "314146", - "9443", - "376913", - "314293", - "9526", - "314295", - "9604", - "207598", - "9605", - "9606" - ], - "Locus_s": "NC_045512", - "OrgId_i": 2697049, - "VirusFamily_s": "Coronaviridae", - "VirusGenus_s": "Betacoronavirus", - "VirusSpecies_s": "Severe acute respiratory syndrome-related coronavirus", - "VirusSpeciesId_i": 694009, - "VirusLineage_ss": [ - "Viruses, taxid:10239| Vira Viridae viruses", - "Riboviria (RNA viruses), taxid:2559587| RNA viruses and viroids", - "Orthornavirae, taxid:2732396|", - "Pisuviricota, taxid:2732408|", - "Pisoniviricetes, taxid:2732506|", - "Nidovirales, taxid:76804|", - "Cornidovirineae, taxid:2499399|", - "Coronaviridae, taxid:11118|", - "Orthocoronavirinae, taxid:2501931|", - "Betacoronavirus, taxid:694002| Coronavirus", - "Sarbecovirus, taxid:2509511|", - "Severe acute respiratory syndrome-related coronavirus, taxid:694009| HCoV-SARS SARS SARSr-CoV SARSrCoV", - "Severe acute respiratory syndrome coronavirus 2, taxid:2697049| SARS-CoV-2", - "RNA viruses" - ], - "VirusLineageId_ss": [ - "10239", - "2559587", - "2732396", - "2732408", - "2732506", - "76804", - "2499399", - "11118", - "2501931", - "694002", - "2509511", - "694009", - "2697049" - ], - "VirusL0_s": "RNA viruses", - "VirusL1_s": "Orthornavirae, taxid:2732396", - "VirusL2_s": "Pisuviricota, taxid:2732408", - "VirusL3_s": "Pisoniviricetes, taxid:2732506", - "VirusL4_s": "Nidovirales, taxid:76804", - "VirusL5_s": "Cornidovirineae, taxid:2499399", - "VirusL6_s": "Coronaviridae, taxid:11118", - "VirusL7_s": "Orthocoronavirinae, 
taxid:2501931", - "VirusL8_s": "Betacoronavirus, taxid:694002", - "VirusL9_s": "Sarbecovirus, taxid:2509511", - "VirusL10_s": "Severe acute respiratory syndrome-related coronavirus, taxid:694009", - "ViralHost_ss": [ - "human", - "vertebrates" - ], - "GenomicMoltype_s": "ssRNA(+)", - "SLen_i": 29903, - "Flags_ss": [ - "refseq", - "complete" - ], - "Flags_csv": "refseq, complete", - "FlagsCount_i": 2, - "SetAcc_s": "GCF_009858895.2", - "Authors_ss": [ - "Wu,F.", - "Zhao,S.", - "Yu,B.", - "Chen,Y.M.", - "Wang,W.", - "Song,Z.G.", - "Hu,Y.", - "Tao,Z.W.", - "Tian,J.H.", - "Pei,Y.Y.", - "Yuan,M.L.", - "Zhang,Y.L.", - "Dai,F.H.", - "Liu,Y.", - "Wang,Q.M.", - "Zheng,J.J.", - "Xu,L.", - "Holmes,E.C.", - "Zhang,Y.Z.", - "Baranov,P.V.", - "Henderson,C.M.", - "Anderson,C.B.", - "Gesteland,R.F.", - "Atkins,J.F.", - "Howard,M.T.", - "Robertson,M.P.", - "Igel,H.", - "Baertsch,R.", - "Haussler,D.", - "Ares,M. Jr.", - "Scott,W.G.", - "Williams,G.D.", - "Chang,R.Y.", - "Brian,D.A.", - "Chen,Y.-M.", - "Song,Z.-G.", - "Tao,Z.-W.", - "Tian,J.-H.", - "Pei,Y.-Y.", - "Zhang,Y.-L.", - "Dai,F.-H.", - "Wang,Q.-M.", - "Zheng,J.-J.", - "Zhang,Y.-Z." - ], - "Authors_csv": "Wu,F., Zhao,S., Yu,B., Chen,Y.M., Wang,W., Song,Z.G., Hu,Y., Tao,Z.W., Tian,J.H., Pei,Y.Y., Yuan,M.L., Zhang,Y.L., Dai,F.H., Liu,Y., Wang,Q.M., Zheng,J.J., Xu,L., Holmes,E.C., Zhang,Y.Z., Baranov,P.V., Henderson,C.M., Anderson,C.B., Gesteland,R.F., Atkins,J.F., Howard,M.T., Robertson,M.P., Igel,H., Baertsch,R., Haussler,D., Ares,M. 
Jr., Scott,W.G., Williams,G.D., Chang,R.Y., Brian,D.A., Chen,Y.-M., Song,Z.-G., Tao,Z.-W., Tian,J.-H., Pei,Y.-Y., Zhang,Y.-L., Dai,F.-H., Wang,Q.-M., Zheng,J.-J., Zhang,Y.-Z.", - "AuthorsCount_i": 44, - "Country_s": "China", - "Isolate_s": "Wuhan-Hu-1", - "Lineage_s": "B", - "Division_s": "VRL", - "Keywords_ss": [ - "RefSeq" - ], - "KeywordsCount_i": 1, - "TaxName_s": "Severe acute respiratory syndrome coronavirus 2", - "Region_s": "Asia", - "ParentAcc_s": "set:NC_045512", - "SetPosition_i": 0, - "SourceDB_s": "RefSeq", - "Definition_s": "Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome", - "HostId_i": 9606, - "CreateDate_dt": "2020-01-13T00:00:00Z", - "CreateYear_i": 2020, - "Genome_js": "[{\"id\": \"NC_045512.2\", \"segment\": null, \"proteins\": [{\"id\": \"YP_009724389.1\", \"name\": \"ORF1ab polyprotein\", \"location\": \"join(266..13468,13468..21555)\"}, {\"id\": \"YP_009725295.1\", \"name\": \"ORF1a polyprotein\", \"location\": \"266..13483\"}, {\"id\": \"YP_009724390.1\", \"name\": \"surface glycoprotein\", \"location\": \"21563..25384\"}, {\"id\": \"YP_009724391.1\", \"name\": \"ORF3a protein\", \"location\": \"25393..26220\"}, {\"id\": \"YP_009724392.1\", \"name\": \"envelope protein\", \"location\": \"26245..26472\"}, {\"id\": \"YP_009724393.1\", \"name\": \"membrane glycoprotein\", \"location\": \"26523..27191\"}, {\"id\": \"YP_009724394.1\", \"name\": \"ORF6 protein\", \"location\": \"27202..27387\"}, {\"id\": \"YP_009724395.1\", \"name\": \"ORF7a protein\", \"location\": \"27394..27759\"}, {\"id\": \"YP_009725318.1\", \"name\": \"ORF7b\", \"location\": \"27756..27887\"}, {\"id\": \"YP_009724396.1\", \"name\": \"ORF8 protein\", \"location\": \"27894..28259\"}, {\"id\": \"YP_009724397.2\", \"name\": \"nucleocapsid phosphoprotein\", \"location\": \"28274..29533\"}, {\"id\": \"YP_009725255.1\", \"name\": \"ORF10 protein\", \"location\": \"29558..29674\"}]}]", - "MolType_s": "RNA", - "ProtAcc_ss": [ - "YP_009724389", - 
"YP_009725295", - "YP_009724390", - "YP_009724391", - "YP_009724392", - "YP_009724393", - "YP_009724394", - "YP_009724395", - "YP_009725318", - "YP_009724396", - "YP_009724397", - "YP_009725255" - ], - "ProtAccCount_i": 12, - "UpdateDate_dt": "2020-07-18T00:00:00Z", - "UpdateYear_i": 2020, - "PubMed_ss": [ - "32015508", - "15680415", - "15630477", - "10482585" - ], - "PubMed_csv": "32015508, 15680415, 15630477, 10482585", - "PubMedCount_i": 4, - "Completeness_s": "complete", - "CountryFull_s": "China", - "ProtNames_ss": [ - "ORF1ab polyprotein", - "ORF1a polyprotein", - "surface glycoprotein", - "ORF3a protein", - "envelope protein", - "membrane glycoprotein", - "ORF6 protein", - "ORF7a protein", - "ORF7b protein", - "ORF8 protein", - "nucleocapsid phosphoprotein", - "ORF10 protein" - ], - "ProtNamesCount_i": 12, - "IsolateParsed_s": "Wuhan-Hu-1", - "NuclAcc_ss": [ - "NC_045512" - ], - "NuclAccCount_i": 1, - "CollectionDate_dr": "2019-12", - "CollectionYear_i": 2019, - "SubmitterAffil_s": "National Center for Biotechnology Information, NIH", - "BioProject_ss": [ - "PRJNA485481" - ], - "BioProject_csv": "PRJNA485481", - "BioProjectCount_i": 1, - "AccVer_s": "NC_045512.2", - "CollectionDate_s": "2019-12", - "SubmitterCountry_s": "USA", - "CollectionDate_dt": "2019-12-01T00:00:00Z", - "GenomeCompleteness_s": "complete", - "SubmitterAffilFull_s": "National Center for Biotechnology Information, NIH", - "BioProject_s": "PRJNA485481", - "AccNV_s": "NC_045512", - "id": "NC_045512", - "SeqType_s": "Nucleotide", - "FastaMD5_s": "4928f859a1822d291e0225206a0068c8", - "live_i": 1, - "ids_ss": [ - "GCF_009858895", - "GCF_009858895.2", - "NC_045512", - "NC_045512.2", - "PRJNA485481", - "YP_009724389", - "YP_009724390", - "YP_009724391", - "YP_009724392", - "YP_009724393", - "YP_009724394", - "YP_009724395", - "YP_009724396", - "YP_009724397", - "YP_009725255", - "YP_009725295", - "YP_009725318", - "set:NC_045512" - ], - "gi_i": 1798174254, - "_version_": 1773711315042304000 -} 
diff --git a/fetch-from-ncbi-virus b/fetch-from-ncbi-virus deleted file mode 100755 index 39733e6..0000000 --- a/fetch-from-ncbi-virus +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash -# usage: fetch-from-ncbi-virus [options] -# -# Fetch metadata and nucleotide sequences from [NCBI Virus](https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/) -# and output NDJSON records to stdout. -# -# [options] are passed directly to ncbi-virus-url. See that script for usage details. -# -# Originally copied from "bin/fetch-from-genbank" in nextstrain/ncov-ingest: -# https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/fetch-from-genbank -# -set -euo pipefail - -bin="$(dirname "$0")" - - -main() { - local ncbi_taxon_id="${1:?NCBI taxon id is required.}" - local github_repo="${2:?A GitHub repository with owner and repository name is required as the second argument}" - - # "${@:3}" represents all other options, if any. - ncbi_virus_url="$("$bin"/ncbi-virus-url --ncbi-taxon-id "$ncbi_taxon_id" "${@:3}")" - - fetch "$ncbi_virus_url" "$github_repo" | "$bin"/csv-to-ndjson -} - -fetch() { - curl "$1" \ - --fail --silent --show-error --http1.1 \ - --header "User-Agent: https://github.com/$2 (hello@nextstrain.org)" -} - -main "$@" diff --git a/ncbi-virus-url b/ncbi-virus-url deleted file mode 100755 index 0dd116b..0000000 --- a/ncbi-virus-url +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env python3 -""" -Generate URL to download all virus sequences and their curated metadata for a -specified NCBI Taxon ID from GenBank via NCBI Virus. 
- -The URL this program builds is based on the URL for SARS-CoV-2 constructed with - - https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/genbank-url - -and observing the network activity at - - https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide -""" -from urllib.parse import urlencode -from typing import List, Optional -import argparse - -def parse_args(): - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--ncbi-taxon-id", required=True, - help="NCBI Taxon ID. Visit NCBI virus at " + - "https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/find-data/virus " + - "to search for supported taxon IDs.") - parser.add_argument("--filters", required=False, nargs="*", - help="Filter criteria to add as `fq` param values. " + - "Apply filters via the NCBI Virus UI and observe the network " + - "activity to find the desired filter string.") - parser.add_argument("--fields", required=False, nargs="*", - help="Metadata fields to add as `fl` param values. " + - "Expected to be formatted as :. " + - "See docs/ncbi-virus-all-fields-example.json for the available NCBI Virus fields.") - return parser.parse_args() - -def build_query_url(ncbi_taxon_id: str, - filters: Optional[List[str]]=None, - fields: Optional[List[str]]=None): - """ - Generate URL to download all viral sequences and their curated metadata - from GenBank via NCBI Virus. - """ - endpoint = "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/" - params = { - # Search criteria - 'fq': [ - '{!tag=SeqType_s}SeqType_s:("Nucleotide")', # Nucleotide sequences (as opposed to protein) - f'VirusLineageId_ss:({ncbi_taxon_id})', - *(filters or []), - ], - - # Unclear, but seems necessary. - 'q': '*:*', - - # Response format - 'cmd': 'download', - 'dlfmt': 'csv', - 'fl': ','.join( - [':'.join(names) for names in [ - # Pairs of (output column name, source data field). 
- ('genbank_accession', 'id'), - ('genbank_accession_rev', 'AccVer_s'), - ('database', 'SourceDB_s'), - ('strain', 'Isolate_s'), - ('region', 'Region_s'), - ('location', 'CountryFull_s'), - ('collected', 'CollectionDate_s'), - ('submitted', 'CreateDate_dt'), - ('updated', 'UpdateDate_dt'), - ('length', 'SLen_i'), - ('host', 'Host_s'), - ('isolation_source', 'Isolation_csv'), - ('bioproject_accession', 'BioProject_s'), - ('biosample_accession', 'BioSample_s'), - ('sra_accession', 'SRALink_csv'), - ('title', 'Definition_s'), - ('authors', 'Authors_csv'), - ('submitting_organization', 'SubmitterAffilFull_s'), - ('publications', 'PubMed_csv'), - ('sequence', 'Nucleotide_seq'), - ]] + (fields or []) - ), - - # Stable sort with GenBank accessions. - # Columns are source data fields, not our output columns. - 'sort': 'id asc', - - # This isn't Entrez, but include the same email parameter it requires just - # to be nice. - 'email': 'hello@nextstrain.org', - } - query = urlencode(params, doseq = True, encoding = "utf-8") - - print(f"{endpoint}?{query}") - -def main(): - args = parse_args() - build_query_url( - ncbi_taxon_id=args.ncbi_taxon_id, - filters=args.filters, - fields=args.fields - ) - -if __name__ == '__main__': - main() diff --git a/tests/fetch-from-ncbi-virus/filter-and-fields.t b/tests/fetch-from-ncbi-virus/filter-and-fields.t deleted file mode 100644 index 2fd7020..0000000 --- a/tests/fetch-from-ncbi-virus/filter-and-fields.t +++ /dev/null @@ -1,18 +0,0 @@ -Get the virus lineage IDs for 4 early Dengue sequences, testing the options --filter and --field. 
- - $ $TESTDIR/../../fetch-from-ncbi-virus 12637 nextstrain/ingest \ - > --filters 'CreateDate_dt:([1987-11-29T00:00:00Z TO 1987-11-29T00:00:01Z])' \ - > --fields 'viruslineage_ids:VirusLineageId_ss' - {"genbank_accession":"X05375","genbank_accession_rev":"X05375.1","database":"GenBank","strain":"","region":"","location":"","collected":"","submitted":"1987-11-29T00:00:00Z","updated":"2016-07-26T00:00:00Z","length":"360","host":"","isolation_source":"","bioproject_accession":"","biosample_accession":"","sra_accession":"","title":"Dengue virus type 2 genomic RNA for envelope protein E N-term","authors":"Biedrzycka,A., Cauchi,M.R., Bartholomeusz,A., Gorman,J.J., Wright,P.J.","submitting_organization":"","publications":"2952760","sequence":"GTAACTTATGGGACGTGTACCACCACAGGAGAACACAGAAGAGAAAAAAGATCAGTGGCACTCGTTCCACATGTGGGAATGGGACTGGAGACACGAACTGAAACATGGATGTCATCAGAAGGGGCCTGGAAACATGCCCAGAGAATTGAAACTTGGATCTTGAGACATCCAGGCTTTACCATAATGGCAGCAATCCTGGCATACACCATAGGAACGACACATTTCCAAAGAGCCCTGATTTTCATCTTACTGACAGCTGTCGCTCCTTCAATGACAATGCGTTGCATAGGAATATCAAATAGAGACTTTGTAGAAGGGGTTTCAGGAGGAAGCTGGGTTGACATAGTCTTAGAACATGGA","viruslineage_ids":"10239,2559587,2732396,2732406,2732462,2732545,11050,11051,12637,11060"} - {"genbank_accession":"X05376","genbank_accession_rev":"X05376.1","database":"GenBank","strain":"","region":"","location":"","collected":"","submitted":"1987-11-29T00:00:00Z","updated":"2016-07-26T00:00:00Z","length":"360","host":"","isolation_source":"","bioproject_accession":"","biosample_accession":"","sra_accession":"","title":"Dengue virus type 2 genomic RNA for NS1 protein N-term","authors":"Biedrzycka,A., Cauchi,M.R., Bartholomeusz,A., Gorman,J.J., 
Wright,P.J.","submitting_organization":"","publications":"2952760","sequence":"ACAACAATGAGGGGAGCGAAGAGAATGGCCATTTTAGGTGACACAGCTTGGGATTTTGGATCCCTGGGAGGAGTGTTTACATCTATAGGAAAGGCTCTCCACCAAGTTTTCGGAGCAATCTATGGGGCTGCCTTCAGTGGGGTCTCATGGACTATGAAAATCCTCATAGGAGTCATTATCACATGGATAGGAATGAATTCACGCAGCACCTCACTTTCTGTGTCACTAGTATTGGTGGGAGTCGTGACGCTGTATTTGGGAGTTATGGTGCAGGCCGATAGTGGTTGCGTTGTGAGCTGGAAAAACAAAGAACTGAAGTGTGGCAGTGGGATTTTCATCACAGACAACGTGCACACATGG","viruslineage_ids":"10239,2559587,2732396,2732406,2732462,2732545,11050,11051,12637,11060"} - {"genbank_accession":"X05377","genbank_accession_rev":"X05377.1","database":"GenBank","strain":"","region":"","location":"","collected":"","submitted":"1987-11-29T00:00:00Z","updated":"2016-07-26T00:00:00Z","length":"360","host":"","isolation_source":"","bioproject_accession":"","biosample_accession":"","sra_accession":"","title":"Dengue virus type 2 genomic RNA for NS3 protein N-term","authors":"Biedrzycka,A., Cauchi,M.R., Bartholomeusz,A., Gorman,J.J., Wright,P.J.","submitting_organization":"","publications":"2952760","sequence":"CTCACTGTGTGCTACGTGCTCACTGGACGATCGGCCGATTTGGAACTGGAGAGAGCCGCCGATGTCAAATGGGAAGATCAGGCAGAGATATCAGGAAGCAGTCCAATCCTGTCAATAACAATATCAGAAGATGGTAGCATGTCGATAAAAAACGAAGAGGAAGAACAAACACTGACCATACTCATTAGAACAGGATTGCTGGTGATCTCAGGACTTTTTCCTGTATCAATACCAATCACGGCAGCAGCATGGTACCTGTGGGAAGTGAAGAAACAACGGGCTGGAGTATTGTGGGATGTCCCTTCACCCCCACCCGTGGGAAAGGCTGAACTGGAAGATGGAGCCTATAGAATCAAGCAA","viruslineage_ids":"10239,2559587,2732396,2732406,2732462,2732545,11050,11051,12637,11060"} - {"genbank_accession":"X05378","genbank_accession_rev":"X05378.1","database":"GenBank","strain":"","region":"","location":"","collected":"","submitted":"1987-11-29T00:00:00Z","updated":"2016-07-26T00:00:00Z","length":"360","host":"","isolation_source":"","bioproject_accession":"","biosample_accession":"","sra_accession":"","title":"Dengue virus type 2 genomic RNA for NS5 protein N-term","authors":"Biedrzycka,A., Cauchi,M.R., Bartholomeusz,A., Gorman,J.J., 
Wright,P.J.","submitting_organization":"","publications":"2952760","sequence":"GATCCAATACCCTATGATCCAAAGTTTGAAAAGCAGTTGGGACAAGTAATGCTCCTAGTCCTCTGCGGGACTCAAGTGTTGATGATGAGGACTACATGGGCTCTGTGTGAGGCTTTAACCTTAGCGACCGGGCCTATCTCCACATTGTGGGAAGGAAATCCAGGGAGGTTTTGGAACACTACCATTGCAGTGTCAATGGCTAACATTTTTAGAGGGAGTTACTTGGCCGGAGCTGGACTTCTCTTTTCCATCATGAAGAACACAACCAACACGAGAAGGGGAACTGGCAACATAGGAGAGACGCTTGGAGAGAAATGGAAAAGCCGATTGAACGCATTGGGGAAAAGTGAATTCCAGATC","viruslineage_ids":"10239,2559587,2732396,2732406,2732462,2732545,11050,11051,12637,11060"} - -Do the same but without --field. - - $ $TESTDIR/../../fetch-from-ncbi-virus 12637 nextstrain/ingest \ - > --filters 'CreateDate_dt:([1987-11-29T00:00:00Z TO 1987-11-29T00:00:01Z])' - {"genbank_accession":"X05375","genbank_accession_rev":"X05375.1","database":"GenBank","strain":"","region":"","location":"","collected":"","submitted":"1987-11-29T00:00:00Z","updated":"2016-07-26T00:00:00Z","length":"360","host":"","isolation_source":"","bioproject_accession":"","biosample_accession":"","sra_accession":"","title":"Dengue virus type 2 genomic RNA for envelope protein E N-term","authors":"Biedrzycka,A., Cauchi,M.R., Bartholomeusz,A., Gorman,J.J., Wright,P.J.","submitting_organization":"","publications":"2952760","sequence":"GTAACTTATGGGACGTGTACCACCACAGGAGAACACAGAAGAGAAAAAAGATCAGTGGCACTCGTTCCACATGTGGGAATGGGACTGGAGACACGAACTGAAACATGGATGTCATCAGAAGGGGCCTGGAAACATGCCCAGAGAATTGAAACTTGGATCTTGAGACATCCAGGCTTTACCATAATGGCAGCAATCCTGGCATACACCATAGGAACGACACATTTCCAAAGAGCCCTGATTTTCATCTTACTGACAGCTGTCGCTCCTTCAATGACAATGCGTTGCATAGGAATATCAAATAGAGACTTTGTAGAAGGGGTTTCAGGAGGAAGCTGGGTTGACATAGTCTTAGAACATGGA"} - {"genbank_accession":"X05376","genbank_accession_rev":"X05376.1","database":"GenBank","strain":"","region":"","location":"","collected":"","submitted":"1987-11-29T00:00:00Z","updated":"2016-07-26T00:00:00Z","length":"360","host":"","isolation_source":"","bioproject_accession":"","biosample_accession":"","sra_accession":"","title":"Dengue virus type 2 genomic RNA for 
NS1 protein N-term","authors":"Biedrzycka,A., Cauchi,M.R., Bartholomeusz,A., Gorman,J.J., Wright,P.J.","submitting_organization":"","publications":"2952760","sequence":"ACAACAATGAGGGGAGCGAAGAGAATGGCCATTTTAGGTGACACAGCTTGGGATTTTGGATCCCTGGGAGGAGTGTTTACATCTATAGGAAAGGCTCTCCACCAAGTTTTCGGAGCAATCTATGGGGCTGCCTTCAGTGGGGTCTCATGGACTATGAAAATCCTCATAGGAGTCATTATCACATGGATAGGAATGAATTCACGCAGCACCTCACTTTCTGTGTCACTAGTATTGGTGGGAGTCGTGACGCTGTATTTGGGAGTTATGGTGCAGGCCGATAGTGGTTGCGTTGTGAGCTGGAAAAACAAAGAACTGAAGTGTGGCAGTGGGATTTTCATCACAGACAACGTGCACACATGG"} - {"genbank_accession":"X05377","genbank_accession_rev":"X05377.1","database":"GenBank","strain":"","region":"","location":"","collected":"","submitted":"1987-11-29T00:00:00Z","updated":"2016-07-26T00:00:00Z","length":"360","host":"","isolation_source":"","bioproject_accession":"","biosample_accession":"","sra_accession":"","title":"Dengue virus type 2 genomic RNA for NS3 protein N-term","authors":"Biedrzycka,A., Cauchi,M.R., Bartholomeusz,A., Gorman,J.J., Wright,P.J.","submitting_organization":"","publications":"2952760","sequence":"CTCACTGTGTGCTACGTGCTCACTGGACGATCGGCCGATTTGGAACTGGAGAGAGCCGCCGATGTCAAATGGGAAGATCAGGCAGAGATATCAGGAAGCAGTCCAATCCTGTCAATAACAATATCAGAAGATGGTAGCATGTCGATAAAAAACGAAGAGGAAGAACAAACACTGACCATACTCATTAGAACAGGATTGCTGGTGATCTCAGGACTTTTTCCTGTATCAATACCAATCACGGCAGCAGCATGGTACCTGTGGGAAGTGAAGAAACAACGGGCTGGAGTATTGTGGGATGTCCCTTCACCCCCACCCGTGGGAAAGGCTGAACTGGAAGATGGAGCCTATAGAATCAAGCAA"} - {"genbank_accession":"X05378","genbank_accession_rev":"X05378.1","database":"GenBank","strain":"","region":"","location":"","collected":"","submitted":"1987-11-29T00:00:00Z","updated":"2016-07-26T00:00:00Z","length":"360","host":"","isolation_source":"","bioproject_accession":"","biosample_accession":"","sra_accession":"","title":"Dengue virus type 2 genomic RNA for NS5 protein N-term","authors":"Biedrzycka,A., Cauchi,M.R., Bartholomeusz,A., Gorman,J.J., 
Wright,P.J.","submitting_organization":"","publications":"2952760","sequence":"GATCCAATACCCTATGATCCAAAGTTTGAAAAGCAGTTGGGACAAGTAATGCTCCTAGTCCTCTGCGGGACTCAAGTGTTGATGATGAGGACTACATGGGCTCTGTGTGAGGCTTTAACCTTAGCGACCGGGCCTATCTCCACATTGTGGGAAGGAAATCCAGGGAGGTTTTGGAACACTACCATTGCAGTGTCAATGGCTAACATTTTTAGAGGGAGTTACTTGGCCGGAGCTGGACTTCTCTTTTCCATCATGAAGAACACAACCAACACGAGAAGGGGAACTGGCAACATAGGAGAGACGCTTGGAGAGAAATGGAAAAGCCGATTGAACGCATTGGGGAAAAGTGAATTCCAGATC"} diff --git a/tests/fetch-from-ncbi-virus/invalid-taxon-id.t b/tests/fetch-from-ncbi-virus/invalid-taxon-id.t deleted file mode 100644 index 7a0d522..0000000 --- a/tests/fetch-from-ncbi-virus/invalid-taxon-id.t +++ /dev/null @@ -1,4 +0,0 @@ -Fetch from an invalid Taxon ID without any additional options. -This should not error nor return any output. - - $ $TESTDIR/../../fetch-from-ncbi-virus INVALID_TAXID nextstrain/ingest From 9355a5d67c79d015884e027bafedf09635b62e2c Mon Sep 17 00:00:00 2001 From: Jover Date: Thu, 14 Sep 2023 11:14:36 -0700 Subject: [PATCH 2/3] CI: Update workflow triggers Trim back CI jobs for reasons stated in https://github.com/nextstrain/cli/commit/fab709a2f45fc74afe2a15038e877e4dd58ae222 --- .github/workflows/ci.yaml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index bbf40f7..9766db3 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1,9 +1,11 @@ name: CI on: - - push - - pull_request - - workflow_dispatch + push: + branches: + - main + pull_request: + workflow_dispatch: jobs: shellcheck: From 6e955d7f6028f3621fe063ce7f349b84eecd2ca7 Mon Sep 17 00:00:00 2001 From: Jover Date: Thu, 14 Sep 2023 11:14:55 -0700 Subject: [PATCH 3/3] CI: remove Cram tests for now The only tests we had were for fetch-from-ncbi-virus, which were removed in 6a76d0b2f42eacb907b6c8344cf4fc93b4031164 --- .github/workflows/ci.yaml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/ci.yaml 
b/.github/workflows/ci.yaml index 9766db3..610bbe0 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -13,11 +13,3 @@ jobs: steps: - uses: actions/checkout@v3 - uses: nextstrain/.github/actions/shellcheck@master - - cram: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - - run: pip install cram - - run: cram tests/