diff --git a/ingest/bin/csv-to-ndjson.py b/ingest/bin/csv-to-ndjson.py deleted file mode 100755 index 8c0bb27..0000000 --- a/ingest/bin/csv-to-ndjson.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python3 -""" -Copied from "bin/csv-to-ndjson" in nextstrain/ncov-ingest: -https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/csv-to-ndjson - -Convert CSV on stdin to NDJSON on stdout. -""" -import csv -import json -import argparse - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description="find where sequences are glycosylated", - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--input", required=True, help="csv file") - parser.add_argument("--output", required=True, help="ndjson file") - - args = parser.parse_args() - -# 200 MiB; default is 128 KiB -csv.field_size_limit(200 * 1024 * 1024) - -with open(args.input) as file: - with open(args.output, 'w') as output_file: - for row in csv.DictReader(file): - json.dump(row, output_file, allow_nan = False, indent = None, separators = ',:') - output_file.write("\n") diff --git a/ingest/bin/genbank-url b/ingest/bin/genbank-url deleted file mode 100755 index 6ca11d2..0000000 --- a/ingest/bin/genbank-url +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 - -from urllib.parse import urlencode -import yaml -from yaml import Loader - -with open('config/config.yaml', 'r') as f: - config = yaml.load(f, Loader=Loader) - - if config['rsv'] == 'B': - taxonid = [208895] - elif config['rsv'] == 'A': - taxonid = [208893] - -for i in taxonid: - endpoint = "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/" - params = { - # Search criteria - 'fq': [ - '{!tag=SeqType_s}SeqType_s:("Nucleotide")', # Nucleotide sequences (as opposed to protein) - f'VirusLineageId_ss:({i})', # NCBI Taxon id for RSV - ], - - # Unclear, but seems necessary. - 'q': '*:*', - - # Response format - 'cmd': 'download', - 'dlfmt': 'csv', - 'fl': ','.join( - ':'.join(names) for names in [ - # Pairs of (output column name, source data field). - ('genbank_accession', 'id'), - ('genbank_accession_rev', 'AccVer_s'), - ('database', 'SourceDB_s'), - ('strain', 'Isolate_s'), - ('region', 'Region_s'), - ('location', 'CountryFull_s'), - ('collected', 'CollectionDate_s'), - ('submitted', 'CreateDate_dt'), - ('length', 'SLen_i'), - ('host', 'Host_s'), - ('isolation_source', 'Isolation_csv'), - ('bioproject_accession', 'BioProject_s'), - ('biosample_accession', 'BioSample_s'), - ('sra_accession', 'SRALink_csv'), - ('title', 'Definition_s'), - ('authors', 'Authors_csv'), - ('publications', 'PubMed_csv'), - ('sequence', 'Nucleotide_seq'), - ] - ), - - # Stable sort with newest last so diffs work nicely. Columns are source - # data fields, not our output columns. - 'sort': 'SourceDB_s desc, CollectionDate_s asc, id asc', - - # This isn't Entrez, but include the same email parameter it requires just - # to be nice. - 'email': 'hello@nextstrain.org', - } - query = urlencode(params, doseq = True, encoding = "utf-8") - - print(f"{endpoint}?{query}") diff --git a/ingest/vendored/.cramrc b/ingest/vendored/.cramrc new file mode 100644 index 0000000..153d20f --- /dev/null +++ b/ingest/vendored/.cramrc @@ -0,0 +1,3 @@ +[cram] +shell = /bin/bash +indent = 2 diff --git a/ingest/vendored/.github/workflows/ci.yaml b/ingest/vendored/.github/workflows/ci.yaml index dcb3b89..c6a218a 100644 --- a/ingest/vendored/.github/workflows/ci.yaml +++ b/ingest/vendored/.github/workflows/ci.yaml @@ -1,9 +1,11 @@ name: CI on: - - push - - pull_request - - workflow_dispatch + push: + branches: + - main + pull_request: + workflow_dispatch: jobs: shellcheck: @@ -11,3 +13,11 @@ jobs: steps: - uses: actions/checkout@v3 - uses: nextstrain/.github/actions/shellcheck@master + + cram: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + - run: pip install cram + - run: cram tests/ \ No newline at end of file diff --git a/ingest/vendored/.gitrepo b/ingest/vendored/.gitrepo index b530e64..9702ba8 100644 --- a/ingest/vendored/.gitrepo +++ b/ingest/vendored/.gitrepo @@ -6,7 +6,7 @@ [subrepo] remote = https://github.com/nextstrain/ingest branch = main - commit = 1eb8b30428d5f66adac201f0a246a7ab4bdc9792 - parent = 9f6b59f1ce418d9e5bdd1c4e0bbf5a070d15072e + commit = c02fa8120edc3a831d5c9ab16a119f1866c300e3 + parent = 405a8ec814cddcbf0246977559c7690e077d4fbf method = merge cmdver = 0.4.6 diff --git a/ingest/vendored/README.md b/ingest/vendored/README.md index 0311a55..008ec43 100644 --- a/ingest/vendored/README.md +++ b/ingest/vendored/README.md @@ -25,6 +25,24 @@ Any future updates of ingest scripts can be pulled in with: git subrepo pull ingest/vendored ``` +> **Warning** +> Beware of rebasing/dropping the parent commit of a `git subrepo` update + +`git subrepo` relies on metadata in the `ingest/vendored/.gitrepo` file, +which includes the hash for the parent commit in the pathogen repos. +If this hash no longer exists in the commit history, there will be errors when +running future `git subrepo pull` commands. + +If you run into an error similar to the following: +``` +$ git subrepo pull ingest/vendored +git-subrepo: Command failed: 'git branch subrepo/ingest/vendored '. +fatal: not a valid object name: '' +``` +Check the parent commit hash in the `ingest/vendored/.gitrepo` file and make +sure the commit exists in the commit history. Update to the appropriate parent +commit hash if needed. + ## History Much of this tooling originated in @@ -69,6 +87,13 @@ Scripts for supporting ingest workflow automation that don’t really belong in - [trigger-on-new-data](trigger-on-new-data) - Triggers downstream GitHub Actions if the provided `upload-to-s3` outputs do not contain the `identical_file_message` A hacky way to ensure that we only trigger downstream phylogenetic builds if the S3 objects have been updated. +NCBI interaction scripts that are useful for fetching public metadata and sequences. + +- [fetch-from-ncbi-entrez](fetch-from-ncbi-entrez) - Fetch metadata and nucleotide sequences from [NCBI Entrez](https://www.ncbi.nlm.nih.gov/books/NBK25501/) and output to a GenBank file. + Useful for pathogens with metadata and annotations in custom fields that are not part of the standard [NCBI Datasets](https://www.ncbi.nlm.nih.gov/datasets/) outputs. + +Historically, some pathogen repos used the undocumented NCBI Virus API through [fetch-from-ncbi-virus](https://github.com/nextstrain/ingest/blob/c97df238518171c2b1574bec0349a55855d1e7a7/fetch-from-ncbi-virus) to fetch data. However we've opted to drop the NCBI Virus scripts due to https://github.com/nextstrain/ingest/issues/18. + Potential Nextstrain CLI scripts - [sha256sum](sha256sum) - Used to check if files are identical in upload-to-s3 and download-from-s3 scripts. @@ -89,3 +114,17 @@ Potential augur curate scripts - [transform-authors](transform-authors) - Abbreviates full author lists to ' et al.' - [transform-field-names](transform-field-names) - Rename fields of NDJSON records - [transform-genbank-location](transform-genbank-location) - Parses `location` field with the expected pattern `"[:][, ]"` based on [GenBank's country field](https://www.ncbi.nlm.nih.gov/genbank/collab/country/) +- [transform-strain-names](transform-strain-names) - Ordered search for strain names across several fields. + +## Software requirements + +Some scripts may require Bash ≥4. If you are running these scripts on macOS, the builtin Bash (`/bin/bash`) does not meet this requirement. You can install [Homebrew's Bash](https://formulae.brew.sh/formula/bash) which is more up to date. + +## Testing + +Most scripts are untested within this repo, relying on "testing in production". That is the only practical testing option for some scripts such as the ones interacting with S3 and Slack. + +For more locally testable scripts, Cram-style functional tests live in `tests` and are run as part of CI. To run these locally, + +1. Download Cram: `pip install cram` +2. Run the tests: `cram tests/` diff --git a/ingest/vendored/cloudfront-invalidate b/ingest/vendored/cloudfront-invalidate index dec4852..dbea398 100755 --- a/ingest/vendored/cloudfront-invalidate +++ b/ingest/vendored/cloudfront-invalidate @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Originally from @tsibley's gist: https://gist.github.com/tsibley/a66262d341dedbea39b02f27e2837ea8 set -euo pipefail diff --git a/ingest/vendored/download-from-s3 b/ingest/vendored/download-from-s3 index 44f7ff3..4981186 100755 --- a/ingest/vendored/download-from-s3 +++ b/ingest/vendored/download-from-s3 @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -euo pipefail bin="$(dirname "$0")" diff --git a/ingest/vendored/fetch-from-ncbi-entrez b/ingest/vendored/fetch-from-ncbi-entrez new file mode 100755 index 0000000..194a0c8 --- /dev/null +++ b/ingest/vendored/fetch-from-ncbi-entrez @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +""" +Fetch metadata and nucleotide sequences from NCBI Entrez and output to a GenBank file. +""" +import json +import argparse +from Bio import SeqIO, Entrez + +# To use the efetch API, the docs indicate only around 10,000 records should be fetched per request +# https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch +# However, in my testing with HepB, the max records returned was 9,999 +# - Jover, 16 August 2023 +BATCH_SIZE = 9999 + +Entrez.email = "hello@nextstrain.org" + +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('--term', required=True, type=str, + help='Genbank search term. Replace spaces with "+", e.g. "Hepatitis+B+virus[All+Fields]complete+genome[All+Fields]"') + parser.add_argument('--output', required=True, type=str, help='Output file (Genbank)') + return parser.parse_args() + + +def get_esearch_history(term): + """ + Search for the provided *term* via ESearch and store the results using the + Entrez history server.¹ + + Returns the total count of returned records, query key, and web env needed + to access the records from the server. + + ¹ https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Using_the_Entrez_History_Server + """ + handle = Entrez.esearch(db="nucleotide", term=term, retmode="json", usehistory="y", retmax=0) + esearch_result = json.loads(handle.read())['esearchresult'] + print(f"Search term {term!r} returned {esearch_result['count']} IDs.") + return { + "count": int(esearch_result["count"]), + "query_key": esearch_result["querykey"], + "web_env": esearch_result["webenv"] + } + + +def fetch_from_esearch_history(count, query_key, web_env): + """ + Fetch records in batches from Entrez history server using the provided + *query_key* and *web_env* and yields them as a BioPython SeqRecord iterator. + """ + print(f"Fetching GenBank records in batches of n={BATCH_SIZE}") + + for start in range(0, count, BATCH_SIZE): + handle = Entrez.efetch( + db="nucleotide", + query_key=query_key, + webenv=web_env, + retstart=start, + retmax=BATCH_SIZE, + rettype="gb", + retmode="text") + + yield SeqIO.parse(handle, "genbank") + + +if __name__=="__main__": + args = parse_args() + + with open(args.output, "w") as output_handle: + for batch_results in fetch_from_esearch_history(**get_esearch_history(args.term)): + SeqIO.write(batch_results, output_handle, "genbank") diff --git a/ingest/vendored/notify-on-diff b/ingest/vendored/notify-on-diff index c304d6b..ddbe7da 100755 --- a/ingest/vendored/notify-on-diff +++ b/ingest/vendored/notify-on-diff @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -euo pipefail diff --git a/ingest/vendored/notify-on-job-fail b/ingest/vendored/notify-on-job-fail index 02cb6ba..7dd2409 100755 --- a/ingest/vendored/notify-on-job-fail +++ b/ingest/vendored/notify-on-job-fail @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -euo pipefail : "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" diff --git a/ingest/vendored/notify-on-job-start b/ingest/vendored/notify-on-job-start index 3e44bb0..1c8ce7d 100755 --- a/ingest/vendored/notify-on-job-start +++ b/ingest/vendored/notify-on-job-start @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -euo pipefail : "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" diff --git a/ingest/vendored/notify-on-record-change b/ingest/vendored/notify-on-record-change index c0bf8f7..f424252 100755 --- a/ingest/vendored/notify-on-record-change +++ b/ingest/vendored/notify-on-record-change @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -euo pipefail : "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" diff --git a/ingest/vendored/notify-slack b/ingest/vendored/notify-slack index db98bfb..a343435 100755 --- a/ingest/vendored/notify-slack +++ b/ingest/vendored/notify-slack @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -euo pipefail : "${SLACK_TOKEN:?The SLACK_TOKEN environment variable is required.}" diff --git a/ingest/vendored/s3-object-exists b/ingest/vendored/s3-object-exists index faac421..679c20a 100755 --- a/ingest/vendored/s3-object-exists +++ b/ingest/vendored/s3-object-exists @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -euo pipefail url="${1#s3://}" diff --git a/ingest/vendored/tests/transform-strain-names/transform-strain-names.t b/ingest/vendored/tests/transform-strain-names/transform-strain-names.t new file mode 100644 index 0000000..1c05df7 --- /dev/null +++ b/ingest/vendored/tests/transform-strain-names/transform-strain-names.t @@ -0,0 +1,17 @@ +Look for strain name in "strain" or a list of backup fields. + +If strain entry exists, do not do anything. + + $ echo '{"strain": "i/am/a/strain", "strain_s": "other"}' \ + > | $TESTDIR/../../transform-strain-names \ + > --strain-regex '^.+$' \ + > --backup-fields strain_s accession + {"strain":"i/am/a/strain","strain_s":"other"} + +If strain entry does not exists, search the backup fields + + $ echo '{"strain_s": "other"}' \ + > | $TESTDIR/../../transform-strain-names \ + > --strain-regex '^.+$' \ + > --backup-fields accession strain_s + {"strain_s":"other","strain":"other"} \ No newline at end of file diff --git a/ingest/bin/transform-strain-names b/ingest/vendored/transform-strain-names similarity index 98% rename from ingest/bin/transform-strain-names rename to ingest/vendored/transform-strain-names index 027f18a..d86c0e4 100755 --- a/ingest/bin/transform-strain-names +++ b/ingest/vendored/transform-strain-names @@ -40,6 +40,7 @@ if __name__ == '__main__': for field in args.backup_fields: if record.get(field): record['strain'] = str(record[field]) + break if record['strain'] == '': print(f"WARNING: Record number {index} has an empty string as the strain name.", file=stderr) diff --git a/ingest/vendored/trigger b/ingest/vendored/trigger index 11d1b63..586f9cc 100755 --- a/ingest/vendored/trigger +++ b/ingest/vendored/trigger @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -euo pipefail : "${PAT_GITHUB_DISPATCH:=}" diff --git a/ingest/vendored/trigger-on-new-data b/ingest/vendored/trigger-on-new-data index ef71d88..470d2f4 100755 --- a/ingest/vendored/trigger-on-new-data +++ b/ingest/vendored/trigger-on-new-data @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -euo pipefail : "${PAT_GITHUB_DISPATCH:?The PAT_GITHUB_DISPATCH environment variable is required.}" diff --git a/ingest/vendored/upload-to-s3 b/ingest/vendored/upload-to-s3 index 31cd49b..36d171c 100755 --- a/ingest/vendored/upload-to-s3 +++ b/ingest/vendored/upload-to-s3 @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -euo pipefail bin="$(dirname "$0")" diff --git a/ingest/workflow/snakemake_rules/transform.smk b/ingest/workflow/snakemake_rules/transform.smk index d91a0ba..1dd82fc 100644 --- a/ingest/workflow/snakemake_rules/transform.smk +++ b/ingest/workflow/snakemake_rules/transform.smk @@ -68,7 +68,7 @@ rule transform: | ./vendored/transform-field-names \ --field-map {params.field_map} \ | augur curate normalize-strings \ - | ./bin/transform-strain-names \ + | ./vendored/transform-strain-names \ --strain-regex {params.strain_regex} \ --backup-fields {params.strain_backup_fields} \ | augur curate format-dates \