fixup! Fix: Multiple header lines in the genbank tsv file

nextstrain · Aug 22, 2023 · 93082c7 · 93082c7
1 parent e2d8104
commit 93082c7
Show file tree

Hide file tree

Showing 4 changed files with 88 additions and 37 deletions.
diff --git a/ingest/bin/csv-to-ndjson b/ingest/bin/csv-to-ndjson
@@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+"""
+Copied from "bin/csv-to-ndjson" in nextstrain/ncov-ingest:
+https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/csv-to-ndjson
+
+Convert CSV on stdin to NDJSON on stdout.
+"""
+import csv
+import json
+from sys import stdin, stdout
+
+# Raise the per-field cap to 200 MiB (the csv module default is 128 KiB).
+csv.field_size_limit(200 * 1024 * 1024)
+
+# Emit one compact JSON object per CSV row, newline-terminated (NDJSON).
+for record in csv.DictReader(stdin):
+    stdout.write(json.dumps(record, allow_nan=False, indent=None, separators=(',', ':')) + "\n")
diff --git a/ingest/bin/fetch-from-genbank b/ingest/bin/fetch-from-genbank
@@ -0,0 +1,22 @@
+#!/bin/bash
+#
+# Originally copied from "bin/fetch-from-genbank" in nextstrain/ncov-ingest:
+#   https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/fetch-from-genbank
+#
+set -euo pipefail
+
+bin="$(dirname "$0")"
+
+
+main() {  # usage: fetch-from-genbank <ncbi-taxon-id>; writes NDJSON to stdout
+    local taxon_id="${1:?NCBI taxon id is required.}"
+    fetch "$taxon_id" | "$bin"/csv-to-ndjson
+}
+
+fetch() {  # download the NCBI Virus CSV export for taxon id $1 to stdout
+    curl "$("$bin"/genbank-url --ncbi-taxon-id "$1")" \
+        --fail --silent --show-error --http1.1 \
+        --header 'User-Agent: https://github.com/nextstrain/rsv ([email protected])'
+}
+
+main "$@"
diff --git a/ingest/bin/genbank-url b/ingest/bin/genbank-url
@@ -1,24 +1,43 @@
 #!/usr/bin/env python3
+"""
+Generate URL to download all pathogen sequences and their curated metadata
+from GenBank via NCBI Virus.
 
-from urllib.parse import urlencode
-import yaml
-from yaml import Loader
+The URL this program builds is based on the URL for pathogen constructed with
+
+    https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/genbank-url
+
+and observing the network activity at
 
-with open('config/config.yaml', 'r') as f:
-    config = yaml.load(f, Loader=Loader)
+    https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&VirusLineage_ss=Monkeypox%20virus,%20taxid:10244
+"""
+from urllib.parse import urlencode
+import argparse
 
-    if config['rsv'] == 'B':
-        taxonid = [208895]
-    elif config['rsv'] == 'A':
-        taxonid = [208893]
+def parse_args():
+    """Parse the command line; --ncbi-taxon-id is mandatory and has no default."""
+    parser = argparse.ArgumentParser(
+        description="Given an NCBI taxon ID, generate URL to download "
+        "all viral sequences and their curated metadata from GenBank via NCBI Virus."
+    )
+    parser.add_argument(
+        "--ncbi-taxon-id",
+        help="NCBI Taxon ID.",
+        required=True,
+    )
+    return parser.parse_args()
 
-for i in taxonid:
+def build_query_url(ncbi_taxon_id: str):
+    """
+    Generate URL to download all viral sequences and their curated metadata
+    from GenBank via NCBI Virus.
+    """
     endpoint = "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/"
     params = {
         # Search criteria
         'fq': [
             '{!tag=SeqType_s}SeqType_s:("Nucleotide")', # Nucleotide sequences (as opposed to protein)
-            f'VirusLineageId_ss:({i})',                # NCBI Taxon id for RSV
+            f'VirusLineageId_ss:({ncbi_taxon_id})',     # NCBI Taxon id for Pathogen
         ],
 
         # Unclear, but seems necessary.
@@ -62,3 +81,10 @@ for i in taxonid:
     query = urlencode(params, doseq = True, encoding = "utf-8")
 
     print(f"{endpoint}?{query}")
+
+def main():
+    """Print the NCBI Virus download URL for the taxon ID given on the command line."""
+    build_query_url(parse_args().ncbi_taxon_id)
+
+if __name__ == '__main__':
+    main()
diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk
@@ -13,42 +13,28 @@ Produces final output as
 
 """
 
+def download_subtype(wildcards):
+    """Map the `subtype` wildcard to its NCBI taxon ID (KeyError if unknown)."""
+    return {
+        'a': '208893',
+        'b': '208895',
+        'general': '11250',
+    }[wildcards.subtype]
+
 rule fetch_from_genbank:
     output:
-        csv = "data/genbank.csv"
+        genbank_ndjson="data/genbank_{subtype}.ndjson",
     params:
-        URL_a = config['fetch']['genbank_url']['a'],
-        URL_b = config['fetch']['genbank_url']['b'],
-        URL_general = config['fetch']['genbank_url']['general']
-    shell:
-        """
-        curl "{params.URL_a}" --fail --silent --show-error --http1.1 \
-             --header 'User-Agent: https://github.com/nextstrain/rsv ([email protected])' >> {output}_a
-        curl "{params.URL_b}" --fail --silent --show-error --http1.1 \
-             --header 'User-Agent: https://github.com/nextstrain/rsv ([email protected])' >> {output}_b
-        curl "{params.URL_general}" --fail --silent --show-error --http1.1 \
-             --header 'User-Agent: https://github.com/nextstrain/rsv ([email protected])' >> {output}_general
-
-        tsv-append -H {output}_a {output}_b {output}_general > {output}
-        rm {output}_a {output}_b {output}_general
-        """
-
-rule csv_to_ndjson:
-    input:
-        csv = rules.fetch_from_genbank.output.csv
-    output:
-        ndjson = "data/genbank.ndjson"
+        ncbi_taxon_id=download_subtype,
     shell:
         """
-        python bin/csv-to-ndjson.py \
-            --input {input.csv} \
-            --output {output.ndjson}
+        ./bin/fetch-from-genbank {params.ncbi_taxon_id} > {output.genbank_ndjson}
         """
 
 
 rule fetch_all_sequences:
     input:
-        all_sources = "data/genbank.ndjson"
+        all_sources = expand("data/genbank_{subtype}.ndjson", subtype=["a", "b", "general"]),
     output:
         sequences_ndjson = "data/sequences.ndjson"
     shell: