Skip to content

Commit

Permalink
fixup! Fix: Multiple header lines in the genbank tsv file
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 committed Aug 22, 2023
1 parent e2d8104 commit 93082c7
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 37 deletions.
17 changes: 17 additions & 0 deletions ingest/bin/csv-to-ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env python3
"""
Copied from "bin/csv-to-ndjson" in nextstrain/ncov-ingest:
https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/csv-to-ndjson
Convert CSV on stdin to NDJSON on stdout.
"""
import csv
import json
from sys import stdin, stdout

# Raise the csv module's per-field size cap to 200 MiB (the default is 128 KiB).
csv.field_size_limit(200 * 1024 * 1024)

# Emit each CSV record as one compact JSON object per line (NDJSON).
for record in csv.DictReader(stdin):
    encoded = json.dumps(record, allow_nan=False, indent=None, separators=',:')
    stdout.write(encoded + "\n")
22 changes: 22 additions & 0 deletions ingest/bin/fetch-from-genbank
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
#
# Originally copied from "bin/fetch-from-genbank" in nextstrain/ncov-ingest:
# https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/fetch-from-genbank
#
# Exit on any command failure, on use of an unset variable, and when any stage
# of a pipeline fails.
set -euo pipefail

# Directory containing this script; the sibling helper scripts are run from it.
bin="$(dirname "$0")"


main() {
    # Abort with the message below when no taxon id argument is supplied.
    local taxon_id="${1:?NCBI taxon id is required.}"

    # Stream the download for that taxon through the CSV -> NDJSON converter.
    fetch "$taxon_id" | "$bin"/csv-to-ndjson
}

# Download the records for the NCBI taxon id given as $1 and write the
# response body to stdout.
fetch() {
    # Resolve the URL in its own assignment. When a command substitution is
    # used directly as a curl argument, a failure of genbank-url is not caught
    # by `set -e` and curl is invoked with an empty URL. Declaring `local`
    # separately keeps the assignment's exit status from being masked.
    local url
    url="$("$bin"/genbank-url --ncbi-taxon-id "$1")"

    # NOTE(review): the User-Agent still names the monkeypox repository;
    # confirm whether it should identify this repository instead.
    curl "$url" \
        --fail --silent --show-error --http1.1 \
        --header 'User-Agent: https://github.com/nextstrain/monkeypox ([email protected])'
}

# Run main with all command-line arguments passed to this script.
main "$@"
48 changes: 37 additions & 11 deletions ingest/bin/genbank-url
Original file line number Diff line number Diff line change
@@ -1,24 +1,43 @@
#!/usr/bin/env python3
"""
Generate URL to download all pathogen sequences and their curated metadata
from GenBank via NCBI Virus.
from urllib.parse import urlencode
import yaml
from yaml import Loader
The URL this program builds is based on the URL for pathogen constructed with
https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/genbank-url
and observing the network activity at
with open('config/config.yaml', 'r') as f:
config = yaml.load(f, Loader=Loader)
https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&VirusLineage_ss=Monkeypox%20virus,%20taxid:10244
"""
from urllib.parse import urlencode
import argparse

if config['rsv'] == 'B':
taxonid = [208895]
elif config['rsv'] == 'A':
taxonid = [208893]
def parse_args():
    """Parse the command-line arguments of this script.

    Returns:
        argparse.Namespace with a single attribute, ``ncbi_taxon_id``, holding
        the value passed to ``--ncbi-taxon-id`` as a string.

    Exits with argparse's usage error when ``--ncbi-taxon-id`` is missing.
    """
    parser = argparse.ArgumentParser(
        description="Given an NCBI taxon ID, generate URL to download "
        "all viral sequences and their curated metadata from GenBank via NCBI Virus."
    )
    # The option is required, so no default is declared: argparse never uses
    # the default of a required option, and a stale value here only misleads.
    parser.add_argument(
        "--ncbi-taxon-id",
        help="NCBI Taxon ID.",
        required=True,
    )
    return parser.parse_args()

for i in taxonid:
def build_query_url(ncbi_taxon_id: str):
"""
Generate URL to download all viral sequences and their curated metadata
from GenBank via NCBI Virus.
"""
endpoint = "https://www.ncbi.nlm.nih.gov/genomes/VirusVariation/vvsearch2/"
params = {
# Search criteria
'fq': [
'{!tag=SeqType_s}SeqType_s:("Nucleotide")', # Nucleotide sequences (as opposed to protein)
f'VirusLineageId_ss:({i})', # NCBI Taxon id for RSV
f'VirusLineageId_ss:({ncbi_taxon_id})', # NCBI Taxon id for Pathogen
],

# Unclear, but seems necessary.
Expand Down Expand Up @@ -62,3 +81,10 @@ for i in taxonid:
query = urlencode(params, doseq = True, encoding = "utf-8")

print(f"{endpoint}?{query}")

def main():
    """Parse the command line and print the download URL for the given taxon ID."""
    cli_args = parse_args()
    taxon_id = cli_args.ncbi_taxon_id
    build_query_url(taxon_id)


if __name__ == '__main__':
    main()
38 changes: 12 additions & 26 deletions ingest/workflow/snakemake_rules/fetch_sequences.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,42 +13,28 @@ Produces final output as
"""

def download_subtype(wildcards):
    """Return the NCBI taxon ID string for the subtype in ``wildcards.subtype``.

    Raises KeyError when the subtype is not one of 'a', 'b' or 'general'.
    """
    taxon_id_by_subtype = dict(a='208893', b='208895', general='11250')
    return taxon_id_by_subtype[wildcards.subtype]

rule fetch_from_genbank:
output:
csv = "data/genbank.csv"
genbank_ndjson="data/genbank_{subtype}.ndjson",
params:
URL_a = config['fetch']['genbank_url']['a'],
URL_b = config['fetch']['genbank_url']['b'],
URL_general = config['fetch']['genbank_url']['general']
shell:
"""
curl "{params.URL_a}" --fail --silent --show-error --http1.1 \
--header 'User-Agent: https://github.com/nextstrain/rsv ([email protected])' >> {output}_a
curl "{params.URL_b}" --fail --silent --show-error --http1.1 \
--header 'User-Agent: https://github.com/nextstrain/rsv ([email protected])' >> {output}_b
curl "{params.URL_general}" --fail --silent --show-error --http1.1 \
--header 'User-Agent: https://github.com/nextstrain/rsv ([email protected])' >> {output}_general
tsv-append -H {output}_a {output}_b {output}_general > {output}
rm {output}_a {output}_b {output}_general
"""

rule csv_to_ndjson:
input:
csv = rules.fetch_from_genbank.output.csv
output:
ndjson = "data/genbank.ndjson"
ncbi_taxon_id=download_subtype,
shell:
"""
python bin/csv-to-ndjson.py \
--input {input.csv} \
--output {output.ndjson}
./bin/fetch-from-genbank {params.ncbi_taxon_id} > {output.genbank_ndjson}
"""


rule fetch_all_sequences:
input:
all_sources = "data/genbank.ndjson"
all_sources = expand("data/genbank_{subtype}.ndjson", subtype=["a", "b", "general"]),
output:
sequences_ndjson = "data/sequences.ndjson"
shell:
Expand Down

0 comments on commit 93082c7

Please sign in to comment.