-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fixup! Fix: Multiple header lines in the genbank tsv file
- Loading branch information
Showing
4 changed files
with
88 additions
and
37 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#!/usr/bin/env python3
"""
Copied from "bin/csv-to-ndjson" in nextstrain/ncov-ingest:
https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/csv-to-ndjson

Convert CSV on stdin to NDJSON on stdout.
"""
import csv
import json
from sys import stdin, stdout


# 200 MiB; default is 128 KiB
csv.field_size_limit(200 * 1024 * 1024)


def convert(csv_lines, out):
    """Write each CSV record from *csv_lines* to *out* as one compact JSON line.

    The first CSV row is used as the header, so every record becomes a JSON
    object keyed by column name.

    Args:
        csv_lines: Iterable of text lines (e.g. an open text file) holding CSV.
        out: Writable text stream that receives the NDJSON output.

    Raises:
        ValueError: If a value cannot be encoded as strict JSON
            (``allow_nan=False``).
    """
    for row in csv.DictReader(csv_lines):
        # Compact separators keep each record on a single, minimal line.
        json.dump(row, out, allow_nan=False, indent=None, separators=(",", ":"))
        out.write("\n")


if __name__ == "__main__":
    convert(stdin, stdout)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/bin/bash
#
# Originally copied from "bin/fetch-from-genbank" in nextstrain/ncov-ingest:
#   https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/fetch-from-genbank
#
# Usage: fetch-from-genbank <ncbi-taxon-id>
#
# Downloads the CSV served at the URL printed by the sibling `genbank-url`
# helper for the given NCBI taxon id, and writes it to stdout as NDJSON via
# the sibling `csv-to-ndjson` helper.
set -euo pipefail

# Directory containing this script; sibling helpers are resolved relative to it.
bin="$(dirname "$0")"


# Entry point. $1 is the NCBI taxon id; the script aborts with an error
# message when it is missing or empty.
main() {
    local ncbi_taxon_id="${1:?NCBI taxon id is required.}"
    fetch "$ncbi_taxon_id" | "$bin"/csv-to-ndjson
}

# Download the CSV for NCBI taxon id $1 and write it to stdout.
fetch() {
    local url

    # Resolve the URL in its own statement: a failing command substitution
    # embedded in curl's argument list is not caught by `set -e`, which would
    # let curl run with an empty URL and hide the real error. `local` is kept
    # on a separate line because `local url="$(...)"` would mask the status.
    url="$("$bin"/genbank-url --ncbi-taxon-id "$1")" || return

    # NOTE(review): the User-Agent still names nextstrain/monkeypox, presumably
    # left over from the repository this script was copied from — confirm.
    curl "$url" \
        --fail --silent --show-error --http1.1 \
        --header 'User-Agent: https://github.com/nextstrain/monkeypox ([email protected])'
}

main "$@"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,42 +13,28 @@ Produces final output as | |
""" | ||
|
||
def download_subtype(wildcards):
    """Return the NCBI taxon id (as a string) for the requested subtype.

    Used as a Snakemake params function: ``wildcards.subtype`` selects one of
    the keys below.

    Raises:
        KeyError: If ``wildcards.subtype`` is not a known subtype; the message
            lists the accepted values.
    """
    taxon_ids = {
        'a': '208893',
        'b': '208895',
        'general': '11250',
    }
    try:
        return taxon_ids[wildcards.subtype]
    except KeyError:
        # Same exception type as a plain lookup, but say what was expected.
        raise KeyError(
            f"unknown subtype {wildcards.subtype!r}; "
            f"expected one of {sorted(taxon_ids)}"
        ) from None
|
||
# NOTE(review): the lines below appear to fuse the removed and the added side
# of this diff hunk without +/- markers, so they do not form a valid rule as
# shown — TODO confirm against the actual commit.
# Presumably the post-commit `fetch_from_genbank` rule keeps only the
# `genbank_ndjson` output, the `ncbi_taxon_id=download_subtype` param and the
# `./bin/fetch-from-genbank ...` shell command, and `rule csv_to_ndjson`
# (with the three per-subtype curl calls) is what was deleted.
rule fetch_from_genbank: | ||
output: | ||
csv = "data/genbank.csv" | ||
genbank_ndjson="data/genbank_{subtype}.ndjson", | ||
params: | ||
URL_a = config['fetch']['genbank_url']['a'], | ||
URL_b = config['fetch']['genbank_url']['b'], | ||
URL_general = config['fetch']['genbank_url']['general'] | ||
shell: | ||
""" | ||
curl "{params.URL_a}" --fail --silent --show-error --http1.1 \ | ||
--header 'User-Agent: https://github.com/nextstrain/rsv ([email protected])' >> {output}_a | ||
curl "{params.URL_b}" --fail --silent --show-error --http1.1 \ | ||
--header 'User-Agent: https://github.com/nextstrain/rsv ([email protected])' >> {output}_b | ||
curl "{params.URL_general}" --fail --silent --show-error --http1.1 \ | ||
--header 'User-Agent: https://github.com/nextstrain/rsv ([email protected])' >> {output}_general | ||
tsv-append -H {output}_a {output}_b {output}_general > {output} | ||
rm {output}_a {output}_b {output}_general | ||
""" | ||
|
||
rule csv_to_ndjson: | ||
input: | ||
csv = rules.fetch_from_genbank.output.csv | ||
output: | ||
ndjson = "data/genbank.ndjson" | ||
ncbi_taxon_id=download_subtype, | ||
shell: | ||
""" | ||
python bin/csv-to-ndjson.py \ | ||
--input {input.csv} \ | ||
--output {output.ndjson} | ||
./bin/fetch-from-genbank {params.ncbi_taxon_id} > {output.genbank_ndjson} | ||
""" | ||
|
||
|
||
rule fetch_all_sequences: | ||
input: | ||
all_sources = "data/genbank.ndjson" | ||
all_sources = expand("data/genbank_{subtype}.ndjson", subtype=["a", "b", "general"]), | ||
output: | ||
sequences_ndjson = "data/sequences.ndjson" | ||
shell: | ||
|