From 238f9560de0ed41d13f391f31e1985164239b22d Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Mon, 25 Sep 2023 09:33:57 +0200 Subject: [PATCH 01/26] fix: segment wild card constraints --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 4ab9f54a..2c1c69f1 100644 --- a/Snakefile +++ b/Snakefile @@ -4,7 +4,7 @@ from treetime.utils import numeric_date wildcard_constraints: - segment = r'pb2|pb1|pa|ha|np|na|ma', + segment = r'pb2|pb1|pa|ha|np|na|mp|ns', center = r'who|cdc|crick|niid|crick|vidrl', passage = r'cell|egg', assay = r'fra|hi', From dcc74524a43cd1e8511e725723e859711badc916 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Mon, 25 Sep 2023 12:55:29 +0200 Subject: [PATCH 02/26] stub out flu nextclade dataset workflows --- nextclade/Snakefile | 347 ++++++++++++++++++ nextclade/config/auspice_config.json | 50 +++ nextclade/config/config_dict.yaml | 81 ++++ nextclade/config/pathogen.json | 41 +++ .../h1n1pdm/ha/CY121680/genemap.gff | 5 + .../h1n1pdm/ha/CY121680/reference.fasta | 2 + .../h1n1pdm/ha/CY121680/virus_properties.json | 26 ++ .../h1n1pdm/ha/MW626062/genemap.gff | 5 + .../h1n1pdm/ha/MW626062/reference.fasta | 27 ++ .../h1n1pdm/ha/MW626062/virus_properties.json | 26 ++ nextclade/dataset_config/h1n1pdm/includes.txt | 0 .../h1n1pdm/na/MW626056/genemap.gff | 4 + .../h1n1pdm/na/MW626056/reference.fasta | 25 ++ .../h1n1pdm/na/MW626056/virus_properties.json | 22 ++ .../h3n2/ha/CY163680/genemap.gff | 5 + .../h3n2/ha/CY163680/reference.fasta | 26 ++ .../h3n2/ha/CY163680/virus_properties.json | 26 ++ .../h3n2/ha/EPI1857216/genemap.gff | 5 + .../h3n2/ha/EPI1857216/reference.fasta | 23 ++ .../h3n2/ha/EPI1857216/virus_properties.json | 56 +++ nextclade/dataset_config/h3n2/includes.txt | 7 + .../h3n2/na/EPI1857215/genemap.gff | 4 + .../h3n2/na/EPI1857215/reference.fasta | 2 + .../h3n2/na/EPI1857215/virus_properties.json | 22 ++ .../vic/ha/EPI1926632/genemap.gff | 5 + .../vic/ha/EPI1926632/reference.fasta | 25 
++ .../vic/ha/EPI1926632/virus_properties.json | 26 ++ .../vic/ha/KX058884/genemap.gff | 5 + .../vic/ha/KX058884/reference.fasta | 28 ++ .../vic/ha/KX058884/virus_properties.json | 26 ++ nextclade/dataset_config/vic/includes.txt | 0 .../vic/na/CY073894/genemap.gff | 4 + .../vic/na/CY073894/reference.fasta | 25 ++ .../vic/na/CY073894/virus_properties.json | 22 ++ .../yam/ha/JN993010/genemap.gff | 5 + .../yam/ha/JN993010/reference.fasta | 2 + 36 files changed, 1010 insertions(+) create mode 100644 nextclade/Snakefile create mode 100644 nextclade/config/auspice_config.json create mode 100644 nextclade/config/config_dict.yaml create mode 100644 nextclade/config/pathogen.json create mode 100644 nextclade/dataset_config/h1n1pdm/ha/CY121680/genemap.gff create mode 100644 nextclade/dataset_config/h1n1pdm/ha/CY121680/reference.fasta create mode 100644 nextclade/dataset_config/h1n1pdm/ha/CY121680/virus_properties.json create mode 100644 nextclade/dataset_config/h1n1pdm/ha/MW626062/genemap.gff create mode 100644 nextclade/dataset_config/h1n1pdm/ha/MW626062/reference.fasta create mode 100644 nextclade/dataset_config/h1n1pdm/ha/MW626062/virus_properties.json create mode 100644 nextclade/dataset_config/h1n1pdm/includes.txt create mode 100644 nextclade/dataset_config/h1n1pdm/na/MW626056/genemap.gff create mode 100644 nextclade/dataset_config/h1n1pdm/na/MW626056/reference.fasta create mode 100644 nextclade/dataset_config/h1n1pdm/na/MW626056/virus_properties.json create mode 100644 nextclade/dataset_config/h3n2/ha/CY163680/genemap.gff create mode 100644 nextclade/dataset_config/h3n2/ha/CY163680/reference.fasta create mode 100644 nextclade/dataset_config/h3n2/ha/CY163680/virus_properties.json create mode 100644 nextclade/dataset_config/h3n2/ha/EPI1857216/genemap.gff create mode 100644 nextclade/dataset_config/h3n2/ha/EPI1857216/reference.fasta create mode 100644 nextclade/dataset_config/h3n2/ha/EPI1857216/virus_properties.json create mode 100644 
nextclade/dataset_config/h3n2/includes.txt
 create mode 100644 nextclade/dataset_config/h3n2/na/EPI1857215/genemap.gff
 create mode 100644 nextclade/dataset_config/h3n2/na/EPI1857215/reference.fasta
 create mode 100644 nextclade/dataset_config/h3n2/na/EPI1857215/virus_properties.json
 create mode 100644 nextclade/dataset_config/vic/ha/EPI1926632/genemap.gff
 create mode 100644 nextclade/dataset_config/vic/ha/EPI1926632/reference.fasta
 create mode 100644 nextclade/dataset_config/vic/ha/EPI1926632/virus_properties.json
 create mode 100644 nextclade/dataset_config/vic/ha/KX058884/genemap.gff
 create mode 100644 nextclade/dataset_config/vic/ha/KX058884/reference.fasta
 create mode 100644 nextclade/dataset_config/vic/ha/KX058884/virus_properties.json
 create mode 100644 nextclade/dataset_config/vic/includes.txt
 create mode 100644 nextclade/dataset_config/vic/na/CY073894/genemap.gff
 create mode 100644 nextclade/dataset_config/vic/na/CY073894/reference.fasta
 create mode 100644 nextclade/dataset_config/vic/na/CY073894/virus_properties.json
 create mode 100644 nextclade/dataset_config/yam/ha/JN993010/genemap.gff
 create mode 100644 nextclade/dataset_config/yam/ha/JN993010/reference.fasta

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
new file mode 100644
index 00000000..ab5819d8
--- /dev/null
+++ b/nextclade/Snakefile
@@ -0,0 +1,406 @@
+import datetime
+
+
+# Wildcard patterns shared by all rules of this workflow.
+wildcard_constraints:
+    flu_type=r"[A-Za-z0-9]+",
+    # raw string: "\d" is an invalid escape sequence in a plain literal
+    year=r"\d\d\d\d",
+    lineage=r"h3n2|h1n1pdm|vic|yam",
+    segment=r"pb2|pb1|pa|ha|np|na|mp|ns",
+    reference=r"[^_/]+",
+    # Must list the `clade_systems` keys of the config. Left unconstrained,
+    # "{clade}.json" of rule `clades` also matches muts.json and branch_lengths.json.
+    clade=r"clade|subclade|short-clade",
+
+
+def ref_config(w):
+    """Return the config entry of the reference selected by wildcards `w`.
+
+    Per-reference settings (filter, clade_offset, ...) sit below the `refs`
+    key of each lineage/segment build.
+    """
+    return config["builds"][w.lineage][w.segment]["refs"][w.reference]
+
+
+# Fetch the xz-compressed raw sequences of one lineage/segment from S3.
+rule download_sequences:
+    output:
+        sequences="data/{lineage}/raw_{segment}.fasta",
+    params:
+        s3_path="s3://nextstrain-data-private/files/workflows/seasonal-flu/{lineage}/{segment}/raw_sequences.fasta.xz",
+    conda:
+        "../../workflow/envs/nextstrain.yaml"
+    shell:
+        """
+        aws s3 cp {params.s3_path} - | xz -c -d > {output.sequences}
+        """
+
+
+# Download the clade definitions of one clade system from `clade_repo`.
+# curl --fail stops on HTTP errors instead of saving the error page as TSV.
+rule download_clades:
+    message:
+        "Downloading clade definitions for {wildcards.lineage} from {params.source_tsv} -> {output}"
+    output:
+        clade_tsv="data/{clade}_{lineage}_{segment}_{reference}_raw.tsv",
+    params:
+        source_tsv=lambda w: config["clade_repo"]
+        + config["builds"][w.lineage][w.segment]["clade_systems"][w.clade],
+    shell:
+        """
+        curl --fail --silent --show-error "{params.source_tsv}" > {output.clade_tsv}
+        """
+
+
+# Shift the positions of nucleotide entries by the reference's `clade_offset`.
+rule offset_clades:
+    input:
+        rules.download_clades.output,
+    output:
+        "data/{clade}_{lineage}_{segment}_{reference}.tsv",
+    params:
+        offset=lambda w: ref_config(w)["clade_offset"],
+    shell:
+        """
+        perl -F'\\t' -ne \
+            '$F[2]+={params.offset} if $F[1] =~ "nuc"; \
+            print join "\\t", @F' \
+            {input} \
+            >{output}
+        """
+
+
+# Split the FASTA headers into a metadata table and a plain sequence file.
+rule parse:
+    input:
+        sequences="data/{lineage}/raw_{segment}.fasta",
+    output:
+        metadata="data/{lineage}_{segment}_metadata.tsv",
+        sequences="data/{lineage}_{segment}_sequences.fasta",
+    params:
+        fields="strainName virus segment EPI_ISL date submission_date region country division location passage collecting_lab submitting_lab age sex",
+    shell:
+        """
+        augur parse \
+            --sequences {input.sequences} \
+            --fields {params.fields} \
+            --output-metadata {output.metadata} \
+            --output-sequences {output.sequences}
+        """
+
+
+# Genes annotated per segment; only HA and NA are set up so far.
+GENES_BY_SEGMENT = {
+    "ha": ["SigPep", "HA1", "HA2"],
+    "na": ["NA"],
+}
+
+
+def genes(w):
+    """Return the gene names annotated for the segment in wildcards `w`.
+
+    Raises ValueError for a segment without an entry rather than returning
+    None, which would be rendered as `--genes None` on the command line.
+    """
+    if w.segment not in GENES_BY_SEGMENT:
+        raise ValueError(f"no genes configured for segment {w.segment!r}")
+    return GENES_BY_SEGMENT[w.segment]
+
+
+# Subsample the parsed sequences for one reference tree; the outlier list
+# declared as input `exclude` is passed on via --exclude.
+rule subsample:
+    input:
+        aligned_sequences=rules.parse.output.sequences,
+        enriched_metadata=rules.parse.output.metadata,
+        include_strains="../config/{lineage}/reference_strains.txt",
+        exclude="../config/{lineage}/outliers.txt",
+    output:
+        sampled_sequences="build/{lineage}/{segment}/{reference}/subsample.fasta",
+        sampled_strains="build/{lineage}/{segment}/{reference}/subsample.txt",
+    params:
+        filter_arguments=lambda w: ref_config(w)["filter"],
+        reference_EPI_ISL=lambda w: ref_config(w)["reference_EPI_ISL"],
+        other_include=lambda w: ref_config(w).get("include_file", ""),
+    shell:
+        """
+        augur filter \
+            --sequences {input.aligned_sequences} \
+            --metadata {input.enriched_metadata} \
+            --include {input.include_strains} {params.other_include} \
+            --exclude {input.exclude} \
+            --include-where EPI_ISL={params.reference_EPI_ISL} \
+            {params.filter_arguments} \
+            --output {output.sampled_sequences} \
+            --output-strains {output.sampled_strains}
+        """
+
+
+# Align the subsample to the reference and write per-gene translations.
+rule align:
+    input:
+        sequences="build/{lineage}/{segment}/{reference}/subsample.fasta",
+        annotation="references/{lineage}/{segment}/{reference}/annotation.gff",
+        reference="references/{lineage}/{segment}/{reference}/reference.fasta",
+    output:
+        alignment="build/{lineage}/{segment}/{reference}/align.aligned.fasta",
+        insertions="build/{lineage}/{segment}/{reference}/align.insertions.csv",
+    params:
+        # the doubled braces keep a literal "{gene}" placeholder in the path
+        outdir=lambda w: f"build/{w.lineage}/{w.segment}/{w.reference}/aligned.gene.{{gene}}.fasta",
+    threads: 3
+    shell:
+        """
+        nextclade run \
+            --jobs={threads} \
+            --input-ref {input.reference} \
+            --input-annotation {input.annotation} \
+            --output-translations {params.outdir} \
+            --output-fasta {output.alignment} \
+            --output-insertions {output.insertions} \
+            {input.sequences} \
+            2>&1
+        """
+
+
+# Build the raw phylogeny from the nucleotide alignment.
+rule tree:
+    input:
+        alignment=rules.align.output.alignment,
+    output:
+        tree="build/{lineage}/{segment}/{reference}/tree_raw.nwk",
+    params:
+        # Emit the flag only when arguments are configured; with an empty value
+        # `--tree-builder-args` would be left without its required argument.
+        args=lambda w: (
+            "--tree-builder-args '{}'".format(config["tree"]["tree-builder-args"])
+            if config.get("tree", {}).get("tree-builder-args")
+            else ""
+        ),
+    threads: 3
+    shell:
+        """
+        augur tree \
+            --alignment {input.alignment} \
+            {params.args} \
+            --output {output.tree} \
+            --nthreads {threads} \
+            > /dev/null
+        """
+
+
+# root using dates in treetime, use 1500 as sequence length (good enough, doesn't matter)
+rule root:
+    input:
+        tree=rules.tree.output.tree,
+        metadata=rules.parse.output.metadata,
+    output:
+        tree="build/{lineage}/{segment}/{reference}/tree_rooted.nwk",
+    params:
+        outdir="build/{lineage}/{segment}/{reference}/tt_out",
+    shell:
+        """
+        treetime clock \
+            --tree {input.tree} \
+            --sequence-length 1500 \
+            --dates {input.metadata} \
+            --clock-filter 4 \
+            --clock-filter-method local \
+            --outdir {params.outdir}
+        cp {params.outdir}/rerooted.newick {output.tree}
+        """
+
+
+# refine while keeping the root
+rule refine:
+    input:
+        tree=rules.root.output.tree,
+        alignment=rules.align.output.alignment,
+        enriched_metadata=rules.parse.output.metadata,
+    output:
+        tree="build/{lineage}/{segment}/{reference}/tree.nwk",
+        node_data="build/{lineage}/{segment}/{reference}/branch_lengths.json",
+    threads: 1
+    shell:
+        """
+        augur refine \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --metadata {input.enriched_metadata} \
+            --output-tree {output.tree} \
+            --output-node-data {output.node_data} \
+            --keep-root \
+            --divergence-unit mutations-per-site
+        """
+
+
+rule ancestral:
+    message:
+        """
+        Reconstructing ancestral sequences and mutations
+          - inferring ambiguous mutations
+        """
+    input:
+        tree=rules.refine.output.tree,
+        alignment=rules.align.output.alignment,
+        annotation="references/{lineage}/{segment}/{reference}/annotation.gff",
+        reference="references/{lineage}/{segment}/{reference}/reference.fasta",
+    output:
+        node_data="build/{lineage}/{segment}/{reference}/muts.json",
+    params:
+        inference="joint",
+        genes=genes,
+        # augur expects one %GENE template here; it has to match the per-gene
+        # files that rule `align` writes via --output-translations.
+        translations="build/{lineage}/{segment}/{reference}/aligned.gene.%GENE.fasta",
+    shell:
+        """
+        augur ancestral \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --inference {params.inference} \
+            --infer-ambiguous \
+            --genes {params.genes} \
+            --annotation {input.annotation} \
+            --translations {params.translations} \
+            --root-sequence {input.reference} \
+            --output-node-data {output.node_data}
+        """
+
+
+# `--mutations` reads input `nucs`, the only mutation file declared here.
+rule clades:
+    message:
+        "Adding internal clade labels"
+    input:
+        tree=rules.refine.output.tree,
+        nucs=rules.ancestral.output.node_data,
+        clades=rules.offset_clades.output,
+    output:
+        node_data="build/{lineage}/{segment}/{reference}/{clade}.json",
+    shell:
+        """
+        augur clades --tree {input.tree} \
+            --mutations {input.nucs} \
+            --clades {input.clades} \
+            --output-node-data {output.node_data} \
+            > /dev/null
+        """
+
+
+def get_node_data(w):
+    """Return the node-data JSON files rule `export` needs for wildcards `w`.
+
+    Only files produced by rules of this workflow are listed: branch lengths,
+    mutations and one clade file per entry of the segment's `clade_systems`.
+    """
+    prefix = f"build/{w.lineage}/{w.segment}/{w.reference}"
+    node_data = [f"{prefix}/branch_lengths.json", f"{prefix}/muts.json"]
+    clade_systems = config["builds"][w.lineage][w.segment]["clade_systems"]
+    node_data.extend(f"{prefix}/{clade}.json" for clade in clade_systems)
+    if "clade_contractions" in ref_config(w):
+        node_data.append(f"{prefix}/clades-short.json")
+    return node_data
+
+
+def get_auspice_config(w):
+    """Return the auspice config file to use for wildcards `w`.
+
+    Falls back to config/auspice_config.json when the workflow config has no
+    `files.auspice_config` entry.
+    """
+    files = config.get("files", {})
+    if "clade_contractions" in ref_config(w):
+        return files["auspice_config_shortclade"]
+    return files.get("auspice_config", "config/auspice_config.json")
+
+
+rule export:
+    message:
+        "Exporting data files for auspice"
+    input:
+        tree=rules.refine.output.tree,
+        metadata=rules.parse.output.metadata,
+        node_data=get_node_data,
+        auspice_config=get_auspice_config,
+    output:
+        auspice_json="auspice/{lineage}/{segment}/{reference}/auspice_raw.json",
+    params:
+        fields="region strainName country date EPI_ISL",
+        # timezone-aware replacement for the deprecated datetime.utcnow()
+        date=datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d"),
+    shell:
+        """
+        AUGUR_RECURSION_LIMIT=10000 \
+        augur export v2 \
+            --tree {input.tree} \
+            --metadata {input.metadata} \
+            --node-data {input.node_data} \
+            --auspice-config {input.auspice_config} \
+            --color-by-metadata {params.fields} \
+            --title "Nextclade reference tree for Influenza type:{wildcards.lineage} segment:{wildcards.segment} with root {wildcards.reference} built on {params.date}" \
+            --output {output.auspice_json} 2>&1;
+        """
+
+
+# Draw a small set of recent sequences to ship as dataset examples.
+rule generate_sample_sequences:
+    input:
+        sequences="data/{lineage}_{segment}_sequences.fasta",
+        metadata=rules.parse.output.metadata,
+    output:
+        sequences="build/{lineage}/{segment}/{reference}/sample_sequences.fasta",
+    shell:
+        """
+        augur filter \
+            --sequences {input.sequences} \
+            --metadata {input.metadata} \
+            --min-date 2020 --group-by year --subsample-max-sequences 50 \
+            --exclude-ambiguous-dates-by year \
+            --exclude-where 'country!=USA' 'submitting_lab!=centers_for_disease_control_and_prevention' \
+            --probabilistic-sampling \
+            --output {output.sequences}
+        """
+
+
+# Assemble the files of one nextclade dataset. Each command sits on its own
+# line: with trailing backslashes they would all collapse into a single `cp`.
+# NOTE(review): input `auspice_json` expects auspice.json, while rule `export`
+# writes auspice_raw.json - confirm which rule is meant to produce it.
+rule make_dataset:
+    input:
+        sequences="build/{lineage}/{segment}/{reference}/sample_sequences.fasta",
+        auspice_json="auspice/{lineage}/{segment}/{reference}/auspice.json",
+        annotation="references/{lineage}/{segment}/{reference}/annotation.gff",
+        reference="references/{lineage}/{segment}/{reference}/reference.fasta",
+        pathogen_json="references/{lineage}/{segment}/{reference}/pathogen.json",
+        additional_config="references/{lineage}/{segment}/{reference}/virus_properties.json",
+    output:
+        sequences="datasets/{lineage}/{segment}/{reference}/example_sequences.fasta",
+        tree="datasets/{lineage}/{segment}/{reference}/tree.json",
+        annotation="datasets/{lineage}/{segment}/{reference}/annotation.gff",
+        reference="datasets/{lineage}/{segment}/{reference}/reference.fasta",
+        pathogen_json="datasets/{lineage}/{segment}/{reference}/pathogen.json",
+    shell:
+        """
+        cp {input.sequences} {output.sequences}
+        cp {input.auspice_json} {output.tree}
+        cp {input.reference} {output.reference}
+        cp {input.annotation} {output.annotation}
+        jq -s '.[0] * .[1]' {input.pathogen_json} {input.additional_config} > {output.pathogen_json}
+        """
+
+
+rule clean:
+    shell:
+        """
+        rm -rf output test data/clades* data/include* auspice/*
+        """
+
+
+rule clean_all:
+    shell:
+        """
+        rm -rf output test auspice build data
+        """
diff --git a/nextclade/config/auspice_config.json b/nextclade/config/auspice_config.json
new file mode 100644
index
00000000..418a8348 --- /dev/null +++ b/nextclade/config/auspice_config.json @@ -0,0 +1,50 @@ +{ + "title": "Genomic epidemiology of Influenza", + "build_url": "https://github.com/neherlab/nextclade_data_workflows", + "maintainers": [ + { "name": "Cornelius Roemer", "url": "https://neherlab.org" }, + { "name": "Richard Neher", "url": "https://neherlab.org" } + ], + "extensions": { + "nextclade": { + } + }, + "data_provenance": [ + { + "name": "GISAID" + } + ], + "colorings": [ + { + "key": "country", + "title": "Country", + "type": "categorical" + }, + { + "key": "region", + "title": "Region", + "type": "categorical" + }, + { + "key": "date", + "title": "Sample Date", + "type": "ordinal" + }, + { + "key": "EPI_ISL", + "title": "EPI_ISL", + "type": "categorical" + } + ], + "filters": [ + "region", + "country", + "clade_membership" + ], + "display_defaults": { + "color_by": "clade_membership", + "distance_measure": "div", + "branch_label": "clade" + }, + "panels": ["tree","entropy"] +} diff --git a/nextclade/config/config_dict.yaml b/nextclade/config/config_dict.yaml new file mode 100644 index 00000000..239f1f9e --- /dev/null +++ b/nextclade/config/config_dict.yaml @@ -0,0 +1,81 @@ +clade_repo: "https://raw.githubusercontent.com/influenza-clade-nomenclature/" + +builds: + h1n1pdm: + ha: + clade_systems: + "clade": "seasonal_A-H1N1pdm_HA/main/.auto-generated/clades-long.tsv" + "subclade": "seasonal_A-H1N1pdm_HA/main/.auto-generated/subclades.tsv" + "short-clade": "seasonal_A-H1N1pdm_HA/main/.auto-generated/clades.tsv" + refs: + CY121680: + filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500" + clade_offset: 0 + reference_EPI_ISL: EPI1583287 + reference_strain: A/California/7/2009-egg #TODO: exclude + MW626062: + filter: "--min-date 2019 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500" + clade_offset: 0 + clade_url: 
"https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h1n1pdm/ha/clades-long.tsv" + reference_EPI_ISL: EPI1812046 + reference_strain: A/Wisconsin/588/2019 + na: + clade_systems: + "clade": "seasonal_A-H1N1pdm_NA/main/.auto-generated/subclades.tsv" + refs: + MW626056: + filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 1500" + clade_offset: 0 + reference_EPI_ISL: EPI1812046 + reference_strain: A/Wisconsin/588/2019 + h3n2: + ha: + clade_systems: + "clade": "seasonal_A-H3N2_HA/main/.auto-generated/clades-long.tsv" + "subclade": "seasonal_A-H3N2_HA/main/.auto-generated/subclades.tsv" + "short-clade": "seasonal_A-H3N2_HA/main/.auto-generated/clades.tsv" + refs: + EPI1857216: + filter: "--min-date 2019 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500" + clade_offset: -17 + clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv" + include_file: references/h3n2/includes.txt + reference_EPI_ISL: EPI1857216 + reference_strain: A/Darwin/6/2021 + CY163680: + filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500" + clade_offset: 0 + clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv" + include_file: references/h3n2/includes.txt + reference_EPI_ISL: EPI545340 + reference_strain: A/Wisconsin/67/2005-egg + na: + clade_systems: + "clade": "seasonal_A-H3N2_NA/main/.auto-generated/subclades.tsv" + refs: + EPI1857215: + filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 1500" + clade_offset: 4 + reference_EPI_ISL: EPI1857215 + reference_strain: A/Darwin/6/2021 + vic: + ha: + clade_systems: + "clade": "seasonal_B-Vic_HA/main/.auto-generated/clades.tsv" + "subclade": "seasonal_B-Vic_HA/main/.auto-generated/subclades.tsv" + refs: 
+ KX058884: + filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500" + clade_offset: 0 + clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/vic/ha/clades.tsv" + reference_EPI_ISL: EPI696970 + reference_strain: B/Brisbane/60/2008-egg + na: + clade_systems: + "clade": "seasonal_B-Vic_NA/main/.auto-generated/subclades.tsv" + refs: + CY073894: + filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 1500" + clade_offset: -30 + reference_EPI_ISL: CY073894 + reference_strain: B/Brisbane/60/2008 diff --git a/nextclade/config/pathogen.json b/nextclade/config/pathogen.json new file mode 100644 index 00000000..d536f97f --- /dev/null +++ b/nextclade/config/pathogen.json @@ -0,0 +1,41 @@ +{ + "alignmentParams": { + "excessBandwidth": 9, + "terminalBandwidth": 100, + "allowedMismatches": 4, + "gapAlignmentSide": "right", + "minSeedCover": 0.1 + }, + "qc": { + "privateMutations": { + "enabled": true, + "typical": 5, + "cutoff": 15, + "weightLabeledSubstitutions": 2, + "weightReversionSubstitutions": 1, + "weightUnlabeledSubstitutions": 1 + }, + "missingData": { + "enabled": false, + "missingDataThreshold": 100, + "scoreBias": 10 + }, + "snpClusters": { + "enabled": false, + "windowSize": 100, + "clusterCutOff": 5, + "scoreWeight": 50 + }, + "mixedSites": { + "enabled": true, + "mixedSitesThreshold": 4 + }, + "frameShifts": { + "enabled": true + }, + "stopCodons": { + "enabled": true, + "ignoredStopCodons": [] + } + } +} diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/genemap.gff b/nextclade/dataset_config/h1n1pdm/ha/CY121680/genemap.gff new file mode 100644 index 00000000..a3952a1f --- /dev/null +++ b/nextclade/dataset_config/h1n1pdm/ha/CY121680/genemap.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region CY121680.1 1 1752 +CY121680.1 feature gene 21 71 . + . gene_name="SigPep" +CY121680.1 feature gene 72 1052 . 
+ . gene_name="HA1" +CY121680.1 feature gene 1053 1718 . + . gene_name="HA2" diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/reference.fasta b/nextclade/dataset_config/h1n1pdm/ha/CY121680/reference.fasta new file mode 100644 index 00000000..a3b664be --- /dev/null +++ b/nextclade/dataset_config/h1n1pdm/ha/CY121680/reference.fasta @@ -0,0 +1,2 @@ +>CY121680.1 Influenza A virus (A/California/07/2009(H1N1)) hemagglutinin (HA) gene, complete cds +GGAAAACAAAAGCAACAAAAATGAAGGCAATACTAGTAGTTCTGCTATATACATTTGCAACCGCAAATGCAGACACATTATGTATAGGTTATCATGCGAACAATTCAACAGACACTGTAGACACAGTACTAGAAAAGAATGTAACAGTAACACACTCTGTTAACCTTCTAGAAGACAAGCATAACGGGAAACTATGCAAACTAAGAGGGGTAGCCCCATTGCATTTGGGTAAATGTAACATTGCTGGCTGGATCCTGGGAAATCCAGAGTGTGAATCACTCTCCACAGCAAGCTCATGGTCCTACATTGTGGAAACACCTAGTTCAGACAATGGAACGTGTTACCCAGGAGATTTCATCGATTATGAGGAGCTAAGAGAGCAATTGAGCTCAGTGTCATCATTTGAAAGGTTTGAGATATTCCCCAAGACAAGTTCATGGCCCAATCATGACTCGAACAAAGGTGTAACGGCAGCATGTCCTCATGCTGGAGCAAAAAGCTTCTACAAAAATTTAATATGGCTAGTTAAAAAAGGAAATTCATACCCAAAGCTCAGCAAATCCTACATTAATGATAAAGGGAAAGAAGTCCTCGTGCTATGGGGCATTCACCATCCATCTACTAGTGCTGACCAACAAAGTCTCTATCAGAATGCAGATGCATATGTTTTTGTGGGGTCATCAAGATACAGCAAGAAGTTCAAGCCGGAAATAGCAATAAGACCCAAAGTGAGGGATCGAGAAGGGAGAATGAACTATTACTGGACACTAGTAGAGCCGGGAGACAAAATAACATTCGAAGCAACTGGAAATCTAGTGGTACCGAGATATGCATTCGCAATGGAAAGAAATGCTGGATCTGGTATTATCATTTCAGATACACCAGTCCACGATTGCAATACAACTTGTCAAACACCCAAGGGTGCTATAAACACCAGCCTCCCATTTCAGAATATACATCCGATCACAATTGGAAAATGTCCAAAATATGTAAAAAGCACAAAATTGAGACTGGCCACAGGATTGAGGAATATCCCGTCTATTCAATCTAGAGGCCTATTTGGGGCCATTGCCGGTTTCATTGAAGGGGGGTGGACAGGGATGGTAGATGGATGGTACGGTTATCACCATCAAAATGAGCAGGGGTCAGGATATGCAGCCGACCTGAAGAGCACACAGAATGCCATTGACGAGATTACTAACAAAGTAAATTCTGTTATTGAAAAGATGAATACACAGTTCACAGCAGTAGGTAAAGAGTTCAACCACCTGGAAAAAAGAATAGAGAATTTAAATAAAAAAGTTGATGATGGTTTCCTGGACATTTGGACTTACAATGCCGAACTGTTGGTTCTATTGGAAAATGAAAGAACTTTGGACTACCACGATTCAAATGTGAAGAACTTATATGAAAAGGTAAGAAGCCAGCTAAAAAACAATGCCAAGGAAATTGGAAACGGCTGCTTTGAATTTTACCACAAATGCGATAACACGTGCATGGAAAGTGTCAAAAATGGGACTTATGACTACCCAAAATACTCAGAG
GAAGCAAAATTAAACAGAGAAGAAATAGATGGGGTAAAGCTGGAATCAACAAGGATTTACCAGATTTTGGCGATCTATTCAACTGTCGCCAGTTCATTGGTACTGGTAGTCTCCCTGGGGGCAATCAGTTTCTGGATGTGCTCTAATGGGTCTCTACAGTGTAGAATATGTATTTAACATTAGGATTTCAGAAGCATGAGAAAAACAC diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/virus_properties.json b/nextclade/dataset_config/h1n1pdm/ha/CY121680/virus_properties.json new file mode 100644 index 00000000..a9af3dfd --- /dev/null +++ b/nextclade/dataset_config/h1n1pdm/ha/CY121680/virus_properties.json @@ -0,0 +1,26 @@ +{ + "schemaVersion": "1.10.0", + "nucMutLabelMap": {}, + "nucMutLabelMapReverse": {}, + "aaMotifs": [ + { + "name": "glycosylation", + "nameShort": "Glyc.", + "nameFriendly": "Glycosylation", + "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)", + "includeGenes": [ + { + "gene":"HA1", + "ranges":[] + }, + { + "gene":"HA2", + "ranges":[{"begin":0, "end":186}] + } + ], + "motifs": [ + "N[^P][ST]" + ] + } + ] +} \ No newline at end of file diff --git a/nextclade/dataset_config/h1n1pdm/ha/MW626062/genemap.gff b/nextclade/dataset_config/h1n1pdm/ha/MW626062/genemap.gff new file mode 100644 index 00000000..825fe437 --- /dev/null +++ b/nextclade/dataset_config/h1n1pdm/ha/MW626062/genemap.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region MW626062.1 1 1752 +MW626062.1 feature gene 21 71 . + . gene_name="SigPep" +MW626062.1 feature gene 72 1052 . + . gene_name="HA1" +MW626062.1 feature gene 1053 1718 . + . 
gene_name="HA2" diff --git a/nextclade/dataset_config/h1n1pdm/ha/MW626062/reference.fasta b/nextclade/dataset_config/h1n1pdm/ha/MW626062/reference.fasta new file mode 100644 index 00000000..546126e1 --- /dev/null +++ b/nextclade/dataset_config/h1n1pdm/ha/MW626062/reference.fasta @@ -0,0 +1,27 @@ +>MW626062.1 Influenza A virus (A/Wisconsin/588/2019(H1N1)) segment 4 hemagglutinin (HA) gene, complete cds +GGAAAACAAAAGCAACAAAAATGAAGGCAATACTAGTAGTTATGCTGTATACATTTACAACCGCAAATGC +AGACACATTATGTATAGGTTATCATGCGAACAATTCAACAGACACTGTGGACACAGTACTAGAAAAGAAT +GTAACAGTAACACACTCTGTCAATCTTCTGGAAGACAAGCATAACGGAAAACTATGCAAACTAAGAGGGG +TAGCCCCATTGCATTTGGGTAAATGTAACATTGCTGGCTGGATCCTGGGAAATCCAGAGTGTGAATCACT +CTCCACAGCAAGATCATGGTCCTACATTGTGGAAACATCTAATTCAGACAATGGAACGTGTTACCCAGGA +GATTTCATCAATTATGAGGAGCTAAGAGAGCAATTGAGCTCAGTGTCATCATTTGAAAGGTTTGAAATAT +TCCCCAAGACAAGTTCATGGCCTAATCATGACTCGGACAATGGTGTAACGGCAGCATGTCCTCACGCTGG +AGCAAAAAGCTTCTACAAAAACTTGATATGGCTGGTTAAAAAAGGAAAATCATACCCAAAGATCAACCAA +ACCTACATTAATGATAAAGGGAAAGAAGTCCTCGTGCTGTGGGGCATTCACCATCCACCTACTATTGCTG +ACCAACAAAGTCTCTATCAGAATGCAGATGCATATGTTTTTGTGGGGACATCAAGATACAGCAAGAAGTT +CAAGCCGGAAATAGCAACAAGACCCAAAGTGAGGGATCAAGAAGGGAGAATGAACTATTACTGGACACTA +GTAGAACCGGGAGACAAAATAACATTCGAAGCAACTGGTAATCTAGTGGCACCGAGATATGCATTCACAA +TGGAAAGAGATGCTGGATCTGGTATTATCATTTCAGATACACCAGTCCACGATTGCAATACAACTTGTCA +GACACCCGAGGGTGCTATAAACACCAGCCTCCCATTTCAGAATGTACATCCGATCACAATTGGGAAATGT +CCAAAGTATGTAAAAAGCACAAAATTGAGACTGGCCACAGGATTGAGGAATGTCCCGTCTATTCAATCTA +GAGGCCTATTCGGGGCCATTGCTGGCTTCATCGAAGGGGGGTGGACAGGGATGGTAGATGGATGGTACGG +TTATCACCATCAAAATGAGCAGGGGTCAGGATATGCAGCCGATCTGAAGAGCACACAAAATGCCATTGAT +AAGATTACTAACAAAGTAAATTCTGTTATTGAAAAGATGAATACACAGTTCACAGCAGTTGGTAAAGAGT +TCAACCACCTTGAAAAAAGAATAGAGAATCTAAATAAAAAGGTTGATGATGGTTTCCTGGACATTTGGAC +TTACAATGCCGAACTGTTGGTTCTACTGGAAAACGAAAGAACTTTGGACTATCACGATTCAAATGTGAAG +AACTTGTATGAAAAAGTAAGAAACCAGTTAAAAAACAATGCCAAGGAAATTGGAAACGGCTGCTTTGAAT +TTTACCACAAATGCGACAACACATGCATGGAAAGTGTCAAGAATGGGACTTATGACTACCCAAAATACTC 
+AGAGGAAGCAAAATTAAACAGAGAAAAAATAGATGGAGTAAAGCTGGACTCAACAAGGATCTACCAGATT +TTGGCGATCTATTCAACTGTTGCCAGTTCATTGGTACTGGTAGTCTCCCTGGGGGCAATCAGCTTCTGGA +TGTGCTCTAATGGGTCTCTACAGTGTAGAATATGTATTTAACATTAGGATTTCAGAATCATGAGAAAAAC +AC diff --git a/nextclade/dataset_config/h1n1pdm/ha/MW626062/virus_properties.json b/nextclade/dataset_config/h1n1pdm/ha/MW626062/virus_properties.json new file mode 100644 index 00000000..a9af3dfd --- /dev/null +++ b/nextclade/dataset_config/h1n1pdm/ha/MW626062/virus_properties.json @@ -0,0 +1,26 @@ +{ + "schemaVersion": "1.10.0", + "nucMutLabelMap": {}, + "nucMutLabelMapReverse": {}, + "aaMotifs": [ + { + "name": "glycosylation", + "nameShort": "Glyc.", + "nameFriendly": "Glycosylation", + "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)", + "includeGenes": [ + { + "gene":"HA1", + "ranges":[] + }, + { + "gene":"HA2", + "ranges":[{"begin":0, "end":186}] + } + ], + "motifs": [ + "N[^P][ST]" + ] + } + ] +} \ No newline at end of file diff --git a/nextclade/dataset_config/h1n1pdm/includes.txt b/nextclade/dataset_config/h1n1pdm/includes.txt new file mode 100644 index 00000000..e69de29b diff --git a/nextclade/dataset_config/h1n1pdm/na/MW626056/genemap.gff b/nextclade/dataset_config/h1n1pdm/na/MW626056/genemap.gff new file mode 100644 index 00000000..f026596c --- /dev/null +++ b/nextclade/dataset_config/h1n1pdm/na/MW626056/genemap.gff @@ -0,0 +1,4 @@ +##gff-version 3 +##sequence-region MW626056.1 1 1433 +MW626056.1 annotation remark 1 1433 . . . accessions=MW626056;data_file_division=VRL;date=23-FEB-2021;keywords=;molecule_type=cRNA;organism=Influenza A virus;references=location: %5B0:1433%5D%0Aauthors: Jernigan%2CD.%2C Wentworth%2CD.%2C Barnes%2CJ.%2C Garten%2CR. and Xu%2CX.%0Atitle: Influenza Sequencing Activity group%0Ajournal: Unpublished%0Amedline id: %0Apubmed id: %0Acomment:,location: %5B0:1433%5D%0Aauthors: Jernigan%2CD.%2C Wentworth%2CD.%2C Barnes%2CJ.%2C Garten%2CR. 
and Xu%2CX.%0Atitle: Direct Submission%0Ajournal: Submitted %2817-FEB-2021%29 WHO Collaborating Center for Surveillance%2C Epidemiology and Control of Influenza%2C Influenza Division%2C Centers for Disease Control and Prevention%2C 1600 Clifton Road%2C N.E.%2C Atlanta%2C GA 30333%2C USA%0Amedline id: %0Apubmed id: %0Acomment:;sequence_version=1;source=Influenza A virus;structured_comment=OrderedDict%28%5B%28%27FluData%27%2C OrderedDict%28%5B%28%27EPI_ISOLATE_ID%27%2C %27EPI_ISL_628888%27%29%2C %28%27NAME%27%2C %27A/Wisconsin/588/2019%27%29%2C %28%27TYPE%27%2C %27H1N1%27%29%2C %28%27Segment_name%27%2C %27NA%27%29%2C %28%27HOST_AGE%27%2C %2766%27%29%2C %28%27HOST_GENDER%27%2C %27M%27%29%2C %28%27PASSAGE%27%2C %27C2S1 %282020-05-15%29%27%29%2C %28%27LOCATION%27%2C %27United States / Wisconsin%27%29%2C %28%27COLLECT_DATE%27%2C %2719-Dec-2019%27%29%2C %28%27Lineage%27%2C %27A%28H1N1%29pdm09%27%29%2C %28%27SPECIMEN_ID%27%2C %2719VR015562 ORIGINAL%27%29%2C %28%27SENDER_LAB%27%2C %27Wisconsin State Laboratory of Hygiene +MW626056.1 feature gene 9 1418 . + . 
codon_start=1;gene=NA;gene_name=NA;product=neuraminidase;protein_id=QRV63257.1; diff --git a/nextclade/dataset_config/h1n1pdm/na/MW626056/reference.fasta b/nextclade/dataset_config/h1n1pdm/na/MW626056/reference.fasta new file mode 100644 index 00000000..02ca3b74 --- /dev/null +++ b/nextclade/dataset_config/h1n1pdm/na/MW626056/reference.fasta @@ -0,0 +1,25 @@ +>MW626056.1 Influenza A virus (A/Wisconsin/588/2019(H1N1)) segment 6 neuraminidase (NA) gene, complete cds +AGTTTAAAATGAATCCAAACCAAAAGATAATAACCATTGGTTCTATCTGTATGACAATTG +GAACGGCTAACTTAATATTACAAATTGGAAACATAATCTCAATATGGGTTAGCCACTCAA +TTCAAATTGGAAATCAAAGCCAGATTGAAACATGCAATAAAAGCGTCATTACTTATGAAA +ACAACACTTGGGTAAATCAGACATTTGTTAACATCAGCAACACTAACTCTGCTGCTAGAC +AGTCAGTGGCTTCCGTGAAATTAGCGGGCAATTCCTCTCTCTGCCCTGTTAGTGGATGGG +CTATATACAGTAAAGACAACAGTGTAAGAATCGGTTCCAAGGGGGATGTGTTTGTCATAA +GGGAACCATTCATATCATGCTCTCCCTTGGAATGCAGAACCTTCTTCTTGACTCAAGGGG +CTTTGCTAAATGACAAACATTCCAATGGAACCATTAAAGACAGAAGCCCATATCGAACCC +TAATGAGCTGTCCTATTGGTGAAGTTCCCTCTCCATACAACTCAAGATTTGAGTCAGTCG +CTTGGTCAGCAAGTGCTTGTCATGATGGCACCAATTGGCTAACAATTGGAATTTCTGGCC +CAGACAGTGGGGCAGTGGCTGTGTTAAAATACAATGGCATAATAACAGACACTATCAAGA +GTTGGAGGAACAAGATATTGAGAACACAAGAGTCTGAATGTGCATGTGTAAATGGTTCTT +GCTTTACCATAATGACCGATGGACCAAGTGATGGACAGGCCTCATACAAAATCTTCAGAA +TAGAAAAGGGAAAGATAATCAAATCAGTCGAAATGAAAGCCCCTAATTATCACTATGAAG +AATGCTCCTGTTACCCTGATTCTAGTGAAATCACATGTGTGTGCAGGGATAACTGGCATG +GCTCGAATCGACCGTGGGTGTCTTTCAACCAGAATCTGGAATATCAGATGGGATACATAT +GCAGTGGGGTTTTCGGAGACAATCCACGCCCTAATGATAAGACAGGCAGTTGTGGTCCAG +TATCGTCTAATGGAGCAAATGGGGTAAAAGGATTTTCATTCAAATACGGCAATGGTGTTT +GGATAGGGAGAACTAAGAGCATTAGTTCAAGAAAAGGTTTTGAGATGATTTGGGATCCGA +ATGGATGGACTGGGACTGACAATAAATTCTCAAAAAAGCAAGATATCGTAGGAATAAATG +AGTGGTCAGGGTATAGCGGGAGTTTTGTTCAGCATCCAGAACTAACAGGGCTGAATTGTA +TAAGACCTTGCTTCTGGGTTGAACTAATAAGAGGACGACCCGAAGAGAACACAATCTGGA +CTAGCGGGAGCAGCATATCCTTTTGTGGTGTAGACAGTGACATTGTGGGTTGGTCTTGGC +CAGACGGTGCTGAGTTGCCATTTACCATTGACAAGTAATTTGTTCAAAAAACT diff --git 
a/nextclade/dataset_config/h1n1pdm/na/MW626056/virus_properties.json b/nextclade/dataset_config/h1n1pdm/na/MW626056/virus_properties.json new file mode 100644 index 00000000..27ec895a --- /dev/null +++ b/nextclade/dataset_config/h1n1pdm/na/MW626056/virus_properties.json @@ -0,0 +1,22 @@ +{ + "schemaVersion": "1.10.0", + "nucMutLabelMap": {}, + "nucMutLabelMapReverse": {}, + "aaMotifs": [ + { + "name": "glycosylation", + "nameShort": "Glyc.", + "nameFriendly": "Glycosylation", + "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)", + "includeGenes": [ + { + "gene":"NA", + "ranges":[{"begin":33, "end":470}] + } + ], + "motifs": [ + "N[^P][ST]" + ] + } + ] +} diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/genemap.gff b/nextclade/dataset_config/h3n2/ha/CY163680/genemap.gff new file mode 100644 index 00000000..cbb8d4e5 --- /dev/null +++ b/nextclade/dataset_config/h3n2/ha/CY163680/genemap.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region CY163680.1 1 1737 +CY163680.1 feature CDS 18 65 . + . name="SigPep" +CY163680.1 feature CDS 66 1052 . + . name="HA1" +CY163680.1 feature CDS 1053 1715 . + . 
name="HA2" diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/reference.fasta b/nextclade/dataset_config/h3n2/ha/CY163680/reference.fasta new file mode 100644 index 00000000..a4df05fb --- /dev/null +++ b/nextclade/dataset_config/h3n2/ha/CY163680/reference.fasta @@ -0,0 +1,26 @@ +>CY163680.1 Influenza A virus (A/Wisconsin/67/2005(H3N2)) hemagglutinin (HA) gene, complete cds +GGATAATTCTATTAACCATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAA +ACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATA +GTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAG +GTGGAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGG +AGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGC +AACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGG +AGTTTAACGATGAAAGCTTCAATTGGACTGGAGTCACTCAAAATGGAACAAGCTCTTCTTGCAAAAGGAG +ATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAAC +GTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACA +ATGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAAC +TGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACA +ATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCA +AAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCAT +CACTCCAAATGGAAGCATTCCCAATGACAAACCATTTCAAAATGTAAACAGGATCACATATGGGGCCTGT +CCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTA +GAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGG +TTTCAGGCATCAAAATTCTGAGGGAATAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCAAT +CAAATCAATGGGAAGCTGAATAGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAAT +TCTCAGAAGTAGAAGGGAGAATTCAGGACCTCGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTC +ATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAAC +AAACTGTTTGAAAGAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAA +TATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCATGATGTATACAG 
+AGATGAAGCATTAAACAACCGGTTCCAGATCAAAGGCGTTGAGCTGAAGTCAGGATACAAAGATTGGATC +CTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCT +GCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGAGTGCATTAATTAAAAACAC diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/virus_properties.json b/nextclade/dataset_config/h3n2/ha/CY163680/virus_properties.json new file mode 100644 index 00000000..a9af3dfd --- /dev/null +++ b/nextclade/dataset_config/h3n2/ha/CY163680/virus_properties.json @@ -0,0 +1,26 @@ +{ + "schemaVersion": "1.10.0", + "nucMutLabelMap": {}, + "nucMutLabelMapReverse": {}, + "aaMotifs": [ + { + "name": "glycosylation", + "nameShort": "Glyc.", + "nameFriendly": "Glycosylation", + "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)", + "includeGenes": [ + { + "gene":"HA1", + "ranges":[] + }, + { + "gene":"HA2", + "ranges":[{"begin":0, "end":186}] + } + ], + "motifs": [ + "N[^P][ST]" + ] + } + ] +} \ No newline at end of file diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/genemap.gff b/nextclade/dataset_config/h3n2/ha/EPI1857216/genemap.gff new file mode 100644 index 00000000..f33ff5bc --- /dev/null +++ b/nextclade/dataset_config/h3n2/ha/EPI1857216/genemap.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region EPI1857216 1 1718 +EPI1857216 feature gene 1 48 . + . gene_name="SigPep" +EPI1857216 feature gene 49 1035 . + . gene_name="HA1" +EPI1857216 feature gene 1036 1698 . + . 
gene_name="HA2" diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/reference.fasta b/nextclade/dataset_config/h3n2/ha/EPI1857216/reference.fasta new file mode 100644 index 00000000..ee3b943f --- /dev/null +++ b/nextclade/dataset_config/h3n2/ha/EPI1857216/reference.fasta @@ -0,0 +1,23 @@ +>EPI_ISL_1563628 | A/Darwin/6/2021 | A / H3N2 | | 2021-03-16 +atgaagactatcattgctttgagcaacattctatgtcttgttttcgctcaaaaaatacctggaaatgacaatagcacggc +aacgctgtgccttgggcaccatgcagtaccaaacggaacgatagtgaaaacaatcacaaatgaccgaattgaagttacta +atgctactgagttggttcagaattcatcaataggtgaaatatgcggcagtcctcatcagatccttgatggagggaactgc +acactaatagatgctctattgggggaccctcagtgtgacggctttcaaaataaggaatgggacctttttgttgaaagaag +cagagccaacagcaactgttacccttatgatgtgccggattatgcctcccttaggtcactagttgcctcatccggcacac +tggagtttaaaaatgaaagcttcaattggactggagtcaaacaaaacggaacaagttctgcgtgcataaggggatctagt +agtagtttttttagtagattaaattggttgaccagcttaaacaacatatatccagcacagaacgtgactatgccaaacaa +ggaacaatttgacaaattgtacatttggggggttcaccacccggatacggacaagaaccaaatctccctgtttgctcaat +catcaggaagaatcacagtatctaccaaaagaagccaacaagctgtaatcccaaatatcggatctagacccagaataagg +gatatccctagcagaataagcatctattggacaatagtaaaaccgggagacatacttttgattaacagcacagggaatct +aattgctcctaggggttacttcaaaatacgaagtgggaaaagctcaataatgagatcagatgcacccattggcaaatgta +agtctgaatgcatcactccaaatggaagcattcccaatgacaaaccgttccaaaatgtaaacaggatcacatacggggcc +tgtcccagatatgttaagcaaagcaccctgaaattggcaacaggaatgcgaaatgtaccagagaaacaaaccagaggcat +atttggcgcaatagcgggtttcatagaaaatggatgggagggaatggtggatggttggtacggtttcaggcatcaaaatt +ctgagggaagaggacaagcagcagatctcaaaagcactcaagcagcaatcgatcaaatcaatgggaagctgaatcgattg +atcggaaaaaccaacgagaaattccatcagattgaaaaagaattctcagaagtagaaggaagagttcaagaccttgagaa +atatgttgaggacactaaaatagatctctggtcatacaacgcggagcttcttgttgccctggagaaccaacatacgattg +acctaactgactcagaaatgaacaaactgtttgaaaaaacaaagaagcaactgagggaaaatgctgaggatatgggaaat +ggttgtttcaaaatataccacaaatgtgacaatgcctgcataggatcaataagaaatgaaacttatgaccacaatgtgta +cagggatgaagcattaaacaaccggttccagatcaagggagttgagctgaagtcagggtacaaagattggatcctatgga 
+tttcctttgccatgtcatgttttttgctttgtattgctttgttggggttcatcatgtgggcctgccaaaagggcaacatt +agatgcaacatttgcatttgagtgcattaattaaaaac diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/virus_properties.json b/nextclade/dataset_config/h3n2/ha/EPI1857216/virus_properties.json new file mode 100644 index 00000000..6b5cd7dd --- /dev/null +++ b/nextclade/dataset_config/h3n2/ha/EPI1857216/virus_properties.json @@ -0,0 +1,56 @@ +{ + "schemaVersion": "1.10.0", + "nucMutLabelMap": {}, + "nucMutLabelMapReverse": {}, + "phenotypeData":[ + { + "name": "RBD", + "nameFriendly": "RBD mutations", + "description": "This column displays the number of differences between the sequence and the reference at positions identified by Koel et al. (145, 155, 156, 158, 159, 189, and 193 in HA1)", + "gene": "HA1", + "aaRange": { + "begin": 100, + "end": 200 + }, + "ignore": { + "clades": ["outgroup"] + }, + "data": [ + { + "name": "differences", + "weight": 1, + "locations": { + "145": {"default":1}, + "155": {"default":1}, + "156": {"default":1}, + "158": {"default":1}, + "159": {"default":1}, + "189": {"default":1}, + "193": {"default":1} + } + } + ] + } + ], + "aaMotifs": [ + { + "name": "glycosylation", + "nameShort": "Glyc.", + "nameFriendly": "Glycosylation", + "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)", + "includeGenes": [ + { + "gene":"HA1", + "ranges":[] + }, + { + "gene":"HA2", + "ranges":[{"begin":0, "end":186}] + } + ], + "motifs": [ + "N[^P][ST]" + ] + } + ] +} diff --git a/nextclade/dataset_config/h3n2/includes.txt b/nextclade/dataset_config/h3n2/includes.txt new file mode 100644 index 00000000..07ada14b --- /dev/null +++ b/nextclade/dataset_config/h3n2/includes.txt @@ -0,0 +1,7 @@ +A/India/Pun-NIV293349/2021 +A/AbuDhabi/2375/2021 +A/Kenya/101/2021 +A/Victoria/361/2011 + +A/Singapore/INFTT0001/2021 +A/Nepal/21FL2632/2021 \ No newline at end of file diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/genemap.gff 
b/nextclade/dataset_config/h3n2/na/EPI1857215/genemap.gff new file mode 100644 index 00000000..cdeefd62 --- /dev/null +++ b/nextclade/dataset_config/h3n2/na/EPI1857215/genemap.gff @@ -0,0 +1,4 @@ +##gff-version 3 +##sequence-region EPI1857215 1 1433 +EPI1857215 annotation remark 1 1439 . . . accessions=EPI1857215; +EPI1857215 feature gene 8 1417 . + . codon_start=1;gene=NA;gene_name=NA;product=neuraminidase; diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/reference.fasta b/nextclade/dataset_config/h3n2/na/EPI1857215/reference.fasta new file mode 100644 index 00000000..a656dd57 --- /dev/null +++ b/nextclade/dataset_config/h3n2/na/EPI1857215/reference.fasta @@ -0,0 +1,2 @@ +>EPI1857215 +AGTAAAGATGAATCCAAATCAAAAGATAATAACGATTGGCTCTGTTTCTCTCACAATTTCCACAATATGCTTCTTCATGCAAATTGCCATCCTGATAACTACTGTAACATTGCATTTCAAGCAATATGAATTCAACTCCCCCCCAAATAACCAAGTGATGCTGTGTGAACCAACAATAATAGAAAGAAACATAACAGAGATAGTGTATTTGACCAACACCACCATAGAGAAGGAAATATGCCCCAAACCAGCAGAATACAGAAATTGGTCAAAACCGCAATGTGGCATTACAGGATTTGCACCTTTCTCTAAGGACAATTCGATTAGGCTTTCCGCTGGTGGGGACATCTGGGTGACAAGAGAACCTTATGTGTCATGCGATCTTGACAAGTGTTATCAATTTGCCCTTGGACAGGGAACAACACTAAACAATGTGCATTCAAATAACACAGTACGTGATAGAACCCCTTATCGGACTCTATTGATGAATGAGTTGGGTGTTCCTTTCCATCTGGGGACCAAGCAAGTGTGCATAGCATGGTCCAGCTCAAGTTGTCACGATGGAAAAGCATGGCTGCATGTTTGTATAACGGGGGATGATAAAAATGCAACTGCTAGCTTCATTTACAATGGGAGGCTTGTAGATAGTGTTGTTTCATGGTCCAACGATATTCTCAGAACCCAGGAGTCAGAATGCGTTTGTATCAATGGAACTTGTACAGTAGTAATGACTGATGGAAATGCTACAGGAAAAGCTGATACTAAAATACTATTCATTGAGGAGGGGAAAATCGTTCATACTAGCAAATTGTCAGGAAGTGCTCAGCATGTCGAAGAGTGCTCTTGCTATCCTCGATATCCTGGTGTCAGATGTGTCTGCAGAGACAACTGGAAAGGATCCAACCGGCCCATCATAGATATAAACATAAAGGATCATAGCATTGTTTCCAGGTATGTGTGTTCTGGACTTGTTGGAGACACACCCAGAAAAAGCGACAGCTCCAGCAGTAGCCATTGTTTGAACCCTAACAATGAAAAAGGTGATCATGGAGTGAAAGGCTGGGCCTTTGATGATGGAAATGACGTGTGGATGGGGAGAACAATCAACGAGACGTCACGCTTAGGGTATGAAACCTTCAAAGTCGTTGAAGGCTGGTCCAACCCTAAGTCCAAATTGCAGATAAATAGGCAAGTCATAGTTGACAGAGGCGATAGGTCCGGTTATTCTGGTATTTTCTCTGTTGAAGGCAAAAGCTGCATCAATCGGTGCTTTTATGTGGAGTTGATTAGGGGAAGA
AAAGAGGAAACTGAAGTCTTGTGGACTTCAAACAGTATTGTTGTGTTTTGTGGCACCTCAGGTACATATGGAACAGGCTCATGGCCTGATGGGGCGAACCTCAGTCTCATGCATATATAAGCTTTCGCAATTTTAGAAAAAA diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/virus_properties.json b/nextclade/dataset_config/h3n2/na/EPI1857215/virus_properties.json new file mode 100644 index 00000000..a56465d3 --- /dev/null +++ b/nextclade/dataset_config/h3n2/na/EPI1857215/virus_properties.json @@ -0,0 +1,22 @@ +{ + "schemaVersion": "1.10.0", + "nucMutLabelMap": {}, + "nucMutLabelMapReverse": {}, + "aaMotifs": [ + { + "name": "glycosylation", + "nameShort": "Glyc.", + "nameFriendly": "Glycosylation", + "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)", + "includeGenes": [ + { + "gene":"NA", + "ranges":[{"begin":33, "end":470}] + } + ], + "motifs": [ + "N[^P][ST]" + ] + } + ] +} \ No newline at end of file diff --git a/nextclade/dataset_config/vic/ha/EPI1926632/genemap.gff b/nextclade/dataset_config/vic/ha/EPI1926632/genemap.gff new file mode 100644 index 00000000..9c9f0b75 --- /dev/null +++ b/nextclade/dataset_config/vic/ha/EPI1926632/genemap.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region EPI1926632 1 1847 +EPI1926632 feature gene 20 64 . + . gene_name="SigPep" +EPI1926632 feature gene 65 1096 . + . gene_name="HA1" +EPI1926632 feature gene 1097 1765 . + . 
gene_name="HA2" diff --git a/nextclade/dataset_config/vic/ha/EPI1926632/reference.fasta b/nextclade/dataset_config/vic/ha/EPI1926632/reference.fasta new file mode 100644 index 00000000..6bd01349 --- /dev/null +++ b/nextclade/dataset_config/vic/ha/EPI1926632/reference.fasta @@ -0,0 +1,25 @@ +>B/Austria/1359417/2021 | EPI_ISL_6307006 | B / H0N0 | Victoria | 2021-01-09 +attttctaatatccacaaaatgaaggcaataattgtactactcatggtagtaacatccaatgcagatcgaatctgcactg +ggataacatcgtcaaactcaccacatgtcgtcaaaactgctactcaaggggaggtcaatgtgactggtgtaataccactg +acaacaacacccaccaaatctcattttgcaaatctcaaaggaacagaaaccagggggaaactatgcccaaaatgcctaaa +ctgcacagatctggatgtagccttgggcagaccaaaatgcacagggaaaataccctctgcaagggtttcaatactccatg +aagtcagacctgttacatctgggtgctttcctataatgcatgatagaacaaaaattagacagctgcctaaccttctccga +ggatacgaacatgtcaggttatcaactcacaacgttatcaatacagaagatgcaccaggaggaccctacgaaattggaac +ctcagggtcttgcctcaacattaccaatggaaaaggattcttcgcaacaatggcttgggccgtcccaaaaaacaaaacag +caacaaatccattaacaatagaagtaccatacatttgtacagaagaagaagaccaaattaccgtttgggggttccactct +gacgacgagacccaaatggcaaggctctatggggattcaaagccccagaagttcacctcatctgccaacggagtgaccac +acactacgtctcacagattggtggctttccaaatcaaacagaagacggaggactaccacaaagtggcagaattgttgttg +attacatggtgcaaaaatctggaaaaacaggaacaattacctatcaaagaggtattttattgcctcaaaaggtgtggtgc +gcaagtggcaagagcaaggtaataaaaggatccttgcccttaattggagaagcagattgcctccatgaaaaatacggtgg +attaaacaaaagcaagccttactacacaggggaacatgcaaaggccataggaaattgcccaatatgggtgaaaacaccct +tgaagctggccaatggaaccaaatatagacctcctgcaaaactattaaaggaaagaggtttcttcggagccattgctggt +ttcttagagggaggatgggaaggaatgattgcaggttggcacggatacacatcccatggggcacatggagtagcggtggc +agctgaccttaagagcactcaggaggccataaacaagataacaaaaaatctcaactctttgagtgagctggaagtaaaga +atcttcaaagactaagcggtgccatggatgaactccacaacgaaatactagaactagatgagaaagtggatgatctcaga +gctgatacaataagctcacagatagaactcgcagtcctgctttccaatgaaggaataataaacagtgaagatgaacatct +cttggcgcttgaaagaaagctgaagaaaatgctgggcccctctgctgtagagataggaaatggatgctttgaaaccaaac 
+acaagtgcaaccagacctgtctcgacagaatagctgctggtacctttgatgcaggagaattttctctccccacctttgat +tcactgaatattactgctgcatctttaaatgacgatggattggacaatcatactatactgctttactactcaactgctgc +ctccagtttggctgtaacactgatgatagctatctttgttgtttatatggtctccagagacaatgtttcttgctccattt +gtctataagggaagttaagccctgtattttcctttattgtagtgcttgtttgcttgttgtcattacaaagaaacgttatt +gaaaaat diff --git a/nextclade/dataset_config/vic/ha/EPI1926632/virus_properties.json b/nextclade/dataset_config/vic/ha/EPI1926632/virus_properties.json new file mode 100644 index 00000000..d80db6a5 --- /dev/null +++ b/nextclade/dataset_config/vic/ha/EPI1926632/virus_properties.json @@ -0,0 +1,26 @@ +{ + "schemaVersion": "1.10.0", + "nucMutLabelMap": {}, + "nucMutLabelMapReverse": {}, + "aaMotifs": [ + { + "name": "glycosylation", + "nameShort": "Glyc.", + "nameFriendly": "Glycosylation", + "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)", + "includeGenes": [ + { + "gene":"HA1", + "ranges":[] + }, + { + "gene":"HA2", + "ranges":[{"begin":0, "end":187}] + } + ], + "motifs": [ + "N[^P][ST]" + ] + } + ] +} \ No newline at end of file diff --git a/nextclade/dataset_config/vic/ha/KX058884/genemap.gff b/nextclade/dataset_config/vic/ha/KX058884/genemap.gff new file mode 100644 index 00000000..6b0fe595 --- /dev/null +++ b/nextclade/dataset_config/vic/ha/KX058884/genemap.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region KX058884.1 1 1885 +KX058884.1 feature gene 34 78 . + . gene_name="SigPep" +KX058884.1 feature gene 79 1119 . + . gene_name="HA1" +KX058884.1 feature gene 1120 1791 . + . 
gene_name="HA2" diff --git a/nextclade/dataset_config/vic/ha/KX058884/reference.fasta b/nextclade/dataset_config/vic/ha/KX058884/reference.fasta new file mode 100644 index 00000000..6d22e3ea --- /dev/null +++ b/nextclade/dataset_config/vic/ha/KX058884/reference.fasta @@ -0,0 +1,28 @@ +>KX058884.1 Influenza B virus (B/Brisbane/60/2008) segment 4 hemagglutinin (HA) gene, complete cds +AGCAGAAGCAGAGCATTTTCTAATATCCACAAAATGAAGGCAATAATTGTACTACTCATGGTAGTAACAT +CCAATGCAGATCGAATCTGCACTGGGATAACATCGTCAAACTCACCACATGTCGTCAAAACTGCTACTCA +AGGGGAGGTCAATGTGACTGGTGTAATACCACTGACAACAACACCCACCAAATCTCATTTTGCAAATCTC +AAAGGAACAGAAACCAGGGGGAAACTATGCCCAAAATGCCTCAACTGCACAGATCTGGACGTAGCCTTGG +GCAGACCAAAATGCACGGGGAAAATACCCTCGGCAAGAGTTTCAATACTCCATGAAGTCAGACCTGTTAC +ATCTGGGTGCTTTCCTATAATGCACGACAGAACAAAAATTAGACAGCTGCCTAACCTTCTCCGAGGATAC +GAACATATCAGGTTATCAACCCATAACGTTATCAATGCAGAAAATGCACCAGGAGGACCCTACAAAATTG +GAACCTCAGGGTCTTGCCCTAACATTACCAATGGAAACGGATTTTTCGCAACAATGGCTTGGGCCGTCCC +AAAAAACGACAAAAACAAAACAGCAACAAATCCATTAACAATAGAAGTACCATACATTTGTACAGAAGGA +GAAGACCAAATTACCGTTTGGGGGTTCCACTCTGACAACGAGGCCCAAATGGCAAAGCTCTATGGGGACT +CAAAGCCCCAGAAGTTCACCTCATCTGCCAACGGAGTGACCACACATTACGTTTCACAGATTGGTGGCTT +CCCAAATCAAACAGAAGACGGAGGACTACCACAAAGTGGTAGAATTGTTGTTGATTACATGGTGCAAAAA +TCTGGGAAAACAGGAACAATTACCTATCAAAGGGGTATTTTATTGCCTCAAAAGGTGTGGTGCGCAAGTG +GCAGGAGCAAGGTAATAAAAGGATCCTTGCCTTTAATTGGAGAAGCAGATTGCCTCCACGAAAAATACGG +TGGATTAAACAAAAGCAAGCCTTACTACACAGGGGAACATGCAAAGGCCATAGGAAATTGCCCAATATGG +GTGAAAACACCCTTGAAGCTGGCCAATGGAACCAAATATAGACCTCCTGCAAAACTATTAAAGGAAAGGG +GTTTCTTCGGAGCTATTGCTGGTTTCTTAGAAGGAGGATGGGAAGGAATGATTGCAGGTTGGCACGGATA +CACATCCCATGGGGCACATGGAGTAGCGGTGGCAGCAGACCTTAAGAGCACTCAAGAGGCCATAAACAAG +ATAACAAAAAATCTCAACTCTTTGAGTGAGCTGGAAGTAAAGAATCTTCAAAGACTAAGCGGTGCCATGG +ATGAACTCCACAACGAAATACTAGAACTAGATGAGAAAGTGGATGATCTCAGAGCTGATACAATAAGCTC +ACAAATAGAACTCGCAGTCCTGCTTTCCAATGAAGGAATAATAAACAGTGAAGATGAACATCTCTTGGCG +CTTGAAAGAAAGCTGAAGAAAATGCTGGGCCCCTCTGCTGTAGAGATAGGGAATGGATGCTTTGAAACCA 
+AACACAAGTGCAACCAGACCTGTCTCGACAGAATAGCTGCTGGTACCTTTGATGCAGGAGAATTTTCTCT +CCCCACCTTTGATTCACTGAATATTACTGCTGCATCTTTAAATGACGATGGATTGGATAATCATACTATA +CTGCTTTACTACTCAACTGCTGCCTCCAGTTTGGCTGTAACACTGATGATAGCTATCTTTGTTGTTTATA +TGGTCTCCAGAGACAATGTTTCTTGCTCCATCTGTCTATAAGGGAAGTTAAGCCCTGTATTTTCCTTTAT +TGTAGTGCTTGTTTACTTGTTGTCATTACAAAGAAACGTTATTGAAAAATGCTCTTGTTACTACT diff --git a/nextclade/dataset_config/vic/ha/KX058884/virus_properties.json b/nextclade/dataset_config/vic/ha/KX058884/virus_properties.json new file mode 100644 index 00000000..a9af3dfd --- /dev/null +++ b/nextclade/dataset_config/vic/ha/KX058884/virus_properties.json @@ -0,0 +1,26 @@ +{ + "schemaVersion": "1.10.0", + "nucMutLabelMap": {}, + "nucMutLabelMapReverse": {}, + "aaMotifs": [ + { + "name": "glycosylation", + "nameShort": "Glyc.", + "nameFriendly": "Glycosylation", + "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)", + "includeGenes": [ + { + "gene":"HA1", + "ranges":[] + }, + { + "gene":"HA2", + "ranges":[{"begin":0, "end":186}] + } + ], + "motifs": [ + "N[^P][ST]" + ] + } + ] +} \ No newline at end of file diff --git a/nextclade/dataset_config/vic/includes.txt b/nextclade/dataset_config/vic/includes.txt new file mode 100644 index 00000000..e69de29b diff --git a/nextclade/dataset_config/vic/na/CY073894/genemap.gff b/nextclade/dataset_config/vic/na/CY073894/genemap.gff new file mode 100644 index 00000000..a1ce3b32 --- /dev/null +++ b/nextclade/dataset_config/vic/na/CY073894/genemap.gff @@ -0,0 +1,4 @@ +##gff-version 3 +##sequence-region CY073894.1 1 1401 +CY073894.1 annotation remark 1 1401 . . . accessions=CY073894;data_file_division=VRL;date=25-JUL-2016;keywords=;molecule_type=cRNA;organism=Influenza B virus %28B/Brisbane/60/2008%29;references=location: %5B0:1401%5D%0Aauthors: Sabaiduc%2CS.%2C Skowronski%2CD.%2C Petric%2CM. 
and Chan%2CT.%0Atitle: %0Ajournal: Unpublished%0Amedline id: %0Apubmed id: %0Acomment:,location: %5B0:1401%5D%0Aauthors: Sabaiduc%2CS.%2C Skowronski%2CD.%2C Gardy%2CJ.%2C Petric%2CM. and Chan%2CT.%0Atitle: Direct Submission%0Ajournal: Submitted %2816-SEP-2010%29 Genome Research Laboratory%2C British Columbia Centre for Disease Control%2C 655 West 12th Ave.%2C Vancouver%2C BC V5Z4R4%2C Canada%0Amedline id: %0Apubmed id: %0Acomment:;sequence_version=1;source=Influenza B virus %28B/Brisbane/60/2008%29;taxonomy=Viruses,Riboviria,Orthornavirae,Negarnaviricota,Polyploviricotina,Insthoviricetes,Articulavirales,Orthomyxoviridae,Betainfluenzavirus;topology=linear +CY073894.1 feature gene 1 1401 . + . codon_start=1;gene=NA;gene_name=NA;product=neuraminidase;protein_id=ADN32819.1;translation=MLPSTIQTLTLFLTSGGVLLSLYVSASLSYLLYSDILLKFSPTEITAPTMPLDCANASNVQAVNRSATKGVTLLLPEPEWTYPRLSCPGSTFQKALLISPHRFGETKGNSAPLIIREPFIACGPNECKHFALTHYAAQPGGYYNGTRGDRNKLRHLISVKLGKIPTVENSIFHMAAWSGSACHDGKEWTYIGVDGPDNNALLKVKYGEAYTDTYHSYANKILRTQESACNCIGGNCYLMITDGSASGVSECRFLKIREGRIIKEIFPTGRVKHTEECTCGFASNKTIECACRDNSYTAKRPFVKLNVETDTAEIRLMCTDTYLDTPRPNDGSITGPCESNGDKGSGGIKGGFVHQRMESKIGRWYSRTMSKTERMGMGLYVKYDGDPWADSDALAFSGVMVSMKEPGWYSFGFEIKDKKCDVPCIGIEMVHDGGKETWHSAATAIYCLMGSGQLLWDTVTGVDMAL diff --git a/nextclade/dataset_config/vic/na/CY073894/reference.fasta b/nextclade/dataset_config/vic/na/CY073894/reference.fasta new file mode 100644 index 00000000..1bdc8402 --- /dev/null +++ b/nextclade/dataset_config/vic/na/CY073894/reference.fasta @@ -0,0 +1,25 @@ +>CY073894.1 Influenza B virus (B/Brisbane/60/2008) segment 6 sequence +ATGCTACCTTCAACTATACAAACGTTAACCCTATTTCTCACATCAGGGGGAGTATTATTA +TCACTATATGTGTCAGCTTCATTATCATACTTACTATATTCGGATATATTGCTAAAATTC +TCACCAACAGAAATAACTGCACCAACAATGCCATTGGATTGTGCAAACGCATCAAATGTT +CAGGCTGTGAACCGTTCTGCAACAAAAGGGGTGACACTTCTTCTCCCAGAACCGGAGTGG +ACATACCCGCGTTTATCTTGCCCGGGCTCAACCTTTCAGAAAGCACTCCTAATTAGCCCT +CATAGATTCGGAGAAACCAAAGGAAACTCAGCTCCCTTGATAATAAGGGAACCTTTTATT 
+GCTTGTGGACCAAATGAATGCAAACACTTTGCTCTAACCCATTATGCAGCCCAACCAGGG +GGATACTACAATGGAACAAGAGGAGACAGAAACAAGCTGAGGCATCTAATTTCAGTCAAA +TTGGGCAAAATCCCAACAGTAGAAAACTCCATTTTCCACATGGCAGCATGGAGCGGGTCC +GCGTGCCATGATGGTAAGGAATGGACATATATCGGAGTTGATGGCCCTGACAATAATGCA +TTGCTCAAAGTAAAATATGGAGAAGCATATACTGACACATACCATTCCTATGCAAACAAA +ATCCTAAGAACACAAGAAAGTGCCTGCAATTGCATCGGGGGAAATTGTTATCTTATGATA +ACTGATGGCTCAGCTTCAGGTGTTAGTGAATGCAGATTTCTTAAGATTCGAGAGGGCCGA +ATAATAAAAGAAATATTTCCAACAGGAAGAGTAAAACACACTGAGGAATGCACATGCGGA +TTTGCCAGCAATAAAACCATAGAATGTGCCTGTAGAGATAACAGTTACACAGCAAAAAGA +CCTTTTGTCAAATTAAACGTGGAGACTGATACAGCAGAAATAAGATTGATGTGCACAGAT +ACTTATTTGGACACCCCCAGACCAAACGATGGAAGCATAACAGGCCCTTGTGAATCTAAT +GGGGACAAAGGGAGTGGAGGCATCAAGGGAGGATTTGTTCATCAAAGAATGGAATCCAAG +ATTGGAAGGTGGTACTCTCGAACGATGTCTAAAACTGAAAGGATGGGGATGGGACTGTAT +GTCAAGTATGATGGAGACCCATGGGCTGACAGTGATGCCCTAGCTTTTAGTGGAGTAATG +GTTTCAATGAAAGAACCTGGTTGGTACTCCTTTGGCTTCGAAATAAAAGATAAGAAATGC +GATGTCCCCTGTATTGGGATAGAGATGGTACATGATGGTGGAAAAGAGACTTGGCACTCA +GCAGCAACAGCCATTTACTGTTTAATGGGCTCAGGACAGCTGCTGTGGGACACTGTCACA +GGTGTTGACATGGCTCTGTAA diff --git a/nextclade/dataset_config/vic/na/CY073894/virus_properties.json b/nextclade/dataset_config/vic/na/CY073894/virus_properties.json new file mode 100644 index 00000000..3bfeb859 --- /dev/null +++ b/nextclade/dataset_config/vic/na/CY073894/virus_properties.json @@ -0,0 +1,22 @@ +{ + "schemaVersion": "1.10.0", + "nucMutLabelMap": {}, + "nucMutLabelMapReverse": {}, + "aaMotifs": [ + { + "name": "glycosylation", + "nameShort": "Glyc.", + "nameFriendly": "Glycosylation", + "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)", + "includeGenes": [ + { + "gene":"NA", + "ranges":[{"begin":33, "end":466}] + } + ], + "motifs": [ + "N[^P][ST]" + ] + } + ] +} \ No newline at end of file diff --git a/nextclade/dataset_config/yam/ha/JN993010/genemap.gff b/nextclade/dataset_config/yam/ha/JN993010/genemap.gff new file mode 100644 index 00000000..1d67c700 --- /dev/null +++ 
b/nextclade/dataset_config/yam/ha/JN993010/genemap.gff @@ -0,0 +1,5 @@ +##gff-version 3 +##sequence-region JN993010.1 1 1755 +JN993010.1 feature gene 1 45 . + . gene_name="SigPep" +JN993010.1 feature gene 46 1083 . + . gene_name="HA1" +JN993010.1 feature gene 1084 1755 . + . gene_name="HA2" diff --git a/nextclade/dataset_config/yam/ha/JN993010/reference.fasta b/nextclade/dataset_config/yam/ha/JN993010/reference.fasta new file mode 100644 index 00000000..ace128e2 --- /dev/null +++ b/nextclade/dataset_config/yam/ha/JN993010/reference.fasta @@ -0,0 +1,2 @@ +>JN993010.1 Influenza B virus (B/Wisconsin/01/2010) segment 4 hemagglutinin (HA) gene, complete cds +ATGAAGGCAATAATTGTACTACTCATGGTAGTAACATCCAATGCAGATCGAATCTGCACTGGGATAACATCTTCAAACTCACCTCATGTGGTCAAAACAGCTACTCAAGGGGAGGTCAATGTGACTGGCGTGATACCACTGACAACAACACCAACAAAATCTTATTTTGCAAATCTCAAAGGAACAAGGACCAGAGGGAAACTATGCCCGGACTGTCTCAACTGTACAGATCTGGATGTGGCCTTGGGCAGGCCAATGTGTGTGGGGACCACACCTTCTGCTAAAGCTTCAATACTCCACGAGGTCAGACCTGTTACATCCGGGTGCTTTCCTATAATGCACGACAGAACAAAAATCAGGCAACTACCCAATCTTCTCAGAGGATATGAAAATATCAGGTTATCAACCCAAAACGTTATCGATGCAGAAAAAGCACCAGGAGGACCCTACAGACTTGGAACCTCAGGATCTTGCCCTAACGCTACCAGTAAAATCGGATTTTTTGCAACAATGGCTTGGGCTGTCCCAAAGGACAACTACAAAAATGCAACGAACCCACTAACAGTAGAAGTACCATACATTTGTACAGAAGGGGAAGACCAAATTACTGTTTGGGGGTTCCATTCAGATAACAAAACCCAAATGAAGAGCCTCTATGGAGACTCAAATCCTCAAAAGTTCACCTCATCTGCTAATGGAGTAACCACACATTATGTTTCTCAGATTGGCGACTTCCCAGATCAAACAGAAGACGGAGGACTACCACAAAGCGGCAGAATTGTTGTTGATTACATGATGCAAAAACCTGGGAAAACAGGAACAATTGTCTATCAAAGAGGTGTTTTGTTGCCTCAAAAGGTGTGGTGCGCGAGTGGCAGGAGCAAAGTAATAAAAGGGTCATTGCCTTTAATTGGTGAAGCAGATTGCCTTCATGAAAAATACGGTGGATTAAACAAAAGCAAGCCTTACTACACAGGAGAACATGCAAAAGCCATAGGAAATTGCCCAATATGGGTAAAAACACCTTTGAAGCTTGCCAATGGAACCAAATATAGACCTCCTGCAAAACTATTGAAGGAAAGGGGTTTCTTCGGAGCTATTGCTGGTTTCCTAGAAGGAGGATGGGAAGGAATGATTGCAGGTTGGCACGGATACACATCTCACGGAGCACATGGAGTGGCAGTGGCGGCAGACCTTAAGAGTACACAAGAAGCTATAAATAAGATAACAAAAAATCTCAATTCTTTGAGTGAGCTAGAAGTAAAGAACCTTCAAAGACTAAGTGGTGCCATGGATGAACTCCACAACGAAATACTCGAGCTGGATGAGAAAGTGGAT
GATCTCAGAGCTGACACTATAAGCTCACAAATAGAACTTGCAGTCTTGCTTTCCAACGAAGGAATAATAAACAGTGAAGACGAGCATCTATTGGCACTTGAGAGAAAACTAAAGAAAATGCTGGGTCCCTCTGCTGTAGACATAGGAAACGGATGCTTCGAAACCAAACACAAATGCAACCAGACCTGCTTAGACAGGATAGCTGCTGGCACCTTTAATGCAGGAGAATTTTCTCTCCCCACTTTTGATTCATTGAACATTACTGCTGCATCTTTAAATGATGATGGATTGGATAACCATACTATACTGCTCTATTACTCAACTGCTGCTTCTAGTTTGGCTGTAACATTAATGCTAGCTATTTTTATTGTTTATATGGTCTCCAGAGACAACGTTTCATGCTCCATCTGTCTATAA From e9afb6d87093a8d26dbdf96de00b7d05b331df47 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Mon, 25 Sep 2023 14:30:27 +0200 Subject: [PATCH 03/26] rename annotation and virus properties files --- nextclade/Snakefile | 59 +++++++++++-------- .../CY121680/{genemap.gff => annotation.gff} | 0 .../{virus_properties.json => pathogen.json} | 0 .../MW626062/{genemap.gff => annotation.gff} | 0 .../{virus_properties.json => pathogen.json} | 0 .../MW626056/{genemap.gff => annotation.gff} | 0 .../{virus_properties.json => pathogen.json} | 0 .../CY163680/{genemap.gff => annotation.gff} | 0 .../{virus_properties.json => pathogen.json} | 0 .../{genemap.gff => annotation.gff} | 0 .../{virus_properties.json => pathogen.json} | 0 .../{genemap.gff => annotation.gff} | 0 .../{virus_properties.json => pathogen.json} | 0 .../{genemap.gff => annotation.gff} | 0 .../{virus_properties.json => pathogen.json} | 0 .../KX058884/{genemap.gff => annotation.gff} | 0 .../{virus_properties.json => pathogen.json} | 0 .../CY073894/{genemap.gff => annotation.gff} | 0 .../{virus_properties.json => pathogen.json} | 0 .../JN993010/{genemap.gff => annotation.gff} | 0 20 files changed, 33 insertions(+), 26 deletions(-) rename nextclade/dataset_config/h1n1pdm/ha/CY121680/{genemap.gff => annotation.gff} (100%) rename nextclade/dataset_config/h1n1pdm/ha/CY121680/{virus_properties.json => pathogen.json} (100%) rename nextclade/dataset_config/h1n1pdm/ha/MW626062/{genemap.gff => annotation.gff} (100%) rename nextclade/dataset_config/h1n1pdm/ha/MW626062/{virus_properties.json => pathogen.json} 
(100%) rename nextclade/dataset_config/h1n1pdm/na/MW626056/{genemap.gff => annotation.gff} (100%) rename nextclade/dataset_config/h1n1pdm/na/MW626056/{virus_properties.json => pathogen.json} (100%) rename nextclade/dataset_config/h3n2/ha/CY163680/{genemap.gff => annotation.gff} (100%) rename nextclade/dataset_config/h3n2/ha/CY163680/{virus_properties.json => pathogen.json} (100%) rename nextclade/dataset_config/h3n2/ha/EPI1857216/{genemap.gff => annotation.gff} (100%) rename nextclade/dataset_config/h3n2/ha/EPI1857216/{virus_properties.json => pathogen.json} (100%) rename nextclade/dataset_config/h3n2/na/EPI1857215/{genemap.gff => annotation.gff} (100%) rename nextclade/dataset_config/h3n2/na/EPI1857215/{virus_properties.json => pathogen.json} (100%) rename nextclade/dataset_config/vic/ha/EPI1926632/{genemap.gff => annotation.gff} (100%) rename nextclade/dataset_config/vic/ha/EPI1926632/{virus_properties.json => pathogen.json} (100%) rename nextclade/dataset_config/vic/ha/KX058884/{genemap.gff => annotation.gff} (100%) rename nextclade/dataset_config/vic/ha/KX058884/{virus_properties.json => pathogen.json} (100%) rename nextclade/dataset_config/vic/na/CY073894/{genemap.gff => annotation.gff} (100%) rename nextclade/dataset_config/vic/na/CY073894/{virus_properties.json => pathogen.json} (100%) rename nextclade/dataset_config/yam/ha/JN993010/{genemap.gff => annotation.gff} (100%) diff --git a/nextclade/Snakefile b/nextclade/Snakefile index ab5819d8..5e5ee0e9 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -8,7 +8,17 @@ wildcard_constraints: segment = r'pb2|pb1|pa|ha|np|na|mp|ns', reference="[^_/]+", - +def all_builds(w): + builds = [] + for lineage in config["builds"]: + for segment in config["builds"][lineage]: + for ref in config["builds"][lineage][segment]["refs"]: + builds.append(f"datasets/{lineage}/{segment}/{ref}/tree.json") + return builds + +rule all: + input: + all_builds rule download_sequences: @@ -84,13 +94,13 @@ rule subsample: 
sampled_sequences="build/{lineage}/{segment}/{reference}/subsample.fasta", sampled_strains="build/{lineage}/{segment}/{reference}/subsample.txt", params: - filter_arguments=lambda w: config["builds"][w.lineage][w.segment][ + filter_arguments=lambda w: config["builds"][w.lineage][w.segment]["refs"][ w.reference ]["filter"], - reference_EPI_ISL=lambda w: config["builds"][w.lineage][w.segment][ + reference_EPI_ISL=lambda w: config["builds"][w.lineage][w.segment]["refs"][ w.reference ]["reference_EPI_ISL"], - other_include = lambda w:config["builds"][w.lineage][w.segment][w.reference].get("include_file","") + other_include = lambda w:config["builds"][w.lineage][w.segment]["refs"][w.reference].get("include_file","") shell: """ augur filter \ @@ -106,8 +116,8 @@ rule subsample: rule align: input: sequences="build/{lineage}/{segment}/{reference}/subsample.fasta", - annotation="references/{lineage}/{segment}/{reference}/annotation.gff", - reference="references/{lineage}/{segment}/{reference}/reference.fasta", + annotation="dataset_config/{lineage}/{segment}/{reference}/annotation.gff", + reference="dataset_config/{lineage}/{segment}/{reference}/reference.fasta", output: alignment="build/{lineage}/{segment}/{reference}/align.aligned.fasta", insertions="build/{lineage}/{segment}/{reference}/align.insertions.csv", @@ -201,8 +211,8 @@ rule ancestral: input: tree=rules.refine.output.tree, alignment=rules.align.output.alignment, - annotation="references/{lineage}/{segment}/{reference}/annotation.gff", - reference="references/{lineage}/{segment}/{reference}/reference.fasta", + annotation="dataset_config/{lineage}/{segment}/{reference}/annotation.gff", + reference="dataset_config/{lineage}/{segment}/{reference}/reference.fasta", output: node_data="build/{lineage}/{segment}/{reference}/muts.json", params: @@ -210,7 +220,7 @@ rule ancestral: genes=genes, translations=lambda w: expand( "build/{lineage}/{segment}/{reference}/aligned.gene.{genes}.fasta", - strain=w.lineage, + 
lineage=w.lineage, segment=w.segment, genes=genes(w), reference=w.reference, @@ -234,14 +244,14 @@ rule clades: "Adding internal clade labels" input: tree=rules.refine.output.tree, - nucs=rules.ancestral.output.node_data, + muts=rules.ancestral.output.node_data, clades=rules.offset_clades.output, output: - node_data="build/{lineage}/{segment}/{reference}/{clade}.json", + node_data="build/{lineage}/{segment}/{reference}/clade_{clade}.json", shell: """ augur clades --tree {input.tree} \ - --mutations {input.nuc_muts} {input.aa_muts} \ + --mutations {input.muts} \ --clades {input.clades} \ --output-node-data {output.node_data} \ > /dev/null @@ -250,15 +260,12 @@ rule clades: def get_node_data(w): node_data = [ rules.refine.output.node_data, - "build/{lineage}/{segment}/{reference}/aa_muts_adapted.json".format(**w), - "build/{lineage}/{segment}/{reference}/nuc_muts_adapted.json".format(**w), + "build/{lineage}/{segment}/{reference}/muts.json".format(**w), ] - for clade in config["builds"][w.lineage][w.segment][w.reference]["clades"]: - node_data.append("build/{lineage}/{segment}/{reference}/".format(**w) + f'/{clade}.json') - - if "clade_contractions" in config["builds"][w.lineage][w.segment][w.reference]: - node_data.append("build/{lineage}/{segment}/{reference}/clades-short.json".format(**w)) + for clade in config["builds"][w.lineage][w.segment]["clade_systems"]: + print(clade) + node_data.append("build/{lineage}/{segment}/{reference}/".format(**w) + f'clade_{clade}.json') return node_data @@ -270,9 +277,9 @@ rule export: tree=rules.refine.output.tree, metadata=rules.parse.output.metadata, node_data = get_node_data, - auspice_config=lambda w: config["files"]["auspice_config_shortclade"] if "clade_contractions" in config["builds"][w.lineage][w.segment][w.reference] else config["files"]["auspice_config"], + auspice_config= "config/auspice_config.json", output: - auspice_json="auspice/{lineage}/{segment}/{reference}/auspice_raw.json", + 
auspice_json="build/{lineage}/{segment}/{reference}/auspice.json", params: fields="region strainName country date EPI_ISL", date=datetime.datetime.utcnow().strftime("%Y-%m-%d"), @@ -311,11 +318,11 @@ rule generate_sample_sequences: rule make_dataset: input: sequences="build/{lineage}/{segment}/{reference}/sample_sequences.fasta", - auspice_json="auspice/{lineage}/{segment}/{reference}/auspice.json", - annotation="references/{lineage}/{segment}/{reference}/annotation.gff", - reference="references/{lineage}/{segment}/{reference}/reference.fasta", - pathogen_json="references/{lineage}/{segment}/{reference}/pathogen.json", - additional_config="references/{lineage}/{segment}/{reference}/virus_properties.json", + auspice_json="build/{lineage}/{segment}/{reference}/auspice.json", + annotation="dataset_config/{lineage}/{segment}/{reference}/annotation.gff", + reference="dataset_config/{lineage}/{segment}/{reference}/reference.fasta", + pathogen_json="dataset_config/{lineage}/{segment}/{reference}/pathogen.json", + additional_config="dataset_config/{lineage}/{segment}/{reference}/pathogen.json", output: sequences="datasets/{lineage}/{segment}/{reference}/example_sequences.fasta", tree="datasets/{lineage}/{segment}/{reference}/tree.json", diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/genemap.gff b/nextclade/dataset_config/h1n1pdm/ha/CY121680/annotation.gff similarity index 100% rename from nextclade/dataset_config/h1n1pdm/ha/CY121680/genemap.gff rename to nextclade/dataset_config/h1n1pdm/ha/CY121680/annotation.gff diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/virus_properties.json b/nextclade/dataset_config/h1n1pdm/ha/CY121680/pathogen.json similarity index 100% rename from nextclade/dataset_config/h1n1pdm/ha/CY121680/virus_properties.json rename to nextclade/dataset_config/h1n1pdm/ha/CY121680/pathogen.json diff --git a/nextclade/dataset_config/h1n1pdm/ha/MW626062/genemap.gff b/nextclade/dataset_config/h1n1pdm/ha/MW626062/annotation.gff similarity 
index 100% rename from nextclade/dataset_config/h1n1pdm/ha/MW626062/genemap.gff rename to nextclade/dataset_config/h1n1pdm/ha/MW626062/annotation.gff diff --git a/nextclade/dataset_config/h1n1pdm/ha/MW626062/virus_properties.json b/nextclade/dataset_config/h1n1pdm/ha/MW626062/pathogen.json similarity index 100% rename from nextclade/dataset_config/h1n1pdm/ha/MW626062/virus_properties.json rename to nextclade/dataset_config/h1n1pdm/ha/MW626062/pathogen.json diff --git a/nextclade/dataset_config/h1n1pdm/na/MW626056/genemap.gff b/nextclade/dataset_config/h1n1pdm/na/MW626056/annotation.gff similarity index 100% rename from nextclade/dataset_config/h1n1pdm/na/MW626056/genemap.gff rename to nextclade/dataset_config/h1n1pdm/na/MW626056/annotation.gff diff --git a/nextclade/dataset_config/h1n1pdm/na/MW626056/virus_properties.json b/nextclade/dataset_config/h1n1pdm/na/MW626056/pathogen.json similarity index 100% rename from nextclade/dataset_config/h1n1pdm/na/MW626056/virus_properties.json rename to nextclade/dataset_config/h1n1pdm/na/MW626056/pathogen.json diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/genemap.gff b/nextclade/dataset_config/h3n2/ha/CY163680/annotation.gff similarity index 100% rename from nextclade/dataset_config/h3n2/ha/CY163680/genemap.gff rename to nextclade/dataset_config/h3n2/ha/CY163680/annotation.gff diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/virus_properties.json b/nextclade/dataset_config/h3n2/ha/CY163680/pathogen.json similarity index 100% rename from nextclade/dataset_config/h3n2/ha/CY163680/virus_properties.json rename to nextclade/dataset_config/h3n2/ha/CY163680/pathogen.json diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/genemap.gff b/nextclade/dataset_config/h3n2/ha/EPI1857216/annotation.gff similarity index 100% rename from nextclade/dataset_config/h3n2/ha/EPI1857216/genemap.gff rename to nextclade/dataset_config/h3n2/ha/EPI1857216/annotation.gff diff --git 
a/nextclade/dataset_config/h3n2/ha/EPI1857216/virus_properties.json b/nextclade/dataset_config/h3n2/ha/EPI1857216/pathogen.json similarity index 100% rename from nextclade/dataset_config/h3n2/ha/EPI1857216/virus_properties.json rename to nextclade/dataset_config/h3n2/ha/EPI1857216/pathogen.json diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/genemap.gff b/nextclade/dataset_config/h3n2/na/EPI1857215/annotation.gff similarity index 100% rename from nextclade/dataset_config/h3n2/na/EPI1857215/genemap.gff rename to nextclade/dataset_config/h3n2/na/EPI1857215/annotation.gff diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/virus_properties.json b/nextclade/dataset_config/h3n2/na/EPI1857215/pathogen.json similarity index 100% rename from nextclade/dataset_config/h3n2/na/EPI1857215/virus_properties.json rename to nextclade/dataset_config/h3n2/na/EPI1857215/pathogen.json diff --git a/nextclade/dataset_config/vic/ha/EPI1926632/genemap.gff b/nextclade/dataset_config/vic/ha/EPI1926632/annotation.gff similarity index 100% rename from nextclade/dataset_config/vic/ha/EPI1926632/genemap.gff rename to nextclade/dataset_config/vic/ha/EPI1926632/annotation.gff diff --git a/nextclade/dataset_config/vic/ha/EPI1926632/virus_properties.json b/nextclade/dataset_config/vic/ha/EPI1926632/pathogen.json similarity index 100% rename from nextclade/dataset_config/vic/ha/EPI1926632/virus_properties.json rename to nextclade/dataset_config/vic/ha/EPI1926632/pathogen.json diff --git a/nextclade/dataset_config/vic/ha/KX058884/genemap.gff b/nextclade/dataset_config/vic/ha/KX058884/annotation.gff similarity index 100% rename from nextclade/dataset_config/vic/ha/KX058884/genemap.gff rename to nextclade/dataset_config/vic/ha/KX058884/annotation.gff diff --git a/nextclade/dataset_config/vic/ha/KX058884/virus_properties.json b/nextclade/dataset_config/vic/ha/KX058884/pathogen.json similarity index 100% rename from nextclade/dataset_config/vic/ha/KX058884/virus_properties.json rename to 
nextclade/dataset_config/vic/ha/KX058884/pathogen.json diff --git a/nextclade/dataset_config/vic/na/CY073894/genemap.gff b/nextclade/dataset_config/vic/na/CY073894/annotation.gff similarity index 100% rename from nextclade/dataset_config/vic/na/CY073894/genemap.gff rename to nextclade/dataset_config/vic/na/CY073894/annotation.gff diff --git a/nextclade/dataset_config/vic/na/CY073894/virus_properties.json b/nextclade/dataset_config/vic/na/CY073894/pathogen.json similarity index 100% rename from nextclade/dataset_config/vic/na/CY073894/virus_properties.json rename to nextclade/dataset_config/vic/na/CY073894/pathogen.json diff --git a/nextclade/dataset_config/yam/ha/JN993010/genemap.gff b/nextclade/dataset_config/yam/ha/JN993010/annotation.gff similarity index 100% rename from nextclade/dataset_config/yam/ha/JN993010/genemap.gff rename to nextclade/dataset_config/yam/ha/JN993010/annotation.gff From 73bbd87085c459a7e192e28cd271b6a5b5b0af59 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Mon, 25 Sep 2023 15:27:37 +0200 Subject: [PATCH 04/26] update pathogen json --- nextclade/Snakefile | 41 +++++++++++++------------------ nextclade/config/config_dict.yaml | 35 +++++++++++++++++--------- nextclade/config/pathogen.json | 14 ++++++++++- 3 files changed, 54 insertions(+), 36 deletions(-) diff --git a/nextclade/Snakefile b/nextclade/Snakefile index 5e5ee0e9..1ec4d6c0 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -26,7 +26,6 @@ rule download_sequences: sequences="data/{lineage}/raw_{segment}.fasta" params: s3_path="s3://nextstrain-data-private/files/workflows/seasonal-flu/{lineage}/{segment}/raw_sequences.fasta.xz" - conda: "../../workflow/envs/nextstrain.yaml" shell: """ aws s3 cp {params.s3_path} - | xz -c -d > {output.sequences} @@ -38,7 +37,7 @@ rule download_clades: output: clade_tsv = "data/{clade}_{lineage}_{segment}_{reference}_raw.tsv" params: - source_tsv=lambda w: config['clade_repo'] + 
config["builds"][w.lineage][w.segment]["clade_systems"][w.clade], + source_tsv=lambda w: config['clade_repo'] + config["builds"][w.lineage][w.segment]["clade_systems"][w.clade]['url'], shell: """ curl {params.source_tsv} > {output.clade_tsv} @@ -119,20 +118,19 @@ rule align: annotation="dataset_config/{lineage}/{segment}/{reference}/annotation.gff", reference="dataset_config/{lineage}/{segment}/{reference}/reference.fasta", output: - alignment="build/{lineage}/{segment}/{reference}/align.aligned.fasta", - insertions="build/{lineage}/{segment}/{reference}/align.insertions.csv", + alignment="build/{lineage}/{segment}/{reference}/align.aligned.fasta" params: outdir=lambda w: f"build/{w.lineage}/{w.segment}/{w.reference}/aligned.gene.{{gene}}.fasta", + nextclade_bin = "./nextclade_v3" threads: 3 shell: """ - nextclade run \ + {params.nextclade_bin} run \ --jobs={threads} \ --input-ref {input.reference} \ --input-annotation {input.annotation} \ --output-translations {params.outdir} \ --output-fasta {output.alignment} \ - --output-insertions {output.insertions} \ {input.sequences} \ 2>&1 """ @@ -143,16 +141,11 @@ rule tree: alignment=rules.align.output.alignment, output: tree="build/{lineage}/{segment}/{reference}/tree_raw.nwk", - params: - args=lambda w: config["tree"].get("tree-builder-args", "") - if "tree" in config - else "", threads: 3 shell: """ augur tree \ --alignment {input.alignment} \ - --tree-builder-args {params.args} \ --output {output.tree} \ --nthreads {threads} \ > /dev/null @@ -218,13 +211,7 @@ rule ancestral: params: inference="joint", genes=genes, - translations=lambda w: expand( - "build/{lineage}/{segment}/{reference}/aligned.gene.{genes}.fasta", - lineage=w.lineage, - segment=w.segment, - genes=genes(w), - reference=w.reference, - ), + translations= "build/{lineage}/{segment}/{reference}/aligned.gene.%GENE.fasta", shell: """ augur ancestral \ @@ -234,7 +221,7 @@ rule ancestral: --infer-ambiguous \ --genes {params.genes} \ --annotation 
{input.annotation} \ - --translations {params.translations} \ + --translations {params.translations:q} \ --root-sequence {input.reference} \ --output-node-data {output.node_data} """ @@ -248,11 +235,16 @@ rule clades: clades=rules.offset_clades.output, output: node_data="build/{lineage}/{segment}/{reference}/clade_{clade}.json", + params: + membership_key= lambda w: config["builds"][w.lineage][w.segment]["clade_systems"][w.clade].get('key', 'clade_membership'), + label_key= lambda w: config["builds"][w.lineage][w.segment]["clade_systems"][w.clade].get('key', 'clade') shell: """ augur clades --tree {input.tree} \ --mutations {input.muts} \ --clades {input.clades} \ + --membership-name {params.membership_key} \ + --label-name {params.label_key} \ --output-node-data {output.node_data} \ > /dev/null """ @@ -292,6 +284,7 @@ rule export: --node-data {input.node_data}\ --auspice-config {input.auspice_config} \ --color-by-metadata {params.fields} \ + --minify-json \ --title "Nextclade reference tree for Influenza type:{wildcards.lineage} segment:{wildcards.segment} with root {wildcards.reference} built on {params.date}" \ --output {output.auspice_json} 2>&1; """ @@ -321,7 +314,7 @@ rule make_dataset: auspice_json="build/{lineage}/{segment}/{reference}/auspice.json", annotation="dataset_config/{lineage}/{segment}/{reference}/annotation.gff", reference="dataset_config/{lineage}/{segment}/{reference}/reference.fasta", - pathogen_json="dataset_config/{lineage}/{segment}/{reference}/pathogen.json", + pathogen_json="config/pathogen.json", additional_config="dataset_config/{lineage}/{segment}/{reference}/pathogen.json", output: sequences="datasets/{lineage}/{segment}/{reference}/example_sequences.fasta", @@ -331,10 +324,10 @@ rule make_dataset: pathogen_json="datasets/{lineage}/{segment}/{reference}/pathogen.json", shell: """ - cp {input.sequences} {output.sequences} \ - cp {input.auspice_json} {output.tree} \ - cp {input.reference} {output.reference} \ - cp {input.annotation} 
{output.annotation} \ + cp {input.sequences} {output.sequences} + cp {input.auspice_json} {output.tree} + cp {input.reference} {output.reference} + cp {input.annotation} {output.annotation} jq -s '.[0] * .[1]' {input.pathogen_json} {input.additional_config} > {output.pathogen_json} """ diff --git a/nextclade/config/config_dict.yaml b/nextclade/config/config_dict.yaml index 239f1f9e..f94e283a 100644 --- a/nextclade/config/config_dict.yaml +++ b/nextclade/config/config_dict.yaml @@ -4,9 +4,11 @@ builds: h1n1pdm: ha: clade_systems: - "clade": "seasonal_A-H1N1pdm_HA/main/.auto-generated/clades-long.tsv" - "subclade": "seasonal_A-H1N1pdm_HA/main/.auto-generated/subclades.tsv" - "short-clade": "seasonal_A-H1N1pdm_HA/main/.auto-generated/clades.tsv" + clade: + url: "seasonal_A-H1N1pdm_HA/main/.auto-generated/clades.tsv" + subclade: + url: "seasonal_A-H1N1pdm_HA/main/.auto-generated/subclades.tsv" + key: "subclade" refs: CY121680: filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500" @@ -21,7 +23,8 @@ builds: reference_strain: A/Wisconsin/588/2019 na: clade_systems: - "clade": "seasonal_A-H1N1pdm_NA/main/.auto-generated/subclades.tsv" + clade: + url: "seasonal_A-H1N1pdm_NA/main/.auto-generated/subclades.tsv" refs: MW626056: filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 1500" @@ -31,9 +34,14 @@ builds: h3n2: ha: clade_systems: - "clade": "seasonal_A-H3N2_HA/main/.auto-generated/clades-long.tsv" - "subclade": "seasonal_A-H3N2_HA/main/.auto-generated/subclades.tsv" - "short-clade": "seasonal_A-H3N2_HA/main/.auto-generated/clades.tsv" + clade: + url: "seasonal_A-H3N2_HA/main/.auto-generated/clades-long.tsv" + subclade: + url: "seasonal_A-H3N2_HA/main/.auto-generated/subclades.tsv" + key: "subclade" + short-clade: + url: "seasonal_A-H3N2_HA/main/.auto-generated/clades.tsv" + key: "short-clade" refs: EPI1857216: filter: "--min-date 2019 
--probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500" @@ -51,7 +59,8 @@ builds: reference_strain: A/Wisconsin/67/2005-egg na: clade_systems: - "clade": "seasonal_A-H3N2_NA/main/.auto-generated/subclades.tsv" + clade: + url: "seasonal_A-H3N2_NA/main/.auto-generated/subclades.tsv" refs: EPI1857215: filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 1500" @@ -61,8 +70,11 @@ builds: vic: ha: clade_systems: - "clade": "seasonal_B-Vic_HA/main/.auto-generated/clades.tsv" - "subclade": "seasonal_B-Vic_HA/main/.auto-generated/subclades.tsv" + clade: + url: "seasonal_B-Vic_HA/main/.auto-generated/clades.tsv" + subclade: + url: "seasonal_B-Vic_HA/main/.auto-generated/subclades.tsv" + key: "subclade" refs: KX058884: filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500" @@ -72,7 +84,8 @@ builds: reference_strain: B/Brisbane/60/2008-egg na: clade_systems: - "clade": "seasonal_B-Vic_NA/main/.auto-generated/subclades.tsv" + clade: + url: "seasonal_B-Vic_NA/main/.auto-generated/subclades.tsv" refs: CY073894: filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 1500" diff --git a/nextclade/config/pathogen.json b/nextclade/config/pathogen.json index d536f97f..ab5c112d 100644 --- a/nextclade/config/pathogen.json +++ b/nextclade/config/pathogen.json @@ -1,4 +1,5 @@ { + "schemaVersion": "3.0.0", "alignmentParams": { "excessBandwidth": 9, "terminalBandwidth": 100, @@ -6,6 +7,13 @@ "gapAlignmentSide": "right", "minSeedCover": 0.1 }, + "files":{ + "examples": "example_sequences.fasta", + "genomeAnnotation": "annotation.gff", + "pathogenJson": "pathogen.json", + "reference": "reference.fasta", + "treeJson": "tree.json" + }, "qc": { "privateMutations": { "enabled": true, @@ -37,5 +45,9 @@ "enabled": true, "ignoredStopCodons": [] } - } + }, + "geneOrderPreference": [ 
+ "HA1", + "HA2" + ] } From f99b3f318ee90115bd6b506a26f4ad96c0a54b7e Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Mon, 25 Sep 2023 16:37:22 +0200 Subject: [PATCH 05/26] add script to merge jsons --- nextclade/Snakefile | 30 +++++++++-- .../h1n1pdm/ha/CY121680/pathogen.json | 1 - nextclade/scripts/merge_jsons.py | 52 +++++++++++++++++++ 3 files changed, 78 insertions(+), 5 deletions(-) create mode 100644 nextclade/scripts/merge_jsons.py diff --git a/nextclade/Snakefile b/nextclade/Snakefile index 1ec4d6c0..513d5c85 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -249,6 +249,29 @@ rule clades: > /dev/null """ +rule virus_specific_jsons: + input: + auspice_config= "config/auspice_config.json", + pathogen = "config/pathogen.json", + additional_pathogen="dataset_config/{lineage}/{segment}/{reference}/pathogen.json", + output: + pathogen = "build/{lineage}/{segment}/{reference}/pathogen.json", + auspice = "build/{lineage}/{segment}/{reference}/auspice_config.json", + params: + clades = lambda w: [config["builds"][w.lineage][w.segment]["clade_systems"][clade].get('key', '') + for clade in config["builds"][w.lineage][w.segment]["clade_systems"]] + shell: + """ + python3 scripts/merge_jsons.py --lineage {wildcards.lineage} --reference {wildcards.reference} \ + --segment {wildcards.segment} --clades {params.clades} \ + --pathogen-jsons {input.pathogen} {input.additional_pathogen} \ + --auspice-config {input.auspice_config} \ + --output-pathogen {output.pathogen} \ + --output-auspice {output.auspice} + """ + + + def get_node_data(w): node_data = [ rules.refine.output.node_data, @@ -269,7 +292,7 @@ rule export: tree=rules.refine.output.tree, metadata=rules.parse.output.metadata, node_data = get_node_data, - auspice_config= "config/auspice_config.json", + auspice_config= "build/{lineage}/{segment}/{reference}/auspice_config.json" output: auspice_json="build/{lineage}/{segment}/{reference}/auspice.json", params: @@ -314,8 +337,7 @@ rule make_dataset: 
auspice_json="build/{lineage}/{segment}/{reference}/auspice.json", annotation="dataset_config/{lineage}/{segment}/{reference}/annotation.gff", reference="dataset_config/{lineage}/{segment}/{reference}/reference.fasta", - pathogen_json="config/pathogen.json", - additional_config="dataset_config/{lineage}/{segment}/{reference}/pathogen.json", + pathogen_json="build/{lineage}/{segment}/{reference}/pathogen.json", output: sequences="datasets/{lineage}/{segment}/{reference}/example_sequences.fasta", tree="datasets/{lineage}/{segment}/{reference}/tree.json", @@ -328,7 +350,7 @@ rule make_dataset: cp {input.auspice_json} {output.tree} cp {input.reference} {output.reference} cp {input.annotation} {output.annotation} - jq -s '.[0] * .[1]' {input.pathogen_json} {input.additional_config} > {output.pathogen_json} + cp {input.pathogen_json} {output.pathogen_json} """ diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/pathogen.json b/nextclade/dataset_config/h1n1pdm/ha/CY121680/pathogen.json index a9af3dfd..0fec5c9f 100644 --- a/nextclade/dataset_config/h1n1pdm/ha/CY121680/pathogen.json +++ b/nextclade/dataset_config/h1n1pdm/ha/CY121680/pathogen.json @@ -1,5 +1,4 @@ { - "schemaVersion": "1.10.0", "nucMutLabelMap": {}, "nucMutLabelMapReverse": {}, "aaMotifs": [ diff --git a/nextclade/scripts/merge_jsons.py b/nextclade/scripts/merge_jsons.py new file mode 100644 index 00000000..02657e5f --- /dev/null +++ b/nextclade/scripts/merge_jsons.py @@ -0,0 +1,52 @@ +import json, argparse + +def get_clade_configs(name): + return { + "short_clades": { + "name": "short_clade", + "displayName": "Abbreviated clade name", + "description": "For recent subclades with long names, the prefix describing their history is omitted." + }, + "subclade": { + "name": "subclade", + "displayName": "Subclade", + "description": "Experimental fine-grained subclade annotation." 
+ }}.get(name, {'name':name, "displayName":name}) + + +if __name__=="__main__": + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--lineage", required=True, help="attribute info") + parser.add_argument("--segment", required=True, help="attribute info") + parser.add_argument("--reference", required=True, help="attribute info") + parser.add_argument("--auspice-config", required=True, help="Auspice config JSON with coloring entry to have scale added to") + parser.add_argument("--pathogen-jsons", nargs='+', required=True, help="name of the coloring field in the Auspice config JSON") + parser.add_argument("--clades", nargs="+", required=True, help="list of values to assign colors to") + parser.add_argument("--output-auspice", required=True, help="Auspice config JSON with scale added to the requested coloring") + parser.add_argument("--output-pathogen", required=True, help="Auspice config JSON with scale added to the requested coloring") + args = parser.parse_args() + + pathogen_json = {} + for p in args.pathogen_jsons: + with open(p) as fh: + pathogen_json.update(json.load(fh)) + + with open(args.auspice_config) as fh: + auspice_json = json.load(fh) + + pathogen_json['attributes'] = {"name":{"value":args.lineage}, + "segment":{"value":args.segment}, + "reference":{"value":args.reference}} + + + if len(args.clades): + auspice_json['extensions']['nextclade']["clade_node_attrs"] = [ + get_clade_configs(c) for c in args.clades + ] + + with open(args.output_pathogen, 'w') as fh: + json.dump(pathogen_json, fh, indent=2) + + with open(args.output_auspice, 'w') as fh: + json.dump(auspice_json, fh, indent=2) + From 04744fb8d486e720c7b824f4546df74d1fd57683 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Mon, 25 Sep 2023 17:28:17 +0200 Subject: [PATCH 06/26] fixes and make reference sequences uppercase --- nextclade/Snakefile | 4 +- nextclade/config/config_dict.yaml | 4 +- .../h3n2/ha/CY163680/annotation.gff | 6 
+-- .../h3n2/ha/EPI1857216/reference.fasta | 44 ++++++++--------- .../vic/ha/EPI1926632/reference.fasta | 48 +++++++++---------- nextclade/scripts/merge_jsons.py | 2 +- 6 files changed, 55 insertions(+), 53 deletions(-) diff --git a/nextclade/Snakefile b/nextclade/Snakefile index 513d5c85..943aeaec 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -258,7 +258,7 @@ rule virus_specific_jsons: pathogen = "build/{lineage}/{segment}/{reference}/pathogen.json", auspice = "build/{lineage}/{segment}/{reference}/auspice_config.json", params: - clades = lambda w: [config["builds"][w.lineage][w.segment]["clade_systems"][clade].get('key', '') + clades = lambda w: [config["builds"][w.lineage][w.segment]["clade_systems"][clade].get('key', 'default') for clade in config["builds"][w.lineage][w.segment]["clade_systems"]] shell: """ @@ -344,6 +344,7 @@ rule make_dataset: annotation="datasets/{lineage}/{segment}/{reference}/annotation.gff", reference="datasets/{lineage}/{segment}/{reference}/reference.fasta", pathogen_json="datasets/{lineage}/{segment}/{reference}/pathogen.json", + auspice="auspice/{lineage}_{segment}_{reference}.json", shell: """ cp {input.sequences} {output.sequences} @@ -351,6 +352,7 @@ rule make_dataset: cp {input.reference} {output.reference} cp {input.annotation} {output.annotation} cp {input.pathogen_json} {output.pathogen_json} + cp {input.auspice_json} {output.auspice} """ diff --git a/nextclade/config/config_dict.yaml b/nextclade/config/config_dict.yaml index f94e283a..99044324 100644 --- a/nextclade/config/config_dict.yaml +++ b/nextclade/config/config_dict.yaml @@ -47,14 +47,14 @@ builds: filter: "--min-date 2019 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500" clade_offset: -17 clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv" - include_file: references/h3n2/includes.txt + include_file: dataset_config/h3n2/includes.txt reference_EPI_ISL: 
EPI1857216 reference_strain: A/Darwin/6/2021 CY163680: filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500" clade_offset: 0 clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv" - include_file: references/h3n2/includes.txt + include_file: dataset_config/h3n2/includes.txt reference_EPI_ISL: EPI545340 reference_strain: A/Wisconsin/67/2005-egg na: diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/annotation.gff b/nextclade/dataset_config/h3n2/ha/CY163680/annotation.gff index cbb8d4e5..579ff7ff 100644 --- a/nextclade/dataset_config/h3n2/ha/CY163680/annotation.gff +++ b/nextclade/dataset_config/h3n2/ha/CY163680/annotation.gff @@ -1,5 +1,5 @@ ##gff-version 3 ##sequence-region CY163680.1 1 1737 -CY163680.1 feature CDS 18 65 . + . name="SigPep" -CY163680.1 feature CDS 66 1052 . + . name="HA1" -CY163680.1 feature CDS 1053 1715 . + . name="HA2" +CY163680.1 feature gene 18 65 . + . gene_name="SigPep" +CY163680.1 feature gene 66 1052 . + . gene_name="HA1" +CY163680.1 feature gene 1053 1715 . + . 
gene_name="HA2" diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/reference.fasta b/nextclade/dataset_config/h3n2/ha/EPI1857216/reference.fasta index ee3b943f..427694bc 100644 --- a/nextclade/dataset_config/h3n2/ha/EPI1857216/reference.fasta +++ b/nextclade/dataset_config/h3n2/ha/EPI1857216/reference.fasta @@ -1,23 +1,23 @@ >EPI_ISL_1563628 | A/Darwin/6/2021 | A / H3N2 | | 2021-03-16 -atgaagactatcattgctttgagcaacattctatgtcttgttttcgctcaaaaaatacctggaaatgacaatagcacggc -aacgctgtgccttgggcaccatgcagtaccaaacggaacgatagtgaaaacaatcacaaatgaccgaattgaagttacta -atgctactgagttggttcagaattcatcaataggtgaaatatgcggcagtcctcatcagatccttgatggagggaactgc -acactaatagatgctctattgggggaccctcagtgtgacggctttcaaaataaggaatgggacctttttgttgaaagaag -cagagccaacagcaactgttacccttatgatgtgccggattatgcctcccttaggtcactagttgcctcatccggcacac -tggagtttaaaaatgaaagcttcaattggactggagtcaaacaaaacggaacaagttctgcgtgcataaggggatctagt -agtagtttttttagtagattaaattggttgaccagcttaaacaacatatatccagcacagaacgtgactatgccaaacaa -ggaacaatttgacaaattgtacatttggggggttcaccacccggatacggacaagaaccaaatctccctgtttgctcaat -catcaggaagaatcacagtatctaccaaaagaagccaacaagctgtaatcccaaatatcggatctagacccagaataagg -gatatccctagcagaataagcatctattggacaatagtaaaaccgggagacatacttttgattaacagcacagggaatct -aattgctcctaggggttacttcaaaatacgaagtgggaaaagctcaataatgagatcagatgcacccattggcaaatgta -agtctgaatgcatcactccaaatggaagcattcccaatgacaaaccgttccaaaatgtaaacaggatcacatacggggcc -tgtcccagatatgttaagcaaagcaccctgaaattggcaacaggaatgcgaaatgtaccagagaaacaaaccagaggcat -atttggcgcaatagcgggtttcatagaaaatggatgggagggaatggtggatggttggtacggtttcaggcatcaaaatt -ctgagggaagaggacaagcagcagatctcaaaagcactcaagcagcaatcgatcaaatcaatgggaagctgaatcgattg -atcggaaaaaccaacgagaaattccatcagattgaaaaagaattctcagaagtagaaggaagagttcaagaccttgagaa -atatgttgaggacactaaaatagatctctggtcatacaacgcggagcttcttgttgccctggagaaccaacatacgattg -acctaactgactcagaaatgaacaaactgtttgaaaaaacaaagaagcaactgagggaaaatgctgaggatatgggaaat -ggttgtttcaaaatataccacaaatgtgacaatgcctgcataggatcaataagaaatgaaacttatgaccacaatgtgta 
-cagggatgaagcattaaacaaccggttccagatcaagggagttgagctgaagtcagggtacaaagattggatcctatgga -tttcctttgccatgtcatgttttttgctttgtattgctttgttggggttcatcatgtgggcctgccaaaagggcaacatt -agatgcaacatttgcatttgagtgcattaattaaaaac +ATGAAGACTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATGACAATAGCACGGC +AACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACAAATGACCGAATTGAAGTTACTA +ATGCTACTGAGTTGGTTCAGAATTCATCAATAGGTGAAATATGCGGCAGTCCTCATCAGATCCTTGATGGAGGGAACTGC +ACACTAATAGATGCTCTATTGGGGGACCCTCAGTGTGACGGCTTTCAAAATAAGGAATGGGACCTTTTTGTTGAAAGAAG +CAGAGCCAACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACAC +TGGAGTTTAAAAATGAAAGCTTCAATTGGACTGGAGTCAAACAAAACGGAACAAGTTCTGCGTGCATAAGGGGATCTAGT +AGTAGTTTTTTTAGTAGATTAAATTGGTTGACCAGCTTAAACAACATATATCCAGCACAGAACGTGACTATGCCAAACAA +GGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGATACGGACAAGAACCAAATCTCCCTGTTTGCTCAAT +CATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCAAATATCGGATCTAGACCCAGAATAAGG +GATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCT +AATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGTA +AGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCGTTCCAAAATGTAAACAGGATCACATACGGGGCC +TGTCCCAGATATGTTAAGCAAAGCACCCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAAACAAACCAGAGGCAT +ATTTGGCGCAATAGCGGGTTTCATAGAAAATGGATGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATT +CTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTG +ATCGGAAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGAAGAGTTCAAGACCTTGAGAA +ATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACGATTG +ACCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGAAAT +GGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATAAGAAATGAAACTTATGACCACAATGTGTA +CAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGA +TTTCCTTTGCCATGTCATGTTTTTTGCTTTGTATTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATT +AGATGCAACATTTGCATTTGAGTGCATTAATTAAAAAC diff --git 
a/nextclade/dataset_config/vic/ha/EPI1926632/reference.fasta b/nextclade/dataset_config/vic/ha/EPI1926632/reference.fasta index 6bd01349..a77d4fcf 100644 --- a/nextclade/dataset_config/vic/ha/EPI1926632/reference.fasta +++ b/nextclade/dataset_config/vic/ha/EPI1926632/reference.fasta @@ -1,25 +1,25 @@ >B/Austria/1359417/2021 | EPI_ISL_6307006 | B / H0N0 | Victoria | 2021-01-09 -attttctaatatccacaaaatgaaggcaataattgtactactcatggtagtaacatccaatgcagatcgaatctgcactg -ggataacatcgtcaaactcaccacatgtcgtcaaaactgctactcaaggggaggtcaatgtgactggtgtaataccactg -acaacaacacccaccaaatctcattttgcaaatctcaaaggaacagaaaccagggggaaactatgcccaaaatgcctaaa -ctgcacagatctggatgtagccttgggcagaccaaaatgcacagggaaaataccctctgcaagggtttcaatactccatg -aagtcagacctgttacatctgggtgctttcctataatgcatgatagaacaaaaattagacagctgcctaaccttctccga -ggatacgaacatgtcaggttatcaactcacaacgttatcaatacagaagatgcaccaggaggaccctacgaaattggaac -ctcagggtcttgcctcaacattaccaatggaaaaggattcttcgcaacaatggcttgggccgtcccaaaaaacaaaacag -caacaaatccattaacaatagaagtaccatacatttgtacagaagaagaagaccaaattaccgtttgggggttccactct -gacgacgagacccaaatggcaaggctctatggggattcaaagccccagaagttcacctcatctgccaacggagtgaccac -acactacgtctcacagattggtggctttccaaatcaaacagaagacggaggactaccacaaagtggcagaattgttgttg -attacatggtgcaaaaatctggaaaaacaggaacaattacctatcaaagaggtattttattgcctcaaaaggtgtggtgc -gcaagtggcaagagcaaggtaataaaaggatccttgcccttaattggagaagcagattgcctccatgaaaaatacggtgg -attaaacaaaagcaagccttactacacaggggaacatgcaaaggccataggaaattgcccaatatgggtgaaaacaccct -tgaagctggccaatggaaccaaatatagacctcctgcaaaactattaaaggaaagaggtttcttcggagccattgctggt -ttcttagagggaggatgggaaggaatgattgcaggttggcacggatacacatcccatggggcacatggagtagcggtggc -agctgaccttaagagcactcaggaggccataaacaagataacaaaaaatctcaactctttgagtgagctggaagtaaaga -atcttcaaagactaagcggtgccatggatgaactccacaacgaaatactagaactagatgagaaagtggatgatctcaga -gctgatacaataagctcacagatagaactcgcagtcctgctttccaatgaaggaataataaacagtgaagatgaacatct -cttggcgcttgaaagaaagctgaagaaaatgctgggcccctctgctgtagagataggaaatggatgctttgaaaccaaac 
-acaagtgcaaccagacctgtctcgacagaatagctgctggtacctttgatgcaggagaattttctctccccacctttgat -tcactgaatattactgctgcatctttaaatgacgatggattggacaatcatactatactgctttactactcaactgctgc -ctccagtttggctgtaacactgatgatagctatctttgttgtttatatggtctccagagacaatgtttcttgctccattt -gtctataagggaagttaagccctgtattttcctttattgtagtgcttgtttgcttgttgtcattacaaagaaacgttatt -gaaaaat +ATTTTCTAATATCCACAAAATGAAGGCAATAATTGTACTACTCATGGTAGTAACATCCAATGCAGATCGAATCTGCACTG +GGATAACATCGTCAAACTCACCACATGTCGTCAAAACTGCTACTCAAGGGGAGGTCAATGTGACTGGTGTAATACCACTG +ACAACAACACCCACCAAATCTCATTTTGCAAATCTCAAAGGAACAGAAACCAGGGGGAAACTATGCCCAAAATGCCTAAA +CTGCACAGATCTGGATGTAGCCTTGGGCAGACCAAAATGCACAGGGAAAATACCCTCTGCAAGGGTTTCAATACTCCATG +AAGTCAGACCTGTTACATCTGGGTGCTTTCCTATAATGCATGATAGAACAAAAATTAGACAGCTGCCTAACCTTCTCCGA +GGATACGAACATGTCAGGTTATCAACTCACAACGTTATCAATACAGAAGATGCACCAGGAGGACCCTACGAAATTGGAAC +CTCAGGGTCTTGCCTCAACATTACCAATGGAAAAGGATTCTTCGCAACAATGGCTTGGGCCGTCCCAAAAAACAAAACAG +CAACAAATCCATTAACAATAGAAGTACCATACATTTGTACAGAAGAAGAAGACCAAATTACCGTTTGGGGGTTCCACTCT +GACGACGAGACCCAAATGGCAAGGCTCTATGGGGATTCAAAGCCCCAGAAGTTCACCTCATCTGCCAACGGAGTGACCAC +ACACTACGTCTCACAGATTGGTGGCTTTCCAAATCAAACAGAAGACGGAGGACTACCACAAAGTGGCAGAATTGTTGTTG +ATTACATGGTGCAAAAATCTGGAAAAACAGGAACAATTACCTATCAAAGAGGTATTTTATTGCCTCAAAAGGTGTGGTGC +GCAAGTGGCAAGAGCAAGGTAATAAAAGGATCCTTGCCCTTAATTGGAGAAGCAGATTGCCTCCATGAAAAATACGGTGG +ATTAAACAAAAGCAAGCCTTACTACACAGGGGAACATGCAAAGGCCATAGGAAATTGCCCAATATGGGTGAAAACACCCT +TGAAGCTGGCCAATGGAACCAAATATAGACCTCCTGCAAAACTATTAAAGGAAAGAGGTTTCTTCGGAGCCATTGCTGGT +TTCTTAGAGGGAGGATGGGAAGGAATGATTGCAGGTTGGCACGGATACACATCCCATGGGGCACATGGAGTAGCGGTGGC +AGCTGACCTTAAGAGCACTCAGGAGGCCATAAACAAGATAACAAAAAATCTCAACTCTTTGAGTGAGCTGGAAGTAAAGA +ATCTTCAAAGACTAAGCGGTGCCATGGATGAACTCCACAACGAAATACTAGAACTAGATGAGAAAGTGGATGATCTCAGA +GCTGATACAATAAGCTCACAGATAGAACTCGCAGTCCTGCTTTCCAATGAAGGAATAATAAACAGTGAAGATGAACATCT +CTTGGCGCTTGAAAGAAAGCTGAAGAAAATGCTGGGCCCCTCTGCTGTAGAGATAGGAAATGGATGCTTTGAAACCAAAC +ACAAGTGCAACCAGACCTGTCTCGACAGAATAGCTGCTGGTACCTTTGATGCAGGAGAATTTTCTCTCCCCACCTTTGAT 
+TCACTGAATATTACTGCTGCATCTTTAAATGACGATGGATTGGACAATCATACTATACTGCTTTACTACTCAACTGCTGC +CTCCAGTTTGGCTGTAACACTGATGATAGCTATCTTTGTTGTTTATATGGTCTCCAGAGACAATGTTTCTTGCTCCATTT +GTCTATAAGGGAAGTTAAGCCCTGTATTTTCCTTTATTGTAGTGCTTGTTTGCTTGTTGTCATTACAAAGAAACGTTATT +GAAAAAT diff --git a/nextclade/scripts/merge_jsons.py b/nextclade/scripts/merge_jsons.py index 02657e5f..10eb5df4 100644 --- a/nextclade/scripts/merge_jsons.py +++ b/nextclade/scripts/merge_jsons.py @@ -41,7 +41,7 @@ def get_clade_configs(name): if len(args.clades): auspice_json['extensions']['nextclade']["clade_node_attrs"] = [ - get_clade_configs(c) for c in args.clades + get_clade_configs(c) for c in args.clades if c!='default' ] with open(args.output_pathogen, 'w') as fh: From da3704a545e5f5f65c07cbb4bee316378ea391ed Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Tue, 26 Sep 2023 23:18:44 +0200 Subject: [PATCH 07/26] add outliers --- config/h1n1pdm/outliers.txt | 4 ++++ config/h1n1pdm/reference_strains.txt | 1 - config/h3n2/outliers.txt | 2 ++ nextclade/Snakefile | 2 ++ nextclade/config/config_dict.yaml | 12 ++++++------ 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/config/h1n1pdm/outliers.txt b/config/h1n1pdm/outliers.txt index de309446..310ebd47 100644 --- a/config/h1n1pdm/outliers.txt +++ b/config/h1n1pdm/outliers.txt @@ -10,6 +10,7 @@ A/Asturias/RR6898/2010 A/Austria/1048413/2018 A/Austria/183/2009-egg A/Bangladesh/2021/2012 +A/Bangkok/P1600/2023 A/Bari/166/2016 A/Bari/167/2016 A/Beijing/1/2009-egg @@ -51,6 +52,7 @@ A/India/4562/2021 A/India/8484/2021 A/India/9324/2021 A/India/9825/2021 +A/India/CT-AIIMSR-266/2022 A/India/Pun1418633/2014 A/India/TS-NIV2019nCoV177/2020 A/Iowa/1/2006 @@ -110,6 +112,7 @@ A/Rennes/F_006_111_BL/2020 A/Rennes/F_006_119_JD/2020 A/RheinlandPfalz/1/2020 A/RioGrandedoNorte/117490/2012 +A/RioGrandeDoNorte/2023-012263-IEC/2023 A/SaintLucia/7340/2020 A/Shandong/1/2009 A/Shandong/1/2009-egg @@ -123,6 +126,7 @@ A/SouthAfrica/16112/2021 A/SouthAfrica/16325/2021 
A/SouthAfrica/1857/2013 A/SouthAfrica/NHLS-UCT-GS-0017/2021 +A/SouthAfrica/NHLS-UCT-GS-0034/2021 A/SouthAfrica/PET20279/2021 A/SouthDakota/3/2008 A/SriLanka/11/2012 diff --git a/config/h1n1pdm/reference_strains.txt b/config/h1n1pdm/reference_strains.txt index dbbf65a7..6db5111c 100644 --- a/config/h1n1pdm/reference_strains.txt +++ b/config/h1n1pdm/reference_strains.txt @@ -5,7 +5,6 @@ A/Arizona/33/2017 A/Arkansas/4/2020 A/Arkansas/8/2020 A/Arkansas/8/2020-egg -A/Austria/1048413/2018 A/Bangladesh/2021/2012 A/Bangladesh/3002/2015 A/Bolivia/559/2013 diff --git a/config/h3n2/outliers.txt b/config/h3n2/outliers.txt index 0ab95da5..b9c1473e 100644 --- a/config/h3n2/outliers.txt +++ b/config/h3n2/outliers.txt @@ -10,6 +10,7 @@ A/Austria/839906/2015 A/Bangladesh/3010/2020 A/BosniaandHerzegovia/288G/2019 A/Brazil/BA-LACEN-BA053-292045410/2022 +A/Brazil/BA-LACEN-BA071-292012660/2021 A/Busan/15453/2009 A/California/NHRC0001/2011 A/Cambodia/NHRCC00001/2009 @@ -463,6 +464,7 @@ A/Sedbury/2991/2023 A/Shanghai-Minxing/1482/2017 A/Shanghai/6/2014 A/Singapore/GP11121/2022 +A/SouthAfrica/K056301/2023 A/SouthAustralia/1/2021 A/SouthAustralia/22/2018 A/SouthAustralia/47/2016 diff --git a/nextclade/Snakefile b/nextclade/Snakefile index 943aeaec..0d6d84f8 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -105,6 +105,7 @@ rule subsample: augur filter \ --sequences {input.aligned_sequences} \ --metadata {input.enriched_metadata} \ + --exclude {input.exclude} \ --include {input.include_strains} {params.other_include} \ --include-where EPI_ISL={params.reference_EPI_ISL} \ {params.filter_arguments} \ @@ -148,6 +149,7 @@ rule tree: --alignment {input.alignment} \ --output {output.tree} \ --nthreads {threads} \ + --tree-builder-args '-ninit 10 -n 4 -czb' \ > /dev/null """ diff --git a/nextclade/config/config_dict.yaml b/nextclade/config/config_dict.yaml index 99044324..75531d50 100644 --- a/nextclade/config/config_dict.yaml +++ b/nextclade/config/config_dict.yaml @@ -11,12 +11,12 @@ 
builds: key: "subclade" refs: CY121680: - filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500" + filter: "--min-date 2014 --probabilistic-sampling --exclude-where country='south_korea' --group-by year --min-length 1500 --subsample-max-sequences 1500" clade_offset: 0 reference_EPI_ISL: EPI1583287 reference_strain: A/California/7/2009-egg #TODO: exclude MW626062: - filter: "--min-date 2019 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500" + filter: "--min-date 2019 --probabilistic-sampling --exclude-where country='south_korea' --group-by year --min-length 1500 --subsample-max-sequences 1500" clade_offset: 0 clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h1n1pdm/ha/clades-long.tsv" reference_EPI_ISL: EPI1812046 @@ -44,14 +44,14 @@ builds: key: "short-clade" refs: EPI1857216: - filter: "--min-date 2019 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500" + filter: "--min-date 2019 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000" clade_offset: -17 clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv" include_file: dataset_config/h3n2/includes.txt reference_EPI_ISL: EPI1857216 reference_strain: A/Darwin/6/2021 CY163680: - filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500" + filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000" clade_offset: 0 clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv" include_file: dataset_config/h3n2/includes.txt @@ -77,7 +77,7 @@ builds: key: "subclade" refs: KX058884: - filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500" + 
filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000" clade_offset: 0 clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/vic/ha/clades.tsv" reference_EPI_ISL: EPI696970 @@ -88,7 +88,7 @@ builds: url: "seasonal_B-Vic_NA/main/.auto-generated/subclades.tsv" refs: CY073894: - filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 1500" + filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 2000" clade_offset: -30 reference_EPI_ISL: CY073894 reference_strain: B/Brisbane/60/2008 From dacde90ef030554a9c96dc4ef831418ccd3029d1 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Wed, 27 Sep 2023 18:32:38 +0200 Subject: [PATCH 08/26] add includes --- nextclade/Snakefile | 6 +++--- nextclade/config/config_dict.yaml | 2 -- nextclade/dataset_config/h1n1pdm/includes.txt | 8 ++++++++ nextclade/dataset_config/vic/includes.txt | 5 +++++ 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/nextclade/Snakefile b/nextclade/Snakefile index 0d6d84f8..4ed28051 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -88,6 +88,7 @@ rule subsample: aligned_sequences=rules.parse.output.sequences, enriched_metadata=rules.parse.output.metadata, include_strains="../config/{lineage}/reference_strains.txt", + nextclade_include="dataset_config/{lineage}/include.txt", exclude="../config/{lineage}/outliers.txt", output: sampled_sequences="build/{lineage}/{segment}/{reference}/subsample.fasta", @@ -98,15 +99,14 @@ rule subsample: ]["filter"], reference_EPI_ISL=lambda w: config["builds"][w.lineage][w.segment]["refs"][ w.reference - ]["reference_EPI_ISL"], - other_include = lambda w:config["builds"][w.lineage][w.segment]["refs"][w.reference].get("include_file","") + ]["reference_EPI_ISL"] shell: """ augur filter \ --sequences {input.aligned_sequences} \ --metadata 
{input.enriched_metadata} \ --exclude {input.exclude} \ - --include {input.include_strains} {params.other_include} \ + --include {input.include_strains} {input.nextclade_include} \ --include-where EPI_ISL={params.reference_EPI_ISL} \ {params.filter_arguments} \ --output {output.sampled_sequences} \ diff --git a/nextclade/config/config_dict.yaml b/nextclade/config/config_dict.yaml index 75531d50..32fb1c71 100644 --- a/nextclade/config/config_dict.yaml +++ b/nextclade/config/config_dict.yaml @@ -47,14 +47,12 @@ builds: filter: "--min-date 2019 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000" clade_offset: -17 clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv" - include_file: dataset_config/h3n2/includes.txt reference_EPI_ISL: EPI1857216 reference_strain: A/Darwin/6/2021 CY163680: filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000" clade_offset: 0 clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv" - include_file: dataset_config/h3n2/includes.txt reference_EPI_ISL: EPI545340 reference_strain: A/Wisconsin/67/2005-egg na: diff --git a/nextclade/dataset_config/h1n1pdm/includes.txt b/nextclade/dataset_config/h1n1pdm/includes.txt index e69de29b..e168d0bf 100644 --- a/nextclade/dataset_config/h1n1pdm/includes.txt +++ b/nextclade/dataset_config/h1n1pdm/includes.txt @@ -0,0 +1,8 @@ +A/Lao/1632/2023 +A/NorthCarolina/6/2023 +A/Victoria/114/2023 +A/Victoria/27/2023 +A/Tuvalu/GX23096/2023 +A/India/Pune-Nivsari_22_884/2022 +A/Nepal/22FL2393/2022 +A/Singapore/GP6589/2022 diff --git a/nextclade/dataset_config/vic/includes.txt b/nextclade/dataset_config/vic/includes.txt index e69de29b..ffdd2b25 100644 --- a/nextclade/dataset_config/vic/includes.txt +++ b/nextclade/dataset_config/vic/includes.txt @@ -0,0 +1,5 @@ +B/Brazil/1417/2023 +B/Massachusetts/1/2022 
+B/Amazonas/2022-014046-IEC/2022 +B/Iquitos/FPI20551/2022 +B/Pennsylvania/3/2022 From f21122a348a0a942cd193f8ab2e202b5d786cdbd Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Wed, 27 Sep 2023 18:48:37 +0200 Subject: [PATCH 09/26] fix include file name and clean rules --- nextclade/Snakefile | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/nextclade/Snakefile b/nextclade/Snakefile index 4ed28051..8d1fd3e5 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -88,7 +88,7 @@ rule subsample: aligned_sequences=rules.parse.output.sequences, enriched_metadata=rules.parse.output.metadata, include_strains="../config/{lineage}/reference_strains.txt", - nextclade_include="dataset_config/{lineage}/include.txt", + nextclade_include="dataset_config/{lineage}/includes.txt", exclude="../config/{lineage}/outliers.txt", output: sampled_sequences="build/{lineage}/{segment}/{reference}/subsample.fasta", @@ -281,7 +281,6 @@ def get_node_data(w): ] for clade in config["builds"][w.lineage][w.segment]["clade_systems"]: - print(clade) node_data.append("build/{lineage}/{segment}/{reference}/".format(**w) + f'clade_{clade}.json') return node_data @@ -362,12 +361,15 @@ rule make_dataset: rule clean: shell: """ - rm -rf output test data/clades* data/include* auspice/* + rm -rf build datasets auspice """ rule clean_all: shell: """ - rm -rf output test auspice build data + rm -rf build + rm -rf auspice + rm -rf datasets + rm -rf data/ """ From 403e27c654c32d02671d3b6e2046c08babf83058 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 10 Nov 2023 15:29:58 +0100 Subject: [PATCH 10/26] sort out short clade names, add outlier pruning --- nextclade/Snakefile | 21 +++++- nextclade/config/config_dict.yaml | 3 + nextclade/config/pathogen.json | 121 ++++++++++++++++++------------ nextclade/scripts/merge_jsons.py | 3 + 4 files changed, 97 insertions(+), 51 deletions(-) diff --git a/nextclade/Snakefile b/nextclade/Snakefile index 8d1fd3e5..426ae70d 100644 --- 
a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -174,10 +174,29 @@ rule root: cp {params.outdir}/rerooted.newick {output.tree} """ +rule prune_outliers: + input: + tree = rules.root.output.tree + output: + tree="build/{lineage}/{segment}/{reference}/tree_rooted_pruned.nwk" + params: + outliers = "build/{lineage}/{segment}/{reference}/tt_out/outliers.tsv" + run: + import pandas as pd + from Bio import Phylo + outliers = pd.read_csv(params.outliers, sep='\t', index_col=0) + T = Phylo.read(input.tree, 'newick') + + for n in outliers.index: + if outliers.loc[n,"given_date"]>2020 and ('-egg' not in n): + print("prune", n) + T.prune(n) + Phylo.write(T, output.tree, "newick") + # refine while keeping the root rule refine: input: - tree=rules.root.output.tree, + tree=rules.prune_outliers.output.tree, alignment=rules.align.output.alignment, enriched_metadata=rules.parse.output.metadata, output: diff --git a/nextclade/config/config_dict.yaml b/nextclade/config/config_dict.yaml index 32fb1c71..ede743b5 100644 --- a/nextclade/config/config_dict.yaml +++ b/nextclade/config/config_dict.yaml @@ -5,7 +5,10 @@ builds: ha: clade_systems: clade: + url: "seasonal_A-H1N1pdm_HA/main/.auto-generated/clades-long.tsv" + short-clade: url: "seasonal_A-H1N1pdm_HA/main/.auto-generated/clades.tsv" + key: "short-clade" subclade: url: "seasonal_A-H1N1pdm_HA/main/.auto-generated/subclades.tsv" key: "subclade" diff --git a/nextclade/config/pathogen.json b/nextclade/config/pathogen.json index ab5c112d..a6471a56 100644 --- a/nextclade/config/pathogen.json +++ b/nextclade/config/pathogen.json @@ -1,53 +1,74 @@ { - "schemaVersion": "3.0.0", - "alignmentParams": { - "excessBandwidth": 9, - "terminalBandwidth": 100, - "allowedMismatches": 4, - "gapAlignmentSide": "right", - "minSeedCover": 0.1 + "schemaVersion": "3.0.0", + "alignmentParams": { + "excessBandwidth": 9, + "terminalBandwidth": 100, + "allowedMismatches": 4, + "gapAlignmentSide": "right", + "minSeedCover": 0.1 + }, + "files": { + 
"examples": "example_sequences.fasta", + "genomeAnnotation": "annotation.gff", + "pathogenJson": "pathogen.json", + "reference": "reference.fasta", + "treeJson": "tree.json" + }, + "qc": { + "privateMutations": { + "enabled": true, + "typical": 5, + "cutoff": 15, + "weightLabeledSubstitutions": 2, + "weightReversionSubstitutions": 1, + "weightUnlabeledSubstitutions": 1 }, - "files":{ - "examples": "example_sequences.fasta", - "genomeAnnotation": "annotation.gff", - "pathogenJson": "pathogen.json", - "reference": "reference.fasta", - "treeJson": "tree.json" - }, - "qc": { - "privateMutations": { - "enabled": true, - "typical": 5, - "cutoff": 15, - "weightLabeledSubstitutions": 2, - "weightReversionSubstitutions": 1, - "weightUnlabeledSubstitutions": 1 - }, - "missingData": { - "enabled": false, - "missingDataThreshold": 100, - "scoreBias": 10 - }, - "snpClusters": { - "enabled": false, - "windowSize": 100, - "clusterCutOff": 5, - "scoreWeight": 50 - }, - "mixedSites": { - "enabled": true, - "mixedSitesThreshold": 4 - }, - "frameShifts": { - "enabled": true - }, - "stopCodons": { - "enabled": true, - "ignoredStopCodons": [] - } + "missingData": { + "enabled": false, + "missingDataThreshold": 100, + "scoreBias": 10 }, - "geneOrderPreference": [ - "HA1", - "HA2" - ] -} + "snpClusters": { + "enabled": false, + "windowSize": 100, + "clusterCutOff": 5, + "scoreWeight": 50 + }, + "mixedSites": { + "enabled": true, + "mixedSitesThreshold": 4 + }, + "frameShifts": { + "enabled": true + }, + "stopCodons": { + "enabled": true, + "ignoredStopCodons": [] + } + }, + "geneOrderPreference": [ + "HA1", + "HA2" + ], + "maintenance": { + "website": [ + "https://nextstrain.org", + "https://clades.nextstrain.org" + ], + "documentation": [ + "https://github.com/nextstrain/seasonal-flu" + ], + "source code": [ + "https://github.com/nextstrain/seasonal_flu" + ], + "issues": [ + "https://github.com/nextstrain/seasonal_flu/issues" + ], + "organizations": [ + "Nextstrain" + ], + "authors": [ 
+ "Nextstrain team " + ] + } +} \ No newline at end of file diff --git a/nextclade/scripts/merge_jsons.py b/nextclade/scripts/merge_jsons.py index 10eb5df4..59fa0394 100644 --- a/nextclade/scripts/merge_jsons.py +++ b/nextclade/scripts/merge_jsons.py @@ -38,6 +38,9 @@ def get_clade_configs(name): "segment":{"value":args.segment}, "reference":{"value":args.reference}} + pathogen_json['geneOrderPreference'] = {"ha": ["HA1", "HA2"], "na":["NA"]}.get(args.segment, []) + if args.segment in ['ha', 'na']: + pathogen_json['defaultGene'] = {"ha": "HA1", "na":"NA"}.get(args.segment) if len(args.clades): auspice_json['extensions']['nextclade']["clade_node_attrs"] = [ From e64b4e88c18a1288b6ba6ba6f603e8380de2a0bf Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 10 Nov 2023 15:30:09 +0100 Subject: [PATCH 11/26] add outliers --- config/h3n2/outliers.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/config/h3n2/outliers.txt b/config/h3n2/outliers.txt index b9c1473e..83553b5e 100644 --- a/config/h3n2/outliers.txt +++ b/config/h3n2/outliers.txt @@ -11,6 +11,8 @@ A/Bangladesh/3010/2020 A/BosniaandHerzegovia/288G/2019 A/Brazil/BA-LACEN-BA053-292045410/2022 A/Brazil/BA-LACEN-BA071-292012660/2021 +A/Brazil/BA-LACEN-BA123-292029844/2021 +A/Brazil/BA-LACEN-BA128-292008241/2021 A/Busan/15453/2009 A/California/NHRC0001/2011 A/Cambodia/NHRCC00001/2009 @@ -28,6 +30,8 @@ A/Catalonia/NSVH100570896/2017 A/Catalonia/NSVH100731127/2017 A/Catalonia/NSVH100748648/2018 A/Catalonia/NSVH100751838/2018 +A/ChiangRai/NIC-P3252/2022 +A/ChiangRai/D1249/2022 A/Chile/8266/2003 A/Corsica/10/2009 A/Corsica/11/2008 @@ -46,6 +50,7 @@ A/Corsica/39/2009 A/Corsica/42/2009 A/Corsica/7/2007 A/Cyprus/F83/2017 +A/Dakar/922/2022 A/Darwin/143/2020 A/Delaware/3/2011 A/Delaware/33/2017 From 26bf61a02fc4fb22a306aebcc58b283e149b0c0b Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 10 Nov 2023 17:26:07 +0100 Subject: [PATCH 12/26] nextclade: add readme's and fix attributes --- nextclade/Snakefile | 14 
+++++++---- nextclade/config/pathogen.json | 11 +++++++-- .../h1n1pdm/ha/CY121680/README.md | 22 ++++++++++++++++++ .../h1n1pdm/ha/MW626062/README.md | 22 ++++++++++++++++++ .../h1n1pdm/ha/MW626062/pathogen.json | 1 - .../h1n1pdm/na/MW626056/README.md | 22 ++++++++++++++++++ .../h1n1pdm/na/MW626056/pathogen.json | 1 - .../dataset_config/h3n2/ha/CY163680/README.md | 22 ++++++++++++++++++ .../h3n2/ha/CY163680/pathogen.json | 1 - .../h3n2/ha/EPI1857216/README.md | 23 +++++++++++++++++++ .../h3n2/ha/EPI1857216/pathogen.json | 1 - .../h3n2/na/EPI1857215/README.md | 23 +++++++++++++++++++ .../h3n2/na/EPI1857215/pathogen.json | 1 - .../vic/ha/EPI1926632/README.md | 22 ++++++++++++++++++ .../vic/ha/EPI1926632/pathogen.json | 1 - .../dataset_config/vic/ha/KX058884/README.md | 22 ++++++++++++++++++ .../vic/ha/KX058884/pathogen.json | 1 - .../dataset_config/vic/na/CY073894/README.md | 22 ++++++++++++++++++ .../vic/na/CY073894/pathogen.json | 1 - nextclade/scripts/merge_jsons.py | 11 ++++++--- 20 files changed, 227 insertions(+), 17 deletions(-) create mode 100644 nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md create mode 100644 nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md create mode 100644 nextclade/dataset_config/h1n1pdm/na/MW626056/README.md create mode 100644 nextclade/dataset_config/h3n2/ha/CY163680/README.md create mode 100644 nextclade/dataset_config/h3n2/ha/EPI1857216/README.md create mode 100644 nextclade/dataset_config/h3n2/na/EPI1857215/README.md create mode 100644 nextclade/dataset_config/vic/ha/EPI1926632/README.md create mode 100644 nextclade/dataset_config/vic/ha/KX058884/README.md create mode 100644 nextclade/dataset_config/vic/na/CY073894/README.md diff --git a/nextclade/Snakefile b/nextclade/Snakefile index 426ae70d..85125b56 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -280,10 +280,13 @@ rule virus_specific_jsons: auspice = "build/{lineage}/{segment}/{reference}/auspice_config.json", params: clades = lambda w: 
[config["builds"][w.lineage][w.segment]["clade_systems"][clade].get('key', 'default') - for clade in config["builds"][w.lineage][w.segment]["clade_systems"]] + for clade in config["builds"][w.lineage][w.segment]["clade_systems"]], + reference_name = lambda w: config["builds"][w.lineage][w.segment]['refs'][w.reference]['reference_strain'] shell: """ - python3 scripts/merge_jsons.py --lineage {wildcards.lineage} --reference {wildcards.reference} \ + python3 scripts/merge_jsons.py --lineage {wildcards.lineage} \ + --reference {wildcards.reference} \ + --reference-name {params.reference_name} \ --segment {wildcards.segment} --clades {params.clades} \ --pathogen-jsons {input.pathogen} {input.additional_pathogen} \ --auspice-config {input.auspice_config} \ @@ -357,12 +360,14 @@ rule make_dataset: auspice_json="build/{lineage}/{segment}/{reference}/auspice.json", annotation="dataset_config/{lineage}/{segment}/{reference}/annotation.gff", reference="dataset_config/{lineage}/{segment}/{reference}/reference.fasta", + readme="dataset_config/{lineage}/{segment}/{reference}/README.md", pathogen_json="build/{lineage}/{segment}/{reference}/pathogen.json", output: - sequences="datasets/{lineage}/{segment}/{reference}/example_sequences.fasta", + sequences="datasets/{lineage}/{segment}/{reference}/sequences.fasta", tree="datasets/{lineage}/{segment}/{reference}/tree.json", - annotation="datasets/{lineage}/{segment}/{reference}/annotation.gff", + annotation="datasets/{lineage}/{segment}/{reference}/genome_annotation.gff3", reference="datasets/{lineage}/{segment}/{reference}/reference.fasta", + readme="datasets/{lineage}/{segment}/{reference}/README.md", pathogen_json="datasets/{lineage}/{segment}/{reference}/pathogen.json", auspice="auspice/{lineage}_{segment}_{reference}.json", shell: @@ -370,6 +375,7 @@ rule make_dataset: cp {input.sequences} {output.sequences} cp {input.auspice_json} {output.tree} cp {input.reference} {output.reference} + cp {input.readme} {output.readme} cp 
{input.annotation} {output.annotation} cp {input.pathogen_json} {output.pathogen_json} cp {input.auspice_json} {output.auspice} diff --git a/nextclade/config/pathogen.json b/nextclade/config/pathogen.json index a6471a56..ee06a003 100644 --- a/nextclade/config/pathogen.json +++ b/nextclade/config/pathogen.json @@ -7,10 +7,17 @@ "gapAlignmentSide": "right", "minSeedCover": 0.1 }, + "compatibility": { + "cli": "3.0.0-alpha.0", + "web": "3.0.0-alpha.0" + }, + "defaultGene": "HA1", "files": { - "examples": "example_sequences.fasta", - "genomeAnnotation": "annotation.gff", + "changelog": "CHANGELOG.md", + "examples": "sequences.fasta", + "genomeAnnotation": "genome_annotation.gff3", "pathogenJson": "pathogen.json", + "readme": "README.md", "reference": "reference.fasta", "treeJson": "tree.json" }, diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md b/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md new file mode 100644 index 00000000..90099616 --- /dev/null +++ b/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md @@ -0,0 +1,22 @@ +# Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/California/07/2009" (flu_h1n1pdm_ha/CY121680) + + +## Dataset attributes + +| attribute | value | value friendly | +| -------------------- | -------------------- | ---------------------------------------- | +| name | flu_h1n1pdm_ha | Influenza A H1N1pdm HA | +| reference | CY121680 | A/California/07/2009 | + + +## Features +This dataset supports + + * Assignment to clades and subclades based on the nomenclature defined in [github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_HA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_HA/) + * Identification of glycosylation motifs + * Sequence QC + * Phylogenetic placement + +## What is a Nextclade dataset + +Read more about Nextclade datasets in the Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html diff --git
a/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md b/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md new file mode 100644 index 00000000..fc893f29 --- /dev/null +++ b/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md @@ -0,0 +1,22 @@ +# Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/Wisconsin/588/2019" (flu_h1n1pdm_ha/MW626062) + + +## Dataset attributes + +| attribute | value | value friendly | +| -------------------- | -------------------- | ---------------------------------------- | +| name | flu_h1n1pdm_ha | Influenza A H1N1pdm HA | +| reference | MW626062 | A/Wisconsin/588/2019 | + + +## Features +This dataset supports + + * Assignment to clades and subclades based on the nomenclature defined in [github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_HA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_HA/) + * Identification of glycosylation motifs + * Sequence QC + * Phylogenetic placement + +## What is a Nextclade dataset + +Read more about Nextclade datasets in the Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html diff --git a/nextclade/dataset_config/h1n1pdm/ha/MW626062/pathogen.json b/nextclade/dataset_config/h1n1pdm/ha/MW626062/pathogen.json index a9af3dfd..0fec5c9f 100644 --- a/nextclade/dataset_config/h1n1pdm/ha/MW626062/pathogen.json +++ b/nextclade/dataset_config/h1n1pdm/ha/MW626062/pathogen.json @@ -1,5 +1,4 @@ { - "schemaVersion": "1.10.0", "nucMutLabelMap": {}, "nucMutLabelMapReverse": {}, "aaMotifs": [ diff --git a/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md b/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md new file mode 100644 index 00000000..dbf50de6 --- /dev/null +++ b/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md @@ -0,0 +1,22 @@ +# Nextclade dataset for "Influenza A H1N1pdm NA" based on reference "A/Wisconsin/588/2019" (flu_h1n1pdm_na/MW626056) + + +## Dataset attributes + +| attribute | value | value
friendly | +| -------------------- | -------------------- | ---------------------------------------- | +| name | flu_h1n1pdm_na | Influenza A H1N1pdm NA | +| reference | MW626056 | A/Wisconsin/588/2019 | + + +## Features +This dataset supports + + * Assignment to clades and subclades based on the nomenclature defined in [github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_NA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_NA/) + * Identification of glycosylation motifs + * Sequence QC + * Phylogenetic placement + +## What is a Nextclade dataset + +Read more about Nextclade datasets in the Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html diff --git a/nextclade/dataset_config/h1n1pdm/na/MW626056/pathogen.json b/nextclade/dataset_config/h1n1pdm/na/MW626056/pathogen.json index 27ec895a..56945894 100644 --- a/nextclade/dataset_config/h1n1pdm/na/MW626056/pathogen.json +++ b/nextclade/dataset_config/h1n1pdm/na/MW626056/pathogen.json @@ -1,5 +1,4 @@ { - "schemaVersion": "1.10.0", "nucMutLabelMap": {}, "nucMutLabelMapReverse": {}, "aaMotifs": [ diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/README.md b/nextclade/dataset_config/h3n2/ha/CY163680/README.md new file mode 100644 index 00000000..4efdec2a --- /dev/null +++ b/nextclade/dataset_config/h3n2/ha/CY163680/README.md @@ -0,0 +1,22 @@ +# Nextclade dataset for "Influenza A H3N2 HA" based on reference "A/Wisconsin/67/2005" (flu_h3n2_ha/CY163680) + + +## Dataset attributes + +| attribute | value | value friendly | +| -------------------- | -------------------- | ---------------------------------------- | +| name | flu_h3n2_ha | Influenza A H3N2 HA | +| reference | CY163680 | A/Wisconsin/67/2005 | + + +## Features +This dataset supports + + * Assignment to clades and subclades based on the nomenclature defined in [github.com/influenza-clade-nomenclature/seasonal_A-H3N2_HA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H3N2_HA/) + *
Identification of glycosylation motifs + * Sequence QC + * Phylogenetic placement + +## What is a Nextclade dataset + +Read more about Nextclade datasets in the Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/pathogen.json b/nextclade/dataset_config/h3n2/ha/CY163680/pathogen.json index a9af3dfd..0fec5c9f 100644 --- a/nextclade/dataset_config/h3n2/ha/CY163680/pathogen.json +++ b/nextclade/dataset_config/h3n2/ha/CY163680/pathogen.json @@ -1,5 +1,4 @@ { - "schemaVersion": "1.10.0", "nucMutLabelMap": {}, "nucMutLabelMapReverse": {}, "aaMotifs": [ diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md b/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md new file mode 100644 index 00000000..07116d3c --- /dev/null +++ b/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md @@ -0,0 +1,23 @@ +# Nextclade dataset for "Influenza A H3N2 HA" based on reference "A/Darwin/6/2021" (flu_h3n2_ha/EPI1857216) + + +## Dataset attributes + +| attribute | value | value friendly | +| -------------------- | -------------------- | ---------------------------------------- | +| name | flu_h3n2_ha | Influenza A H3N2 HA | +| reference | EPI1857216 | A/Darwin/6/2021 | + + +## Features +This dataset supports + + * Assignment to clades and subclades based on the nomenclature defined in [github.com/influenza-clade-nomenclature/seasonal_A-H3N2_HA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H3N2_HA/) + * Identification of glycosylation motifs + * Counting of mutations in the RBD + * Sequence QC + * Phylogenetic placement + +## What is a Nextclade dataset + +Read more about Nextclade datasets in the Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/pathogen.json b/nextclade/dataset_config/h3n2/ha/EPI1857216/pathogen.json index 6b5cd7dd..c50e5311 100644 ---
a/nextclade/dataset_config/h3n2/ha/EPI1857216/pathogen.json +++ b/nextclade/dataset_config/h3n2/ha/EPI1857216/pathogen.json @@ -1,5 +1,4 @@ { - "schemaVersion": "1.10.0", "nucMutLabelMap": {}, "nucMutLabelMapReverse": {}, "phenotypeData":[ diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/README.md b/nextclade/dataset_config/h3n2/na/EPI1857215/README.md new file mode 100644 index 00000000..99a76b4e --- /dev/null +++ b/nextclade/dataset_config/h3n2/na/EPI1857215/README.md @@ -0,0 +1,23 @@ +# Nextclade dataset for "Influenza A H3N2 NA" based on reference "A/Darwin/6/2021" (flu_h3n2_na/EPI1857215) + + +## Dataset attributes + +| attribute | value | value friendly | +| -------------------- | -------------------- | ---------------------------------------- | +| name | flu_h3n2_na | Influenza A H3N2 NA | +| reference | EPI1857215 | A/Darwin/6/2021 | + + +## Features +This dataset supports + + * Assignment to clades and subclades based on the nomenclature defined in [github.com/influenza-clade-nomenclature/seasonal_A-H3N2_NA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H3N2_NA/) + * Identification of glycosylation motifs + * Counting of mutations in the RBD + * Sequence QC + * Phylogenetic placement + +## What is a Nextclade dataset + +Read more about Nextclade datasets in the Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/pathogen.json b/nextclade/dataset_config/h3n2/na/EPI1857215/pathogen.json index a56465d3..0b454508 100644 --- a/nextclade/dataset_config/h3n2/na/EPI1857215/pathogen.json +++ b/nextclade/dataset_config/h3n2/na/EPI1857215/pathogen.json @@ -1,5 +1,4 @@ { - "schemaVersion": "1.10.0", "nucMutLabelMap": {}, "nucMutLabelMapReverse": {}, "aaMotifs": [ diff --git a/nextclade/dataset_config/vic/ha/EPI1926632/README.md b/nextclade/dataset_config/vic/ha/EPI1926632/README.md new file mode 100644 index 00000000..1d9e0509 --- /dev/null +++
b/nextclade/dataset_config/vic/ha/EPI1926632/README.md @@ -0,0 +1,22 @@ +# Nextclade dataset for "Influenza B Vic HA" based on reference "B/Austria/1359417/2021" (flu_vic_ha/EPI1926632) + + +## Dataset attributes + +| attribute | value | value friendly | +| -------------------- | -------------------- | ---------------------------------------- | +| name | flu_vic_ha | Influenza B Vic HA | +| reference | EPI1926632 | B/Austria/1359417/2021 | + + +## Features +This dataset supports + + * Assignment to clades and subclades based on the nomenclature defined in [github.com/influenza-clade-nomenclature/seasonal_B-Vic_HA/](https://github.com/influenza-clade-nomenclature/seasonal_B-Vic_HA/) + * Identification of glycosylation motifs + * Sequence QC + * Phylogenetic placement + +## What is a Nextclade dataset + +Read more about Nextclade datasets in the Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html diff --git a/nextclade/dataset_config/vic/ha/EPI1926632/pathogen.json b/nextclade/dataset_config/vic/ha/EPI1926632/pathogen.json index d80db6a5..cd3daefa 100644 --- a/nextclade/dataset_config/vic/ha/EPI1926632/pathogen.json +++ b/nextclade/dataset_config/vic/ha/EPI1926632/pathogen.json @@ -1,5 +1,4 @@ { - "schemaVersion": "1.10.0", "nucMutLabelMap": {}, "nucMutLabelMapReverse": {}, "aaMotifs": [ diff --git a/nextclade/dataset_config/vic/ha/KX058884/README.md b/nextclade/dataset_config/vic/ha/KX058884/README.md new file mode 100644 index 00000000..d28dc35c --- /dev/null +++ b/nextclade/dataset_config/vic/ha/KX058884/README.md @@ -0,0 +1,22 @@ +# Nextclade dataset for "Influenza B Vic HA" based on reference "B/Brisbane/60/2008" (flu_vic_ha/KX058884) + + +## Dataset attributes + +| attribute | value | value friendly | +| -------------------- | -------------------- | ---------------------------------------- | +| name | flu_vic_ha | Influenza B Vic HA | +| reference | KX058884 | B/Brisbane/60/2008 | + + +## Features +This dataset
supports + + * Assignment to clades and subclades based on the nomenclature defined in [github.com/influenza-clade-nomenclature/seasonal_B-Vic_HA/](https://github.com/influenza-clade-nomenclature/seasonal_B-Vic_HA/) + * Identification of glycosylation motifs + * Sequence QC + * Phylogenetic placement + +## What is Nextclade dataset + +Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html diff --git a/nextclade/dataset_config/vic/ha/KX058884/pathogen.json b/nextclade/dataset_config/vic/ha/KX058884/pathogen.json index a9af3dfd..0fec5c9f 100644 --- a/nextclade/dataset_config/vic/ha/KX058884/pathogen.json +++ b/nextclade/dataset_config/vic/ha/KX058884/pathogen.json @@ -1,5 +1,4 @@ { - "schemaVersion": "1.10.0", "nucMutLabelMap": {}, "nucMutLabelMapReverse": {}, "aaMotifs": [ diff --git a/nextclade/dataset_config/vic/na/CY073894/README.md b/nextclade/dataset_config/vic/na/CY073894/README.md new file mode 100644 index 00000000..c7cbf354 --- /dev/null +++ b/nextclade/dataset_config/vic/na/CY073894/README.md @@ -0,0 +1,22 @@ +# Nextclade dataset for "Influenza B Vic NA" based on reference "B/Brisbane/60/2008" (flu_vic_na/CY073894) + + +## Dataset attributes + +| attribute | value | value friendly | +| -------------------- | -------------------- | ---------------------------------------- | +| name | flu_vic_na | Influenza B Vic NA | +| reference | CY073894 | B/Brisbane/60/2008 | + + +## Features +This dataset supports + + * Assignment to clades and subclades based on the nomenclature defined in [github.com/influenza-clade-nomenclature/seasonal_B-Vic_NA/](https://github.com/influenza-clade-nomenclature/seasonal_B-Vic_NA/) + * Identification of glycosylation motifs + * Sequence QC + * Phylogenetic placement + +## What is Nextclade dataset + +Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html diff --git 
a/nextclade/dataset_config/vic/na/CY073894/pathogen.json b/nextclade/dataset_config/vic/na/CY073894/pathogen.json index 3bfeb859..efc674fd 100644 --- a/nextclade/dataset_config/vic/na/CY073894/pathogen.json +++ b/nextclade/dataset_config/vic/na/CY073894/pathogen.json @@ -1,5 +1,4 @@ { - "schemaVersion": "1.10.0", "nucMutLabelMap": {}, "nucMutLabelMapReverse": {}, "aaMotifs": [ diff --git a/nextclade/scripts/merge_jsons.py b/nextclade/scripts/merge_jsons.py index 59fa0394..1ffc48d2 100644 --- a/nextclade/scripts/merge_jsons.py +++ b/nextclade/scripts/merge_jsons.py @@ -19,6 +19,7 @@ def get_clade_configs(name): parser.add_argument("--lineage", required=True, help="attribute info") parser.add_argument("--segment", required=True, help="attribute info") parser.add_argument("--reference", required=True, help="attribute info") + parser.add_argument("--reference-name", required=True, help="attribute info") parser.add_argument("--auspice-config", required=True, help="Auspice config JSON with coloring entry to have scale added to") parser.add_argument("--pathogen-jsons", nargs='+', required=True, help="name of the coloring field in the Auspice config JSON") parser.add_argument("--clades", nargs="+", required=True, help="list of values to assign colors to") @@ -34,9 +35,13 @@ def get_clade_configs(name): with open(args.auspice_config) as fh: auspice_json = json.load(fh) - pathogen_json['attributes'] = {"name":{"value":args.lineage}, - "segment":{"value":args.segment}, - "reference":{"value":args.reference}} + flu_type = {'h3n2':'A', 'h1n1pdm':'A', 'vic':'B', 'yam':'B'}[args.lineage] + lineage_name = {'h3n2':'H3N2', 'h1n1pdm':'H1N1pdm', 'vic':'Victoria', 'yam':'Yamagata'}[args.lineage] + + pathogen_json['attributes'] = {"name": f"Influenza {flu_type} {lineage_name} {args.segment.upper()}", + "segment": args.segment, + "reference accession": args.reference, + "reference name": args.reference_name} pathogen_json['geneOrderPreference'] = {"ha": ["HA1", "HA2"], 
"na":["NA"]}.get(args.segment, []) if args.segment in ['ha', 'na']: From 34172fde8edea546feffad903e7c0623a711fb21 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 10 Nov 2023 21:29:12 +0100 Subject: [PATCH 13/26] nextclade: add changelog --- nextclade/Snakefile | 15 +++++++++++++++ nextclade/config/config_dict.yaml | 10 ++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/nextclade/Snakefile b/nextclade/Snakefile index 85125b56..2813e8b4 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -43,6 +43,18 @@ rule download_clades: curl {params.source_tsv} > {output.clade_tsv} """ +rule download_changelog: + message: + "Downloading clade definitions for {wildcards.lineage} from {params.source} -> {output}" + output: + changelog = "data/{lineage}_{segment}_changelog.md" + params: + source=lambda w: config['clade_repo'] + config["builds"][w.lineage][w.segment]["changelog"], + shell: + """ + curl {params.source} > {output.changelog} + """ + rule offset_clades: input: rules.download_clades.output, @@ -361,6 +373,7 @@ rule make_dataset: annotation="dataset_config/{lineage}/{segment}/{reference}/annotation.gff", reference="dataset_config/{lineage}/{segment}/{reference}/reference.fasta", readme="dataset_config/{lineage}/{segment}/{reference}/README.md", + changelog="data/{lineage}_{segment}_changelog.md", pathogen_json="build/{lineage}/{segment}/{reference}/pathogen.json", output: sequences="datasets/{lineage}/{segment}/{reference}/sequences.fasta", @@ -370,12 +383,14 @@ rule make_dataset: readme="datasets/{lineage}/{segment}/{reference}/README.md", pathogen_json="datasets/{lineage}/{segment}/{reference}/pathogen.json", auspice="auspice/{lineage}_{segment}_{reference}.json", + changelog="datasets/{lineage}/{segment}/{reference}/CHANGELOG.md", shell: """ cp {input.sequences} {output.sequences} cp {input.auspice_json} {output.tree} cp {input.reference} {output.reference} cp {input.readme} {output.readme} + cp {input.changelog} 
{output.changelog} cp {input.annotation} {output.annotation} cp {input.pathogen_json} {output.pathogen_json} cp {input.auspice_json} {output.auspice} diff --git a/nextclade/config/config_dict.yaml b/nextclade/config/config_dict.yaml index ede743b5..31d99c8b 100644 --- a/nextclade/config/config_dict.yaml +++ b/nextclade/config/config_dict.yaml @@ -3,6 +3,7 @@ clade_repo: "https://raw.githubusercontent.com/influenza-clade-nomenclature/" builds: h1n1pdm: ha: + changelog: "seasonal_A-H1N1pdm_HA/main/CHANGELOG.md" clade_systems: clade: url: "seasonal_A-H1N1pdm_HA/main/.auto-generated/clades-long.tsv" @@ -21,10 +22,10 @@ builds: MW626062: filter: "--min-date 2019 --probabilistic-sampling --exclude-where country='south_korea' --group-by year --min-length 1500 --subsample-max-sequences 1500" clade_offset: 0 - clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h1n1pdm/ha/clades-long.tsv" reference_EPI_ISL: EPI1812046 reference_strain: A/Wisconsin/588/2019 na: + changelog: "seasonal_A-H1N1pdm_NA/main/CHANGELOG.md" clade_systems: clade: url: "seasonal_A-H1N1pdm_NA/main/.auto-generated/subclades.tsv" @@ -36,6 +37,7 @@ builds: reference_strain: A/Wisconsin/588/2019 h3n2: ha: + changelog: "seasonal_A-H3N2_HA/main/CHANGELOG.md" clade_systems: clade: url: "seasonal_A-H3N2_HA/main/.auto-generated/clades-long.tsv" @@ -49,16 +51,15 @@ builds: EPI1857216: filter: "--min-date 2019 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000" clade_offset: -17 - clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv" reference_EPI_ISL: EPI1857216 reference_strain: A/Darwin/6/2021 CY163680: filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000" clade_offset: 0 - clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv" reference_EPI_ISL: EPI545340 reference_strain: 
A/Wisconsin/67/2005-egg na: + changelog: "seasonal_A-H3N2_NA/main/CHANGELOG.md" clade_systems: clade: url: "seasonal_A-H3N2_NA/main/.auto-generated/subclades.tsv" @@ -70,6 +71,7 @@ builds: reference_strain: A/Darwin/6/2021 vic: ha: + changelog: "seasonal_B-Vic_HA/main/CHANGELOG.md" clade_systems: clade: url: "seasonal_B-Vic_HA/main/.auto-generated/clades.tsv" @@ -80,10 +82,10 @@ builds: KX058884: filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000" clade_offset: 0 - clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/vic/ha/clades.tsv" reference_EPI_ISL: EPI696970 reference_strain: B/Brisbane/60/2008-egg na: + changelog: "seasonal_B-Vic_NA/main/CHANGELOG.md" clade_systems: clade: url: "seasonal_B-Vic_NA/main/.auto-generated/subclades.tsv" From 99497fd4d3236c4ce9d2549b1167c2cbbc8baacf Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Sun, 12 Nov 2023 15:45:37 +0100 Subject: [PATCH 14/26] nextclade: add to dataset readme --- .../dataset_config/h1n1pdm/ha/CY121680/README.md | 13 +++++++++++++ .../dataset_config/h1n1pdm/ha/MW626062/README.md | 11 +++++++++++ .../dataset_config/h1n1pdm/na/MW626056/README.md | 9 +++++++++ nextclade/dataset_config/h3n2/ha/CY163680/README.md | 12 ++++++++++++ .../dataset_config/h3n2/ha/EPI1857216/README.md | 11 +++++++++++ .../dataset_config/h3n2/na/EPI1857215/README.md | 8 ++++++++ nextclade/dataset_config/vic/ha/KX058884/README.md | 11 +++++++++++ nextclade/dataset_config/vic/na/CY073894/README.md | 9 +++++++++ 8 files changed, 84 insertions(+) diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md b/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md index 90099616..b8b0874a 100644 --- a/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md +++ b/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md @@ -1,5 +1,7 @@ # Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/California/07/2009" 
(flu_h1n1pdm_ha/CY121680) +This dataset uses an older reference sequence (A/California/07/2009) and recent sequences will differ at a large number of positions from this reference. +For the analysis of currently circulating viruses, the dataset using A/Wisconsin/588/2019 as reference might be more appropriate. ## Dataset attributes @@ -17,6 +19,17 @@ This dataset supports * Sequence QC * Phylogenetic placement +## Clades of seasonal influenza viruses + +The WHO Collaborating centers define "clades" as genetic groups of viruses with signature mutations to facilitate discussion of circulating diversity of the viruses. +Clade demarcation do not always coincide with significantly different antigenic properties of the viruses. +Clade names are structured as _Number-Letter_ binomials separated by periods as in `6B.1A.5a.2a.1`. These sometimes get shortened by omission of leading binomials like `5a.2a.1`. + +In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups. +These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `C.1.2`. +The leading letter is an alias of a previous name. + + ## What is Nextclade dataset Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html diff --git a/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md b/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md index fc893f29..03c8197a 100644 --- a/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md +++ b/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md @@ -1,5 +1,6 @@ # Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/Wisconsin/588/2019" (flu_h1n1pdm_ha/MW626062) +This dataset uses a recent reference sequence (A/Wisconsin/588/2019) and is suitable for the analysis of currently circulating viruses. 
## Dataset attributes @@ -17,6 +18,16 @@ This dataset supports * Sequence QC * Phylogenetic placement +## Clades of seasonal influenza viruses + +The WHO Collaborating centers define "clades" as genetic groups of viruses with signature mutations to facilitate discussion of circulating diversity of the viruses. +Clade demarcation do not always coincide with significantly different antigenic properties of the viruses. +Clade names are structured as _Number-Letter_ binomials separated by periods as in `6B.1A.5a.2a.1`. These sometimes get shortened by omission of leading binomials like `5a.2a.1`. + +In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups. +These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `C.1.2`. +The leading letter is an alias of a previous name. + ## What is Nextclade dataset Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html diff --git a/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md b/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md index dbf50de6..10c38f0d 100644 --- a/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md +++ b/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md @@ -1,5 +1,6 @@ # Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/Wisconsin/588/2019" (flu_h1n1pdm_na/MW626056) +This dataset uses a recent reference sequence (A/Wisconsin/588/2019) and is suitable for the analysis of currently circulating viruses. ## Dataset attributes @@ -8,6 +9,14 @@ | name | flu_h1n1pdm_na | Influenza A H1N1pdm HA | | reference | MW626056 | A/Wisconsin/588/2019 | +## Clades of seasonal influenza viruses + +The WHO Collaborating centers **do not** define "clades" for the neuraminidase segment. 
+ +This dataset focuses on "subclades" that in analogy to the HA segment are defined to break down diversity at high resolution and allow following the spread of different viral groups. +These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `C.1.2`. +The leading letter is an alias of a previous name. + ## Features This dataset supports diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/README.md b/nextclade/dataset_config/h3n2/ha/CY163680/README.md index 4efdec2a..c91fff85 100644 --- a/nextclade/dataset_config/h3n2/ha/CY163680/README.md +++ b/nextclade/dataset_config/h3n2/ha/CY163680/README.md @@ -1,5 +1,7 @@ # Nextclade dataset for "Influenza A H3N2 HA" based on reference "A/Wisconsin/67/2005" (flu_h3n2_ha/CY163680) +This dataset uses an older reference sequence (A/Wisconsin/67/2005) and recent sequences will differ at a large number of positions from this reference. +For the analysis of currently circulating viruses, the dataset using A/Darwin/6/2021 as reference might be more appropriate. ## Dataset attributes @@ -17,6 +19,16 @@ This dataset supports * Sequence QC * Phylogenetic placement +## Clades of seasonal influenza viruses + +The WHO Collaborating centers define "clades" as genetic groups of viruses with signature mutations to facilitate discussion of circulating diversity of the viruses. +Clade demarcation do not always coincide with significantly different antigenic properties of the viruses. +Clade names are structured as _Number-Letter_ binomials separated by periods as in `3C.2a1b.2a.2a.1a`. These sometimes get shortened by omission of leading binomials like `2a.1`. + +In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups. +These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `G.1.3.1`. 
+The leading letter is an alias of a previous name. + ## What is Nextclade dataset Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md b/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md index 07116d3c..a2ced41f 100644 --- a/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md +++ b/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md @@ -1,5 +1,6 @@ # Nextclade dataset for "Influenza A H3N2 HA" based on reference "A/Darwin/6/2021" (flu_h3n2_ha/EPI1857216) +This dataset uses a recent reference sequence (A/Darwin/6/2021) and is suitable for the analysis of circulating viruses. ## Dataset attributes @@ -18,6 +19,16 @@ This dataset supports * Sequence QC * Phylogenetic placement +## Clades of seasonal influenza viruses + +The WHO Collaborating centers define "clades" as genetic groups of viruses with signature mutations to facilitate discussion of circulating diversity of the viruses. +Clade demarcation do not always coincide with significantly different antigenic properties of the viruses. +Clade names are structured as _Number-Letter_ binomials separated by periods as in `3C.2a1b.2a.2a.1a`. These sometimes get shortened by omission of leading binomials like `2a.1`. + +In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups. +These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `G.1.3.1`. +The leading letter is an alias of a previous name. 
+ ## What is Nextclade dataset Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/README.md b/nextclade/dataset_config/h3n2/na/EPI1857215/README.md index 99a76b4e..4855ee8f 100644 --- a/nextclade/dataset_config/h3n2/na/EPI1857215/README.md +++ b/nextclade/dataset_config/h3n2/na/EPI1857215/README.md @@ -8,6 +8,14 @@ | name | flu_h3n2_na | Influenza A H3N2 NA | | reference | EPI1857216 | A/Darwin/6/2021 | +## Clades of seasonal influenza viruses + +The WHO Collaborating centers **do not** define "clades" for the neuraminidase segment. + +This dataset focuses on "subclades" that in analogy to the HA segment are defined to break down diversity at high resolution and allow following the spread of different viral groups. +These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `C.1.2`. +The leading letter is an alias of a previous name. + ## Features This dataset supports diff --git a/nextclade/dataset_config/vic/ha/KX058884/README.md b/nextclade/dataset_config/vic/ha/KX058884/README.md index d28dc35c..fe88397a 100644 --- a/nextclade/dataset_config/vic/ha/KX058884/README.md +++ b/nextclade/dataset_config/vic/ha/KX058884/README.md @@ -1,5 +1,6 @@ # Nextclade dataset for "Influenza B Vic HA" based on reference "B/Brisbane/60/2008" (flu_vic_ha/KX058884) +The reference sequence for this datasets precedes the deletions at positions 162ff in the HA1 protein of the virus and thus follows the canonical numbering of aminoacids in the protein. ## Dataset attributes @@ -17,6 +18,16 @@ This dataset supports * Sequence QC * Phylogenetic placement +## Clades of seasonal influenza viruses + +The WHO Collaborating centers define "clades" as genetic groups of viruses with signature mutations to facilitate discussion of circulating diversity of the viruses. 
+Clade demarcation do not always coincide with significantly different antigenic properties of the viruses. +Clade names are structured as _Number-Letter_ binomials (with exceptions) separated by periods as in `V1A.3a.2`. + +In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups. +These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `A.3.2`. +The leading letter is an alias of a previous name. + ## What is Nextclade dataset Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html diff --git a/nextclade/dataset_config/vic/na/CY073894/README.md b/nextclade/dataset_config/vic/na/CY073894/README.md index c7cbf354..dae95036 100644 --- a/nextclade/dataset_config/vic/na/CY073894/README.md +++ b/nextclade/dataset_config/vic/na/CY073894/README.md @@ -17,6 +17,15 @@ This dataset supports * Sequence QC * Phylogenetic placement +## Clades of seasonal influenza viruses + +The WHO Collaborating centers **do not** define "clades" for the neuraminidase segment. + +This dataset focuses on "subclades" that in analogy to the HA segment are defined to break down diversity at high resolution and allow following the spread of different viral groups. +These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `A.2.3`. +The leading letter is an alias of a previous name. 
+ + ## What is Nextclade dataset Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html From 57cb639884dccaf30432eb32cb7fc80d3a465133 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Sun, 12 Nov 2023 15:52:58 +0100 Subject: [PATCH 15/26] nextclade: add links to README --- .../h1n1pdm/ha/CY121680/README.md | 2 ++ .../h1n1pdm/ha/MW626062/README.md | 1 + .../h1n1pdm/na/MW626056/README.md | 16 +++++++++------- .../dataset_config/h3n2/ha/CY163680/README.md | 3 ++- .../h3n2/na/EPI1857215/README.md | 18 ++++++++++-------- .../dataset_config/vic/ha/KX058884/README.md | 1 + .../dataset_config/vic/na/CY073894/README.md | 1 + 7 files changed, 26 insertions(+), 16 deletions(-) diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md b/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md index b8b0874a..b14250ef 100644 --- a/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md +++ b/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md @@ -28,6 +28,8 @@ Clade names are structured as _Number-Letter_ binomials separated by periods as In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups. These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `C.1.2`. The leading letter is an alias of a previous name. +Details of the nomenclature system can be found at [github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_HA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_HA/). 
+ ## What is Nextclade dataset diff --git a/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md b/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md index 03c8197a..91cc32d3 100644 --- a/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md +++ b/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md @@ -27,6 +27,7 @@ Clade names are structured as _Number-Letter_ binomials separated by periods as In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups. These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `C.1.2`. The leading letter is an alias of a previous name. +Details of the nomenclature system can be found at [github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_HA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_HA/). ## What is Nextclade dataset diff --git a/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md b/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md index 10c38f0d..9ecb25f9 100644 --- a/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md +++ b/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md @@ -9,13 +9,6 @@ This dataset uses a recent reference sequence (A/Wisconsin/588/2019) and is suit | name | flu_h1n1pdm_na | Influenza A H1N1pdm HA | | reference | MW626056 | A/Wisconsin/588/2019 | -## Clades of seasonal influenza viruses - -The WHO Collaborating centers **do not** define "clades" for the neuraminidase segment. - -This dataset focuses on "subclades" that in analogy to the HA segment are defined to break down diversity at high resolution and allow following the spread of different viral groups. -These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `C.1.2`. -The leading letter is an alias of a previous name. 
## Features @@ -26,6 +19,15 @@ This dataset supports * Sequence QC * Phylogenetic placement +## Clades of seasonal influenza viruses + +The WHO Collaborating centers **do not** define "clades" for the neuraminidase segment. + +This dataset focuses on "subclades" that in analogy to the HA segment are defined to break down diversity at high resolution and allow following the spread of different viral groups. +These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `C.1.2`. +The leading letter is an alias of a previous name. +Details of the nomenclature system can be found at [github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_NA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_NA/). + ## What is Nextclade dataset Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/README.md b/nextclade/dataset_config/h3n2/ha/CY163680/README.md index c91fff85..fa391c39 100644 --- a/nextclade/dataset_config/h3n2/ha/CY163680/README.md +++ b/nextclade/dataset_config/h3n2/ha/CY163680/README.md @@ -23,11 +23,12 @@ This dataset supports The WHO Collaborating centers define "clades" as genetic groups of viruses with signature mutations to facilitate discussion of circulating diversity of the viruses. Clade demarcation do not always coincide with significantly different antigenic properties of the viruses. -Clade names are structured as _Number-Letter_ binomials separated by periods as in `3C.2a1b.2a.2a.1a`. These sometimes get shortened by omission of leading binomials like `2a.1`. +Clade names are structured as _Number-Letter_ binomials (with exceptions) separated by periods as in `3C.2a1b.2a.2a.1a`. These sometimes get shortened by omission of leading binomials like `2a.1`. 
In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups. These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `G.1.3.1`. The leading letter is an alias of a previous name. +Details of the nomenclature system can be found at [github.com/influenza-clade-nomenclature/seasonal_A-H3N2_HA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H3N2_HA/). ## What is Nextclade dataset diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/README.md b/nextclade/dataset_config/h3n2/na/EPI1857215/README.md index 4855ee8f..a23b9233 100644 --- a/nextclade/dataset_config/h3n2/na/EPI1857215/README.md +++ b/nextclade/dataset_config/h3n2/na/EPI1857215/README.md @@ -8,14 +8,6 @@ | name | flu_h3n2_na | Influenza A H3N2 NA | | reference | EPI1857216 | A/Darwin/6/2021 | -## Clades of seasonal influenza viruses - -The WHO Collaborating centers **do not** define "clades" for the neuraminidase segment. - -This dataset focuses on "subclades" that in analogy to the HA segment are defined to break down diversity at high resolution and allow following the spread of different viral groups. -These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `C.1.2`. -The leading letter is an alias of a previous name. - ## Features This dataset supports @@ -26,6 +18,16 @@ This dataset supports * Sequence QC * Phylogenetic placement +## Clades of seasonal influenza viruses + +The WHO Collaborating centers **do not** define "clades" for the neuraminidase segment. + +This dataset focuses on "subclades" that in analogy to the HA segment are defined to break down diversity at high resolution and allow following the spread of different viral groups. +These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `C.1.2`. 
+The leading letter is an alias of a previous name. +Details of the nomenclature system can be found at [github.com/influenza-clade-nomenclature/seasonal_A-H3N2_NA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H3N2_NA/). + + ## What is Nextclade dataset Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html diff --git a/nextclade/dataset_config/vic/ha/KX058884/README.md b/nextclade/dataset_config/vic/ha/KX058884/README.md index fe88397a..0f117264 100644 --- a/nextclade/dataset_config/vic/ha/KX058884/README.md +++ b/nextclade/dataset_config/vic/ha/KX058884/README.md @@ -27,6 +27,7 @@ Clade names are structured as _Number-Letter_ binomials (with exceptions) separa In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups. These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `A.3.2`. The leading letter is an alias of a previous name. +Details of the nomenclature system can be found at [github.com/influenza-clade-nomenclature/seasonal_B-Vic_HA/](https://github.com/influenza-clade-nomenclature/seasonal_B-Vic_HA/). ## What is Nextclade dataset diff --git a/nextclade/dataset_config/vic/na/CY073894/README.md b/nextclade/dataset_config/vic/na/CY073894/README.md index dae95036..f2dba64e 100644 --- a/nextclade/dataset_config/vic/na/CY073894/README.md +++ b/nextclade/dataset_config/vic/na/CY073894/README.md @@ -24,6 +24,7 @@ The WHO Collaborating centers **do not** define "clades" for the neuraminidase s This dataset focuses on "subclades" that in analogy to the HA segment are defined to break down diversity at high resolution and allow following the spread of different viral groups. These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `A.2.3`. 
The leading letter is an alias of a previous name. +Details of the nomenclature system can be found at [github.com/influenza-clade-nomenclature/seasonal_B-Vic_NA/](https://github.com/influenza-clade-nomenclature/seasonal_B-Vic_NA/). ## What is Nextclade dataset From 2893860e47a2cd00419439b6dc668b2c204ca6ea Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Sun, 12 Nov 2023 15:54:53 +0100 Subject: [PATCH 16/26] nextclade: cp command --- nextclade/Snakefile | 3 +++ nextclade/dataset_config/h3n2/ha/EPI1857216/README.md | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/nextclade/Snakefile b/nextclade/Snakefile index 2813e8b4..31b898fe 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -413,3 +413,6 @@ rule clean_all: rm -rf datasets rm -rf data/ """ + + +# cp datasets/h3n2/ha/EPI1857216/* ../../../nextstrain/nextclade_data/data/nextstrain/flu/h3n2/ha/EPI1857216 \ No newline at end of file diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md b/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md index a2ced41f..1bb9a2c8 100644 --- a/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md +++ b/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md @@ -23,11 +23,12 @@ This dataset supports The WHO Collaborating centers define "clades" as genetic groups of viruses with signature mutations to facilitate discussion of circulating diversity of the viruses. Clade demarcation do not always coincide with significantly different antigenic properties of the viruses. -Clade names are structured as _Number-Letter_ binomials separated by periods as in `3C.2a1b.2a.2a.1a`. These sometimes get shortened by omission of leading binomials like `2a.1`. +Clade names are structured as _Number-Letter_ binomials (with exceptions) separated by periods as in `3C.2a1b.2a.2a.1a`. These sometimes get shortened by omission of leading binomials like `2a.1`. 
In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups. These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `G.1.3.1`. The leading letter is an alias of a previous name. +Details of the nomenclature system can be found at [github.com/influenza-clade-nomenclature/seasonal_A-H3N2_HA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H3N2_HA/). ## What is Nextclade dataset From 7d4c044a70a2658f87fa19d7c4d72e471044b3cc Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Sun, 19 Nov 2023 12:55:04 +0100 Subject: [PATCH 17/26] add outliers --- config/h1n1pdm/outliers.txt | 1 + config/vic/outliers.txt | 1 + config/yam/outliers.txt | 1 + 3 files changed, 3 insertions(+) diff --git a/config/h1n1pdm/outliers.txt b/config/h1n1pdm/outliers.txt index 310ebd47..52ff0d4c 100644 --- a/config/h1n1pdm/outliers.txt +++ b/config/h1n1pdm/outliers.txt @@ -69,6 +69,7 @@ A/Louisiana/EVTL-100047/2022 A/Malaysia/2142295/2009 A/Malaysia/2142299/2009 A/Malaysia/2143696/2009 +A/Manitoba/2/2021 A/Minnesota/3/2008 A/Minnesota/33/2014 A/Minnesota/46/2015 diff --git a/config/vic/outliers.txt b/config/vic/outliers.txt index 3fda1fbf..1f46722e 100644 --- a/config/vic/outliers.txt +++ b/config/vic/outliers.txt @@ -3,6 +3,7 @@ B/Alagoas/4386/2023 B/Auckland/1/2008 B/Bangkok/SI17/2012 B/Bangkok/SI58/2012 +B/Bari/53/2023 B/Brisbane/14/2016 B/Brisbine/33/2008 B/California/87/2017-egg diff --git a/config/yam/outliers.txt b/config/yam/outliers.txt index b8a3642c..1dad5be1 100644 --- a/config/yam/outliers.txt +++ b/config/yam/outliers.txt @@ -9,6 +9,7 @@ B/Kolkata/N-2047/2009 B/Nairobi/351/2005 B/NewHampshire/1/2016 B/Norway/2155/2017 +B/Palermo/2/2011 B/Riyadh/3/2010 B/Riyadh/4/2010 B/Thailand/CU-B10303/2014 From 5ed45edc4c41dc15295d731e9d6dc026de1ca482 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Sun, 19 Nov 2023 12:56:05 +0100 
Subject: [PATCH 18/26] nextclade: refine worflow, readme's, add Yam dataset --- nextclade/Snakefile | 46 ++++++++++++++++--- nextclade/config/config_dict.yaml | 24 +++++++++- .../h1n1pdm/ha/CY121680/README.md | 4 +- .../h1n1pdm/ha/MW626062/README.md | 4 +- .../h1n1pdm/na/MW626056/README.md | 4 +- .../dataset_config/h3n2/ha/CY163680/README.md | 4 +- .../h3n2/ha/EPI1857216/README.md | 4 +- .../h3n2/na/EPI1857215/README.md | 4 +- .../vic/ha/EPI1926632/README.md | 4 +- .../dataset_config/vic/ha/KX058884/README.md | 4 +- .../dataset_config/vic/na/CY073894/README.md | 4 +- .../dataset_config/yam/ha/JN993010/README.md | 28 +++++++++++ .../yam/ha/JN993010/pathogen.json | 25 ++++++++++ nextclade/dataset_config/yam/includes.txt | 1 + nextclade/scripts/merge_jsons.py | 7 ++- 15 files changed, 141 insertions(+), 26 deletions(-) create mode 100644 nextclade/dataset_config/yam/ha/JN993010/README.md create mode 100644 nextclade/dataset_config/yam/ha/JN993010/pathogen.json create mode 100644 nextclade/dataset_config/yam/includes.txt diff --git a/nextclade/Snakefile b/nextclade/Snakefile index 31b898fe..bd1acee0 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -37,13 +37,13 @@ rule download_clades: output: clade_tsv = "data/{clade}_{lineage}_{segment}_{reference}_raw.tsv" params: - source_tsv=lambda w: config['clade_repo'] + config["builds"][w.lineage][w.segment]["clade_systems"][w.clade]['url'], + source_tsv=lambda w: (config['clade_repo'] if w.lineage!='yam' else '') + config["builds"][w.lineage][w.segment]["clade_systems"][w.clade]['url'], shell: """ curl {params.source_tsv} > {output.clade_tsv} """ -rule download_changelog: +rule download_changelog_clades: message: "Downloading clade definitions for {wildcards.lineage} from {params.source} -> {output}" output: @@ -55,6 +55,18 @@ rule download_changelog: curl {params.source} > {output.changelog} """ +rule download_changelog_dataset: + message: + "Downloading clade definitions for {wildcards.lineage} from 
{params.source} -> {output}" + output: + changelog = "data/{lineage}_{segment}_{reference}_dataset-changelog.md" + params: + source=lambda w: f"{config['dataset_repo']}/{w.lineage}/{w.segment}/{w.reference}/CHANGELOG.md", + shell: + """ + curl {params.source} > {output.changelog} + """ + rule offset_clades: input: rules.download_clades.output, @@ -103,8 +115,8 @@ rule subsample: nextclade_include="dataset_config/{lineage}/includes.txt", exclude="../config/{lineage}/outliers.txt", output: - sampled_sequences="build/{lineage}/{segment}/{reference}/subsample.fasta", - sampled_strains="build/{lineage}/{segment}/{reference}/subsample.txt", + sampled_sequences="build/{lineage}/{segment}/{reference}/subsample_tmp.fasta", + sampled_strains="build/{lineage}/{segment}/{reference}/subsample_tmp.txt", params: filter_arguments=lambda w: config["builds"][w.lineage][w.segment]["refs"][ w.reference @@ -125,6 +137,27 @@ rule subsample: --output-strains {output.sampled_strains} """ +rule subsample_harddate: + input: + sequences=rules.subsample.output.sampled_sequences, + enriched_metadata=rules.parse.output.metadata, + output: + sampled_sequences="build/{lineage}/{segment}/{reference}/subsample.fasta", + sampled_strains="build/{lineage}/{segment}/{reference}/subsample.txt", + params: + hardmin=lambda w: config["builds"][w.lineage][w.segment]["refs"][ + w.reference + ]["hardmin_date"], + shell: + """ + augur filter \ + --sequences {input.sequences} \ + --metadata {input.enriched_metadata} \ + --min-date {params.hardmin} \ + --output {output.sampled_sequences} \ + --output-strains {output.sampled_strains} + """ + rule align: input: sequences="build/{lineage}/{segment}/{reference}/subsample.fasta", @@ -180,7 +213,7 @@ rule root: --tree {input.tree} \ --sequence-length 1500 \ --dates {input.metadata} \ - --clock-filter 4 \ + --clock-filter 3 \ --clock-filter-method local \ --outdir {params.outdir} cp {params.outdir}/rerooted.newick {output.tree} @@ -373,6 +406,7 @@ rule make_dataset: 
annotation="dataset_config/{lineage}/{segment}/{reference}/annotation.gff", reference="dataset_config/{lineage}/{segment}/{reference}/reference.fasta", readme="dataset_config/{lineage}/{segment}/{reference}/README.md", + dataset_changelog="data/{lineage}_{segment}_{reference}_dataset-changelog.md", changelog="data/{lineage}_{segment}_changelog.md", pathogen_json="build/{lineage}/{segment}/{reference}/pathogen.json", output: @@ -390,7 +424,7 @@ rule make_dataset: cp {input.auspice_json} {output.tree} cp {input.reference} {output.reference} cp {input.readme} {output.readme} - cp {input.changelog} {output.changelog} + cp {input.dataset_changelog} {output.changelog} cp {input.annotation} {output.annotation} cp {input.pathogen_json} {output.pathogen_json} cp {input.auspice_json} {output.auspice} diff --git a/nextclade/config/config_dict.yaml b/nextclade/config/config_dict.yaml index 31d99c8b..ff866cc6 100644 --- a/nextclade/config/config_dict.yaml +++ b/nextclade/config/config_dict.yaml @@ -1,4 +1,5 @@ clade_repo: "https://raw.githubusercontent.com/influenza-clade-nomenclature/" +dataset_repo: "https://raw.githubusercontent.com/nextstrain/nextclade_data/master/data/nextstrain/flu" builds: h1n1pdm: @@ -17,11 +18,13 @@ builds: CY121680: filter: "--min-date 2014 --probabilistic-sampling --exclude-where country='south_korea' --group-by year --min-length 1500 --subsample-max-sequences 1500" clade_offset: 0 + hardmin_date: 2009 reference_EPI_ISL: EPI1583287 reference_strain: A/California/7/2009-egg #TODO: exclude MW626062: filter: "--min-date 2019 --probabilistic-sampling --exclude-where country='south_korea' --group-by year --min-length 1500 --subsample-max-sequences 1500" clade_offset: 0 + hardmin_date: 2015 reference_EPI_ISL: EPI1812046 reference_strain: A/Wisconsin/588/2019 na: @@ -33,6 +36,7 @@ builds: MW626056: filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 1500" clade_offset: 0 + hardmin_date: 2015 
reference_EPI_ISL: EPI1812046 reference_strain: A/Wisconsin/588/2019 h3n2: @@ -51,11 +55,13 @@ builds: EPI1857216: filter: "--min-date 2019 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000" clade_offset: -17 + hardmin_date: 2015 reference_EPI_ISL: EPI1857216 reference_strain: A/Darwin/6/2021 CY163680: filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000" clade_offset: 0 + hardmin_date: 2004 reference_EPI_ISL: EPI545340 reference_strain: A/Wisconsin/67/2005-egg na: @@ -67,6 +73,7 @@ builds: EPI1857215: filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 1500" clade_offset: 4 + hardmin_date: 2015 reference_EPI_ISL: EPI1857215 reference_strain: A/Darwin/6/2021 vic: @@ -82,6 +89,7 @@ builds: KX058884: filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000" clade_offset: 0 + hardmin_date: 2007 reference_EPI_ISL: EPI696970 reference_strain: B/Brisbane/60/2008-egg na: @@ -91,7 +99,21 @@ builds: url: "seasonal_B-Vic_NA/main/.auto-generated/subclades.tsv" refs: CY073894: - filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 2000" + filter: "--min-date 2014 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 2000" clade_offset: -30 + hardmin_date: 2007 reference_EPI_ISL: CY073894 reference_strain: B/Brisbane/60/2008 + yam: + ha: + changelog: "seasonal_B-Yam_HA/main/CHANGELOG.md" + clade_systems: + clade: + url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/yam/ha/clades.tsv" + refs: + JN993010: + filter: "--min-date 2005 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1000" + clade_offset: 0 + hardmin_date: 2003 + reference_EPI_ISL: EPI271600 + reference_strain: B/Wisconsin/01/2010 
diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md b/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md index b14250ef..648c0027 100644 --- a/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md +++ b/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md @@ -1,4 +1,4 @@ -# Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/California/07/2009" (flu_h1n1pdm_ha/CY121680) +# Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/California/07/2009" (flu/h1n1pdm/ha/CY121680) This dataset uses an older reference sequence (A/California/07/2009) and recent sequences will differ at a large number of positions from this reference. For the analysis of currently circulating viruses, the dataset using A/Wisconsin/588/2019 as reference might be more appropriate. @@ -7,7 +7,7 @@ For the analysis of currently circulating viruses, the dataset using A/Wisconsin | attribute | value | value friendly | | -------------------- | -------------------- | ---------------------------------------- | -| name | flu_h1n1pdm_ha | Influenza A H1N1pdm HA | +| name | flu/h1n1pdm/ha | Influenza A H1N1pdm HA | | reference | CY121680 | A/California/07/2009 | diff --git a/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md b/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md index 91cc32d3..b3830dec 100644 --- a/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md +++ b/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md @@ -1,4 +1,4 @@ -# Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/Wisconsin/588/2019" (flu_h1n1pdm_ha/MW626062) +# Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/Wisconsin/588/2019" (flu/h1n1pdm/ha/MW626062) This dataset uses a recent reference sequence (A/Wisconsin/588/2019) and is suitable for the analysis of currently circulating viruses. 
@@ -6,7 +6,7 @@ This dataset uses a recent reference sequence (A/Wisconsin/588/2019) and is suit | attribute | value | value friendly | | -------------------- | -------------------- | ---------------------------------------- | -| name | flu_h1n1pdm_ha | Influenza A H1N1pdm HA | +| name | flu/h1n1pdm/ha | Influenza A H1N1pdm HA | | reference | MW626062 | A/Wisconsin/588/2019 | diff --git a/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md b/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md index 9ecb25f9..d618a759 100644 --- a/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md +++ b/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md @@ -1,4 +1,4 @@ -# Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/Wisconsin/588/2019" (flu_h1n1pdm_na/MW626056) +# Nextclade dataset for "Influenza A H1N1pdm NA" based on reference "A/Wisconsin/588/2019" (flu/h1n1pdm/na/MW626056) This dataset uses a recent reference sequence (A/Wisconsin/588/2019) and is suitable for the analysis of currently circulating viruses.
@@ -6,7 +6,7 @@ This dataset uses a recent reference sequence (A/Wisconsin/588/2019) and is suit | attribute | value | value friendly | | -------------------- | -------------------- | ---------------------------------------- | -| name | flu_h1n1pdm_na | Influenza A H1N1pdm HA | +| name | flu/h1n1pdm/na | Influenza A H1N1pdm NA | | reference | MW626056 | A/Wisconsin/588/2019 | diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/README.md b/nextclade/dataset_config/h3n2/ha/CY163680/README.md index fa391c39..decd5059 100644 --- a/nextclade/dataset_config/h3n2/ha/CY163680/README.md +++ b/nextclade/dataset_config/h3n2/ha/CY163680/README.md @@ -1,4 +1,4 @@ -# Nextclade dataset for "Influenza A H3N2 HA" based on reference "A/Wisconsin/67/2005" (flu_h3n2_ha/CY163680) +# Nextclade dataset for "Influenza A H3N2 HA" based on reference "A/Wisconsin/67/2005" (flu/h3n2/ha/CY163680) This dataset uses an older reference sequence (A/Wisconsin/67/2005) and recent sequences will differ at a large number of positions from this reference. For the analysis of currently circulating viruses, the dataset using A/Darwin/6/2021 as reference might be more appropriate.
@@ -7,7 +7,7 @@ For the analysis of currently circulating viruses, the dataset using A/Darwin/6/ | attribute | value | value friendly | | -------------------- | -------------------- | ---------------------------------------- | -| name | flu_h3n2_ha | Influenza A H3N2 HA | +| name | flu/h3n2/ha | Influenza A H3N2 HA | | reference | CY163680 | A/Wisconsin/67/2005 | diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md b/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md index 1bb9a2c8..3972b34c 100644 --- a/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md +++ b/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md @@ -1,4 +1,4 @@ -# Nextclade dataset for "Influenza A H3N2 HA" based on reference "A/Darwin/6/2021" (flu_h3n2_ha/EPI1857216) +# Nextclade dataset for "Influenza A H3N2 HA" based on reference "A/Darwin/6/2021" (flu/h3n2/ha/EPI1857216) This dataset uses a recent reference sequence (A/Darwin/6/2021) and is suitable for the analysis of circulating viruses. @@ -6,7 +6,7 @@ This dataset uses a recent reference sequence (A/Darwin/6/2021) and is suitable | attribute | value | value friendly | | -------------------- | -------------------- | ---------------------------------------- | -| name | flu_h3n2_ha | Influenza A H3N2 HA | +| name | flu/h3n2/ha | Influenza A H3N2 HA | | reference | EPI1857216 | A/Darwin/6/2021 | diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/README.md b/nextclade/dataset_config/h3n2/na/EPI1857215/README.md index a23b9233..f501ed8a 100644 --- a/nextclade/dataset_config/h3n2/na/EPI1857215/README.md +++ b/nextclade/dataset_config/h3n2/na/EPI1857215/README.md @@ -1,11 +1,11 @@ -# Nextclade dataset for "Influenza A H3N2 NA" based on reference "A/Darwin/6/2021" (flu_h3n2_na/EPI1857216) +# Nextclade dataset for "Influenza A H3N2 NA" based on reference "A/Darwin/6/2021" (flu/h3n2/na/EPI1857216) ## Dataset attributes | attribute | value | value friendly | | -------------------- | -------------------- | 
---------------------------------------- | -| name | flu_h3n2_na | Influenza A H3N2 NA | +| name | flu/h3n2/na | Influenza A H3N2 NA | | reference | EPI1857216 | A/Darwin/6/2021 | diff --git a/nextclade/dataset_config/vic/ha/EPI1926632/README.md b/nextclade/dataset_config/vic/ha/EPI1926632/README.md index 1d9e0509..11e6b3e7 100644 --- a/nextclade/dataset_config/vic/ha/EPI1926632/README.md +++ b/nextclade/dataset_config/vic/ha/EPI1926632/README.md @@ -1,11 +1,11 @@ -# Nextclade dataset for "Influenza B Vic HA" based on reference "B/Austria/1359417/2021" (flu_vic_ha/EPI1926632) +# Nextclade dataset for "Influenza B Vic HA" based on reference "B/Austria/1359417/2021" (flu/vic/ha/EPI1926632) ## Dataset attributes | attribute | value | value friendly | | -------------------- | -------------------- | ---------------------------------------- | -| name | flu_vic_ha | Influenza B Vic HA | +| name | flu/vic/ha | Influenza B Vic HA | | reference | EPI1926632 | B/Austria/1359417/2021 | diff --git a/nextclade/dataset_config/vic/ha/KX058884/README.md b/nextclade/dataset_config/vic/ha/KX058884/README.md index 0f117264..e001f7d4 100644 --- a/nextclade/dataset_config/vic/ha/KX058884/README.md +++ b/nextclade/dataset_config/vic/ha/KX058884/README.md @@ -1,4 +1,4 @@ -# Nextclade dataset for "Influenza B Vic HA" based on reference "B/Brisbane/60/2008" (flu_vic_ha/KX058884) +# Nextclade dataset for "Influenza B Vic HA" based on reference "B/Brisbane/60/2008" (flu/vic/ha/KX058884) The reference sequence for this datasets precedes the deletions at positions 162ff in the HA1 protein of the virus and thus follows the canonical numbering of aminoacids in the protein. 
@@ -6,7 +6,7 @@ The reference sequence for this datasets precedes the deletions at positions 162 | attribute | value | value friendly | | -------------------- | -------------------- | ---------------------------------------- | -| name | flu_vic_ha | Influenza B Vic HA | +| name | flu/vic/ha | Influenza B Vic HA | | reference | KX058884 | B/Brisbane/60/2008 | diff --git a/nextclade/dataset_config/vic/na/CY073894/README.md b/nextclade/dataset_config/vic/na/CY073894/README.md index f2dba64e..f24e20aa 100644 --- a/nextclade/dataset_config/vic/na/CY073894/README.md +++ b/nextclade/dataset_config/vic/na/CY073894/README.md @@ -1,11 +1,11 @@ -# Nextclade dataset for "Influenza B Vic HA" based on reference "B/Brisbane/60/2008" (flu_vic_na/CY073894) +# Nextclade dataset for "Influenza B Vic NA" based on reference "B/Brisbane/60/2008" (flu/vic/na/CY073894) ## Dataset attributes | attribute | value | value friendly | | -------------------- | -------------------- | ---------------------------------------- | -| name | flu_vic_na | Influenza B Vic HA | +| name | flu/vic/na | Influenza B Vic NA | | reference | CY073894 | B/Brisbane/60/2008 | diff --git a/nextclade/dataset_config/yam/ha/JN993010/README.md b/nextclade/dataset_config/yam/ha/JN993010/README.md new file mode 100644 index 00000000..eecc77ef --- /dev/null +++ b/nextclade/dataset_config/yam/ha/JN993010/README.md @@ -0,0 +1,28 @@ +# Nextclade dataset for "Influenza B Yam HA" based on reference "B/Wisconsin/01/2010" (flu/yam/ha/JN993010) + +B/Yamagata viruses have not been observed since 2020. This dataset is provided for analysis of old sequences or suspected Yamagata sequences.
+ +## Dataset attributes + +| attribute | value | value friendly | +| -------------------- | -------------------- | ---------------------------------------- | +| name | flu/yam/ha | Influenza B Yam HA | +| reference | JN993010 | B/Wisconsin/01/2010 | + + +## Features +This dataset supports + + * Assignment to clades + * Identification of glycosylation motifs + * Sequence QC + * Phylogenetic placement + +## Clades of seasonal influenza viruses + +The WHO Collaborating centers define "clades" as genetic groups of viruses with signature mutations to facilitate discussion of circulating diversity of the viruses. +Clade demarcations do not always coincide with significantly different antigenic properties of the viruses. + +## What is Nextclade dataset + +Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html diff --git a/nextclade/dataset_config/yam/ha/JN993010/pathogen.json b/nextclade/dataset_config/yam/ha/JN993010/pathogen.json new file mode 100644 index 00000000..0fec5c9f --- /dev/null +++ b/nextclade/dataset_config/yam/ha/JN993010/pathogen.json @@ -0,0 +1,25 @@ +{ + "nucMutLabelMap": {}, + "nucMutLabelMapReverse": {}, + "aaMotifs": [ + { + "name": "glycosylation", + "nameShort": "Glyc.", + "nameFriendly": "Glycosylation", + "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)", + "includeGenes": [ + { + "gene":"HA1", + "ranges":[] + }, + { + "gene":"HA2", + "ranges":[{"begin":0, "end":186}] + } + ], + "motifs": [ + "N[^P][ST]" + ] + } + ] +} \ No newline at end of file diff --git a/nextclade/dataset_config/yam/includes.txt b/nextclade/dataset_config/yam/includes.txt new file mode 100644 index 00000000..824bfcf5 --- /dev/null +++ b/nextclade/dataset_config/yam/includes.txt @@ -0,0 +1 @@ +B/Phuket/3073/2013 diff --git a/nextclade/scripts/merge_jsons.py b/nextclade/scripts/merge_jsons.py index 1ffc48d2..6037a612 100644 ---
a/nextclade/scripts/merge_jsons.py +++ b/nextclade/scripts/merge_jsons.py @@ -7,11 +7,16 @@ def get_clade_configs(name): "displayName": "Abbreviated clade name", "description": "For recent subclades with long names, the prefix describing their history is omitted." }, + "short-clades": { + "name": "short-clade", + "displayName": "Abbreviated clade name", + "description": "For recent subclades with long names, the prefix describing their history is omitted." + }, "subclade": { "name": "subclade", "displayName": "Subclade", "description": "Experimental fine-grained subclade annotation." - }}.get(name, {'name':name, "displayName":name}) + }}.get(name, {'name':name, "displayName":name, "description":""}) if __name__=="__main__": From 4ea5bb2d74929af6f8820c5bb04522cc3a88e4c1 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Sun, 19 Nov 2023 13:00:47 +0100 Subject: [PATCH 19/26] nextclade: update auspice config --- nextclade/config/auspice_config.json | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/nextclade/config/auspice_config.json b/nextclade/config/auspice_config.json index 418a8348..ed218b66 100644 --- a/nextclade/config/auspice_config.json +++ b/nextclade/config/auspice_config.json @@ -1,9 +1,8 @@ { - "title": "Genomic epidemiology of Influenza", - "build_url": "https://github.com/neherlab/nextclade_data_workflows", + "title": "Nextclade reference dataset for seasonal influenza viruses", + "build_url": "https://github.com/nextstrain/seasonal-flu", "maintainers": [ - { "name": "Cornelius Roemer", "url": "https://neherlab.org" }, - { "name": "Richard Neher", "url": "https://neherlab.org" } + { "name": "Nextstrain team", "url": "https://nextstrain.org" } ], "extensions": { "nextclade": { From 931fb345c6963e2212979390d5fd1d64a56857d3 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Sun, 19 Nov 2023 13:05:01 +0100 Subject: [PATCH 20/26] nextclade: update comments and messages --- nextclade/Snakefile | 6 ++---- 
nextclade/config/config_dict.yaml | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/nextclade/Snakefile b/nextclade/Snakefile index bd1acee0..7f830b15 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -45,7 +45,7 @@ rule download_clades: rule download_changelog_clades: message: - "Downloading clade definitions for {wildcards.lineage} from {params.source} -> {output}" + "Downloading nomenclature changelog for {wildcards.lineage} from {params.source} -> {output}" output: changelog = "data/{lineage}_{segment}_changelog.md" params: @@ -57,7 +57,7 @@ rule download_changelog_clades: rule download_changelog_dataset: message: - "Downloading clade definitions for {wildcards.lineage} from {params.source} -> {output}" + "Downloading previous dataset changelog for {wildcards.lineage} from {params.source} -> {output}" output: changelog = "data/{lineage}_{segment}_{reference}_dataset-changelog.md" params: @@ -448,5 +448,3 @@ rule clean_all: rm -rf data/ """ - -# cp datasets/h3n2/ha/EPI1857216/* ../../../nextstrain/nextclade_data/data/nextstrain/flu/h3n2/ha/EPI1857216 \ No newline at end of file diff --git a/nextclade/config/config_dict.yaml b/nextclade/config/config_dict.yaml index ff866cc6..2ee812bb 100644 --- a/nextclade/config/config_dict.yaml +++ b/nextclade/config/config_dict.yaml @@ -15,7 +15,7 @@ builds: url: "seasonal_A-H1N1pdm_HA/main/.auto-generated/subclades.tsv" key: "subclade" refs: - CY121680: + CY121680: # exclude South Korean genomes because of sequencing artifacts close to the start of HA filter: "--min-date 2014 --probabilistic-sampling --exclude-where country='south_korea' --group-by year --min-length 1500 --subsample-max-sequences 1500" clade_offset: 0 hardmin_date: 2009 From dc141617e9bc29576128dafba824bc3c754bd7d2 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Sun, 19 Nov 2023 13:08:09 +0100 Subject: [PATCH 21/26] nextclade: generalize 'genes' function --- nextclade/Snakefile | 12 +++++++----- 1 file changed, 7 
insertions(+), 5 deletions(-) diff --git a/nextclade/Snakefile b/nextclade/Snakefile index 7f830b15..2158fa58 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -8,6 +8,13 @@ wildcard_constraints: segment = r'pb2|pb1|pa|ha|np|na|mp|ns', reference="[^_/]+", + +def genes(w): + return { + 'ha': ["SigPep", "HA1", "HA2"], + 'na': ["N"] + }.get(w.segment, []) + def all_builds(w): builds = [] for lineage in config["builds"]: @@ -102,11 +109,6 @@ rule parse: --output-sequences {output.sequences} """ - -def genes(w): - if w.segment=='ha': return ["SigPep", "HA1", "HA2"] - if w.segment=='na': return ["NA"] - rule subsample: input: aligned_sequences=rules.parse.output.sequences, From c33b654961eb8acbf3aef7cfe37339261f1a20b1 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Sun, 19 Nov 2023 13:10:07 +0100 Subject: [PATCH 22/26] nextclade: remove redundant env variable def --- nextclade/Snakefile | 1 - 1 file changed, 1 deletion(-) diff --git a/nextclade/Snakefile b/nextclade/Snakefile index 2158fa58..fe0a1de0 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -370,7 +370,6 @@ rule export: date=datetime.datetime.utcnow().strftime("%Y-%m-%d"), shell: """ - AUGUR_RECURSION_LIMIT=10000 \ augur export v2 \ --tree {input.tree} \ --metadata {input.metadata} \ From a72cc56781a5aad8ad2d64c9da3c22c92a316fae Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 20 Nov 2023 16:45:48 -0800 Subject: [PATCH 23/26] Fix NA gene name --- nextclade/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextclade/Snakefile b/nextclade/Snakefile index fe0a1de0..7daaeee2 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -12,7 +12,7 @@ wildcard_constraints: def genes(w): return { 'ha': ["SigPep", "HA1", "HA2"], - 'na': ["N"] + 'na': ["NA"] }.get(w.segment, []) def all_builds(w): From 17eaf0b3e2da2278246b011bb8f1a06ac871b3e1 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 20 Nov 2023 16:46:03 -0800 Subject: [PATCH 24/26] Check for 
outliers file before trying to open it Outliers don't always exist from the treetime clock command, so check before trying to open and avoid a file not found error. --- nextclade/Snakefile | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/nextclade/Snakefile b/nextclade/Snakefile index 7daaeee2..ef54d280 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -229,15 +229,20 @@ rule prune_outliers: params: outliers = "build/{lineage}/{segment}/{reference}/tt_out/outliers.tsv" run: + from pathlib import Path import pandas as pd from Bio import Phylo - outliers = pd.read_csv(params.outliers, sep='\t', index_col=0) + T = Phylo.read(input.tree, 'newick') - for n in outliers.index: - if outliers.loc[n,"given_date"]>2020 and ('-egg' not in n): - print("prune", n) - T.prune(n) + if Path(params.outliers).exists(): + outliers = pd.read_csv(params.outliers, sep='\t', index_col=0) + + for n in outliers.index: + if outliers.loc[n,"given_date"]>2020 and ('-egg' not in n): + print("prune", n) + T.prune(n) + Phylo.write(T, output.tree, "newick") # refine while keeping the root From c8a068433def78b06b7e5d9abd76cd9dc20b34d4 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 20 Nov 2023 17:13:34 -0800 Subject: [PATCH 25/26] Change variable name for genes expected by Nextclade --- nextclade/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextclade/Snakefile b/nextclade/Snakefile index ef54d280..73d54fa6 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -168,7 +168,7 @@ rule align: output: alignment="build/{lineage}/{segment}/{reference}/align.aligned.fasta" params: - outdir=lambda w: f"build/{w.lineage}/{w.segment}/{w.reference}/aligned.gene.{{gene}}.fasta", + outdir=lambda w: f"build/{w.lineage}/{w.segment}/{w.reference}/aligned.gene.{{cds}}.fasta", nextclade_bin = "./nextclade_v3" threads: 3 shell: From 43438be5fd04f81c35051dfbcf2e33da967f0c4c Mon Sep 17 00:00:00 2001 From: John Huddleston Date: 
Tue, 21 Nov 2023 10:14:00 -0800 Subject: [PATCH 26/26] Set a default config for Nextclade workflow Use the standard approach of defining a default config for the Nextclade workflow and save everyone a couple of characters of typing. --- nextclade/Snakefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nextclade/Snakefile b/nextclade/Snakefile index 73d54fa6..910e3512 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -1,3 +1,5 @@ +configfile: "config/config_dict.yaml" + import datetime