From 238f9560de0ed41d13f391f31e1985164239b22d Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Mon, 25 Sep 2023 09:33:57 +0200
Subject: [PATCH 01/26] fix: segment wild card constraints

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 4ab9f54a..2c1c69f1 100644
--- a/Snakefile
+++ b/Snakefile
@@ -4,7 +4,7 @@ from treetime.utils import numeric_date
 
 
 wildcard_constraints:
-    segment = r'pb2|pb1|pa|ha|np|na|ma',
+    segment = r'pb2|pb1|pa|ha|np|na|mp|ns',
     center = r'who|cdc|crick|niid|crick|vidrl',
     passage = r'cell|egg',
     assay = r'fra|hi',

From dcc74524a43cd1e8511e725723e859711badc916 Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Mon, 25 Sep 2023 12:55:29 +0200
Subject: [PATCH 02/26] stub out flu nextclade dataset workflows

---
 nextclade/Snakefile                           | 347 ++++++++++++++++++
 nextclade/config/auspice_config.json          |  50 +++
 nextclade/config/config_dict.yaml             |  81 ++++
 nextclade/config/pathogen.json                |  41 +++
 .../h1n1pdm/ha/CY121680/genemap.gff           |   5 +
 .../h1n1pdm/ha/CY121680/reference.fasta       |   2 +
 .../h1n1pdm/ha/CY121680/virus_properties.json |  26 ++
 .../h1n1pdm/ha/MW626062/genemap.gff           |   5 +
 .../h1n1pdm/ha/MW626062/reference.fasta       |  27 ++
 .../h1n1pdm/ha/MW626062/virus_properties.json |  26 ++
 nextclade/dataset_config/h1n1pdm/includes.txt |   0
 .../h1n1pdm/na/MW626056/genemap.gff           |   4 +
 .../h1n1pdm/na/MW626056/reference.fasta       |  25 ++
 .../h1n1pdm/na/MW626056/virus_properties.json |  22 ++
 .../h3n2/ha/CY163680/genemap.gff              |   5 +
 .../h3n2/ha/CY163680/reference.fasta          |  26 ++
 .../h3n2/ha/CY163680/virus_properties.json    |  26 ++
 .../h3n2/ha/EPI1857216/genemap.gff            |   5 +
 .../h3n2/ha/EPI1857216/reference.fasta        |  23 ++
 .../h3n2/ha/EPI1857216/virus_properties.json  |  56 +++
 nextclade/dataset_config/h3n2/includes.txt    |   7 +
 .../h3n2/na/EPI1857215/genemap.gff            |   4 +
 .../h3n2/na/EPI1857215/reference.fasta        |   2 +
 .../h3n2/na/EPI1857215/virus_properties.json  |  22 ++
 .../vic/ha/EPI1926632/genemap.gff             |   5 +
 .../vic/ha/EPI1926632/reference.fasta         |  25 ++
 .../vic/ha/EPI1926632/virus_properties.json   |  26 ++
 .../vic/ha/KX058884/genemap.gff               |   5 +
 .../vic/ha/KX058884/reference.fasta           |  28 ++
 .../vic/ha/KX058884/virus_properties.json     |  26 ++
 nextclade/dataset_config/vic/includes.txt     |   0
 .../vic/na/CY073894/genemap.gff               |   4 +
 .../vic/na/CY073894/reference.fasta           |  25 ++
 .../vic/na/CY073894/virus_properties.json     |  22 ++
 .../yam/ha/JN993010/genemap.gff               |   5 +
 .../yam/ha/JN993010/reference.fasta           |   2 +
 36 files changed, 1010 insertions(+)
 create mode 100644 nextclade/Snakefile
 create mode 100644 nextclade/config/auspice_config.json
 create mode 100644 nextclade/config/config_dict.yaml
 create mode 100644 nextclade/config/pathogen.json
 create mode 100644 nextclade/dataset_config/h1n1pdm/ha/CY121680/genemap.gff
 create mode 100644 nextclade/dataset_config/h1n1pdm/ha/CY121680/reference.fasta
 create mode 100644 nextclade/dataset_config/h1n1pdm/ha/CY121680/virus_properties.json
 create mode 100644 nextclade/dataset_config/h1n1pdm/ha/MW626062/genemap.gff
 create mode 100644 nextclade/dataset_config/h1n1pdm/ha/MW626062/reference.fasta
 create mode 100644 nextclade/dataset_config/h1n1pdm/ha/MW626062/virus_properties.json
 create mode 100644 nextclade/dataset_config/h1n1pdm/includes.txt
 create mode 100644 nextclade/dataset_config/h1n1pdm/na/MW626056/genemap.gff
 create mode 100644 nextclade/dataset_config/h1n1pdm/na/MW626056/reference.fasta
 create mode 100644 nextclade/dataset_config/h1n1pdm/na/MW626056/virus_properties.json
 create mode 100644 nextclade/dataset_config/h3n2/ha/CY163680/genemap.gff
 create mode 100644 nextclade/dataset_config/h3n2/ha/CY163680/reference.fasta
 create mode 100644 nextclade/dataset_config/h3n2/ha/CY163680/virus_properties.json
 create mode 100644 nextclade/dataset_config/h3n2/ha/EPI1857216/genemap.gff
 create mode 100644 nextclade/dataset_config/h3n2/ha/EPI1857216/reference.fasta
 create mode 100644 nextclade/dataset_config/h3n2/ha/EPI1857216/virus_properties.json
 create mode 100644 nextclade/dataset_config/h3n2/includes.txt
 create mode 100644 nextclade/dataset_config/h3n2/na/EPI1857215/genemap.gff
 create mode 100644 nextclade/dataset_config/h3n2/na/EPI1857215/reference.fasta
 create mode 100644 nextclade/dataset_config/h3n2/na/EPI1857215/virus_properties.json
 create mode 100644 nextclade/dataset_config/vic/ha/EPI1926632/genemap.gff
 create mode 100644 nextclade/dataset_config/vic/ha/EPI1926632/reference.fasta
 create mode 100644 nextclade/dataset_config/vic/ha/EPI1926632/virus_properties.json
 create mode 100644 nextclade/dataset_config/vic/ha/KX058884/genemap.gff
 create mode 100644 nextclade/dataset_config/vic/ha/KX058884/reference.fasta
 create mode 100644 nextclade/dataset_config/vic/ha/KX058884/virus_properties.json
 create mode 100644 nextclade/dataset_config/vic/includes.txt
 create mode 100644 nextclade/dataset_config/vic/na/CY073894/genemap.gff
 create mode 100644 nextclade/dataset_config/vic/na/CY073894/reference.fasta
 create mode 100644 nextclade/dataset_config/vic/na/CY073894/virus_properties.json
 create mode 100644 nextclade/dataset_config/yam/ha/JN993010/genemap.gff
 create mode 100644 nextclade/dataset_config/yam/ha/JN993010/reference.fasta

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
new file mode 100644
index 00000000..ab5819d8
--- /dev/null
+++ b/nextclade/Snakefile
@@ -0,0 +1,347 @@
+import datetime
+
+# Regexes limiting what each wildcard may match (raw strings, so "\d" is not read as a string escape).
+wildcard_constraints:
+    flu_type=r"[A-Za-z0-9]+",
+    year=r"\d\d\d\d",
+    lineage=r"h3n2|h1n1pdm|vic|yam",
+    segment = r'pb2|pb1|pa|ha|np|na|mp|ns',
+    reference=r"[^_/]+",
+    # Limited to the clade systems named in the config so "{clade}.json" cannot also match muts.json or branch_lengths.json.
+    clade=r"clade|subclade|short-clade",
+
+# Stream the xz-compressed raw FASTA for one lineage/segment from S3 to stdout and decompress it into data/.
+rule download_sequences:
+    output:
+        sequences="data/{lineage}/raw_{segment}.fasta"
+    params:
+        s3_path="s3://nextstrain-data-private/files/workflows/seasonal-flu/{lineage}/{segment}/raw_sequences.fasta.xz"
+    conda: "../../workflow/envs/nextstrain.yaml"
+    shell:
+        """
+        aws s3 cp {params.s3_path} - | xz -c -d > {output.sequences}
+        """
+# Fetch one clade-definition TSV; --fail makes an HTTP error abort the rule instead of saving the error page as the TSV.
+rule download_clades:
+    message:
+        "Downloading clade definitions for {wildcards.lineage} from {params.source_tsv} -> {output}"
+    output:
+        clade_tsv = "data/{clade}_{lineage}_{segment}_{reference}_raw.tsv"
+    params:
+        source_tsv=lambda w: config['clade_repo'] + config["builds"][w.lineage][w.segment]["clade_systems"][w.clade],
+    shell:
+        """
+        curl --fail --silent --show-error {params.source_tsv} > {output.clade_tsv}
+        """
+# Add the reference's `clade_offset` to the third tab-separated column of every row whose second column matches "nuc".
+rule offset_clades:
+    input:
+        rules.download_clades.output,
+    output:
+        "data/{clade}_{lineage}_{segment}_{reference}.tsv",
+    params:
+        offset=lambda w: config["builds"][w.lineage][w.segment]['refs'][w.reference][
+            "clade_offset"
+        ],
+    shell:
+        """
+        perl -F'\\t' -ne \
+            '$F[2]+={params.offset} if $F[1] =~ "nuc"; \
+            print join "\\t", @F' \
+            {input} \
+            >{output}
+        """
+# Split the raw FASTA into a metadata TSV and a sequence FASTA with `augur parse`; `fields` is passed to --fields.
+rule parse:
+    input:
+        sequences="data/{lineage}/raw_{segment}.fasta",
+    output:
+        metadata="data/{lineage}_{segment}_metadata.tsv",
+        sequences="data/{lineage}_{segment}_sequences.fasta",
+    params:
+        fields="strainName virus segment EPI_ISL date submission_date region country division location passage collecting_lab submitting_lab age sex",
+    shell:
+        """
+        augur parse \
+            --sequences {input.sequences} \
+            --fields {params.fields} \
+            --output-metadata {output.metadata} \
+            --output-sequences {output.sequences}
+        """
+
+def genes(w):
+    """Return the gene names for `w.segment`; only 'ha' and 'na' are known, anything else gives None."""
+    gene_names = {'ha': ["SigPep", "HA1", "HA2"], 'na': ["NA"]}
+    return gene_names.get(w.segment)
+# Subsample the parsed sequences with `augur filter`: drop the strains listed in the exclude file and
+# always include the reference strains, any extra include file and the reference's own EPI_ISL.
+rule subsample:
+    input:
+        aligned_sequences=rules.parse.output.sequences,
+        enriched_metadata=rules.parse.output.metadata,
+        include_strains="../config/{lineage}/reference_strains.txt",
+        exclude="../config/{lineage}/outliers.txt",
+    output:
+        sampled_sequences="build/{lineage}/{segment}/{reference}/subsample.fasta",
+        sampled_strains="build/{lineage}/{segment}/{reference}/subsample.txt",
+    params:
+        # Per-reference settings sit under the segment's "refs" key in the config
+        # (the same path offset_clades uses to look up "clade_offset").
+        filter_arguments=lambda w: config["builds"][w.lineage][w.segment]["refs"][w.reference]["filter"],
+        reference_EPI_ISL=lambda w: config["builds"][w.lineage][w.segment]["refs"][w.reference]["reference_EPI_ISL"],
+        other_include = lambda w:config["builds"][w.lineage][w.segment]["refs"][w.reference].get("include_file","")
+    shell:
+        """
+        augur filter \
+            --sequences {input.aligned_sequences} \
+            --metadata {input.enriched_metadata} \
+            --include {input.include_strains} {params.other_include} \
+            --exclude {input.exclude} \
+            --include-where EPI_ISL={params.reference_EPI_ISL} \
+            {params.filter_arguments} \
+            --output {output.sampled_sequences} \
+            --output-strains {output.sampled_strains}
+        """
+# Align the subsample to the reference with `nextclade run`; translations go to the "{gene}" file-name template in params.outdir.
+rule align:
+    input:
+        sequences="build/{lineage}/{segment}/{reference}/subsample.fasta",
+        annotation="references/{lineage}/{segment}/{reference}/annotation.gff",
+        reference="references/{lineage}/{segment}/{reference}/reference.fasta",
+    output:
+        alignment="build/{lineage}/{segment}/{reference}/align.aligned.fasta",
+        insertions="build/{lineage}/{segment}/{reference}/align.insertions.csv",
+    params:
+        outdir=lambda w: f"build/{w.lineage}/{w.segment}/{w.reference}/aligned.gene.{{gene}}.fasta",
+    threads: 3
+    shell:
+        """
+        nextclade run \
+            --jobs={threads} \
+            --input-ref {input.reference} \
+            --input-annotation {input.annotation} \
+            --output-translations {params.outdir} \
+            --output-fasta {output.alignment} \
+            --output-insertions {output.insertions} \
+            {input.sequences} \
+            2>&1
+        """
+# Build a tree from the alignment with `augur tree`; stdout is discarded.
+# NOTE(review): params.args is "" when no tree-builder-args are configured, leaving --tree-builder-args without a value - confirm augur accepts that.
+rule tree:
+    input:
+        alignment=rules.align.output.alignment,
+    output:
+        tree="build/{lineage}/{segment}/{reference}/tree_raw.nwk",
+    params:
+        args=lambda w: config["tree"].get("tree-builder-args", "")
+        if "tree" in config
+        else "",
+    threads: 3
+    shell:
+        """
+        augur tree \
+            --alignment {input.alignment} \
+            --tree-builder-args {params.args} \
+            --output {output.tree} \
+            --nthreads {threads} \
+            > /dev/null
+        """
+# Root the raw tree with `treetime clock` using the dates in the parsed metadata, then copy rerooted.newick
+# out of the treetime output directory. The sequence length is fixed at 1500 (good enough, doesn't matter).
+rule root:
+    input:
+        tree=rules.tree.output.tree,
+        metadata = rules.parse.output.metadata,
+    output:
+        tree="build/{lineage}/{segment}/{reference}/tree_rooted.nwk",
+    params:
+        outdir = "build/{lineage}/{segment}/{reference}/tt_out"
+    shell:
+        """
+        treetime clock \
+            --tree {input.tree} \
+            --sequence-length 1500 \
+            --dates {input.metadata} \
+            --clock-filter 4 \
+            --clock-filter-method local \
+            --outdir {params.outdir}
+        cp {params.outdir}/rerooted.newick {output.tree}
+        """
+# Run `augur refine` on the rooted tree while keeping its root (--keep-root);
+# writes the refined tree and a branch-length node-data JSON, with divergence in mutations per site.
+rule refine:
+    input:
+        tree=rules.root.output.tree,
+        alignment=rules.align.output.alignment,
+        enriched_metadata=rules.parse.output.metadata,
+    output:
+        tree="build/{lineage}/{segment}/{reference}/tree.nwk",
+        node_data="build/{lineage}/{segment}/{reference}/branch_lengths.json",
+    threads: 1
+    shell:
+        """
+        augur refine \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --metadata {input.enriched_metadata} \
+            --output-tree {output.tree} \
+            --output-node-data {output.node_data} \
+            --keep-root \
+            --divergence-unit mutations-per-site
+        """
+
+# Reconstruct ancestral nucleotide and amino-acid sequences and mutations with `augur ancestral`.
+rule ancestral:
+    message:
+        """
+        Reconstructing ancestral sequences and mutations
+          - inferring ambiguous mutations
+        """
+    input:
+        tree=rules.refine.output.tree,
+        alignment=rules.align.output.alignment,
+        annotation="references/{lineage}/{segment}/{reference}/annotation.gff",
+        reference="references/{lineage}/{segment}/{reference}/reference.fasta",
+    output:
+        node_data="build/{lineage}/{segment}/{reference}/muts.json",
+    params:
+        inference="joint",
+        genes=genes,
+        # --translations takes a single file-name template; augur replaces %GENE
+        # with each name passed to --genes. The path mirrors the per-gene
+        # translation template that the align rule hands to nextclade
+        # (aligned.gene.<gene>.fasta in the same build directory).
+        translations=lambda w: (
+            f"build/{w.lineage}/{w.segment}/{w.reference}/aligned.gene.%GENE.fasta"
+        ),
+    shell:
+        """
+        augur ancestral \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --inference {params.inference} \
+            --infer-ambiguous \
+            --genes {params.genes} \
+            --annotation {input.annotation} \
+            --translations {params.translations} \
+            --root-sequence {input.reference} \
+            --output-node-data {output.node_data}
+        """
+# Label clades with `augur clades` from the offset clade definitions and the mutations written by the ancestral rule.
+rule clades:
+    message:
+        "Adding internal clade labels"
+    input:
+        tree=rules.refine.output.tree,
+        nucs=rules.ancestral.output.node_data,
+        clades=rules.offset_clades.output,
+    output:
+        node_data="build/{lineage}/{segment}/{reference}/{clade}.json",
+    shell:
+        """
+        augur clades --tree {input.tree} \
+            --mutations {input.nucs} \
+            --clades {input.clades} \
+            --output-node-data {output.node_data} \
+            > /dev/null
+        """
+
+def get_node_data(w):
+    """List the node-data JSON files `augur export` needs for one lineage/segment/reference.
+
+    Returns concrete paths (not wildcard patterns): refine's branch lengths, ancestral's
+    mutations, one clade file per entry in the segment's `clade_systems`, and
+    clades-short.json when the reference config defines `clade_contractions`.
+    """
+    build_dir = f"build/{w.lineage}/{w.segment}/{w.reference}"
+    segment_config = config["builds"][w.lineage][w.segment]
+    node_data = [f"{build_dir}/branch_lengths.json", f"{build_dir}/muts.json"]
+    node_data += [f"{build_dir}/{clade}.json" for clade in segment_config["clade_systems"]]
+    if "clade_contractions" in segment_config["refs"][w.reference]:
+        node_data.append(f"{build_dir}/clades-short.json")
+    return node_data
+
+# Export tree, metadata and node data to an Auspice JSON with `augur export v2`; per-reference settings are read from the "refs" config level.
+rule export:
+    message:
+        "Exporting data files for auspice"
+    input:
+        tree=rules.refine.output.tree,
+        metadata=rules.parse.output.metadata,
+        node_data = get_node_data,
+        auspice_config=lambda w: config["files"]["auspice_config_shortclade"] if "clade_contractions" in config["builds"][w.lineage][w.segment]["refs"][w.reference] else config["files"]["auspice_config"],
+    output:
+        auspice_json="auspice/{lineage}/{segment}/{reference}/auspice_raw.json",
+    params:
+        fields="region strainName country date EPI_ISL",
+        date=datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d"),
+    shell:
+        """
+        AUGUR_RECURSION_LIMIT=10000 \
+        augur export v2 \
+            --tree {input.tree} \
+            --metadata {input.metadata} \
+            --node-data {input.node_data} \
+            --auspice-config {input.auspice_config} \
+            --color-by-metadata {params.fields} \
+            --title "Nextclade reference tree for Influenza type:{wildcards.lineage} segment:{wildcards.segment} with root {wildcards.reference} built on {params.date}" \
+            --output {output.auspice_json} 2>&1;
+        """
+
+# Draw example sequences with `augur filter`: dated 2020 or later, grouped by year, at most 50, excluding records whose country is not USA or whose submitting_lab is not the CDC value below.
+rule generate_sample_sequences:
+    input:
+        sequences="data/{lineage}_{segment}_sequences.fasta",
+        metadata=rules.parse.output.metadata,
+    output:
+        sequences="build/{lineage}/{segment}/{reference}/sample_sequences.fasta",
+    shell:
+        """
+        augur filter \
+            --sequences {input.sequences} \
+            --metadata {input.metadata} \
+            --min-date 2020 --group-by year --subsample-max-sequences 50  \
+            --exclude-ambiguous-dates-by year \
+            --exclude-where 'country!=USA' 'submitting_lab!=centers_for_disease_control_and_prevention' \
+            --probabilistic-sampling \
+            --output {output.sequences}
+        """
+# Assemble the dataset directory: each copy runs as its own command (no line continuations), then jq merges pathogen.json with virus_properties.json.
+rule make_dataset:
+    input:
+        sequences="build/{lineage}/{segment}/{reference}/sample_sequences.fasta",
+        auspice_json="auspice/{lineage}/{segment}/{reference}/auspice.json",
+        annotation="references/{lineage}/{segment}/{reference}/annotation.gff",
+        reference="references/{lineage}/{segment}/{reference}/reference.fasta",
+        pathogen_json="references/{lineage}/{segment}/{reference}/pathogen.json",
+        additional_config="references/{lineage}/{segment}/{reference}/virus_properties.json",
+    output:
+        sequences="datasets/{lineage}/{segment}/{reference}/example_sequences.fasta",
+        tree="datasets/{lineage}/{segment}/{reference}/tree.json",
+        annotation="datasets/{lineage}/{segment}/{reference}/annotation.gff",
+        reference="datasets/{lineage}/{segment}/{reference}/reference.fasta",
+        pathogen_json="datasets/{lineage}/{segment}/{reference}/pathogen.json",
+    shell:
+        """
+        cp {input.sequences} {output.sequences}
+        cp {input.auspice_json} {output.tree}
+        cp {input.reference} {output.reference}
+        cp {input.annotation} {output.annotation}
+        jq -s '.[0] * .[1]' {input.pathogen_json} {input.additional_config} > {output.pathogen_json}
+        """
+
+
+# Remove the output and test directories, data/clades* and data/include*, and everything inside auspice/.
+rule clean:
+    shell:
+        """
+        rm -rf output test data/clades* data/include* auspice/*
+        """
+
+# Remove the output, test, auspice, build and data directories entirely.
+rule clean_all:
+    shell:
+        """
+        rm -rf output test auspice build data
+        """
diff --git a/nextclade/config/auspice_config.json b/nextclade/config/auspice_config.json
new file mode 100644
index 00000000..418a8348
--- /dev/null
+++ b/nextclade/config/auspice_config.json
@@ -0,0 +1,50 @@
+{
+  "title": "Genomic epidemiology of Influenza",
+  "build_url": "https://github.com/neherlab/nextclade_data_workflows",
+  "maintainers": [
+    { "name": "Cornelius Roemer", "url": "https://neherlab.org" },
+    { "name": "Richard Neher", "url": "https://neherlab.org" }
+  ],
+  "extensions": {
+    "nextclade": {
+    }
+  },
+  "data_provenance": [
+    {
+      "name": "GISAID"
+    }
+  ],
+  "colorings": [
+    {
+      "key": "country",
+      "title": "Country",
+      "type": "categorical"
+    },
+    {
+      "key": "region",
+      "title": "Region",
+      "type": "categorical"
+    },
+    {
+      "key": "date",
+      "title": "Sample Date",
+      "type": "ordinal"
+    },
+    {
+      "key": "EPI_ISL",
+      "title": "EPI_ISL",
+      "type": "categorical"
+    }
+  ],
+  "filters": [
+    "region",
+    "country",
+    "clade_membership"
+  ],
+  "display_defaults": {
+    "color_by": "clade_membership",
+    "distance_measure": "div",
+    "branch_label": "clade"
+  },
+  "panels": ["tree","entropy"]
+}
diff --git a/nextclade/config/config_dict.yaml b/nextclade/config/config_dict.yaml
new file mode 100644
index 00000000..239f1f9e
--- /dev/null
+++ b/nextclade/config/config_dict.yaml
@@ -0,0 +1,81 @@
+clade_repo: "https://raw.githubusercontent.com/influenza-clade-nomenclature/"
+
+builds:
+  h1n1pdm:
+    ha:
+      clade_systems:
+        "clade": "seasonal_A-H1N1pdm_HA/main/.auto-generated/clades-long.tsv"
+        "subclade": "seasonal_A-H1N1pdm_HA/main/.auto-generated/subclades.tsv"
+        "short-clade": "seasonal_A-H1N1pdm_HA/main/.auto-generated/clades.tsv"
+      refs:
+        CY121680:
+          filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500  --subsample-max-sequences 1500"
+          clade_offset: 0
+          reference_EPI_ISL: EPI1583287
+          reference_strain: A/California/7/2009-egg #TODO: exclude
+        MW626062:
+          filter: "--min-date 2019 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500"
+          clade_offset: 0
+          clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h1n1pdm/ha/clades-long.tsv"
+          reference_EPI_ISL: EPI1812046
+          reference_strain: A/Wisconsin/588/2019
+    na:
+      clade_systems:
+        "clade": "seasonal_A-H1N1pdm_NA/main/.auto-generated/subclades.tsv"
+      refs:
+        MW626056:
+          filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 1500"
+          clade_offset: 0
+          reference_EPI_ISL: EPI1812046
+          reference_strain: A/Wisconsin/588/2019
+  h3n2:
+    ha:
+      clade_systems:
+        "clade": "seasonal_A-H3N2_HA/main/.auto-generated/clades-long.tsv"
+        "subclade": "seasonal_A-H3N2_HA/main/.auto-generated/subclades.tsv"
+        "short-clade": "seasonal_A-H3N2_HA/main/.auto-generated/clades.tsv"
+      refs:
+        EPI1857216:
+          filter: "--min-date 2019 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500"
+          clade_offset: -17
+          clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv"
+          include_file: references/h3n2/includes.txt
+          reference_EPI_ISL: EPI1857216
+          reference_strain: A/Darwin/6/2021
+        CY163680:
+          filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500"
+          clade_offset: 0
+          clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv"
+          include_file: references/h3n2/includes.txt
+          reference_EPI_ISL: EPI545340
+          reference_strain: A/Wisconsin/67/2005-egg
+    na:
+      clade_systems:
+        "clade": "seasonal_A-H3N2_NA/main/.auto-generated/subclades.tsv"
+      refs:
+        EPI1857215:
+          filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 1500"
+          clade_offset: 4
+          reference_EPI_ISL: EPI1857215
+          reference_strain: A/Darwin/6/2021
+  vic:
+    ha:
+      clade_systems:
+        "clade": "seasonal_B-Vic_HA/main/.auto-generated/clades.tsv"
+        "subclade": "seasonal_B-Vic_HA/main/.auto-generated/subclades.tsv"
+      refs:
+        KX058884:
+          filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500"
+          clade_offset: 0
+          clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/vic/ha/clades.tsv"
+          reference_EPI_ISL: EPI696970
+          reference_strain: B/Brisbane/60/2008-egg
+    na:
+      clade_systems:
+        "clade": "seasonal_B-Vic_NA/main/.auto-generated/subclades.tsv"
+      refs:
+        CY073894:
+          filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 1500"
+          clade_offset: -30
+          reference_EPI_ISL: CY073894
+          reference_strain: B/Brisbane/60/2008
diff --git a/nextclade/config/pathogen.json b/nextclade/config/pathogen.json
new file mode 100644
index 00000000..d536f97f
--- /dev/null
+++ b/nextclade/config/pathogen.json
@@ -0,0 +1,41 @@
+{
+    "alignmentParams": {
+        "excessBandwidth": 9,
+        "terminalBandwidth": 100,
+        "allowedMismatches": 4,
+        "gapAlignmentSide": "right",
+        "minSeedCover": 0.1
+    },
+    "qc": {
+        "privateMutations": {
+          "enabled": true,
+          "typical": 5,
+          "cutoff": 15,
+          "weightLabeledSubstitutions": 2,
+          "weightReversionSubstitutions": 1,
+          "weightUnlabeledSubstitutions": 1
+        },
+        "missingData": {
+          "enabled": false,
+          "missingDataThreshold": 100,
+          "scoreBias": 10
+        },
+        "snpClusters": {
+          "enabled": false,
+          "windowSize": 100,
+          "clusterCutOff": 5,
+          "scoreWeight": 50
+        },
+        "mixedSites": {
+          "enabled": true,
+          "mixedSitesThreshold": 4
+        },
+        "frameShifts": {
+          "enabled": true
+        },
+        "stopCodons": {
+          "enabled": true,
+          "ignoredStopCodons": []
+        }
+    }
+}
diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/genemap.gff b/nextclade/dataset_config/h1n1pdm/ha/CY121680/genemap.gff
new file mode 100644
index 00000000..a3952a1f
--- /dev/null
+++ b/nextclade/dataset_config/h1n1pdm/ha/CY121680/genemap.gff
@@ -0,0 +1,5 @@
+##gff-version 3
+##sequence-region CY121680.1 1 1752
+CY121680.1	feature	gene	21	71	.	+	.	gene_name="SigPep"
+CY121680.1	feature	gene	72	1052	.	+	.	gene_name="HA1"
+CY121680.1	feature	gene	1053	1718	.	+	.	gene_name="HA2"
diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/reference.fasta b/nextclade/dataset_config/h1n1pdm/ha/CY121680/reference.fasta
new file mode 100644
index 00000000..a3b664be
--- /dev/null
+++ b/nextclade/dataset_config/h1n1pdm/ha/CY121680/reference.fasta
@@ -0,0 +1,2 @@
+>CY121680.1 Influenza A virus (A/California/07/2009(H1N1)) hemagglutinin (HA) gene, complete cds
+GGAAAACAAAAGCAACAAAAATGAAGGCAATACTAGTAGTTCTGCTATATACATTTGCAACCGCAAATGCAGACACATTATGTATAGGTTATCATGCGAACAATTCAACAGACACTGTAGACACAGTACTAGAAAAGAATGTAACAGTAACACACTCTGTTAACCTTCTAGAAGACAAGCATAACGGGAAACTATGCAAACTAAGAGGGGTAGCCCCATTGCATTTGGGTAAATGTAACATTGCTGGCTGGATCCTGGGAAATCCAGAGTGTGAATCACTCTCCACAGCAAGCTCATGGTCCTACATTGTGGAAACACCTAGTTCAGACAATGGAACGTGTTACCCAGGAGATTTCATCGATTATGAGGAGCTAAGAGAGCAATTGAGCTCAGTGTCATCATTTGAAAGGTTTGAGATATTCCCCAAGACAAGTTCATGGCCCAATCATGACTCGAACAAAGGTGTAACGGCAGCATGTCCTCATGCTGGAGCAAAAAGCTTCTACAAAAATTTAATATGGCTAGTTAAAAAAGGAAATTCATACCCAAAGCTCAGCAAATCCTACATTAATGATAAAGGGAAAGAAGTCCTCGTGCTATGGGGCATTCACCATCCATCTACTAGTGCTGACCAACAAAGTCTCTATCAGAATGCAGATGCATATGTTTTTGTGGGGTCATCAAGATACAGCAAGAAGTTCAAGCCGGAAATAGCAATAAGACCCAAAGTGAGGGATCGAGAAGGGAGAATGAACTATTACTGGACACTAGTAGAGCCGGGAGACAAAATAACATTCGAAGCAACTGGAAATCTAGTGGTACCGAGATATGCATTCGCAATGGAAAGAAATGCTGGATCTGGTATTATCATTTCAGATACACCAGTCCACGATTGCAATACAACTTGTCAAACACCCAAGGGTGCTATAAACACCAGCCTCCCATTTCAGAATATACATCCGATCACAATTGGAAAATGTCCAAAATATGTAAAAAGCACAAAATTGAGACTGGCCACAGGATTGAGGAATATCCCGTCTATTCAATCTAGAGGCCTATTTGGGGCCATTGCCGGTTTCATTGAAGGGGGGTGGACAGGGATGGTAGATGGATGGTACGGTTATCACCATCAAAATGAGCAGGGGTCAGGATATGCAGCCGACCTGAAGAGCACACAGAATGCCATTGACGAGATTACTAACAAAGTAAATTCTGTTATTGAAAAGATGAATACACAGTTCACAGCAGTAGGTAAAGAGTTCAACCACCTGGAAAAAAGAATAGAGAATTTAAATAAAAAAGTTGATGATGGTTTCCTGGACATTTGGACTTACAATGCCGAACTGTTGGTTCTATTGGAAAATGAAAGAACTTTGGACTACCACGATTCAAATGTGAAGAACTTATATGAAAAGGTAAGAAGCCAGCTAAAAAACAATGCCAAGGAAATTGGAAACGGCTGCTTTGAATTTTACCACAAATGCGATAACACGTGCATGGAAAGTGTCAAAAATGGGACTTATGACTACCCAAAATACTCAGAGGAAGCAAAATTAAACAGAGAAGAAATAGATGGGGTAAAGCTGGAATCAACAAGGATTTACCAGATTTTGGCGATCTATTCAACTGTCGCCAGTTCATTGGTACTGGTAGTCTCCCTGGGGGCAATCAGTTTCTGGATGTGCTCTAATGGGTCTCTACAGTGTAGAATATGTATTTAACATTAGGATTTCAGAAGCATGAGAAAAACAC
diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/virus_properties.json b/nextclade/dataset_config/h1n1pdm/ha/CY121680/virus_properties.json
new file mode 100644
index 00000000..a9af3dfd
--- /dev/null
+++ b/nextclade/dataset_config/h1n1pdm/ha/CY121680/virus_properties.json
@@ -0,0 +1,26 @@
+{
+    "schemaVersion": "1.10.0",
+    "nucMutLabelMap": {},
+    "nucMutLabelMapReverse": {},
+    "aaMotifs": [
+      {
+        "name": "glycosylation",
+        "nameShort": "Glyc.",
+        "nameFriendly": "Glycosylation",
+        "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)",
+        "includeGenes": [
+          {
+            "gene":"HA1",
+            "ranges":[]
+          },
+          {
+            "gene":"HA2",
+            "ranges":[{"begin":0, "end":186}]
+          }
+        ],
+        "motifs": [
+          "N[^P][ST]"
+        ]
+      }
+    ]
+}
\ No newline at end of file
diff --git a/nextclade/dataset_config/h1n1pdm/ha/MW626062/genemap.gff b/nextclade/dataset_config/h1n1pdm/ha/MW626062/genemap.gff
new file mode 100644
index 00000000..825fe437
--- /dev/null
+++ b/nextclade/dataset_config/h1n1pdm/ha/MW626062/genemap.gff
@@ -0,0 +1,5 @@
+##gff-version 3
+##sequence-region MW626062.1 1 1752
+MW626062.1	feature	gene	21	71	.	+	.	gene_name="SigPep"
+MW626062.1	feature	gene	72	1052	.	+	.	gene_name="HA1"
+MW626062.1	feature	gene	1053	1718	.	+	.	gene_name="HA2"
diff --git a/nextclade/dataset_config/h1n1pdm/ha/MW626062/reference.fasta b/nextclade/dataset_config/h1n1pdm/ha/MW626062/reference.fasta
new file mode 100644
index 00000000..546126e1
--- /dev/null
+++ b/nextclade/dataset_config/h1n1pdm/ha/MW626062/reference.fasta
@@ -0,0 +1,27 @@
+>MW626062.1 Influenza A virus (A/Wisconsin/588/2019(H1N1)) segment 4 hemagglutinin (HA) gene, complete cds
+GGAAAACAAAAGCAACAAAAATGAAGGCAATACTAGTAGTTATGCTGTATACATTTACAACCGCAAATGC
+AGACACATTATGTATAGGTTATCATGCGAACAATTCAACAGACACTGTGGACACAGTACTAGAAAAGAAT
+GTAACAGTAACACACTCTGTCAATCTTCTGGAAGACAAGCATAACGGAAAACTATGCAAACTAAGAGGGG
+TAGCCCCATTGCATTTGGGTAAATGTAACATTGCTGGCTGGATCCTGGGAAATCCAGAGTGTGAATCACT
+CTCCACAGCAAGATCATGGTCCTACATTGTGGAAACATCTAATTCAGACAATGGAACGTGTTACCCAGGA
+GATTTCATCAATTATGAGGAGCTAAGAGAGCAATTGAGCTCAGTGTCATCATTTGAAAGGTTTGAAATAT
+TCCCCAAGACAAGTTCATGGCCTAATCATGACTCGGACAATGGTGTAACGGCAGCATGTCCTCACGCTGG
+AGCAAAAAGCTTCTACAAAAACTTGATATGGCTGGTTAAAAAAGGAAAATCATACCCAAAGATCAACCAA
+ACCTACATTAATGATAAAGGGAAAGAAGTCCTCGTGCTGTGGGGCATTCACCATCCACCTACTATTGCTG
+ACCAACAAAGTCTCTATCAGAATGCAGATGCATATGTTTTTGTGGGGACATCAAGATACAGCAAGAAGTT
+CAAGCCGGAAATAGCAACAAGACCCAAAGTGAGGGATCAAGAAGGGAGAATGAACTATTACTGGACACTA
+GTAGAACCGGGAGACAAAATAACATTCGAAGCAACTGGTAATCTAGTGGCACCGAGATATGCATTCACAA
+TGGAAAGAGATGCTGGATCTGGTATTATCATTTCAGATACACCAGTCCACGATTGCAATACAACTTGTCA
+GACACCCGAGGGTGCTATAAACACCAGCCTCCCATTTCAGAATGTACATCCGATCACAATTGGGAAATGT
+CCAAAGTATGTAAAAAGCACAAAATTGAGACTGGCCACAGGATTGAGGAATGTCCCGTCTATTCAATCTA
+GAGGCCTATTCGGGGCCATTGCTGGCTTCATCGAAGGGGGGTGGACAGGGATGGTAGATGGATGGTACGG
+TTATCACCATCAAAATGAGCAGGGGTCAGGATATGCAGCCGATCTGAAGAGCACACAAAATGCCATTGAT
+AAGATTACTAACAAAGTAAATTCTGTTATTGAAAAGATGAATACACAGTTCACAGCAGTTGGTAAAGAGT
+TCAACCACCTTGAAAAAAGAATAGAGAATCTAAATAAAAAGGTTGATGATGGTTTCCTGGACATTTGGAC
+TTACAATGCCGAACTGTTGGTTCTACTGGAAAACGAAAGAACTTTGGACTATCACGATTCAAATGTGAAG
+AACTTGTATGAAAAAGTAAGAAACCAGTTAAAAAACAATGCCAAGGAAATTGGAAACGGCTGCTTTGAAT
+TTTACCACAAATGCGACAACACATGCATGGAAAGTGTCAAGAATGGGACTTATGACTACCCAAAATACTC
+AGAGGAAGCAAAATTAAACAGAGAAAAAATAGATGGAGTAAAGCTGGACTCAACAAGGATCTACCAGATT
+TTGGCGATCTATTCAACTGTTGCCAGTTCATTGGTACTGGTAGTCTCCCTGGGGGCAATCAGCTTCTGGA
+TGTGCTCTAATGGGTCTCTACAGTGTAGAATATGTATTTAACATTAGGATTTCAGAATCATGAGAAAAAC
+AC
diff --git a/nextclade/dataset_config/h1n1pdm/ha/MW626062/virus_properties.json b/nextclade/dataset_config/h1n1pdm/ha/MW626062/virus_properties.json
new file mode 100644
index 00000000..a9af3dfd
--- /dev/null
+++ b/nextclade/dataset_config/h1n1pdm/ha/MW626062/virus_properties.json
@@ -0,0 +1,26 @@
+{
+    "schemaVersion": "1.10.0",
+    "nucMutLabelMap": {},
+    "nucMutLabelMapReverse": {},
+    "aaMotifs": [
+      {
+        "name": "glycosylation",
+        "nameShort": "Glyc.",
+        "nameFriendly": "Glycosylation",
+        "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)",
+        "includeGenes": [
+          {
+            "gene":"HA1",
+            "ranges":[]
+          },
+          {
+            "gene":"HA2",
+            "ranges":[{"begin":0, "end":186}]
+          }
+        ],
+        "motifs": [
+          "N[^P][ST]"
+        ]
+      }
+    ]
+}
\ No newline at end of file
diff --git a/nextclade/dataset_config/h1n1pdm/includes.txt b/nextclade/dataset_config/h1n1pdm/includes.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/nextclade/dataset_config/h1n1pdm/na/MW626056/genemap.gff b/nextclade/dataset_config/h1n1pdm/na/MW626056/genemap.gff
new file mode 100644
index 00000000..f026596c
--- /dev/null
+++ b/nextclade/dataset_config/h1n1pdm/na/MW626056/genemap.gff
@@ -0,0 +1,4 @@
+##gff-version 3
+##sequence-region MW626056.1 1 1433
+MW626056.1	annotation	remark	1	1433	.	.	.	accessions=MW626056;data_file_division=VRL;date=23-FEB-2021;keywords=;molecule_type=cRNA;organism=Influenza A virus;references=location: %5B0:1433%5D%0Aauthors: Jernigan%2CD.%2C Wentworth%2CD.%2C Barnes%2CJ.%2C Garten%2CR. and Xu%2CX.%0Atitle: Influenza Sequencing Activity group%0Ajournal: Unpublished%0Amedline id: %0Apubmed id: %0Acomment:,location: %5B0:1433%5D%0Aauthors: Jernigan%2CD.%2C Wentworth%2CD.%2C Barnes%2CJ.%2C Garten%2CR. and Xu%2CX.%0Atitle: Direct Submission%0Ajournal: Submitted %2817-FEB-2021%29 WHO Collaborating Center for Surveillance%2C Epidemiology and Control of Influenza%2C Influenza Division%2C Centers for Disease Control and Prevention%2C 1600 Clifton Road%2C N.E.%2C Atlanta%2C GA 30333%2C USA%0Amedline id: %0Apubmed id: %0Acomment:;sequence_version=1;source=Influenza A virus;structured_comment=OrderedDict%28%5B%28%27FluData%27%2C OrderedDict%28%5B%28%27EPI_ISOLATE_ID%27%2C %27EPI_ISL_628888%27%29%2C %28%27NAME%27%2C %27A/Wisconsin/588/2019%27%29%2C %28%27TYPE%27%2C %27H1N1%27%29%2C %28%27Segment_name%27%2C %27NA%27%29%2C %28%27HOST_AGE%27%2C %2766%27%29%2C %28%27HOST_GENDER%27%2C %27M%27%29%2C %28%27PASSAGE%27%2C %27C2S1 %282020-05-15%29%27%29%2C %28%27LOCATION%27%2C %27United States / Wisconsin%27%29%2C %28%27COLLECT_DATE%27%2C %2719-Dec-2019%27%29%2C %28%27Lineage%27%2C %27A%28H1N1%29pdm09%27%29%2C %28%27SPECIMEN_ID%27%2C %2719VR015562 ORIGINAL%27%29%2C %28%27SENDER_LAB%27%2C %27Wisconsin State Laboratory of Hygiene
+MW626056.1	feature	gene	9	1418	.	+	.	codon_start=1;gene=NA;gene_name=NA;product=neuraminidase;protein_id=QRV63257.1;
diff --git a/nextclade/dataset_config/h1n1pdm/na/MW626056/reference.fasta b/nextclade/dataset_config/h1n1pdm/na/MW626056/reference.fasta
new file mode 100644
index 00000000..02ca3b74
--- /dev/null
+++ b/nextclade/dataset_config/h1n1pdm/na/MW626056/reference.fasta
@@ -0,0 +1,25 @@
+>MW626056.1 Influenza A virus (A/Wisconsin/588/2019(H1N1)) segment 6 neuraminidase (NA) gene, complete cds
+AGTTTAAAATGAATCCAAACCAAAAGATAATAACCATTGGTTCTATCTGTATGACAATTG
+GAACGGCTAACTTAATATTACAAATTGGAAACATAATCTCAATATGGGTTAGCCACTCAA
+TTCAAATTGGAAATCAAAGCCAGATTGAAACATGCAATAAAAGCGTCATTACTTATGAAA
+ACAACACTTGGGTAAATCAGACATTTGTTAACATCAGCAACACTAACTCTGCTGCTAGAC
+AGTCAGTGGCTTCCGTGAAATTAGCGGGCAATTCCTCTCTCTGCCCTGTTAGTGGATGGG
+CTATATACAGTAAAGACAACAGTGTAAGAATCGGTTCCAAGGGGGATGTGTTTGTCATAA
+GGGAACCATTCATATCATGCTCTCCCTTGGAATGCAGAACCTTCTTCTTGACTCAAGGGG
+CTTTGCTAAATGACAAACATTCCAATGGAACCATTAAAGACAGAAGCCCATATCGAACCC
+TAATGAGCTGTCCTATTGGTGAAGTTCCCTCTCCATACAACTCAAGATTTGAGTCAGTCG
+CTTGGTCAGCAAGTGCTTGTCATGATGGCACCAATTGGCTAACAATTGGAATTTCTGGCC
+CAGACAGTGGGGCAGTGGCTGTGTTAAAATACAATGGCATAATAACAGACACTATCAAGA
+GTTGGAGGAACAAGATATTGAGAACACAAGAGTCTGAATGTGCATGTGTAAATGGTTCTT
+GCTTTACCATAATGACCGATGGACCAAGTGATGGACAGGCCTCATACAAAATCTTCAGAA
+TAGAAAAGGGAAAGATAATCAAATCAGTCGAAATGAAAGCCCCTAATTATCACTATGAAG
+AATGCTCCTGTTACCCTGATTCTAGTGAAATCACATGTGTGTGCAGGGATAACTGGCATG
+GCTCGAATCGACCGTGGGTGTCTTTCAACCAGAATCTGGAATATCAGATGGGATACATAT
+GCAGTGGGGTTTTCGGAGACAATCCACGCCCTAATGATAAGACAGGCAGTTGTGGTCCAG
+TATCGTCTAATGGAGCAAATGGGGTAAAAGGATTTTCATTCAAATACGGCAATGGTGTTT
+GGATAGGGAGAACTAAGAGCATTAGTTCAAGAAAAGGTTTTGAGATGATTTGGGATCCGA
+ATGGATGGACTGGGACTGACAATAAATTCTCAAAAAAGCAAGATATCGTAGGAATAAATG
+AGTGGTCAGGGTATAGCGGGAGTTTTGTTCAGCATCCAGAACTAACAGGGCTGAATTGTA
+TAAGACCTTGCTTCTGGGTTGAACTAATAAGAGGACGACCCGAAGAGAACACAATCTGGA
+CTAGCGGGAGCAGCATATCCTTTTGTGGTGTAGACAGTGACATTGTGGGTTGGTCTTGGC
+CAGACGGTGCTGAGTTGCCATTTACCATTGACAAGTAATTTGTTCAAAAAACT
diff --git a/nextclade/dataset_config/h1n1pdm/na/MW626056/virus_properties.json b/nextclade/dataset_config/h1n1pdm/na/MW626056/virus_properties.json
new file mode 100644
index 00000000..27ec895a
--- /dev/null
+++ b/nextclade/dataset_config/h1n1pdm/na/MW626056/virus_properties.json
@@ -0,0 +1,22 @@
+{
+    "schemaVersion": "1.10.0",
+    "nucMutLabelMap": {},
+    "nucMutLabelMapReverse": {},
+    "aaMotifs": [
+      {
+        "name": "glycosylation",
+        "nameShort": "Glyc.",
+        "nameFriendly": "Glycosylation",
+        "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)",
+        "includeGenes": [
+          {
+            "gene":"NA",
+            "ranges":[{"begin":33, "end":470}]
+          }
+        ],
+        "motifs": [
+          "N[^P][ST]"
+        ]
+      }
+    ]
+}
diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/genemap.gff b/nextclade/dataset_config/h3n2/ha/CY163680/genemap.gff
new file mode 100644
index 00000000..cbb8d4e5
--- /dev/null
+++ b/nextclade/dataset_config/h3n2/ha/CY163680/genemap.gff
@@ -0,0 +1,5 @@
+##gff-version 3
+##sequence-region CY163680.1 1 1737
+CY163680.1	feature	CDS	18	65	.	+	.	name="SigPep"
+CY163680.1	feature	CDS	66	1052	.	+	.	name="HA1"
+CY163680.1	feature	CDS	1053	1715	.	+	.	name="HA2"
diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/reference.fasta b/nextclade/dataset_config/h3n2/ha/CY163680/reference.fasta
new file mode 100644
index 00000000..a4df05fb
--- /dev/null
+++ b/nextclade/dataset_config/h3n2/ha/CY163680/reference.fasta
@@ -0,0 +1,26 @@
+>CY163680.1 Influenza A virus (A/Wisconsin/67/2005(H3N2)) hemagglutinin (HA) gene, complete cds
+GGATAATTCTATTAACCATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAA
+ACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATA
+GTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAG
+GTGGAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGG
+AGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGC
+AACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGG
+AGTTTAACGATGAAAGCTTCAATTGGACTGGAGTCACTCAAAATGGAACAAGCTCTTCTTGCAAAAGGAG
+ATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAAC
+GTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACA
+ATGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAAC
+TGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACA
+ATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCA
+AAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCAT
+CACTCCAAATGGAAGCATTCCCAATGACAAACCATTTCAAAATGTAAACAGGATCACATATGGGGCCTGT
+CCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTA
+GAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGG
+TTTCAGGCATCAAAATTCTGAGGGAATAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCAAT
+CAAATCAATGGGAAGCTGAATAGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAAT
+TCTCAGAAGTAGAAGGGAGAATTCAGGACCTCGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTC
+ATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAAC
+AAACTGTTTGAAAGAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAA
+TATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCATGATGTATACAG
+AGATGAAGCATTAAACAACCGGTTCCAGATCAAAGGCGTTGAGCTGAAGTCAGGATACAAAGATTGGATC
+CTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCT
+GCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGAGTGCATTAATTAAAAACAC
diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/virus_properties.json b/nextclade/dataset_config/h3n2/ha/CY163680/virus_properties.json
new file mode 100644
index 00000000..a9af3dfd
--- /dev/null
+++ b/nextclade/dataset_config/h3n2/ha/CY163680/virus_properties.json
@@ -0,0 +1,26 @@
+{
+    "schemaVersion": "1.10.0",
+    "nucMutLabelMap": {},
+    "nucMutLabelMapReverse": {},
+    "aaMotifs": [
+      {
+        "name": "glycosylation",
+        "nameShort": "Glyc.",
+        "nameFriendly": "Glycosylation",
+        "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)",
+        "includeGenes": [
+          {
+            "gene":"HA1",
+            "ranges":[]
+          },
+          {
+            "gene":"HA2",
+            "ranges":[{"begin":0, "end":186}]
+          }
+        ],
+        "motifs": [
+          "N[^P][ST]"
+        ]
+      }
+    ]
+}
\ No newline at end of file
diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/genemap.gff b/nextclade/dataset_config/h3n2/ha/EPI1857216/genemap.gff
new file mode 100644
index 00000000..f33ff5bc
--- /dev/null
+++ b/nextclade/dataset_config/h3n2/ha/EPI1857216/genemap.gff
@@ -0,0 +1,5 @@
+##gff-version 3
+##sequence-region EPI1857216 1 1718
+EPI1857216	feature	gene	1	48	.	+	.	gene_name="SigPep"
+EPI1857216	feature	gene	49	1035	.	+	.	gene_name="HA1"
+EPI1857216	feature	gene	1036	1698	.	+	.	gene_name="HA2"
diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/reference.fasta b/nextclade/dataset_config/h3n2/ha/EPI1857216/reference.fasta
new file mode 100644
index 00000000..ee3b943f
--- /dev/null
+++ b/nextclade/dataset_config/h3n2/ha/EPI1857216/reference.fasta
@@ -0,0 +1,23 @@
+>EPI_ISL_1563628 | A/Darwin/6/2021 | A / H3N2 |  | 2021-03-16
+atgaagactatcattgctttgagcaacattctatgtcttgttttcgctcaaaaaatacctggaaatgacaatagcacggc
+aacgctgtgccttgggcaccatgcagtaccaaacggaacgatagtgaaaacaatcacaaatgaccgaattgaagttacta
+atgctactgagttggttcagaattcatcaataggtgaaatatgcggcagtcctcatcagatccttgatggagggaactgc
+acactaatagatgctctattgggggaccctcagtgtgacggctttcaaaataaggaatgggacctttttgttgaaagaag
+cagagccaacagcaactgttacccttatgatgtgccggattatgcctcccttaggtcactagttgcctcatccggcacac
+tggagtttaaaaatgaaagcttcaattggactggagtcaaacaaaacggaacaagttctgcgtgcataaggggatctagt
+agtagtttttttagtagattaaattggttgaccagcttaaacaacatatatccagcacagaacgtgactatgccaaacaa
+ggaacaatttgacaaattgtacatttggggggttcaccacccggatacggacaagaaccaaatctccctgtttgctcaat
+catcaggaagaatcacagtatctaccaaaagaagccaacaagctgtaatcccaaatatcggatctagacccagaataagg
+gatatccctagcagaataagcatctattggacaatagtaaaaccgggagacatacttttgattaacagcacagggaatct
+aattgctcctaggggttacttcaaaatacgaagtgggaaaagctcaataatgagatcagatgcacccattggcaaatgta
+agtctgaatgcatcactccaaatggaagcattcccaatgacaaaccgttccaaaatgtaaacaggatcacatacggggcc
+tgtcccagatatgttaagcaaagcaccctgaaattggcaacaggaatgcgaaatgtaccagagaaacaaaccagaggcat
+atttggcgcaatagcgggtttcatagaaaatggatgggagggaatggtggatggttggtacggtttcaggcatcaaaatt
+ctgagggaagaggacaagcagcagatctcaaaagcactcaagcagcaatcgatcaaatcaatgggaagctgaatcgattg
+atcggaaaaaccaacgagaaattccatcagattgaaaaagaattctcagaagtagaaggaagagttcaagaccttgagaa
+atatgttgaggacactaaaatagatctctggtcatacaacgcggagcttcttgttgccctggagaaccaacatacgattg
+acctaactgactcagaaatgaacaaactgtttgaaaaaacaaagaagcaactgagggaaaatgctgaggatatgggaaat
+ggttgtttcaaaatataccacaaatgtgacaatgcctgcataggatcaataagaaatgaaacttatgaccacaatgtgta
+cagggatgaagcattaaacaaccggttccagatcaagggagttgagctgaagtcagggtacaaagattggatcctatgga
+tttcctttgccatgtcatgttttttgctttgtattgctttgttggggttcatcatgtgggcctgccaaaagggcaacatt
+agatgcaacatttgcatttgagtgcattaattaaaaac
diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/virus_properties.json b/nextclade/dataset_config/h3n2/ha/EPI1857216/virus_properties.json
new file mode 100644
index 00000000..6b5cd7dd
--- /dev/null
+++ b/nextclade/dataset_config/h3n2/ha/EPI1857216/virus_properties.json
@@ -0,0 +1,56 @@
+{
+  "schemaVersion": "1.10.0",
+  "nucMutLabelMap": {},
+  "nucMutLabelMapReverse": {},
+  "phenotypeData":[
+    {
+      "name": "RBD",
+      "nameFriendly": "RBD mutations",
+      "description": "This column displays the number of differences between the sequence and the reference at positions identified by Koel et al. (145, 155, 156, 158, 159, 189, and 193 in HA1)",
+      "gene": "HA1",
+      "aaRange": {
+        "begin": 100,
+        "end": 200
+      },
+      "ignore": {
+        "clades": ["outgroup"]
+      },
+      "data": [
+        {
+          "name": "differences",
+          "weight": 1,
+          "locations": {
+            "145": {"default":1},
+            "155": {"default":1},
+            "156": {"default":1},
+            "158": {"default":1},
+            "159": {"default":1},
+            "189": {"default":1},
+            "193": {"default":1}
+          }
+        }
+      ]
+    }
+  ],
+  "aaMotifs": [
+    {
+      "name": "glycosylation",
+      "nameShort": "Glyc.",
+      "nameFriendly": "Glycosylation",
+      "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)",
+      "includeGenes": [
+        {
+          "gene":"HA1",
+          "ranges":[]
+        },
+        {
+          "gene":"HA2",
+          "ranges":[{"begin":0, "end":186}]
+        }
+      ],
+      "motifs": [
+        "N[^P][ST]"
+      ]
+    }
+  ]
+}
diff --git a/nextclade/dataset_config/h3n2/includes.txt b/nextclade/dataset_config/h3n2/includes.txt
new file mode 100644
index 00000000..07ada14b
--- /dev/null
+++ b/nextclade/dataset_config/h3n2/includes.txt
@@ -0,0 +1,7 @@
+A/India/Pun-NIV293349/2021
+A/AbuDhabi/2375/2021
+A/Kenya/101/2021
+A/Victoria/361/2011
+
+A/Singapore/INFTT0001/2021
+A/Nepal/21FL2632/2021
\ No newline at end of file
diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/genemap.gff b/nextclade/dataset_config/h3n2/na/EPI1857215/genemap.gff
new file mode 100644
index 00000000..cdeefd62
--- /dev/null
+++ b/nextclade/dataset_config/h3n2/na/EPI1857215/genemap.gff
@@ -0,0 +1,4 @@
+##gff-version 3
+##sequence-region EPI1857215 1 1433
+EPI1857215	annotation	remark	1	1439	.	.	.	accessions=EPI1857215;
+EPI1857215	feature	gene	8	1417	.	+	.	codon_start=1;gene=NA;gene_name=NA;product=neuraminidase;
diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/reference.fasta b/nextclade/dataset_config/h3n2/na/EPI1857215/reference.fasta
new file mode 100644
index 00000000..a656dd57
--- /dev/null
+++ b/nextclade/dataset_config/h3n2/na/EPI1857215/reference.fasta
@@ -0,0 +1,2 @@
+>EPI1857215
+AGTAAAGATGAATCCAAATCAAAAGATAATAACGATTGGCTCTGTTTCTCTCACAATTTCCACAATATGCTTCTTCATGCAAATTGCCATCCTGATAACTACTGTAACATTGCATTTCAAGCAATATGAATTCAACTCCCCCCCAAATAACCAAGTGATGCTGTGTGAACCAACAATAATAGAAAGAAACATAACAGAGATAGTGTATTTGACCAACACCACCATAGAGAAGGAAATATGCCCCAAACCAGCAGAATACAGAAATTGGTCAAAACCGCAATGTGGCATTACAGGATTTGCACCTTTCTCTAAGGACAATTCGATTAGGCTTTCCGCTGGTGGGGACATCTGGGTGACAAGAGAACCTTATGTGTCATGCGATCTTGACAAGTGTTATCAATTTGCCCTTGGACAGGGAACAACACTAAACAATGTGCATTCAAATAACACAGTACGTGATAGAACCCCTTATCGGACTCTATTGATGAATGAGTTGGGTGTTCCTTTCCATCTGGGGACCAAGCAAGTGTGCATAGCATGGTCCAGCTCAAGTTGTCACGATGGAAAAGCATGGCTGCATGTTTGTATAACGGGGGATGATAAAAATGCAACTGCTAGCTTCATTTACAATGGGAGGCTTGTAGATAGTGTTGTTTCATGGTCCAACGATATTCTCAGAACCCAGGAGTCAGAATGCGTTTGTATCAATGGAACTTGTACAGTAGTAATGACTGATGGAAATGCTACAGGAAAAGCTGATACTAAAATACTATTCATTGAGGAGGGGAAAATCGTTCATACTAGCAAATTGTCAGGAAGTGCTCAGCATGTCGAAGAGTGCTCTTGCTATCCTCGATATCCTGGTGTCAGATGTGTCTGCAGAGACAACTGGAAAGGATCCAACCGGCCCATCATAGATATAAACATAAAGGATCATAGCATTGTTTCCAGGTATGTGTGTTCTGGACTTGTTGGAGACACACCCAGAAAAAGCGACAGCTCCAGCAGTAGCCATTGTTTGAACCCTAACAATGAAAAAGGTGATCATGGAGTGAAAGGCTGGGCCTTTGATGATGGAAATGACGTGTGGATGGGGAGAACAATCAACGAGACGTCACGCTTAGGGTATGAAACCTTCAAAGTCGTTGAAGGCTGGTCCAACCCTAAGTCCAAATTGCAGATAAATAGGCAAGTCATAGTTGACAGAGGCGATAGGTCCGGTTATTCTGGTATTTTCTCTGTTGAAGGCAAAAGCTGCATCAATCGGTGCTTTTATGTGGAGTTGATTAGGGGAAGAAAAGAGGAAACTGAAGTCTTGTGGACTTCAAACAGTATTGTTGTGTTTTGTGGCACCTCAGGTACATATGGAACAGGCTCATGGCCTGATGGGGCGAACCTCAGTCTCATGCATATATAAGCTTTCGCAATTTTAGAAAAAA
diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/virus_properties.json b/nextclade/dataset_config/h3n2/na/EPI1857215/virus_properties.json
new file mode 100644
index 00000000..a56465d3
--- /dev/null
+++ b/nextclade/dataset_config/h3n2/na/EPI1857215/virus_properties.json
@@ -0,0 +1,22 @@
+{
+    "schemaVersion": "1.10.0",
+    "nucMutLabelMap": {},
+    "nucMutLabelMapReverse": {},
+    "aaMotifs": [
+      {
+        "name": "glycosylation",
+        "nameShort": "Glyc.",
+        "nameFriendly": "Glycosylation",
+        "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)",
+        "includeGenes": [
+          {
+            "gene":"NA",
+            "ranges":[{"begin":33, "end":470}]
+          }
+        ],
+        "motifs": [
+          "N[^P][ST]"
+        ]
+      }
+    ]
+}
\ No newline at end of file
diff --git a/nextclade/dataset_config/vic/ha/EPI1926632/genemap.gff b/nextclade/dataset_config/vic/ha/EPI1926632/genemap.gff
new file mode 100644
index 00000000..9c9f0b75
--- /dev/null
+++ b/nextclade/dataset_config/vic/ha/EPI1926632/genemap.gff
@@ -0,0 +1,5 @@
+##gff-version 3
+##sequence-region EPI1926632 1 1847
+EPI1926632	feature	gene	20	64	.	+	.	gene_name="SigPep"
+EPI1926632	feature	gene	65	1096	.	+	.	gene_name="HA1"
+EPI1926632	feature	gene	1097	1765	.	+	.	gene_name="HA2"
diff --git a/nextclade/dataset_config/vic/ha/EPI1926632/reference.fasta b/nextclade/dataset_config/vic/ha/EPI1926632/reference.fasta
new file mode 100644
index 00000000..6bd01349
--- /dev/null
+++ b/nextclade/dataset_config/vic/ha/EPI1926632/reference.fasta
@@ -0,0 +1,25 @@
+>B/Austria/1359417/2021 | EPI_ISL_6307006 | B / H0N0 | Victoria | 2021-01-09
+attttctaatatccacaaaatgaaggcaataattgtactactcatggtagtaacatccaatgcagatcgaatctgcactg
+ggataacatcgtcaaactcaccacatgtcgtcaaaactgctactcaaggggaggtcaatgtgactggtgtaataccactg
+acaacaacacccaccaaatctcattttgcaaatctcaaaggaacagaaaccagggggaaactatgcccaaaatgcctaaa
+ctgcacagatctggatgtagccttgggcagaccaaaatgcacagggaaaataccctctgcaagggtttcaatactccatg
+aagtcagacctgttacatctgggtgctttcctataatgcatgatagaacaaaaattagacagctgcctaaccttctccga
+ggatacgaacatgtcaggttatcaactcacaacgttatcaatacagaagatgcaccaggaggaccctacgaaattggaac
+ctcagggtcttgcctcaacattaccaatggaaaaggattcttcgcaacaatggcttgggccgtcccaaaaaacaaaacag
+caacaaatccattaacaatagaagtaccatacatttgtacagaagaagaagaccaaattaccgtttgggggttccactct
+gacgacgagacccaaatggcaaggctctatggggattcaaagccccagaagttcacctcatctgccaacggagtgaccac
+acactacgtctcacagattggtggctttccaaatcaaacagaagacggaggactaccacaaagtggcagaattgttgttg
+attacatggtgcaaaaatctggaaaaacaggaacaattacctatcaaagaggtattttattgcctcaaaaggtgtggtgc
+gcaagtggcaagagcaaggtaataaaaggatccttgcccttaattggagaagcagattgcctccatgaaaaatacggtgg
+attaaacaaaagcaagccttactacacaggggaacatgcaaaggccataggaaattgcccaatatgggtgaaaacaccct
+tgaagctggccaatggaaccaaatatagacctcctgcaaaactattaaaggaaagaggtttcttcggagccattgctggt
+ttcttagagggaggatgggaaggaatgattgcaggttggcacggatacacatcccatggggcacatggagtagcggtggc
+agctgaccttaagagcactcaggaggccataaacaagataacaaaaaatctcaactctttgagtgagctggaagtaaaga
+atcttcaaagactaagcggtgccatggatgaactccacaacgaaatactagaactagatgagaaagtggatgatctcaga
+gctgatacaataagctcacagatagaactcgcagtcctgctttccaatgaaggaataataaacagtgaagatgaacatct
+cttggcgcttgaaagaaagctgaagaaaatgctgggcccctctgctgtagagataggaaatggatgctttgaaaccaaac
+acaagtgcaaccagacctgtctcgacagaatagctgctggtacctttgatgcaggagaattttctctccccacctttgat
+tcactgaatattactgctgcatctttaaatgacgatggattggacaatcatactatactgctttactactcaactgctgc
+ctccagtttggctgtaacactgatgatagctatctttgttgtttatatggtctccagagacaatgtttcttgctccattt
+gtctataagggaagttaagccctgtattttcctttattgtagtgcttgtttgcttgttgtcattacaaagaaacgttatt
+gaaaaat
diff --git a/nextclade/dataset_config/vic/ha/EPI1926632/virus_properties.json b/nextclade/dataset_config/vic/ha/EPI1926632/virus_properties.json
new file mode 100644
index 00000000..d80db6a5
--- /dev/null
+++ b/nextclade/dataset_config/vic/ha/EPI1926632/virus_properties.json
@@ -0,0 +1,26 @@
+{
+  "schemaVersion": "1.10.0",
+  "nucMutLabelMap": {},
+  "nucMutLabelMapReverse": {},
+  "aaMotifs": [
+    {
+      "name": "glycosylation",
+      "nameShort": "Glyc.",
+      "nameFriendly": "Glycosylation",
+      "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)",
+      "includeGenes": [
+        {
+          "gene":"HA1",
+          "ranges":[]
+        },
+        {
+          "gene":"HA2",
+          "ranges":[{"begin":0, "end":187}]
+        }
+      ],
+      "motifs": [
+        "N[^P][ST]"
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/nextclade/dataset_config/vic/ha/KX058884/genemap.gff b/nextclade/dataset_config/vic/ha/KX058884/genemap.gff
new file mode 100644
index 00000000..6b0fe595
--- /dev/null
+++ b/nextclade/dataset_config/vic/ha/KX058884/genemap.gff
@@ -0,0 +1,5 @@
+##gff-version 3
+##sequence-region KX058884.1 1 1885
+KX058884.1	feature	gene	34	78	.	+	.	gene_name="SigPep"
+KX058884.1	feature	gene	79	1119	.	+	.	gene_name="HA1"
+KX058884.1	feature	gene	1120	1791	.	+	.	gene_name="HA2"
diff --git a/nextclade/dataset_config/vic/ha/KX058884/reference.fasta b/nextclade/dataset_config/vic/ha/KX058884/reference.fasta
new file mode 100644
index 00000000..6d22e3ea
--- /dev/null
+++ b/nextclade/dataset_config/vic/ha/KX058884/reference.fasta
@@ -0,0 +1,28 @@
+>KX058884.1 Influenza B virus (B/Brisbane/60/2008) segment 4 hemagglutinin (HA) gene, complete cds
+AGCAGAAGCAGAGCATTTTCTAATATCCACAAAATGAAGGCAATAATTGTACTACTCATGGTAGTAACAT
+CCAATGCAGATCGAATCTGCACTGGGATAACATCGTCAAACTCACCACATGTCGTCAAAACTGCTACTCA
+AGGGGAGGTCAATGTGACTGGTGTAATACCACTGACAACAACACCCACCAAATCTCATTTTGCAAATCTC
+AAAGGAACAGAAACCAGGGGGAAACTATGCCCAAAATGCCTCAACTGCACAGATCTGGACGTAGCCTTGG
+GCAGACCAAAATGCACGGGGAAAATACCCTCGGCAAGAGTTTCAATACTCCATGAAGTCAGACCTGTTAC
+ATCTGGGTGCTTTCCTATAATGCACGACAGAACAAAAATTAGACAGCTGCCTAACCTTCTCCGAGGATAC
+GAACATATCAGGTTATCAACCCATAACGTTATCAATGCAGAAAATGCACCAGGAGGACCCTACAAAATTG
+GAACCTCAGGGTCTTGCCCTAACATTACCAATGGAAACGGATTTTTCGCAACAATGGCTTGGGCCGTCCC
+AAAAAACGACAAAAACAAAACAGCAACAAATCCATTAACAATAGAAGTACCATACATTTGTACAGAAGGA
+GAAGACCAAATTACCGTTTGGGGGTTCCACTCTGACAACGAGGCCCAAATGGCAAAGCTCTATGGGGACT
+CAAAGCCCCAGAAGTTCACCTCATCTGCCAACGGAGTGACCACACATTACGTTTCACAGATTGGTGGCTT
+CCCAAATCAAACAGAAGACGGAGGACTACCACAAAGTGGTAGAATTGTTGTTGATTACATGGTGCAAAAA
+TCTGGGAAAACAGGAACAATTACCTATCAAAGGGGTATTTTATTGCCTCAAAAGGTGTGGTGCGCAAGTG
+GCAGGAGCAAGGTAATAAAAGGATCCTTGCCTTTAATTGGAGAAGCAGATTGCCTCCACGAAAAATACGG
+TGGATTAAACAAAAGCAAGCCTTACTACACAGGGGAACATGCAAAGGCCATAGGAAATTGCCCAATATGG
+GTGAAAACACCCTTGAAGCTGGCCAATGGAACCAAATATAGACCTCCTGCAAAACTATTAAAGGAAAGGG
+GTTTCTTCGGAGCTATTGCTGGTTTCTTAGAAGGAGGATGGGAAGGAATGATTGCAGGTTGGCACGGATA
+CACATCCCATGGGGCACATGGAGTAGCGGTGGCAGCAGACCTTAAGAGCACTCAAGAGGCCATAAACAAG
+ATAACAAAAAATCTCAACTCTTTGAGTGAGCTGGAAGTAAAGAATCTTCAAAGACTAAGCGGTGCCATGG
+ATGAACTCCACAACGAAATACTAGAACTAGATGAGAAAGTGGATGATCTCAGAGCTGATACAATAAGCTC
+ACAAATAGAACTCGCAGTCCTGCTTTCCAATGAAGGAATAATAAACAGTGAAGATGAACATCTCTTGGCG
+CTTGAAAGAAAGCTGAAGAAAATGCTGGGCCCCTCTGCTGTAGAGATAGGGAATGGATGCTTTGAAACCA
+AACACAAGTGCAACCAGACCTGTCTCGACAGAATAGCTGCTGGTACCTTTGATGCAGGAGAATTTTCTCT
+CCCCACCTTTGATTCACTGAATATTACTGCTGCATCTTTAAATGACGATGGATTGGATAATCATACTATA
+CTGCTTTACTACTCAACTGCTGCCTCCAGTTTGGCTGTAACACTGATGATAGCTATCTTTGTTGTTTATA
+TGGTCTCCAGAGACAATGTTTCTTGCTCCATCTGTCTATAAGGGAAGTTAAGCCCTGTATTTTCCTTTAT
+TGTAGTGCTTGTTTACTTGTTGTCATTACAAAGAAACGTTATTGAAAAATGCTCTTGTTACTACT
diff --git a/nextclade/dataset_config/vic/ha/KX058884/virus_properties.json b/nextclade/dataset_config/vic/ha/KX058884/virus_properties.json
new file mode 100644
index 00000000..a9af3dfd
--- /dev/null
+++ b/nextclade/dataset_config/vic/ha/KX058884/virus_properties.json
@@ -0,0 +1,26 @@
+{
+    "schemaVersion": "1.10.0",
+    "nucMutLabelMap": {},
+    "nucMutLabelMapReverse": {},
+    "aaMotifs": [
+      {
+        "name": "glycosylation",
+        "nameShort": "Glyc.",
+        "nameFriendly": "Glycosylation",
+        "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)",
+        "includeGenes": [
+          {
+            "gene":"HA1",
+            "ranges":[]
+          },
+          {
+            "gene":"HA2",
+            "ranges":[{"begin":0, "end":186}]
+          }
+        ],
+        "motifs": [
+          "N[^P][ST]"
+        ]
+      }
+    ]
+}
\ No newline at end of file
diff --git a/nextclade/dataset_config/vic/includes.txt b/nextclade/dataset_config/vic/includes.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/nextclade/dataset_config/vic/na/CY073894/genemap.gff b/nextclade/dataset_config/vic/na/CY073894/genemap.gff
new file mode 100644
index 00000000..a1ce3b32
--- /dev/null
+++ b/nextclade/dataset_config/vic/na/CY073894/genemap.gff
@@ -0,0 +1,4 @@
+##gff-version 3
+##sequence-region CY073894.1 1 1401
+CY073894.1	annotation	remark	1	1401	.	.	.	accessions=CY073894;data_file_division=VRL;date=25-JUL-2016;keywords=;molecule_type=cRNA;organism=Influenza B virus %28B/Brisbane/60/2008%29;references=location: %5B0:1401%5D%0Aauthors: Sabaiduc%2CS.%2C Skowronski%2CD.%2C Petric%2CM. and Chan%2CT.%0Atitle: %0Ajournal: Unpublished%0Amedline id: %0Apubmed id: %0Acomment:,location: %5B0:1401%5D%0Aauthors: Sabaiduc%2CS.%2C Skowronski%2CD.%2C Gardy%2CJ.%2C Petric%2CM. and Chan%2CT.%0Atitle: Direct Submission%0Ajournal: Submitted %2816-SEP-2010%29 Genome Research Laboratory%2C British Columbia Centre for Disease Control%2C 655 West 12th Ave.%2C Vancouver%2C BC V5Z4R4%2C Canada%0Amedline id: %0Apubmed id: %0Acomment:;sequence_version=1;source=Influenza B virus %28B/Brisbane/60/2008%29;taxonomy=Viruses,Riboviria,Orthornavirae,Negarnaviricota,Polyploviricotina,Insthoviricetes,Articulavirales,Orthomyxoviridae,Betainfluenzavirus;topology=linear
+CY073894.1	feature	gene	1	1401	.	+	.	codon_start=1;gene=NA;gene_name=NA;product=neuraminidase;protein_id=ADN32819.1;translation=MLPSTIQTLTLFLTSGGVLLSLYVSASLSYLLYSDILLKFSPTEITAPTMPLDCANASNVQAVNRSATKGVTLLLPEPEWTYPRLSCPGSTFQKALLISPHRFGETKGNSAPLIIREPFIACGPNECKHFALTHYAAQPGGYYNGTRGDRNKLRHLISVKLGKIPTVENSIFHMAAWSGSACHDGKEWTYIGVDGPDNNALLKVKYGEAYTDTYHSYANKILRTQESACNCIGGNCYLMITDGSASGVSECRFLKIREGRIIKEIFPTGRVKHTEECTCGFASNKTIECACRDNSYTAKRPFVKLNVETDTAEIRLMCTDTYLDTPRPNDGSITGPCESNGDKGSGGIKGGFVHQRMESKIGRWYSRTMSKTERMGMGLYVKYDGDPWADSDALAFSGVMVSMKEPGWYSFGFEIKDKKCDVPCIGIEMVHDGGKETWHSAATAIYCLMGSGQLLWDTVTGVDMAL
diff --git a/nextclade/dataset_config/vic/na/CY073894/reference.fasta b/nextclade/dataset_config/vic/na/CY073894/reference.fasta
new file mode 100644
index 00000000..1bdc8402
--- /dev/null
+++ b/nextclade/dataset_config/vic/na/CY073894/reference.fasta
@@ -0,0 +1,25 @@
+>CY073894.1 Influenza B virus (B/Brisbane/60/2008) segment 6 sequence
+ATGCTACCTTCAACTATACAAACGTTAACCCTATTTCTCACATCAGGGGGAGTATTATTA
+TCACTATATGTGTCAGCTTCATTATCATACTTACTATATTCGGATATATTGCTAAAATTC
+TCACCAACAGAAATAACTGCACCAACAATGCCATTGGATTGTGCAAACGCATCAAATGTT
+CAGGCTGTGAACCGTTCTGCAACAAAAGGGGTGACACTTCTTCTCCCAGAACCGGAGTGG
+ACATACCCGCGTTTATCTTGCCCGGGCTCAACCTTTCAGAAAGCACTCCTAATTAGCCCT
+CATAGATTCGGAGAAACCAAAGGAAACTCAGCTCCCTTGATAATAAGGGAACCTTTTATT
+GCTTGTGGACCAAATGAATGCAAACACTTTGCTCTAACCCATTATGCAGCCCAACCAGGG
+GGATACTACAATGGAACAAGAGGAGACAGAAACAAGCTGAGGCATCTAATTTCAGTCAAA
+TTGGGCAAAATCCCAACAGTAGAAAACTCCATTTTCCACATGGCAGCATGGAGCGGGTCC
+GCGTGCCATGATGGTAAGGAATGGACATATATCGGAGTTGATGGCCCTGACAATAATGCA
+TTGCTCAAAGTAAAATATGGAGAAGCATATACTGACACATACCATTCCTATGCAAACAAA
+ATCCTAAGAACACAAGAAAGTGCCTGCAATTGCATCGGGGGAAATTGTTATCTTATGATA
+ACTGATGGCTCAGCTTCAGGTGTTAGTGAATGCAGATTTCTTAAGATTCGAGAGGGCCGA
+ATAATAAAAGAAATATTTCCAACAGGAAGAGTAAAACACACTGAGGAATGCACATGCGGA
+TTTGCCAGCAATAAAACCATAGAATGTGCCTGTAGAGATAACAGTTACACAGCAAAAAGA
+CCTTTTGTCAAATTAAACGTGGAGACTGATACAGCAGAAATAAGATTGATGTGCACAGAT
+ACTTATTTGGACACCCCCAGACCAAACGATGGAAGCATAACAGGCCCTTGTGAATCTAAT
+GGGGACAAAGGGAGTGGAGGCATCAAGGGAGGATTTGTTCATCAAAGAATGGAATCCAAG
+ATTGGAAGGTGGTACTCTCGAACGATGTCTAAAACTGAAAGGATGGGGATGGGACTGTAT
+GTCAAGTATGATGGAGACCCATGGGCTGACAGTGATGCCCTAGCTTTTAGTGGAGTAATG
+GTTTCAATGAAAGAACCTGGTTGGTACTCCTTTGGCTTCGAAATAAAAGATAAGAAATGC
+GATGTCCCCTGTATTGGGATAGAGATGGTACATGATGGTGGAAAAGAGACTTGGCACTCA
+GCAGCAACAGCCATTTACTGTTTAATGGGCTCAGGACAGCTGCTGTGGGACACTGTCACA
+GGTGTTGACATGGCTCTGTAA
diff --git a/nextclade/dataset_config/vic/na/CY073894/virus_properties.json b/nextclade/dataset_config/vic/na/CY073894/virus_properties.json
new file mode 100644
index 00000000..3bfeb859
--- /dev/null
+++ b/nextclade/dataset_config/vic/na/CY073894/virus_properties.json
@@ -0,0 +1,22 @@
+{
+    "schemaVersion": "1.10.0",
+    "nucMutLabelMap": {},
+    "nucMutLabelMapReverse": {},
+    "aaMotifs": [
+      {
+        "name": "glycosylation",
+        "nameShort": "Glyc.",
+        "nameFriendly": "Glycosylation",
+        "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)",
+        "includeGenes": [
+          {
+            "gene":"NA",
+            "ranges":[{"begin":33, "end":466}]
+          }
+        ],
+        "motifs": [
+          "N[^P][ST]"
+        ]
+      }
+    ]
+}
\ No newline at end of file
diff --git a/nextclade/dataset_config/yam/ha/JN993010/genemap.gff b/nextclade/dataset_config/yam/ha/JN993010/genemap.gff
new file mode 100644
index 00000000..1d67c700
--- /dev/null
+++ b/nextclade/dataset_config/yam/ha/JN993010/genemap.gff
@@ -0,0 +1,5 @@
+##gff-version 3
+##sequence-region JN993010.1 1 1755
+JN993010.1	feature	gene	1	45	.	+	.	gene_name="SigPep"
+JN993010.1	feature	gene	46	1083	.	+	.	gene_name="HA1"
+JN993010.1	feature	gene	1084	1755	.	+	.	gene_name="HA2"
diff --git a/nextclade/dataset_config/yam/ha/JN993010/reference.fasta b/nextclade/dataset_config/yam/ha/JN993010/reference.fasta
new file mode 100644
index 00000000..ace128e2
--- /dev/null
+++ b/nextclade/dataset_config/yam/ha/JN993010/reference.fasta
@@ -0,0 +1,2 @@
+>JN993010.1 Influenza B virus (B/Wisconsin/01/2010) segment 4 hemagglutinin (HA) gene, complete cds
+ATGAAGGCAATAATTGTACTACTCATGGTAGTAACATCCAATGCAGATCGAATCTGCACTGGGATAACATCTTCAAACTCACCTCATGTGGTCAAAACAGCTACTCAAGGGGAGGTCAATGTGACTGGCGTGATACCACTGACAACAACACCAACAAAATCTTATTTTGCAAATCTCAAAGGAACAAGGACCAGAGGGAAACTATGCCCGGACTGTCTCAACTGTACAGATCTGGATGTGGCCTTGGGCAGGCCAATGTGTGTGGGGACCACACCTTCTGCTAAAGCTTCAATACTCCACGAGGTCAGACCTGTTACATCCGGGTGCTTTCCTATAATGCACGACAGAACAAAAATCAGGCAACTACCCAATCTTCTCAGAGGATATGAAAATATCAGGTTATCAACCCAAAACGTTATCGATGCAGAAAAAGCACCAGGAGGACCCTACAGACTTGGAACCTCAGGATCTTGCCCTAACGCTACCAGTAAAATCGGATTTTTTGCAACAATGGCTTGGGCTGTCCCAAAGGACAACTACAAAAATGCAACGAACCCACTAACAGTAGAAGTACCATACATTTGTACAGAAGGGGAAGACCAAATTACTGTTTGGGGGTTCCATTCAGATAACAAAACCCAAATGAAGAGCCTCTATGGAGACTCAAATCCTCAAAAGTTCACCTCATCTGCTAATGGAGTAACCACACATTATGTTTCTCAGATTGGCGACTTCCCAGATCAAACAGAAGACGGAGGACTACCACAAAGCGGCAGAATTGTTGTTGATTACATGATGCAAAAACCTGGGAAAACAGGAACAATTGTCTATCAAAGAGGTGTTTTGTTGCCTCAAAAGGTGTGGTGCGCGAGTGGCAGGAGCAAAGTAATAAAAGGGTCATTGCCTTTAATTGGTGAAGCAGATTGCCTTCATGAAAAATACGGTGGATTAAACAAAAGCAAGCCTTACTACACAGGAGAACATGCAAAAGCCATAGGAAATTGCCCAATATGGGTAAAAACACCTTTGAAGCTTGCCAATGGAACCAAATATAGACCTCCTGCAAAACTATTGAAGGAAAGGGGTTTCTTCGGAGCTATTGCTGGTTTCCTAGAAGGAGGATGGGAAGGAATGATTGCAGGTTGGCACGGATACACATCTCACGGAGCACATGGAGTGGCAGTGGCGGCAGACCTTAAGAGTACACAAGAAGCTATAAATAAGATAACAAAAAATCTCAATTCTTTGAGTGAGCTAGAAGTAAAGAACCTTCAAAGACTAAGTGGTGCCATGGATGAACTCCACAACGAAATACTCGAGCTGGATGAGAAAGTGGATGATCTCAGAGCTGACACTATAAGCTCACAAATAGAACTTGCAGTCTTGCTTTCCAACGAAGGAATAATAAACAGTGAAGACGAGCATCTATTGGCACTTGAGAGAAAACTAAAGAAAATGCTGGGTCCCTCTGCTGTAGACATAGGAAACGGATGCTTCGAAACCAAACACAAATGCAACCAGACCTGCTTAGACAGGATAGCTGCTGGCACCTTTAATGCAGGAGAATTTTCTCTCCCCACTTTTGATTCATTGAACATTACTGCTGCATCTTTAAATGATGATGGATTGGATAACCATACTATACTGCTCTATTACTCAACTGCTGCTTCTAGTTTGGCTGTAACATTAATGCTAGCTATTTTTATTGTTTATATGGTCTCCAGAGACAACGTTTCATGCTCCATCTGTCTATAA

From e9afb6d87093a8d26dbdf96de00b7d05b331df47 Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Mon, 25 Sep 2023 14:30:27 +0200
Subject: [PATCH 03/26] rename annotation and virus properties files

---
 nextclade/Snakefile                           | 59 +++++++++++--------
 .../CY121680/{genemap.gff => annotation.gff}  |  0
 .../{virus_properties.json => pathogen.json}  |  0
 .../MW626062/{genemap.gff => annotation.gff}  |  0
 .../{virus_properties.json => pathogen.json}  |  0
 .../MW626056/{genemap.gff => annotation.gff}  |  0
 .../{virus_properties.json => pathogen.json}  |  0
 .../CY163680/{genemap.gff => annotation.gff}  |  0
 .../{virus_properties.json => pathogen.json}  |  0
 .../{genemap.gff => annotation.gff}           |  0
 .../{virus_properties.json => pathogen.json}  |  0
 .../{genemap.gff => annotation.gff}           |  0
 .../{virus_properties.json => pathogen.json}  |  0
 .../{genemap.gff => annotation.gff}           |  0
 .../{virus_properties.json => pathogen.json}  |  0
 .../KX058884/{genemap.gff => annotation.gff}  |  0
 .../{virus_properties.json => pathogen.json}  |  0
 .../CY073894/{genemap.gff => annotation.gff}  |  0
 .../{virus_properties.json => pathogen.json}  |  0
 .../JN993010/{genemap.gff => annotation.gff}  |  0
 20 files changed, 33 insertions(+), 26 deletions(-)
 rename nextclade/dataset_config/h1n1pdm/ha/CY121680/{genemap.gff => annotation.gff} (100%)
 rename nextclade/dataset_config/h1n1pdm/ha/CY121680/{virus_properties.json => pathogen.json} (100%)
 rename nextclade/dataset_config/h1n1pdm/ha/MW626062/{genemap.gff => annotation.gff} (100%)
 rename nextclade/dataset_config/h1n1pdm/ha/MW626062/{virus_properties.json => pathogen.json} (100%)
 rename nextclade/dataset_config/h1n1pdm/na/MW626056/{genemap.gff => annotation.gff} (100%)
 rename nextclade/dataset_config/h1n1pdm/na/MW626056/{virus_properties.json => pathogen.json} (100%)
 rename nextclade/dataset_config/h3n2/ha/CY163680/{genemap.gff => annotation.gff} (100%)
 rename nextclade/dataset_config/h3n2/ha/CY163680/{virus_properties.json => pathogen.json} (100%)
 rename nextclade/dataset_config/h3n2/ha/EPI1857216/{genemap.gff => annotation.gff} (100%)
 rename nextclade/dataset_config/h3n2/ha/EPI1857216/{virus_properties.json => pathogen.json} (100%)
 rename nextclade/dataset_config/h3n2/na/EPI1857215/{genemap.gff => annotation.gff} (100%)
 rename nextclade/dataset_config/h3n2/na/EPI1857215/{virus_properties.json => pathogen.json} (100%)
 rename nextclade/dataset_config/vic/ha/EPI1926632/{genemap.gff => annotation.gff} (100%)
 rename nextclade/dataset_config/vic/ha/EPI1926632/{virus_properties.json => pathogen.json} (100%)
 rename nextclade/dataset_config/vic/ha/KX058884/{genemap.gff => annotation.gff} (100%)
 rename nextclade/dataset_config/vic/ha/KX058884/{virus_properties.json => pathogen.json} (100%)
 rename nextclade/dataset_config/vic/na/CY073894/{genemap.gff => annotation.gff} (100%)
 rename nextclade/dataset_config/vic/na/CY073894/{virus_properties.json => pathogen.json} (100%)
 rename nextclade/dataset_config/yam/ha/JN993010/{genemap.gff => annotation.gff} (100%)

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index ab5819d8..5e5ee0e9 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -8,7 +8,17 @@ wildcard_constraints:
     segment = r'pb2|pb1|pa|ha|np|na|mp|ns',
     reference="[^_/]+",
 
-
+def all_builds(w):
+    builds = []
+    for lineage in config["builds"]:
+        for segment in config["builds"][lineage]:
+            for ref in config["builds"][lineage][segment]["refs"]:
+                builds.append(f"datasets/{lineage}/{segment}/{ref}/tree.json")
+    return builds
+
+rule all:
+    input:
+        all_builds
 
 
 rule download_sequences:
@@ -84,13 +94,13 @@ rule subsample:
         sampled_sequences="build/{lineage}/{segment}/{reference}/subsample.fasta",
         sampled_strains="build/{lineage}/{segment}/{reference}/subsample.txt",
     params:
-        filter_arguments=lambda w: config["builds"][w.lineage][w.segment][
+        filter_arguments=lambda w: config["builds"][w.lineage][w.segment]["refs"][
             w.reference
         ]["filter"],
-        reference_EPI_ISL=lambda w: config["builds"][w.lineage][w.segment][
+        reference_EPI_ISL=lambda w: config["builds"][w.lineage][w.segment]["refs"][
             w.reference
         ]["reference_EPI_ISL"],
-        other_include = lambda w:config["builds"][w.lineage][w.segment][w.reference].get("include_file","")
+        other_include = lambda w:config["builds"][w.lineage][w.segment]["refs"][w.reference].get("include_file","")
     shell:
         """
         augur filter \
@@ -106,8 +116,8 @@ rule subsample:
 rule align:
     input:
         sequences="build/{lineage}/{segment}/{reference}/subsample.fasta",
-        annotation="references/{lineage}/{segment}/{reference}/annotation.gff",
-        reference="references/{lineage}/{segment}/{reference}/reference.fasta",
+        annotation="dataset_config/{lineage}/{segment}/{reference}/annotation.gff",
+        reference="dataset_config/{lineage}/{segment}/{reference}/reference.fasta",
     output:
         alignment="build/{lineage}/{segment}/{reference}/align.aligned.fasta",
         insertions="build/{lineage}/{segment}/{reference}/align.insertions.csv",
@@ -201,8 +211,8 @@ rule ancestral:
     input:
         tree=rules.refine.output.tree,
         alignment=rules.align.output.alignment,
-        annotation="references/{lineage}/{segment}/{reference}/annotation.gff",
-        reference="references/{lineage}/{segment}/{reference}/reference.fasta",
+        annotation="dataset_config/{lineage}/{segment}/{reference}/annotation.gff",
+        reference="dataset_config/{lineage}/{segment}/{reference}/reference.fasta",
     output:
         node_data="build/{lineage}/{segment}/{reference}/muts.json",
     params:
@@ -210,7 +220,7 @@ rule ancestral:
         genes=genes,
         translations=lambda w: expand(
             "build/{lineage}/{segment}/{reference}/aligned.gene.{genes}.fasta",
-            strain=w.lineage,
+            lineage=w.lineage,
             segment=w.segment,
             genes=genes(w),
             reference=w.reference,
@@ -234,14 +244,14 @@ rule clades:
         "Adding internal clade labels"
     input:
         tree=rules.refine.output.tree,
-        nucs=rules.ancestral.output.node_data,
+        muts=rules.ancestral.output.node_data,
         clades=rules.offset_clades.output,
     output:
-        node_data="build/{lineage}/{segment}/{reference}/{clade}.json",
+        node_data="build/{lineage}/{segment}/{reference}/clade_{clade}.json",
     shell:
         """
         augur clades --tree {input.tree} \
-            --mutations {input.nuc_muts} {input.aa_muts} \
+            --mutations {input.muts} \
             --clades {input.clades} \
             --output-node-data {output.node_data} \
             > /dev/null
@@ -250,15 +260,12 @@ rule clades:
 def get_node_data(w):
     node_data = [
         rules.refine.output.node_data,
-        "build/{lineage}/{segment}/{reference}/aa_muts_adapted.json".format(**w),
-        "build/{lineage}/{segment}/{reference}/nuc_muts_adapted.json".format(**w),
+        "build/{lineage}/{segment}/{reference}/muts.json".format(**w),
     ]
 
-    for clade in config["builds"][w.lineage][w.segment][w.reference]["clades"]:
-        node_data.append("build/{lineage}/{segment}/{reference}/".format(**w) + f'/{clade}.json')
-
-    if "clade_contractions" in config["builds"][w.lineage][w.segment][w.reference]:
-        node_data.append("build/{lineage}/{segment}/{reference}/clades-short.json".format(**w))
+    for clade in config["builds"][w.lineage][w.segment]["clade_systems"]:
+        print(clade)
+        node_data.append("build/{lineage}/{segment}/{reference}/".format(**w) + f'clade_{clade}.json')
 
     return node_data
 
@@ -270,9 +277,9 @@ rule export:
         tree=rules.refine.output.tree,
         metadata=rules.parse.output.metadata,
         node_data = get_node_data,
-        auspice_config=lambda w: config["files"]["auspice_config_shortclade"] if "clade_contractions" in config["builds"][w.lineage][w.segment][w.reference] else config["files"]["auspice_config"],
+        auspice_config= "config/auspice_config.json",
     output:
-        auspice_json="auspice/{lineage}/{segment}/{reference}/auspice_raw.json",
+        auspice_json="build/{lineage}/{segment}/{reference}/auspice.json",
     params:
         fields="region strainName country date EPI_ISL",
         date=datetime.datetime.utcnow().strftime("%Y-%m-%d"),
@@ -311,11 +318,11 @@ rule generate_sample_sequences:
 rule make_dataset:
     input:
         sequences="build/{lineage}/{segment}/{reference}/sample_sequences.fasta",
-        auspice_json="auspice/{lineage}/{segment}/{reference}/auspice.json",
-        annotation="references/{lineage}/{segment}/{reference}/annotation.gff",
-        reference="references/{lineage}/{segment}/{reference}/reference.fasta",
-        pathogen_json="references/{lineage}/{segment}/{reference}/pathogen.json",
-        additional_config="references/{lineage}/{segment}/{reference}/virus_properties.json",
+        auspice_json="build/{lineage}/{segment}/{reference}/auspice.json",
+        annotation="dataset_config/{lineage}/{segment}/{reference}/annotation.gff",
+        reference="dataset_config/{lineage}/{segment}/{reference}/reference.fasta",
+        pathogen_json="dataset_config/{lineage}/{segment}/{reference}/pathogen.json",
+        additional_config="dataset_config/{lineage}/{segment}/{reference}/pathogen.json",
     output:
         sequences="datasets/{lineage}/{segment}/{reference}/example_sequences.fasta",
         tree="datasets/{lineage}/{segment}/{reference}/tree.json",
diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/genemap.gff b/nextclade/dataset_config/h1n1pdm/ha/CY121680/annotation.gff
similarity index 100%
rename from nextclade/dataset_config/h1n1pdm/ha/CY121680/genemap.gff
rename to nextclade/dataset_config/h1n1pdm/ha/CY121680/annotation.gff
diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/virus_properties.json b/nextclade/dataset_config/h1n1pdm/ha/CY121680/pathogen.json
similarity index 100%
rename from nextclade/dataset_config/h1n1pdm/ha/CY121680/virus_properties.json
rename to nextclade/dataset_config/h1n1pdm/ha/CY121680/pathogen.json
diff --git a/nextclade/dataset_config/h1n1pdm/ha/MW626062/genemap.gff b/nextclade/dataset_config/h1n1pdm/ha/MW626062/annotation.gff
similarity index 100%
rename from nextclade/dataset_config/h1n1pdm/ha/MW626062/genemap.gff
rename to nextclade/dataset_config/h1n1pdm/ha/MW626062/annotation.gff
diff --git a/nextclade/dataset_config/h1n1pdm/ha/MW626062/virus_properties.json b/nextclade/dataset_config/h1n1pdm/ha/MW626062/pathogen.json
similarity index 100%
rename from nextclade/dataset_config/h1n1pdm/ha/MW626062/virus_properties.json
rename to nextclade/dataset_config/h1n1pdm/ha/MW626062/pathogen.json
diff --git a/nextclade/dataset_config/h1n1pdm/na/MW626056/genemap.gff b/nextclade/dataset_config/h1n1pdm/na/MW626056/annotation.gff
similarity index 100%
rename from nextclade/dataset_config/h1n1pdm/na/MW626056/genemap.gff
rename to nextclade/dataset_config/h1n1pdm/na/MW626056/annotation.gff
diff --git a/nextclade/dataset_config/h1n1pdm/na/MW626056/virus_properties.json b/nextclade/dataset_config/h1n1pdm/na/MW626056/pathogen.json
similarity index 100%
rename from nextclade/dataset_config/h1n1pdm/na/MW626056/virus_properties.json
rename to nextclade/dataset_config/h1n1pdm/na/MW626056/pathogen.json
diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/genemap.gff b/nextclade/dataset_config/h3n2/ha/CY163680/annotation.gff
similarity index 100%
rename from nextclade/dataset_config/h3n2/ha/CY163680/genemap.gff
rename to nextclade/dataset_config/h3n2/ha/CY163680/annotation.gff
diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/virus_properties.json b/nextclade/dataset_config/h3n2/ha/CY163680/pathogen.json
similarity index 100%
rename from nextclade/dataset_config/h3n2/ha/CY163680/virus_properties.json
rename to nextclade/dataset_config/h3n2/ha/CY163680/pathogen.json
diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/genemap.gff b/nextclade/dataset_config/h3n2/ha/EPI1857216/annotation.gff
similarity index 100%
rename from nextclade/dataset_config/h3n2/ha/EPI1857216/genemap.gff
rename to nextclade/dataset_config/h3n2/ha/EPI1857216/annotation.gff
diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/virus_properties.json b/nextclade/dataset_config/h3n2/ha/EPI1857216/pathogen.json
similarity index 100%
rename from nextclade/dataset_config/h3n2/ha/EPI1857216/virus_properties.json
rename to nextclade/dataset_config/h3n2/ha/EPI1857216/pathogen.json
diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/genemap.gff b/nextclade/dataset_config/h3n2/na/EPI1857215/annotation.gff
similarity index 100%
rename from nextclade/dataset_config/h3n2/na/EPI1857215/genemap.gff
rename to nextclade/dataset_config/h3n2/na/EPI1857215/annotation.gff
diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/virus_properties.json b/nextclade/dataset_config/h3n2/na/EPI1857215/pathogen.json
similarity index 100%
rename from nextclade/dataset_config/h3n2/na/EPI1857215/virus_properties.json
rename to nextclade/dataset_config/h3n2/na/EPI1857215/pathogen.json
diff --git a/nextclade/dataset_config/vic/ha/EPI1926632/genemap.gff b/nextclade/dataset_config/vic/ha/EPI1926632/annotation.gff
similarity index 100%
rename from nextclade/dataset_config/vic/ha/EPI1926632/genemap.gff
rename to nextclade/dataset_config/vic/ha/EPI1926632/annotation.gff
diff --git a/nextclade/dataset_config/vic/ha/EPI1926632/virus_properties.json b/nextclade/dataset_config/vic/ha/EPI1926632/pathogen.json
similarity index 100%
rename from nextclade/dataset_config/vic/ha/EPI1926632/virus_properties.json
rename to nextclade/dataset_config/vic/ha/EPI1926632/pathogen.json
diff --git a/nextclade/dataset_config/vic/ha/KX058884/genemap.gff b/nextclade/dataset_config/vic/ha/KX058884/annotation.gff
similarity index 100%
rename from nextclade/dataset_config/vic/ha/KX058884/genemap.gff
rename to nextclade/dataset_config/vic/ha/KX058884/annotation.gff
diff --git a/nextclade/dataset_config/vic/ha/KX058884/virus_properties.json b/nextclade/dataset_config/vic/ha/KX058884/pathogen.json
similarity index 100%
rename from nextclade/dataset_config/vic/ha/KX058884/virus_properties.json
rename to nextclade/dataset_config/vic/ha/KX058884/pathogen.json
diff --git a/nextclade/dataset_config/vic/na/CY073894/genemap.gff b/nextclade/dataset_config/vic/na/CY073894/annotation.gff
similarity index 100%
rename from nextclade/dataset_config/vic/na/CY073894/genemap.gff
rename to nextclade/dataset_config/vic/na/CY073894/annotation.gff
diff --git a/nextclade/dataset_config/vic/na/CY073894/virus_properties.json b/nextclade/dataset_config/vic/na/CY073894/pathogen.json
similarity index 100%
rename from nextclade/dataset_config/vic/na/CY073894/virus_properties.json
rename to nextclade/dataset_config/vic/na/CY073894/pathogen.json
diff --git a/nextclade/dataset_config/yam/ha/JN993010/genemap.gff b/nextclade/dataset_config/yam/ha/JN993010/annotation.gff
similarity index 100%
rename from nextclade/dataset_config/yam/ha/JN993010/genemap.gff
rename to nextclade/dataset_config/yam/ha/JN993010/annotation.gff

From 73bbd87085c459a7e192e28cd271b6a5b5b0af59 Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Mon, 25 Sep 2023 15:27:37 +0200
Subject: [PATCH 04/26] update pathogen json

---
 nextclade/Snakefile               | 41 +++++++++++++------------------
 nextclade/config/config_dict.yaml | 35 +++++++++++++++++---------
 nextclade/config/pathogen.json    | 14 ++++++++++-
 3 files changed, 54 insertions(+), 36 deletions(-)

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index 5e5ee0e9..1ec4d6c0 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -26,7 +26,6 @@ rule download_sequences:
         sequences="data/{lineage}/raw_{segment}.fasta"
     params:
         s3_path="s3://nextstrain-data-private/files/workflows/seasonal-flu/{lineage}/{segment}/raw_sequences.fasta.xz"
-    conda: "../../workflow/envs/nextstrain.yaml"
     shell:
         """
         aws s3 cp {params.s3_path} - | xz -c -d > {output.sequences}
@@ -38,7 +37,7 @@ rule download_clades:
     output:
         clade_tsv = "data/{clade}_{lineage}_{segment}_{reference}_raw.tsv"
     params:
-        source_tsv=lambda w: config['clade_repo'] + config["builds"][w.lineage][w.segment]["clade_systems"][w.clade],
+        source_tsv=lambda w: config['clade_repo'] + config["builds"][w.lineage][w.segment]["clade_systems"][w.clade]['url'],
     shell:
         """
         curl {params.source_tsv} > {output.clade_tsv}
@@ -119,20 +118,19 @@ rule align:
         annotation="dataset_config/{lineage}/{segment}/{reference}/annotation.gff",
         reference="dataset_config/{lineage}/{segment}/{reference}/reference.fasta",
     output:
-        alignment="build/{lineage}/{segment}/{reference}/align.aligned.fasta",
-        insertions="build/{lineage}/{segment}/{reference}/align.insertions.csv",
+        alignment="build/{lineage}/{segment}/{reference}/align.aligned.fasta"
     params:
         outdir=lambda w: f"build/{w.lineage}/{w.segment}/{w.reference}/aligned.gene.{{gene}}.fasta",
+        nextclade_bin = "./nextclade_v3"
     threads: 3
     shell:
         """
-        nextclade run \
+        {params.nextclade_bin} run \
             --jobs={threads} \
             --input-ref {input.reference} \
             --input-annotation {input.annotation} \
             --output-translations {params.outdir} \
             --output-fasta {output.alignment} \
-            --output-insertions {output.insertions} \
             {input.sequences} \
             2>&1
         """
@@ -143,16 +141,11 @@ rule tree:
         alignment=rules.align.output.alignment,
     output:
         tree="build/{lineage}/{segment}/{reference}/tree_raw.nwk",
-    params:
-        args=lambda w: config["tree"].get("tree-builder-args", "")
-        if "tree" in config
-        else "",
     threads: 3
     shell:
         """
         augur tree \
             --alignment {input.alignment} \
-            --tree-builder-args {params.args} \
             --output {output.tree} \
             --nthreads {threads} \
             > /dev/null
@@ -218,13 +211,7 @@ rule ancestral:
     params:
         inference="joint",
         genes=genes,
-        translations=lambda w: expand(
-            "build/{lineage}/{segment}/{reference}/aligned.gene.{genes}.fasta",
-            lineage=w.lineage,
-            segment=w.segment,
-            genes=genes(w),
-            reference=w.reference,
-        ),
+        translations= "build/{lineage}/{segment}/{reference}/aligned.gene.%GENE.fasta",
     shell:
         """
         augur ancestral \
@@ -234,7 +221,7 @@ rule ancestral:
             --infer-ambiguous \
             --genes {params.genes} \
             --annotation {input.annotation} \
-            --translations {params.translations} \
+            --translations {params.translations:q} \
             --root-sequence {input.reference} \
             --output-node-data {output.node_data}
         """
@@ -248,11 +235,16 @@ rule clades:
         clades=rules.offset_clades.output,
     output:
         node_data="build/{lineage}/{segment}/{reference}/clade_{clade}.json",
+    params:
+        membership_key= lambda w: config["builds"][w.lineage][w.segment]["clade_systems"][w.clade].get('key', 'clade_membership'),
+        label_key= lambda w: config["builds"][w.lineage][w.segment]["clade_systems"][w.clade].get('key', 'clade')
     shell:
         """
         augur clades --tree {input.tree} \
             --mutations {input.muts} \
             --clades {input.clades} \
+            --membership-name {params.membership_key} \
+            --label-name {params.label_key} \
             --output-node-data {output.node_data} \
             > /dev/null
         """
@@ -292,6 +284,7 @@ rule export:
             --node-data {input.node_data}\
             --auspice-config {input.auspice_config} \
             --color-by-metadata {params.fields} \
+            --minify-json \
             --title "Nextclade reference tree for Influenza type:{wildcards.lineage} segment:{wildcards.segment} with root {wildcards.reference} built on {params.date}" \
             --output {output.auspice_json} 2>&1;
         """
@@ -321,7 +314,7 @@ rule make_dataset:
         auspice_json="build/{lineage}/{segment}/{reference}/auspice.json",
         annotation="dataset_config/{lineage}/{segment}/{reference}/annotation.gff",
         reference="dataset_config/{lineage}/{segment}/{reference}/reference.fasta",
-        pathogen_json="dataset_config/{lineage}/{segment}/{reference}/pathogen.json",
+        pathogen_json="config/pathogen.json",
         additional_config="dataset_config/{lineage}/{segment}/{reference}/pathogen.json",
     output:
         sequences="datasets/{lineage}/{segment}/{reference}/example_sequences.fasta",
@@ -331,10 +324,10 @@ rule make_dataset:
         pathogen_json="datasets/{lineage}/{segment}/{reference}/pathogen.json",
     shell:
         """
-        cp {input.sequences} {output.sequences} \
-        cp {input.auspice_json} {output.tree} \
-        cp {input.reference} {output.reference} \
-        cp {input.annotation} {output.annotation} \
+        cp {input.sequences} {output.sequences}
+        cp {input.auspice_json} {output.tree}
+        cp {input.reference} {output.reference}
+        cp {input.annotation} {output.annotation}
         jq -s '.[0] * .[1]' {input.pathogen_json} {input.additional_config} > {output.pathogen_json}
         """
 
diff --git a/nextclade/config/config_dict.yaml b/nextclade/config/config_dict.yaml
index 239f1f9e..f94e283a 100644
--- a/nextclade/config/config_dict.yaml
+++ b/nextclade/config/config_dict.yaml
@@ -4,9 +4,11 @@ builds:
   h1n1pdm:
     ha:
       clade_systems:
-        "clade": "seasonal_A-H1N1pdm_HA/main/.auto-generated/clades-long.tsv"
-        "subclade": "seasonal_A-H1N1pdm_HA/main/.auto-generated/subclades.tsv"
-        "short-clade": "seasonal_A-H1N1pdm_HA/main/.auto-generated/clades.tsv"
+        clade:
+          url: "seasonal_A-H1N1pdm_HA/main/.auto-generated/clades.tsv"
+        subclade:
+          url: "seasonal_A-H1N1pdm_HA/main/.auto-generated/subclades.tsv"
+          key: "subclade"
       refs:
         CY121680:
           filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500  --subsample-max-sequences 1500"
@@ -21,7 +23,8 @@ builds:
           reference_strain: A/Wisconsin/588/2019
     na:
       clade_systems:
-        "clade": "seasonal_A-H1N1pdm_NA/main/.auto-generated/subclades.tsv"
+        clade:
+          url: "seasonal_A-H1N1pdm_NA/main/.auto-generated/subclades.tsv"
       refs:
         MW626056:
           filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 1500"
@@ -31,9 +34,14 @@ builds:
   h3n2:
     ha:
       clade_systems:
-        "clade": "seasonal_A-H3N2_HA/main/.auto-generated/clades-long.tsv"
-        "subclade": "seasonal_A-H3N2_HA/main/.auto-generated/subclades.tsv"
-        "short-clade": "seasonal_A-H3N2_HA/main/.auto-generated/clades.tsv"
+        clade:
+          url: "seasonal_A-H3N2_HA/main/.auto-generated/clades-long.tsv"
+        subclade:
+          url: "seasonal_A-H3N2_HA/main/.auto-generated/subclades.tsv"
+          key: "subclade"
+        short-clade:
+          url: "seasonal_A-H3N2_HA/main/.auto-generated/clades.tsv"
+          key: "short-clade"
       refs:
         EPI1857216:
           filter: "--min-date 2019 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500"
@@ -51,7 +59,8 @@ builds:
           reference_strain: A/Wisconsin/67/2005-egg
     na:
       clade_systems:
-        "clade": "seasonal_A-H3N2_NA/main/.auto-generated/subclades.tsv"
+        clade:
+          url: "seasonal_A-H3N2_NA/main/.auto-generated/subclades.tsv"
       refs:
         EPI1857215:
           filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 1500"
@@ -61,8 +70,11 @@ builds:
   vic:
     ha:
       clade_systems:
-        "clade": "seasonal_B-Vic_HA/main/.auto-generated/clades.tsv"
-        "subclade": "seasonal_B-Vic_HA/main/.auto-generated/subclades.tsv"
+        clade:
+          url: "seasonal_B-Vic_HA/main/.auto-generated/clades.tsv"
+        subclade:
+          url: "seasonal_B-Vic_HA/main/.auto-generated/subclades.tsv"
+          key: "subclade"
       refs:
         KX058884:
           filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500"
@@ -72,7 +84,8 @@ builds:
           reference_strain: B/Brisbane/60/2008-egg
     na:
       clade_systems:
-        "clade": "seasonal_B-Vic_NA/main/.auto-generated/subclades.tsv"
+        clade:
+          url: "seasonal_B-Vic_NA/main/.auto-generated/subclades.tsv"
       refs:
         CY073894:
           filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 1500"
diff --git a/nextclade/config/pathogen.json b/nextclade/config/pathogen.json
index d536f97f..ab5c112d 100644
--- a/nextclade/config/pathogen.json
+++ b/nextclade/config/pathogen.json
@@ -1,4 +1,5 @@
 {
+    "schemaVersion": "3.0.0",
     "alignmentParams": {
         "excessBandwidth": 9,
         "terminalBandwidth": 100,
@@ -6,6 +7,13 @@
         "gapAlignmentSide": "right",
         "minSeedCover": 0.1
     },
+    "files":{
+        "examples": "example_sequences.fasta",
+        "genomeAnnotation": "annotation.gff",
+        "pathogenJson": "pathogen.json",
+        "reference": "reference.fasta",
+        "treeJson": "tree.json"
+        },
     "qc": {
         "privateMutations": {
           "enabled": true,
@@ -37,5 +45,9 @@
           "enabled": true,
           "ignoredStopCodons": []
         }
-    }
+    },
+    "geneOrderPreference": [
+        "HA1",
+        "HA2"
+      ]
 }

From f99b3f318ee90115bd6b506a26f4ad96c0a54b7e Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Mon, 25 Sep 2023 16:37:22 +0200
Subject: [PATCH 05/26] add script to merge jsons

---
 nextclade/Snakefile                           | 30 +++++++++--
 .../h1n1pdm/ha/CY121680/pathogen.json         |  1 -
 nextclade/scripts/merge_jsons.py              | 52 +++++++++++++++++++
 3 files changed, 78 insertions(+), 5 deletions(-)
 create mode 100644 nextclade/scripts/merge_jsons.py

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index 1ec4d6c0..513d5c85 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -249,6 +249,29 @@ rule clades:
             > /dev/null
         """
 
+rule virus_specific_jsons:
+    input:
+        auspice_config= "config/auspice_config.json",
+        pathogen = "config/pathogen.json",
+        additional_pathogen="dataset_config/{lineage}/{segment}/{reference}/pathogen.json",
+    output:
+        pathogen = "build/{lineage}/{segment}/{reference}/pathogen.json",
+        auspice = "build/{lineage}/{segment}/{reference}/auspice_config.json",
+    params:
+        clades = lambda w: [config["builds"][w.lineage][w.segment]["clade_systems"][clade].get('key', '')
+                            for clade in config["builds"][w.lineage][w.segment]["clade_systems"]]
+    shell:
+        """
+        python3 scripts/merge_jsons.py --lineage {wildcards.lineage} --reference {wildcards.reference} \
+            --segment {wildcards.segment} --clades {params.clades} \
+            --pathogen-jsons {input.pathogen} {input.additional_pathogen} \
+            --auspice-config {input.auspice_config} \
+            --output-pathogen {output.pathogen} \
+            --output-auspice {output.auspice}
+        """
+
+
+
 def get_node_data(w):
     node_data = [
         rules.refine.output.node_data,
@@ -269,7 +292,7 @@ rule export:
         tree=rules.refine.output.tree,
         metadata=rules.parse.output.metadata,
         node_data = get_node_data,
-        auspice_config= "config/auspice_config.json",
+        auspice_config= "build/{lineage}/{segment}/{reference}/auspice_config.json"
     output:
         auspice_json="build/{lineage}/{segment}/{reference}/auspice.json",
     params:
@@ -314,8 +337,7 @@ rule make_dataset:
         auspice_json="build/{lineage}/{segment}/{reference}/auspice.json",
         annotation="dataset_config/{lineage}/{segment}/{reference}/annotation.gff",
         reference="dataset_config/{lineage}/{segment}/{reference}/reference.fasta",
-        pathogen_json="config/pathogen.json",
-        additional_config="dataset_config/{lineage}/{segment}/{reference}/pathogen.json",
+        pathogen_json="build/{lineage}/{segment}/{reference}/pathogen.json",
     output:
         sequences="datasets/{lineage}/{segment}/{reference}/example_sequences.fasta",
         tree="datasets/{lineage}/{segment}/{reference}/tree.json",
@@ -328,7 +350,7 @@ rule make_dataset:
         cp {input.auspice_json} {output.tree}
         cp {input.reference} {output.reference}
         cp {input.annotation} {output.annotation}
-        jq -s '.[0] * .[1]' {input.pathogen_json} {input.additional_config} > {output.pathogen_json}
+        cp {input.pathogen_json} {output.pathogen_json}
         """
 
 
diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/pathogen.json b/nextclade/dataset_config/h1n1pdm/ha/CY121680/pathogen.json
index a9af3dfd..0fec5c9f 100644
--- a/nextclade/dataset_config/h1n1pdm/ha/CY121680/pathogen.json
+++ b/nextclade/dataset_config/h1n1pdm/ha/CY121680/pathogen.json
@@ -1,5 +1,4 @@
 {
-    "schemaVersion": "1.10.0",
     "nucMutLabelMap": {},
     "nucMutLabelMapReverse": {},
     "aaMotifs": [
diff --git a/nextclade/scripts/merge_jsons.py b/nextclade/scripts/merge_jsons.py
new file mode 100644
index 00000000..02657e5f
--- /dev/null
+++ b/nextclade/scripts/merge_jsons.py
@@ -0,0 +1,52 @@
+import json, argparse
+
+def get_clade_configs(name):
+    return {
+    "short_clades": {
+        "name": "short_clade",
+        "displayName": "Abbreviated clade name",
+        "description": "For recent subclades with long names, the prefix describing their history is omitted."
+    },
+    "subclade": {
+        "name": "subclade",
+        "displayName": "Subclade",
+        "description": "Experimental fine-grained subclade annotation."
+    }}.get(name, {'name':name, "displayName":name})
+
+
+if __name__=="__main__":
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("--lineage", required=True, help="attribute info")
+    parser.add_argument("--segment", required=True, help="attribute info")
+    parser.add_argument("--reference", required=True, help="attribute info")
+    parser.add_argument("--auspice-config", required=True, help="Auspice config JSON with coloring entry to have scale added to")
+    parser.add_argument("--pathogen-jsons", nargs='+', required=True, help="name of the coloring field in the Auspice config JSON")
+    parser.add_argument("--clades", nargs="+", required=True, help="list of values to assign colors to")
+    parser.add_argument("--output-auspice", required=True, help="Auspice config JSON with scale added to the requested coloring")
+    parser.add_argument("--output-pathogen", required=True, help="Auspice config JSON with scale added to the requested coloring")
+    args = parser.parse_args()
+
+    pathogen_json = {}
+    for p in args.pathogen_jsons:
+        with open(p) as fh:
+            pathogen_json.update(json.load(fh))
+
+    with open(args.auspice_config) as fh:
+        auspice_json = json.load(fh)
+
+    pathogen_json['attributes'] = {"name":{"value":args.lineage},
+                                   "segment":{"value":args.segment},
+                                   "reference":{"value":args.reference}}
+
+
+    if len(args.clades):
+        auspice_json['extensions']['nextclade']["clade_node_attrs"] =  [
+            get_clade_configs(c) for c in args.clades
+        ]
+
+    with open(args.output_pathogen, 'w') as fh:
+        json.dump(pathogen_json, fh, indent=2)
+
+    with open(args.output_auspice, 'w') as fh:
+        json.dump(auspice_json, fh, indent=2)
+

From 04744fb8d486e720c7b824f4546df74d1fd57683 Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Mon, 25 Sep 2023 17:28:17 +0200
Subject: [PATCH 06/26] fixes and make reference sequences uppercase

---
 nextclade/Snakefile                           |  4 +-
 nextclade/config/config_dict.yaml             |  4 +-
 .../h3n2/ha/CY163680/annotation.gff           |  6 +--
 .../h3n2/ha/EPI1857216/reference.fasta        | 44 ++++++++---------
 .../vic/ha/EPI1926632/reference.fasta         | 48 +++++++++----------
 nextclade/scripts/merge_jsons.py              |  2 +-
 6 files changed, 55 insertions(+), 53 deletions(-)

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index 513d5c85..943aeaec 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -258,7 +258,7 @@ rule virus_specific_jsons:
         pathogen = "build/{lineage}/{segment}/{reference}/pathogen.json",
         auspice = "build/{lineage}/{segment}/{reference}/auspice_config.json",
     params:
-        clades = lambda w: [config["builds"][w.lineage][w.segment]["clade_systems"][clade].get('key', '')
+        clades = lambda w: [config["builds"][w.lineage][w.segment]["clade_systems"][clade].get('key', 'default')
                             for clade in config["builds"][w.lineage][w.segment]["clade_systems"]]
     shell:
         """
@@ -344,6 +344,7 @@ rule make_dataset:
         annotation="datasets/{lineage}/{segment}/{reference}/annotation.gff",
         reference="datasets/{lineage}/{segment}/{reference}/reference.fasta",
         pathogen_json="datasets/{lineage}/{segment}/{reference}/pathogen.json",
+        auspice="auspice/{lineage}_{segment}_{reference}.json",
     shell:
         """
         cp {input.sequences} {output.sequences}
@@ -351,6 +352,7 @@ rule make_dataset:
         cp {input.reference} {output.reference}
         cp {input.annotation} {output.annotation}
         cp {input.pathogen_json} {output.pathogen_json}
+        cp {input.auspice_json} {output.auspice}
         """
 
 
diff --git a/nextclade/config/config_dict.yaml b/nextclade/config/config_dict.yaml
index f94e283a..99044324 100644
--- a/nextclade/config/config_dict.yaml
+++ b/nextclade/config/config_dict.yaml
@@ -47,14 +47,14 @@ builds:
           filter: "--min-date 2019 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500"
           clade_offset: -17
           clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv"
-          include_file: references/h3n2/includes.txt
+          include_file: dataset_config/h3n2/includes.txt
           reference_EPI_ISL: EPI1857216
           reference_strain: A/Darwin/6/2021
         CY163680:
           filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500"
           clade_offset: 0
           clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv"
-          include_file: references/h3n2/includes.txt
+          include_file: dataset_config/h3n2/includes.txt
           reference_EPI_ISL: EPI545340
           reference_strain: A/Wisconsin/67/2005-egg
     na:
diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/annotation.gff b/nextclade/dataset_config/h3n2/ha/CY163680/annotation.gff
index cbb8d4e5..579ff7ff 100644
--- a/nextclade/dataset_config/h3n2/ha/CY163680/annotation.gff
+++ b/nextclade/dataset_config/h3n2/ha/CY163680/annotation.gff
@@ -1,5 +1,5 @@
 ##gff-version 3
 ##sequence-region CY163680.1 1 1737
-CY163680.1	feature	CDS	18	65	.	+	.	name="SigPep"
-CY163680.1	feature	CDS	66	1052	.	+	.	name="HA1"
-CY163680.1	feature	CDS	1053	1715	.	+	.	name="HA2"
+CY163680.1	feature	gene	18	65	.	+	.	gene_name="SigPep"
+CY163680.1	feature	gene	66	1052	.	+	.	gene_name="HA1"
+CY163680.1	feature	gene	1053	1715	.	+	.	gene_name="HA2"
diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/reference.fasta b/nextclade/dataset_config/h3n2/ha/EPI1857216/reference.fasta
index ee3b943f..427694bc 100644
--- a/nextclade/dataset_config/h3n2/ha/EPI1857216/reference.fasta
+++ b/nextclade/dataset_config/h3n2/ha/EPI1857216/reference.fasta
@@ -1,23 +1,23 @@
 >EPI_ISL_1563628 | A/Darwin/6/2021 | A / H3N2 |  | 2021-03-16
-atgaagactatcattgctttgagcaacattctatgtcttgttttcgctcaaaaaatacctggaaatgacaatagcacggc
-aacgctgtgccttgggcaccatgcagtaccaaacggaacgatagtgaaaacaatcacaaatgaccgaattgaagttacta
-atgctactgagttggttcagaattcatcaataggtgaaatatgcggcagtcctcatcagatccttgatggagggaactgc
-acactaatagatgctctattgggggaccctcagtgtgacggctttcaaaataaggaatgggacctttttgttgaaagaag
-cagagccaacagcaactgttacccttatgatgtgccggattatgcctcccttaggtcactagttgcctcatccggcacac
-tggagtttaaaaatgaaagcttcaattggactggagtcaaacaaaacggaacaagttctgcgtgcataaggggatctagt
-agtagtttttttagtagattaaattggttgaccagcttaaacaacatatatccagcacagaacgtgactatgccaaacaa
-ggaacaatttgacaaattgtacatttggggggttcaccacccggatacggacaagaaccaaatctccctgtttgctcaat
-catcaggaagaatcacagtatctaccaaaagaagccaacaagctgtaatcccaaatatcggatctagacccagaataagg
-gatatccctagcagaataagcatctattggacaatagtaaaaccgggagacatacttttgattaacagcacagggaatct
-aattgctcctaggggttacttcaaaatacgaagtgggaaaagctcaataatgagatcagatgcacccattggcaaatgta
-agtctgaatgcatcactccaaatggaagcattcccaatgacaaaccgttccaaaatgtaaacaggatcacatacggggcc
-tgtcccagatatgttaagcaaagcaccctgaaattggcaacaggaatgcgaaatgtaccagagaaacaaaccagaggcat
-atttggcgcaatagcgggtttcatagaaaatggatgggagggaatggtggatggttggtacggtttcaggcatcaaaatt
-ctgagggaagaggacaagcagcagatctcaaaagcactcaagcagcaatcgatcaaatcaatgggaagctgaatcgattg
-atcggaaaaaccaacgagaaattccatcagattgaaaaagaattctcagaagtagaaggaagagttcaagaccttgagaa
-atatgttgaggacactaaaatagatctctggtcatacaacgcggagcttcttgttgccctggagaaccaacatacgattg
-acctaactgactcagaaatgaacaaactgtttgaaaaaacaaagaagcaactgagggaaaatgctgaggatatgggaaat
-ggttgtttcaaaatataccacaaatgtgacaatgcctgcataggatcaataagaaatgaaacttatgaccacaatgtgta
-cagggatgaagcattaaacaaccggttccagatcaagggagttgagctgaagtcagggtacaaagattggatcctatgga
-tttcctttgccatgtcatgttttttgctttgtattgctttgttggggttcatcatgtgggcctgccaaaagggcaacatt
-agatgcaacatttgcatttgagtgcattaattaaaaac
+ATGAAGACTATCATTGCTTTGAGCAACATTCTATGTCTTGTTTTCGCTCAAAAAATACCTGGAAATGACAATAGCACGGC
+AACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACAAATGACCGAATTGAAGTTACTA
+ATGCTACTGAGTTGGTTCAGAATTCATCAATAGGTGAAATATGCGGCAGTCCTCATCAGATCCTTGATGGAGGGAACTGC
+ACACTAATAGATGCTCTATTGGGGGACCCTCAGTGTGACGGCTTTCAAAATAAGGAATGGGACCTTTTTGTTGAAAGAAG
+CAGAGCCAACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACAC
+TGGAGTTTAAAAATGAAAGCTTCAATTGGACTGGAGTCAAACAAAACGGAACAAGTTCTGCGTGCATAAGGGGATCTAGT
+AGTAGTTTTTTTAGTAGATTAAATTGGTTGACCAGCTTAAACAACATATATCCAGCACAGAACGTGACTATGCCAAACAA
+GGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGATACGGACAAGAACCAAATCTCCCTGTTTGCTCAAT
+CATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCAAATATCGGATCTAGACCCAGAATAAGG
+GATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCT
+AATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGTA
+AGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCGTTCCAAAATGTAAACAGGATCACATACGGGGCC
+TGTCCCAGATATGTTAAGCAAAGCACCCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAAACAAACCAGAGGCAT
+ATTTGGCGCAATAGCGGGTTTCATAGAAAATGGATGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATT
+CTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTG
+ATCGGAAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGAAGAGTTCAAGACCTTGAGAA
+ATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACGATTG
+ACCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGAAAT
+GGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATAAGAAATGAAACTTATGACCACAATGTGTA
+CAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGA
+TTTCCTTTGCCATGTCATGTTTTTTGCTTTGTATTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATT
+AGATGCAACATTTGCATTTGAGTGCATTAATTAAAAAC
diff --git a/nextclade/dataset_config/vic/ha/EPI1926632/reference.fasta b/nextclade/dataset_config/vic/ha/EPI1926632/reference.fasta
index 6bd01349..a77d4fcf 100644
--- a/nextclade/dataset_config/vic/ha/EPI1926632/reference.fasta
+++ b/nextclade/dataset_config/vic/ha/EPI1926632/reference.fasta
@@ -1,25 +1,25 @@
 >B/Austria/1359417/2021 | EPI_ISL_6307006 | B / H0N0 | Victoria | 2021-01-09
-attttctaatatccacaaaatgaaggcaataattgtactactcatggtagtaacatccaatgcagatcgaatctgcactg
-ggataacatcgtcaaactcaccacatgtcgtcaaaactgctactcaaggggaggtcaatgtgactggtgtaataccactg
-acaacaacacccaccaaatctcattttgcaaatctcaaaggaacagaaaccagggggaaactatgcccaaaatgcctaaa
-ctgcacagatctggatgtagccttgggcagaccaaaatgcacagggaaaataccctctgcaagggtttcaatactccatg
-aagtcagacctgttacatctgggtgctttcctataatgcatgatagaacaaaaattagacagctgcctaaccttctccga
-ggatacgaacatgtcaggttatcaactcacaacgttatcaatacagaagatgcaccaggaggaccctacgaaattggaac
-ctcagggtcttgcctcaacattaccaatggaaaaggattcttcgcaacaatggcttgggccgtcccaaaaaacaaaacag
-caacaaatccattaacaatagaagtaccatacatttgtacagaagaagaagaccaaattaccgtttgggggttccactct
-gacgacgagacccaaatggcaaggctctatggggattcaaagccccagaagttcacctcatctgccaacggagtgaccac
-acactacgtctcacagattggtggctttccaaatcaaacagaagacggaggactaccacaaagtggcagaattgttgttg
-attacatggtgcaaaaatctggaaaaacaggaacaattacctatcaaagaggtattttattgcctcaaaaggtgtggtgc
-gcaagtggcaagagcaaggtaataaaaggatccttgcccttaattggagaagcagattgcctccatgaaaaatacggtgg
-attaaacaaaagcaagccttactacacaggggaacatgcaaaggccataggaaattgcccaatatgggtgaaaacaccct
-tgaagctggccaatggaaccaaatatagacctcctgcaaaactattaaaggaaagaggtttcttcggagccattgctggt
-ttcttagagggaggatgggaaggaatgattgcaggttggcacggatacacatcccatggggcacatggagtagcggtggc
-agctgaccttaagagcactcaggaggccataaacaagataacaaaaaatctcaactctttgagtgagctggaagtaaaga
-atcttcaaagactaagcggtgccatggatgaactccacaacgaaatactagaactagatgagaaagtggatgatctcaga
-gctgatacaataagctcacagatagaactcgcagtcctgctttccaatgaaggaataataaacagtgaagatgaacatct
-cttggcgcttgaaagaaagctgaagaaaatgctgggcccctctgctgtagagataggaaatggatgctttgaaaccaaac
-acaagtgcaaccagacctgtctcgacagaatagctgctggtacctttgatgcaggagaattttctctccccacctttgat
-tcactgaatattactgctgcatctttaaatgacgatggattggacaatcatactatactgctttactactcaactgctgc
-ctccagtttggctgtaacactgatgatagctatctttgttgtttatatggtctccagagacaatgtttcttgctccattt
-gtctataagggaagttaagccctgtattttcctttattgtagtgcttgtttgcttgttgtcattacaaagaaacgttatt
-gaaaaat
+ATTTTCTAATATCCACAAAATGAAGGCAATAATTGTACTACTCATGGTAGTAACATCCAATGCAGATCGAATCTGCACTG
+GGATAACATCGTCAAACTCACCACATGTCGTCAAAACTGCTACTCAAGGGGAGGTCAATGTGACTGGTGTAATACCACTG
+ACAACAACACCCACCAAATCTCATTTTGCAAATCTCAAAGGAACAGAAACCAGGGGGAAACTATGCCCAAAATGCCTAAA
+CTGCACAGATCTGGATGTAGCCTTGGGCAGACCAAAATGCACAGGGAAAATACCCTCTGCAAGGGTTTCAATACTCCATG
+AAGTCAGACCTGTTACATCTGGGTGCTTTCCTATAATGCATGATAGAACAAAAATTAGACAGCTGCCTAACCTTCTCCGA
+GGATACGAACATGTCAGGTTATCAACTCACAACGTTATCAATACAGAAGATGCACCAGGAGGACCCTACGAAATTGGAAC
+CTCAGGGTCTTGCCTCAACATTACCAATGGAAAAGGATTCTTCGCAACAATGGCTTGGGCCGTCCCAAAAAACAAAACAG
+CAACAAATCCATTAACAATAGAAGTACCATACATTTGTACAGAAGAAGAAGACCAAATTACCGTTTGGGGGTTCCACTCT
+GACGACGAGACCCAAATGGCAAGGCTCTATGGGGATTCAAAGCCCCAGAAGTTCACCTCATCTGCCAACGGAGTGACCAC
+ACACTACGTCTCACAGATTGGTGGCTTTCCAAATCAAACAGAAGACGGAGGACTACCACAAAGTGGCAGAATTGTTGTTG
+ATTACATGGTGCAAAAATCTGGAAAAACAGGAACAATTACCTATCAAAGAGGTATTTTATTGCCTCAAAAGGTGTGGTGC
+GCAAGTGGCAAGAGCAAGGTAATAAAAGGATCCTTGCCCTTAATTGGAGAAGCAGATTGCCTCCATGAAAAATACGGTGG
+ATTAAACAAAAGCAAGCCTTACTACACAGGGGAACATGCAAAGGCCATAGGAAATTGCCCAATATGGGTGAAAACACCCT
+TGAAGCTGGCCAATGGAACCAAATATAGACCTCCTGCAAAACTATTAAAGGAAAGAGGTTTCTTCGGAGCCATTGCTGGT
+TTCTTAGAGGGAGGATGGGAAGGAATGATTGCAGGTTGGCACGGATACACATCCCATGGGGCACATGGAGTAGCGGTGGC
+AGCTGACCTTAAGAGCACTCAGGAGGCCATAAACAAGATAACAAAAAATCTCAACTCTTTGAGTGAGCTGGAAGTAAAGA
+ATCTTCAAAGACTAAGCGGTGCCATGGATGAACTCCACAACGAAATACTAGAACTAGATGAGAAAGTGGATGATCTCAGA
+GCTGATACAATAAGCTCACAGATAGAACTCGCAGTCCTGCTTTCCAATGAAGGAATAATAAACAGTGAAGATGAACATCT
+CTTGGCGCTTGAAAGAAAGCTGAAGAAAATGCTGGGCCCCTCTGCTGTAGAGATAGGAAATGGATGCTTTGAAACCAAAC
+ACAAGTGCAACCAGACCTGTCTCGACAGAATAGCTGCTGGTACCTTTGATGCAGGAGAATTTTCTCTCCCCACCTTTGAT
+TCACTGAATATTACTGCTGCATCTTTAAATGACGATGGATTGGACAATCATACTATACTGCTTTACTACTCAACTGCTGC
+CTCCAGTTTGGCTGTAACACTGATGATAGCTATCTTTGTTGTTTATATGGTCTCCAGAGACAATGTTTCTTGCTCCATTT
+GTCTATAAGGGAAGTTAAGCCCTGTATTTTCCTTTATTGTAGTGCTTGTTTGCTTGTTGTCATTACAAAGAAACGTTATT
+GAAAAAT
diff --git a/nextclade/scripts/merge_jsons.py b/nextclade/scripts/merge_jsons.py
index 02657e5f..10eb5df4 100644
--- a/nextclade/scripts/merge_jsons.py
+++ b/nextclade/scripts/merge_jsons.py
@@ -41,7 +41,7 @@ def get_clade_configs(name):
 
     if len(args.clades):
         auspice_json['extensions']['nextclade']["clade_node_attrs"] =  [
-            get_clade_configs(c) for c in args.clades
+            get_clade_configs(c) for c in args.clades if c!='default'
         ]
 
     with open(args.output_pathogen, 'w') as fh:

From da3704a545e5f5f65c07cbb4bee316378ea391ed Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Tue, 26 Sep 2023 23:18:44 +0200
Subject: [PATCH 07/26] add outliers

---
 config/h1n1pdm/outliers.txt          |  4 ++++
 config/h1n1pdm/reference_strains.txt |  1 -
 config/h3n2/outliers.txt             |  2 ++
 nextclade/Snakefile                  |  2 ++
 nextclade/config/config_dict.yaml    | 12 ++++++------
 5 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/config/h1n1pdm/outliers.txt b/config/h1n1pdm/outliers.txt
index de309446..310ebd47 100644
--- a/config/h1n1pdm/outliers.txt
+++ b/config/h1n1pdm/outliers.txt
@@ -10,6 +10,7 @@ A/Asturias/RR6898/2010
 A/Austria/1048413/2018
 A/Austria/183/2009-egg
 A/Bangladesh/2021/2012
+A/Bangkok/P1600/2023
 A/Bari/166/2016
 A/Bari/167/2016
 A/Beijing/1/2009-egg
@@ -51,6 +52,7 @@ A/India/4562/2021
 A/India/8484/2021
 A/India/9324/2021
 A/India/9825/2021
+A/India/CT-AIIMSR-266/2022
 A/India/Pun1418633/2014
 A/India/TS-NIV2019nCoV177/2020
 A/Iowa/1/2006
@@ -110,6 +112,7 @@ A/Rennes/F_006_111_BL/2020
 A/Rennes/F_006_119_JD/2020
 A/RheinlandPfalz/1/2020
 A/RioGrandedoNorte/117490/2012
+A/RioGrandeDoNorte/2023-012263-IEC/2023
 A/SaintLucia/7340/2020
 A/Shandong/1/2009
 A/Shandong/1/2009-egg
@@ -123,6 +126,7 @@ A/SouthAfrica/16112/2021
 A/SouthAfrica/16325/2021
 A/SouthAfrica/1857/2013
 A/SouthAfrica/NHLS-UCT-GS-0017/2021
+A/SouthAfrica/NHLS-UCT-GS-0034/2021
 A/SouthAfrica/PET20279/2021
 A/SouthDakota/3/2008
 A/SriLanka/11/2012
diff --git a/config/h1n1pdm/reference_strains.txt b/config/h1n1pdm/reference_strains.txt
index dbbf65a7..6db5111c 100644
--- a/config/h1n1pdm/reference_strains.txt
+++ b/config/h1n1pdm/reference_strains.txt
@@ -5,7 +5,6 @@ A/Arizona/33/2017
 A/Arkansas/4/2020
 A/Arkansas/8/2020
 A/Arkansas/8/2020-egg
-A/Austria/1048413/2018
 A/Bangladesh/2021/2012
 A/Bangladesh/3002/2015
 A/Bolivia/559/2013
diff --git a/config/h3n2/outliers.txt b/config/h3n2/outliers.txt
index 0ab95da5..b9c1473e 100644
--- a/config/h3n2/outliers.txt
+++ b/config/h3n2/outliers.txt
@@ -10,6 +10,7 @@ A/Austria/839906/2015
 A/Bangladesh/3010/2020
 A/BosniaandHerzegovia/288G/2019
 A/Brazil/BA-LACEN-BA053-292045410/2022
+A/Brazil/BA-LACEN-BA071-292012660/2021
 A/Busan/15453/2009
 A/California/NHRC0001/2011
 A/Cambodia/NHRCC00001/2009
@@ -463,6 +464,7 @@ A/Sedbury/2991/2023
 A/Shanghai-Minxing/1482/2017
 A/Shanghai/6/2014
 A/Singapore/GP11121/2022
+A/SouthAfrica/K056301/2023
 A/SouthAustralia/1/2021
 A/SouthAustralia/22/2018
 A/SouthAustralia/47/2016
diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index 943aeaec..0d6d84f8 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -105,6 +105,7 @@ rule subsample:
         augur filter \
             --sequences {input.aligned_sequences} \
             --metadata {input.enriched_metadata} \
+            --exclude {input.exclude} \
             --include {input.include_strains} {params.other_include} \
             --include-where EPI_ISL={params.reference_EPI_ISL} \
             {params.filter_arguments} \
@@ -148,6 +149,7 @@ rule tree:
             --alignment {input.alignment} \
             --output {output.tree} \
             --nthreads {threads} \
+            --tree-builder-args '-ninit 10 -n 4 -czb' \
             > /dev/null
         """
 
diff --git a/nextclade/config/config_dict.yaml b/nextclade/config/config_dict.yaml
index 99044324..75531d50 100644
--- a/nextclade/config/config_dict.yaml
+++ b/nextclade/config/config_dict.yaml
@@ -11,12 +11,12 @@ builds:
           key: "subclade"
       refs:
         CY121680:
-          filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500  --subsample-max-sequences 1500"
+          filter: "--min-date 2014 --probabilistic-sampling  --exclude-where country='south_korea' --group-by year --min-length 1500  --subsample-max-sequences 1500"
           clade_offset: 0
           reference_EPI_ISL: EPI1583287
           reference_strain: A/California/7/2009-egg #TODO: exclude
         MW626062:
-          filter: "--min-date 2019 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500"
+          filter: "--min-date 2019 --probabilistic-sampling  --exclude-where country='south_korea' --group-by year --min-length 1500 --subsample-max-sequences 1500"
           clade_offset: 0
           clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h1n1pdm/ha/clades-long.tsv"
           reference_EPI_ISL: EPI1812046
@@ -44,14 +44,14 @@ builds:
           key: "short-clade"
       refs:
         EPI1857216:
-          filter: "--min-date 2019 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500"
+          filter: "--min-date 2019 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000"
           clade_offset: -17
           clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv"
           include_file: dataset_config/h3n2/includes.txt
           reference_EPI_ISL: EPI1857216
           reference_strain: A/Darwin/6/2021
         CY163680:
-          filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500"
+          filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000"
           clade_offset: 0
           clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv"
           include_file: dataset_config/h3n2/includes.txt
@@ -77,7 +77,7 @@ builds:
           key: "subclade"
       refs:
         KX058884:
-          filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1500"
+          filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000"
           clade_offset: 0
           clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/vic/ha/clades.tsv"
           reference_EPI_ISL: EPI696970
@@ -88,7 +88,7 @@ builds:
           url: "seasonal_B-Vic_NA/main/.auto-generated/subclades.tsv"
       refs:
         CY073894:
-          filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 1500"
+          filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 2000"
           clade_offset: -30
           reference_EPI_ISL: CY073894
           reference_strain: B/Brisbane/60/2008

From dacde90ef030554a9c96dc4ef831418ccd3029d1 Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Wed, 27 Sep 2023 18:32:38 +0200
Subject: [PATCH 08/26] add includes

---
 nextclade/Snakefile                           | 6 +++---
 nextclade/config/config_dict.yaml             | 2 --
 nextclade/dataset_config/h1n1pdm/includes.txt | 8 ++++++++
 nextclade/dataset_config/vic/includes.txt     | 5 +++++
 4 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index 0d6d84f8..4ed28051 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -88,6 +88,7 @@ rule subsample:
         aligned_sequences=rules.parse.output.sequences,
         enriched_metadata=rules.parse.output.metadata,
         include_strains="../config/{lineage}/reference_strains.txt",
+        nextclade_include="dataset_config/{lineage}/include.txt",
         exclude="../config/{lineage}/outliers.txt",
     output:
         sampled_sequences="build/{lineage}/{segment}/{reference}/subsample.fasta",
@@ -98,15 +99,14 @@ rule subsample:
         ]["filter"],
         reference_EPI_ISL=lambda w: config["builds"][w.lineage][w.segment]["refs"][
             w.reference
-        ]["reference_EPI_ISL"],
-        other_include = lambda w:config["builds"][w.lineage][w.segment]["refs"][w.reference].get("include_file","")
+        ]["reference_EPI_ISL"]
     shell:
         """
         augur filter \
             --sequences {input.aligned_sequences} \
             --metadata {input.enriched_metadata} \
             --exclude {input.exclude} \
-            --include {input.include_strains} {params.other_include} \
+            --include {input.include_strains} {input.nextclade_include} \
             --include-where EPI_ISL={params.reference_EPI_ISL} \
             {params.filter_arguments} \
             --output {output.sampled_sequences} \
diff --git a/nextclade/config/config_dict.yaml b/nextclade/config/config_dict.yaml
index 75531d50..32fb1c71 100644
--- a/nextclade/config/config_dict.yaml
+++ b/nextclade/config/config_dict.yaml
@@ -47,14 +47,12 @@ builds:
           filter: "--min-date 2019 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000"
           clade_offset: -17
           clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv"
-          include_file: dataset_config/h3n2/includes.txt
           reference_EPI_ISL: EPI1857216
           reference_strain: A/Darwin/6/2021
         CY163680:
           filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000"
           clade_offset: 0
           clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv"
-          include_file: dataset_config/h3n2/includes.txt
           reference_EPI_ISL: EPI545340
           reference_strain: A/Wisconsin/67/2005-egg
     na:
diff --git a/nextclade/dataset_config/h1n1pdm/includes.txt b/nextclade/dataset_config/h1n1pdm/includes.txt
index e69de29b..e168d0bf 100644
--- a/nextclade/dataset_config/h1n1pdm/includes.txt
+++ b/nextclade/dataset_config/h1n1pdm/includes.txt
@@ -0,0 +1,8 @@
+A/Lao/1632/2023
+A/NorthCarolina/6/2023
+A/Victoria/114/2023
+A/Victoria/27/2023
+A/Tuvalu/GX23096/2023
+A/India/Pune-Nivsari_22_884/2022
+A/Nepal/22FL2393/2022
+A/Singapore/GP6589/2022
diff --git a/nextclade/dataset_config/vic/includes.txt b/nextclade/dataset_config/vic/includes.txt
index e69de29b..ffdd2b25 100644
--- a/nextclade/dataset_config/vic/includes.txt
+++ b/nextclade/dataset_config/vic/includes.txt
@@ -0,0 +1,5 @@
+B/Brazil/1417/2023
+B/Massachusetts/1/2022
+B/Amazonas/2022-014046-IEC/2022
+B/Iquitos/FPI20551/2022
+B/Pennsylvania/3/2022

From f21122a348a0a942cd193f8ab2e202b5d786cdbd Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Wed, 27 Sep 2023 18:48:37 +0200
Subject: [PATCH 09/26] fix include file name and clean rules

---
 nextclade/Snakefile | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index 4ed28051..8d1fd3e5 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -88,7 +88,7 @@ rule subsample:
         aligned_sequences=rules.parse.output.sequences,
         enriched_metadata=rules.parse.output.metadata,
         include_strains="../config/{lineage}/reference_strains.txt",
-        nextclade_include="dataset_config/{lineage}/include.txt",
+        nextclade_include="dataset_config/{lineage}/includes.txt",
         exclude="../config/{lineage}/outliers.txt",
     output:
         sampled_sequences="build/{lineage}/{segment}/{reference}/subsample.fasta",
@@ -281,7 +281,6 @@ def get_node_data(w):
     ]
 
     for clade in config["builds"][w.lineage][w.segment]["clade_systems"]:
-        print(clade)
         node_data.append("build/{lineage}/{segment}/{reference}/".format(**w) + f'clade_{clade}.json')
 
     return node_data
@@ -362,12 +361,15 @@ rule make_dataset:
 rule clean:
     shell:
         """
-        rm -rf output test data/clades* data/include* auspice/*
+        rm -rf build datasets auspice
         """
 
 
 rule clean_all:
     shell:
         """
-        rm -rf output test auspice build data
+        rm -rf build
+        rm -rf auspice
+        rm -rf datasets
+        rm -rf data/
         """

From 403e27c654c32d02671d3b6e2046c08babf83058 Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Fri, 10 Nov 2023 15:29:58 +0100
Subject: [PATCH 10/26] sort out short clade names, add outlier pruning

---
 nextclade/Snakefile               |  21 +++++-
 nextclade/config/config_dict.yaml |   3 +
 nextclade/config/pathogen.json    | 121 ++++++++++++++++++------------
 nextclade/scripts/merge_jsons.py  |   3 +
 4 files changed, 97 insertions(+), 51 deletions(-)

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index 8d1fd3e5..426ae70d 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -174,10 +174,29 @@ rule root:
         cp {params.outdir}/rerooted.newick {output.tree}
         """
 
+rule prune_outliers:
+    input:
+        tree = rules.root.output.tree
+    output:
+        tree="build/{lineage}/{segment}/{reference}/tree_rooted_pruned.nwk"
+    params:
+        outliers = "build/{lineage}/{segment}/{reference}/tt_out/outliers.tsv"
+    run:
+        import pandas as pd
+        from Bio import Phylo
+        outliers = pd.read_csv(params.outliers, sep='\t', index_col=0)
+        T = Phylo.read(input.tree, 'newick')
+
+        for n in outliers.index:
+            if outliers.loc[n,"given_date"]>2020 and ('-egg' not in n):
+                print("prune", n)
+                T.prune(n)
+        Phylo.write(T, output.tree, "newick")
+
 # refine while keeping the root
 rule refine:
     input:
-        tree=rules.root.output.tree,
+        tree=rules.prune_outliers.output.tree,
         alignment=rules.align.output.alignment,
         enriched_metadata=rules.parse.output.metadata,
     output:
diff --git a/nextclade/config/config_dict.yaml b/nextclade/config/config_dict.yaml
index 32fb1c71..ede743b5 100644
--- a/nextclade/config/config_dict.yaml
+++ b/nextclade/config/config_dict.yaml
@@ -5,7 +5,10 @@ builds:
     ha:
       clade_systems:
         clade:
+          url: "seasonal_A-H1N1pdm_HA/main/.auto-generated/clades-long.tsv"
+        short-clade:
           url: "seasonal_A-H1N1pdm_HA/main/.auto-generated/clades.tsv"
+          key: "short-clade"
         subclade:
           url: "seasonal_A-H1N1pdm_HA/main/.auto-generated/subclades.tsv"
           key: "subclade"
diff --git a/nextclade/config/pathogen.json b/nextclade/config/pathogen.json
index ab5c112d..a6471a56 100644
--- a/nextclade/config/pathogen.json
+++ b/nextclade/config/pathogen.json
@@ -1,53 +1,74 @@
 {
-    "schemaVersion": "3.0.0",
-    "alignmentParams": {
-        "excessBandwidth": 9,
-        "terminalBandwidth": 100,
-        "allowedMismatches": 4,
-        "gapAlignmentSide": "right",
-        "minSeedCover": 0.1
+  "schemaVersion": "3.0.0",
+  "alignmentParams": {
+    "excessBandwidth": 9,
+    "terminalBandwidth": 100,
+    "allowedMismatches": 4,
+    "gapAlignmentSide": "right",
+    "minSeedCover": 0.1
+  },
+  "files": {
+    "examples": "example_sequences.fasta",
+    "genomeAnnotation": "annotation.gff",
+    "pathogenJson": "pathogen.json",
+    "reference": "reference.fasta",
+    "treeJson": "tree.json"
+  },
+  "qc": {
+    "privateMutations": {
+      "enabled": true,
+      "typical": 5,
+      "cutoff": 15,
+      "weightLabeledSubstitutions": 2,
+      "weightReversionSubstitutions": 1,
+      "weightUnlabeledSubstitutions": 1
     },
-    "files":{
-        "examples": "example_sequences.fasta",
-        "genomeAnnotation": "annotation.gff",
-        "pathogenJson": "pathogen.json",
-        "reference": "reference.fasta",
-        "treeJson": "tree.json"
-        },
-    "qc": {
-        "privateMutations": {
-          "enabled": true,
-          "typical": 5,
-          "cutoff": 15,
-          "weightLabeledSubstitutions": 2,
-          "weightReversionSubstitutions": 1,
-          "weightUnlabeledSubstitutions": 1
-        },
-        "missingData": {
-          "enabled": false,
-          "missingDataThreshold": 100,
-          "scoreBias": 10
-        },
-        "snpClusters": {
-          "enabled": false,
-          "windowSize": 100,
-          "clusterCutOff": 5,
-          "scoreWeight": 50
-        },
-        "mixedSites": {
-          "enabled": true,
-          "mixedSitesThreshold": 4
-        },
-        "frameShifts": {
-          "enabled": true
-        },
-        "stopCodons": {
-          "enabled": true,
-          "ignoredStopCodons": []
-        }
+    "missingData": {
+      "enabled": false,
+      "missingDataThreshold": 100,
+      "scoreBias": 10
     },
-    "geneOrderPreference": [
-        "HA1",
-        "HA2"
-      ]
-}
+    "snpClusters": {
+      "enabled": false,
+      "windowSize": 100,
+      "clusterCutOff": 5,
+      "scoreWeight": 50
+    },
+    "mixedSites": {
+      "enabled": true,
+      "mixedSitesThreshold": 4
+    },
+    "frameShifts": {
+      "enabled": true
+    },
+    "stopCodons": {
+      "enabled": true,
+      "ignoredStopCodons": []
+    }
+  },
+  "geneOrderPreference": [
+    "HA1",
+    "HA2"
+  ],
+  "maintenance": {
+    "website": [
+      "https://nextstrain.org",
+      "https://clades.nextstrain.org"
+    ],
+    "documentation": [
+      "https://github.com/nextstrain/seasonal-flu"
+    ],
+    "source code": [
+      "https://github.com/nextstrain/seasonal_flu"
+    ],
+    "issues": [
+      "https://github.com/nextstrain/seasonal_flu/issues"
+    ],
+    "organizations": [
+      "Nextstrain"
+    ],
+    "authors": [
+      "Nextstrain team <https://nextstrain.org>"
+    ]
+  }
+}
\ No newline at end of file
diff --git a/nextclade/scripts/merge_jsons.py b/nextclade/scripts/merge_jsons.py
index 10eb5df4..59fa0394 100644
--- a/nextclade/scripts/merge_jsons.py
+++ b/nextclade/scripts/merge_jsons.py
@@ -38,6 +38,9 @@ def get_clade_configs(name):
                                    "segment":{"value":args.segment},
                                    "reference":{"value":args.reference}}
 
+    pathogen_json['geneOrderPreference'] = {"ha": ["HA1", "HA2"], "na":["NA"]}.get(args.segment, [])
+    if args.segment in ['ha', 'na']:
+        pathogen_json['defaultGene'] = {"ha": "HA1", "na":"NA"}.get(args.segment)
 
     if len(args.clades):
         auspice_json['extensions']['nextclade']["clade_node_attrs"] =  [

From e64b4e88c18a1288b6ba6ba6f603e8380de2a0bf Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Fri, 10 Nov 2023 15:30:09 +0100
Subject: [PATCH 11/26] add outliers

---
 config/h3n2/outliers.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/config/h3n2/outliers.txt b/config/h3n2/outliers.txt
index b9c1473e..83553b5e 100644
--- a/config/h3n2/outliers.txt
+++ b/config/h3n2/outliers.txt
@@ -11,6 +11,8 @@ A/Bangladesh/3010/2020
 A/BosniaandHerzegovia/288G/2019
 A/Brazil/BA-LACEN-BA053-292045410/2022
 A/Brazil/BA-LACEN-BA071-292012660/2021
+A/Brazil/BA-LACEN-BA123-292029844/2021
+A/Brazil/BA-LACEN-BA128-292008241/2021
 A/Busan/15453/2009
 A/California/NHRC0001/2011
 A/Cambodia/NHRCC00001/2009
@@ -28,6 +30,8 @@ A/Catalonia/NSVH100570896/2017
 A/Catalonia/NSVH100731127/2017
 A/Catalonia/NSVH100748648/2018
 A/Catalonia/NSVH100751838/2018
+A/ChiangRai/NIC-P3252/2022
+A/ChiangRai/D1249/2022
 A/Chile/8266/2003
 A/Corsica/10/2009
 A/Corsica/11/2008
@@ -46,6 +50,7 @@ A/Corsica/39/2009
 A/Corsica/42/2009
 A/Corsica/7/2007
 A/Cyprus/F83/2017
+A/Dakar/922/2022
 A/Darwin/143/2020
 A/Delaware/3/2011
 A/Delaware/33/2017

From 26bf61a02fc4fb22a306aebcc58b283e149b0c0b Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Fri, 10 Nov 2023 17:26:07 +0100
Subject: [PATCH 12/26] nextclade: add READMEs and fix attributes

---
 nextclade/Snakefile                           | 14 +++++++----
 nextclade/config/pathogen.json                | 11 +++++++--
 .../h1n1pdm/ha/CY121680/README.md             | 22 ++++++++++++++++++
 .../h1n1pdm/ha/MW626062/README.md             | 22 ++++++++++++++++++
 .../h1n1pdm/ha/MW626062/pathogen.json         |  1 -
 .../h1n1pdm/na/MW626056/README.md             | 22 ++++++++++++++++++
 .../h1n1pdm/na/MW626056/pathogen.json         |  1 -
 .../dataset_config/h3n2/ha/CY163680/README.md | 22 ++++++++++++++++++
 .../h3n2/ha/CY163680/pathogen.json            |  1 -
 .../h3n2/ha/EPI1857216/README.md              | 23 +++++++++++++++++++
 .../h3n2/ha/EPI1857216/pathogen.json          |  1 -
 .../h3n2/na/EPI1857215/README.md              | 23 +++++++++++++++++++
 .../h3n2/na/EPI1857215/pathogen.json          |  1 -
 .../vic/ha/EPI1926632/README.md               | 22 ++++++++++++++++++
 .../vic/ha/EPI1926632/pathogen.json           |  1 -
 .../dataset_config/vic/ha/KX058884/README.md  | 22 ++++++++++++++++++
 .../vic/ha/KX058884/pathogen.json             |  1 -
 .../dataset_config/vic/na/CY073894/README.md  | 22 ++++++++++++++++++
 .../vic/na/CY073894/pathogen.json             |  1 -
 nextclade/scripts/merge_jsons.py              | 11 ++++++---
 20 files changed, 227 insertions(+), 17 deletions(-)
 create mode 100644 nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md
 create mode 100644 nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md
 create mode 100644 nextclade/dataset_config/h1n1pdm/na/MW626056/README.md
 create mode 100644 nextclade/dataset_config/h3n2/ha/CY163680/README.md
 create mode 100644 nextclade/dataset_config/h3n2/ha/EPI1857216/README.md
 create mode 100644 nextclade/dataset_config/h3n2/na/EPI1857215/README.md
 create mode 100644 nextclade/dataset_config/vic/ha/EPI1926632/README.md
 create mode 100644 nextclade/dataset_config/vic/ha/KX058884/README.md
 create mode 100644 nextclade/dataset_config/vic/na/CY073894/README.md

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index 426ae70d..85125b56 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -280,10 +280,13 @@ rule virus_specific_jsons:
         auspice = "build/{lineage}/{segment}/{reference}/auspice_config.json",
     params:
         clades = lambda w: [config["builds"][w.lineage][w.segment]["clade_systems"][clade].get('key', 'default')
-                            for clade in config["builds"][w.lineage][w.segment]["clade_systems"]]
+                            for clade in config["builds"][w.lineage][w.segment]["clade_systems"]],
+        reference_name = lambda w: config["builds"][w.lineage][w.segment]['refs'][w.reference]['reference_strain']
     shell:
         """
-        python3 scripts/merge_jsons.py --lineage {wildcards.lineage} --reference {wildcards.reference} \
+        python3 scripts/merge_jsons.py --lineage {wildcards.lineage} \
+            --reference {wildcards.reference} \
+            --reference-name {params.reference_name} \
             --segment {wildcards.segment} --clades {params.clades} \
             --pathogen-jsons {input.pathogen} {input.additional_pathogen} \
             --auspice-config {input.auspice_config} \
@@ -357,12 +360,14 @@ rule make_dataset:
         auspice_json="build/{lineage}/{segment}/{reference}/auspice.json",
         annotation="dataset_config/{lineage}/{segment}/{reference}/annotation.gff",
         reference="dataset_config/{lineage}/{segment}/{reference}/reference.fasta",
+        readme="dataset_config/{lineage}/{segment}/{reference}/README.md",
         pathogen_json="build/{lineage}/{segment}/{reference}/pathogen.json",
     output:
-        sequences="datasets/{lineage}/{segment}/{reference}/example_sequences.fasta",
+        sequences="datasets/{lineage}/{segment}/{reference}/sequences.fasta",
         tree="datasets/{lineage}/{segment}/{reference}/tree.json",
-        annotation="datasets/{lineage}/{segment}/{reference}/annotation.gff",
+        annotation="datasets/{lineage}/{segment}/{reference}/genome_annotation.gff3",
         reference="datasets/{lineage}/{segment}/{reference}/reference.fasta",
+        readme="datasets/{lineage}/{segment}/{reference}/README.md",
         pathogen_json="datasets/{lineage}/{segment}/{reference}/pathogen.json",
         auspice="auspice/{lineage}_{segment}_{reference}.json",
     shell:
@@ -370,6 +375,7 @@ rule make_dataset:
         cp {input.sequences} {output.sequences}
         cp {input.auspice_json} {output.tree}
         cp {input.reference} {output.reference}
+        cp {input.readme} {output.readme}
         cp {input.annotation} {output.annotation}
         cp {input.pathogen_json} {output.pathogen_json}
         cp {input.auspice_json} {output.auspice}
diff --git a/nextclade/config/pathogen.json b/nextclade/config/pathogen.json
index a6471a56..ee06a003 100644
--- a/nextclade/config/pathogen.json
+++ b/nextclade/config/pathogen.json
@@ -7,10 +7,17 @@
     "gapAlignmentSide": "right",
     "minSeedCover": 0.1
   },
+  "compatibility": {
+    "cli": "3.0.0-alpha.0",
+    "web": "3.0.0-alpha.0"
+  },
+  "defaultGene": "HA1",
   "files": {
-    "examples": "example_sequences.fasta",
-    "genomeAnnotation": "annotation.gff",
+    "changelog": "CHANGELOG.md",
+    "examples": "sequences.fasta",
+    "genomeAnnotation": "genome_annotation.gff3",
     "pathogenJson": "pathogen.json",
+    "readme": "README.md",
     "reference": "reference.fasta",
     "treeJson": "tree.json"
   },
diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md b/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md
new file mode 100644
index 00000000..90099616
--- /dev/null
+++ b/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md
@@ -0,0 +1,22 @@
+# Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/California/07/2009" (flu_h1n1pdm_ha/CY121680)
+
+
+## Dataset attributes
+
+| attribute            | value                | value friendly                           |
+| -------------------- | -------------------- | ---------------------------------------- |
+| name                 | flu_h1n1pdm_ha       | Influenza A H1N1pdm HA                   |
+| reference            | CY121680             | A/California/07/2009                      |
+
+
+## Features
+This dataset supports
+
+ * Assignment to clades and subclades based on the nomenclature defined in [github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_HA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_HA/)
+ * Identification of glycosylation motifs
+ * Sequence QC
+ * Phylogenetic placement
+
+## What is Nextclade dataset
+
+Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
diff --git a/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md b/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md
new file mode 100644
index 00000000..fc893f29
--- /dev/null
+++ b/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md
@@ -0,0 +1,22 @@
+# Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/Wisconsin/588/2019" (flu_h1n1pdm_ha/MW626062)
+
+
+## Dataset attributes
+
+| attribute            | value                | value friendly                           |
+| -------------------- | -------------------- | ---------------------------------------- |
+| name                 | flu_h1n1pdm_ha       | Influenza A H1N1pdm HA                   |
+| reference            | MW626062             | A/Wisconsin/588/2019                      |
+
+
+## Features
+This dataset supports
+
+ * Assignment to clades and subclades based on the nomenclature defined in [github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_HA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_HA/)
+ * Identification of glycosylation motifs
+ * Sequence QC
+ * Phylogenetic placement
+
+## What is Nextclade dataset
+
+Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
diff --git a/nextclade/dataset_config/h1n1pdm/ha/MW626062/pathogen.json b/nextclade/dataset_config/h1n1pdm/ha/MW626062/pathogen.json
index a9af3dfd..0fec5c9f 100644
--- a/nextclade/dataset_config/h1n1pdm/ha/MW626062/pathogen.json
+++ b/nextclade/dataset_config/h1n1pdm/ha/MW626062/pathogen.json
@@ -1,5 +1,4 @@
 {
-    "schemaVersion": "1.10.0",
     "nucMutLabelMap": {},
     "nucMutLabelMapReverse": {},
     "aaMotifs": [
diff --git a/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md b/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md
new file mode 100644
index 00000000..dbf50de6
--- /dev/null
+++ b/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md
@@ -0,0 +1,22 @@
+# Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/Wisconsin/588/2019" (flu_h1n1pdm_na/MW626056)
+
+
+## Dataset attributes
+
+| attribute            | value                | value friendly                           |
+| -------------------- | -------------------- | ---------------------------------------- |
+| name                 | flu_h1n1pdm_na       | Influenza A H1N1pdm HA                   |
+| reference            | MW626056             | A/Wisconsin/588/2019                      |
+
+
+## Features
+This dataset supports
+
+ * Assignment to clades and subclades based on the nomenclature defined in [github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_NA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_NA/)
+ * Identification of glycosylation motifs
+ * Sequence QC
+ * Phylogenetic placement
+
+## What is Nextclade dataset
+
+Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
diff --git a/nextclade/dataset_config/h1n1pdm/na/MW626056/pathogen.json b/nextclade/dataset_config/h1n1pdm/na/MW626056/pathogen.json
index 27ec895a..56945894 100644
--- a/nextclade/dataset_config/h1n1pdm/na/MW626056/pathogen.json
+++ b/nextclade/dataset_config/h1n1pdm/na/MW626056/pathogen.json
@@ -1,5 +1,4 @@
 {
-    "schemaVersion": "1.10.0",
     "nucMutLabelMap": {},
     "nucMutLabelMapReverse": {},
     "aaMotifs": [
diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/README.md b/nextclade/dataset_config/h3n2/ha/CY163680/README.md
new file mode 100644
index 00000000..4efdec2a
--- /dev/null
+++ b/nextclade/dataset_config/h3n2/ha/CY163680/README.md
@@ -0,0 +1,22 @@
+# Nextclade dataset for "Influenza A H3N2 HA" based on reference "A/Wisconsin/67/2005" (flu_h3n2_ha/CY163680)
+
+
+## Dataset attributes
+
+| attribute            | value                | value friendly                           |
+| -------------------- | -------------------- | ---------------------------------------- |
+| name                 | flu_h3n2_ha          | Influenza A H3N2 HA                      |
+| reference            | CY163680             | A/Wisconsin/67/2005                      |
+
+
+## Features
+This dataset supports
+
+ * Assignment to clades and subclades based on the nomenclature defined in [github.com/influenza-clade-nomenclature/seasonal_A-H3N2_HA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H3N2_HA/)
+ * Identification of glycosylation motifs
+ * Sequence QC
+ * Phylogenetic placement
+
+## What is Nextclade dataset
+
+Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/pathogen.json b/nextclade/dataset_config/h3n2/ha/CY163680/pathogen.json
index a9af3dfd..0fec5c9f 100644
--- a/nextclade/dataset_config/h3n2/ha/CY163680/pathogen.json
+++ b/nextclade/dataset_config/h3n2/ha/CY163680/pathogen.json
@@ -1,5 +1,4 @@
 {
-    "schemaVersion": "1.10.0",
     "nucMutLabelMap": {},
     "nucMutLabelMapReverse": {},
     "aaMotifs": [
diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md b/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md
new file mode 100644
index 00000000..07116d3c
--- /dev/null
+++ b/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md
@@ -0,0 +1,23 @@
+# Nextclade dataset for "Influenza A H3N2 HA" based on reference "A/Darwin/6/2021" (flu_h3n2_ha/EPI1857216)
+
+
+## Dataset attributes
+
+| attribute            | value                | value friendly                           |
+| -------------------- | -------------------- | ---------------------------------------- |
+| name                 | flu_h3n2_ha          | Influenza A H3N2 HA                      |
+| reference            | EPI1857216           | A/Darwin/6/2021                          |
+
+
+## Features
+This dataset supports
+
+ * Assignment to clades and subclades based on the nomenclature defined in [github.com/influenza-clade-nomenclature/seasonal_A-H3N2_HA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H3N2_HA/)
+ * Identification of glycosylation motifs
+ * Counting of mutations in the RBD
+ * Sequence QC
+ * Phylogenetic placement
+
+## What is Nextclade dataset
+
+Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/pathogen.json b/nextclade/dataset_config/h3n2/ha/EPI1857216/pathogen.json
index 6b5cd7dd..c50e5311 100644
--- a/nextclade/dataset_config/h3n2/ha/EPI1857216/pathogen.json
+++ b/nextclade/dataset_config/h3n2/ha/EPI1857216/pathogen.json
@@ -1,5 +1,4 @@
 {
-  "schemaVersion": "1.10.0",
   "nucMutLabelMap": {},
   "nucMutLabelMapReverse": {},
   "phenotypeData":[
diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/README.md b/nextclade/dataset_config/h3n2/na/EPI1857215/README.md
new file mode 100644
index 00000000..99a76b4e
--- /dev/null
+++ b/nextclade/dataset_config/h3n2/na/EPI1857215/README.md
@@ -0,0 +1,23 @@
+# Nextclade dataset for "Influenza A H3N2 NA" based on reference "A/Darwin/6/2021" (flu_h3n2_na/EPI1857216)
+
+
+## Dataset attributes
+
+| attribute            | value                | value friendly                           |
+| -------------------- | -------------------- | ---------------------------------------- |
+| name                 | flu_h3n2_na          | Influenza A H3N2 NA                      |
+| reference            | EPI1857216           | A/Darwin/6/2021                          |
+
+
+## Features
+This dataset supports
+
+ * Assignment to clades and subclades based on the nomenclature defined in [github.com/influenza-clade-nomenclature/seasonal_A-H3N2_NA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H3N2_NA/)
+ * Identification of glycosylation motifs
+ * Counting of mutations in the RBD
+ * Sequence QC
+ * Phylogenetic placement
+
+## What is Nextclade dataset
+
+Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/pathogen.json b/nextclade/dataset_config/h3n2/na/EPI1857215/pathogen.json
index a56465d3..0b454508 100644
--- a/nextclade/dataset_config/h3n2/na/EPI1857215/pathogen.json
+++ b/nextclade/dataset_config/h3n2/na/EPI1857215/pathogen.json
@@ -1,5 +1,4 @@
 {
-    "schemaVersion": "1.10.0",
     "nucMutLabelMap": {},
     "nucMutLabelMapReverse": {},
     "aaMotifs": [
diff --git a/nextclade/dataset_config/vic/ha/EPI1926632/README.md b/nextclade/dataset_config/vic/ha/EPI1926632/README.md
new file mode 100644
index 00000000..1d9e0509
--- /dev/null
+++ b/nextclade/dataset_config/vic/ha/EPI1926632/README.md
@@ -0,0 +1,22 @@
+# Nextclade dataset for "Influenza B Vic HA" based on reference "B/Austria/1359417/2021" (flu_vic_ha/EPI1926632)
+
+
+## Dataset attributes
+
+| attribute            | value                | value friendly                           |
+| -------------------- | -------------------- | ---------------------------------------- |
+| name                 | flu_vic_ha           | Influenza B Vic HA                       |
+| reference            | EPI1926632           | B/Austria/1359417/2021                   |
+
+
+## Features
+This dataset supports
+
+ * Assignment to clades and subclades based on the nomenclature defined in [github.com/influenza-clade-nomenclature/seasonal_B-Vic_HA/](https://github.com/influenza-clade-nomenclature/seasonal_B-Vic_HA/)
+ * Identification of glycosylation motifs
+ * Sequence QC
+ * Phylogenetic placement
+
+## What is Nextclade dataset
+
+Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
diff --git a/nextclade/dataset_config/vic/ha/EPI1926632/pathogen.json b/nextclade/dataset_config/vic/ha/EPI1926632/pathogen.json
index d80db6a5..cd3daefa 100644
--- a/nextclade/dataset_config/vic/ha/EPI1926632/pathogen.json
+++ b/nextclade/dataset_config/vic/ha/EPI1926632/pathogen.json
@@ -1,5 +1,4 @@
 {
-  "schemaVersion": "1.10.0",
   "nucMutLabelMap": {},
   "nucMutLabelMapReverse": {},
   "aaMotifs": [
diff --git a/nextclade/dataset_config/vic/ha/KX058884/README.md b/nextclade/dataset_config/vic/ha/KX058884/README.md
new file mode 100644
index 00000000..d28dc35c
--- /dev/null
+++ b/nextclade/dataset_config/vic/ha/KX058884/README.md
@@ -0,0 +1,22 @@
+# Nextclade dataset for "Influenza B Vic HA" based on reference "B/Brisbane/60/2008" (flu_vic_ha/KX058884)
+
+
+## Dataset attributes
+
+| attribute            | value                | value friendly                           |
+| -------------------- | -------------------- | ---------------------------------------- |
+| name                 | flu_vic_ha           | Influenza B Vic HA                       |
+| reference            | KX058884             | B/Brisbane/60/2008                       |
+
+
+## Features
+This dataset supports
+
+ * Assignment to clades and subclades based on the nomenclature defined in [github.com/influenza-clade-nomenclature/seasonal_B-Vic_HA/](https://github.com/influenza-clade-nomenclature/seasonal_B-Vic_HA/)
+ * Identification of glycosylation motifs
+ * Sequence QC
+ * Phylogenetic placement
+
+## What is Nextclade dataset
+
+Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
diff --git a/nextclade/dataset_config/vic/ha/KX058884/pathogen.json b/nextclade/dataset_config/vic/ha/KX058884/pathogen.json
index a9af3dfd..0fec5c9f 100644
--- a/nextclade/dataset_config/vic/ha/KX058884/pathogen.json
+++ b/nextclade/dataset_config/vic/ha/KX058884/pathogen.json
@@ -1,5 +1,4 @@
 {
-    "schemaVersion": "1.10.0",
     "nucMutLabelMap": {},
     "nucMutLabelMapReverse": {},
     "aaMotifs": [
diff --git a/nextclade/dataset_config/vic/na/CY073894/README.md b/nextclade/dataset_config/vic/na/CY073894/README.md
new file mode 100644
index 00000000..c7cbf354
--- /dev/null
+++ b/nextclade/dataset_config/vic/na/CY073894/README.md
@@ -0,0 +1,22 @@
+# Nextclade dataset for "Influenza B Vic HA" based on reference "B/Brisbane/60/2008" (flu_vic_na/CY073894)
+
+
+## Dataset attributes
+
+| attribute            | value                | value friendly                           |
+| -------------------- | -------------------- | ---------------------------------------- |
+| name                 | flu_vic_na           | Influenza B Vic HA                       |
+| reference            | CY073894             | B/Brisbane/60/2008                       |
+
+
+## Features
+This dataset supports
+
+ * Assignment to clades and subclades based on the nomenclature defined in [github.com/influenza-clade-nomenclature/seasonal_B-Vic_NA/](https://github.com/influenza-clade-nomenclature/seasonal_B-Vic_NA/)
+ * Identification of glycosylation motifs
+ * Sequence QC
+ * Phylogenetic placement
+
+## What is Nextclade dataset
+
+Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
diff --git a/nextclade/dataset_config/vic/na/CY073894/pathogen.json b/nextclade/dataset_config/vic/na/CY073894/pathogen.json
index 3bfeb859..efc674fd 100644
--- a/nextclade/dataset_config/vic/na/CY073894/pathogen.json
+++ b/nextclade/dataset_config/vic/na/CY073894/pathogen.json
@@ -1,5 +1,4 @@
 {
-    "schemaVersion": "1.10.0",
     "nucMutLabelMap": {},
     "nucMutLabelMapReverse": {},
     "aaMotifs": [
diff --git a/nextclade/scripts/merge_jsons.py b/nextclade/scripts/merge_jsons.py
index 59fa0394..1ffc48d2 100644
--- a/nextclade/scripts/merge_jsons.py
+++ b/nextclade/scripts/merge_jsons.py
@@ -19,6 +19,7 @@ def get_clade_configs(name):
     parser.add_argument("--lineage", required=True, help="attribute info")
     parser.add_argument("--segment", required=True, help="attribute info")
     parser.add_argument("--reference", required=True, help="attribute info")
+    parser.add_argument("--reference-name", required=True, help="attribute info")
     parser.add_argument("--auspice-config", required=True, help="Auspice config JSON with coloring entry to have scale added to")
     parser.add_argument("--pathogen-jsons", nargs='+', required=True, help="name of the coloring field in the Auspice config JSON")
     parser.add_argument("--clades", nargs="+", required=True, help="list of values to assign colors to")
@@ -34,9 +35,13 @@ def get_clade_configs(name):
     with open(args.auspice_config) as fh:
         auspice_json = json.load(fh)
 
-    pathogen_json['attributes'] = {"name":{"value":args.lineage},
-                                   "segment":{"value":args.segment},
-                                   "reference":{"value":args.reference}}
+    flu_type = {'h3n2':'A', 'h1n1pdm':'A', 'vic':'B', 'yam':'B'}[args.lineage]
+    lineage_name = {'h3n2':'H3N2', 'h1n1pdm':'H1N1pdm', 'vic':'Victoria', 'yam':'Yamagata'}[args.lineage]
+
+    pathogen_json['attributes'] = {"name": f"Influenza {flu_type} {lineage_name} {args.segment.upper()}",
+                                   "segment": args.segment,
+                                   "reference accession": args.reference,
+                                   "reference name": args.reference_name}
 
     pathogen_json['geneOrderPreference'] = {"ha": ["HA1", "HA2"], "na":["NA"]}.get(args.segment, [])
     if args.segment in ['ha', 'na']:

From 34172fde8edea546feffad903e7c0623a711fb21 Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Fri, 10 Nov 2023 21:29:12 +0100
Subject: [PATCH 13/26] nextclade: add changelog

---
 nextclade/Snakefile               | 15 +++++++++++++++
 nextclade/config/config_dict.yaml | 10 ++++++----
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index 85125b56..2813e8b4 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -43,6 +43,18 @@ rule download_clades:
         curl {params.source_tsv} > {output.clade_tsv}
         """
 
+rule download_changelog:  # fetch the per-lineage/segment changelog from config['clade_repo'] + the build's "changelog" path
+    message:
+        "Downloading changelog for {wildcards.lineage} {wildcards.segment} from {params.source} -> {output}"
+    output:
+        changelog = "data/{lineage}_{segment}_changelog.md"
+    params:
+        source=lambda w: config['clade_repo'] + config["builds"][w.lineage][w.segment]["changelog"],
+    shell:
+        """
+        curl --fail --location {params.source} > {output.changelog}
+        """
+
 rule offset_clades:
     input:
         rules.download_clades.output,
@@ -361,6 +373,7 @@ rule make_dataset:
         annotation="dataset_config/{lineage}/{segment}/{reference}/annotation.gff",
         reference="dataset_config/{lineage}/{segment}/{reference}/reference.fasta",
         readme="dataset_config/{lineage}/{segment}/{reference}/README.md",
+        changelog="data/{lineage}_{segment}_changelog.md",
         pathogen_json="build/{lineage}/{segment}/{reference}/pathogen.json",
     output:
         sequences="datasets/{lineage}/{segment}/{reference}/sequences.fasta",
@@ -370,12 +383,14 @@ rule make_dataset:
         readme="datasets/{lineage}/{segment}/{reference}/README.md",
         pathogen_json="datasets/{lineage}/{segment}/{reference}/pathogen.json",
         auspice="auspice/{lineage}_{segment}_{reference}.json",
+        changelog="datasets/{lineage}/{segment}/{reference}/CHANGELOG.md",
     shell:
         """
         cp {input.sequences} {output.sequences}
         cp {input.auspice_json} {output.tree}
         cp {input.reference} {output.reference}
         cp {input.readme} {output.readme}
+        cp {input.changelog} {output.changelog}
         cp {input.annotation} {output.annotation}
         cp {input.pathogen_json} {output.pathogen_json}
         cp {input.auspice_json} {output.auspice}
diff --git a/nextclade/config/config_dict.yaml b/nextclade/config/config_dict.yaml
index ede743b5..31d99c8b 100644
--- a/nextclade/config/config_dict.yaml
+++ b/nextclade/config/config_dict.yaml
@@ -3,6 +3,7 @@ clade_repo: "https://raw.githubusercontent.com/influenza-clade-nomenclature/"
 builds:
   h1n1pdm:
     ha:
+      changelog: "seasonal_A-H1N1pdm_HA/main/CHANGELOG.md"
       clade_systems:
         clade:
           url: "seasonal_A-H1N1pdm_HA/main/.auto-generated/clades-long.tsv"
@@ -21,10 +22,10 @@ builds:
         MW626062:
           filter: "--min-date 2019 --probabilistic-sampling  --exclude-where country='south_korea' --group-by year --min-length 1500 --subsample-max-sequences 1500"
           clade_offset: 0
-          clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h1n1pdm/ha/clades-long.tsv"
           reference_EPI_ISL: EPI1812046
           reference_strain: A/Wisconsin/588/2019
     na:
+      changelog: "seasonal_A-H1N1pdm_NA/main/CHANGELOG.md"
       clade_systems:
         clade:
           url: "seasonal_A-H1N1pdm_NA/main/.auto-generated/subclades.tsv"
@@ -36,6 +37,7 @@ builds:
           reference_strain: A/Wisconsin/588/2019
   h3n2:
     ha:
+      changelog: "seasonal_A-H3N2_HA/main/CHANGELOG.md"
       clade_systems:
         clade:
           url: "seasonal_A-H3N2_HA/main/.auto-generated/clades-long.tsv"
@@ -49,16 +51,15 @@ builds:
         EPI1857216:
           filter: "--min-date 2019 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000"
           clade_offset: -17
-          clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv"
           reference_EPI_ISL: EPI1857216
           reference_strain: A/Darwin/6/2021
         CY163680:
           filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000"
           clade_offset: 0
-          clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/h3n2/ha/clades-long.tsv"
           reference_EPI_ISL: EPI545340
           reference_strain: A/Wisconsin/67/2005-egg
     na:
+      changelog: "seasonal_A-H3N2_NA/main/CHANGELOG.md"
       clade_systems:
         clade:
           url: "seasonal_A-H3N2_NA/main/.auto-generated/subclades.tsv"
@@ -70,6 +71,7 @@ builds:
           reference_strain: A/Darwin/6/2021
   vic:
     ha:
+      changelog: "seasonal_B-Vic_HA/main/CHANGELOG.md"
       clade_systems:
         clade:
           url: "seasonal_B-Vic_HA/main/.auto-generated/clades.tsv"
@@ -80,10 +82,10 @@ builds:
         KX058884:
           filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000"
           clade_offset: 0
-          clade_url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/vic/ha/clades.tsv"
           reference_EPI_ISL: EPI696970
           reference_strain: B/Brisbane/60/2008-egg
     na:
+      changelog: "seasonal_B-Vic_NA/main/CHANGELOG.md"
       clade_systems:
         clade:
           url: "seasonal_B-Vic_NA/main/.auto-generated/subclades.tsv"

From 99497fd4d3236c4ce9d2549b1167c2cbbc8baacf Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Sun, 12 Nov 2023 15:45:37 +0100
Subject: [PATCH 14/26] nextclade: add to dataset readme

---
 .../dataset_config/h1n1pdm/ha/CY121680/README.md    | 13 +++++++++++++
 .../dataset_config/h1n1pdm/ha/MW626062/README.md    | 11 +++++++++++
 .../dataset_config/h1n1pdm/na/MW626056/README.md    |  9 +++++++++
 nextclade/dataset_config/h3n2/ha/CY163680/README.md | 12 ++++++++++++
 .../dataset_config/h3n2/ha/EPI1857216/README.md     | 11 +++++++++++
 .../dataset_config/h3n2/na/EPI1857215/README.md     |  8 ++++++++
 nextclade/dataset_config/vic/ha/KX058884/README.md  | 11 +++++++++++
 nextclade/dataset_config/vic/na/CY073894/README.md  |  9 +++++++++
 8 files changed, 84 insertions(+)

diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md b/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md
index 90099616..b8b0874a 100644
--- a/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md
+++ b/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md
@@ -1,5 +1,7 @@
 # Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/California/07/2009" (flu_h1n1pdm_ha/CY121680)
 
+This dataset uses an older reference sequence (A/California/07/2009) and recent sequences will differ at a large number of positions from this reference.
+For the analysis of currently circulating viruses, the dataset using A/Wisconsin/588/2019 as reference might be more appropriate.
 
 ## Dataset attributes
 
@@ -17,6 +19,17 @@ This dataset supports
  * Sequence QC
  * Phylogenetic placement
 
+## Clades of seasonal influenza viruses
+
+The WHO Collaborating centers define "clades" as genetic groups of viruses with signature mutations to facilitate discussion of circulating diversity of the viruses.
+Clade demarcations do not always coincide with significantly different antigenic properties of the viruses.
+Clade names are structured as _Number-Letter_ binomials separated by periods as in `6B.1A.5a.2a.1`. These sometimes get shortened by omission of leading binomials like `5a.2a.1`.
+
+In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups.
+These follow a Pango-like nomenclature consisting of a letter followed by numbers separated by periods as in `C.1.2`.
+The leading letter is an alias of a previous name.
+
+
 ## What is Nextclade dataset
 
 Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
diff --git a/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md b/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md
index fc893f29..03c8197a 100644
--- a/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md
+++ b/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md
@@ -1,5 +1,6 @@
 # Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/Wisconsin/588/2019" (flu_h1n1pdm_ha/MW626062)
 
+This dataset uses a recent reference sequence (A/Wisconsin/588/2019) and is suitable for the analysis of currently circulating viruses.
 
 ## Dataset attributes
 
@@ -17,6 +18,16 @@ This dataset supports
  * Sequence QC
  * Phylogenetic placement
 
+## Clades of seasonal influenza viruses
+
+The WHO Collaborating centers define "clades" as genetic groups of viruses with signature mutations to facilitate discussion of circulating diversity of the viruses.
+Clade demarcations do not always coincide with significantly different antigenic properties of the viruses.
+Clade names are structured as _Number-Letter_ binomials separated by periods as in `6B.1A.5a.2a.1`. These sometimes get shortened by omission of leading binomials like `5a.2a.1`.
+
+In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups.
+These follow a Pango-like nomenclature consisting of a letter followed by numbers separated by periods as in `C.1.2`.
+The leading letter is an alias of a previous name.
+
 ## What is Nextclade dataset
 
 Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
diff --git a/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md b/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md
index dbf50de6..10c38f0d 100644
--- a/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md
+++ b/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md
@@ -1,5 +1,6 @@
 # Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/Wisconsin/588/2019" (flu_h1n1pdm_na/MW626056)
 
+This dataset uses a recent reference sequence (A/Wisconsin/588/2019) and is suitable for the analysis of currently circulating viruses.
 
 ## Dataset attributes
 
@@ -8,6 +9,14 @@
 | name                 | flu_h1n1pdm_na       | Influenza A H1N1pdm HA                   |
 | reference            | MW626056             | A/Wisconsin/588/2019                      |
 
+## Clades of seasonal influenza viruses
+
+The WHO Collaborating centers **do not** define "clades" for the neuraminidase segment.
+
+This dataset focuses on "subclades" that in analogy to the HA segment are defined to break down diversity at high resolution and allow following the spread of different viral groups.
+These follow a Pango-like nomenclature consisting of a letter followed by numbers separated by periods as in `C.1.2`.
+The leading letter is an alias of a previous name.
+
 
 ## Features
 This dataset supports
diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/README.md b/nextclade/dataset_config/h3n2/ha/CY163680/README.md
index 4efdec2a..c91fff85 100644
--- a/nextclade/dataset_config/h3n2/ha/CY163680/README.md
+++ b/nextclade/dataset_config/h3n2/ha/CY163680/README.md
@@ -1,5 +1,7 @@
 # Nextclade dataset for "Influenza A H3N2 HA" based on reference "A/Wisconsin/67/2005" (flu_h3n2_ha/CY163680)
 
+This dataset uses an older reference sequence (A/Wisconsin/67/2005) and recent sequences will differ at a large number of positions from this reference.
+For the analysis of currently circulating viruses, the dataset using A/Darwin/6/2021 as reference might be more appropriate.
 
 ## Dataset attributes
 
@@ -17,6 +19,16 @@ This dataset supports
  * Sequence QC
  * Phylogenetic placement
 
+## Clades of seasonal influenza viruses
+
+The WHO Collaborating centers define "clades" as genetic groups of viruses with signature mutations to facilitate discussion of circulating diversity of the viruses.
+Clade demarcation do not always coincide with significantly different antigenic properties of the viruses.
+Clade names are structured as _Number-Letter_ binomials separated by periods as in `3C.2a1b.2a.2a.1a`. These sometimes get shortened by omission of leading binomials like `2a.1`.
+
+In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups.
+These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `G.1.3.1`.
+The leading letter is an alias of a previous name.
+
 ## What is Nextclade dataset
 
 Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md b/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md
index 07116d3c..a2ced41f 100644
--- a/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md
+++ b/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md
@@ -1,5 +1,6 @@
 # Nextclade dataset for "Influenza A H3N2 HA" based on reference "A/Darwin/6/2021" (flu_h3n2_ha/EPI1857216)
 
+This dataset uses a recent reference sequence (A/Darwin/6/2021) and is suitable for the analysis of circulating viruses.
 
 ## Dataset attributes
 
@@ -18,6 +19,16 @@ This dataset supports
  * Sequence QC
  * Phylogenetic placement
 
+## Clades of seasonal influenza viruses
+
+The WHO Collaborating centers define "clades" as genetic groups of viruses with signature mutations to facilitate discussion of circulating diversity of the viruses.
+Clade demarcation do not always coincide with significantly different antigenic properties of the viruses.
+Clade names are structured as _Number-Letter_ binomials separated by periods as in `3C.2a1b.2a.2a.1a`. These sometimes get shortened by omission of leading binomials like `2a.1`.
+
+In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups.
+These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `G.1.3.1`.
+The leading letter is an alias of a previous name.
+
 ## What is Nextclade dataset
 
 Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/README.md b/nextclade/dataset_config/h3n2/na/EPI1857215/README.md
index 99a76b4e..4855ee8f 100644
--- a/nextclade/dataset_config/h3n2/na/EPI1857215/README.md
+++ b/nextclade/dataset_config/h3n2/na/EPI1857215/README.md
@@ -8,6 +8,14 @@
 | name                 | flu_h3n2_na          | Influenza A H3N2 NA                      |
 | reference            | EPI1857216           | A/Darwin/6/2021                          |
 
+## Clades of seasonal influenza viruses
+
+The WHO Collaborating centers **do not** define "clades" for the neuraminidase segment.
+
+This dataset focuses on "subclades" that in analogy to the HA segment are defined to break down diversity at high resolution and allow following the spread of different viral groups.
+These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `C.1.2`.
+The leading letter is an alias of a previous name.
+
 
 ## Features
 This dataset supports
diff --git a/nextclade/dataset_config/vic/ha/KX058884/README.md b/nextclade/dataset_config/vic/ha/KX058884/README.md
index d28dc35c..fe88397a 100644
--- a/nextclade/dataset_config/vic/ha/KX058884/README.md
+++ b/nextclade/dataset_config/vic/ha/KX058884/README.md
@@ -1,5 +1,6 @@
 # Nextclade dataset for "Influenza B Vic HA" based on reference "B/Brisbane/60/2008" (flu_vic_ha/KX058884)
 
+The reference sequence for this datasets precedes the deletions at positions 162ff in the HA1 protein of the virus and thus follows the canonical numbering of aminoacids in the protein.
 
 ## Dataset attributes
 
@@ -17,6 +18,16 @@ This dataset supports
  * Sequence QC
  * Phylogenetic placement
 
+## Clades of seasonal influenza viruses
+
+The WHO Collaborating centers define "clades" as genetic groups of viruses with signature mutations to facilitate discussion of circulating diversity of the viruses.
+Clade demarcation do not always coincide with significantly different antigenic properties of the viruses.
+Clade names are structured as _Number-Letter_ binomials (with exceptions) separated by periods as in `V1A.3a.2`.
+
+In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups.
+These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `A.3.2`.
+The leading letter is an alias of a previous name.
+
 ## What is Nextclade dataset
 
 Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
diff --git a/nextclade/dataset_config/vic/na/CY073894/README.md b/nextclade/dataset_config/vic/na/CY073894/README.md
index c7cbf354..dae95036 100644
--- a/nextclade/dataset_config/vic/na/CY073894/README.md
+++ b/nextclade/dataset_config/vic/na/CY073894/README.md
@@ -17,6 +17,15 @@ This dataset supports
  * Sequence QC
  * Phylogenetic placement
 
+## Clades of seasonal influenza viruses
+
+The WHO Collaborating centers **do not** define "clades" for the neuraminidase segment.
+
+This dataset focuses on "subclades" that in analogy to the HA segment are defined to break down diversity at high resolution and allow following the spread of different viral groups.
+These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `A.2.3`.
+The leading letter is an alias of a previous name.
+
+
 ## What is Nextclade dataset
 
 Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html

From 57cb639884dccaf30432eb32cb7fc80d3a465133 Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Sun, 12 Nov 2023 15:52:58 +0100
Subject: [PATCH 15/26] nextclade: add links to README

---
 .../h1n1pdm/ha/CY121680/README.md              |  2 ++
 .../h1n1pdm/ha/MW626062/README.md              |  1 +
 .../h1n1pdm/na/MW626056/README.md              | 16 +++++++++-------
 .../dataset_config/h3n2/ha/CY163680/README.md  |  3 ++-
 .../h3n2/na/EPI1857215/README.md               | 18 ++++++++++--------
 .../dataset_config/vic/ha/KX058884/README.md   |  1 +
 .../dataset_config/vic/na/CY073894/README.md   |  1 +
 7 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md b/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md
index b8b0874a..b14250ef 100644
--- a/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md
+++ b/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md
@@ -28,6 +28,8 @@ Clade names are structured as _Number-Letter_ binomials separated by periods as
 In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups.
 These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `C.1.2`.
 The leading letter is an alias of a previous name.
+Details of the nomenclature system can be found at [github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_HA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_HA/).
+
 
 
 ## What is Nextclade dataset
diff --git a/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md b/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md
index 03c8197a..91cc32d3 100644
--- a/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md
+++ b/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md
@@ -27,6 +27,7 @@ Clade names are structured as _Number-Letter_ binomials separated by periods as
 In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups.
 These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `C.1.2`.
 The leading letter is an alias of a previous name.
+Details of the nomenclature system can be found at [github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_HA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_HA/).
 
 ## What is Nextclade dataset
 
diff --git a/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md b/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md
index 10c38f0d..9ecb25f9 100644
--- a/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md
+++ b/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md
@@ -9,13 +9,6 @@ This dataset uses a recent reference sequence (A/Wisconsin/588/2019) and is suit
 | name                 | flu_h1n1pdm_na       | Influenza A H1N1pdm HA                   |
 | reference            | MW626056             | A/Wisconsin/588/2019                      |
 
-## Clades of seasonal influenza viruses
-
-The WHO Collaborating centers **do not** define "clades" for the neuraminidase segment.
-
-This dataset focuses on "subclades" that in analogy to the HA segment are defined to break down diversity at high resolution and allow following the spread of different viral groups.
-These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `C.1.2`.
-The leading letter is an alias of a previous name.
 
 
 ## Features
@@ -26,6 +19,15 @@ This dataset supports
  * Sequence QC
  * Phylogenetic placement
 
+## Clades of seasonal influenza viruses
+
+The WHO Collaborating centers **do not** define "clades" for the neuraminidase segment.
+
+This dataset focuses on "subclades" that in analogy to the HA segment are defined to break down diversity at high resolution and allow following the spread of different viral groups.
+These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `C.1.2`.
+The leading letter is an alias of a previous name.
+Details of the nomenclature system can be found at [github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_NA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H1N1pdm_NA/).
+
 ## What is Nextclade dataset
 
 Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/README.md b/nextclade/dataset_config/h3n2/ha/CY163680/README.md
index c91fff85..fa391c39 100644
--- a/nextclade/dataset_config/h3n2/ha/CY163680/README.md
+++ b/nextclade/dataset_config/h3n2/ha/CY163680/README.md
@@ -23,11 +23,12 @@ This dataset supports
 
 The WHO Collaborating centers define "clades" as genetic groups of viruses with signature mutations to facilitate discussion of circulating diversity of the viruses.
 Clade demarcation do not always coincide with significantly different antigenic properties of the viruses.
-Clade names are structured as _Number-Letter_ binomials separated by periods as in `3C.2a1b.2a.2a.1a`. These sometimes get shortened by omission of leading binomials like `2a.1`.
+Clade names are structured as _Number-Letter_ binomials (with exceptions) separated by periods as in `3C.2a1b.2a.2a.1a`. These sometimes get shortened by omission of leading binomials like `2a.1`.
 
 In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups.
 These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `G.1.3.1`.
 The leading letter is an alias of a previous name.
+Details of the nomenclature system can be found at [github.com/influenza-clade-nomenclature/seasonal_A-H3N2_HA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H3N2_HA/).
 
 ## What is Nextclade dataset
 
diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/README.md b/nextclade/dataset_config/h3n2/na/EPI1857215/README.md
index 4855ee8f..a23b9233 100644
--- a/nextclade/dataset_config/h3n2/na/EPI1857215/README.md
+++ b/nextclade/dataset_config/h3n2/na/EPI1857215/README.md
@@ -8,14 +8,6 @@
 | name                 | flu_h3n2_na          | Influenza A H3N2 NA                      |
 | reference            | EPI1857216           | A/Darwin/6/2021                          |
 
-## Clades of seasonal influenza viruses
-
-The WHO Collaborating centers **do not** define "clades" for the neuraminidase segment.
-
-This dataset focuses on "subclades" that in analogy to the HA segment are defined to break down diversity at high resolution and allow following the spread of different viral groups.
-These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `C.1.2`.
-The leading letter is an alias of a previous name.
-
 
 ## Features
 This dataset supports
@@ -26,6 +18,16 @@ This dataset supports
  * Sequence QC
  * Phylogenetic placement
 
+## Clades of seasonal influenza viruses
+
+The WHO Collaborating centers **do not** define "clades" for the neuraminidase segment.
+
+This dataset focuses on "subclades" that in analogy to the HA segment are defined to break down diversity at high resolution and allow following the spread of different viral groups.
+These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `C.1.2`.
+The leading letter is an alias of a previous name.
+Details of the nomenclature system can be found at [github.com/influenza-clade-nomenclature/seasonal_A-H3N2_NA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H3N2_NA/).
+
+
 ## What is Nextclade dataset
 
 Read more about Nextclade datasets in Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
diff --git a/nextclade/dataset_config/vic/ha/KX058884/README.md b/nextclade/dataset_config/vic/ha/KX058884/README.md
index fe88397a..0f117264 100644
--- a/nextclade/dataset_config/vic/ha/KX058884/README.md
+++ b/nextclade/dataset_config/vic/ha/KX058884/README.md
@@ -27,6 +27,7 @@ Clade names are structured as _Number-Letter_ binomials (with exceptions) separa
 In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups.
 These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `A.3.2`.
 The leading letter is an alias of a previous name.
+Details of the nomenclature system can be found at [github.com/influenza-clade-nomenclature/seasonal_B-Vic_HA/](https://github.com/influenza-clade-nomenclature/seasonal_B-Vic_HA/).
 
 ## What is Nextclade dataset
 
diff --git a/nextclade/dataset_config/vic/na/CY073894/README.md b/nextclade/dataset_config/vic/na/CY073894/README.md
index dae95036..f2dba64e 100644
--- a/nextclade/dataset_config/vic/na/CY073894/README.md
+++ b/nextclade/dataset_config/vic/na/CY073894/README.md
@@ -24,6 +24,7 @@ The WHO Collaborating centers **do not** define "clades" for the neuraminidase s
 This dataset focuses on "subclades" that in analogy to the HA segment are defined to break down diversity at high resolution and allow following the spread of different viral groups.
 These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `A.2.3`.
 The leading letter is an alias of a previous name.
+Details of the nomenclature system can be found at [github.com/influenza-clade-nomenclature/seasonal_B-Vic_NA/](https://github.com/influenza-clade-nomenclature/seasonal_B-Vic_NA/).
 
 
 ## What is Nextclade dataset

From 2893860e47a2cd00419439b6dc668b2c204ca6ea Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Sun, 12 Nov 2023 15:54:53 +0100
Subject: [PATCH 16/26] nextclade: cp command

---
 nextclade/Snakefile                                   | 3 +++
 nextclade/dataset_config/h3n2/ha/EPI1857216/README.md | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index 2813e8b4..31b898fe 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -413,3 +413,6 @@ rule clean_all:
         rm -rf datasets
         rm -rf data/
         """
+
+
+# cp datasets/h3n2/ha/EPI1857216/* ../../../nextstrain/nextclade_data/data/nextstrain/flu/h3n2/ha/EPI1857216
\ No newline at end of file
diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md b/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md
index a2ced41f..1bb9a2c8 100644
--- a/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md
+++ b/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md
@@ -23,11 +23,12 @@ This dataset supports
 
 The WHO Collaborating centers define "clades" as genetic groups of viruses with signature mutations to facilitate discussion of circulating diversity of the viruses.
 Clade demarcation do not always coincide with significantly different antigenic properties of the viruses.
-Clade names are structured as _Number-Letter_ binomials separated by periods as in `3C.2a1b.2a.2a.1a`. These sometimes get shortened by omission of leading binomials like `2a.1`.
+Clade names are structured as _Number-Letter_ binomials (with exceptions) separated by periods as in `3C.2a1b.2a.2a.1a`. These sometimes get shortened by omission of leading binomials like `2a.1`.
 
 In addition to these clades, "subclades" are defined to break down diversity at higher resolution and allow following the spread of different viral groups.
 These follow a Pango-like nomenclature consisting of a letter followed by a numbers separated by periods as in `G.1.3.1`.
 The leading letter is an alias of a previous name.
+Details of the nomenclature system can be found at [github.com/influenza-clade-nomenclature/seasonal_A-H3N2_HA/](https://github.com/influenza-clade-nomenclature/seasonal_A-H3N2_HA/).
 
 ## What is Nextclade dataset
 

From 7d4c044a70a2658f87fa19d7c4d72e471044b3cc Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Sun, 19 Nov 2023 12:55:04 +0100
Subject: [PATCH 17/26] add outliers

---
 config/h1n1pdm/outliers.txt | 1 +
 config/vic/outliers.txt     | 1 +
 config/yam/outliers.txt     | 1 +
 3 files changed, 3 insertions(+)

diff --git a/config/h1n1pdm/outliers.txt b/config/h1n1pdm/outliers.txt
index 310ebd47..52ff0d4c 100644
--- a/config/h1n1pdm/outliers.txt
+++ b/config/h1n1pdm/outliers.txt
@@ -69,6 +69,7 @@ A/Louisiana/EVTL-100047/2022
 A/Malaysia/2142295/2009
 A/Malaysia/2142299/2009
 A/Malaysia/2143696/2009
+A/Manitoba/2/2021
 A/Minnesota/3/2008
 A/Minnesota/33/2014
 A/Minnesota/46/2015
diff --git a/config/vic/outliers.txt b/config/vic/outliers.txt
index 3fda1fbf..1f46722e 100644
--- a/config/vic/outliers.txt
+++ b/config/vic/outliers.txt
@@ -3,6 +3,7 @@ B/Alagoas/4386/2023
 B/Auckland/1/2008
 B/Bangkok/SI17/2012
 B/Bangkok/SI58/2012
+B/Bari/53/2023
 B/Brisbane/14/2016
 B/Brisbine/33/2008
 B/California/87/2017-egg
diff --git a/config/yam/outliers.txt b/config/yam/outliers.txt
index b8a3642c..1dad5be1 100644
--- a/config/yam/outliers.txt
+++ b/config/yam/outliers.txt
@@ -9,6 +9,7 @@ B/Kolkata/N-2047/2009
 B/Nairobi/351/2005
 B/NewHampshire/1/2016
 B/Norway/2155/2017
+B/Palermo/2/2011
 B/Riyadh/3/2010
 B/Riyadh/4/2010
 B/Thailand/CU-B10303/2014

From 5ed45edc4c41dc15295d731e9d6dc026de1ca482 Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Sun, 19 Nov 2023 12:56:05 +0100
Subject: [PATCH 18/26] nextclade: refine workflow, READMEs, add Yam dataset

---
 nextclade/Snakefile                           | 46 ++++++++++++++++---
 nextclade/config/config_dict.yaml             | 24 +++++++++-
 .../h1n1pdm/ha/CY121680/README.md             |  4 +-
 .../h1n1pdm/ha/MW626062/README.md             |  4 +-
 .../h1n1pdm/na/MW626056/README.md             |  4 +-
 .../dataset_config/h3n2/ha/CY163680/README.md |  4 +-
 .../h3n2/ha/EPI1857216/README.md              |  4 +-
 .../h3n2/na/EPI1857215/README.md              |  4 +-
 .../vic/ha/EPI1926632/README.md               |  4 +-
 .../dataset_config/vic/ha/KX058884/README.md  |  4 +-
 .../dataset_config/vic/na/CY073894/README.md  |  4 +-
 .../dataset_config/yam/ha/JN993010/README.md  | 28 +++++++++++
 .../yam/ha/JN993010/pathogen.json             | 25 ++++++++++
 nextclade/dataset_config/yam/includes.txt     |  1 +
 nextclade/scripts/merge_jsons.py              |  7 ++-
 15 files changed, 141 insertions(+), 26 deletions(-)
 create mode 100644 nextclade/dataset_config/yam/ha/JN993010/README.md
 create mode 100644 nextclade/dataset_config/yam/ha/JN993010/pathogen.json
 create mode 100644 nextclade/dataset_config/yam/includes.txt

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index 31b898fe..bd1acee0 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -37,13 +37,13 @@ rule download_clades:
     output:
         clade_tsv = "data/{clade}_{lineage}_{segment}_{reference}_raw.tsv"
     params:
-        source_tsv=lambda w: config['clade_repo'] + config["builds"][w.lineage][w.segment]["clade_systems"][w.clade]['url'],
+        source_tsv=lambda w: (config['clade_repo'] if w.lineage!='yam' else '') + config["builds"][w.lineage][w.segment]["clade_systems"][w.clade]['url'],
     shell:
         """
         curl {params.source_tsv} > {output.clade_tsv}
         """
 
-rule download_changelog:
+rule download_changelog_clades:
     message:
         "Downloading clade definitions for {wildcards.lineage} from {params.source} -> {output}"
     output:
@@ -55,6 +55,18 @@ rule download_changelog:
         curl {params.source} > {output.changelog}
         """
 
+rule download_changelog_dataset:
+    message:
+        "Downloading clade definitions for {wildcards.lineage} from {params.source} -> {output}"
+    output:
+        changelog = "data/{lineage}_{segment}_{reference}_dataset-changelog.md"
+    params:
+        source=lambda w: f"{config['dataset_repo']}/{w.lineage}/{w.segment}/{w.reference}/CHANGELOG.md",
+    shell:
+        """
+        curl {params.source} > {output.changelog}
+        """
+
 rule offset_clades:
     input:
         rules.download_clades.output,
@@ -103,8 +115,8 @@ rule subsample:
         nextclade_include="dataset_config/{lineage}/includes.txt",
         exclude="../config/{lineage}/outliers.txt",
     output:
-        sampled_sequences="build/{lineage}/{segment}/{reference}/subsample.fasta",
-        sampled_strains="build/{lineage}/{segment}/{reference}/subsample.txt",
+        sampled_sequences="build/{lineage}/{segment}/{reference}/subsample_tmp.fasta",
+        sampled_strains="build/{lineage}/{segment}/{reference}/subsample_tmp.txt",
     params:
         filter_arguments=lambda w: config["builds"][w.lineage][w.segment]["refs"][
             w.reference
@@ -125,6 +137,27 @@ rule subsample:
             --output-strains {output.sampled_strains}
         """
 
+rule subsample_harddate:
+    input:
+        sequences=rules.subsample.output.sampled_sequences,
+        enriched_metadata=rules.parse.output.metadata,
+    output:
+        sampled_sequences="build/{lineage}/{segment}/{reference}/subsample.fasta",
+        sampled_strains="build/{lineage}/{segment}/{reference}/subsample.txt",
+    params:
+        hardmin=lambda w: config["builds"][w.lineage][w.segment]["refs"][
+            w.reference
+        ]["hardmin_date"],
+    shell:
+        """
+        augur filter \
+            --sequences {input.sequences} \
+            --metadata {input.enriched_metadata} \
+            --min-date {params.hardmin} \
+            --output {output.sampled_sequences} \
+            --output-strains {output.sampled_strains}
+        """
+
 rule align:
     input:
         sequences="build/{lineage}/{segment}/{reference}/subsample.fasta",
@@ -180,7 +213,7 @@ rule root:
             --tree {input.tree} \
             --sequence-length 1500 \
             --dates {input.metadata} \
-            --clock-filter 4 \
+            --clock-filter 3 \
             --clock-filter-method local \
             --outdir {params.outdir}
         cp {params.outdir}/rerooted.newick {output.tree}
@@ -373,6 +406,7 @@ rule make_dataset:
         annotation="dataset_config/{lineage}/{segment}/{reference}/annotation.gff",
         reference="dataset_config/{lineage}/{segment}/{reference}/reference.fasta",
         readme="dataset_config/{lineage}/{segment}/{reference}/README.md",
+        dataset_changelog="data/{lineage}_{segment}_{reference}_dataset-changelog.md",
         changelog="data/{lineage}_{segment}_changelog.md",
         pathogen_json="build/{lineage}/{segment}/{reference}/pathogen.json",
     output:
@@ -390,7 +424,7 @@ rule make_dataset:
         cp {input.auspice_json} {output.tree}
         cp {input.reference} {output.reference}
         cp {input.readme} {output.readme}
-        cp {input.changelog} {output.changelog}
+        cp {input.dataset_changelog} {output.changelog}
         cp {input.annotation} {output.annotation}
         cp {input.pathogen_json} {output.pathogen_json}
         cp {input.auspice_json} {output.auspice}
diff --git a/nextclade/config/config_dict.yaml b/nextclade/config/config_dict.yaml
index 31d99c8b..ff866cc6 100644
--- a/nextclade/config/config_dict.yaml
+++ b/nextclade/config/config_dict.yaml
@@ -1,4 +1,5 @@
 clade_repo: "https://raw.githubusercontent.com/influenza-clade-nomenclature/"
+dataset_repo: "https://raw.githubusercontent.com/nextstrain/nextclade_data/master/data/nextstrain/flu"
 
 builds:
   h1n1pdm:
@@ -17,11 +18,13 @@ builds:
         CY121680:
           filter: "--min-date 2014 --probabilistic-sampling  --exclude-where country='south_korea' --group-by year --min-length 1500  --subsample-max-sequences 1500"
           clade_offset: 0
+          hardmin_date: 2009
           reference_EPI_ISL: EPI1583287
           reference_strain: A/California/7/2009-egg #TODO: exclude
         MW626062:
           filter: "--min-date 2019 --probabilistic-sampling  --exclude-where country='south_korea' --group-by year --min-length 1500 --subsample-max-sequences 1500"
           clade_offset: 0
+          hardmin_date: 2015
           reference_EPI_ISL: EPI1812046
           reference_strain: A/Wisconsin/588/2019
     na:
@@ -33,6 +36,7 @@ builds:
         MW626056:
           filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 1500"
           clade_offset: 0
+          hardmin_date: 2015
           reference_EPI_ISL: EPI1812046
           reference_strain: A/Wisconsin/588/2019
   h3n2:
@@ -51,11 +55,13 @@ builds:
         EPI1857216:
           filter: "--min-date 2019 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000"
           clade_offset: -17
+          hardmin_date: 2015
           reference_EPI_ISL: EPI1857216
           reference_strain: A/Darwin/6/2021
         CY163680:
           filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000"
           clade_offset: 0
+          hardmin_date: 2004
           reference_EPI_ISL: EPI545340
           reference_strain: A/Wisconsin/67/2005-egg
     na:
@@ -67,6 +73,7 @@ builds:
         EPI1857215:
           filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 1500"
           clade_offset: 4
+          hardmin_date: 2015
           reference_EPI_ISL: EPI1857215
           reference_strain: A/Darwin/6/2021
   vic:
@@ -82,6 +89,7 @@ builds:
         KX058884:
           filter: "--min-date 2014 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 2000"
           clade_offset: 0
+          hardmin_date: 2007
           reference_EPI_ISL: EPI696970
           reference_strain: B/Brisbane/60/2008-egg
     na:
@@ -91,7 +99,21 @@ builds:
           url: "seasonal_B-Vic_NA/main/.auto-generated/subclades.tsv"
       refs:
         CY073894:
-          filter: "--min-date 2019 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 2000"
+          filter: "--min-date 2014 --probabilistic-sampling --group-by year region --min-length 1400 --subsample-max-sequences 2000"
           clade_offset: -30
+          hardmin_date: 2007
           reference_EPI_ISL: CY073894
           reference_strain: B/Brisbane/60/2008
+  yam:
+    ha:
+      changelog: "seasonal_B-Yam_HA/main/CHANGELOG.md"
+      clade_systems:
+        clade:
+          url: "https://raw.githubusercontent.com/nextstrain/seasonal-flu/master/config/yam/ha/clades.tsv"
+      refs:
+        JN993010:
+          filter: "--min-date 2005 --probabilistic-sampling --group-by year --min-length 1500 --subsample-max-sequences 1000"
+          clade_offset: 0
+          hardmin_date: 2003
+          reference_EPI_ISL: EPI271600
+          reference_strain: B/Wisconsin/01/2010
diff --git a/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md b/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md
index b14250ef..648c0027 100644
--- a/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md
+++ b/nextclade/dataset_config/h1n1pdm/ha/CY121680/README.md
@@ -1,4 +1,4 @@
-# Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/California/07/2009" (flu_h1n1pdm_ha/CY121680)
+# Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/California/07/2009" (flu/h1n1pdm/ha/CY121680)
 
 This dataset uses an older reference sequence (A/California/07/2009) and recent sequences will differ at a large number of positions from this reference.
 For the analysis of currently circulating viruses, the dataset using A/Wisconsin/588/2019 as reference might be more appropriate.
@@ -7,7 +7,7 @@ For the analysis of currently circulating viruses, the dataset using A/Wisconsin
 
 | attribute            | value                | value friendly                           |
 | -------------------- | -------------------- | ---------------------------------------- |
-| name                 | flu_h1n1pdm_ha       | Influenza A H1N1pdm HA                   |
+| name                 | flu/h1n1pdm/ha       | Influenza A H1N1pdm HA                   |
 | reference            | CY121680             | A/California/07/2009                      |
 
 
diff --git a/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md b/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md
index 91cc32d3..b3830dec 100644
--- a/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md
+++ b/nextclade/dataset_config/h1n1pdm/ha/MW626062/README.md
@@ -1,4 +1,4 @@
-# Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/Wisconsin/588/2019" (flu_h1n1pdm_ha/MW626062)
+# Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/Wisconsin/588/2019" (flu/h1n1pdm/ha/MW626062)
 
 This dataset uses a recent reference sequence (A/Wisconsin/588/2019) and is suitable for the analysis of currently circulating viruses.
 
@@ -6,7 +6,7 @@ This dataset uses a recent reference sequence (A/Wisconsin/588/2019) and is suit
 
 | attribute            | value                | value friendly                           |
 | -------------------- | -------------------- | ---------------------------------------- |
-| name                 | flu_h1n1pdm_ha       | Influenza A H1N1pdm HA                   |
+| name                 | flu/h1n1pdm/ha       | Influenza A H1N1pdm HA                   |
 | reference            | MW626062             | A/Wisconsin/588/2019                      |
 
 
diff --git a/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md b/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md
index 9ecb25f9..d618a759 100644
--- a/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md
+++ b/nextclade/dataset_config/h1n1pdm/na/MW626056/README.md
@@ -1,4 +1,4 @@
-# Nextclade dataset for "Influenza A H1N1pdm HA" based on reference "A/Wisconsin/588/2019" (flu_h1n1pdm_na/MW626056)
+# Nextclade dataset for "Influenza A H1N1pdm NA" based on reference "A/Wisconsin/588/2019" (flu/h1n1pdm/na/MW626056)
 
 This dataset uses a recent reference sequence (A/Wisconsin/588/2019) and is suitable for the analysis of currently circulating viruses.
 
@@ -6,7 +6,7 @@ This dataset uses a recent reference sequence (A/Wisconsin/588/2019) and is suit
 
 | attribute            | value                | value friendly                           |
 | -------------------- | -------------------- | ---------------------------------------- |
-| name                 | flu_h1n1pdm_na       | Influenza A H1N1pdm HA                   |
+| name                 | flu/h1n1pdm/na       | Influenza A H1N1pdm NA                   |
 | reference            | MW626056             | A/Wisconsin/588/2019                      |
 
 
diff --git a/nextclade/dataset_config/h3n2/ha/CY163680/README.md b/nextclade/dataset_config/h3n2/ha/CY163680/README.md
index fa391c39..decd5059 100644
--- a/nextclade/dataset_config/h3n2/ha/CY163680/README.md
+++ b/nextclade/dataset_config/h3n2/ha/CY163680/README.md
@@ -1,4 +1,4 @@
-# Nextclade dataset for "Influenza A H3N2 HA" based on reference "A/Wisconsin/67/2005" (flu_h3n2_ha/CY163680)
+# Nextclade dataset for "Influenza A H3N2 HA" based on reference "A/Wisconsin/67/2005" (flu/h3n2/ha/CY163680)
 
 This dataset uses an older reference sequence (A/Wisconsin/67/2005) and recent sequences will differ at a large number of positions from this reference.
 For the analysis of currently circulating viruses, the dataset using A/Darwin/6/2021 as reference might be more appropriate.
@@ -7,7 +7,7 @@ For the analysis of currently circulating viruses, the dataset using A/Darwin/6/
 
 | attribute            | value                | value friendly                           |
 | -------------------- | -------------------- | ---------------------------------------- |
-| name                 | flu_h3n2_ha          | Influenza A H3N2 HA                      |
+| name                 | flu/h3n2/ha          | Influenza A H3N2 HA                      |
 | reference            | CY163680             | A/Wisconsin/67/2005                      |
 
 
diff --git a/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md b/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md
index 1bb9a2c8..3972b34c 100644
--- a/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md
+++ b/nextclade/dataset_config/h3n2/ha/EPI1857216/README.md
@@ -1,4 +1,4 @@
-# Nextclade dataset for "Influenza A H3N2 HA" based on reference "A/Darwin/6/2021" (flu_h3n2_ha/EPI1857216)
+# Nextclade dataset for "Influenza A H3N2 HA" based on reference "A/Darwin/6/2021" (flu/h3n2/ha/EPI1857216)
 
 This dataset uses a recent reference sequence (A/Darwin/6/2021) and is suitable for the analysis of circulating viruses.
 
@@ -6,7 +6,7 @@ This dataset uses a recent reference sequence (A/Darwin/6/2021) and is suitable
 
 | attribute            | value                | value friendly                           |
 | -------------------- | -------------------- | ---------------------------------------- |
-| name                 | flu_h3n2_ha          | Influenza A H3N2 HA                      |
+| name                 | flu/h3n2/ha          | Influenza A H3N2 HA                      |
 | reference            | EPI1857216           | A/Darwin/6/2021                          |
 
 
diff --git a/nextclade/dataset_config/h3n2/na/EPI1857215/README.md b/nextclade/dataset_config/h3n2/na/EPI1857215/README.md
index a23b9233..f501ed8a 100644
--- a/nextclade/dataset_config/h3n2/na/EPI1857215/README.md
+++ b/nextclade/dataset_config/h3n2/na/EPI1857215/README.md
@@ -1,11 +1,11 @@
-# Nextclade dataset for "Influenza A H3N2 NA" based on reference "A/Darwin/6/2021" (flu_h3n2_na/EPI1857216)
+# Nextclade dataset for "Influenza A H3N2 NA" based on reference "A/Darwin/6/2021" (flu/h3n2/na/EPI1857215)
 
 
 ## Dataset attributes
 
 | attribute            | value                | value friendly                           |
 | -------------------- | -------------------- | ---------------------------------------- |
-| name                 | flu_h3n2_na          | Influenza A H3N2 NA                      |
+| name                 | flu/h3n2/na          | Influenza A H3N2 NA                      |
 | reference            | EPI1857216           | A/Darwin/6/2021                          |
 
 
diff --git a/nextclade/dataset_config/vic/ha/EPI1926632/README.md b/nextclade/dataset_config/vic/ha/EPI1926632/README.md
index 1d9e0509..11e6b3e7 100644
--- a/nextclade/dataset_config/vic/ha/EPI1926632/README.md
+++ b/nextclade/dataset_config/vic/ha/EPI1926632/README.md
@@ -1,11 +1,11 @@
-# Nextclade dataset for "Influenza B Vic HA" based on reference "B/Austria/1359417/2021" (flu_vic_ha/EPI1926632)
+# Nextclade dataset for "Influenza B Vic HA" based on reference "B/Austria/1359417/2021" (flu/vic/ha/EPI1926632)
 
 
 ## Dataset attributes
 
 | attribute            | value                | value friendly                           |
 | -------------------- | -------------------- | ---------------------------------------- |
-| name                 | flu_vic_ha           | Influenza B Vic HA                       |
+| name                 | flu/vic/ha           | Influenza B Vic HA                       |
 | reference            | EPI1926632           | B/Austria/1359417/2021                   |
 
 
diff --git a/nextclade/dataset_config/vic/ha/KX058884/README.md b/nextclade/dataset_config/vic/ha/KX058884/README.md
index 0f117264..e001f7d4 100644
--- a/nextclade/dataset_config/vic/ha/KX058884/README.md
+++ b/nextclade/dataset_config/vic/ha/KX058884/README.md
@@ -1,4 +1,4 @@
-# Nextclade dataset for "Influenza B Vic HA" based on reference "B/Brisbane/60/2008" (flu_vic_ha/KX058884)
+# Nextclade dataset for "Influenza B Vic HA" based on reference "B/Brisbane/60/2008" (flu/vic/ha/KX058884)
 
 The reference sequence for this datasets precedes the deletions at positions 162ff in the HA1 protein of the virus and thus follows the canonical numbering of aminoacids in the protein.
 
@@ -6,7 +6,7 @@ The reference sequence for this datasets precedes the deletions at positions 162
 
 | attribute            | value                | value friendly                           |
 | -------------------- | -------------------- | ---------------------------------------- |
-| name                 | flu_vic_ha           | Influenza B Vic HA                       |
+| name                 | flu/vic/ha           | Influenza B Vic HA                       |
 | reference            | KX058884             | B/Brisbane/60/2008                       |
 
 
diff --git a/nextclade/dataset_config/vic/na/CY073894/README.md b/nextclade/dataset_config/vic/na/CY073894/README.md
index f2dba64e..f24e20aa 100644
--- a/nextclade/dataset_config/vic/na/CY073894/README.md
+++ b/nextclade/dataset_config/vic/na/CY073894/README.md
@@ -1,11 +1,11 @@
-# Nextclade dataset for "Influenza B Vic HA" based on reference "B/Brisbane/60/2008" (flu_vic_na/CY073894)
+# Nextclade dataset for "Influenza B Vic NA" based on reference "B/Brisbane/60/2008" (flu/vic/na/CY073894)
 
 
 ## Dataset attributes
 
 | attribute            | value                | value friendly                           |
 | -------------------- | -------------------- | ---------------------------------------- |
-| name                 | flu_vic_na           | Influenza B Vic HA                       |
+| name                 | flu/vic/na           | Influenza B Vic NA                       |
 | reference            | CY073894             | B/Brisbane/60/2008                       |
 
 
diff --git a/nextclade/dataset_config/yam/ha/JN993010/README.md b/nextclade/dataset_config/yam/ha/JN993010/README.md
new file mode 100644
index 00000000..eecc77ef
--- /dev/null
+++ b/nextclade/dataset_config/yam/ha/JN993010/README.md
@@ -0,0 +1,28 @@
+# Nextclade dataset for "Influenza B Yam HA" based on reference "B/Wisconsin/01/2010" (flu/yam/ha/JN993010)
+
+B/Yamagata viruses have not been observed since 2020. This dataset is provided for analysis of old sequences or suspected Yamagata sequences.
+
+## Dataset attributes
+
+| attribute            | value                | value friendly                           |
+| -------------------- | -------------------- | ---------------------------------------- |
+| name                 | flu/yam/ha           | Influenza B Yam HA                       |
+| reference            | JN993010             | B/Wisconsin/01/2010                      |
+
+
+## Features
+This dataset supports
+
+ * Assignment to clades
+ * Identification of glycosylation motifs
+ * Sequence QC
+ * Phylogenetic placement
+
+## Clades of seasonal influenza viruses
+
+The WHO Collaborating centers define "clades" as genetic groups of viruses with signature mutations to facilitate discussion of circulating diversity of the viruses.
+Clade demarcations do not always coincide with significantly different antigenic properties of the viruses.
+
+## What is a Nextclade dataset
+
+Read more about Nextclade datasets in the Nextclade documentation: https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html
diff --git a/nextclade/dataset_config/yam/ha/JN993010/pathogen.json b/nextclade/dataset_config/yam/ha/JN993010/pathogen.json
new file mode 100644
index 00000000..0fec5c9f
--- /dev/null
+++ b/nextclade/dataset_config/yam/ha/JN993010/pathogen.json
@@ -0,0 +1,25 @@
+{
+    "nucMutLabelMap": {},
+    "nucMutLabelMapReverse": {},
+    "aaMotifs": [
+      {
+        "name": "glycosylation",
+        "nameShort": "Glyc.",
+        "nameFriendly": "Glycosylation",
+        "description": "N-linked glycosylation motifs (N-X-S/T with X any amino acid other than P)",
+        "includeGenes": [
+          {
+            "gene":"HA1",
+            "ranges":[]
+          },
+          {
+            "gene":"HA2",
+            "ranges":[{"begin":0, "end":186}]
+          }
+        ],
+        "motifs": [
+          "N[^P][ST]"
+        ]
+      }
+    ]
+}
\ No newline at end of file
diff --git a/nextclade/dataset_config/yam/includes.txt b/nextclade/dataset_config/yam/includes.txt
new file mode 100644
index 00000000..824bfcf5
--- /dev/null
+++ b/nextclade/dataset_config/yam/includes.txt
@@ -0,0 +1 @@
+B/Phuket/3073/2013
diff --git a/nextclade/scripts/merge_jsons.py b/nextclade/scripts/merge_jsons.py
index 1ffc48d2..6037a612 100644
--- a/nextclade/scripts/merge_jsons.py
+++ b/nextclade/scripts/merge_jsons.py
@@ -7,11 +7,16 @@ def get_clade_configs(name):
         "displayName": "Abbreviated clade name",
         "description": "For recent subclades with long names, the prefix describing their history is omitted."
     },
+    "short-clades": {
+        "name": "short-clade",
+        "displayName": "Abbreviated clade name",
+        "description": "For recent subclades with long names, the prefix describing their history is omitted."
+    },
     "subclade": {
         "name": "subclade",
         "displayName": "Subclade",
         "description": "Experimental fine-grained subclade annotation."
-    }}.get(name, {'name':name, "displayName":name})
+    }}.get(name, {'name':name, "displayName":name, "description":""})
 
 
 if __name__=="__main__":

From 4ea5bb2d74929af6f8820c5bb04522cc3a88e4c1 Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Sun, 19 Nov 2023 13:00:47 +0100
Subject: [PATCH 19/26] nextclade: update auspice config

---
 nextclade/config/auspice_config.json | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/nextclade/config/auspice_config.json b/nextclade/config/auspice_config.json
index 418a8348..ed218b66 100644
--- a/nextclade/config/auspice_config.json
+++ b/nextclade/config/auspice_config.json
@@ -1,9 +1,8 @@
 {
-  "title": "Genomic epidemiology of Influenza",
-  "build_url": "https://github.com/neherlab/nextclade_data_workflows",
+  "title": "Nextclade reference dataset for seasonal influenza viruses",
+  "build_url": "https://github.com/nextstrain/seasonal-flu",
   "maintainers": [
-    { "name": "Cornelius Roemer", "url": "https://neherlab.org" },
-    { "name": "Richard Neher", "url": "https://neherlab.org" }
+    { "name": "Nextstrain team", "url": "https://nextstrain.org" }
   ],
   "extensions": {
     "nextclade": {

From 931fb345c6963e2212979390d5fd1d64a56857d3 Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Sun, 19 Nov 2023 13:05:01 +0100
Subject: [PATCH 20/26] nextclade: update comments and messages

---
 nextclade/Snakefile               | 6 ++----
 nextclade/config/config_dict.yaml | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index bd1acee0..7f830b15 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -45,7 +45,7 @@ rule download_clades:
 
 rule download_changelog_clades:
     message:
-        "Downloading clade definitions for {wildcards.lineage} from {params.source} -> {output}"
+        "Downloading nomenclature changelog for {wildcards.lineage} from {params.source} -> {output}"
     output:
         changelog = "data/{lineage}_{segment}_changelog.md"
     params:
@@ -57,7 +57,7 @@ rule download_changelog_clades:
 
 rule download_changelog_dataset:
     message:
-        "Downloading clade definitions for {wildcards.lineage} from {params.source} -> {output}"
+        "Downloading previous dataset changelog for {wildcards.lineage} from {params.source} -> {output}"
     output:
         changelog = "data/{lineage}_{segment}_{reference}_dataset-changelog.md"
     params:
@@ -448,5 +448,3 @@ rule clean_all:
         rm -rf data/
         """
 
-
-# cp datasets/h3n2/ha/EPI1857216/* ../../../nextstrain/nextclade_data/data/nextstrain/flu/h3n2/ha/EPI1857216
\ No newline at end of file
diff --git a/nextclade/config/config_dict.yaml b/nextclade/config/config_dict.yaml
index ff866cc6..2ee812bb 100644
--- a/nextclade/config/config_dict.yaml
+++ b/nextclade/config/config_dict.yaml
@@ -15,7 +15,7 @@ builds:
           url: "seasonal_A-H1N1pdm_HA/main/.auto-generated/subclades.tsv"
           key: "subclade"
       refs:
-        CY121680:
+        CY121680: # exclude South Korean genomes because of sequencing artifacts close to the start of HA
           filter: "--min-date 2014 --probabilistic-sampling  --exclude-where country='south_korea' --group-by year --min-length 1500  --subsample-max-sequences 1500"
           clade_offset: 0
           hardmin_date: 2009

From dc141617e9bc29576128dafba824bc3c754bd7d2 Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Sun, 19 Nov 2023 13:08:09 +0100
Subject: [PATCH 21/26] nextclade: generalize 'genes' function

---
 nextclade/Snakefile | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index 7f830b15..2158fa58 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -8,6 +8,13 @@ wildcard_constraints:
     segment = r'pb2|pb1|pa|ha|np|na|mp|ns',
     reference="[^_/]+",
 
+
+def genes(w):
+    return {
+        'ha': ["SigPep", "HA1", "HA2"],
+        'na': ["N"]
+    }.get(w.segment, [])
+
 def all_builds(w):
     builds = []
     for lineage in config["builds"]:
@@ -102,11 +109,6 @@ rule parse:
             --output-sequences {output.sequences}
         """
 
-
-def genes(w):
-    if w.segment=='ha': return ["SigPep", "HA1", "HA2"]
-    if w.segment=='na': return ["NA"]
-
 rule subsample:
     input:
         aligned_sequences=rules.parse.output.sequences,

From c33b654961eb8acbf3aef7cfe37339261f1a20b1 Mon Sep 17 00:00:00 2001
From: Richard Neher <richard.neher@unibas.ch>
Date: Sun, 19 Nov 2023 13:10:07 +0100
Subject: [PATCH 22/26] nextclade: remove redundant env variable def

---
 nextclade/Snakefile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index 2158fa58..fe0a1de0 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -370,7 +370,6 @@ rule export:
         date=datetime.datetime.utcnow().strftime("%Y-%m-%d"),
     shell:
         """
-        AUGUR_RECURSION_LIMIT=10000 \
         augur export v2 \
             --tree {input.tree} \
             --metadata {input.metadata} \

From a72cc56781a5aad8ad2d64c9da3c22c92a316fae Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 20 Nov 2023 16:45:48 -0800
Subject: [PATCH 23/26] Fix NA gene name

---
 nextclade/Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index fe0a1de0..7daaeee2 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -12,7 +12,7 @@ wildcard_constraints:
 def genes(w):
     return {
         'ha': ["SigPep", "HA1", "HA2"],
-        'na': ["N"]
+        'na': ["NA"]
     }.get(w.segment, [])
 
 def all_builds(w):

From 17eaf0b3e2da2278246b011bb8f1a06ac871b3e1 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 20 Nov 2023 16:46:03 -0800
Subject: [PATCH 24/26] Check for outliers file before trying to open it

Outliers don't always exist from the treetime clock command, so check
before trying to open and avoid a file not found error.
---
 nextclade/Snakefile | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index 7daaeee2..ef54d280 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -229,15 +229,20 @@ rule prune_outliers:
     params:
         outliers = "build/{lineage}/{segment}/{reference}/tt_out/outliers.tsv"
     run:
+        from pathlib import Path
         import pandas as pd
         from Bio import Phylo
-        outliers = pd.read_csv(params.outliers, sep='\t', index_col=0)
+
         T = Phylo.read(input.tree, 'newick')
 
-        for n in outliers.index:
-            if outliers.loc[n,"given_date"]>2020 and ('-egg' not in n):
-                print("prune", n)
-                T.prune(n)
+        if Path(params.outliers).exists():
+            outliers = pd.read_csv(params.outliers, sep='\t', index_col=0)
+
+            for n in outliers.index:
+                if outliers.loc[n,"given_date"]>2020 and ('-egg' not in n):
+                    print("prune", n)
+                    T.prune(n)
+
         Phylo.write(T, output.tree, "newick")
 
 # refine while keeping the root

From c8a068433def78b06b7e5d9abd76cd9dc20b34d4 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 20 Nov 2023 17:13:34 -0800
Subject: [PATCH 25/26] Change variable name for genes expected by Nextclade

---
 nextclade/Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index ef54d280..73d54fa6 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -168,7 +168,7 @@ rule align:
     output:
         alignment="build/{lineage}/{segment}/{reference}/align.aligned.fasta"
     params:
-        outdir=lambda w: f"build/{w.lineage}/{w.segment}/{w.reference}/aligned.gene.{{gene}}.fasta",
+        outdir=lambda w: f"build/{w.lineage}/{w.segment}/{w.reference}/aligned.gene.{{cds}}.fasta",
         nextclade_bin = "./nextclade_v3"
     threads: 3
     shell:

From 43438be5fd04f81c35051dfbcf2e33da967f0c4c Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Tue, 21 Nov 2023 10:14:00 -0800
Subject: [PATCH 26/26] Set a default config for Nextclade workflow

Use the standard approach of defining a default config for the Nextclade
workflow and save everyone a couple of characters of typing.
---
 nextclade/Snakefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/nextclade/Snakefile b/nextclade/Snakefile
index 73d54fa6..910e3512 100644
--- a/nextclade/Snakefile
+++ b/nextclade/Snakefile
@@ -1,3 +1,5 @@
+configfile: "config/config_dict.yaml"
+
 import datetime