Skip to content

Commit

Permalink
Merge pull request #29: Update usage of "accession" as the ID column
Browse files Browse the repository at this point in the history
  • Loading branch information
victorlin authored Sep 14, 2023
2 parents 0729bcd + d123d60 commit 29b1c29
Show file tree
Hide file tree
Showing 8 changed files with 24 additions and 51 deletions.
2 changes: 1 addition & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ include: "workflow/snakemake_rules/glycosylation.smk"
include: "workflow/snakemake_rules/clades.smk"


if config.get("deploy_url"):
if "deploy_url" in config:
include: "workflow/snakemake_rules/nextstrain_automation.smk"

rule clean:
Expand Down
2 changes: 1 addition & 1 deletion config/configfile.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ exclude: "config/outliers.txt"
description: "config/description.md"

strain_id_field: "accession"
display_strain_field: "strain_original"
display_strain_field: "strain"

subtypes: ['a', 'b']

Expand Down
2 changes: 1 addition & 1 deletion ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def _get_all_targets(wildcards):
)
elif len(remote_file_names) != len(set(remote_file_names)):
print(f"Skipping file upload for {target!r} because there are duplicate remote file names.")
elif not config.get("s3_dst"):
elif "s3_dst" not in config:
print(f"Skipping file upload for {target!r} because the destintion was not defined.")
else:
all_targets.extend(
Expand Down
6 changes: 4 additions & 2 deletions scripts/set_final_strain_name.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pandas as pd
import json, argparse
from augur.io import read_metadata

def replace_name_recursive(node, lookup):
if node["name"] in lookup:
Expand All @@ -17,14 +18,15 @@ def replace_name_recursive(node, lookup):

parser.add_argument('--input-auspice-json', type=str, required=True, help="input auspice_json")
parser.add_argument('--metadata', type=str, required=True, help="input data")
parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
parser.add_argument('--display-strain-name', type=str, required=True, help="field to use as strain name in auspice")
parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON")
args = parser.parse_args()

metadata = pd.read_csv(args.metadata, sep='\t')
metadata = read_metadata(args.metadata, id_columns=args.metadata_id_columns)
name_lookup = {}
for ri, row in metadata.iterrows():
strain_id = row['strain']
strain_id = row.name
name_lookup[strain_id] = args.display_strain_name if pd.isna(row[args.display_strain_name]) else row[args.display_strain_name]

with open(args.input_auspice_json, 'r') as fh:
Expand Down
25 changes: 0 additions & 25 deletions scripts/wrangle_metadata.py

This file was deleted.

2 changes: 1 addition & 1 deletion workflow/envs/nextstrain.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,6 @@ dependencies:
- pip
- pip:
- awscli==1.18.45
- nextstrain-augur==11.3.0
- nextstrain-augur==22.2.0
- nextstrain-cli==1.16.5
- rethinkdb==2.3.0.post6
28 changes: 10 additions & 18 deletions workflow/snakemake_rules/core.smk
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,6 @@ This part of the workflow expects input files
metadata = "data/metadata.tsv"
'''

rule wrangle_metadata:
input:
metadata="data/{a_or_b}/metadata.tsv",
output:
metadata="data/{a_or_b}/metadata_by_accession.tsv",
params:
strain_id=lambda w: config.get("strain_id_field", "strain"),
shell:
"""
python3 scripts/wrangle_metadata.py --metadata {input.metadata} \
--strain-id {params.strain_id} \
--output {output.metadata}
"""

rule index_sequences:
message:
"""
Expand Down Expand Up @@ -64,21 +50,23 @@ rule filter:
input:
sequences = "data/{a_or_b}/sequences.fasta",
reference = "config/{a_or_b}reference.gbk",
metadata = "data/{a_or_b}/metadata_by_accession.tsv",
metadata = "data/{a_or_b}/metadata.tsv",
sequence_index = rules.index_sequences.output,
exclude = config['exclude']
output:
sequences = build_dir + "/{a_or_b}/{build_name}/filtered.fasta"
params:
group_by = config["filter"]["group_by"],
min_coverage = lambda w: f'{w.build_name}_coverage>{config["filter"]["min_coverage"].get(w.build_name, 10000)}',
subsample_max_sequences = lambda w: config["filter"]["subsample_max_sequences"].get(w.build_name, 1000)
subsample_max_sequences = lambda w: config["filter"]["subsample_max_sequences"].get(w.build_name, 1000),
strain_id=config["strain_id_field"],
shell:
"""
augur filter \
--sequences {input.sequences} \
--sequence-index {input.sequence_index} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--exclude {input.exclude} \
--output {output.sequences} \
--group-by {params.group_by} \
Expand Down Expand Up @@ -196,13 +184,15 @@ rule refine:
params:
coalescent = config["refine"]["coalescent"],
clock_filter_iqd = config["refine"]["clock_filter_iqd"],
date_inference = config["refine"]["date_inference"]
date_inference = config["refine"]["date_inference"],
strain_id=config["strain_id_field"],
shell:
"""
augur refine \
--tree {input.tree} \
--alignment {input.alignment} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--output-tree {output.tree} \
--output-node-data {output.node_data} \
--coalescent {params.coalescent} \
Expand Down Expand Up @@ -264,12 +254,14 @@ rule traits:
log:
"logs/{a_or_b}/traits_{build_name}_rsv.txt"
params:
columns = config["traits"]["columns"]
columns = config["traits"]["columns"],
strain_id=config["strain_id_field"],
shell:
"""
augur traits \
--tree {input.tree} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--output {output.node_data} \
--columns {params.columns} \
--confidence
Expand Down
8 changes: 6 additions & 2 deletions workflow/snakemake_rules/export.smk
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,14 @@ rule export:
auspice_json = build_dir + "/{a_or_b}/{build_name}/tree.json",
root_sequence = build_dir + "/{a_or_b}/{build_name}/tree_root-sequence.json"
params:
title = lambda w: f"RSV-{w.a_or_b.upper()} phylogeny"
title = lambda w: f"RSV-{w.a_or_b.upper()} phylogeny",
strain_id=config["strain_id_field"],
shell:
"""
augur export v2 \
--tree {input.tree} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--node-data {input.node_data} \
--title {params.title:q} \
--description {input.description} \
Expand All @@ -62,10 +64,12 @@ rule final_strain_name:
output:
auspice_json=build_dir + "/{a_or_b}/{build_name}/tree_renamed.json"
params:
display_strain_field=lambda w: config.get("display_strain_field", "strain"),
strain_id=config["strain_id_field"],
display_strain_field=config.get("display_strain_field", "strain"),
shell:
"""
python3 scripts/set_final_strain_name.py --metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--input-auspice-json {input.auspice_json} \
--display-strain-name {params.display_strain_field} \
--output {output.auspice_json}
Expand Down

0 comments on commit 29b1c29

Please sign in to comment.