Skip to content

Commit

Permalink
Merge pull request #117 from nextstrain/keep-reference-outliers
Browse files Browse the repository at this point in the history
Keep reference strains in tree even if they are outliers
  • Loading branch information
huddlej authored Aug 29, 2023
2 parents 8621068 + 3386bf6 commit 3b049da
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 1 deletion.
8 changes: 7 additions & 1 deletion scripts/flag_outliers.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def prepare_tree(T):
parser.add_argument('--reroot', action="store_true", help="reroot the tree")
parser.add_argument('--optimize', action="store_true", help="optimize sigma and mu")
parser.add_argument('--dates', type=str, help='csv/tsv file with dates for each sequence')
parser.add_argument('--keep-strains', type=str, help='a list of strains to keep in the output tree regardless of outlier status (i.e., reference strains that need to be retained in the build)')
parser.add_argument('--output-outliers', type=str, help='file for outliers')
parser.add_argument('--output-tree', type=str, help='file for pruned tree')

Expand Down Expand Up @@ -153,9 +154,14 @@ def prepare_tree(T):
df.to_csv(args.output_outliers, index=False, sep='\t')

if args.output_tree:
keep_strains = set()
if args.keep_strains:
with open(args.keep_strains, "r", encoding="utf-8") as fh:
keep_strains = {line.strip() for line in fh}

from Bio import Phylo
T = tt.tree
for r, row in df.iterrows():
if row['diagnosis']!='bad_date':
if row['diagnosis']!='bad_date' and row["sequence"] not in keep_strains:
T.prune(row['sequence'])
Phylo.write(T, args.output_tree, 'newick')
3 changes: 3 additions & 0 deletions workflow/snakemake_rules/core.smk
Original file line number Diff line number Diff line change
Expand Up @@ -146,13 +146,16 @@ rule prune_outliers:
output:
tree = build_dir + "/{build_name}/{segment}/tree_without_outgroup_clean.nwk",
outliers = build_dir + "/{build_name}/{segment}/outliers.tsv"
params:
keep_strains_argument=lambda wildcards: "--keep-strains " + config["builds"][wildcards.build_name]["include"] if "include" in config["builds"][wildcards.build_name] else "",
shell:
"""
python3 scripts/flag_outliers.py \
--tree {input.tree:q} \
--aln {input.aln} \
--dates {input.metadata} \
--cutoff 4.0 \
{params.keep_strains_argument} \
--output-tree {output.tree:q} --output-outliers {output.outliers} 2>&1 | tee {log}
"""

Expand Down

0 comments on commit 3b049da

Please sign in to comment.