Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(ancestral): make --genes optional when reconstructing translations, if omitted, translate all genes in annotation #1668

Draft
wants to merge 8 commits into
base: nextclade-compat
Choose a base branch
from
23 changes: 15 additions & 8 deletions augur/ancestral.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,11 +314,15 @@ def register_parser(parent_subparsers):
)
amino_acid_options_group.add_argument('--annotation',
help='GenBank or GFF file containing the annotation')
amino_acid_options_group.add_argument('--genes', nargs='+', action='extend', help="genes to translate (list or file containing list)")
amino_acid_options_group.add_argument('--genes', nargs='+', action='extend', help="genes to translate (list or file containing list), if not provided, all genes will be translated")
amino_acid_options_group.add_argument('--translations', type=str, help="translated alignments for each CDS/Gene. "
"Currently only supported for FASTA-input. Specify the file name via a "
"template like 'aa_sequences_%%GENE.fasta' where %%GENE will be replaced "
"by the gene name.")
amino_acid_options_group.add_argument(
'--use-nextclade-gff-parsing', action="store_true", default=False,
help="Read GFF annotations the way Nextclade does, using CDSes (including compound) and same qualifiers for gene names."
)

output_group = parser.add_argument_group(
"outputs",
Expand All @@ -345,9 +349,10 @@ def validate_arguments(args, is_vcf):
This checking shouldn't be used by downstream code to assume arguments exist, however by checking for
invalid combinations up-front we can exit quickly.
"""
aa_arguments = (args.annotation, args.genes, args.translations)
if any(aa_arguments) and not all(aa_arguments):
raise AugurError("For amino acid sequence reconstruction, you must provide an annotation file, a list of genes, and a template path to amino acid sequences.")
mandatory_aa_arguments = (args.annotation, args.translations)
all_aa_arguments = (*mandatory_aa_arguments, args.use_nextclade_gff_parsing, args.genes)
if any(all_aa_arguments) and not all(mandatory_aa_arguments):
raise AugurError("For amino acid sequence reconstruction, you must provide an annotation file and a template path to amino acid sequences.")

if args.output_sequences and args.output_vcf:
raise AugurError("Both sequence (fasta) and VCF output have been requested, but these are incompatible.")
Expand Down Expand Up @@ -426,20 +431,22 @@ def run(args):
anc_seqs['annotations'] = {'nuc': {'start': 1, 'end': len(anc_seqs['reference']['nuc']),
'strand': '+', 'type': 'source'}}

if not is_vcf and args.genes:
genes = parse_genes_argument(args.genes)
if not is_vcf and args.annotation:
genes = parse_genes_argument(args.genes) if args.genes else None

from .utils import load_features
## load features; only requested features if genes given
features = load_features(args.annotation, genes)
features = load_features(args.annotation, genes, args.use_nextclade_gff_parsing)
# Ensure the already-created nuc annotation coordinates match those parsed from the reference file
if (features['nuc'].location.start+1 != anc_seqs['annotations']['nuc']['start'] or
features['nuc'].location.end != anc_seqs['annotations']['nuc']['end']):
raise AugurError(f"The 'nuc' annotation coordinates parsed from {args.annotation!r} ({features['nuc'].location.start+1}..{features['nuc'].location.end})"
f" don't match the provided sequence data coordinates ({anc_seqs['annotations']['nuc']['start']}..{anc_seqs['annotations']['nuc']['end']}).")

print("Read in {} features from reference sequence file".format(len(features)))
for gene in genes:
for gene in features:
if gene == 'nuc':
continue
print(f"Processing gene: {gene}")
fname = args.translations.replace("%GENE", gene)
feat = features[gene]
Expand Down
204 changes: 149 additions & 55 deletions augur/utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import argparse
import Bio
import Bio.Phylo
import Bio.SeqRecord
import numpy as np
import os, json, sys
import pandas as pd
from Bio.SeqFeature import SimpleLocation, CompoundLocation, SeqFeature, FeatureLocation
from collections import defaultdict, OrderedDict
from io import RawIOBase
from textwrap import dedent
from typing import Dict
from .__version__ import __version__

from augur.data import as_file
Expand Down Expand Up @@ -166,7 +169,7 @@ def default(self, obj):
return super().default(obj)


def load_features(reference, feature_names=None):
def load_features(reference, feature_names=None, use_nextclade_gff_parsing=False):
"""
Parse a GFF/GenBank reference file. See the docstrings for _read_gff and
_read_genbank for details.
Expand All @@ -177,6 +180,8 @@ def load_features(reference, feature_names=None):
File path to GFF or GenBank (.gb) reference
feature_names : None or set or list (optional)
Restrict the genes we read to those in the set/list
use_nextclade_gff_parsing : bool (optional)
If True, parse GFF file the way Nextclade does

Returns
-------
Expand All @@ -194,6 +199,8 @@ def load_features(reference, feature_names=None):
raise AugurError(f"reference sequence file {reference!r} not found")

if '.gff' in reference.lower():
if use_nextclade_gff_parsing:
return _read_gff_like_nextclade(reference, feature_names)
return _read_gff(reference, feature_names)
else:
return _read_genbank(reference, feature_names)
Expand Down Expand Up @@ -228,7 +235,6 @@ def _read_nuc_annotation_from_gff(record, reference):
if len(sequence_regions)>1:
raise AugurError(f"Reference {reference!r} contains multiple ##sequence-region pragma lines. Augur can only handle GFF files with a single one.")
elif sequence_regions:
from Bio.SeqFeature import SeqFeature, FeatureLocation
(name, start, stop) = sequence_regions[0]
nuc['pragma'] = SeqFeature(
FeatureLocation(start, stop, strand=1),
Expand Down Expand Up @@ -259,6 +265,114 @@ def _read_nuc_annotation_from_gff(record, reference):
raise AugurError(f"Reference {reference!r} didn't define any information we can use to create the 'nuc' annotation. You can use a line with a 'record' or 'source' GFF type or a ##sequence-region pragma.")


def _load_gff_record(reference, valid_types=None) -> Bio.SeqRecord.SeqRecord:
"""
Open a GFF file and return single record.
If there are multiple records, raise AugurError.
Optionally, limit the GFF types to parse.
"""
from BCBio import GFF

limit_info = {'gff_type': valid_types} if valid_types else None

with open_file(reference) as in_handle:
# Note that `GFF.parse` doesn't always yield GFF records in the order
# one may expect, but since we raise AugurError if there are multiple
# this doesn't matter.
# TODO: Remove warning suppression after it's addressed upstream:
# <https://github.com/chapmanb/bcbb/issues/143>
import warnings

from Bio import BiopythonDeprecationWarning
warnings.simplefilter("ignore", BiopythonDeprecationWarning)
gff_entries = list(GFF.parse(in_handle, limit_info=limit_info))
warnings.simplefilter("default", BiopythonDeprecationWarning)

if len(gff_entries) == 0:
msg = f"Reference {reference!r} contains no valid data rows."
if valid_types:
msg += f" Valid GFF types (3rd column) are {', '.join(valid_types)}."
raise AugurError(msg)
if len(gff_entries) > 1:
raise AugurError(f"Reference {reference!r} contains multiple seqids (first column). Augur can only handle GFF files with a single seqid.")
return gff_entries[0]

def _lookup_feature_name_like_nextclade(feature) -> str:
# Matching Nextclade conventions (NAME_ATTRS_CDS)
# https://github.com/nextstrain/nextclade/blob/59e757fd9c9f8d8edd16cf2063d77a859d4d3b96/packages/nextclade/src/io/gff3.rs#L35-L54
QUALIFIER_PRIORITY = [ "Name", "name", "Alias", "alias", "standard_name", "old-name", "Gene", "gene", "gene_name",
"locus_tag", "product", "gene_synonym", "gb-synonym", "acronym", "gb-acronym", "protein_id", "ID"]

for qualifier in QUALIFIER_PRIORITY:
if qualifier in feature.qualifiers:
return feature.qualifiers[qualifier][0]
raise AugurError(f"No valid feature name found for feature for feature {feature.id}")

def _read_gff_like_nextclade(reference, feature_names) -> Dict[str, SeqFeature]:
"""
Read a GFF file the way Nextclade does. That means:
- We only look at CDS features.
- CDS features with identical IDs are joined into a compound feature.
- The feature name is taken from qualifiers in the same priority order as in Nextclade.

Parameters
----------
reference : string
File path to GFF reference
feature_names : None or set or list
Restrict the genes we read to those in the set/list

Returns
-------
features : dict
keys: feature names, values: :py:class:`Bio.SeqFeature.SeqFeature`
Note that feature names may not equivalent to GenBank feature keys

Raises
------
AugurError
If the reference file contains no IDs or multiple different seqids
If a gene is found with the name 'nuc'
"""
rec = _load_gff_record(reference)
features = {'nuc': _read_nuc_annotation_from_gff(rec, reference)}

cds_features: Dict[str, SeqFeature] = {}

def _flatten(feature):
if feature.type == "CDS":
if not feature.id:
# Hack to ensure we have some IDs for all features
# Technically, all features should have an ID, but Nextclade doesn't require it
feature.id = str(feature.qualifiers)
if feature.id in cds_features:
if isinstance(cds_features[feature.id].location, CompoundLocation):
cds_features[feature.id].location.parts.append(feature.location)
else:
cds_features[feature.id].location = CompoundLocation([cds_features[feature.id].location, feature.location])
else:
cds_features[feature.id] = feature
for sub_feature in getattr(feature, "sub_features", []):
_flatten(sub_feature)

for feature in rec.features:
_flatten(feature)

for feature in cds_features.values():
feature_name = _lookup_feature_name_like_nextclade(feature)
if feature_name == 'nuc':
raise AugurError(f"Reference {reference!r} contains a gene with the name 'nuc'. This is not allowed.")
if feature_names is None or feature_name in feature_names:
features[feature_name] = feature

if feature_names is not None:
for fe in feature_names:
if fe not in features:
print(f"Couldn't find gene {fe} in GFF")

return features


def _read_gff(reference, feature_names):
"""
Read a GFF file. We only read GFF IDs 'gene' or 'source' (the latter may not technically
Expand Down Expand Up @@ -288,65 +402,47 @@ def _read_gff(reference, feature_names):
If the reference file contains no IDs or multiple different seqids
If a gene is found with the name 'nuc'
"""
from BCBio import GFF
valid_types = ['gene', 'source', 'region']
features = {}

with open_file(reference) as in_handle:
# Note that `GFF.parse` doesn't always yield GFF records in the order
# one may expect, but since we raise AugurError if there are multiple
# this doesn't matter.
# TODO: Remove warning suppression after it's addressed upstream:
# <https://github.com/chapmanb/bcbb/issues/143>
import warnings
from Bio import BiopythonDeprecationWarning
warnings.simplefilter("ignore", BiopythonDeprecationWarning)
gff_entries = list(GFF.parse(in_handle, limit_info={'gff_type': valid_types}))
warnings.simplefilter("default", BiopythonDeprecationWarning)
rec = _load_gff_record(reference, valid_types=valid_types)

if len(gff_entries) == 0:
raise AugurError(f"Reference {reference!r} contains no valid data rows. Valid GFF types (3rd column) are {', '.join(valid_types)}.")
elif len(gff_entries) > 1:
raise AugurError(f"Reference {reference!r} contains multiple seqids (first column). Augur can only handle GFF files with a single seqid.")
else:
rec = gff_entries[0]

features['nuc'] = _read_nuc_annotation_from_gff(rec, reference)
features_skipped = 0

for feat in rec.features:
if feat.type == "gene":
# Check for gene names stored in qualifiers commonly used by
# virus-specific gene maps first (e.g., 'gene',
# 'gene_name'). Then, check for qualifiers used by non-viral
# pathogens (e.g., 'locus_tag').
if "gene" in feat.qualifiers:
fname = feat.qualifiers["gene"][0]
elif "gene_name" in feat.qualifiers:
fname = feat.qualifiers["gene_name"][0]
elif "locus_tag" in feat.qualifiers:
fname = feat.qualifiers["locus_tag"][0]
else:
features_skipped+=1
fname = None
features = {'nuc': _read_nuc_annotation_from_gff(rec, reference)}

features_skipped = 0

if fname == 'nuc':
raise AugurError(f"Reference {reference!r} contains a gene with the name 'nuc'. This is not allowed.")
for feat in rec.features:
if feat.type == "gene":
# Check for gene names stored in qualifiers commonly used by
# virus-specific gene maps first (e.g., 'gene',
# 'gene_name'). Then, check for qualifiers used by non-viral
# pathogens (e.g., 'locus_tag').
if "gene" in feat.qualifiers:
fname = feat.qualifiers["gene"][0]
elif "gene_name" in feat.qualifiers:
fname = feat.qualifiers["gene_name"][0]
elif "locus_tag" in feat.qualifiers:
fname = feat.qualifiers["locus_tag"][0]
else:
features_skipped+=1
fname = None

if feature_names is not None and fname not in feature_names:
# Skip (don't store) this feature
continue
if fname == 'nuc':
raise AugurError(f"Reference {reference!r} contains a gene with the name 'nuc'. This is not allowed.")

if fname:
features[fname] = feat
if feature_names is not None and fname not in feature_names:
# Skip (don't store) this feature
continue

if feature_names is not None:
for fe in feature_names:
if fe not in features:
print("Couldn't find gene {} in GFF or GenBank file".format(fe))
if fname:
features[fname] = feat

if feature_names is not None:
for fe in feature_names:
if fe not in features:
print("Couldn't find gene {} in GFF or GenBank file".format(fe))

if features_skipped:
print(f"WARNING: {features_skipped} GFF rows of type=gene skipped as they didn't have a gene, gene_name or locus_tag attribute.")
if features_skipped:
print(f"WARNING: {features_skipped} GFF rows of type=gene skipped as they didn't have a gene, gene_name or locus_tag attribute.")

return features

Expand Down Expand Up @@ -852,8 +948,6 @@ def genome_features_to_auspice_annotation(features, ref_seq_name=None, assert_nu
See schema-annotations.json for the schema this conforms to

"""
from Bio.SeqFeature import SimpleLocation, CompoundLocation

if assert_nuc and 'nuc' not in features:
raise AugurError("Genome features must include a feature for 'nuc'")

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
Setup

$ source "$TESTDIR"/_setup.sh

Infer ancestral nucleotide and amino acid sequences using Nextclade GFF annotations.

$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/ebola/tree.nwk \
> --alignment $TESTDIR/../data/ebola/masked.fasta \
> --annotation $TESTDIR/../data/ebola/genome_annotation.gff3 \
> --use-nextclade-gff-parsing \
> --translations $TESTDIR/../data/ebola/translations/%GENE.fasta \
> --infer-ambiguous \
> --inference joint \
> --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
> > /dev/null

Check that output is as expected

$ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
> --exclude-regex-paths "\['seqid'\]" -- \
> "$TESTDIR/../data/ebola/nt_muts.json" \
> "$CRAMTMP/$TESTFILE/ancestral_mutations.json"
{}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
Setup

$ source "$TESTDIR"/_setup.sh

Infer ancestral nucleotide and amino acid sequences using Nextclade GFF annotations.
Only include the specified genes, not all that are in the GFF.

$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/ebola/tree.nwk \
> --alignment $TESTDIR/../data/ebola/masked.fasta \
> --annotation $TESTDIR/../data/ebola/genome_annotation.gff3 \
> --genes GP sGP ssGP \
> --use-nextclade-gff-parsing \
> --translations $TESTDIR/../data/ebola/translations/%GENE.fasta \
> --infer-ambiguous \
> --inference joint \
> --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
> > /dev/null

Check that output is as expected

$ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
> --exclude-regex-paths "\['seqid'\]" -- \
> "$TESTDIR/../data/ebola/nt_muts_gene_subset.json" \
> "$CRAMTMP/$TESTFILE/ancestral_mutations.json"
{}
2 changes: 1 addition & 1 deletion tests/functional/ancestral/cram/invalid-args.t
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ This should fail.
> --annotation $TESTDIR/../data/zika_outgroup.gb \
> --genes ENV PRO \
> --output-node-data "ancestral_mutations.json" > /dev/null
ERROR: For amino acid sequence reconstruction, you must provide an annotation file, a list of genes, and a template path to amino acid sequences.
ERROR: For amino acid sequence reconstruction, you must provide an annotation file and a template path to amino acid sequences.
[2]

Missing tree file
Expand Down
Loading
Loading