diff --git a/scripts/align_for_tree.py b/scripts/align_for_tree.py index 475cc43..c21350d 100644 --- a/scripts/align_for_tree.py +++ b/scripts/align_for_tree.py @@ -1,4 +1,4 @@ -from Bio import SeqIO +from Bio import SeqIO, AlignIO from Bio.SeqRecord import SeqRecord import shutil import argparse @@ -8,27 +8,28 @@ def alignfortree(realign, align, reference, newoutput, build): if build != "genome": shutil.copy(realign, newoutput) else: - realigned = {s.id:s for s in SeqIO.parse(realign, "fasta")} + realigned_aln = AlignIO.read(realign, 'fasta') + insert_length = realigned_aln.get_alignment_length() + realigned = {s.id:s for s in realigned_aln} original = SeqIO.parse(align, "fasta") ref = SeqIO.read(reference, "genbank") - + for feature in ref.features: - if feature.type =='gene': - a =str((list(feature.qualifiers.items())[0])[-1])[2:-2] - if a == "G": - startofgene = int(list(feature.location)[0]) - endofgene = int(list(feature.location)[-1])+1 - break + if feature.type =='gene' or feature.type=='CDS': + a =str((list(feature.qualifiers.items())[0])[-1])[2:-2] + if a == "G": + startofgene = int(list(feature.location)[0]) + endofgene = int(list(feature.location)[-1])+1 for record_original in original: sequence_to_insert = realigned.get(record_original.id, None) if sequence_to_insert is None: - sequence_to_insert = '-' * (endofgene - startofgene) + sequence_to_insert = '-' * insert_length else: sequence_to_insert = sequence_to_insert.seq - record_for_tree = record_original.seq.replace(record_original.seq[startofgene:endofgene], sequence_to_insert) - newrecord = SeqRecord(record_for_tree, id=record_original.id, description=record_original.description) + newseq = record_original.seq[:startofgene] + sequence_to_insert + record_original.seq[endofgene:] + newrecord = SeqRecord(newseq, id=record_original.id, description=record_original.description) records.append(newrecord) SeqIO.write(records, newoutput, "fasta") diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index e87b63e..8acf9a0 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -56,9 +56,9 @@ rule filter: output: sequences = build_dir + "/{a_or_b}/{build_name}/filtered.fasta" params: - group_by = config["filter"]["group_by"], - min_coverage = lambda w: f'{w.build_name}_coverage>{config["filter"]["min_coverage"].get(w.build_name, 10000)}', - subsample_max_sequences = lambda w: config["filter"]["subsample_max_sequences"].get(w.build_name, 1000), + group_by = config["filter"]["group_by"], + min_coverage = lambda w: f'{w.build_name}_coverage>{config["filter"]["min_coverage"].get(w.build_name, 10000)}', + subsample_max_sequences = lambda w: config["filter"]["subsample_max_sequences"].get(w.build_name, 1000), strain_id=config["strain_id_field"], shell: """ @@ -185,9 +185,9 @@ rule refine: tree = build_dir + "/{a_or_b}/{build_name}/tree.nwk", node_data = build_dir + "/{a_or_b}/{build_name}/branch_lengths.json" params: - coalescent = config["refine"]["coalescent"], - clock_filter_iqd = config["refine"]["clock_filter_iqd"], - date_inference = config["refine"]["date_inference"], + coalescent = config["refine"]["coalescent"], + clock_filter_iqd = config["refine"]["clock_filter_iqd"], + date_inference = config["refine"]["date_inference"], strain_id=config["strain_id_field"], shell: """ @@ -257,7 +257,7 @@ rule traits: log: "logs/{a_or_b}/traits_{build_name}_rsv.txt" params: - columns = config["traits"]["columns"], + columns = config["traits"]["columns"], strain_id=config["strain_id_field"], shell: """ diff --git a/workflow/snakemake_rules/export.smk b/workflow/snakemake_rules/export.smk index 6d8e9ec..1766996 100644 --- a/workflow/snakemake_rules/export.smk +++ b/workflow/snakemake_rules/export.smk @@ -39,7 +39,7 @@ rule export: auspice_json = build_dir + "/{a_or_b}/{build_name}/tree.json", root_sequence = build_dir + "/{a_or_b}/{build_name}/tree_root-sequence.json" params: - title = lambda w: f"RSV-{w.a_or_b.upper()} phylogeny", + title = lambda w: f"RSV-{w.a_or_b.upper()} phylogeny", strain_id=config["strain_id_field"], metadata_colors = lambda w: '' if w.build_name=='genome' else f"--color-by-metadata clade" shell: