Skip to content

Commit

Permalink
Merge pull request #59 from nextstrain/fix/alignment-length
Browse files Browse the repository at this point in the history
Fix/alignment length
  • Loading branch information
rneher authored Mar 20, 2024
2 parents 1df2377 + 4cf1cd4 commit f8f426f
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 20 deletions.
25 changes: 13 additions & 12 deletions scripts/align_for_tree.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from Bio import SeqIO
from Bio import SeqIO, AlignIO
from Bio.SeqRecord import SeqRecord
import shutil
import argparse
Expand All @@ -8,27 +8,28 @@ def alignfortree(realign, align, reference, newoutput, build):
if build != "genome":
shutil.copy(realign, newoutput)
else:
realigned = {s.id:s for s in SeqIO.parse(realign, "fasta")}
realigned_aln = AlignIO.read(realign, 'fasta')
insert_length = realigned_aln.get_alignment_length()
realigned = {s.id:s for s in realigned_aln}
original = SeqIO.parse(align, "fasta")
ref = SeqIO.read(reference, "genbank")

for feature in ref.features:
if feature.type =='gene':
a =str((list(feature.qualifiers.items())[0])[-1])[2:-2]
if a == "G":
startofgene = int(list(feature.location)[0])
endofgene = int(list(feature.location)[-1])+1
break
if feature.type =='gene' or feature.type=='CDS':
a =str((list(feature.qualifiers.items())[0])[-1])[2:-2]
if a == "G":
startofgene = int(list(feature.location)[0])
endofgene = int(list(feature.location)[-1])+1

for record_original in original:
sequence_to_insert = realigned.get(record_original.id, None)
if sequence_to_insert is None:
sequence_to_insert = '-' * (endofgene - startofgene)
sequence_to_insert = '-' * insert_length
else:
sequence_to_insert = sequence_to_insert.seq

record_for_tree = record_original.seq.replace(record_original.seq[startofgene:endofgene], sequence_to_insert)
newrecord = SeqRecord(record_for_tree, id=record_original.id, description=record_original.description)
newseq = record_original.seq[:startofgene] + sequence_to_insert + record_original.seq[endofgene:]
newrecord = SeqRecord(newseq, id=record_original.id, description=record_original.description)
records.append(newrecord)

SeqIO.write(records, newoutput, "fasta")
Expand Down
14 changes: 7 additions & 7 deletions workflow/snakemake_rules/core.smk
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ rule filter:
output:
sequences = build_dir + "/{a_or_b}/{build_name}/filtered.fasta"
params:
group_by = config["filter"]["group_by"],
min_coverage = lambda w: f'{w.build_name}_coverage>{config["filter"]["min_coverage"].get(w.build_name, 10000)}',
subsample_max_sequences = lambda w: config["filter"]["subsample_max_sequences"].get(w.build_name, 1000),
group_by = config["filter"]["group_by"],
min_coverage = lambda w: f'{w.build_name}_coverage>{config["filter"]["min_coverage"].get(w.build_name, 10000)}',
subsample_max_sequences = lambda w: config["filter"]["subsample_max_sequences"].get(w.build_name, 1000),
strain_id=config["strain_id_field"],
shell:
"""
Expand Down Expand Up @@ -185,9 +185,9 @@ rule refine:
tree = build_dir + "/{a_or_b}/{build_name}/tree.nwk",
node_data = build_dir + "/{a_or_b}/{build_name}/branch_lengths.json"
params:
coalescent = config["refine"]["coalescent"],
clock_filter_iqd = config["refine"]["clock_filter_iqd"],
date_inference = config["refine"]["date_inference"],
coalescent = config["refine"]["coalescent"],
clock_filter_iqd = config["refine"]["clock_filter_iqd"],
date_inference = config["refine"]["date_inference"],
strain_id=config["strain_id_field"],
shell:
"""
Expand Down Expand Up @@ -257,7 +257,7 @@ rule traits:
log:
"logs/{a_or_b}/traits_{build_name}_rsv.txt"
params:
columns = config["traits"]["columns"],
columns = config["traits"]["columns"],
strain_id=config["strain_id_field"],
shell:
"""
Expand Down
2 changes: 1 addition & 1 deletion workflow/snakemake_rules/export.smk
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ rule export:
auspice_json = build_dir + "/{a_or_b}/{build_name}/tree.json",
root_sequence = build_dir + "/{a_or_b}/{build_name}/tree_root-sequence.json"
params:
title = lambda w: f"RSV-{w.a_or_b.upper()} phylogeny",
title = lambda w: f"RSV-{w.a_or_b.upper()} phylogeny",
strain_id=config["strain_id_field"],
metadata_colors = lambda w: '' if w.build_name=='genome' else f"--color-by-metadata clade"
shell:
Expand Down

0 comments on commit f8f426f

Please sign in to comment.