Skip to content

Commit

Permalink
Merge pull request #114 from griffithlab/intron_variants
Browse files Browse the repository at this point in the history
Handle variants with multiple stops in the amino acid change
  • Loading branch information
susannasiebert authored Apr 23, 2018
2 parents e2877c2 + 964bb04 commit d0b98ee
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 10 deletions.
4 changes: 3 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ before_install:
- sudo apt-get install -y ghostscript
install:
#Remove the following line when pvacseq-client is first pushed to the live pypi site
- pip install -i https://testpypi.python.org/pypi pvacseq-client
#This now installs pvacseq-client from the live PyPI site instead of test PyPI.
#It is unclear whether this line can, or needs to, be removed altogether.
- pip install pvacseq-client
- pip install -e .[API]
before_script:
- "export DISPLAY=:99.0"
Expand Down
32 changes: 23 additions & 9 deletions lib/fasta_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,15 @@ def execute(self):
position = int(line['protein_position'].split('-', 1)[0]) - 1
elif variant_type == 'missense' or variant_type == 'inframe_ins':
wildtype_amino_acid, mutant_amino_acid = line['amino_acid_change'].split('/')
if wildtype_amino_acid.endswith('*'):
wildtype_amino_acid = wildtype_amino_acid.replace('*', '')
if mutant_amino_acid.endswith('*'):
mutant_amino_acid = mutant_amino_acid.replace('*', '')
if '*' in wildtype_amino_acid:
wildtype_amino_acid = wildtype_amino_acid.split('*')[0]
elif 'X' in wildtype_amino_acid:
wildtype_amino_acid = wildtype_amino_acid.split('X')[0]
if '*' in mutant_amino_acid:
mutant_amino_acid = mutant_amino_acid.split('*')[0]
stop_codon_added = True
elif 'X' in mutant_amino_acid:
mutant_amino_acid = mutant_amino_acid.split('X')[0]
stop_codon_added = True
else:
stop_codon_added = False
Expand All @@ -124,10 +129,15 @@ def execute(self):
elif variant_type == 'inframe_del':
variant_type = 'inframe_del'
wildtype_amino_acid, mutant_amino_acid = line['amino_acid_change'].split('/')
if wildtype_amino_acid.endswith('*'):
wildtype_amino_acid = wildtype_amino_acid.replace('*', '')
if mutant_amino_acid.endswith('*'):
mutant_amino_acid = mutant_amino_acid.replace('*', '')
if '*' in wildtype_amino_acid:
wildtype_amino_acid = wildtype_amino_acid.split('*')[0]
elif 'X' in wildtype_amino_acid:
wildtype_amino_acid = wildtype_amino_acid.split('X')[0]
if '*' in mutant_amino_acid:
mutant_amino_acid = mutant_amino_acid.split('*')[0]
stop_codon_added = True
elif 'X' in mutant_amino_acid:
mutant_amino_acid = mutant_amino_acid.split('X')[0]
stop_codon_added = True
else:
stop_codon_added = False
Expand All @@ -151,7 +161,11 @@ def execute(self):
mutation_start_position, wildtype_subsequence = self.get_wildtype_subsequence(position, full_wildtype_sequence, wildtype_amino_acid_length, peptide_sequence_length, line)
mutation_end_position = mutation_start_position + wildtype_amino_acid_length
if wildtype_amino_acid != '-' and wildtype_amino_acid != wildtype_subsequence[mutation_start_position:mutation_end_position]:
sys.exit("ERROR: There was a mismatch between the actual wildtype amino acid and the expected amino acid. Did you use the same reference build version for VEP that you used for creating the VCF?\n%s" % line)
if line['amino_acid_change'].split('/')[0].count('*') > 1:
print("Warning: Amino acid change is not sane - contains multiple stops. Skipping entry {}".format(line['index']))
continue
else:
sys.exit("ERROR: There was a mismatch between the actual wildtype amino acid sequence ({}) and the expected amino acid sequence ({}). Did you use the same reference build version for VEP that you used for creating the VCF?\n{}".format(wildtype_subsequence[mutation_start_position:mutation_end_position], wildtype_amino_acid, line))
if stop_codon_added:
mutant_subsequence = wildtype_subsequence[:mutation_start_position] + mutant_amino_acid
else:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
chromosome_name start stop reference variant gene_name transcript_name amino_acid_change ensembl_gene_id wildtype_amino_acid_sequence downstream_amino_acid_sequence fusion_amino_acid_sequence variant_type protein_position transcript_expression gene_expression normal_depth normal_vaf tdna_depth tdna_vaf trna_depth trna_vaf index protein_length_change
12 96617457 96617460 CAGA C ELK3 ENST00000547249 AX/X ENSG00000111145 MESAITLWQFLLQLLLDQKHEHLICWTSNDGEFKLLKA inframe_del 38-39 NA NA NA NA NA NA NA NA 39805.ELK3.ENST00000547249.inframe_del.38-39AX/X
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
chromosome_name start stop reference variant gene_name transcript_name amino_acid_change ensembl_gene_id wildtype_amino_acid_sequence downstream_amino_acid_sequence fusion_amino_acid_sequence variant_type protein_position transcript_expression gene_expression normal_depth normal_vaf tdna_depth tdna_vaf trna_depth trna_vaf index protein_length_change
3 41265565 41266581 TACTCAAGGTTTGTGTCATTAAATCTTTAGTTACTGAATTGGGGCTCTGCTTCGTTGCCATTAAGCCAGTCTGGCTGAGATCCCCCTGCTTTCCTCTCTCCCTGCTTACTTGTCAGGCTACCTTTTGCTCCATTTTCTGCTCACTCCTCCTAATGGCTTGGTGAAATAGCAAACAAGCCACCAGCAGGAATCTAGTCTGGATGACTGCTTCTGGAGCCTGGATGCAGTACCATTCTTCCACTGATTCAGTGAGTAACTGTTAGGTGGTTCCCTAAGGGATTAGGTATTTCATCACTGAGCTAACCCTGGCTATCATTCTGCTTTTCTTGGCTGTCTTTCAGATTTGACTTTATTTCTAAAAATATTTCAATGGGTCATATCACAGATTCTTTTTTTTTAAATTAAAGTAACATTTCCAATCTACTAATGCTAATACTGTTTCGTATTTATAGCTGATTTGATGGAGTTGGACATGGCCATGGAACCAGACAGAAAAGCGGCTGTTAGTCACTGGCAGCAACAGTCTTACCTGGACTCTGGAATCCATTCTGGTGCCACTACCACAGCTCCTTCTCTGAGTGGTAAAGGCAATCCTGAGGAAGAGGATGTGGATACCTCCCAAGTCCTGTATGAGTGGGAACAGGGATTTTCTCAGTCCTTCACTCAAGAACAAGTAGCTGGTAAGAGTATTATTTTTCATTGCCTTACTGAAAGTCAGAATGCAGTTTTGAGAACTAAAAAGTTAGTGTATAATAGTTTAAATAAAATGTTGTGGTGAAGAAAAGAGAGTAATAGCAATGTCACTTTTACCATTTAGGATAGCAAATACTTAGGTAAATGCTGAACTGTGGATAGTGAGTGTTGAATTAACCTTTTCCAGATATTGATGGACAGTATGCAATGACTCGAGCTCAGAGGGTACGAGCTGCTATGTTCCCTGAGACATTAGATGAGGGCATGCAGATCCCATCTACACAGTTTGATGCTGCTCATCCCACTAATGTCCAGCGTTTGGCT T CTNNB1 ENST00000349496 TQGLCH*IFSY*IGALLRCH*ASLAEIPLLSSLPAYLSGYLLLHFLLTPPNGLVK*QTSHQQESSLDDCFWSLDAVPFFH*FSE*LLGGSLRD*VFHH*ANPGYHSAFLGCLSDLTLFLKIFQWVISQILFF*IKVTFPIY*C*YCFVFIADLMELDMAMEPDRKAAVSHWQQQSYLDSGIHSGATTTAPSLSGKGNPEEEDVDTSQVLYEWEQGFSQSFTQEQVAGKSIIFHCLTESQNAVLRTKKLVYNSLNKMLW*RKESNSNVTFTI*DSKYLGKC*TVDSEC*INLFQILMDSMQ*LELRGYELLCSLRH*MRACRSHLHSLMLLIPLMSSVWX/- ENSG00000168036 
MATQADLMELDMAMEPDRKAAVSHWQQQSYLDSGIHSGATTTAPSLSGKGNPEEEDVDTSQVLYEWEQGFSQSFTQEQVADIDGQYAMTRAQRVRAAMFPETLDEGMQIPSTQFDAAHPTNVQRLAEPSQMLKHAVVNLINYQDDAELATRAIPELTKLLNDEDQVVVNKAAVMVHQLSKKEASRHAIMRSPQMVSAIVRTMQNTNDVETARCTAGTLHNLSHHREGLLAIFKSGGIPALVKMLGSPVDSVLFYAITTLHNLLLHQEGAKMAVRLAGGLQKMVALLNKTNVKFLAITTDCLQILAYGNQESKLIILASGGPQALVNIMRTYTYEKLLWTTSRVLKVLSVCSSNKPAIVEAGGMQALGLHLTDPSQRLVQNCLWTLRNLSDAATKQEGMEGLLGTLVQLLGSDDINVVTCAAGILSNLTCNNYKNKMMVCQVGGIEALVRTVLRAGDREDITEPAICALRHLTSRHQEAEMAQNAVRLHYGLPVVVKLLHPPSHWPLIKATVGLIRNLALCPANHAPLREQGAIPRLVQLLVRAHQDTQRRTSMGGTQQQFVEGVRMEEIVEGCTGALHILARDVHNRIVIRGLNTIPLFVQLLYSPIENIQRVAAGVLCELAQDKEAAEAIEAEGATAPLTELLHSRNEGVATYAAAVLFRMSEDKPQDYKKRLSVELTSSLFRTEPMAWNETADLGLDIGAQGEPLGYRQDDPSYRSFHSGGYGQDALGMDPMMEHEMGGHHPGADYPVDGLPDLGHAQDLMDGLPPGDSNQLAWFDTDL inframe_del 3-126 NA NA NA NA NA NA NA NA 1.CTNNB1.ENST00000349496.inframe_del.3-126TQGLCH*IFSY*IGALLRCH*ASLAEIPLLSSLPAYLSGYLLLHFLLTPPNGLVK*QTSHQQESSLDDCFWSLDAVPFFH*FSE*LLGGSLRD*VFHH*ANPGYHSAFLGCLSDLTLFLKIFQWVISQILFF*IKVTFPIY*C*YCFVFIADLMELDMAMEPDRKAAVSHWQQQSYLDSGIHSGATTTAPSLSGKGNPEEEDVDTSQVLYEWEQGFSQSFTQEQVAGKSIIFHCLTESQNAVLRTKKLVYNSLNKMLW*RKESNSNVTFTI*DSKYLGKC*TVDSEC*INLFQILMDSMQ*LELRGYELLCSLRH*MRACRSHLHSLMLLIPLMSSVWX/-
42 changes: 42 additions & 0 deletions tests/test_fasta_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,6 +564,48 @@ def test_protein_change_with_asterisk_in_wildtype_and_mutant(self):
self.assertEqual(os.path.getsize(generate_fasta_output_file.name), 0)
self.assertEqual(os.path.getsize(generate_fasta_key_output_file.name), 0)

def test_protein_change_with_X_in_wildtype_and_mutatnt(self):
    """Check that the X-in-wildtype-and-mutant fixture yields no output.

    Runs FastaGenerator on the
    ``protein_change_with_X_in_wildtype_and_mutant/input.tsv`` fixture and
    asserts that ``execute()`` returns a falsy value and that both the
    FASTA file and the key file are left empty.
    """
    test_data_dir = os.path.join(
        self.test_data_dir,
        'protein_change_with_X_in_wildtype_and_mutant',
    )
    generate_fasta_input_file = os.path.join(test_data_dir, 'input.tsv')

    # Context managers close (and thereby remove) both temporary files even
    # when an assertion fails, instead of leaving cleanup to the garbage
    # collector.
    with tempfile.NamedTemporaryFile() as generate_fasta_output_file, \
            tempfile.NamedTemporaryFile() as generate_fasta_key_output_file:
        # The peptide and epitope lengths come from attributes on the test
        # case; the former unused local override has been dropped.
        generate_fasta_params = {
            'input_file'                : generate_fasta_input_file,
            'peptide_sequence_length'   : self.peptide_sequence_length,
            'epitope_length'            : self.epitope_length,
            'output_file'               : generate_fasta_output_file.name,
            'output_key_file'           : generate_fasta_key_output_file.name,
            'downstream_sequence_length': None,
        }
        generator = FastaGenerator(**generate_fasta_params)

        self.assertFalse(generator.execute())
        # Sizes are read through the file names, i.e. from what the
        # generator wrote on disk.
        self.assertEqual(os.path.getsize(generate_fasta_output_file.name), 0)
        self.assertEqual(os.path.getsize(generate_fasta_key_output_file.name), 0)

def test_protein_change_with_multiple_asterisks(self):
    """Check that the multiple-asterisks fixture yields no output.

    Runs FastaGenerator on the
    ``protein_change_with_multiple_asterisks/input.tsv`` fixture and asserts
    that ``execute()`` returns a falsy value and that both the FASTA file
    and the key file are left empty.
    """
    test_data_dir = os.path.join(
        self.test_data_dir,
        'protein_change_with_multiple_asterisks',
    )
    generate_fasta_input_file = os.path.join(test_data_dir, 'input.tsv')

    # Context managers close (and thereby remove) both temporary files even
    # when an assertion fails, instead of leaving cleanup to the garbage
    # collector.
    with tempfile.NamedTemporaryFile() as generate_fasta_output_file, \
            tempfile.NamedTemporaryFile() as generate_fasta_key_output_file:
        # The peptide and epitope lengths come from attributes on the test
        # case; the former unused local override has been dropped.
        generate_fasta_params = {
            'input_file'                : generate_fasta_input_file,
            'peptide_sequence_length'   : self.peptide_sequence_length,
            'epitope_length'            : self.epitope_length,
            'output_file'               : generate_fasta_output_file.name,
            'output_key_file'           : generate_fasta_key_output_file.name,
            'downstream_sequence_length': None,
        }
        generator = FastaGenerator(**generate_fasta_params)

        self.assertFalse(generator.execute())
        # Sizes are read through the file names, i.e. from what the
        # generator wrote on disk.
        self.assertEqual(os.path.getsize(generate_fasta_output_file.name), 0)
        self.assertEqual(os.path.getsize(generate_fasta_key_output_file.name), 0)

def test_distance_from_start_works_as_expected(self):
generate_fasta_input_file = tempfile.NamedTemporaryFile()
generate_fasta_output_file = tempfile.NamedTemporaryFile()
Expand Down

0 comments on commit d0b98ee

Please sign in to comment.