forked from joaks1/python-translation-project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtranslate.py
256 lines (210 loc) · 9.95 KB
/
translate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
#! /usr/bin/env python3
import sys
def pop_next_codon(sequence):
    """Split the leading codon off `sequence`.

    Returns a tuple ``(codon, rest)``: ``codon`` holds the first three bases
    of `sequence` (fewer if `sequence` is shorter than three bases) and
    ``rest`` holds everything after them. `sequence` itself is not modified.
    """
    return sequence[:3], sequence[3:]
# NOTE(review): module-level placeholder. No function visible in this file
# reads or assigns it; presumably left over from an earlier draft -- confirm
# nothing imports it before removing.
joinedgenes = ""
def translate_sequence(rna_sequence, genetic_code):
    """Translates a sequence of RNA into a sequence of amino acids.

    Translates `rna_sequence` into string of amino acids, according to the
    `genetic_code` given as a dict. Translation begins at the first position of
    the `rna_sequence` and continues until the first stop codon is encountered
    or the end of `rna_sequence` is reached. Trailing bases that do not form a
    complete codon are ignored.

    If `rna_sequence` is less than 3 bases long, or starts with a stop codon,
    an empty string is returned.

    Parameters
    ----------
    rna_sequence : str
        A string representing an RNA sequence (upper or lower-case).
    genetic_code : dict
        A dictionary mapping all 64 codons (strings of three RNA bases) to
        amino acids (string of single-letter amino acid abbreviation). Stop
        codons should be represented with asterisks ('*').

    Returns
    -------
    str
        A string of the translated amino acids.

    Raises
    ------
    KeyError
        If a complete codon read before the first stop codon is not a key of
        `genetic_code`.
    """
    rna_sequence = rna_sequence.upper()  # codon keys are matched in upper case
    amino_acids = []
    # Walk the sequence codon by codon using an index instead of repeatedly
    # re-slicing the remaining sequence, which copied the whole tail on every
    # step. The `- 2` bound skips an incomplete codon at the end.
    for start in range(0, len(rna_sequence) - 2, 3):
        amino_acid = genetic_code[rna_sequence[start:start + 3]]
        if amino_acid == "*":
            # Stop codon: translation ends here; what was translated so far
            # is still returned.
            break
        amino_acids.append(amino_acid)
    return "".join(amino_acids)
def get_all_translations(rna_sequence, genetic_code):
    """Collect every amino acid sequence that starts at an 'AUG' codon.

    `rna_sequence` is upper-cased and searched from left to right for the
    start codon 'AUG'. Every occurrence, at any offset (so all three reading
    frames are covered), is handed to `translate_sequence` together with the
    rest of the sequence, and each non-empty result is collected in the order
    the start codons appear. The sequence is used as given: no reverse and/or
    complement is explored.

    Parameters
    ----------
    rna_sequence : str
        A string representing an RNA sequence (upper or lower-case).
    genetic_code : dict
        A dictionary mapping all 64 codons (strings of three RNA bases) to
        amino acids (string of single-letter amino acid abbreviation). Stop
        codons should be represented with asterisks ('*').

    Returns
    -------
    list
        A list of strings, one amino acid sequence per start codon that
        yielded a non-empty translation. Empty if there is none (including
        when `rna_sequence` is shorter than one codon).
    """
    transcript = rna_sequence.upper()
    found = []
    # str.find returns -1 once no further start codon exists, which also
    # covers sequences too short to hold a codon.
    start = transcript.find("AUG")
    while start != -1:
        peptide = translate_sequence(
            rna_sequence=transcript[start:],
            genetic_code=genetic_code)
        if peptide:
            found.append(peptide)
        # Resume one base further on so every position gets examined.
        start = transcript.find("AUG", start + 1)
    return found
# NOTE(review): module-level placeholders. No function visible in this file
# reads or modifies these globals; presumably leftovers -- confirm nothing
# imports them before removing.
reverse = "string"
reverse_upper = "string"
def get_reverse(sequence):
    """Return `sequence` in reverse order, converted to upper case.

    The characters are reversed first and the result is then upper-cased.
    If `sequence` is empty, an empty string is returned.

    Examples
    --------
    >>> get_reverse('AUGC')
    'CGUA'
    >>> get_reverse('augc')
    'CGUA'
    """
    return sequence[::-1].upper()
def get_complement(sequence):
    """Return the complementary RNA strand of `sequence`.

    Each base is replaced by its partner (A<->U, G<->C). Upper- and lower-case
    bases are both accepted; the result is always upper case. If `sequence`
    is empty, an empty string is returned.

    When a character other than A, U, G or C (in either case) is met,
    "Wrong input" is printed and processing stops; the complement of the
    bases seen before that character is returned.

    Examples
    --------
    >>> get_complement('AUGC')
    'UACG'
    """
    partner_of = {
        "A": "U", "a": "U",
        "U": "A", "u": "A",
        "G": "C", "g": "C",
        "C": "G", "c": "G",
    }
    partners = []
    for nucleotide in sequence:
        partner = partner_of.get(nucleotide)
        if partner is None:
            # Unknown base: report it and keep only what was built so far.
            print("Wrong input")
            break
        partners.append(partner)
    return "".join(partners)
# NOTE(review): module-level placeholders. No function visible in this file
# reads or modifies these globals; presumably leftovers -- confirm nothing
# imports them before removing.
reverse_2 = "string"
compstrand = ""
def reverse_and_complement(sequence):
    """Return the reverse complement of a `sequence` of nucleotides.

    `sequence` is first passed through `get_reverse` and the result is then
    passed through `get_complement`.
    If `sequence` is empty, an empty string is returned.

    Examples
    --------
    >>> reverse_and_complement('AUGC')
    'GCAU'
    """
    return get_complement(get_reverse(sequence))
def get_longest_peptide(rna_sequence, genetic_code):
    """Get the longest peptide encoded by an RNA sequence.

    Explore six reading frames of `rna_sequence` (the three reading frames of
    `rna_sequence`, and the three reading frames of the reverse and complement
    of `rna_sequence`) and return (as a string) the longest sequence of amino
    acids that it encodes, according to the `genetic_code`.

    If several peptides share the greatest length, the first one found is
    returned; peptides from `rna_sequence` itself are considered before those
    from its reverse and complement.

    If no amino acids can be translated from `rna_sequence` nor its reverse and
    complement, an empty string is returned.

    Parameters
    ----------
    rna_sequence : str
        A string representing an RNA sequence (upper or lower-case).
    genetic_code : dict
        A dictionary mapping all 64 codons (strings of three RNA bases) to
        amino acids (string of single-letter amino acid abbreviation). Stop
        codons should be represented with asterisks ('*').

    Returns
    -------
    str
        A string of the longest sequence of amino acids encoded by
        `rna_sequence`.
    """
    # Peptides read from the sequence as given ...
    candidates = get_all_translations(
        rna_sequence=rna_sequence,
        genetic_code=genetic_code)
    # ... followed by those read from the opposite strand.
    candidates.extend(get_all_translations(
        rna_sequence=reverse_and_complement(rna_sequence),
        genetic_code=genetic_code))
    # max() keeps the first of equally long peptides; `default` covers the
    # case where neither strand produced any peptide.
    return max(candidates, key=len, default="")
if __name__ == '__main__':
    # Codon table used for the demo: 64 RNA codons mapped to one-letter amino
    # acid codes, with '*' marking the stop codons.
    genetic_code = {
        'GUC': 'V', 'ACC': 'T', 'GUA': 'V', 'GUG': 'V',
        'ACU': 'T', 'AAC': 'N', 'CCU': 'P', 'UGG': 'W',
        'AGC': 'S', 'AUC': 'I', 'CAU': 'H', 'AAU': 'N',
        'AGU': 'S', 'GUU': 'V', 'CAC': 'H', 'ACG': 'T',
        'CCG': 'P', 'CCA': 'P', 'ACA': 'T', 'CCC': 'P',
        'UGU': 'C', 'GGU': 'G', 'UCU': 'S', 'GCG': 'A',
        'UGC': 'C', 'CAG': 'Q', 'GAU': 'D', 'UAU': 'Y',
        'CGG': 'R', 'UCG': 'S', 'AGG': 'R', 'GGG': 'G',
        'UCC': 'S', 'UCA': 'S', 'UAA': '*', 'GGA': 'G',
        'UAC': 'Y', 'GAC': 'D', 'UAG': '*', 'AUA': 'I',
        'GCA': 'A', 'CUU': 'L', 'GGC': 'G', 'AUG': 'M',
        'CUG': 'L', 'GAG': 'E', 'CUC': 'L', 'AGA': 'R',
        'CUA': 'L', 'GCC': 'A', 'AAA': 'K', 'AAG': 'K',
        'CAA': 'Q', 'UUU': 'F', 'CGU': 'R', 'CGC': 'R',
        'CGA': 'R', 'GCU': 'A', 'GAA': 'E', 'AUU': 'I',
        'UUG': 'L', 'UUA': 'L', 'UGA': '*', 'UUC': 'F',
    }

    # Demo sequence, listed codon by codon and joined into one string.
    rna_seq = "".join((
        "AUG", "UAC", "UGG", "CAC", "GCU", "ACU",
        "GCU", "CCA", "UAU", "ACU", "CAC", "CAG",
        "AAU", "AUC", "AGU", "ACA", "GCG",
    ))

    longest_peptide = get_longest_peptide(
        rna_sequence=rna_seq,
        genetic_code=genetic_code)
    assert isinstance(longest_peptide, str), (
        "Oops: the longest peptide is {0}, not a string".format(longest_peptide))

    # Report the result, then confirm it when it matches the expected peptide.
    message = "The longest peptide encoded by\n\t'{0}'\nis\n\t'{1}'\n".format(
        rna_seq, longest_peptide)
    sys.stdout.write(message)
    if longest_peptide == "MYWHATAPYTHQNISTA":
        sys.stdout.write("Indeed.\n")