-
Notifications
You must be signed in to change notification settings - Fork 0
/
srprnasnewapproach.py
197 lines (143 loc) · 7.34 KB
/
srprnasnewapproach.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#takes a sequence. Then breaks it into 21 letter long areas sequentially (0 through 22, 1 through 23, 2 through 24 , n-21 through n.
#makes every possible variant "chunk" of those 21 letters
#it then makes a list of every reverse complement "comp" of those chunks
#if a reverse complement chunk is found in the original chunck list, it is kept as an intersect:
#
#
# the first round is checking for 4bp and greater complements.
#The second round will check for complements of 4bp with an internal mismatch (not made yet)
#wikipedia hs srp structures
#Database with dotbracket
#http://www.lix.polytechnique.fr/~ponty/enseignement/MathewsRNASorted.faa
from operator import itemgetter
from Bio.Seq import Seq
def revcomp_hamming_distance(s1, s2):
"""Return the Hamming distance between equal-length sequences"""
s1 = str(Seq(s1).reverse_complement())
the_zip = zip(s1, s2)
if sum(ch1 != ch2 for ch1, ch2 in the_zip)==0:
#print "the matching zip" + str(the_zip)
return sum(ch1 != ch2 for ch1, ch2 in the_zip)
def internal_hamming_distance(s1, s2):
"""Return the Hamming distance between equal-length sequences"""
s1 = str(Seq(s1).reverse_complement())
the_zip = zip(s1[1:-1], s2[1:-1])
if s1[:1] == s2[:1] and s1[-1:] == s2[-1:]: #check for one mismatch internally if the beginning and end are the same
return sum(ch1 != ch2 for ch1, ch2 in the_zip)
else:
return 0
#use hamming distance to deal with wobble pair problem
mismatchtargets = []
seq_file = "GGGGCCGTAGCTCAGCTGGGAGAGCACCTGCTTTGCAAGCAGGGGGTCGTCGGTTCGATCCCGTCCGGCTCCACCA"
#This one has 3bp long complement to find. Program will only fold 4bp+ stretches now
#http://trna.bioinf.uni-leipzig.de/DataOutput/Result?ID=tdbD00003770
#alanine from Brucella melitensis
#seq_file = "GGGGGCUCUGUUGGUUCUCCCGCAACGCUACUCUGUUUACCAGGUCAGGUCCGGAAGGAAGCAGCCAAGGCAGAUGACGCGUGUGCCGGGAUGUAGCUGGCAGGGCCCCCACC"
#seq_file = 'GCCGGGCGCGGTGGCGCGTGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCTGGAGGATCGCTTGAGTCCAGGAGTTCTGGGCTGTAGTGCGCTATGCCGATCGGGTGTCCGCACTAAGTTCGGCATCAATATGGTGACCTCCCGGGAGCGGGGGACCACCAGGTTGCCTAAGGAGGGGTGAACCGGCCCAGGTCGGAAACGGAGCAGGTCAAAACTCCCGTGCTGATCAGTAGTGGGATCGCGCCTGTGAATAGCCACTGCACTCCAGCCTGGGCAACATAGCGAG'
#http://www.ncbi.nlm.nih.gov/nuccore/527047183?report=fasta
#######position marking
prevpos1=0#locations of previous folds
prevpos2=0
prevpos3=0
prevpos4=0
maxmark = 0 #the furthest point that the loop has been folded
firstfoldpos = 0 #location of the first fold (for folding 3' end to 5' end)
last = 0
#####file related
tempseq_file2 =''
extendedseq_file = []
dashedseq_file =''
tempseq_file = seq_file
length = len(seq_file)
#####storage
nonoverlapcomp = []
paired = []
testlist= []
pieces =[]
targets = []
#######counters
start = 9 # 9 for trnas, 0 otherwise.
n = 4 # start looking for reverse complements of 4bp or above.
counter = 0
while n > 3: #only looking currently for reverse complements of 4bp or longer
pieces = [] #the current sections of block broken up.
piece_pos =[] #where in the sequence each piece (for checking overlaps)
piece_comps = [] #The reverse complements of all the pieces
##########Which sequence version to use as the working sequence
if counter == 0: #there's no loop increasing counter for now
#if not extendedseq_file: #if the first fold has not been found yet, use the original file
working_seq = seq_file
#else:
# working_seq = extendedseq_file #if the first fold has been found, concatenate the sequence previous to the first fold to the end of the sequence. (the tail pairs with the head)
if counter ==0: #there's no loop increasing counter for now
block = working_seq[start:start+length] #the current area of the sequence
##########This breaks the sequence into pieces like 12345678, 1234567, 123456,...12, 2345678, 234567, 23456, etc...) and creates a list of all chunk reverse complements
block = block + working_seq[0:10]
len_block = len(block)
j = 0 #beginning of piece
end = length-1 #end of piece
k=j+10
while j<end:
while k > j:
piece = block[j:k+1]
if piece.isalpha(): #for later when sequences with dashes are run through
#pieces.append(piece) # all the pieces
pieces.append([piece, start+j, start+k+1]) # all the pieces with positional information #the positional tracking is off)
#current = Seq(piece)
#currentrevcomp = str(current.reverse_complement())
#piece_comps.append(currentrevcomp)#all the revcomps
k = k - 1
j = j + 1
k=j+10
##########
pieces = sorted(pieces, key = itemgetter(1))
pieces_copy = pieces
for piece in pieces:
for piece_copy in pieces_copy:
if piece[2] < piece_copy[1] and (piece_copy[1]-piece[2])<25:
if piece[1] > last:
if piece[1] > length:
piece[1] = piece[1]-length
if len(piece[0]) ==len(piece_copy[0]) and len(piece[0]) > n-1:
#print "before rchd" + str(piece[0])
if revcomp_hamming_distance(piece[0], piece_copy[0]) ==0:
range1=set(range(piece[1],piece[2]))
range2=set(range(piece_copy[1], piece_copy[2]))
if not range1.intersection(range2):
targets.append(piece + piece_copy)
last = piece_copy[2]
#print "last" + str(last)
if len(piece[0]) == len(piece_copy[0]) and len(piece[0])>6:
if internal_hamming_distance(piece[0], piece_copy[0]) ==1: #if there's one internal mismatch
#print piece
#print piece_comp
range1=set(range(piece[1],piece[2]))
range2=set(range(piece_copy[1], piece_copy[2]))
if not range1.intersection(range2):
targets.append(piece + piece_copy)
last=piece_copy[2]
#if firstfoldpos == 0:
#firstfoldpos = min(target_range)
#extendedseq_file = seq_file + seq_file[0:firstfoldpos]
#start = 0
#nonoverlapcomp = []
#n = n-1
for i in range(len(targets)):
# [(targets[i][j]-length) for j in [4,5] if (targets[i][j] > length-1)] bad list comprehension
if targets[i][4] > length-1:
targets[i][4] = targets[i][4]-length
if targets[i][5] > length-1:
targets[i][5] = targets[i][5]-length
print "The original sequence: " + str(seq_file)
#for making dot bracket format later
#dot = ""
#for char in tempseq_file:
# if char.isalpha():
# dot = dot + "."
# else:
# dot = dot + char
#print "Dot bracket format: "+ str(dot)
sorted_list = sorted(targets, key=itemgetter(1,4))
#print "Folded areas" + str(sorted_list)
for item in range(len(sorted_list)):
print 'fold #{0}: {1}'.format(item+1, sorted_list[item])