forked from sreeramkannan/Shannon
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fast_reps.py
150 lines (128 loc) · 4.8 KB
/
fast_reps.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import time
import sys
import re
import pdb,math
import numpy
from sets import Set
BASES = ['A', 'G', 'C', 'T']
r=18
rmer_to_contig = {}
contig_to_rmer = {}
cmer_to_contig = {}
def reverse_complement(bases):
"""Return the reverse complement of BASES. Assumes BASES is
all uppercase.
"""
replacements = [('A', 't'), ('T', 'a'), ('C', 'g'), ('G', 'c')]
for ch1, ch2 in replacements:
bases = re.sub(ch1, ch2, bases)
return bases[::-1].upper()
class Counter():
def __init__(self, name, report_length):
self.name = name
self.count = 0
self.report_length = report_length
def increment(self):
self.count += 1
if self.count % self.report_length == 0:
print "{:s}: {:s}, processed {:d}".format(time.asctime(), self.name, self.count)
def find_kmers(contig,k,ds):
'''find the kmers of a contig of size k with ds denoting double stranded)'''
rmer_list = []
for i in range(0, len(contig)-r+1):
rmer_list.append(contig[i:i+r])
if ds:
rmer_list.append(reverse_complement(contig[i:i+r]))
return rmer_list
def argmax(lst, key):
"""Returns the element x in LST that maximizes KEY(x).
"""
best = lst[0]
for x in lst[1:]:
if key(x) > key(best):
best = x
return best
def duplicate_check(contigs, contig_name, ds, f = 0.99):
# To add: if rmer in the contig multiple times, only increment the dup-contig once for each time its in dup-contig
dup_count = {}
contig = contigs[contig_name]
qName = contig_name; qLen = len(contig);
max_till_now = 0; max_contig_index = -1
for rmer in find_kmers(contig,r,ds):
if rmer in rmer_to_contig:
for dup in rmer_to_contig[rmer]:
if dup == contig_name:
'Only take duplicates other than curr contig'
continue
if dup not in dup_count:
dup_count[dup] = 1
else:
dup_count[dup] += 1
if dup_count[dup] >= max_till_now:
max_till_now = dup_count[dup]; max_contig_name = dup
imp_dups = {} #Duplicates that are considered
for dup in dup_count:
if dup_count[dup] >= f*(qLen-r):
imp_dups[dup] = numpy.zeros(len(contig))
imp_dups_set = Set(list(imp_dups.keys()))
a = numpy.zeros(len(contig))
for i in range(len(contig)-r+1):
rmer = contig[i:i+r]
if rmer in rmer_to_contig:
for dup in imp_dups_set.intersection(rmer_to_contig[rmer]):
imp_dups[dup][i:i+r] = 1
if ds:
rmer_rc=reverse_complement(rmer)
if rmer_rc in rmer_to_contig:
for dup in imp_dups_set.intersection(rmer_to_contig[rmer]):
imp_dups[dup][i:i+r] = 1
for dup in imp_dups: #for dup in dup_count:
tName = dup; tLen = len(contigs[tName])
score = sum(imp_dups[dup])
if score >= f*qLen:
if (tLen > qLen) or ((tLen == qLen) and (tName > qName)):
return True
return False
def find_reps(infile, outfile,ds):
contigs = {}
contig_no = 0
for line in open(infile):
if line[0]=='>':
temp = line.strip().split(); curr_name = temp[0][1:]
contig_no += 1
if contig_no % 1000 == 0:
print ("processed " + str(contig_no) + " sequences")
continue
else:
curr_contig = line.strip()
contigs[curr_name] = curr_contig
for rmer in find_kmers(curr_contig,r,ds):
if rmer in rmer_to_contig:
rmer_to_contig[rmer].add(curr_name)
else:
rmer_to_contig[rmer]= Set([curr_name])
with open(outfile,'w') as out_file:
contig_no = 0
for contig_name in contigs:
contig_no +=1
duplicate_suspect = duplicate_check(contigs,contig_name, ds)
if contig_no % 1000 == 0:
print("written " + str(contig_no) + " sequences")
if not duplicate_suspect:
out_file.write('>' + contig_name+'\n')
out_file.write(contigs[contig_name]+'\n')
def main():
if len(sys.argv) == 1:
arguments = ['asd', 'in_fasta', 'out_fasta', '-d']
else:
arguments = sys.argv
#pdb.set_trace()
ds = '-d' in arguments
arguments = [a for a in arguments if len(a) > 0 and a[0] != '-'][1:]
infile, outfile = arguments[:2]
#pdb.set_trace()
find_reps(infile, outfile, ds)
if __name__ == '__main__':
#c1 = Counter("Loading", 10**6)
#c2 = Counter("Correction", 10**6)
main()