-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFrequentWordsWithMismatches.py
59 lines (45 loc) · 1.8 KB
/
FrequentWordsWithMismatches.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# Find the most frequent kmers with at most d mismatches
from AproximatePatternCount import ApproximatePatternCount
from Neighbors import Neighbors
from Combinations import Combinations
import itertools
import time
from collections import defaultdict
def FrequentWordsWithMismatches(Genome, k, d):
start = time.process_time()
'''
1. Create a k-mer list with ['A','C','G','T'] (4**k k-mers in the list)
2. Calculate approximate pattern count for each k-mer (total 4**k)
3. Get k-mer(s) with maximum approximate pattern count.
'''
aprox_frq_words = []
frequencies = {}
# all existent combinations of k-length kmers 4**k
combinations = Combinations(k)
for kmer in combinations:
count = ApproximatePatternCount(kmer, Genome, d)
frequencies[kmer] = count
for kmer in frequencies:
if frequencies[kmer] == max(frequencies.values()):
aprox_frq_words.append(kmer)
end = time.process_time()
print("Time:", end - start)
return aprox_frq_words
def FasterFrequentWordsWithMismatches(Genome, k, d):
start = time.process_time()
aprox_frq_words = []
frequencies = defaultdict(lambda: 0)
# all existent kmers with d mismatches of current kmer in genome
for index in range(len(Genome) - k + 1):
curr_kmer = Genome[index : index + k]
neighbors = Neighbors(curr_kmer, d)
for kmer in neighbors:
frequencies[kmer] += 1
for kmer in frequencies:
if frequencies[kmer] == max(frequencies.values()):
aprox_frq_words.append(kmer)
end = time.process_time()
print("Time:", end - start)
return aprox_frq_words
# print(FrequentWordsWithMismatches("ACGTTGCATGTCGCATGATGCATGAGAGCT", 4,1))
# print(FasterFrequentWordsWithMismatches("ACGTTGCATGTCGCATGATGCATGAGAGCT", 4,1))