-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrepeatitive_sequence_suppression.py
163 lines (138 loc) · 8.18 KB
/
repeatitive_sequence_suppression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# Repetitive Sequence Suppression Algorithm
import re # regular expression
# This algorithm has a significant draw back which is for a data with no repetitions
# it produces compressed data larger than the original data and this is shown in
# the functions ending with "primitive" word. However, we can just approach differently
# to solve such a problem and this is shown in the functions with the same name as
# the last two but without the "primitive" keyword.
# I put a test case to show the difference, check it out!
def suppress_repetitive_sequence(sequence):
result = "" # Initialize an empty string to store the suppressed sequence
count = 1 # Initialize a counter to keep track of the length of the current repetitive sequence
for i in range(1, len(sequence)):
# If the current element is equal to the previous element, increase the counter
if sequence[i] == sequence[i - 1]:
count += 1
else: # If the current element is not equal to the previous element
if count >= 2: # If the length of the current sequence is greater than 1
# mark the occurrence of the repeated element along with the count
result += "".join(f"r{sequence[i - 1]}{count}")
else: # If the length of the current repetitive sequence is NOT greater than 1
# just append the character to the result string
result += "".join(sequence[i - count:i])
count = 1 # Reset the counter for the next sequence
# the last character or sequence (as no character next to the last one)
if count >= 2: # If the length of the current sequence is greater than 1
# mark the occurrence of the repeated element along with the count
result += "".join(f"r{sequence[-1]}{count}")
else: # If the length of the current repetitive sequence is NOT greater than 1
# just append the character to the result string
result += "".join(sequence[len(sequence) - count:])
return result # Return the resulting sequence
def decompress_sequence(compressed_sequence):
result = "" # Initialize an empty string to store the decompressed sequence
# Use regular expression to find all occurrences of the compressed repetitive sequences
# this returns a list of tuples of characters and their occurrence times
matches = re.findall(r'r(\w)(\d+)', compressed_sequence)
start_index = 0
for match in matches:
char, count = match # get character and count from tuple
# find where this character relies on in the sequence
index = compressed_sequence.find(f"r{char}{count}", start_index)
# Append any characters before the current repetitive sequence to the result
result += compressed_sequence[start_index:index]
# Append the sequence repeated to the result
result += char * int(count)
# move the start index next to the repeated sequence flag
start_index = index + len(f"r{char}{count}")
# Append any remaining characters after the last repetitive sequence to the result
result += compressed_sequence[start_index:]
return result
def suppress_repetitive_sequence_primitive(sequence):
result = "" # Initialize an empty string to store the suppressed sequence
count = 1 # Initialize a counter to keep track of the length of the current repetitive sequence
for i in range(1, len(sequence)):
# If the current element is equal to the previous element, increase the counter
if sequence[i] == sequence[i - 1]:
count += 1
else: # If the current element is not equal to the previous element
# mark the occurrence of the repeated element along with the count
result += "".join(f"r{sequence[i - 1]}{count}")
count = 1 # Reset the counter for the next sequence
# the last character or sequence (as no character next to the last one)
# mark the occurrence of the repeated element along with the count
result += "".join(f"r{sequence[-1]}{count}")
return result # Return the resulting sequence
def decompress_sequence_primitive(compressed_sequence):
result = "" # Initialize an empty string to store the decompressed sequence
# Use regular expression to find all occurrences of the compressed repetitive sequences
# this returns a list of tuples of characters and their occurrence times
matches = re.findall(r'r(\w)(\d+)', compressed_sequence)
start_index = 0
for match in matches:
char, count = match # get character and count from tuple
# find where this character relies on in the sequence
index = compressed_sequence.find(f"r{char}{count}", start_index)
# Append any characters before the current repetitive sequence to the result
result += compressed_sequence[start_index:index]
# Append the sequence repeated to the result
result += char * int(count)
# move the start index next to the repeated sequence flag
start_index = index + len(f"r{char}{count}")
# Append any remaining characters after the last repetitive sequence to the result
result += compressed_sequence[start_index:]
return result
if __name__ == '__main__':
# The two test cases show the great difference between the two approaches
# the primitive approach gives 3 times more data when data has no repeated characters
# while the enhanced one gives the same data where nothing can be done actually
# for the second case with a lot of repeated characters the primitive approach
# can deliver 45.45% compression ratio while the enhanced delivers 39.39%
# if you checked the outputs carefully you will notice that the primitive approach
# insists on encoding each and every character while the enhanced one only encodes
# the repeated ones meaning that it only encodes when it's necessary
# Test cases
print("Test Case #1: No Repetition")
no_repetition_text = "thisisatextwithnospaces"
# primitive approach
prim_compressed_output1 = suppress_repetitive_sequence_primitive(no_repetition_text)
prim_compression_ratio1 = ((len(prim_compressed_output1) / len(no_repetition_text)) * 100)
prim_decompressed_output1 = decompress_sequence_primitive(prim_compressed_output1)
print("Primitive Approach")
print(f"original text: {no_repetition_text}")
print(f"compressed output: {prim_compressed_output1}")
print(f"decompression output: {prim_decompressed_output1}")
print(f"compression ratio: {round(prim_compression_ratio1, 2)}%")
print()
# enhanced approach
compressed_output1 = suppress_repetitive_sequence(no_repetition_text)
compression_ratio1 = ((len(compressed_output1) / len(no_repetition_text)) * 100)
decompressed_output1 = decompress_sequence(compressed_output1)
print("Enhanced Approach")
print(f"original text: {no_repetition_text}")
print(f"compressed output: {compressed_output1}")
print(f"decompression output: {decompressed_output1}")
print(f"compression ratio: {round(compression_ratio1, 2)}%")
print()
print("Test Case #2: Repetition")
repetition_text = "aaaaaaaaaaaaaaabcccccccccdddddddddddddddeeeeeeefgghhhhhhhhhhiiiiii"
# primitive approach
prim_compressed_output2 = suppress_repetitive_sequence_primitive(repetition_text)
prim_compression_ratio2 = ((len(prim_compressed_output2) / len(repetition_text)) * 100)
prim_decompressed_output2 = decompress_sequence_primitive(prim_compressed_output2)
print("Primitive Approach")
print(f"original text: {repetition_text}")
print(f"compressed output: {prim_compressed_output2}")
print(f"decompression output: {prim_decompressed_output2}")
print(f"compression ratio: {round(prim_compression_ratio2, 2)}%")
print()
# enhanced approach
compressed_output2 = suppress_repetitive_sequence(repetition_text)
compression_ratio2 = ((len(compressed_output2) / len(repetition_text)) * 100)
uncompressed_output2 = decompress_sequence(compressed_output2)
print("Enhanced Approach")
print(f"original text: {repetition_text}")
print(f"compressed output: {compressed_output2}")
print(f"decompression output: {uncompressed_output2}")
print(f"compression ratio: {round(compression_ratio2, 2)}%")
print()