# spimi.py
import ast
import collections
import os
import re
import sys
from collections import OrderedDict


def spimi_invert(documents, block_size_limit):
    """ Applies the single-pass in-memory indexing (SPIMI) algorithm """
    block_number = 0
    documents_count = len(documents)
    dictionary = {}  # term -> postings list
    for index, docID in enumerate(documents):
        for term in documents[docID]:
            if term not in dictionary:
                # First occurrence: add the term with a new postings list
                dictionary[term] = [docID]
            else:
                # Subsequent occurrence: append the docID; duplicates are kept
                # so term frequencies can be derived later by calculate_tftd
                dictionary[term].append(docID)
        # Flush the in-memory dictionary to disk once it outgrows the block
        # size limit (sys.getsizeof measures only the dict object itself, so
        # this is an approximation) or once the last document is processed
        if sys.getsizeof(dictionary) > block_size_limit or index == documents_count - 1:
            write_block_to_disk(sort_terms(dictionary), block_number)
            block_number += 1
            dictionary = {}
    print("SPIMI invert complete!")
def sort_terms(term_postings_list):
    """ Sorts dictionary terms in alphabetical order """
    print(" -- Sorting terms...")
    sorted_dictionary = OrderedDict()  # preserves insertion (here: sorted) order
    for term in sorted(term_postings_list):
        postings = [int(docID) for docID in term_postings_list[term]]
        sorted_dictionary[term] = calculate_tftd(postings)
    return sorted_dictionary
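
# For example (hypothetical input), sort_terms({'dog': [2], 'cat': [1, 1]})
# returns OrderedDict([('cat', [[1, 2]]), ('dog', [[2, 1]])]): keys come out
# alphabetically and each postings list is collapsed by calculate_tftd.
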
def calculate_tftd(pl_with_duplicates):
    """ Collapses a postings list with duplicate docIDs into [docID, term frequency] pairs """
    counter = collections.Counter(pl_with_duplicates)
    return [[int(docID), count] for docID, count in counter.items()]
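
# Worked example: calculate_tftd([4, 9, 4, 21]) returns [[4, 2], [9, 1], [21, 1]].
# Counter tallies how often each docID occurs in the raw postings list, which is
# exactly the term frequency tf(t, d) of the term in that document.
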
def write_block_to_disk(term_postings_list, block_number):
    """ Writes the index of the block (dictionary + postings lists) to disk """
    base_path = 'index_blocks/'
    os.makedirs(base_path, exist_ok=True)  # create the block directory if needed
    block_name = 'block-' + str(block_number) + '.txt'
    print(" -- Writing term-postings list block: " + block_name + "...")
    # One term per line in the format term:[[docID, tf], ...]
    # e.g. cat:[[4, 2], [9, 1], [21, 1]]
    with open(base_path + block_name, 'w') as block:
        for term in term_postings_list:
            block.write(str(term) + ":" + str(term_postings_list[term]) + "\n")
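
# Note: merge_blocks re-parses these lines with line.rsplit(':', 1) and
# ast.literal_eval; splitting on the last ':' keeps terms that themselves
# contain a ':' intact.
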
def merge_blocks(blocks):
    """ Merges SPIMI blocks into the final inverted index """
    merge_completed = False
    spimi_index = open('spimi_inverted_index.txt', 'w')
    # Collect an initial pointer to the (term : postings list) entry of each SPIMI block
    temp_index = OrderedDict()
    for num, block in enumerate(blocks):
        print("-- Reading into memory...", block.name)
        line = block.readline()  # term:[[docID1, tf1], [docID2, tf2], ...]
        line_tpl = line.rsplit(':', 1)
        term = line_tpl[0]
        postings_list = ast.literal_eval(line_tpl[1])
        temp_index[num] = {term: postings_list}
    while not merge_completed:
        # Pair each block's current (term : postings list) entry with its blockID
        tpl_block = [[temp_index[i], i] for i in temp_index]
        # Fetch the entry whose term is alphabetically smallest
        smallest_tpl = min(tpl_block, key=lambda t: list(t[0].keys()))
        smallest_tpl_term = list(smallest_tpl[0].keys())[0]
        # Fetch the IDs of all blocks whose current entry holds that same term
        smallest_tpl_block_ids = [block_id for block_id in temp_index
                                  if smallest_tpl_term in temp_index[block_id]]
        # Build the merged postings list for the smallest term:
        # flatten the per-block postings and sort them
        smallest_tpl_pl = sorted(sum([temp_index[block_id][smallest_tpl_term]
                                      for block_id in smallest_tpl_block_ids], []))
        spimi_index.write(str(smallest_tpl_term) + ":" + str(smallest_tpl_pl) + "\n")
        # Advance the read pointer of every block that contained the merged term
        for block_id in smallest_tpl_block_ids:
            block = [file for file in blocks
                     if re.search(r'block-' + str(block_id) + r'\.txt$', file.name)]
            if block:
                line = block[0].readline()
                if line != '':
                    line_tpl = line.rsplit(':', 1)
                    term = line_tpl[0]
                    postings_list = ast.literal_eval(line_tpl[1])
                    temp_index[block_id] = {term: postings_list}
                else:
                    # Block exhausted: drop its pointer and stop reading from it
                    del temp_index[block_id]
                    blocks.remove(block[0])
                    print("Finished merging block:", block[0].name)
                    block[0].close()
        # All block IO streams have been merged
        if not blocks:
            merge_completed = True
    spimi_index.close()
    print("SPIMI completed! All blocks merged into final index: spimi_inverted_index.txt")
    return 0
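

# A minimal end-to-end driver sketch over a tiny hypothetical corpus; the token
# lists and block size below are illustrative, not part of the original module.
if __name__ == '__main__':
    import glob
    docs = {
        1: ['cat', 'dog', 'cat'],
        2: ['dog', 'fish'],
    }
    spimi_invert(docs, block_size_limit=10_000)
    # merge_blocks expects open file handles to the block files written above
    block_files = [open(name) for name in sorted(glob.glob('index_blocks/block-*.txt'))]
    merge_blocks(block_files)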