Indexer.py
import io
from pprint import pprint

import Parser

# GLOBAL CONSTANTS
DOC_TOKEN_COUNT = {}
INVERTED_INDEX = {}

# Read the list of stopwords.
with open('common_words') as f:
    STOP_WORDS = f.read().splitlines()
def unigram_index(stopping):
    # Generates a unigram index from the tokenized output of Parser.
    # If 'stopping' is True, stop words are not indexed.
    # Invokes Parser to parse the raw HTML files and generate tokens.
    Parser.main()
    for doc_name in Parser.DOC_TOKENS_MAP:
        tokens = Parser.DOC_TOKENS_MAP[doc_name]
        print("Indexing document : " + doc_name)
        # Keep track of the number of tokens in each document.
        DOC_TOKEN_COUNT[doc_name] = len(tokens)
        for token in tokens:
            if stopping:
                if token not in STOP_WORDS:
                    index_token(token, doc_name)
            else:
                index_token(token, doc_name)
    print("Completed Indexing!")
def index_token(token, doc_name):
    # Records one occurrence of 'token' in 'doc_name',
    # incrementing its term frequency in the inverted index.
    if token not in INVERTED_INDEX:
        INVERTED_INDEX[token] = {doc_name: 1}
    elif doc_name not in INVERTED_INDEX[token]:
        INVERTED_INDEX[token][doc_name] = 1
    else:
        INVERTED_INDEX[token][doc_name] += 1
def output_index_to_file(filename):
    # Saves the generated inverted index to a text file.
    print("Saving index to file . .")
    with io.open(filename + ".txt", "w") as outfile:
        pprint(INVERTED_INDEX, stream=outfile)
def main(stopping):
    # Generates the unigram index.
    # Pass 'stopping' as True if stop words should be removed,
    # else pass False.
    unigram_index(stopping)
    print("Completed Indexing.")