-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBIM.py
124 lines (103 loc) · 3.58 KB
/
BIM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 25 05:40:58 2019
@author: cricketjanoon
"""
import numpy as np
import re
import math
##----------------------------
trainingSetCount = 32604
numberOfSearchesToShow = 5
##----------------------------
##-->> sample queries
# "gbagbo loyalists regain ground in ivory coast beach"
# "on debt ahead of high-level talks"
# "'nhl finals,' 'nixon in china'"
# "superintendent of the beaumont school district"
# "here's a look at some of the league's top young,"
#taking query from the user
inputQuery = input("Enter your query: ")
query = []
for term in inputQuery.split():
t = re.sub(r"[^a-zA-Z0-9]","",term)
query.append(t)
#reading the file and storing the documetns
file = open("LSI-data.txt")
documentMatrix = np.full((trainingSetCount,3), dtype=object, fill_value = "empty")
documentCount = 0
index = 0
for line in file:
if documentCount>=trainingSetCount:
break
if line in ['\n','\r\n']:
documentCount = documentCount + 1
index = 0
else:
index = index + 1
if index == 1:
documentMatrix[documentCount,1] = line
elif index == 2:
documentMatrix[documentCount,2] = line
elif index == 7:
documentMatrix[documentCount,0] = line
#cleaning text code
for i in range(documentMatrix.shape[0]):
for j in range(documentMatrix.shape[1]):
documentMatrix[i,j] = re.sub(r'[^\w\s_]+', '', documentMatrix[i,j]).strip()
documentMatrix[i,j] = documentMatrix[i,j].lower()
documentMatrix = documentMatrix.astype(str)
documentMatrix2 = np.full((trainingSetCount), dtype=object, fill_value = "empty")
for i in range(len(documentMatrix)):
documentMatrix2[i] = documentMatrix[i,1] + " " + documentMatrix[i, 2]
documentMatrix2 = documentMatrix2.astype(str)
documents = list(documentMatrix2)
documentMatrix2 = None
#collect all words in the all documents
bagOfWords = []
for line in documents:
for word in line.split():
word = re.sub(r"[^a-zA-Z0-9]","",word)
word = word.lower()
if word not in bagOfWords:
bagOfWords.append(word)
#instantiate term document matrix of appropriate size with zeros
termDocumentMatrix = np.zeros((len(documents), len(bagOfWords)), dtype = 'int8')
#populate term document matrix
for i in range(len(documents)):
docWordList = documents[i].split();
for t in range(len(docWordList)):
docWordList[t] = re.sub(r"[^a-zA-Z0-9]","",docWordList[t])
docWordList[t] = docWordList[t].lower()
for j in range(len(bagOfWords)):
if bagOfWords[j] in docWordList:
termDocumentMatrix[i,j] = 1
#calculate dft of each term and calculate its weight "w"
n = len(documents)
dft = np.zeros((len(bagOfWords)))
weight = np.zeros((len(bagOfWords)))
for i in range(len(bagOfWords)):
for j in range(len(documents)):
if termDocumentMatrix[j,i] == 1:
dft[i] = dft[i] +1
weight[i] = math.log(n/dft[i], 2)
#now starting real BIM, calculating the retrieval status value of each doc
docProb = np.zeros((len(documents)))
for q in query:
termIndex = bagOfWords.index(q)
for d in range(len(termDocumentMatrix)):
if termDocumentMatrix[d, termIndex] == 1:
docProb[d] = docProb[d] + weight[termIndex]
#displaying the results
indices = docProb.argsort()[-numberOfSearchesToShow:][::-1]
print("\n")
for i in range(len(indices)):
print("Title: " + documentMatrix[indices[i],1])
print("Content: " + documentMatrix[indices[i],2])
print("Category: " + documentMatrix[indices[i],0])
print("RSV: " + str(docProb[indices[i]]))
print("Terms found: ", end='')
for j in range(len(termDocumentMatrix[indices[i]])):
if termDocumentMatrix[indices[i], j] == 1 and bagOfWords[j] in query:
print(bagOfWords[j] + ", ", end = '')
print("\n")