-
Notifications
You must be signed in to change notification settings - Fork 0
/
search.c
350 lines (292 loc) · 7.32 KB
/
search.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
#include "hashmap.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
struct hashmap* training(char** files, int numBuckets);
struct hashmap* documentFrequency(struct hashmap* theMap);
struct hashmap* stopWords(struct hashmap*);
void readQuery(struct hashmap* theMap, char* searchQuery);
void rank(struct hashmap* theMap, char** theWords,int numWords);
struct hashmap* training(char** files, int numBuckets)
{
struct hashmap* theMap = hm_create(numBuckets);
FILE* dataFile;
int i;
for (i = 0; i < 3; i++)
{
// opens data file to read from
dataFile = fopen(files[i], "r");
char getWords[100];
// while there are still words to read , read from file
while ((fgets(getWords, 100, dataFile)) != NULL)
{
// uses strtok to get a word before each space
char *theWord = strtok(getWords, " ");
while (theWord != NULL)
{
// gets the word from the hash map
struct llnode* occuring = hm_get(theMap, theWord);
// if the word is not in the hashmap put it in with the corrent occurences depending on the file
if (occuring == NULL)
{
if (!(strcmp(files[i], "D1.txt")))
{
hm_put(theMap, theWord, 1, 0, 0,0);
}
if(!(strcmp(files[i], "D2.txt")))
{
hm_put(theMap, theWord,0, 1, 0,0);
}
if (!(strcmp(files[i], "D3.txt")))
{
hm_put(theMap, theWord, 0, 0, 1,0);
}
}
// if the word is in the hashmap all ready, update the occurences depending on the file
else
{
if (!(strcmp(files[i], "D1.txt")))
{
int occur = occuring->d1_occurences;
occur = occur + 1;
occuring->d1_occurences=occur;
}
if (!(strcmp(files[i], "D2.txt")))
{
int occur = occuring->d2_occurences;
occur = occur + 1;
occuring->d2_occurences=occur;
}
if(!(strcmp(files[i],"D3.txt")))
{
int occur = occuring->d3_occurences;
occur = occur + 1;
occuring->d3_occurences=occur;
}
}
theWord = strtok(NULL, " ");
}
}
}
// call documentFequency on the map with updated occurences
struct hashmap* docFreqMap = documentFrequency(theMap);
return docFreqMap;
}
struct hashmap* documentFrequency(struct hashmap* theMap)
{
int i = 0;
// iterates through the whole hashmap
while (i < theMap->num_buckets)
{
struct llnode* temp = theMap->map[i];
struct llnode* iterator = temp;
// depending on the num occurences for each file is how docFre is updated
while (iterator != NULL)
{
int docFre = 0;
if (iterator->d1_occurences > 0)
{
docFre = docFre + 1;
}
if (iterator->d2_occurences)
{
docFre = docFre + 1;
}
if (iterator->d3_occurences)
{
docFre = docFre + 1;
}
// updates the nodes doc frequency
iterator->docFreq = docFre;
iterator = iterator->next;
}
i++;
}
// returns the updated hashmap
return theMap;
}
struct hashmap* stopWords(struct hashmap* theMap)
{
struct llnode* temp;
int i=0;
// iterates through entire hashmap
while (i < theMap->num_buckets)
{
temp = theMap->map[i];
struct llnode* iterator = temp;
while (iterator != NULL)
{
// calculates the inverse docFrequency
double decimalDoc = (double)iterator->docFreq;
double inverseDoc = log10(3.0 / decimalDoc);
// if it is 0 then it removes the word
if (inverseDoc == 0)
{
hm_remove(theMap, iterator->word);
}
iterator = iterator->next;
}
i++;
}
return theMap;
}
void readQuery(struct hashmap* theMap, char* searchQuery)
{
int numWords = 0;
int j = 0;
// counts how many words there are
while (j<100)
{
// if there is a space it adds to the word count
if (searchQuery[j] == ' ')
{
numWords = numWords + 1;
}
if (searchQuery[j] == '\000')// breaks after the last word
{
numWords = numWords + 1;
break;
}
j++;
}
// creates an array for the words of the query to go in
char** theWords = malloc(sizeof(char*)*numWords);
int i = 0;
{
char* individualWord;
while (i < numWords)
{
individualWord = strtok(searchQuery, " ");
while (individualWord != NULL)
{
// copies word into the array
char* wordCopy = (char*)malloc(sizeof(char)*strlen(individualWord));
strcpy(wordCopy, individualWord);
theWords[i] = individualWord;
i++;
individualWord = strtok(NULL, " ");
}
i++;
}
}
//calls ranks
rank(theMap, theWords, numWords);
}
void rank(struct hashmap* theMap,char** theWords, int numWords)
{
int i = 0;
double doc1Sum;
double doc2Sum;
double doc3Sum;
while (i < numWords)
{
//tf-idf variables
double doc1;
double doc2;
double doc3;
// gets the node for the word, if the word isnt there then frequencys are 0
struct llnode* temp = hm_get(theMap, theWords[i]);
if (temp == NULL)
{
doc1 = 0;
doc2 = 0;
doc3 = 0;
}
else // if the node is there
{
// gets the doc frequency for that node
double freq = (double)temp->docFreq;
// calculates the tf-idf for each word for each document
doc1 = ((double)(temp->d1_occurences))* (log10(3.0 / freq));
doc2 = ((double)(temp->d2_occurences))* (log10(3.0 / freq));
doc3 = ((double)(temp->d3_occurences))* (log10(3.0 / freq));
// keeps a running sum for each document
doc1Sum = doc1Sum + doc1;
doc2Sum = doc2Sum + doc2;
doc3Sum = doc3Sum + doc3;
}
i++;
}
//create a char array of the files to sort them
char** fileArr = malloc(sizeof(char*) * 3);
fileArr[0] = "D1.txt";
fileArr[1] = "D2.txt";
fileArr[2] = "D3.txt";
// also create an array of the tf-idf values
double tfIdf [3];
tfIdf[0] = doc1Sum;
tfIdf[1] = doc2Sum;
tfIdf[2] = doc3Sum;
int g;
int j;
char* tempWord;
// goes through and sorts the file array by the tfIdf
for (g = 0; g < 3; g++)
{
for (j = g; j <3; j++)
{
// compares and if true it swaps them
if (tfIdf[j] > tfIdf[g])
{
tempWord = fileArr[g];
fileArr[g] = fileArr[j];
fileArr[j] = tempWord;
}
}
}
int h;
int b;
printf("Document Order:\n");
for (h = 0; h < 3; h++)
{
b = h + 1;
printf("%d. %s\n",b, fileArr[h]);
}
}
int main(void)
{
// list of files to be read
char** files = malloc(sizeof(char*) * 3);
files[0] = "D1.txt";
files[1] = "D2.txt";
files[2] = "D3.txt";
//asks user for number of buckets to create hashmap and sends it to training() and removes stop words
printf("Enter the number of buckets for the hashmap(int)\n");
int numBuckets;
scanf("%d", &numBuckets);
struct hashmap* theMap = training(files, numBuckets);
struct hashmap* stopMap= stopWords(theMap);
// start query process
printf("Enter X to exit or S to search\n");
char searchExit;
scanf(" %c", &searchExit);
char searchQuery[100]; // decided to assume the query was not longer than 100
//depending on the search char
while (1)
{
if ((searchExit != 'S') && searchExit != 'X')
{
printf("Not a proper search/exit character");
break;
}
if (searchExit == 'X')
{
printf("Exiting\n");
break;
}
// if S is enter prompts user to enter a search
if (searchExit == 'S')
{
printf("what do you want to search?\n");
scanf(" %[^\n]s", searchQuery);
// calls read query and then asks user if they want to search again or not
readQuery(stopMap, searchQuery);
printf("Enter X to exit or S to search\n");
scanf(" %c", &searchExit);
}
}
// destroy the map
hm_destroy(stopMap);
return 0;
}