-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathhash.c
430 lines (368 loc) · 10.4 KB
/
hash.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
/* This file is part of the software similarity tester SIM.
Written by Dick Grune, Vrije Universiteit, Amsterdam.
$Id: hash.c,v 2.18 2012-06-08 06:52:14 Gebruiker Exp $
*/
/* Text is compared by comparing every substring to all substrings
to the right of it; this process is in essence quadratic. However,
only substrings of length at least 'Min_Run_Size' are of interest,
which gives us the possibility to speed up this process by using
a hash table.
For every position in the text, we construct an index which gives
the next position in the text at which a run of Min_Run_Size tokens
starts that has the same hash code, as calculated by hash1(). If
there is no such run, the index is 0. These forward references are
kept in the array forward_reference[].
To construct this array, we use a hash table last_index[] whose size
is a prime and which is about 8 times smaller than the text array.
The hash table last_index[] is set up such that last_index[i] is the
index of the latest token with hash_code i, or 0 if there is none.
This results in hash chains of an average length of 8. See
Make_Forward_References().
If there is not enough room for a hash table of the proper size
(which can be considerable) the hashing is not efficient any more.
In that case, the forward reference table is scanned a second time,
eliminating from any chain all references to runs that do not hash to
the same value under a second hash function, hash2(). For the UNIX
manuals this reduced the number of matches from 91.9% to 1.9% (of
which 0.06% was genuine).
*/
#include <stdio.h>
#include <stdint.h>
#include "system.par"
#include "debug.par"
#include "sim.h"
#include "text.h"
#include "Malloc.h"
#include "error.h"
#include "token.h"
#include "language.h"
#include "token.h"
#include "tokenarray.h"
#include "options.h"
#include "hash.h"
/* MAIN ENTRIES */
static size_t *forward_reference; /* to be filled by Malloc() */
static size_t n_forward_references;
static void make_forward_references_hash1(void);
static void make_forward_references_hash2(void);
#ifdef DB_FORW_REF
static void db_forward_references(const char *);
static void make_forward_references_hash3(void);
#endif
void
Make_Forward_References(void) {
/* Constructs the forward references table.
*/
n_forward_references = Text_Length();
forward_reference =
(size_t *)Calloc(
n_forward_references, sizeof (size_t)
);
make_forward_references_hash1();
make_forward_references_hash2();
#ifdef DB_FORW_REF
make_forward_references_hash3();
#endif
}
size_t
Forward_Reference(size_t i) {
if (i == 0 || i >= n_forward_references) {
fatal("internal error, bad forward reference");
}
return forward_reference[i];
}
void
Free_Forward_References(void) {
Free((char *)forward_reference);
}
/* HASHING */
/*
We want a hash function whose time cost does not depend on
Min_Run_Size, which is a problem since the size of the object
we derive the hash value from IS equal to Min_Run_Size!
Therefore we base the hash function on a sample of at most
N_SAMPLES tokens from the input string; this works at least
as well in practice.
*/
#define N_SAMPLES 24
#define OPERATION ^
/* An alternative algorithm; does not seem to make any difference.
#define N_SAMPLES 23
#define OPERATION +
*/
/* Another algorithm; not yet tested
#define N_SAMPLES 24
#define OPERATION + 613 *
*/
static size_t *last_index;
static size_t hash_table_size;
static size_t sample_pos[N_SAMPLES];
/* The prime numbers of the form 4 * i + 3 for some i, all greater
than twice the previous one and smaller than 2^40 (for now). */
static const uint64_t prime[] =
{
#if 0
3,
7,
19,
43,
103,
211,
431,
863,
1747,
3499,
7019,
#endif
14051,
28111,
56239,
112507,
225023,
450067,
900139,
1800311,
3600659,
7201351,
14402743,
28805519,
57611039,
115222091,
230444239,
460888499,
921777067,
1843554151,
UINT64_C (3687108307),
UINT64_C (7374216631),
UINT64_C (14748433279),
UINT64_C (29496866579),
UINT64_C (58993733159),
UINT64_C (117987466379),
UINT64_C (235974932759),
UINT64_C (471949865531),
UINT64_C (943899731087)
};
static void
init_hash_table(void) {
size_t n;
/* find the ideal hash table size */
n = 0;
while (prime[n] < Text_Length()) {
n++;
/* this will always terminate, if prime[] is large enough */
}
/* see if we can allocate that much space, and if not, step down */
last_index = 0;
while (!last_index && n >= 0) {
hash_table_size = prime[n];
last_index = (size_t *)
TryCalloc(hash_table_size, sizeof (size_t));
n--;
}
if (!last_index) {
fatal("out of memory");
}
/* find sample positions */
for (n = 0; n < N_SAMPLES; n++) {
/* straigh-line approximation; uninituitive as usual */
sample_pos[n] = (
(2 * n * (Min_Run_Size - 1) + (N_SAMPLES - 1))
/ (2 * (N_SAMPLES - 1))
);
}
}
static size_t hash1(const Token *);
static void
make_forward_references_hash1(void) {
int n;
init_hash_table();
/* set up the forward references using the last_index hash table */
for (n = 0; n < Number_Of_Texts; n++) {
struct text *txt = &Text[n];
size_t j;
for ( /* all pos'ns in txt except the last Min_Run_Size-1 */
j = txt->tx_start; /* >= 1 */
j + Min_Run_Size - 1 < txt->tx_limit;
j++
) {
if (May_Be_Start_Of_Run(Token_Array[j])) {
size_t h = hash1(&Token_Array[j]);
if (last_index[h]) {
forward_reference[last_index[h]] = j;
}
last_index[h] = j;
}
}
}
Free((char *)last_index);
#ifdef DB_FORW_REF
db_forward_references("first hashing");
#endif /* DB_FORW_REF */
}
static size_t
hash1(const Token *p) {
/* hash1(p) returns the hash code of Min_Run_Size
tokens starting at p; caller guarantees that there
are at least Min_Run_Size tokens.
*/
uint64_t h_val;
int n;
h_val = 0;
for (n = 0; n < N_SAMPLES; n++) {
h_val = (h_val << 2) OPERATION Token2int(p[sample_pos[n]]);
if (h_val & (1ULL<<63)) {
h_val ^= (1ULL<<63|1);
}
}
return (size_t) (h_val % hash_table_size);
}
static size_t hash2(const Token *);
static void
make_forward_references_hash2(void) {
size_t i;
/* Clean out spurious matches, by a quadratic algorithm.
Note that we do not want to eliminate overlapping
sequences in this stage, since we might be removing the
wrong copy.
*/
for (i = 0; i+Min_Run_Size < Text_Length(); i++) {
size_t j = i;
size_t h2 = hash2(&Token_Array[i]);
/* Find the first token sequence in the chain
with same secondary hash code.
*/
while ( /* there is still a forward reference */
(j = forward_reference[j])
&& /* its hash code does not match */
hash2(&Token_Array[j]) != h2
) {
/* continue searching */
}
/* short-circuit forward reference to it, or to zero */
forward_reference[i] = j;
}
#ifdef DB_FORW_REF
db_forward_references("second hashing");
#endif /* DB_FORW_REF */
}
static size_t
hash2(const Token *p) {
/* A simple-minded hashing for the secondary sweep;
sample first and last token (on 64-bit systems, also
two tokens from the middle).
*/
uint64_t h_val = 0;
h_val ^= ((size_t)Token2int(p[sample_pos[(N_SAMPLES - 1) / 4]])) << 48;
h_val ^= ((size_t)Token2int(p[sample_pos[(N_SAMPLES - 1) * 3 / 4 ]])) << 32;
h_val ^= ((size_t)Token2int(p[sample_pos[N_SAMPLES - 1]])) << 16;
h_val ^= (size_t)Token2int(p[sample_pos[0]]);
return (size_t) h_val;
}
#ifdef DB_FORW_REF
static int hash3(const Token *, const Token *);
static void
db_print_forward_references(void) {
size_t n;
size_t *printed_at =
(size_t *)Calloc(Text_Length(), sizeof (size_t));
for (n = 1; n < Text_Length(); n++) {
size_t fw = forward_reference[n];
if (fw == 0) continue;
fprintf(Debug_File, "FWR[%zd]:", n);
if (printed_at[fw]) {
fprintf(Debug_File, " see %zd", printed_at[fw]);
}
else {
while (fw) {
fprintf(Debug_File, " %zd", fw);
printed_at[fw] = n;
fw = forward_reference[fw];
}
}
fprintf(Debug_File, "\n");
}
Free((void *)printed_at);
}
static void
make_forward_references_hash3(void) {
size_t i;
/* Do a third hash to check up on the previous two */
/* This time we use a genuine compare */
for (i = 0; i+Min_Run_Size < Text_Length(); i++) {
size_t j = i;
while ( /* there is still a forward reference */
(j = forward_reference[j])
&& /* its hash code does not match */
!hash3(&Token_Array[i], &Token_Array[j])
) {
/* continue searching */
}
/* short-circuit forward reference to it, or to zero */
forward_reference[i] = j;
}
db_forward_references("third hashing");
}
static int
hash3(const Token *p, const Token *q) {
/* a full comparison for the tertiary sweep */
size_t n;
for (n = 0; n < Min_Run_Size; n++) {
if (!Token_EQ(p[n], q[n])) return 0;
}
return 1;
}
static size_t
db_frw_chain(size_t n, char *crossed_out) {
size_t chain_len = 0;
size_t fw;
for (fw = n; fw; fw = forward_reference[fw]) {
if (crossed_out[fw]) {
fprintf(Debug_File,
">>>> error: forward references cross <<<<\n"
);
}
chain_len++;
crossed_out[fw] = 1;
}
fprintf(Debug_File, "chain_start = %zd, chain_len = %zd\n", n, chain_len);
/* if there are two values, the chain length is still 1 */
return chain_len - 1;
}
static void
db_forward_references(const char *msg) {
size_t n;
size_t n_frw_chains = 0; /* number of forward ref. chains */
size_t tot_frwc_len = 0;
char *crossed_out;
fprintf(Debug_File, "\n\n**** DB_FORWARD_REFERENCES, %s ****\n", msg);
fprintf(Debug_File, "hash_table_size = %zu\n", hash_table_size);
fprintf(Debug_File, "N_SAMPLES = %d\n", N_SAMPLES);
crossed_out = (char *)Calloc(Text_Length(), sizeof (char));
/* Each forward_reference[n] starts in principle a new
chain, and these chains never touch each other.
We check this property by marking the positions in each
chain in an array; if we meet a marked entry while
following a chain, it must have been on an earlier chain
and we have an error.
We also determine the lengths of the chains, for statistics.
*/
if (forward_reference[0]) {
fprintf(Debug_File,
">>>> forward_reference[0] is not zero <<<<\n"
);
}
for (n = 1; n < Text_Length(); n++) {
if (forward_reference[n] && !crossed_out[n]) {
/* start of a new chain */
n_frw_chains++;
tot_frwc_len += db_frw_chain(n, crossed_out);
}
}
db_print_forward_references();
Free((char *)crossed_out);
fprintf(Debug_File,
"text length = %zu, # forward chains = %zd, total frw chain length = %zd\n\n",
Text_Length(), n_frw_chains, tot_frwc_len
);
}
#endif /* DB_FORW_REF */