-
Notifications
You must be signed in to change notification settings - Fork 16
/
benchmark.php
120 lines (80 loc) · 2.52 KB
/
benchmark.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
<?php
/* This program will benchmark searching for 1,000 keywords in a 5,000 word text all at once. */
/* It compares our ahocorasick method with regex and strpos. */
require('ahocorasick.php');
require('benchmark_setup.php'); /* keywords and text */
$loops = 10;
print("Loaded " . count($needles) . " keywords to search on a text of " .
strlen($haystack) . " characters.\n");
print("\nSearching with strpos...\n");
$st = microtime(1);
for ($loop = 0; $loop < $loops; ++$loop) {
$found = array();
foreach($needles as $n) {
$k = 0;
while(($k = strpos($haystack, $n, $k)) !== FALSE) {
$found[] = array($n, $k);
++$k;
}
}
}
$et = microtime(1);
print("time: " . ($et - $st) . "\n");
$found_strpos = $found;
print("\nSearching with preg_match...\n");
//Note, this actually sucks and misses cases where one needle is a prefix or
//suffix of another.
$regex = '/' . implode('|', $needles) . '/';
$st = microtime(1);
for ($loop = 0; $loop < $loops; ++$loop) {
$found = array();
$k = 0;
while(preg_match($regex, $haystack, $m, PREG_OFFSET_CAPTURE, $k)) {
$found[] = $m[0];
$k = $m[0][1] + 1;
}
}
$et = microtime(1);
print("time: " . ($et - $st) . "\n");
//print_r($found);
print("\nSearching with preg_match_all...\n");
//Note, this actually sucks and misses cases where one needle is a prefix or
//suffix of another.
$regex = '/' . implode('|', $needles) . '/';
$st = microtime(1);
for ($loop = 0; $loop < $loops; ++$loop) {
$found = array();
$k = 0;
preg_match_all($regex, $haystack, $found, PREG_OFFSET_CAPTURE);
$found = $found[0];
}
$et = microtime(1);
print("time: " . ($et - $st) . "\n");
print("\nSearching with aho corasick...\n");
$ac = new ahocorasick();
foreach ($needles as $n) $ac->add_needle($n);
$ac->finalize();
$st = microtime(1);
for ($loop = 0; $loop < $loops; ++$loop) {
$found = array();
$found = $ac->search($haystack);
}
$et = microtime(1);
print("time: " . ($et - $st) . "\n");
//Check that the answers match.
//First sort the arrays.
$comp = function($a, $b) {return ($a[1] === $b[1]) ? ($a[0] > $b[0]) : ($a[1] > $b[1]);};
usort($found, $comp);
usort($found_strpos, $comp);
if ($found_strpos !== $found) {
print("ERROR - Aho Corasick got the wrong result.\n");
print("strpos size: " . count($found_strpos) . "\n");
print("aho corasick size: " . count($found) . "\n");
for ($i = 0; $i < count($found); ++$i) {
if ($found_strpos[$i] !== $found[$i]) {
print("Mismatch $i\n");
print_r($found_strpos[$i]);
print_r($found[$i]);
}
}
}