-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathahocorasick.go
165 lines (152 loc) · 3.08 KB
/
ahocorasick.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
//----------------
//Func : Aho Corasick Word Match 敏感词匹配
//Author: xjh
//Date : 2018/09/13
//Note : 基于github.com/gansidui/ahocorasick
// 支持线程安全
// 支持中文(UTF8)
// 比使用正则匹配regexp有上千倍性能提升
//----------------
package ahocorasick
import (
"container/list"
)
type trieNode struct {
count int
fail *trieNode
child map[rune]*trieNode
index int
wordlen int
}
func newTrieNode() *trieNode {
return &trieNode{
count: 0,
fail: nil,
child: make(map[rune]*trieNode),
index: -1,
}
}
type ACMatcher struct {
root *trieNode
size int
}
func NewMatcher(dict []string) *ACMatcher {
m := &ACMatcher{
root: newTrieNode(),
size: 0,
}
for i := range dict {
m.insert(dict[i])
}
m.build()
return m
}
//包含敏感词位置、个数,统计所有可能项,比如"民主权"会统计成两个词
func (m *ACMatcher) Match(s string) []int {
curNode := m.root
var p *trieNode = nil
var ret []int
for _, v := range s {
for curNode.child[v] == nil && curNode != m.root {
curNode = curNode.fail
}
curNode = curNode.child[v]
if curNode == nil {
curNode = m.root
}
p = curNode
for p != m.root && p.count > 0 {
for i := 0; i < p.count; i++ {
ret = append(ret, p.index)
}
p = p.fail
}
}
return ret
}
//将s中的敏感词替换为repl,一旦匹配到敏感词立即替换,不会参与后续匹配
func (m *ACMatcher) Replace(s string, repl string) string {
curNode := m.root
var p *trieNode = nil
var ret []rune
replr := []rune(repl)
for _, v := range s {
for curNode.child[v] == nil && curNode != m.root {
curNode = curNode.fail
}
curNode = curNode.child[v]
if curNode == nil {
curNode = m.root
}
p = curNode
if p != m.root && p.count > 0 {
ret = ret[:len(ret)-p.wordlen+1]
ret = append(ret, replr...)
curNode = m.root
} else {
ret = append(ret, v)
}
}
return string(ret)
}
//是否包含敏感词,查找到任意立即返回
func (m *ACMatcher) Has(s string) bool {
curNode := m.root
var p *trieNode = nil
for _, v := range s {
for curNode.child[v] == nil && curNode != m.root {
curNode = curNode.fail
}
curNode = curNode.child[v]
if curNode == nil {
curNode = m.root
}
p = curNode
for p != m.root && p.count > 0 {
return true
}
}
return false
}
func (m *ACMatcher) Size() int {
return m.size
}
func (m *ACMatcher) build() {
ll := list.New()
ll.PushBack(m.root)
for ll.Len() > 0 {
temp := ll.Remove(ll.Front()).(*trieNode)
var p *trieNode = nil
for i, v := range temp.child {
if temp == m.root {
v.fail = m.root
} else {
p = temp.fail
for p != nil {
if p.child[i] != nil {
v.fail = p.child[i]
break
}
p = p.fail
}
if p == nil {
v.fail = m.root
}
}
ll.PushBack(v)
}
}
}
func (m *ACMatcher) insert(s string) {
curNode := m.root
for _, v := range s {
if curNode.child[v] == nil {
curNode.child[v] = newTrieNode()
}
curNode = curNode.child[v]
}
curNode.wordlen = len([]rune(s))
curNode.count++
curNode.index = m.size
m.size++
}