-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwumanber.go
110 lines (100 loc) · 2.33 KB
/
wumanber.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
package wumanber
// from https://code.google.com/wu-manber-chinese
type wumanber struct {
pat_mlength int
block_n int
table_size int
// shift map[int]int
// prefix map[int][]prefix_pair
shift []int
prefix [][]struct{ hash, id int }
patterns [][]rune
}
func hash(chars []rune) (h int) {
for _, c := range chars {
h = int(c) + h<<6 + h<<16 - h
}
return 0
}
func New(patterns []string) *wumanber {
v := &wumanber{
patterns: make([][]rune, len(patterns)),
pat_mlength: 0xffffffff, // large enough
block_n: 3,
}
for i, pat := range patterns {
v.patterns[i] = []rune(pat)
len := len(v.patterns[i])
if v.pat_mlength > len {
v.pat_mlength = len
}
}
if v.block_n > v.pat_mlength {
v.block_n = v.pat_mlength
}
var primes = []int{66047, 263167, 16785407}
v.table_size = primes[2]
for _, prime := range primes {
if prime > v.pat_mlength*10*len(patterns) {
v.table_size = prime
break
}
}
v.shift = make([]int, v.table_size)
for i, _ := range v.shift {
v.shift[i] = v.pat_mlength + 1 - v.block_n
}
v.prefix = make([][]struct{ hash, id int }, v.table_size)
for id, pat := range v.patterns {
for i := v.pat_mlength; i >= v.block_n; i = i - 1 {
h := hash(pat[i-v.block_n:i]) % v.table_size
if v.shift[h] > v.pat_mlength-i {
v.shift[h] = v.pat_mlength - i
}
if i == v.pat_mlength {
ph := hash(pat[:v.block_n]) % v.table_size
v.prefix[h] = append(v.prefix[h], struct{ hash, id int }{ph, id})
}
}
}
return v
}
func (wm *wumanber) search(text []rune) (v []int) {
var index = wm.pat_mlength - 1
for index < len(text) {
h := hash(text[index-wm.block_n+1:index+1]) % wm.table_size
shift := wm.shift[h]
if shift > 0 {
index = index + shift
continue
}
phidx := index - wm.pat_mlength + 1
ph := hash(text[phidx:phidx+wm.block_n]) % wm.table_size
prefixes := wm.prefix[ph]
for _, prefix := range prefixes {
if prefix.hash != ph {
continue
}
pat := wm.patterns[prefix.id]
if rune_has_prefix(text[phidx:], pat) {
v = append(v, prefix.id)
}
}
index++
}
return
}
func (wm *wumanber) Search(text string) (v []int) {
return wm.search([]rune(text))
}
func rune_has_prefix(r []rune, prefix []rune) bool {
if len(r) < len(prefix) {
return false
}
for idx, p := range prefix {
if r[idx] != p {
return false
}
}
return true
}