forked from niven/simhashing
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsim.go
519 lines (426 loc) · 15.7 KB
/
sim.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
// This implements a datastore for simhashes (http://matpalm.com/resemblance/simhash/)
// that allows for (at one point) efficient searching of items within some Hamming Distance of a know text
// Currently we do reasonable as long as we have stored many items (~20M) and we search within a range of 8 or so.
package simhashing
import "fmt"
import "container/heap"
const bit_length = 8
const size = 1 << bit_length
const max_keys_per_node = 256
const bits_per_key = 8
// both of these assume bits_per_key equals 8
var level_chunks = [...]uint64{
0xff << 0,
0xff << 8,
0xff << 16,
0xff << 24,
0xff << 32,
0xff << 48,
0xff << 56,
}
// calculating thes is very unreadable
// just the top 8N bits representing the unmatched bits of the hash at level N
var masks = [...]uint64{
0xffffffffffffffff,
0xffffffffffffff00,
0xffffffffffff0000,
0xffffffffff000000,
0xffffffff00000000,
0xffffff0000000000,
0xffff000000000000,
0xff00000000000000,
}
// BTW: stuff is uint8 since that saves space
// so don't make bit_length > 255 :)
// for every bit pattern of 8 bits [0-255] we have a distance [0-8] and some number of patterns
// the third dimension is not constant so we'll use slices
// maybe some kind of pointer and allocate once? append() will overallocate I guess
// we could also save space here with mirroring thing but IDC
var distance_table [size][bit_length + 1][]uint8
var hamming [size][size]uint8
// setup a lookup table for all hamming distances 0-255 x 0-255
// (could make this smaller by mirroring etc, but why bother?)
func init() {
for i := 0; i < size; i++ {
for j := 0; j < size; j++ {
count := 0
for d := i ^ j; d > 0; count++ {
d &= d - 1 // clear the least significant bit set
}
hamming[i][j] = uint8(count)
// fmt.Printf("hamming_distance for %v,%v = %d\n", i, j, count)
distance_table[i][count] = append(distance_table[i][count], uint8(j))
}
}
for i := 0; i < size; i++ {
for j := 0; j < size; j++ {
}
}
//fmt.Println(distance_table)
}
// calculate HDs using a table lookup
// todo: check if this is actually faster than the one in util
// (I think it is, but not much)
func hamming_distance(a uint64, b uint64) (distance uint8) {
distance = 0
var a_chunk, b_chunk uint8
// a>0 || b>0 in this case a^b is sufficient since 0 means a=b=0 or a==b which makes their distance 0 :)
for a^b != 0 {
a_chunk = uint8((size - 1) & a)
b_chunk = uint8((size - 1) & b)
distance += hamming[a_chunk][b_chunk]
a = a >> bit_length
b = b >> bit_length
}
return
}
type SimStore struct {
values []entry
nodes map[uint8]*SimStore // all subtrees based on the first bits_per_key LSB (maybe mae this an array, maybe faster?)
level uint8 // determines which bitrange we pick to split keys into nodes
}
type entry struct {
key uint64
id int64
}
// Creates a new SimStore
func NewSimStore() *SimStore {
return &SimStore{level: 0}
}
// Inserts a new value in the store
func (s *SimStore) Insert(text string, id int64) {
s.insert(entry{key: SimHash(text), id: id})
}
// inserts a new value in the store, doesn't rehash etc
func (s *SimStore) insert(item entry) {
if len(s.nodes) > 0 {
// get the byte for this level
b := uint8((level_chunks[s.level] & item.key) >> (s.level * bits_per_key)) // this gets you the Nth byte
// fmt.Printf("Node insert: level %d, getting byte: 0b%064b & 0b%064b = 0b%08b (%d)\n", s.level, level_chunks[ s.level ], item.key, b, b)
_, exists := s.nodes[b]
if !exists {
s.nodes[b] = &SimStore{level: s.level + 1}
}
s.nodes[b].insert(item)
} else {
s.values = append(s.values, item)
if len(s.values) > max_keys_per_node { // different constant here would be better I think
s.split()
}
}
}
// go ever every key and put it in a node based on the value of its Nth byte
func (s *SimStore) split() {
s.nodes = make(map[uint8]*SimStore, size)
for _, item := range s.values {
b := uint8((level_chunks[s.level] & item.key) >> (s.level * bits_per_key)) // this gets you the Nth byte
_, exists := s.nodes[b]
if !exists {
s.nodes[b] = &SimStore{level: s.level + 1}
}
// don't bother with Insert(), we are splitting so we'll always be adding to the keys at this point
s.nodes[b].values = append(s.nodes[b].values, item)
}
// we don't need our values anymore
s.values = s.values[0:0]
}
// Returns a tree of nodes and number of keys per node
func (s *SimStore) String() string {
return s.pretty("")
}
func (s *SimStore) pretty(indent string) string {
out := ""
if len(s.nodes) > 0 {
out += fmt.Sprintf("%slevel % 2d\n", indent, s.level)
for index, node := range s.nodes {
out += fmt.Sprintf("%s%03d: %s", indent, index, node.pretty(indent+" "))
}
} else {
return fmt.Sprintf("%skeys [%d/%d]\n", indent, len(s.values), size)
}
return out
}
// return the number of keys and nodes in the store
func (s *SimStore) Stats() (keys, nodes int) {
for _, subtree := range s.nodes {
k, n := subtree.Stats()
keys += k
nodes += n
}
nodes += len(s.nodes)
keys += len(s.values)
return
}
// returns true if target is present in the store
func (s *SimStore) Contains(text string) (present bool, index int64) {
return s.contains(SimHash(text))
}
// returns true if target is present in the store
func (s *SimStore) contains(target uint64) (present bool, index int64) {
if len(s.nodes) > 0 {
b := uint8((level_chunks[s.level] & target) >> (s.level * bits_per_key)) // this gets you the Nth byte
subtree, exists := s.nodes[b]
if exists {
return subtree.contains(target)
} else {
return false, -1
}
} else {
// check every key
for _, item := range s.values {
if item.key == target {
return true, item.id
}
}
}
return false, -1
}
// returns all the hashes with a Hamming Distance of distance or less
// (less than or equal to make searching for 0 more natural)
// Checks every single key (more of a debuf thing I guess)
func (s *SimStore) FindScanAll(target uint64, distance uint8) (found []uint64) {
found = make([]uint64, 0)
if len(s.nodes) > 0 {
for _, subtree := range s.nodes {
found = append(found, subtree.FindScanAll(target, distance)...)
}
} else {
for _, item := range s.values {
if hamming_distance(item.key, target) <= distance {
found = append(found, item.key)
}
}
}
return
}
// returns all the hashes with a Hamming Distance of distance or less
// (less than or equal to make searching for 0 more natural)
// returns the matches found as well as the number of keys and nodes checked
func (s *SimStore) Find(text string, distance uint8) (found []int64, keys_checked int, nodes_checked int) {
return s.find(SimHash(text), distance)
}
// returns all the hashes with a Hamming Distance of distance or less
// (less than or equal to make searching for 0 more natural)
// returns the matches found as well as the number of keys and nodes checked
func (s *SimStore) find(target uint64, distance uint8) (found []int64, keys_checked int, nodes_checked int) {
found = make([]int64, 0)
b := int((level_chunks[s.level] & target) >> (s.level * bits_per_key)) // this gets you the Nth byte
// fmt.Printf("Target 0b%064b, distance %02d, byte 0b%08b [%d]\n", target, distance, b, b)
if len(s.nodes) > 0 {
// for all bytes that are within distance (with a max of 8) we check all nodes
// since hamming_distance is additive
// example: looking for 101011 with distance 2
// we take the LSBs 11, and find everything that is within distance 2:
// (00, 10, 01, 11) and recurse:
// node[00].Find( 101011, 0 ) (already 'spent' distance 2)
// node[10].Find( 101011, 1)
// node[01].Find( 101011, 1)
// node[11].Find( 101011, 2) (distance for this subrange was 0, 2 left to 'spend')
// fmt.Println(distance_table[b])
end := min(8, distance)
// fmt.Printf("Searching from 0 to %03d\n", end)
for i := uint8(0); i <= end; i++ { // check everything withing distance range
// fmt.Printf("Distance %d\n", i)
for _, d := range distance_table[b][i] { // lookup which bytes are that distance from us
subtree, exists := s.nodes[d] // check in those subtrees, if they exist
// fmt.Printf("Checking byte 0b%08b - node exists: %v\n", d, exists)
if exists {
// recurse, but the distance gets smaller
f, k, n := subtree.find(target, distance-i)
keys_checked += k
nodes_checked += n
found = append(found, f...)
}
}
}
nodes_checked += len(s.nodes)
} else {
// we need the part of the hash that has not been matched yet, so the (64 - bits_per_key*(level+1)) MSBs
// eg to get the top 12 bits we do 1<<12 (0b1000000000000), -1 (0b0111111111111), then shifted to the MSBs
// ehr, so let's just use a lookup ;)
//mask := ()(1 << (64 - bits_per_key*(s.level+1))) -1) << (64-s.level*bits_per_key)
mask := masks[s.level]
masked_target := target & mask
for _, item := range s.values {
if hamming_distance(item.key&mask, masked_target) <= distance {
found = append(found, item.id)
}
}
keys_checked += len(s.values)
}
return
}
// Find the closest thing matching the input
func (s *SimStore) FindClosest(text string) int64 {
target := SimHash(text)
fmt.Printf("FC 0b%064b\n", target)
// we're just going to use the target and then flip every bit in turn
// and call contains() a lot (which is reasonably efficient)
// in this 'tried' map we keep track of things we already tried
// if we have very few items in the store, we'll end up wasting huge amounts
// of time calling contains() for things that won't be found.
// Yeah, so after trying that I quickly realized we're wasting huge amounts of space
// storing all the keys we're trying, and also generating lots of dupe keys
// That is a truly awful method.
// So instead, I'll start by searching any nodes that have HD0 for the byte of our level
// This is basically the same as contains(), but we'll try increasing byte distances
// What happens if we get something like (8 bite example, 2 bit splits)
// target: 00 00 00 11
// searching depth first: 00 00 00 00, hey, distance 2, pretty close
// however 10 00 00 11 was distance 1 but we never checked
// but if we search breadth-first we end up searching everything :(
// Maybe using a stack with (distance_so_far, subtree) sorted by least dist
// and then expanding that?
// A traditional approach to searching if you will :)
// instead of sorting we should of course use a heap
// ok, so let's start by checking keys if we have it (just return the one with the smallest distance?)
// but what if other search directions find better matches? We need to also check all the frontiers
// with HD < HD_of_first_key_found to see if we can do better.
// first of all, do we even have nodes, bro?
if len(s.nodes) == 0 {
_, closest := find_closest_in_keys(&s.values, target)
return closest
}
// the hard case is much much harder than the simple one unfortunately
sh := &SearchHeap{}
heap.Init(sh)
// first, stick all our nodes in
b := uint8((level_chunks[s.level] & target) >> (s.level * bits_per_key)) // this gets you the Nth byte
for prefix, subtree := range s.nodes {
item := &Distance{
hamming_distance: hamming[b][prefix],
subtree: subtree,
}
heap.Push(sh, item)
}
// then expand the shortest distance until we hit a key
var best_key uint64
var closest int64
paths_tried := 0
// TODO: the above might be better off as an entry type
for sh.Len() > 0 {
shortest := heap.Pop(sh).(*Distance)
paths_tried++
fmt.Printf("Checking distance %03d (level: %v)\n", shortest.hamming_distance, shortest.subtree.level)
if len(shortest.subtree.nodes) == 0 {
fmt.Printf("This node has keys\n")
best_key, closest = find_closest_in_keys(&shortest.subtree.values, target)
break
}
// add expanded subtrees to heap
b := uint8((level_chunks[shortest.subtree.level] & target) >> (shortest.subtree.level * bits_per_key)) // this gets you the Nth byte
for prefix, subtree := range shortest.subtree.nodes {
// fmt.Printf("Expanding distance %d with %d\n", shortest.hamming_distance, hamming[b][prefix])
item := &Distance{
hamming_distance: shortest.hamming_distance + hamming[b][prefix], // here we add the distance, since we're going down a level
subtree: subtree,
}
heap.Push(sh, item)
}
}
fmt.Printf("First key (upper bound): 0b%064b (id %v)\n", best_key, closest)
// expand remaining nodes that have a distance < that of the first key, they might have better results
// while the rest can't have
upper_bound := hamming_distance(target, best_key)
// if we found an exact match, we can skip everything else and just return that
if upper_bound == 0 {
return closest
}
fmt.Printf("Checking remaining nodes, upper bound is %d\n", upper_bound)
for sh.Len() > 0 {
shortest := heap.Pop(sh).(*Distance)
paths_tried++
// we're done if no items remain with a total HD of less than/equal the key we have
if upper_bound <= shortest.hamming_distance {
fmt.Printf("Every possible other path will result in larger distance, bailing out (%d paths left)\n", sh.Len())
break
}
// now expand this one, until we find keys
if len(shortest.subtree.nodes) == 0 {
possible_closer_key, possible_closer_id := find_closest_in_keys(&shortest.subtree.values, target)
possible_distance := hamming_distance(target, possible_closer_key)
// woot, improvement
if possible_distance < upper_bound {
fmt.Printf("Got an improvement from %d to %d\n", upper_bound, possible_distance)
upper_bound = possible_distance
best_key = possible_closer_key
closest = possible_closer_id
}
}
// just expand nodes (if we have any)
b := uint8((level_chunks[shortest.subtree.level] & target) >> (shortest.subtree.level * bits_per_key)) // this gets you the Nth byte
// fmt.Printf("Expanding distance %d \n", shortest.hamming_distance )
for prefix, subtree := range shortest.subtree.nodes {
new_distance := shortest.hamming_distance + hamming[b][prefix]
// no point in adding nodes that never could lead to an improvement
if new_distance >= upper_bound {
continue
}
item := &Distance{
hamming_distance: new_distance, // here we add the distance, since we're going down a level
subtree: subtree,
}
heap.Push(sh, item)
}
}
fmt.Printf("Checked %d paths\n", paths_tried)
// so if we have any IDs with value 0, we're returning bad things
return closest
}
func find_closest_in_keys(v *[]entry, target uint64) (key uint64, closest int64) {
// just check all the keys.
distance := uint8(255) // any real one will be less
for _, item := range *v {
dist := hamming_distance(item.key, target)
if dist < distance {
distance = dist
closest = item.id
key = item.key
}
}
return
}
// Here is our heap of HDs and SimStore nodes
// These names are awful
type Distance struct {
hamming_distance uint8 // this is always 0-256
subtree *SimStore // the node we're referring to
// below is copied from the docs, kind ugly stuff
// The index is needed by update and is maintained by the heap.Interface methods.
index int // The index of the item in the heap.
}
// this is the thing that implements the heap
// (copied from the example, no clue how the Push/Pop work)
type SearchHeap []*Distance
func (sh SearchHeap) Len() int { return len(sh) }
func (sh SearchHeap) Less(i, j int) bool {
// We want Pop to give us the lowest hamming_distance
return sh[i].hamming_distance < sh[j].hamming_distance
}
func (sh SearchHeap) Swap(i, j int) {
sh[i], sh[j] = sh[j], sh[i]
sh[i].index = i
sh[j].index = j
}
func (sh *SearchHeap) Push(x interface{}) {
n := len(*sh)
item := x.(*Distance)
item.index = n
*sh = append(*sh, item)
}
func (sh *SearchHeap) Pop() interface{} {
old := *sh
n := len(old)
item := old[n-1]
item.index = -1 // for safety
*sh = old[0 : n-1]
return item
}
// end of heap stuff
// so math.Min want float64s... and the casting is ugly
func min(a uint8, b uint8) uint8 {
if a > b {
return b
}
return a
}