-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbintrie-builder.lisp
167 lines (148 loc) · 5.2 KB
/
bintrie-builder.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
(defpackage dawg.bintrie-builder
(:use :common-lisp :dawg.global)
(:export build-from-file
build-from-list
collect-children
node-label
node-terminal?
node-sibling-total
node-child
element-count))
(in-package :dawg.bintrie-builder)
(package-alias :dawg.octet-stream :stream)
(package-alias :dict-0.2.0 :dict)
;;;;;;;;;;;;;;;
;;; declamation
(declaim #.*fastest*
(inline make-node collect-children calc-child-total calc-sibling-total
element-count
node=))
;;;;;;;;
;;; node
(defstruct node
(label 0 :type octet)
(terminal? nil :type boolean)
(child nil :type (or null node))
(sibling nil :type (or null node))
(child-total 0 :type positive-fixnum) ; amount of child side nodes
(sibling-total 0 :type positive-fixnum) ; amount of sibling side nodes
(hash -1 :type fixnum))
;;;;;;;;;;;;;;;;;;;;;;
;;; auxiliary function
(defun calc-child-total (node)
(with-slots (child) (the node node)
(if (null child)
0
(the positive-fixnum
(+ (if (node-terminal? child) 1 0)
(node-child-total child) (node-sibling-total child))))))
(defun calc-sibling-total (node)
(with-slots (sibling) (the node node)
(if (null sibling)
0
(the positive-fixnum
(+ (if (node-terminal? sibling) 1 0)
(node-child-total sibling) (node-sibling-total sibling))))))
;;;;;;;;;;;;;;;;;
;;; hash function
(defun node= (n1 n2)
(and (eq (node-child n1) (node-child n2))
(eq (node-sibling n1) (node-sibling n2))
(= (node-label n1) (node-label n2))
(eq (node-terminal? n1) (node-terminal? n2))))
(defun sxhash-node (node)
(if (null node)
#.(sxhash nil)
(with-slots (hash child-total sibling-total) (the node node)
(when (= -1 hash)
(setf hash (logxor (sxhash (node-label node))
(sxhash (node-terminal? node))
(fixnumize (* (sxhash-node (node-child node)) 7))
(fixnumize (* (sxhash-node (node-sibling node)) 13))))
(setf child-total (calc-child-total node)
sibling-total (calc-sibling-total node)))
hash)))
(dict:define-test node-test sxhash-node node=)
;;;;;;;;;;;;;;;;;;
;;; build function
(defun share (node memo)
(if (null node)
nil
(or (dict:get node memo)
(progn
(setf (node-child node) (share (node-child node) memo)
(node-sibling node) (share (node-sibling node) memo))
(dict:get node memo))
(setf (dict:get node memo) node))))
(defun push-child (in parent)
(if (stream:eos? in)
(setf (node-terminal? parent) t)
(let ((new-node (make-node :label (stream:read in))))
(shiftf (node-sibling new-node) (node-child parent) new-node)
(push-child in new-node))))
(defun insert (in parent memo)
(let ((node (node-child parent)))
(if (or (null node)
(stream:eos? in)
(/= (stream:peek in) (node-label node)))
(progn
(setf (node-child parent) (share node memo))
(push-child in parent))
(insert (stream:eat in) node memo))))
(defun build-impl (key-generator show-progress)
(loop WITH trie = (make-node)
WITH memo = (dict:make :test 'node-test :rehash-threshold 0.75)
FOR num fixnum FROM 0
FOR key = (funcall key-generator)
WHILE key
DO
(when (and show-progress (zerop (mod num 100000)))
(format t "~&; ~A~%" num))
(let ((in (stream:make key)))
(declare (dynamic-extent in))
(insert in trie memo))
FINALLY
(return (share trie memo))))
(defun build-from-list (keyset &key show-progress)
(when show-progress
(format t "~&; build trie list (size ~A):~%" (length keyset)))
(build-impl (lambda () (prog1 (car keyset)
(setf keyset (cdr keyset))))
show-progress))
(defun build-from-file (filepath &key show-progress)
(when show-progress
(format t "~&; build trie from ~A:~%" filepath))
(with-open-file (in filepath)
(build-impl (lambda () (read-line in nil nil))
show-progress)))
;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; other external function
(defun element-count (node)
(with-slots (terminal? child-total) (the node node)
(the fixnum (+ (if terminal? 1 0) child-total))))
(defun collect-children (node)
(loop WITH acc = '()
FOR child = (node-child node)
THEN (node-sibling child)
WHILE child
DO
(push child acc)
FINALLY
(return acc)))
;;;;;;;;;;;;;
;;; for debug
(defun member? (key trie)
(declare #.*interface*
(simple-characters key)
(node trie))
(let ((in (stream:make key)))
(declare (dynamic-extent in))
(nlet recur ((in in) (node (node-child trie)) (parent trie))
(cond ((stream:eos? in) (node-terminal? parent))
((null node) nil)
((= (stream:peek in) (node-label node))
(recur (stream:eat in) (node-child node) node))
((< (stream:peek in) (node-label node))
(recur in (node-sibling node) parent))))))
(package-alias :dawg.octet-stream)
(package-alias :dict-0.2.0)