From 3d0de4937e6a81c4e0bf503ee31ed37eb3f94b99 Mon Sep 17 00:00:00 2001 From: Hayley Patton Date: Sat, 24 Jun 2023 15:01:01 +1000 Subject: [PATCH 1/5] Introduce a NIHed hash table, optimised for hash consing. --- Code/DFA-construction/hash-cons.lisp | 52 ++++++++++++++++++++++++++++ Code/DFA-construction/re-types.lisp | 20 +++++------ Code/DFA-construction/type.lisp | 30 ++++++++-------- Code/one-more-re-nightmare.asd | 1 + 4 files changed, 78 insertions(+), 25 deletions(-) create mode 100644 Code/DFA-construction/hash-cons.lisp diff --git a/Code/DFA-construction/hash-cons.lisp b/Code/DFA-construction/hash-cons.lisp new file mode 100644 index 0000000..83e050d --- /dev/null +++ b/Code/DFA-construction/hash-cons.lisp @@ -0,0 +1,52 @@ +(in-package :one-more-re-nightmare) + +;;; A hash-table designed for hash consing. There are two main +;;; deficiencies in the hash tables provided by Common Lisp: +;;; 1. We cannot promise that a key is definitely new and avoid +;;; comparing keys. We may only "upsert" keys. +;;; 2. The hash function used by SBCL behaves very poorly with +;;; large substitution lists (as used by TAG-SET). + +(defconstant +buckets+ 4096) + +(defstruct (hash-cons-table (:constructor make-hash-cons-table (test hash))) + (buckets (make-array +buckets+ :initial-element '()) + :type (simple-vector #.+buckets+) + :read-only y) + (test #'equal :type function :read-only t) + (hash #'sxhash :type function :read-only t)) + +(declaim (inline %bucket)) +(defun %bucket (hc-table key) + (mod (the fixnum (funcall (hash-cons-table-hash hc-table) key)) +buckets+)) + +(defun insert-hc (hc-table key value) + (push (cons key value) + (svref (hash-cons-table-buckets hc-table) + (%bucket hc-table key))) + value) + +(defun lookup-hc (hc-table key) + (let ((pair (assoc key + (svref (hash-cons-table-buckets hc-table) (%bucket hc-table key)) + :test (hash-cons-table-test hc-table)))) + (if (null pair) + (values nil nil) + (values (cdr pair) t)))) + +(defun tag-set-hash (substitutions) + (let ((hash 0)) + (flet ((update (x) + (setf hash (logand most-positive-fixnum (+ (* x 31) hash))))) + (loop for ((n v) . source) in (first substitutions) + do (update (sxhash n)) + (update v) + (trivia:match source + ('nil (update 0)) + ('position (update 1)) + ((list n v) (update (sxhash n)) (update v))))) + hash)) + +(defmethod print-object ((h hash-cons-table) s) + (print-unreadable-object (h s :type t) + (format s "~D entries" (reduce #'+ (hash-cons-table-buckets h) :key #'length)))) diff --git a/Code/DFA-construction/re-types.lisp b/Code/DFA-construction/re-types.lisp index 4022ff5..13acacc 100644 --- a/Code/DFA-construction/re-types.lisp +++ b/Code/DFA-construction/re-types.lisp @@ -1,16 +1,16 @@ (in-package :one-more-re-nightmare) (define-types - (literal set) - (empty-string) - (repeat r min max can-empty) - (tag-set substitutions) - (alpha expression history) - (grep vector prototype) - (either r s) - (both r s) - (invert r) - (join r s)) + ((literal set)) + ((empty-string)) + ((repeat r min max can-empty)) + ((tag-set substitutions) equal tag-set-hash) + ((alpha expression history)) + ((grep vector prototype)) + ((either r s)) + ((both r s)) + ((invert r)) + ((join r s))) (define-rewrites (literal set) :printer ((literal set) diff --git a/Code/DFA-construction/type.lisp b/Code/DFA-construction/type.lisp index 26e89de..64744a9 100644 --- a/Code/DFA-construction/type.lisp +++ b/Code/DFA-construction/type.lisp @@ -2,10 +2,10 @@ (defvar *table-names* '()) -(defmacro define-hash-consing-table (name) +(defmacro define-hash-consing-table (name &optional (test 'equal) (hash 'sxhash)) `(progn (defvar ,name) - (pushnew ',name *table-names*) + (pushnew '(,name ,test ,hash) *table-names*) ',name)) (defconstant +uncomputed+ '+uncomputed+) @@ -18,7 +18,9 @@ (%has-tags-p :initform +uncomputed+ :accessor cached-has-tags-p))) (defmacro define-types (&body types) - (loop for (name . slots) in types + (loop for ((name . slots) test* hash*) in types + for test = (or test* 'equal) + for hash = (or hash* 'sxhash) collect (let ((variables (loop for slot in slots collect (gensym (symbol-name slot)))) (internal-creator (alexandria:format-symbol t "%~a" name)) (table-name (alexandria:format-symbol '#:one-more-re-nightmare @@ -32,14 +34,13 @@ ,@(loop for slot in slots for variable in variables appending `((list 'slot-value instance-name '',slot) ,variable))))) - (define-hash-consing-table ,table-name) + (define-hash-consing-table ,table-name ,test ,hash) (defun ,internal-creator ,slots - (or (gethash (list ,@slots) ,table-name) + (or (lookup-hc ,table-name (list ,@slots)) (let ((instance (make-instance ',name))) ,@(loop for slot in slots collect `(setf (slot-value instance ',slot) ,slot)) - (setf (gethash (list ,@slots) ,table-name) - instance)))))) + (insert-hc ,table-name (list ,@slots) instance)))))) into forms finally (return `(progn ,@forms)))) @@ -58,7 +59,7 @@ collect `((list ,@pattern) ,replacement)) ,@(loop for ((nil . pattern) (nil . replacement)) in hash-cons collect `((list ,@pattern) - (or (gethash (list ,@replacement) ,table-name) + (or (lookup-hc ,table-name (list ,@replacement)) (trivia.next:next)))) (_ (,internal-creator ,@slots))))))) (indent:define-indentation define-type (4 &body)) @@ -67,11 +68,10 @@ (alexandria:once-only (table key) (alexandria:with-gensyms (value present?) `(multiple-value-bind (,value ,present?) - (gethash ,key ,table) + (lookup-hc ,table ,key) (if ,present? ,value - (setf (gethash ,key ,table) - (progn ,@body))))))) + (insert-hc ,table ,key (progn ,@body))))))) (defmacro with-slot-consing ((accessor object &key (when 't)) &body body) (alexandria:once-only (object) @@ -89,11 +89,11 @@ ,value))))))) (defmacro with-hash-consing-tables (() &body body) - `(let ,(loop for name in *table-names* - collect `(,name (make-hash-table :test 'equal))) + `(let ,(loop for (name test hash) in *table-names* + collect `(,name (make-hash-cons-table #',test #',hash))) ,@body)) (defmacro clear-global-tables () "Set up global tables for testing." - `(setf ,@(loop for name in *table-names* - append `(,name (make-hash-table :test 'equal))))) + `(setf ,@(loop for (name test hash) in *table-names* + append `(,name (make-hash-cons-table #',test #',hash))))) diff --git a/Code/one-more-re-nightmare.asd b/Code/one-more-re-nightmare.asd index 462ed4c..74b62c0 100644 --- a/Code/one-more-re-nightmare.asd +++ b/Code/one-more-re-nightmare.asd @@ -11,6 +11,7 @@ (:module "DFA-construction" :components ((:file "type") (:file "sets") + (:file "hash-cons") (:file "re-types") (:file "nullable") (:file "tag-sets") From a13f319b1a5efa82328dddd0162310fd51e2fa17 Mon Sep 17 00:00:00 2001 From: Hayley Patton Date: Sat, 24 Jun 2023 15:22:00 +1000 Subject: [PATCH 2/5] Add EXCEEDED-STATE-LIMIT. --- Code/DFA-construction/make-dfa.lisp | 8 ++++++-- Code/package.lisp | 1 + Documentation/linting.scrbl | 28 ++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/Code/DFA-construction/make-dfa.lisp b/Code/DFA-construction/make-dfa.lisp index bd5b4ea..3b88233 100644 --- a/Code/DFA-construction/make-dfa.lisp +++ b/Code/DFA-construction/make-dfa.lisp @@ -78,6 +78,11 @@ (reverse result)))) (defvar *probably-bad-limit* 1000) +(define-condition exceeded-state-limit (error) + () + (:report "Made too many states - either your regular expression is too complicated, or one-more-re-nightmare is broken. +(Either way, you're not going to get this compiled any time soon.)")) + (defun make-dfa-from-expressions (expressions) (let ((states (make-hash-table)) (possibly-similar-states (make-hash-table)) @@ -94,8 +99,7 @@ (loop (when (null work-list) (return)) (when (> (hash-table-count states) *probably-bad-limit*) - (error "Made too many states - either your regular expression is too complicated, or one-more-re-nightmare is broken. -(Either way, you're not going to get this compiled any time soon.)")) + (error 'exceeded-state-limit)) (let* ((expression (pop work-list)) (state (find-state expression))) (cond diff --git a/Code/package.lisp b/Code/package.lisp index 18758a8..1f041ef 100644 --- a/Code/package.lisp +++ b/Code/package.lisp @@ -4,6 +4,7 @@ #:all-matches #:all-string-matches #:first-match #:first-string-match #:do-matches + #:exceeded-state-limit #:lint-style-warning #:not-matchable-style-warning #:matching-too-much-style-warning)) diff --git a/Documentation/linting.scrbl b/Documentation/linting.scrbl index 82ed082..514f598 100644 --- a/Documentation/linting.scrbl +++ b/Documentation/linting.scrbl @@ -205,3 +205,31 @@ always be bound to an index, and never @cl{nil}, because the first two registers designate the bounds of the entire match. } + +@subsection{"Made too many states - ..."} + +@definitions{ +@define-condition["exceeded-state-limit" "error"] +} + +@definition-section["Explanation"]{ + +The compiler generates a deterministic finite automaton, which may +(in semi-rare cases) produce a number of states exponentially +proportional to the complexity of the regular expression. Complements +and intersections may produce doubly-exponential numbers of states. + +It is also possible, but hopefully more rare, that the compiler lacks +rules to generate a finite number of states. + +} + +@definition-section["Examples"]{ + +@cl{(compile-regular-expression "1[01]{9}")} signals the error "Made too +many states - either your regular expression is too complicated, or +one-more-re-nightmare is broken. (Either way, you're not going to get +this compiled any time soon.)" In general, the regular expression +@${\mathtt{1} \cdot \left\{ \mathtt{0}, \mathtt{1} \right\}^n} requires +@${\mathcal{O}(2^n)} states. +} From 5f0cdf4a87038adcf0027721d1e51c2a7026b242 Mon Sep 17 00:00:00 2001 From: Hayley Patton Date: Sat, 24 Jun 2023 15:44:38 +1000 Subject: [PATCH 3/5] Avoid looking up tag sets in UNIQUE-TAGS --- Code/DFA-construction/tag-sets.lisp | 38 ++++++++++++++--------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/Code/DFA-construction/tag-sets.lisp b/Code/DFA-construction/tag-sets.lisp index e426aa8..bf625ac 100644 --- a/Code/DFA-construction/tag-sets.lisp +++ b/Code/DFA-construction/tag-sets.lisp @@ -90,28 +90,28 @@ (has-tags-p r)) (_ nil)))) -(defvar *allow-alpha* t) -(defun map-tags (f re) +;; Don't touch the table for UNIQUE-TAGS, since the tag set will +;; definitely not be in the table. +(defun %definitely-fresh-tag-set (set) + (let ((ts (make-instance 'tag-set))) + (setf (slot-value ts 'substitutions) set) + (insert-hc *tag-set-table* (list set) ts) + ts)) + +(defun unique-tags (re) ;; Return the same RE if we have no tags to replace. - (unless (has-tags-p re) - (return-from map-tags re)) + (unless (has-tags-p re) (return-from unique-tags re)) (trivia:match re - ((tag-set set) (tag-set (funcall f set))) - ((either r s) (either (map-tags f r) (map-tags f s))) - ((both r s) (both (map-tags f r) (map-tags f s))) - ((join r s) (join (map-tags f r) (map-tags f s))) - ((invert r) (invert (map-tags f r))) - ((repeat r min max c) (repeat (map-tags f r) min max c)) + ((tag-set set) (%definitely-fresh-tag-set (unique-assignments set))) + ((either r s) (either (unique-tags r) (unique-tags s))) + ((both r s) (both (unique-tags r) (unique-tags s))) + ((join r s) (join (unique-tags r) (unique-tags s))) + ((invert r) (invert (unique-tags r))) + ((repeat r min max c) (repeat (unique-tags r) min max c)) ((alpha r old-tags) - (unless (or *allow-alpha* (eq old-tags (empty-set))) + (unless (eq old-tags (empty-set)) (error "Can't modify tags with history")) - (alpha (map-tags f r) - (map-tags f old-tags))) + (alpha (unique-tags r) (unique-tags old-tags))) ((grep r s) - (grep (map-tags f r) - (map-tags f s))) + (grep (unique-tags r) (unique-tags s))) (_ re))) - -(defun unique-tags (re) - (let ((*allow-alpha* nil)) - (map-tags #'unique-assignments re))) From 6caed0bdcc9457abdf5fae0258573008e95567be Mon Sep 17 00:00:00 2001 From: Hayley Patton Date: Fri, 7 Jul 2023 15:54:59 +1000 Subject: [PATCH 4/5] Make it work in CCL, add best-effort :INTERPRETED --- Code/Compiler/code-generation.lisp | 2 ++ Code/DFA-construction/hash-cons.lisp | 3 ++- Tests/regrind.lisp | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Code/Compiler/code-generation.lisp b/Code/Compiler/code-generation.lisp index e21fa7b..d1150bf 100644 --- a/Code/Compiler/code-generation.lisp +++ b/Code/Compiler/code-generation.lisp @@ -56,6 +56,8 @@ (:interpreted (let ((sb-ext:*evaluator-mode* :interpret)) (eval form))) + #-sbcl + (:interpreted (eval form)) (:compiled (compile nil form)) (:literal diff --git a/Code/DFA-construction/hash-cons.lisp b/Code/DFA-construction/hash-cons.lisp index 83e050d..f5ae91b 100644 --- a/Code/DFA-construction/hash-cons.lisp +++ b/Code/DFA-construction/hash-cons.lisp @@ -7,7 +7,8 @@ ;;; 2. The hash function used by SBCL behaves very poorly with ;;; large substitution lists (as used by TAG-SET). -(defconstant +buckets+ 4096) +(eval-when (:compile-toplevel :load-toplevel :execute) + (defconstant +buckets+ 4096)) (defstruct (hash-cons-table (:constructor make-hash-cons-table (test hash))) (buckets (make-array +buckets+ :initial-element '()) diff --git a/Tests/regrind.lisp b/Tests/regrind.lisp index 4130818..50f34f6 100644 --- a/Tests/regrind.lisp +++ b/Tests/regrind.lisp @@ -44,7 +44,7 @@ (let ((success t)) (lparallel:pdotimes (i n success n) (let* ((*remaining-depth* depth) - #+sbcl (one-more-re-nightmare::*code-type* :interpreted) + (one-more-re-nightmare::*code-type* :interpreted) (re (random-re)) (haystack (random-haystack))) (handler-case From 08a15935dc8b7db40758570e6ac436f36948d80f Mon Sep 17 00:00:00 2001 From: Hayley Patton Date: Mon, 17 Jul 2023 16:45:04 +1000 Subject: [PATCH 5/5] Shrink hash consing tables. They hit the SBCL GC and mutator hard somehow. --- Code/DFA-construction/hash-cons.lisp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Code/DFA-construction/hash-cons.lisp b/Code/DFA-construction/hash-cons.lisp index f5ae91b..3a21d8d 100644 --- a/Code/DFA-construction/hash-cons.lisp +++ b/Code/DFA-construction/hash-cons.lisp @@ -8,7 +8,7 @@ ;;; large substitution lists (as used by TAG-SET). (eval-when (:compile-toplevel :load-toplevel :execute) - (defconstant +buckets+ 4096)) + (defconstant +buckets+ 256)) (defstruct (hash-cons-table (:constructor make-hash-cons-table (test hash))) (buckets (make-array +buckets+ :initial-element '())