fix: ReDoS in CSS tokenizer STRING rule

flavorjones · flavorjones · commit 807f6ee32e67 · 2026-04-27T11:05:04.000-04:00
The STRING rule had two ambiguities that caused exponential backtracking on unterminated quoted-string input:

1. The body's negated class `[^\n\r\f"]` matched a literal `\`, overlapping with the {escape} branch. Input like `[foo="\a\a\a...` had 2**N parses for N pairs.
2. {unicode}'s `[0-9A-Fa-f]{1,6}` admitted six match lengths per escape position. Input like `\aaaaaa\aaaaaa...` had 6**N parses.

When the closing quote was missing the engine enumerated every parse before failing, so a sub-100-byte payload could hang the process indefinitely.

The fix:

- Excludes `\` from the body's negated class, so backslashes can only enter via {escape}, removing the cross-branch ambiguity.
- Wraps the body alternation in an atomic group `(?>...)*` to lock each iteration's match decision, removing the within-escape length ambiguity.
- Adds `\\?{nl}` for CSS line continuation, previously absorbed by the loose negated class.
- Drops the `(?<!\\)(?:\\{2})*` bookkeeping that existed only to recover from the original ambiguity.

Adds two performance benchmarks asserting linear parse time for both ambiguity classes.

ref: GHSA-c4rq-3m3g-8wgx
diff --git a/lib/nokogiri/css/tokenizer.rb b/lib/nokogiri/css/tokenizer.rb
@@ -1,7 +1,6 @@
-# frozen_string_literal: true
 #--
 # DO NOT MODIFY!!!!
-# This file is automatically generated by rex 1.0.7
+# This file is automatically generated by rex 1.0.8
 # from lexical definition file "lib/nokogiri/css/tokenizer.rex".
 #++
 
@@ -132,7 +131,7 @@ def _next_token
                   when (text = @ss.scan(/[\s]+/))
                      action { [:S, text] }
 
-                  when (text = @ss.scan(/("([^\n\r\f"]|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*"|'([^\n\r\f']|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*')/))
+                  when (text = @ss.scan(/("(?>[^\n\r\f"\\]|\\?(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*"|'(?>[^\n\r\f'\\]|\\?(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*')/))
                      action { [:STRING, text] }
 
                   when (text = @ss.scan(/./))
diff --git a/lib/nokogiri/css/tokenizer.rex b/lib/nokogiri/css/tokenizer.rex
@@ -16,8 +16,8 @@ macro
   name      {nmstart}{nmchar}*
   ident     -?{name}
   charref   {nmchar}+
-  string1   "([^\n\r\f"]|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*"
-  string2   '([^\n\r\f']|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*'
+  string1   "(?>[^\n\r\f"\\]|\\?{nl}|{nonascii}|{escape})*"
+  string2   '(?>[^\n\r\f'\\]|\\?{nl}|{nonascii}|{escape})*'
   string    ({string1}|{string2})
 
 rule
diff --git a/test/css/bench_tokenizer.rb b/test/css/bench_tokenizer.rb
@@ -0,0 +1,40 @@
+# frozen_string_literal: true
+
+require "helper"
+require "timeout"
+
+class TestBenchCSSTokenizer < Nokogiri::TestBenchmark
+  # GHSA-c4rq-3m3g-8wgx: the ambiguous STRING rule backtracked exponentially
+  # on unterminated input such as `[foo="\a\a\a...x` (no closing quote). Each
+  # sample parses the payload 50 times to average out per-call jitter.
+  describe "css string tokenizer (cross-branch ambiguity)" do
+    bench_range { bench_linear(10_000, 60_000, 10_000) }
+
+    bench_performance_linear("redos in STRING rule", 0.99) do |n|
+      Timeout.timeout(5) do # fail fast instead of hanging if backtracking regresses
+        payload = %([foo=") + ('\\a' * n) + "x" # n `\a` escapes, never closed by a quote
+        50.times do
+          Nokogiri::CSS.xpath_for(payload)
+        rescue Nokogiri::CSS::SyntaxError # a parse error is ignored; only elapsed time matters
+        end
+      end
+    end
+  end
+
+  # The unicode escape's `[0-9A-Fa-f]{1,6}` quantifier admits 6 different
+  # match lengths per escape position; without an atomic group these multiply
+  # into 6**N parses on unterminated `[foo="\aaaaaa\aaaaaa...x` (150 runs/sample).
+  describe "css string tokenizer (unicode escape length ambiguity)" do
+    bench_range { bench_linear(2_000, 12_000, 2_000) }
+
+    bench_performance_linear("redos in unicode escape length", 0.99) do |n|
+      Timeout.timeout(5) do # fail fast instead of hanging if backtracking regresses
+        payload = %([foo=") + ('\\aaaaaa' * n) + "x" # n six-hex-digit escapes, never closed by a quote
+        150.times do
+          Nokogiri::CSS.xpath_for(payload)
+        rescue Nokogiri::CSS::SyntaxError # a parse error is ignored; only elapsed time matters
+        end
+      end
+    end
+  end
+end