fix: ReDoS in CSS tokenizer ident rule

flavorjones · flavorjones · commit 9bada21a709f · 2026-04-27T11:05:04.000-04:00
A second instance of the same backtracking pattern: `{unicode}`'s `[0-9A-Fa-f]{1,6}` admits six match lengths per escape position, and `{nmchar}` appears under `*` in `{name}`. When the `{ident}\({w}` rule fails (no `(` after an identifier-shaped prefix), the engine backtracks through `{nmchar}*` for 6**n parses, where n is the number of `\<6-hex>` escapes in the prefix. Payload `\aaaaaa\aaaaaa...X` triggers it: at n=8 it takes 330ms, at n=10 it takes 11.4s. Wrap the body alternations of `{nmchar}` and `{nmstart}` in atomic groups, mirroring the prior STRING-rule fix. Each nmchar/nmstart match is locked once committed, so the outer `{nmchar}*` can release whole iterations but cannot try alternative inner consumption of the `{1,6}` hex run. Add a benchmark test asserting linear time, similar to the existing tokenizer benchmark. ref: GHSA-c4rq-3m3g-8wgx
diff --git a/lib/nokogiri/css/tokenizer.rb b/lib/nokogiri/css/tokenizer.rb
@@ -1,3 +1,4 @@
+# frozen_string_literal: true
 #--
 # DO NOT MODIFY!!!!
 # This file is automatically generated by rex 1.0.8
@@ -62,13 +63,13 @@ def _next_token
                   when (text = @ss.scan(/has\([\s]*/))
                      action { [:HAS, text] }
 
-                  when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*\([\s]*/))
+                  when (text = @ss.scan(/-?(?>[_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))(?>[_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*\([\s]*/))
                      action { [:FUNCTION, text] }
 
-                  when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*/))
+                  when (text = @ss.scan(/-?(?>[_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))(?>[_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*/))
                      action { [:IDENT, text] }
 
-                  when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))+/))
+                  when (text = @ss.scan(/\#(?>[_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))+/))
                      action { [:HASH, text] }
 
                   when (text = @ss.scan(/[\s]*~=[\s]*/))
diff --git a/lib/nokogiri/css/tokenizer.rex b/lib/nokogiri/css/tokenizer.rex
@@ -11,8 +11,8 @@ macro
   unicode   \\[0-9A-Fa-f]{1,6}(\r\n|[\s])?
 
   escape    ({unicode}|\\[^\n\r\f0-9A-Fa-f])
-  nmchar    ([_A-Za-z0-9-]|{nonascii}|{escape})
-  nmstart   ([_A-Za-z]|{nonascii}|{escape})
+  nmchar    (?>[_A-Za-z0-9-]|{nonascii}|{escape})
+  nmstart   (?>[_A-Za-z]|{nonascii}|{escape})
   name      {nmstart}{nmchar}*
   ident     -?{name}
   charref   {nmchar}+
diff --git a/test/css/bench_tokenizer.rb b/test/css/bench_tokenizer.rb
@@ -37,4 +37,22 @@ class TestBenchCSSTokenizer < Nokogiri::TestBenchmark
       end
     end
   end
+
+  # The function-call rule {ident}\({w} requires `(` after an identifier.
+  # If the `(` is missing and the ident-shaped prefix contains many
+  # `\<6-hex>` escapes, the engine backtracks through the {1,6}
+  # ambiguity inside `{nmchar}*` for 6**N parses.
+  describe "css ident tokenizer (function-rule failure ambiguity)" do
+    bench_range { bench_linear(50_000, 300_000, 50_000) }
+
+    bench_performance_linear("redos in function rule", 0.99) do |n|
+      Timeout.timeout(5) do
+        payload = ('\\aaaaaa' * n) + "X"
+        1000.times do
+          Nokogiri::CSS.xpath_for(payload)
+        rescue Nokogiri::CSS::SyntaxError
+        end
+      end
+    end
+  end
 end