Skip to content

Commit 52feb61

Browse files
authored
fix: backtracking in CSS tokenizer rules (#3626)
**What problem is this PR intended to solve?** Addresses three regular expression backtracking / redos issues. ref: GHSA-c4rq-3m3g-8wgx **Have you included adequate test coverage?** Yes, added benchmark suite tests to assert linearity of regex performance. **Does this change affect the behavior of either the C or the Java implementations?** N/A
2 parents c1b912c + 760bde0 commit 52feb61

3 files changed

Lines changed: 72 additions & 9 deletions

File tree

lib/nokogiri/css/tokenizer.rb

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# frozen_string_literal: true
22
#--
33
# DO NOT MODIFY!!!!
4-
# This file is automatically generated by rex 1.0.7
4+
# This file is automatically generated by rex 1.0.8
55
# from lexical definition file "lib/nokogiri/css/tokenizer.rex".
66
#++
77

@@ -63,13 +63,13 @@ def _next_token
6363
when (text = @ss.scan(/has\([\s]*/))
6464
action { [:HAS, text] }
6565

66-
when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*\([\s]*/))
66+
when (text = @ss.scan(/-?(?>[_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))(?>[_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*\([\s]*/))
6767
action { [:FUNCTION, text] }
6868

69-
when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*/))
69+
when (text = @ss.scan(/-?(?>[_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))(?>[_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*/))
7070
action { [:IDENT, text] }
7171

72-
when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))+/))
72+
when (text = @ss.scan(/\#(?>[_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))+/))
7373
action { [:HASH, text] }
7474

7575
when (text = @ss.scan(/[\s]*~=[\s]*/))
@@ -132,7 +132,7 @@ def _next_token
132132
when (text = @ss.scan(/[\s]+/))
133133
action { [:S, text] }
134134

135-
when (text = @ss.scan(/("([^\n\r\f"]|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*"|'([^\n\r\f']|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*')/))
135+
when (text = @ss.scan(/("(?>[^\n\r\f"\\]|\\?(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*"|'(?>[^\n\r\f'\\]|\\?(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*')/))
136136
action { [:STRING, text] }
137137

138138
when (text = @ss.scan(/./))

lib/nokogiri/css/tokenizer.rex

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,13 @@ macro
1111
unicode \\[0-9A-Fa-f]{1,6}(\r\n|[\s])?
1212

1313
escape ({unicode}|\\[^\n\r\f0-9A-Fa-f])
14-
nmchar ([_A-Za-z0-9-]|{nonascii}|{escape})
15-
nmstart ([_A-Za-z]|{nonascii}|{escape})
14+
nmchar (?>[_A-Za-z0-9-]|{nonascii}|{escape})
15+
nmstart (?>[_A-Za-z]|{nonascii}|{escape})
1616
name {nmstart}{nmchar}*
1717
ident -?{name}
1818
charref {nmchar}+
19-
string1 "([^\n\r\f"]|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*"
20-
string2 '([^\n\r\f']|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*'
19+
string1 "(?>[^\n\r\f"\\]|\\?{nl}|{nonascii}|{escape})*"
20+
string2 '(?>[^\n\r\f'\\]|\\?{nl}|{nonascii}|{escape})*'
2121
string ({string1}|{string2})
2222
2323
rule

test/css/bench_tokenizer.rb

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# frozen_string_literal: true
2+
3+
require "helper"
4+
require "timeout"
5+
6+
class TestBenchCSSTokenizer < Nokogiri::TestBenchmark
7+
# JRuby's JIT warmup makes per-call timings too noisy for an R**2 fit;
8+
# the ReDoS property is a regex property, not an engine one, so MRI
9+
# coverage is sufficient.
10+
before { skip("benchmarks are too noisy under JRuby JIT") if Nokogiri.jruby? }
11+
12+
# GHSA-c4rq-3m3g-8wgx: ambiguous regex in the STRING rule backtracks
13+
# exponentially on unterminated `[foo="\a\a\a..."` input. Each sample
14+
# repeats the parse to average out per-call jitter.
15+
describe "css string tokenizer (cross-branch ambiguity)" do
16+
bench_range { bench_linear(10_000, 60_000, 10_000) }
17+
18+
bench_performance_linear("redos in STRING rule", 0.99) do |n|
19+
Timeout.timeout(5) do
20+
payload = %([foo=") + ('\\a' * n) + "x"
21+
50.times do
22+
Nokogiri::CSS.xpath_for(payload)
23+
rescue Nokogiri::CSS::SyntaxError
24+
end
25+
end
26+
end
27+
end
28+
29+
# The unicode escape's `[0-9A-Fa-f]{1,6}` quantifier admits 6 different
30+
# match lengths per escape position, which without an atomic group
31+
# multiplies into 6**N parses on unterminated `[foo="\aaaaaa\aaaaaa..."`.
32+
describe "css string tokenizer (unicode escape length ambiguity)" do
33+
bench_range { bench_linear(2_000, 12_000, 2_000) }
34+
35+
bench_performance_linear("redos in unicode escape length", 0.99) do |n|
36+
Timeout.timeout(5) do
37+
payload = %([foo=") + ('\\aaaaaa' * n) + "x"
38+
150.times do
39+
Nokogiri::CSS.xpath_for(payload)
40+
rescue Nokogiri::CSS::SyntaxError
41+
end
42+
end
43+
end
44+
end
45+
46+
# The function-call rule {ident}\({w} requires `(` after an identifier.
47+
# If the `(` is missing and the ident-shaped prefix contains many
48+
# `\<6-hex>` escapes, the engine backtracks through the {1,6}
49+
# ambiguity inside `{nmchar}*` for 6**N parses.
50+
describe "css ident tokenizer (function-rule failure ambiguity)" do
51+
bench_range { bench_linear(50_000, 300_000, 50_000) }
52+
53+
bench_performance_linear("redos in function rule", 0.99) do |n|
54+
Timeout.timeout(5) do
55+
payload = ('\\aaaaaa' * n) + "X"
56+
1000.times do
57+
Nokogiri::CSS.xpath_for(payload)
58+
rescue Nokogiri::CSS::SyntaxError
59+
end
60+
end
61+
end
62+
end
63+
end

0 commit comments

Comments
 (0)