Skip to content

Commit 5f3206a

Browse files
committed
Tune the suspicious_regex linter to make it less noisy
* Allow unescaped '.' when followed by a quantifier (e.g., '.*', '.+', '.?','.{') since those are commonly intentional uses of the dot metacharacter. * Allow 'https?' patterns without flagging the '?' as suspicious since that's a common pattern for matching both 'http' and 'https'.
1 parent 9c14978 commit 5f3206a

1 file changed

Lines changed: 80 additions & 49 deletions

File tree

cloudflare_rules/src/linter/suspicious_regex.rs

Lines changed: 80 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,14 @@ fn lint(_config: &LinterConfig, ast: &FilterAst, _expr: &str) -> Vec<LintReport>
2121
let mut visitor = SuspiciousRegexVisitor { result: Vec::new() };
2222

2323
// Simple scanner helpers that detect unescaped special characters outside of character classes.
24-
fn scan_pattern(pattern: &str) -> (bool, bool, bool, bool) {
25-
// returns (has_unescaped_dot_outside_class, has_unescaped_qmark_outside_class, contains_slash_star_slash, ends_with_slash_star)
24+
// Returns (has_unescaped_literal_dot, has_unescaped_query_like, contains_slash_star_slash)
25+
fn scan_pattern(pattern: &str) -> (bool, bool, bool) {
2626
let mut escaped = false;
2727
let mut in_class = false;
2828
let mut prev: Option<char> = None;
29-
let mut has_dot = false;
30-
let mut has_q = false;
29+
let mut has_unescaped_dot_literal = false;
30+
let mut has_unescaped_query_like = false;
3131
let mut contains_slash_star_slash = false;
32-
let mut ends_with_slash_star = false;
3332

3433
let chars: Vec<char> = pattern.chars().collect();
3534
for (i, &c) in chars.iter().enumerate() {
@@ -40,6 +39,7 @@ fn lint(_config: &LinterConfig, ast: &FilterAst, _expr: &str) -> Vec<LintReport>
4039
}
4140
if c == '\\' {
4241
escaped = true;
42+
prev = Some(c);
4343
continue;
4444
}
4545
if c == '[' && !in_class {
@@ -54,39 +54,44 @@ fn lint(_config: &LinterConfig, ast: &FilterAst, _expr: &str) -> Vec<LintReport>
5454
}
5555

5656
if !in_class {
57+
let next = chars.get(i + 1).copied();
5758
if c == '.' {
58-
has_dot = true;
59+
// Treat an unescaped '.' as suspicious only when it's not immediately
60+
// followed by a quantifier (e.g., '.*', '.+', '.?','.{') since those
61+
// are commonly intentional uses of the dot metacharacter.
62+
let next_is_quant = matches!(next, Some('*' | '+' | '?' | '{'));
63+
if !next_is_quant {
64+
has_unescaped_dot_literal = true;
65+
}
5966
}
67+
6068
if c == '?' {
61-
has_q = true;
69+
// Ignore the ? if the previous character is a quantifier (e.g., '.*?', '.+?') or otherwise part of a sensible use
70+
// (? starts regex commands eg (?i) or (?:...), so we only flag it when it looks like a literal question mark that might be intended as a query separator.
71+
// Optional character classes like [a-z]? are common
72+
let prev_is_valid = matches!(prev, Some('*' | '+' | '(' | ')' | '[' | ']'));
73+
// If a '?' appears unescaped and is followed by an alphanumeric or '='/'&',
74+
// it is likely the literal query separator rather than a regex quantifier.
75+
if !prev_is_valid {
76+
has_unescaped_query_like = true;
77+
}
6278
}
79+
6380
if c == '*'
64-
&& let Some(p) = prev
65-
&& p == '/'
81+
&& let Some('/') = prev
82+
&& let Some('/') | None = next
6683
{
67-
// pattern contains "/*"; check following char for another '/'
68-
if i + 1 < chars.len() && chars[i + 1] == '/' {
69-
contains_slash_star_slash = true;
70-
}
84+
contains_slash_star_slash = true;
7185
}
7286
}
7387

7488
prev = Some(c);
7589
}
7690

77-
// ends_with_slash_star check (unescaped)
78-
if pattern.ends_with("/*") {
79-
// ensure it isn't escaped (i.e., ends with "\/*")
80-
if !pattern.ends_with("\\/*") {
81-
ends_with_slash_star = true;
82-
}
83-
}
84-
8591
(
86-
has_dot,
87-
has_q,
92+
has_unescaped_dot_literal,
93+
has_unescaped_query_like,
8894
contains_slash_star_slash,
89-
ends_with_slash_star,
9095
)
9196
}
9297

@@ -99,8 +104,7 @@ fn lint(_config: &LinterConfig, ast: &FilterAst, _expr: &str) -> Vec<LintReport>
99104
// pattern is a valid regex (we ignore parse errors and still run the heuristics).
100105
let _ = Parser::new().parse(pattern);
101106

102-
let (has_dot, has_q, contains_slash_star_slash, ends_with_slash_star) =
103-
scan_pattern(pattern);
107+
let (has_dot, has_q, contains_slash_star_slash) = scan_pattern(pattern);
104108

105109
// Determine field context when available to make smarter heuristics.
106110
let field_name = if let IdentifierExpr::Field(field) = &node.lhs.identifier {
@@ -115,7 +119,7 @@ fn lint(_config: &LinterConfig, ast: &FilterAst, _expr: &str) -> Vec<LintReport>
115119
// Path-related heuristics
116120
if let Some(fname) = &field_name {
117121
if fname == "http.request.uri.path" || fname == "raw.http.request.uri.path" {
118-
if contains_slash_star_slash || ends_with_slash_star {
122+
if contains_slash_star_slash {
119123
suspicious = true;
120124
message = "Regex uses quantifiers on path separators (e.g., `/*`); \
121125
this often indicates a wildcard intent — consider using a \
@@ -134,7 +138,8 @@ fn lint(_config: &LinterConfig, ast: &FilterAst, _expr: &str) -> Vec<LintReport>
134138
|| fname == "raw.http.request.full_uri"
135139
{
136140
// Full URI: unescaped dots and question marks are common mistakes
137-
if has_dot || has_q {
141+
// Allow matching on optional 's' in 'https' (e.g., https?://) without flagging as suspicious since that's a common pattern, but flag other unescaped '?' characters that look like they might be intended as query separators.
142+
if has_dot || (has_q && !pattern.contains("https?")) {
138143
suspicious = true;
139144
message = r"Regex contains unescaped `.` or `?` characters in a URI; escape them (e.g., `\.` and `\?`) or use a wildcard match where appropriate.".to_string();
140145
}
@@ -148,27 +153,6 @@ fn lint(_config: &LinterConfig, ast: &FilterAst, _expr: &str) -> Vec<LintReport>
148153
}
149154
}
150155

151-
// Generic heuristics (applies regardless of field)
152-
if !suspicious {
153-
// common glob-like regex: ^/foo/.*$ or /.*
154-
if pattern.contains("/.*")
155-
|| pattern.starts_with("^/") && pattern.contains(".*")
156-
{
157-
suspicious = true;
158-
message = "Regex uses `.*` to match path segments; consider using a \
159-
wildcard match or be more specific."
160-
.to_string();
161-
}
162-
}
163-
164-
if !suspicious {
165-
// question marks used literally in query parts
166-
if has_q && pattern.contains("https://") {
167-
suspicious = true;
168-
message = r"Regex contains an unescaped `?` within a URI; escape it as `\?` or use a wildcard match.".to_string();
169-
}
170-
}
171-
172156
if suspicious {
173157
self.result.push(LintReport {
174158
id: LINT_NAME.into(),
@@ -324,4 +308,51 @@ mod test {
324308
r#"http.request.uri.path matches "/foo/bar/index\.html""#,
325309
);
326310
}
311+
312+
#[test]
313+
fn test_ok_dot_star_patterns() {
314+
assert_no_lint_message(&LINTER, r#"http.request.uri.path matches "^/foo/.*""#);
315+
assert_no_lint_message(&LINTER, r#"http.request.uri.path matches "^/foo.*""#);
316+
assert_no_lint_message(&LINTER, r#"http.request.uri.path matches "^/.*/bar""#);
317+
}
318+
319+
#[test]
320+
fn test_https_optional() {
321+
assert_no_lint_message(
322+
&LINTER,
323+
r#"http.request.uri matches "https?://www\.example\.com/foo""#,
324+
);
325+
}
326+
327+
#[test]
328+
fn test_case_insensitive() {
329+
assert_no_lint_message(
330+
&LINTER,
331+
r#"http.request.uri matches "(?i)https://www\.example\.com/foo""#,
332+
);
333+
}
334+
335+
#[test]
336+
fn test_case_groups() {
337+
assert_no_lint_message(
338+
&LINTER,
339+
r#"http.request.uri matches "https://www\.example\.com/(?:foo/|bar/)index\.html""#,
340+
);
341+
assert_no_lint_message(
342+
&LINTER,
343+
r#"http.request.uri matches "https://www\.example\.com/(foo/|bar/)?index\.html""#,
344+
);
345+
assert_no_lint_message(
346+
&LINTER,
347+
r#"http.request.uri matches "https://www\.example\.com/api/v1[0-9]?/foo""#,
348+
);
349+
}
350+
351+
#[test]
352+
fn test_non_greedy_and_charclass_no_warn() {
353+
assert_no_lint_message(
354+
&LINTER,
355+
r#"http.request.uri.path matches "/foo/[abc]*?/bar""#,
356+
);
357+
}
327358
}

0 commit comments

Comments
 (0)