Skip to content

Commit 93816a8

Browse files
j-mendez and claude committed
fix: lazy anti-bot body check in smart mode, exhaustive pattern tests, bump to v2.47.5
Short-circuit `detect_anti_bot_from_body` using Rust's lazy `||` evaluation so that it only runs when `rerender` and `script_src` are both false. Added unit tests covering every `AC_BODY_SCAN` (8 patterns) and `AC_URL_SCAN` (16 patterns) index to prevent mapping regressions.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 9c08b93 commit 93816a8

10 files changed

Lines changed: 155 additions & 50 deletions

File tree

Cargo.lock

Lines changed: 15 additions & 15 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

spider/Cargo.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider"
3-
version = "2.47.4"
3+
version = "2.47.5"
44
authors = ["j-mendez <jeff@spider.cloud>"]
55
description = "A web crawler and scraper, building blocks for data curation workloads."
66
repository = "https://github.com/spider-rs/spider"
@@ -120,11 +120,11 @@ features = ["serde", "headers", "dynamic-versions"]
120120

121121
[dependencies.spider_agent_types]
122122
path = "../spider_agent_types"
123-
version = "2.47.4"
123+
version = "2.47.5"
124124

125125
[dependencies.spider_agent]
126126
path = "../spider_agent"
127-
version = "2.47.4"
127+
version = "2.47.5"
128128
optional = true
129129
default-features = false
130130

spider/src/page.rs

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3608,14 +3608,10 @@ impl Page {
36083608
let _ = rewriter.end();
36093609
}
36103610

3611-
// Anti-bot body detection as additional upgrade signal
3612-
let anti_bot_upgrade =
3613-
crate::utils::detect_anti_bot_from_body(&html_resource.as_bytes().to_vec())
3614-
.is_some();
3615-
36163611
let should_upgrade = rerender.load(Ordering::Relaxed)
36173612
|| script_src.load(Ordering::Relaxed)
3618-
|| anti_bot_upgrade;
3613+
// Anti-bot body detection as fallback upgrade signal (lazy — skipped when already upgrading)
3614+
|| crate::utils::detect_anti_bot_from_body(&html_resource.as_bytes().to_vec()).is_some();
36193615
if should_upgrade {
36203616
if let Some(browser_controller) = browser
36213617
.get_or_init(|| {
@@ -4025,14 +4021,10 @@ impl Page {
40254021
let _ = rewriter.end();
40264022
}
40274023

4028-
// Anti-bot body detection as additional upgrade signal
4029-
let anti_bot_upgrade =
4030-
crate::utils::detect_anti_bot_from_body(&html_resource.as_bytes().to_vec())
4031-
.is_some();
4032-
40334024
let should_upgrade = rerender.load(Ordering::Relaxed)
40344025
|| script_src.load(Ordering::Relaxed)
4035-
|| anti_bot_upgrade;
4026+
// Anti-bot body detection as fallback upgrade signal (lazy — skipped when already upgrading)
4027+
|| crate::utils::detect_anti_bot_from_body(&html_resource.as_bytes().to_vec()).is_some();
40364028
if should_upgrade {
40374029
if let Some(browser_controller) = browser
40384030
.get_or_init(|| {

spider/src/utils/mod.rs

Lines changed: 124 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6676,30 +6676,143 @@ mod tests {
66766676
// Too large - returns None
66776677
let large_body = vec![0u8; 40_000];
66786678
assert!(detect_anti_bot_from_body(&large_body).is_none());
6679-
// Normal page
6679+
// Normal page - no match
66806680
let normal = b"<html><body>Hello world</body></html>".to_vec();
66816681
assert!(detect_anti_bot_from_body(&normal).is_none());
6682-
// Alibaba TMD - _____tmd_____
6683-
let tmd = br#"<script>window.location.replace("https://example.com/_____tmd_____/punish?x5secdata=abc");</script>"#.to_vec();
6682+
6683+
// Pattern 0: cf-error-code → Cloudflare
6684+
assert_eq!(
6685+
detect_anti_bot_from_body(&b"<span class=\"cf-error-code\">1020</span>".to_vec()),
6686+
Some(AntiBotTech::Cloudflare)
6687+
);
6688+
// Pattern 1: Access to this page has been denied → Cloudflare
6689+
assert_eq!(
6690+
detect_anti_bot_from_body(
6691+
&b"<h1>Access to this page has been denied</h1>".to_vec()
6692+
),
6693+
Some(AntiBotTech::Cloudflare)
6694+
);
6695+
// Pattern 2: DataDome
6696+
assert_eq!(
6697+
detect_anti_bot_from_body(&b"<script src=\"https://js.DataDome.co/tags.js\">".to_vec()),
6698+
Some(AntiBotTech::DataDome)
6699+
);
6700+
// Pattern 3: perimeterx → PerimeterX
6701+
assert_eq!(
6702+
detect_anti_bot_from_body(&b"<script>window._pxAppId='perimeterx';</script>".to_vec()),
6703+
Some(AntiBotTech::PerimeterX)
6704+
);
6705+
// Pattern 4: funcaptcha → ArkoseLabs
6706+
assert_eq!(
6707+
detect_anti_bot_from_body(
6708+
&b"<iframe src=\"https://client-api.arkoselabs.com/funcaptcha\">".to_vec()
6709+
),
6710+
Some(AntiBotTech::ArkoseLabs)
6711+
);
6712+
// Pattern 5: Incapsula → Imperva
66846713
assert_eq!(
6685-
detect_anti_bot_from_body(&tmd),
6714+
detect_anti_bot_from_body(
6715+
&b"Request unsuccessful. Incapsula incident ID: 123".to_vec()
6716+
),
6717+
Some(AntiBotTech::Imperva)
6718+
);
6719+
// Pattern 6: _____tmd_____ → AlibabaTMD
6720+
assert_eq!(
6721+
detect_anti_bot_from_body(
6722+
&br#"<script>window.location.replace("https://example.com/_____tmd_____/punish?x5secdata=abc");</script>"#.to_vec()
6723+
),
66866724
Some(AntiBotTech::AlibabaTMD)
66876725
);
6688-
// Alibaba TMD - x5secdata in body
6689-
let x5sec = br#"<script>sessionStorage.x5referer=window.location.href;window.location.replace("https://example.com/punish?x5secdata=xyz&x5step=1");</script>"#.to_vec();
6726+
// Pattern 7: x5secdata → AlibabaTMD
66906727
assert_eq!(
6691-
detect_anti_bot_from_body(&x5sec),
6728+
detect_anti_bot_from_body(
6729+
&br#"<script>sessionStorage.x5referer=window.location.href;window.location.replace("https://example.com/punish?x5secdata=xyz&x5step=1");</script>"#.to_vec()
6730+
),
66926731
Some(AntiBotTech::AlibabaTMD)
66936732
);
66946733
}
66956734

66966735
#[test]
66976736
fn test_detect_antibot_from_url() {
6698-
assert!(
6699-
detect_antibot_from_url("https://example.com/cdn-cgi/challenge-platform").is_some()
6700-
);
6737+
// No match
67016738
assert!(detect_antibot_from_url("https://example.com/page").is_none());
6702-
// Alibaba TMD URL pattern
6739+
6740+
// Pattern 0: /cdn-cgi/challenge-platform → Cloudflare
6741+
assert_eq!(
6742+
detect_antibot_from_url("https://example.com/cdn-cgi/challenge-platform/h/b"),
6743+
Some(AntiBotTech::Cloudflare)
6744+
);
6745+
// Pattern 1: datadome.co → DataDome
6746+
assert_eq!(
6747+
detect_antibot_from_url("https://api.datadome.co/validate"),
6748+
Some(AntiBotTech::DataDome)
6749+
);
6750+
// Pattern 2: dd-api.io → DataDome
6751+
assert_eq!(
6752+
detect_antibot_from_url("https://dd-api.io/js/v1"),
6753+
Some(AntiBotTech::DataDome)
6754+
);
6755+
// Pattern 3: perimeterx.net → PerimeterX
6756+
assert_eq!(
6757+
detect_antibot_from_url("https://client.perimeterx.net/main.min.js"),
6758+
Some(AntiBotTech::PerimeterX)
6759+
);
6760+
// Pattern 4: px-captcha → PerimeterX
6761+
assert_eq!(
6762+
detect_antibot_from_url("https://example.com/px-captcha"),
6763+
Some(AntiBotTech::PerimeterX)
6764+
);
6765+
// Pattern 5: arkoselabs.com → ArkoseLabs
6766+
assert_eq!(
6767+
detect_antibot_from_url("https://client-api.arkoselabs.com/fc/gt2/"),
6768+
Some(AntiBotTech::ArkoseLabs)
6769+
);
6770+
// Pattern 6: funcaptcha → ArkoseLabs
6771+
assert_eq!(
6772+
detect_antibot_from_url("https://example.com/funcaptcha/verify"),
6773+
Some(AntiBotTech::ArkoseLabs)
6774+
);
6775+
// Pattern 7: kasada.io → Kasada
6776+
assert_eq!(
6777+
detect_antibot_from_url("https://ips.kasada.io/149/script"),
6778+
Some(AntiBotTech::Kasada)
6779+
);
6780+
// Pattern 8: fingerprint.com → FingerprintJS
6781+
assert_eq!(
6782+
detect_antibot_from_url("https://api.fingerprint.com/v3"),
6783+
Some(AntiBotTech::FingerprintJS)
6784+
);
6785+
// Pattern 9: fpjs.io → FingerprintJS
6786+
assert_eq!(
6787+
detect_antibot_from_url("https://fpjs.io/agent"),
6788+
Some(AntiBotTech::FingerprintJS)
6789+
);
6790+
// Pattern 10: incapsula → Imperva
6791+
assert_eq!(
6792+
detect_antibot_from_url("https://example.com/incapsula/resource"),
6793+
Some(AntiBotTech::Imperva)
6794+
);
6795+
// Pattern 11: imperva → Imperva
6796+
assert_eq!(
6797+
detect_antibot_from_url("https://example.com/imperva/block"),
6798+
Some(AntiBotTech::Imperva)
6799+
);
6800+
// Pattern 12: radwarebotmanager → RadwareBotManager
6801+
assert_eq!(
6802+
detect_antibot_from_url("https://example.com/radwarebotmanager/api"),
6803+
Some(AntiBotTech::RadwareBotManager)
6804+
);
6805+
// Pattern 13: reblaze.com → Reblaze
6806+
assert_eq!(
6807+
detect_antibot_from_url("https://reblaze.com/check"),
6808+
Some(AntiBotTech::Reblaze)
6809+
);
6810+
// Pattern 14: cheq.ai → CHEQ
6811+
assert_eq!(
6812+
detect_antibot_from_url("https://api.cheq.ai/verify"),
6813+
Some(AntiBotTech::CHEQ)
6814+
);
6815+
// Pattern 15: _____tmd_____/punish → AlibabaTMD
67036816
assert_eq!(
67046817
detect_antibot_from_url(
67056818
"https://www.miravia.es/p/i123/_____tmd_____/punish?x5secdata=abc"

spider_agent/Cargo.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider_agent"
3-
version = "2.47.4"
3+
version = "2.47.5"
44
authors = ["j-mendez <jeff@spider.cloud>"]
55
description = "A concurrent-safe multimodal agent for web automation and research."
66
repository = "https://github.com/spider-rs/spider"
@@ -28,8 +28,8 @@ parking_lot = "0.12"
2828
base64 = "0.22"
2929

3030
# Extracted types and HTML processing
31-
spider_agent_types = { version = "2.47.4", path = "../spider_agent_types" }
32-
spider_agent_html = { version = "2.47.4", path = "../spider_agent_html" }
31+
spider_agent_types = { version = "2.47.5", path = "../spider_agent_types" }
32+
spider_agent_html = { version = "2.47.5", path = "../spider_agent_html" }
3333

3434
# HTML processing (still needed for engine internals)
3535
lol_html = "2"

spider_agent_html/Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider_agent_html"
3-
version = "2.47.4"
3+
version = "2.47.5"
44
authors = ["j-mendez <jeff@spider.cloud>"]
55
description = "HTML processing utilities for spider_agent — cleaning, content analysis, and diffing."
66
repository = "https://github.com/spider-rs/spider"
@@ -24,7 +24,7 @@ serde = { version = "1", features = ["derive"] }
2424
serde_json = "1"
2525

2626
# Types from our types crate
27-
spider_agent_types = { version = "2.47.4", path = "../spider_agent_types" }
27+
spider_agent_types = { version = "2.47.5", path = "../spider_agent_types" }
2828

2929
[dev-dependencies]
3030
serde_json = "1"

spider_agent_types/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider_agent_types"
3-
version = "2.47.4"
3+
version = "2.47.5"
44
authors = ["j-mendez <jeff@spider.cloud>"]
55
description = "Pure data types and constants for spider_agent automation. Zero heavy dependencies."
66
repository = "https://github.com/spider-rs/spider"

spider_cli/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider_cli"
3-
version = "2.47.4"
3+
version = "2.47.5"
44
authors = ["j-mendez <jeff@spider.cloud>"]
55
description = "The fastest web crawler CLI written in Rust."
66
repository = "https://github.com/spider-rs/spider"

spider_utils/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider_utils"
3-
version = "2.47.4"
3+
version = "2.47.5"
44
authors = ["j-mendez <jeff@spider.cloud>"]
55
description = "Utilities to use for Spider Web Crawler."
66
repository = "https://github.com/spider-rs/spider"

0 commit comments

Comments
 (0)