Skip to content

Commit d6c2ae2

Browse files
j-mendez and claude committed
fix: validate 598/599 as genuine errors before ConnectError, bump to v2.47.3
Only classify 598/599 as ConnectError when the page is empty or has an error_status set. If a real server spoofs 598/599 with content, it falls through to ServerError instead. Added 8 tests covering all status branches.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 71a51d0 commit d6c2ae2

10 files changed

Lines changed: 130 additions & 37 deletions

File tree

Cargo.lock

Lines changed: 15 additions & 15 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

spider/Cargo.toml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider"
3-
version = "2.47.2"
3+
version = "2.47.3"
44
authors = ["j-mendez <jeff@spider.cloud>"]
55
description = "A web crawler and scraper, building blocks for data curation workloads."
66
repository = "https://github.com/spider-rs/spider"
@@ -120,11 +120,11 @@ features = ["serde", "headers", "dynamic-versions"]
120120

121121
[dependencies.spider_agent_types]
122122
path = "../spider_agent_types"
123-
version = "2.47.2"
123+
version = "2.47.3"
124124

125125
[dependencies.spider_agent]
126126
path = "../spider_agent"
127-
version = "2.47.2"
127+
version = "2.47.3"
128128
optional = true
129129
default-features = false
130130

spider/src/utils/mod.rs

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -2688,17 +2688,19 @@ pub async fn fetch_page_html_chrome_base(
26882688
let next_event = async { listener.next().await };
26892689

26902690
let event = match chunk_idle {
2691-
Some(timeout) => match tokio::time::timeout(timeout, next_event).await {
2692-
Ok(Some(event)) => event,
2693-
Ok(None) => break,
2694-
Err(_elapsed) => {
2695-
log::warn!(
2691+
Some(timeout) => {
2692+
match tokio::time::timeout(timeout, next_event).await {
2693+
Ok(Some(event)) => event,
2694+
Ok(None) => break,
2695+
Err(_elapsed) => {
2696+
log::warn!(
26962697
"chrome network idle timeout ({timeout:?}), force-stopping page"
26972698
);
2698-
let _ = page_clone.force_stop_all().await;
2699-
break;
2699+
let _ = page_clone.force_stop_all().await;
2700+
break;
2701+
}
27002702
}
2701-
},
2703+
}
27022704
None => match next_event.await {
27032705
Some(event) => event,
27042706
None => break,

spider/src/website.rs

Lines changed: 93 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2461,8 +2461,9 @@ impl Website {
24612461
self.status = CrawlStatus::Blocked;
24622462
} else if page.status_code == reqwest::StatusCode::TOO_MANY_REQUESTS {
24632463
self.status = CrawlStatus::RateLimited;
2464-
} else if page.status_code == *UNKNOWN_STATUS_ERROR
2465-
|| page.status_code == *CHROME_UNKNOWN_STATUS_ERROR
2464+
} else if (page.status_code == *UNKNOWN_STATUS_ERROR
2465+
|| page.status_code == *CHROME_UNKNOWN_STATUS_ERROR)
2466+
&& (page.is_empty() || page.error_status.is_some())
24662467
{
24672468
self.status = CrawlStatus::ConnectError;
24682469
} else if page.status_code.is_server_error() {
@@ -10990,6 +10991,96 @@ mod tests {
1099010991
"Should not build rotator with no proxies"
1099110992
);
1099210993
}
10994+
10995+
fn make_page(status: reqwest::StatusCode) -> crate::page::Page {
10996+
let mut page = crate::page::Page::default();
10997+
page.status_code = status;
10998+
page
10999+
}
11000+
11001+
#[test]
11002+
fn test_crawl_status_599_empty_page_is_connect_error() {
11003+
let mut website = crate::website::Website::new("http://example.com");
11004+
let page = make_page(*crate::page::UNKNOWN_STATUS_ERROR);
11005+
let links = hashbrown::HashSet::new();
11006+
website.set_crawl_initial_status(&page, &links);
11007+
assert_eq!(*website.get_status(), super::CrawlStatus::ConnectError);
11008+
}
11009+
11010+
#[test]
11011+
fn test_crawl_status_598_empty_page_is_connect_error() {
11012+
let mut website = crate::website::Website::new("http://example.com");
11013+
let page = make_page(*crate::page::CHROME_UNKNOWN_STATUS_ERROR);
11014+
let links = hashbrown::HashSet::new();
11015+
website.set_crawl_initial_status(&page, &links);
11016+
assert_eq!(*website.get_status(), super::CrawlStatus::ConnectError);
11017+
}
11018+
11019+
#[test]
11020+
fn test_crawl_status_598_with_error_status_is_connect_error() {
11021+
let mut website = crate::website::Website::new("http://example.com");
11022+
let mut page = make_page(*crate::page::CHROME_UNKNOWN_STATUS_ERROR);
11023+
page.html = Some(b"<html><body>some content</body></html>".to_vec().into());
11024+
page.error_status = Some("Invalid proxy configuration.".into());
11025+
let links = hashbrown::HashSet::new();
11026+
website.set_crawl_initial_status(&page, &links);
11027+
assert_eq!(*website.get_status(), super::CrawlStatus::ConnectError);
11028+
}
11029+
11030+
#[test]
11031+
fn test_crawl_status_598_with_content_no_error_is_server_error() {
11032+
let mut website = crate::website::Website::new("http://example.com");
11033+
let mut page = make_page(*crate::page::CHROME_UNKNOWN_STATUS_ERROR);
11034+
page.html = Some(
11035+
b"<html><body>real server content</body></html>"
11036+
.to_vec()
11037+
.into(),
11038+
);
11039+
let links = hashbrown::HashSet::new();
11040+
website.set_crawl_initial_status(&page, &links);
11041+
assert_eq!(*website.get_status(), super::CrawlStatus::ServerError);
11042+
}
11043+
11044+
#[test]
11045+
fn test_crawl_status_599_with_content_no_error_is_server_error() {
11046+
let mut website = crate::website::Website::new("http://example.com");
11047+
let mut page = make_page(*crate::page::UNKNOWN_STATUS_ERROR);
11048+
page.html = Some(
11049+
b"<html><body>real server content</body></html>"
11050+
.to_vec()
11051+
.into(),
11052+
);
11053+
let links = hashbrown::HashSet::new();
11054+
website.set_crawl_initial_status(&page, &links);
11055+
assert_eq!(*website.get_status(), super::CrawlStatus::ServerError);
11056+
}
11057+
11058+
#[test]
11059+
fn test_crawl_status_500_is_server_error() {
11060+
let mut website = crate::website::Website::new("http://example.com");
11061+
let page = make_page(reqwest::StatusCode::INTERNAL_SERVER_ERROR);
11062+
let links = hashbrown::HashSet::new();
11063+
website.set_crawl_initial_status(&page, &links);
11064+
assert_eq!(*website.get_status(), super::CrawlStatus::ServerError);
11065+
}
11066+
11067+
#[test]
11068+
fn test_crawl_status_429_is_rate_limited() {
11069+
let mut website = crate::website::Website::new("http://example.com");
11070+
let page = make_page(reqwest::StatusCode::TOO_MANY_REQUESTS);
11071+
let links = hashbrown::HashSet::new();
11072+
website.set_crawl_initial_status(&page, &links);
11073+
assert_eq!(*website.get_status(), super::CrawlStatus::RateLimited);
11074+
}
11075+
11076+
#[test]
11077+
fn test_crawl_status_empty_page_200_is_empty() {
11078+
let mut website = crate::website::Website::new("http://example.com");
11079+
let page = make_page(reqwest::StatusCode::OK);
11080+
let links = hashbrown::HashSet::new();
11081+
website.set_crawl_initial_status(&page, &links);
11082+
assert_eq!(*website.get_status(), super::CrawlStatus::Empty);
11083+
}
1099311084
}
1099411085

1099511086
#[tokio::test]

spider_agent/Cargo.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider_agent"
3-
version = "2.47.2"
3+
version = "2.47.3"
44
authors = ["j-mendez <jeff@spider.cloud>"]
55
description = "A concurrent-safe multimodal agent for web automation and research."
66
repository = "https://github.com/spider-rs/spider"
@@ -28,8 +28,8 @@ parking_lot = "0.12"
2828
base64 = "0.22"
2929

3030
# Extracted types and HTML processing
31-
spider_agent_types = { version = "2.47.2", path = "../spider_agent_types" }
32-
spider_agent_html = { version = "2.47.2", path = "../spider_agent_html" }
31+
spider_agent_types = { version = "2.47.3", path = "../spider_agent_types" }
32+
spider_agent_html = { version = "2.47.3", path = "../spider_agent_html" }
3333

3434
# HTML processing (still needed for engine internals)
3535
lol_html = "2"

spider_agent_html/Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider_agent_html"
3-
version = "2.47.2"
3+
version = "2.47.3"
44
authors = ["j-mendez <jeff@spider.cloud>"]
55
description = "HTML processing utilities for spider_agent — cleaning, content analysis, and diffing."
66
repository = "https://github.com/spider-rs/spider"
@@ -24,7 +24,7 @@ serde = { version = "1", features = ["derive"] }
2424
serde_json = "1"
2525

2626
# Types from our types crate
27-
spider_agent_types = { version = "2.47.2", path = "../spider_agent_types" }
27+
spider_agent_types = { version = "2.47.3", path = "../spider_agent_types" }
2828

2929
[dev-dependencies]
3030
serde_json = "1"

spider_agent_types/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider_agent_types"
3-
version = "2.47.2"
3+
version = "2.47.3"
44
authors = ["j-mendez <jeff@spider.cloud>"]
55
description = "Pure data types and constants for spider_agent automation. Zero heavy dependencies."
66
repository = "https://github.com/spider-rs/spider"

spider_cli/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider_cli"
3-
version = "2.47.2"
3+
version = "2.47.3"
44
authors = ["j-mendez <jeff@spider.cloud>"]
55
description = "The fastest web crawler CLI written in Rust."
66
repository = "https://github.com/spider-rs/spider"

spider_utils/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider_utils"
3-
version = "2.47.2"
3+
version = "2.47.3"
44
authors = ["j-mendez <jeff@spider.cloud>"]
55
description = "Utilities to use for Spider Web Crawler."
66
repository = "https://github.com/spider-rs/spider"

spider_worker/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider_worker"
3-
version = "2.47.2"
3+
version = "2.47.3"
44
authors = ["j-mendez <jeff@spider.cloud>"]
55
description = "The fastest web crawler as a worker or proxy."
66
repository = "https://github.com/spider-rs/spider"

0 commit comments

Comments
 (0)