Skip to content

Commit 5a91990

Browse files
j-mendez and claude committed
kernel: fix spider remote-crawl hang (chrome_intercept) + chromey default
The spider::Website driver hung >60s on every remote-browser crawl because the gottem-adapters-chrome spider dep lacked the `chrome_intercept` feature. That feature gates the CDP Network/Fetch domain setup spider uses to detect page completion; without it compiled in, crawl() falls back to the 120s request_timeout. Verified against a live Kernel session (examples/kernel_spider_probe): with the feature, example.com renders in ~1.4s (both intercept on and off); without it, it times out. The runtime with_chrome_intercept() call is not required — compiling the feature is. Also flip the Kernel driver default to raw chromey (drive_cdp) — lean, fast, direct CDP — with spider::Website now an opt-in per request via `provider_options.kernel.spider = true` (control flag stripped from the Kernel create payload). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 9c997df commit 5a91990

3 files changed

Lines changed: 86 additions & 28 deletions

File tree

crates/gottem-adapters-chrome/Cargo.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,10 @@ gottem-core = { path = "../gottem-core", version = "0.1.16" }
2424
# `Ws(Url(TlsFeatureNotEnabled))`. Cargo unifies chromey's features across the
2525
# build, so this also covers spider::Website's remote-connection path.
2626
chromey = { version = "2", features = ["chrome_tls_connection"] }
27-
spider = { workspace = true, features = ["basic", "chrome"] }
27+
# `chrome_intercept` sets up the CDP Network/Fetch domain spider uses to detect
28+
# page completion; without it a remote-browser crawl waits on the full
29+
# request_timeout. Matches spider's own remote-chrome examples.
30+
spider = { workspace = true, features = ["basic", "chrome", "chrome_intercept"] }
2831
async-trait = { workspace = true }
2932
bytes = { workspace = true }
3033
url = { workspace = true }
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
//! Local probe: drive a remote (Kernel) browser via spider::Website over CDP and
2+
//! time it, to validate the chrome_intercept fix for the slow-crawl hang.
3+
//!
4+
//! cargo run -p gottem-adapters-chrome --example kernel_spider_probe -- <cdp_ws_url> [url]
5+
//!
6+
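//! Env: INTERCEPT=0 skips the runtime with_chrome_intercept() call (for A/B timing; the slow path itself only reproduces when spider's `chrome_intercept` feature is not compiled in).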
7+
//! RUST_LOG=spider=info for spider's own logs.
8+
9+
use spider::features::chrome_common::RequestInterceptConfiguration;
10+
use spider::tokio;
11+
use spider::website::Website;
12+
use std::time::Instant;
13+
14+
#[tokio::main]
15+
async fn main() {
16+
let args: Vec<String> = std::env::args().collect();
17+
let cdp = args.get(1).expect("usage: <cdp_ws_url> [url]").clone();
18+
let url = args
19+
.get(2)
20+
.cloned()
21+
.unwrap_or_else(|| "https://example.com".to_string());
22+
let intercept = std::env::var("INTERCEPT").map(|v| v != "0").unwrap_or(true);
23+
eprintln!("intercept={intercept} url={url}");
24+
25+
let mut b = Website::new(&url);
26+
b.with_limit(1)
27+
.with_stealth(false)
28+
.with_fingerprint(false)
29+
.with_chrome_connection(Some(cdp));
30+
if intercept {
31+
b.with_chrome_intercept(RequestInterceptConfiguration::new(true));
32+
}
33+
let mut website = b.build().expect("build");
34+
35+
let mut rx = website.subscribe(16);
36+
let t = Instant::now();
37+
website.crawl().await;
38+
let elapsed = t.elapsed();
39+
40+
let mut n = 0;
41+
let mut bytes = 0usize;
42+
let mut status = 0u16;
43+
loop {
44+
match rx.try_recv() {
45+
Ok(p) => {
46+
n += 1;
47+
bytes = p.get_html_bytes_u8().len();
48+
status = p.status_code.as_u16();
49+
}
50+
Err(spider::tokio::sync::broadcast::error::TryRecvError::Lagged(_)) => continue,
51+
Err(_) => break,
52+
}
53+
}
54+
eprintln!("DONE elapsed={elapsed:?} pages={n} status={status} html_bytes={bytes}");
55+
}

crates/gottem-adapters-chrome/src/lib.rs

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
//!
1313
//! [`KernelCdpAdapter`] covers Kernel ([onkernel.com](https://kernel.sh)), which
1414
//! has no *static* endpoint: it `POST`s `/browsers` to mint a per-session
15-
//! `cdp_ws_url`, drives it (via `spider::Website` by default, or raw chromey
16-
//! [`drive_cdp`] when `provider_options.kernel.spider_disabled` is set), then
15+
//! `cdp_ws_url`, drives it (raw chromey [`drive_cdp`] by default, or
16+
//! `spider::Website` when `provider_options.kernel.spider = true`), then
1717
//! `DELETE`s the session (Kernel bills per running second). It reports
1818
//! [`AdapterKind::Custom`]`("kernel_cdp")`.
1919
//!
@@ -286,10 +286,11 @@ const KERNEL_DEFAULT_CREATE_URL: &str = "https://api.onkernel.com/browsers";
286286
/// *running* second, so releasing promptly is a cost guarantee, not just hygiene.
287287
///
288288
/// Two drivers, switchable per request:
289-
/// - **default** — [`spider::Website`] via [`scrape_via_spider`]: crawl() +
290-
/// subscription (streams pages to the user), stealth/fingerprint off.
291-
/// - **`provider_options.kernel.spider_disabled = true`** — raw chromey
292-
/// ([`drive_cdp`]): direct CDP, more control for bespoke automation.
289+
/// - **default** — raw chromey ([`drive_cdp`]): direct CDP, lean and fast.
290+
/// - **`provider_options.kernel.spider = true`** — [`spider::Website`] via
291+
/// [`scrape_via_spider`]: crawl() + subscription (streams pages to the user),
292+
/// stealth/fingerprint off. Requires spider's `chrome_intercept` feature for
293+
/// page-completion detection over the remote browser.
293294
///
294295
/// Browser config is tunable per request via `provider_options.kernel` (e.g.
295296
/// `{ "headless": false, "gpu": true, "proxy_id": "...", "viewport": {...} }`),
@@ -397,12 +398,13 @@ impl Adapter for KernelCdpAdapter {
397398
)
398399
.await?;
399400

400-
// ---- 2. Drive the remote browser. Default path is spider::Website —
401-
// its crawl() + subscription is how rendered pages stream back, and it
402-
// carries the quality steps. Opt out per request with
403-
// `provider_options.kernel.spider_disabled = true` to drive raw chromey
404-
// directly (more control for bespoke automation; no spider wrapper).
405-
let driven = if kernel_spider_disabled(req) {
401+
// ---- 2. Drive the remote browser. Default is raw chromey ([`drive_cdp`])
402+
// — direct CDP, lean and fast. Opt into the spider::Website path
403+
// (crawl() + subscription, quality steps) per request with
404+
// `provider_options.kernel.spider = true`.
405+
let driven = if kernel_use_spider(req) {
406+
scrape_via_spider(&session.cdp_ws_url, req.url.as_str(), route.timeout(), cancel).await
407+
} else {
406408
drive_cdp(
407409
&session.cdp_ws_url,
408410
req.url.as_str(),
@@ -411,8 +413,6 @@ impl Adapter for KernelCdpAdapter {
411413
cancel,
412414
)
413415
.await
414-
} else {
415-
scrape_via_spider(&session.cdp_ws_url, req.url.as_str(), route.timeout(), cancel).await
416416
};
417417

418418
// ---- 3. On success, read Kernel's own usage meter for actual-cost
@@ -479,7 +479,7 @@ fn kernel_cost_dollars(uptime_ms: u64, headless: bool, gpu: bool) -> f64 {
479479

480480
/// Keys the adapter consumes from `provider_options.kernel` as its own control
481481
/// flags — they must NOT be forwarded to Kernel's create-browser API.
482-
const KERNEL_CONTROL_KEYS: &[&str] = &["spider_disabled"];
482+
const KERNEL_CONTROL_KEYS: &[&str] = &["spider"];
483483

484484
/// Default create body (cheap + stealthy), with `provider_options.kernel`
485485
/// layered over the top so callers can flip `headless`, request `gpu`, pin a
@@ -501,12 +501,12 @@ fn build_kernel_create_body(req: &ScrapeRequest) -> serde_json::Value {
501501
body
502502
}
503503

504-
/// Whether this request opted out of the spider driver (`provider_options.kernel
505-
/// .spider_disabled = true`) to drive raw chromey directly instead.
506-
fn kernel_spider_disabled(req: &ScrapeRequest) -> bool {
504+
/// Whether this request opted into the spider::Website driver
505+
/// (`provider_options.kernel.spider = true`). Default is raw chromey.
506+
fn kernel_use_spider(req: &ScrapeRequest) -> bool {
507507
req.provider_options
508508
.get("kernel")
509-
.and_then(|o| o.get("spider_disabled"))
509+
.and_then(|o| o.get("spider"))
510510
.and_then(|v| v.as_bool())
511511
.unwrap_or(false)
512512
}
@@ -803,25 +803,25 @@ mod tests {
803803
let mut req = ScrapeRequest::get(Url::parse("https://example.com/").unwrap());
804804
req.provider_options.insert(
805805
"kernel".to_string(),
806-
serde_json::json!({ "spider_disabled": true, "headless": false }),
806+
serde_json::json!({ "spider": true, "headless": false }),
807807
);
808808
let body = build_kernel_create_body(&req);
809809
assert_eq!(body["headless"], serde_json::json!(false)); // real Kernel field passes
810-
assert!(body.get("spider_disabled").is_none()); // control flag stripped
810+
assert!(body.get("spider").is_none()); // control flag stripped
811811
}
812812

813813
#[test]
814-
fn kernel_spider_disabled_reads_flag() {
814+
fn kernel_use_spider_reads_flag() {
815815
let base = || ScrapeRequest::get(Url::parse("https://example.com/").unwrap());
816-
assert!(!kernel_spider_disabled(&base())); // absent → spider (default)
816+
assert!(!kernel_use_spider(&base())); // absent → chromey (default)
817817
let mut on = base();
818818
on.provider_options
819-
.insert("kernel".into(), serde_json::json!({ "spider_disabled": true }));
820-
assert!(kernel_spider_disabled(&on));
819+
.insert("kernel".into(), serde_json::json!({ "spider": true }));
820+
assert!(kernel_use_spider(&on));
821821
let mut off = base();
822822
off.provider_options
823-
.insert("kernel".into(), serde_json::json!({ "spider_disabled": false }));
824-
assert!(!kernel_spider_disabled(&off));
823+
.insert("kernel".into(), serde_json::json!({ "spider": false }));
824+
assert!(!kernel_use_spider(&off));
825825
}
826826

827827
#[test]

0 commit comments

Comments
 (0)