-
Notifications
You must be signed in to change notification settings - Fork 206
Expand file tree
/
Copy pathchrome_nxdomain_test.rs
More file actions
109 lines (101 loc) · 4.16 KB
/
chrome_nxdomain_test.rs
File metadata and controls
109 lines (101 loc) · 4.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
//! cargo run --example chrome_nxdomain_test --features chrome
//!
//! Verifies the chrome-path DNS hedge (v2.51.187 timer arm + v2.51.188
//! CDP loadingFailed arm) on a confirmed-NXDOMAIN host. Prints elapsed
//! time, status_code, and should_retry for two back-to-back calls so
//! the v2.51.186 cache effect is visible on the second.
//!
//! Expected outcome with the hedge active against real chrome:
//! * First call: status_code = 525 (DNS_RESOLVE_ERROR),
//! elapsed ≈ chrome launch + (1-3s hedge tick).
//! * Second call: status_code = 525, elapsed ≈ chrome launch only
//! (cache hits at `Page::new_base` so the navigation
//! is short-circuited at sub-µs cost).
//!
//! ## Configuration
//!
//! * `TARGET_URL` — host to test. Default: `https://kingfishelectric.com/`.
//! * `CHROME_WS_URL` — remote chrome endpoint (HTTP `/json/version` URL
//! or `ws://` URL). When set, `Website::with_chrome_connection` is
//! called so the test runs against a real chrome instance. Without
//! it the test falls back to local chrome (or spider's HTTP fallback
//! if no local chrome binary is available).
//! * `REQUEST_TIMEOUT_SECS` — per-page timeout. Default: 30s.
use spider::features::chrome_common::RequestInterceptConfiguration;
use spider::tokio;
use spider::website::Website;
use std::time::Duration;
/// Reads the run configuration from the environment, echoes it, then performs
/// the same single-page scrape twice and prints the outcome of each run.
///
/// Environment variables read here:
/// * `TARGET_URL` — falls back to `https://kingfishelectric.com/`.
/// * `CHROME_WS_URL` — optional; forwarded to `run_once` when present.
/// * `REQUEST_TIMEOUT_SECS` — falls back to 30 when unset or not a valid `u64`.
#[tokio::main]
async fn main() {
    let url = match std::env::var("TARGET_URL") {
        Ok(value) => value,
        Err(_) => "https://kingfishelectric.com/".to_string(),
    };
    let chrome_ws = std::env::var("CHROME_WS_URL").ok();
    // Both "variable missing" and "not a number" collapse to the 30s default.
    let request_timeout_secs = match std::env::var("REQUEST_TIMEOUT_SECS") {
        Ok(raw) => raw.parse::<u64>().unwrap_or(30),
        Err(_) => 30,
    };

    // Echo the effective configuration before doing any work.
    println!("[chrome_nxdomain_test] target={url}");
    if let Some(ws) = &chrome_ws {
        println!("[chrome_nxdomain_test] remote chrome connection: {ws}");
    } else {
        println!(
            "[chrome_nxdomain_test] no CHROME_WS_URL set — using local chrome auto-launch (if available)"
        );
    }
    println!("[chrome_nxdomain_test] request_timeout={request_timeout_secs}s");
    println!(
        "[chrome_nxdomain_test] first call should hit hedge arm; second should hit v2.51.186 cache"
    );

    let remote = chrome_ws.as_deref();

    // Run #1 goes through the full chrome path. Per the notes at the top of
    // this file, the DNS hedge is expected to fire here (CDP loadingFailed
    // arm from v2.51.188 or timer arm from v2.51.187) and fill the cache.
    let (first_elapsed, first_site) = run_once(&url, remote, request_timeout_secs).await;
    print_pages("first", first_elapsed, &first_site);

    // Run #2 repeats the identical request; it is expected to be answered
    // from the cached NXDOMAIN entry (chrome_nxdomain_shortcircuit) rather
    // than navigating again.
    let (second_elapsed, second_site) = run_once(&url, remote, request_timeout_secs).await;
    print_pages("second", second_elapsed, &second_site);
}
/// Builds a one-page crawl for `url`, runs a single `scrape()`, and times it.
///
/// # Arguments
/// * `url` — the address handed to `Website::new`.
/// * `chrome_ws` — when `Some`, passed to `with_chrome_connection` so the run
///   uses that chrome endpoint; when `None`, no connection URL is set.
/// * `request_timeout_secs` — stored as `configuration.request_timeout`.
///
/// # Returns
/// The wall-clock time spent inside `scrape()` only (builder setup is not
/// included) together with the `Website`, so the caller can inspect its pages.
///
/// # Panics
/// Panics if `Website::build` reports an error. The message names the
/// offending `url` and the `TARGET_URL` variable instead of dumping the
/// builder's error payload.
async fn run_once(
    url: &str,
    chrome_ws: Option<&str>,
    request_timeout_secs: u64,
) -> (Duration, Website) {
    let mut website = Website::new(url)
        .with_limit(1)
        .with_respect_robots_txt(false)
        .with_chrome_intercept(RequestInterceptConfiguration::new(true))
        .build()
        // `url` is user-supplied (TARGET_URL), so a build failure is an input
        // problem: report which value was rejected rather than a bare unwrap.
        .unwrap_or_else(|_| {
            panic!(
                "[chrome_nxdomain_test] failed to build a Website for {url:?}; check TARGET_URL"
            )
        });

    if let Some(ws) = chrome_ws {
        website.with_chrome_connection(Some(ws.to_string()));
    }
    website.configuration.request_timeout = Some(Duration::from_secs(request_timeout_secs));

    // Start the clock only now so that configuration cost is not measured.
    let start = std::time::Instant::now();
    website.scrape().await;
    let elapsed = start.elapsed();

    (elapsed, website)
}
/// Prints one summary line per page collected by `w`, or a single
/// "no pages returned" line when the page list is absent or empty.
///
/// * `label` — tag printed at the start of every line (e.g. which run this is).
/// * `elapsed` — duration of the run, shown in seconds with three decimals.
/// * `w` — the website whose pages are read via `get_pages()`.
fn print_pages(label: &str, elapsed: Duration, w: &Website) {
    let secs = elapsed.as_secs_f32();

    // Treat "no list" and "empty list" the same way.
    let collected = w.get_pages().filter(|ps| !ps.is_empty());

    if let Some(list) = collected {
        for page in list.iter() {
            println!(
                "[{}][{:.3}s] {} -> status={} html_len={} should_retry={}",
                label,
                secs,
                page.get_url(),
                page.status_code,
                page.get_html_bytes_u8().len(),
                page.should_retry,
            );
        }
    } else {
        println!(
            "[{}][{:.3}s] no pages returned (NXDOMAIN classified pre-page-list)",
            label, secs
        );
    }
}