Skip to content

Commit 9fd01ee

Browse files
j-mendez and claude committed
fix(spider): remote cache fallback for Chrome mode when cache_mem enabled
get_cached_url_base only checked the local CACACHE_MANAGER (in-memory, per-process). When chrome_remote_cache was also enabled, its remote fallback was dead code due to a feature-gate conflict (cache_mem takes priority). It now falls back to hybrid_cache_server on a local miss, with a 3s timeout, matching the chrome_remote_cache-only behavior. Bump to v2.47.56. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 09fca7e commit 9fd01ee

3 files changed

Lines changed: 78 additions & 6 deletions

File tree

Cargo.lock

Lines changed: 5 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

spider/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider"
3-
version = "2.47.55"
3+
version = "2.47.56"
44
authors = ["j-mendez <jeff@spider.cloud>"]
55
description = "A web crawler and scraper, building blocks for data curation workloads."
66
repository = "https://github.com/spider-rs/spider"

spider/src/utils/mod.rs

Lines changed: 72 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5060,6 +5060,78 @@ pub async fn get_cached_url_base(
50605060
}
50615061
}
50625062

5063+
// Fallback: query remote hybrid_cache_server when chrome_remote_cache is enabled.
5064+
// The local CACACHE_MANAGER is in-memory (per-process), so it misses on first
5065+
// request or after restart. The remote cache persists across processes and has
5066+
// data populated by browser_server's CDP interception.
5067+
#[cfg(feature = "chrome_remote_cache")]
5068+
{
5069+
let cache_site =
5070+
chromiumoxide::cache::manager::site_key_for_target_url(target_url, auth_opt.as_deref());
5071+
let make_session_key = |url: &str| format!("GET:{}", url);
5072+
5073+
let try_session_get = |url: &str| {
5074+
chromiumoxide::cache::remote::get_session_cache_item(
5075+
&cache_site,
5076+
&make_session_key(url),
5077+
)
5078+
.and_then(|(http_response, stored_policy)| {
5079+
if allow_stale || !stored_policy.is_stale(now) {
5080+
let accept_lang = http_response
5081+
.headers
5082+
.get("accept-language")
5083+
.or_else(|| http_response.headers.get("Accept-Language"))
5084+
.map(|h| h.as_str());
5085+
decode_cached_html_bytes(&http_response.body, accept_lang)
5086+
} else {
5087+
None
5088+
}
5089+
})
5090+
};
5091+
5092+
// Check chromiumoxide session cache (may have been seeded by a prior navigation).
5093+
if let Some(body) = try_session_get(target_url) {
5094+
return Some(body);
5095+
}
5096+
5097+
// Pull from the remote cache server, seed local session cache, then retry.
5098+
// Timeout prevents blocking the critical path if the cache server is slow/down.
5099+
let _ = tokio::time::timeout(
5100+
Duration::from_secs(3),
5101+
chromiumoxide::cache::remote::get_cache_site(
5102+
target_url,
5103+
auth_opt.as_deref(),
5104+
Some("true"),
5105+
),
5106+
)
5107+
.await;
5108+
5109+
if let Some(body) = try_session_get(target_url) {
5110+
return Some(body);
5111+
}
5112+
5113+
// Try alternate URL (with/without trailing slash).
5114+
let alt_url: Option<String> = if target_url.ends_with('/') {
5115+
let trimmed = target_url.trim_end_matches('/');
5116+
if trimmed.is_empty() || trimmed == target_url {
5117+
None
5118+
} else {
5119+
Some(trimmed.to_string())
5120+
}
5121+
} else {
5122+
let mut s = String::with_capacity(target_url.len() + 1);
5123+
s.push_str(target_url);
5124+
s.push('/');
5125+
Some(s)
5126+
};
5127+
5128+
if let Some(alt) = alt_url {
5129+
if let Some(body) = try_session_get(&alt) {
5130+
return Some(body);
5131+
}
5132+
}
5133+
}
5134+
50635135
None
50645136
}
50655137

0 commit comments

Comments
 (0)