253 changes: 245 additions & 8 deletions Cargo.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions Cargo.toml
@@ -156,6 +156,9 @@ landlock = { version = "0.4", optional = true }
rppal = { version = "0.22", optional = true }

[dev-dependencies]
# Enable sentry's TestTransport for runtime smoke of the observability
# before_send filter (see tests/observability_smoke.rs).
sentry = { version = "0.47.0", features = ["test"] }

[features]
sandbox-landlock = ["dep:landlock"]
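The `tests/observability_smoke.rs` file referenced in the comment above is not part of the rendered diff. A minimal sketch of what such a smoke test could look like, assuming sentry's `test`-feature helpers (`sentry::test::with_captured_events_options`) and the filter introduced in `src/core/observability.rs` below; the test name and message text are illustrative, not the real file's contents:

// Hypothetical sketch of tests/observability_smoke.rs (the file itself is
// not shown in this diff): capture events through sentry's test transport
// and assert that the before_send hook drops a transient per-attempt 503.
use std::sync::Arc;

#[test]
fn before_send_drops_transient_provider_503() {
    let options = sentry::ClientOptions {
        before_send: Some(Arc::new(|event| {
            // Same filter body as the hook wired up in src/main.rs below.
            if openhuman_core::core::observability::is_transient_provider_http_failure(&event) {
                return None;
            }
            Some(event)
        })),
        ..Default::default()
    };
    let events = sentry::test::with_captured_events_options(
        || {
            sentry::with_scope(
                |scope| {
                    scope.set_tag("domain", "llm_provider");
                    scope.set_tag("failure", "non_2xx");
                    scope.set_tag("status", "503");
                },
                || {
                    sentry::capture_message(
                        "OpenAI API error (503 Service Unavailable): no healthy upstream",
                        sentry::Level::Error,
                    );
                },
            );
        },
        options,
    );
    assert!(events.is_empty(), "transient 503 must never reach the transport");
}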
142 changes: 141 additions & 1 deletion src/core/observability.rs
@@ -1,4 +1,6 @@
//! Centralised error reporting for the core.
//! Centralised error reporting for the core, plus a Sentry
//! `before_send` filter that drops per-attempt transient-upstream
//! provider failures.
//!
//! Wraps `tracing::error!` (which the global subscriber forwards to Sentry via
//! `sentry-tracing`) inside a `sentry::with_scope` so each captured event
@@ -18,6 +20,21 @@ use std::fmt::Display;
/// anything you'd want to facet on (`error_kind`, `tool_name`, `method`).
pub type Tag<'a> = (&'a str, &'a str);

/// HTTP status codes that the reliable-provider layer already handles via
/// retry + fallback, so per-attempt Sentry reports add noise without signal:
///
/// - **408** Request Timeout
/// - **429** Too Many Requests
/// - **502** Bad Gateway
/// - **503** Service Unavailable
/// - **504** Gateway Timeout
///
/// Single source of truth for both the call-site classifier
/// (`openhuman::providers::ops::should_report_provider_http_failure`) and the
/// `before_send` filter (`is_transient_provider_http_failure`). Update here
/// and both sites pick it up — keeps the two layers from drifting.
pub const TRANSIENT_PROVIDER_HTTP_STATUSES: &[u16] = &[408, 429, 502, 503, 504];

/// Capture an error to Sentry with structured tags.
///
/// `domain` and `operation` are required and become tags `domain:<…>` and
@@ -62,6 +79,37 @@ pub fn report_error<E: Display + ?Sized>(
);
}

/// Returns true when a Sentry event is a per-attempt provider HTTP failure
/// that the reliable-provider layer already handles via retry + fallback.
///
/// The primary suppression lives at the call site
/// (`openhuman::providers::ops::should_report_provider_http_failure`),
/// which short-circuits transient codes before `report_error` ever fires.
/// This helper is intended for use inside the `sentry::ClientOptions`
/// `before_send` hook as defense-in-depth — it catches any future call
/// site that emits a `tracing::error!` with the same shape but bypasses
/// the classifier.
///
/// Match criteria (all required):
/// - tag `domain == "llm_provider"` — pins the filter to provider-originated
/// events so an unrelated subsystem emitting `failure=non_2xx`/`status=503`
/// for its own reasons doesn't get silently dropped
/// - tag `failure == "non_2xx"` (the marker set by `ops::api_error`)
/// - tag `status` parses to one of [`TRANSIENT_PROVIDER_HTTP_STATUSES`]
pub fn is_transient_provider_http_failure(event: &sentry::protocol::Event<'_>) -> bool {
let tags = &event.tags;
if tags.get("domain").map(String::as_str) != Some("llm_provider") {
return false;
}
if tags.get("failure").map(String::as_str) != Some("non_2xx") {
return false;
}
let Some(status_u16) = tags.get("status").and_then(|s| s.parse::<u16>().ok()) else {
return false;
};
TRANSIENT_PROVIDER_HTTP_STATUSES.contains(&status_u16)
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -99,4 +147,96 @@ mod tests {
&[("a", "1"), ("b", "2"), ("c", "3"), ("d", "4")],
);
}

fn event_with_tags(pairs: &[(&str, &str)]) -> sentry::protocol::Event<'static> {
let mut event = sentry::protocol::Event::default();
let mut tags: std::collections::BTreeMap<String, String> =
std::collections::BTreeMap::new();
for (k, v) in pairs {
tags.insert((*k).to_string(), (*v).to_string());
}
event.tags = tags;
event
}

#[test]
fn transient_filter_drops_429_408_502_503_504() {
for status in ["429", "408", "502", "503", "504"] {
let event = event_with_tags(&[
("domain", "llm_provider"),
("failure", "non_2xx"),
("status", status),
]);
assert!(
is_transient_provider_http_failure(&event),
"status {status} must be classified as transient and filtered"
);
}
}

#[test]
fn transient_filter_keeps_permanent_failures() {
for status in ["400", "401", "403", "404", "500"] {
let event = event_with_tags(&[
("domain", "llm_provider"),
("failure", "non_2xx"),
("status", status),
]);
assert!(
!is_transient_provider_http_failure(&event),
"status {status} must NOT be filtered — it's actionable"
);
}
}

#[test]
fn transient_filter_keeps_aggregate_all_exhausted() {
let event = event_with_tags(&[
("domain", "llm_provider"),
("failure", "all_exhausted"),
("status", "503"),
]);
assert!(
!is_transient_provider_http_failure(&event),
"aggregate all_exhausted events must surface (they are the cascade signal)"
);
}

#[test]
fn transient_filter_keeps_events_with_no_status_tag() {
let event = event_with_tags(&[("domain", "llm_provider"), ("failure", "non_2xx")]);
assert!(
!is_transient_provider_http_failure(&event),
"missing status tag must not be silently dropped"
);
}

// Regression guard for CodeRabbit review on #1529: the filter must scope
// to provider events only. Today every `failure=non_2xx` site reports
// through `report_error(.., "llm_provider", ..)` (including the
// OAI-compatible error paths in `providers/compatible.rs`), so the domain
// tag is consistent in practice. The guard exists for the future: any
// caller that reuses the same failure/status tags under a different
// domain must NOT be silently dropped by this filter.
#[test]
fn transient_filter_keeps_events_with_no_domain_tag() {
let event = event_with_tags(&[("failure", "non_2xx"), ("status", "503")]);
assert!(
!is_transient_provider_http_failure(&event),
"missing domain tag means the event isn't provider-originated — must surface"
);
}

#[test]
fn transient_filter_keeps_events_from_other_domains() {
let event = event_with_tags(&[
("domain", "scheduler"),
("failure", "non_2xx"),
("status", "503"),
]);
assert!(
!is_transient_provider_http_failure(&event),
"non-provider domain must surface even if failure/status tags collide"
);
}
}
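For context on how these events reach `before_send` at all: the module doc above mentions a global subscriber that forwards `tracing::error!` to Sentry via `sentry-tracing`. That wiring sits outside this diff; a minimal sketch of the assumed setup:

// Hypothetical subscriber wiring (the real setup lives elsewhere in the
// crate and is outside this diff). The sentry-tracing layer turns
// ERROR-level tracing events into Sentry events, which then pass through
// the before_send hook configured in src/main.rs below.
use tracing_subscriber::prelude::*;

fn init_observability() {
    tracing_subscriber::registry()
        .with(tracing_subscriber::fmt::layer())
        .with(sentry_tracing::layer())
        .init();
}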
13 changes: 13 additions & 0 deletions src/main.rs
@@ -46,6 +46,19 @@ fn main() {
environment: Some(std::borrow::Cow::Owned(resolve_environment())),
send_default_pii: false,
before_send: Some(std::sync::Arc::new(|mut event| {
// Defense-in-depth: drop transient-upstream provider failures that
// slipped past the call-site classifier. The reliable-provider
// layer already retries 429/408/502/503/504 with backoff +
// fallback, and the aggregate "all providers exhausted" event
// still fires for genuine outages. Per-attempt reports flood
// Sentry — see OPENHUMAN-TAURI-2E (~1393 events), -84 (~1050),
// -T (~871). The primary fix lives in
// `openhuman::providers::ops::should_report_provider_http_failure`
// (transient codes excluded). This filter catches any future call
// site that bypasses it.
if openhuman_core::core::observability::is_transient_provider_http_failure(&event) {
return None;
}
// Strip server_name (hostname) to avoid leaking machine identity
event.server_name = None;
// Attach the cached account uid so Sentry can count unique users
43 changes: 33 additions & 10 deletions src/openhuman/agent/harness/tool_loop.rs
@@ -408,16 +408,39 @@ pub(crate) async fn run_tool_call_loop(
)
}
Err(e) => {
crate::core::observability::report_error(
&e,
"agent",
"provider_chat",
&[
("provider", provider_name),
("model", model),
("iteration", &(iteration + 1).to_string()),
],
);
// Transient upstream failures (rate-limit, gateway 5xx, "no
// healthy upstream", etc.) are already classified + retried
// by reliable.rs and produce an aggregate Sentry event only
// when every provider/model is exhausted. Reporting each
// per-iteration provider_chat error here duplicates the
// signal and floods Sentry — see OPENHUMAN-TAURI-3Y/3Z
// (~46 events combined) and the underlying TAURI-2E/84/T
// (~3300 events from raw per-attempt 429/503/504 reports).
let transient = crate::openhuman::providers::reliable::is_rate_limited(&e)
|| crate::openhuman::providers::reliable::is_upstream_unhealthy(&e);
if transient {
tracing::warn!(
domain = "agent",
operation = "provider_chat",
provider = provider_name,
model = model,
iteration = iteration + 1,
error = %format!("{e:#}"),
"[agent] transient provider_chat failure — retried upstream; \
aggregated all-providers-exhausted will report if applicable"
);
} else {
crate::core::observability::report_error(
&e,
"agent",
"provider_chat",
&[
("provider", provider_name),
("model", model),
("iteration", &(iteration + 1).to_string()),
],
);
}
return Err(e);
}
};
71 changes: 43 additions & 28 deletions src/openhuman/providers/ops.rs
@@ -143,15 +143,18 @@ const OPENHUMAN_BACKEND_PROVIDER_LABEL: &str = "OpenHuman";

/// Whether a non-2xx provider response is worth reporting to Sentry.
///
/// 429 Too Many Requests is a transient, caller-side throttling signal — the
/// reliable-provider layer already retries with backoff and falls back across
/// providers/models, and the aggregate "all providers exhausted" event still
/// fires if every attempt fails. Reporting each individual 429 floods Sentry
/// (see OPENHUMAN-TAURI-6Y: ~8K events/day from one user being rate-limited
/// by an upstream model). Callers should still propagate the error so retry
/// and fallback logic runs unchanged; this only gates the Sentry report.
/// Transient upstream statuses — 429 Too Many Requests, 408 Request Timeout,
/// and 502/503/504 gateway-layer failures — are caller-side throttling or
/// upstream-capacity signals. The reliable-provider layer already retries
/// with backoff and falls back across providers/models, and the aggregate
/// "all providers exhausted" event still fires if every attempt fails.
/// Reporting each individual transient failure floods Sentry (see
/// OPENHUMAN-TAURI-6Y / 2E / 84 / T: thousands of events/day per user from
/// a single upstream rate-limit / outage window). Callers should still
/// propagate the error so retry and fallback logic runs unchanged; this
/// only gates the per-attempt Sentry report.
pub fn should_report_provider_http_failure(status: reqwest::StatusCode) -> bool {
status != reqwest::StatusCode::TOO_MANY_REQUESTS
!crate::core::observability::TRANSIENT_PROVIDER_HTTP_STATUSES.contains(&status.as_u16())
}

/// Build a sanitized provider error from a failed HTTP response.
@@ -425,25 +428,37 @@ mod tests {
}

#[test]
fn skips_sentry_report_for_429_only() {
// 429 is transient rate-limit — reliable.rs retries + falls back, and
// the aggregate "all providers exhausted" event still fires for genuine
// outages. Reporting each 429 individually floods Sentry.
assert!(!should_report_provider_http_failure(
reqwest::StatusCode::TOO_MANY_REQUESTS
));
// Everything else (auth, server, gateway, etc.) is still worth a report.
assert!(should_report_provider_http_failure(
reqwest::StatusCode::UNAUTHORIZED
));
assert!(should_report_provider_http_failure(
reqwest::StatusCode::INTERNAL_SERVER_ERROR
));
assert!(should_report_provider_http_failure(
reqwest::StatusCode::BAD_GATEWAY
));
assert!(should_report_provider_http_failure(
reqwest::StatusCode::SERVICE_UNAVAILABLE
));
fn skips_sentry_report_for_transient_upstream_statuses() {
// Transient statuses — 429 rate-limit, 408 client timeout, and 502/503/504
// gateway-layer failures — are retried by reliable.rs. The aggregate
// "all providers exhausted" event still fires for genuine outages.
// Reporting each attempt individually floods Sentry (OPENHUMAN-TAURI-2E
// ~1393 events, 84 ~1050 events, T ~871 events).
for transient in [
reqwest::StatusCode::TOO_MANY_REQUESTS,
reqwest::StatusCode::REQUEST_TIMEOUT,
reqwest::StatusCode::BAD_GATEWAY,
reqwest::StatusCode::SERVICE_UNAVAILABLE,
reqwest::StatusCode::GATEWAY_TIMEOUT,
] {
assert!(
!should_report_provider_http_failure(transient),
"transient status {transient} must not trigger per-attempt Sentry report"
);
}
// Auth + permanent server faults remain reportable — those are
// misconfiguration or genuine bugs, not transient capacity issues.
for reportable in [
reqwest::StatusCode::UNAUTHORIZED,
reqwest::StatusCode::FORBIDDEN,
reqwest::StatusCode::BAD_REQUEST,
reqwest::StatusCode::NOT_FOUND,
reqwest::StatusCode::INTERNAL_SERVER_ERROR,
] {
assert!(
should_report_provider_http_failure(reportable),
"status {reportable} must still report to Sentry"
);
}
}
}
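The `ops::api_error` constructor that stamps the `failure=non_2xx` and `status=<code>` tags is referenced throughout but falls outside this diff's hunks. A hedged sketch of the call-site shape the doc above describes; the function name, signature, and `operation` value here are assumptions:

// Hypothetical shape of the sanitized-error constructor (the real
// ops::api_error is not shown in this diff; name, signature, and the
// operation tag are assumptions). The gate suppresses only the
// per-attempt Sentry report. The error itself is always returned, so
// reliable.rs can still classify it and run retry + fallback.
fn api_error_sketch(provider: &str, status: reqwest::StatusCode, body: &str) -> anyhow::Error {
    if should_report_provider_http_failure(status) {
        crate::core::observability::report_error(
            &format!("{provider} API error ({status}): {body}"),
            "llm_provider",
            "api_error",
            &[("failure", "non_2xx"), ("status", status.as_str())],
        );
    }
    // Always propagate; retry/fallback decisions happen downstream.
    anyhow::anyhow!("{provider} API error ({status}): {body}")
}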
25 changes: 23 additions & 2 deletions src/openhuman/providers/reliable.rs
@@ -75,13 +75,34 @@ fn is_context_window_exceeded(err: &anyhow::Error) -> bool {
hints.iter().any(|hint| lower.contains(hint))
}

/// Detect provider-side temporary capacity/outage errors that are often surfaced
/// as HTTP 5xx with text like "no healthy upstream".
/// Detect provider-side temporary capacity/outage errors. Covers:
///
/// - HTTP `408 Request Timeout`, `502 Bad Gateway`, `503 Service Unavailable`,
/// `504 Gateway Timeout` — both via direct `reqwest::Error` downcast and via
/// the formatted `"<provider> API error (<status>): …"` text emitted by
/// `ops::api_error` (the path that actually reaches `report_error`).
/// - Provider-agnostic text markers like `"no healthy upstream"` /
/// `"upstream unavailable"` that don't come with a typed status.
///
/// Pairs with [`is_rate_limited`] which handles 429 separately. Together they
/// form the transient-classifier the tool-call loop uses before deciding
/// whether to push a per-attempt event to Sentry (see OPENHUMAN-TAURI-2E /
/// -84 / -T / -G classes — per-iteration noise from upstream throttling).
pub(crate) fn is_upstream_unhealthy(err: &anyhow::Error) -> bool {
if let Some(reqwest_err) = err.downcast_ref::<reqwest::Error>() {
if let Some(status) = reqwest_err.status() {
if matches!(status.as_u16(), 408 | 502 | 503 | 504) {
return true;
}
}
}
let lower = err.to_string().to_lowercase();
lower.contains("no healthy upstream")
|| lower.contains("upstream unavailable")
|| lower.contains("service unavailable")
|| lower.contains("408 request timeout")
|| lower.contains("502 bad gateway")
|| lower.contains("504 gateway timeout")
}

/// Check if an error is a rate-limit (429) error.
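The new `reliable_tests.rs` cases below exercise the text-matching path; this small illustration makes explicit why that path is needed at all, using error text in the same shape as those tests:

// Illustration: an error built from the formatted string carries no
// reqwest::Error, so the downcast arm misses and only the lowercase
// text patterns classify it as transient.
#[test]
fn formatted_errors_hit_the_text_path() {
    let err = anyhow::anyhow!("Anthropic API error (502 Bad Gateway): upstream reset");
    assert!(err.downcast_ref::<reqwest::Error>().is_none());
    assert!(is_upstream_unhealthy(&err)); // matched via "502 bad gateway"
    // Combined per-attempt check, mirroring the tool-call loop above:
    assert!(is_rate_limited(&err) || is_upstream_unhealthy(&err));
}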
23 changes: 23 additions & 0 deletions src/openhuman/providers/reliable_tests.rs
@@ -799,6 +799,29 @@ fn upstream_unhealthy_does_not_flag_generic_error() {
assert!(!is_upstream_unhealthy(&err));
}

// 408/502/504 must also classify as transient — `ops::api_error` formats
// the upstream failure as "<provider> API error (<status>): <body>", and the
// tool-call loop ORs is_rate_limited (429) with is_upstream_unhealthy. Before
// this fix only 503/text-pattern matched; 408/502/504 leaked per-iteration
// Sentry events (CodeRabbit review on #1529, OPENHUMAN-TAURI-T/-2E/-84).
#[test]
fn upstream_unhealthy_detects_408_request_timeout() {
let err = anyhow::anyhow!("OpenAI API error (408 Request Timeout): upstream took too long");
assert!(is_upstream_unhealthy(&err));
}

#[test]
fn upstream_unhealthy_detects_502_bad_gateway() {
let err = anyhow::anyhow!("Anthropic API error (502 Bad Gateway): bad gateway");
assert!(is_upstream_unhealthy(&err));
}

#[test]
fn upstream_unhealthy_detects_504_gateway_timeout() {
let err = anyhow::anyhow!("OpenAI API error (504 Gateway Timeout): upstream timed out");
assert!(is_upstream_unhealthy(&err));
}

#[test]
fn failure_reason_upstream_unhealthy_wins_over_rate_limited() {
// Both rate_limited AND upstream_unhealthy — upstream_unhealthy must win.