-
Notifications
You must be signed in to change notification settings - Fork 760
Expand file tree
/
Copy pathobservability.rs
More file actions
242 lines (224 loc) · 9.57 KB
/
observability.rs
File metadata and controls
242 lines (224 loc) · 9.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
//! Centralised error reporting for the core, plus a Sentry
//! `before_send` filter that drops per-attempt transient-upstream
//! provider failures.
//!
//! Wraps `tracing::error!` (which the global subscriber forwards to Sentry via
//! `sentry-tracing`) inside a `sentry::with_scope` so each captured event
//! carries consistent tags identifying the failing domain/operation plus any
//! callsite-specific context (session id, request id, tool name, …).
//!
//! Why this helper exists: errors that bubble up as `Result::Err` without ever
//! being logged at error level never reach Sentry. The agent-turn path is the
//! canonical example — `run_single` used to publish a `DomainEvent::AgentError`
//! and return `Err(_)`, but Sentry never saw it. Funnel error sites through
//! `report_error` so they show up tagged and grep-friendly in Sentry.
use std::fmt::Display;
/// A `(key, value)` pair attached as a Sentry tag. Tags are short, indexed,
/// and filterable in the Sentry UI — prefer them over free-form fields for
/// anything you'd want to facet on (`error_kind`, `tool_name`, `method`).
pub type Tag<'a> = (&'a str, &'a str);
/// HTTP status codes that the reliable-provider layer already handles via
/// retry + fallback, so per-attempt Sentry reports add noise without signal:
///
/// - **408** Request Timeout
/// - **429** Too Many Requests
/// - **502** Bad Gateway
/// - **503** Service Unavailable
/// - **504** Gateway Timeout
///
/// Single source of truth for both the call-site classifier
/// (`openhuman::providers::ops::should_report_provider_http_failure`) and the
/// `before_send` filter (`is_transient_provider_http_failure`). Update here
/// and both sites pick it up — keeps the two layers from drifting.
pub const TRANSIENT_PROVIDER_HTTP_STATUSES: &[u16] = &[408, 429, 502, 503, 504];
/// Capture an error to Sentry with structured tags.
///
/// `domain` and `operation` are required and become tags `domain:<…>` and
/// `operation:<…>`. `extra` is an optional list of extra tag pairs. The error
/// itself is rendered via `Display` and emitted as a `tracing::error!` event,
/// which the Sentry tracing layer turns into a Sentry event under the active
/// scope.
///
/// Use stable, low-cardinality values for tag keys/values so Sentry can group
/// and aggregate. High-cardinality data (full IDs, payloads) belongs in the
/// error message body, not in tags.
pub fn report_error<E: Display + ?Sized>(
err: &E,
domain: &str,
operation: &str,
extra: &[Tag<'_>],
) {
// Use the alternate format specifier so `anyhow::Error` renders its full
// context chain (outer context + every wrapped cause, joined by ": ").
// Plain `Display` impls fall back to the standard representation. Without
// this, anyhow's default `to_string()` only emits the outermost context
// and the underlying cause (e.g. a `toml::de::Error` with line/column) is
// dropped — making the captured Sentry event undiagnosable. See
// OPENHUMAN-TAURI-B2 for an instance where this masked the real failure.
let message = format!("{err:#}");
sentry::with_scope(
|scope| {
scope.set_tag("domain", domain);
scope.set_tag("operation", operation);
for (k, v) in extra {
scope.set_tag(*k, *v);
}
},
|| {
tracing::error!(
domain = domain,
operation = operation,
error = %message,
"[observability] {domain}.{operation} failed: {message}"
);
},
);
}
/// Returns true when a Sentry event is a per-attempt provider HTTP failure
/// that the reliable-provider layer already handles via retry + fallback.
///
/// The primary suppression lives at the call site
/// (`openhuman::providers::ops::should_report_provider_http_failure`),
/// which short-circuits transient codes before `report_error` ever fires.
/// This helper is intended for use inside the `sentry::ClientOptions`
/// `before_send` hook as defense-in-depth — it catches any future call
/// site that emits a `tracing::error!` with the same shape but bypasses
/// the classifier.
///
/// Match criteria (all required):
/// - tag `domain == "llm_provider"` — pins the filter to provider-originated
/// events so an unrelated subsystem emitting `failure=non_2xx`/`status=503`
/// for its own reasons doesn't get silently dropped
/// - tag `failure == "non_2xx"` (the marker set by `ops::api_error`)
/// - tag `status` parses to one of [`TRANSIENT_PROVIDER_HTTP_STATUSES`]
pub fn is_transient_provider_http_failure(event: &sentry::protocol::Event<'_>) -> bool {
let tags = &event.tags;
if tags.get("domain").map(String::as_str) != Some("llm_provider") {
return false;
}
if tags.get("failure").map(String::as_str) != Some("non_2xx") {
return false;
}
let Some(status_u16) = tags.get("status").and_then(|s| s.parse::<u16>().ok()) else {
return false;
};
TRANSIENT_PROVIDER_HTTP_STATUSES.contains(&status_u16)
}
#[cfg(test)]
mod tests {
use super::*;
/// Helper must accept `&anyhow::Error`, `&dyn std::error::Error`, and
/// plain `&str` — the three shapes that show up at error sites today.
#[test]
fn report_error_accepts_common_error_shapes() {
let anyhow_err = anyhow::anyhow!("boom");
report_error(&anyhow_err, "test", "anyhow_shape", &[]);
let io_err = std::io::Error::other("io failed");
report_error(&io_err, "test", "io_shape", &[("kind", "io")]);
report_error("plain message", "test", "str_shape", &[]);
}
#[test]
fn anyhow_chain_is_rendered_in_full() {
// Regression guard: `err.to_string()` on an anyhow chain only emits
// the outermost context. Using `{:#}` joins every cause, which is
// what Sentry needs to actually diagnose wrapped failures.
let inner = std::io::Error::other("inner cause");
let wrapped = anyhow::Error::from(inner).context("outer ctx");
assert_eq!(format!("{wrapped:#}"), "outer ctx: inner cause");
}
#[test]
fn report_error_does_not_panic_with_many_tags() {
let err = anyhow::anyhow!("multi-tag");
report_error(
&err,
"test",
"multi_tag",
&[("a", "1"), ("b", "2"), ("c", "3"), ("d", "4")],
);
}
fn event_with_tags(pairs: &[(&str, &str)]) -> sentry::protocol::Event<'static> {
let mut event = sentry::protocol::Event::default();
let mut tags: std::collections::BTreeMap<String, String> =
std::collections::BTreeMap::new();
for (k, v) in pairs {
tags.insert((*k).to_string(), (*v).to_string());
}
event.tags = tags;
event
}
#[test]
fn transient_filter_drops_429_408_502_503_504() {
for status in ["429", "408", "502", "503", "504"] {
let event = event_with_tags(&[
("domain", "llm_provider"),
("failure", "non_2xx"),
("status", status),
]);
assert!(
is_transient_provider_http_failure(&event),
"status {status} must be classified as transient and filtered"
);
}
}
#[test]
fn transient_filter_keeps_permanent_failures() {
for status in ["400", "401", "403", "404", "500"] {
let event = event_with_tags(&[
("domain", "llm_provider"),
("failure", "non_2xx"),
("status", status),
]);
assert!(
!is_transient_provider_http_failure(&event),
"status {status} must NOT be filtered — it's actionable"
);
}
}
#[test]
fn transient_filter_keeps_aggregate_all_exhausted() {
let event = event_with_tags(&[
("domain", "llm_provider"),
("failure", "all_exhausted"),
("status", "503"),
]);
assert!(
!is_transient_provider_http_failure(&event),
"aggregate all_exhausted events must surface (they are the cascade signal)"
);
}
#[test]
fn transient_filter_keeps_events_with_no_status_tag() {
let event = event_with_tags(&[("domain", "llm_provider"), ("failure", "non_2xx")]);
assert!(
!is_transient_provider_http_failure(&event),
"missing status tag must not be silently dropped"
);
}
// Regression guard for CodeRabbit review on #1529: the filter must scope
// to provider events only. Other subsystems emit `failure=non_2xx` (e.g.
// `providers/compatible.rs` uses the same marker for OAI-compatible
// error paths, but every site goes through `report_error(..,
// "llm_provider", ..)` so the domain tag is consistent), but the broader
// point is: any future caller that re-uses the same tag set for a
// different domain must NOT be silently dropped by this filter.
#[test]
fn transient_filter_keeps_events_with_no_domain_tag() {
let event = event_with_tags(&[("failure", "non_2xx"), ("status", "503")]);
assert!(
!is_transient_provider_http_failure(&event),
"missing domain tag means the event isn't provider-originated — must surface"
);
}
#[test]
fn transient_filter_keeps_events_from_other_domains() {
let event = event_with_tags(&[
("domain", "scheduler"),
("failure", "non_2xx"),
("status", "503"),
]);
assert!(
!is_transient_provider_http_failure(&event),
"non-provider domain must surface even if failure/status tags collide"
);
}
}