Skip to content

Commit a7d21ac

Browse files
committed
fix(a2a): address PR #3704 review findings — security, observability, type design
Security-critical: - Resolve A2A task lookup ambiguity: Rust runtime pre-resolves agent from URL path and injects agent_id into task-method proxy params; Python service refuses to guess when (task_id, no agent_id) matches multiple rows. Prevents cross-agent task read/cancel via shared task_id. - Encrypt push webhook auth_token at rest via services_auth.encode_auth; new list_push_configs_for_dispatch returns decrypted tokens for the trusted Rust sidecar only. Fixes secret-at-rest leak. - Upsert semantics on push-config re-registration: mutable fields (auth_token, events, enabled) update in place instead of silently keeping stale config. Key rotations now take effect. - Fail-closed on query-param auth decrypt failure (matches header path). - Refuse empty/missing A2A_RUST_AUTH_SECRET at startup; add cross-field RuntimeConfig validation (retry budget, TTL ordering, session TTL). - Feature-flag gate on internal A2A endpoints: reject trusted requests when MCPGATEWAY_A2A_ENABLED=false (defense-in-depth). - events/flush rejects events for unknown task_ids (400) to prevent visibility bypass. Observability: - logger.exception on _authorize_internal_a2a_method errors. - logger.warning on agent-visibility denial across 7 internal endpoints. - warn! on JSON parse failures in 4 proxy sites (server.rs) and malformed Last-Event-ID headers. - warn! on session fingerprint mismatch (security-relevant signal). - MetricsCollector.webhook_retry_exhausted counter + record method. - Log Rust-cache invalidation scheduling failures instead of silent pass. - Rollback-after-failure in task persistence now logs and calls invalidate. Type design / schema: - New A2ATaskState enum with terminal-state validator on A2ATaskUpdate. - Unknown task-state protocol values now warn (passed through for forward-compat). - Alembic: tenant=String(255), icon_url=String(767) match the ORM (prevents Postgres VARCHAR-length drift between fresh and migrated DBs). - Rename two A2A migration files to hash-first convention matching the rest of the project. - trust.rs doc/comment clarifies the header is SHA256, not HMAC. - BTreeMap for query-param merge ensures deterministic ordering. - encode_auth_context uses .expect() — empty header would look anonymous. Test coverage: - Deny-path regressions for internal A2A endpoints: RBAC denied, wrong-team, feature-disabled (20 new cases). - Rust↔Python bridge: ConnectError/ConnectTimeout/ReadTimeout/PoolTimeout now wrap as RustA2ARuntimeError with correct is_timeout flag. - Push config: encryption round-trip, token rotation, upsert semantics, undecryptable-legacy heal path, dispatch-listing SQL visibility scope. - Rust concurrency: session concurrent lookup/invalidate leaves storage consistent; concurrent create produces unique IDs; queue overflow under concurrent producers rejects with Full. - RuntimeConfig cross-field validation (5 scenarios). - Rust push: webhook retry-exhaustion metric increments. - Binary startup refuses without auth_secret. - select_downstream_agent: ORDER BY name + enabled filter assertions. Deferred to follow-up issues: - Typestate wrappers for trust-boundary types (#4233) - SSE client-disconnect integration test (#4234) Signed-off-by: Jonathan Springer <jps@s390x.com>
1 parent dddd41d commit a7d21ac

24 files changed

+1648
-261
lines changed

.secrets.baseline

Lines changed: 19 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"files": "^.secrets.baseline|package-lock.json|Cargo.lock|scripts/sign_image.sh|scripts/zap|sonar-project.properties|uv.lock|go.sum|mcpgateway/sri_hashes.json|^.secrets.baseline$",
44
"lines": null
55
},
6-
"generated_at": "2026-04-15T16:31:17Z",
6+
"generated_at": "2026-04-15T18:51:42Z",
77
"plugins_used": [
88
{
99
"name": "AWSKeyDetector"
@@ -5136,31 +5136,31 @@
51365136
"hashed_secret": "f2b14f68eb995facb3a1c35287b778d5bd785511",
51375137
"is_secret": false,
51385138
"is_verified": false,
5139-
"line_number": 5497,
5139+
"line_number": 5687,
51405140
"type": "Secret Keyword",
51415141
"verified_result": null
51425142
},
51435143
{
51445144
"hashed_secret": "f42a3fabe1e9bed059d727f47eb752e3aa61b977",
51455145
"is_secret": false,
51465146
"is_verified": false,
5147-
"line_number": 5554,
5147+
"line_number": 5744,
51485148
"type": "Secret Keyword",
51495149
"verified_result": null
51505150
},
51515151
{
51525152
"hashed_secret": "b85788b459aa4d67e1070930dae6d0827756aadb",
51535153
"is_secret": false,
51545154
"is_verified": false,
5155-
"line_number": 5592,
5155+
"line_number": 5782,
51565156
"type": "Secret Keyword",
51575157
"verified_result": null
51585158
},
51595159
{
51605160
"hashed_secret": "52dcc83ec1e54426ad58a64854d1eb8d5f5d9685",
51615161
"is_secret": false,
51625162
"is_verified": false,
5163-
"line_number": 5593,
5163+
"line_number": 5783,
51645164
"type": "Secret Keyword",
51655165
"verified_result": null
51665166
}
@@ -7502,119 +7502,111 @@
75027502
"hashed_secret": "41db7c65f665e40b44178f95f5f86473ca17160e",
75037503
"is_secret": false,
75047504
"is_verified": false,
7505-
"line_number": 72,
7505+
"line_number": 73,
75067506
"type": "Secret Keyword",
75077507
"verified_result": null
75087508
},
75097509
{
75107510
"hashed_secret": "4ea8d2335b430796cf3f500368c5b0f5b1dc90f5",
75117511
"is_secret": false,
75127512
"is_verified": false,
7513-
"line_number": 190,
7513+
"line_number": 191,
75147514
"type": "Secret Keyword",
75157515
"verified_result": null
75167516
},
75177517
{
75187518
"hashed_secret": "1a91d62f7ca67399625a4368a6ab5d4a3baa6073",
75197519
"is_secret": false,
75207520
"is_verified": false,
7521-
"line_number": 191,
7521+
"line_number": 192,
75227522
"type": "Secret Keyword",
75237523
"verified_result": null
75247524
},
75257525
{
75267526
"hashed_secret": "5ef3af6cd9b5aa907fa618fac829baaec6763b42",
75277527
"is_secret": false,
75287528
"is_verified": false,
7529-
"line_number": 385,
7529+
"line_number": 387,
75307530
"type": "Secret Keyword",
75317531
"verified_result": null
75327532
},
75337533
{
75347534
"hashed_secret": "94c9325c8ade4f6327d87e240713c2fd7fdbfc63",
75357535
"is_secret": false,
75367536
"is_verified": false,
7537-
"line_number": 386,
7537+
"line_number": 388,
75387538
"type": "Secret Keyword",
75397539
"verified_result": null
75407540
},
75417541
{
75427542
"hashed_secret": "7bb0c6efd2edb1ae20dab6dd260895ad8895e07e",
75437543
"is_secret": false,
75447544
"is_verified": false,
7545-
"line_number": 422,
7545+
"line_number": 424,
75467546
"type": "Secret Keyword",
75477547
"verified_result": null
75487548
},
75497549
{
75507550
"hashed_secret": "ddbeee31dc29b74fcede0bea4d3fc5276d9148c8",
75517551
"is_secret": false,
75527552
"is_verified": false,
7553-
"line_number": 448,
7553+
"line_number": 451,
75547554
"type": "Secret Keyword",
75557555
"verified_result": null
75567556
},
75577557
{
75587558
"hashed_secret": "122758dd535bc6d246f36a686b29c7c40fab1315",
75597559
"is_secret": false,
75607560
"is_verified": false,
7561-
"line_number": 799,
7561+
"line_number": 803,
75627562
"type": "Secret Keyword",
75637563
"verified_result": null
75647564
},
75657565
{
75667566
"hashed_secret": "fa727f587e9f6115da6d4eb600dd8b6fe06cf69a",
75677567
"is_secret": false,
75687568
"is_verified": false,
7569-
"line_number": 801,
7569+
"line_number": 805,
75707570
"type": "Secret Keyword",
75717571
"verified_result": null
75727572
},
75737573
{
75747574
"hashed_secret": "250cdef3ce09000fa9a939a13e5f73433d96a852",
75757575
"is_secret": false,
75767576
"is_verified": false,
7577-
"line_number": 2354,
7577+
"line_number": 2399,
75787578
"type": "Secret Keyword",
75797579
"verified_result": null
75807580
},
75817581
{
75827582
"hashed_secret": "f2b14f68eb995facb3a1c35287b778d5bd785511",
75837583
"is_secret": false,
75847584
"is_verified": false,
7585-
"line_number": 2363,
7586-
"type": "Secret Keyword",
7587-
"verified_result": null
7588-
},
7589-
{
7590-
"hashed_secret": "1902e3d6fc4e78a0bcc50ba12b882769afbf4a8c",
7591-
"is_secret": false,
7592-
"is_verified": false,
7593-
"line_number": 2395,
7585+
"line_number": 2408,
75947586
"type": "Secret Keyword",
75957587
"verified_result": null
75967588
},
75977589
{
75987590
"hashed_secret": "d92342976d720ff38cf5dcb329be41959ab1ba6c",
75997591
"is_secret": false,
76007592
"is_verified": false,
7601-
"line_number": 2816,
7593+
"line_number": 3241,
76027594
"type": "Secret Keyword",
76037595
"verified_result": null
76047596
},
76057597
{
76067598
"hashed_secret": "a343c9b5b1abfcf385d5c6f6dc0282f4d88a416e",
76077599
"is_secret": false,
76087600
"is_verified": false,
7609-
"line_number": 2977,
7601+
"line_number": 3402,
76107602
"type": "Secret Keyword",
76117603
"verified_result": null
76127604
},
76137605
{
76147606
"hashed_secret": "c00dbbc9dadfbe1e232e93a729dd4752fade0abf",
76157607
"is_secret": false,
76167608
"is_verified": false,
7617-
"line_number": 3009,
7609+
"line_number": 3434,
76187610
"type": "Secret Keyword",
76197611
"verified_result": null
76207612
}

crates/a2a_runtime/src/auth.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,9 +131,13 @@ pub fn apply_invoke_auth(
131131
}
132132

133133
// Merge auth query params (override existing keys).
134+
//
135+
// Use BTreeMap so the resulting query string is lexicographically
136+
// ordered. HashMap iteration order is randomized per-process, which
137+
// would (a) break HMAC/signed-URL flows that require a canonical
138+
// serialization and (b) make log/diff comparison non-deterministic.
134139
if !query_params.is_empty() {
135-
// Collect existing pairs, then layer auth on top.
136-
let mut merged: HashMap<String, String> = url
140+
let mut merged: std::collections::BTreeMap<String, String> = url
137141
.query_pairs()
138142
.map(|(k, v)| (k.into_owned(), v.into_owned()))
139143
.collect();

crates/a2a_runtime/src/config.rs

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,61 @@ impl RuntimeConfig {
202202
)
203203
})
204204
}
205+
206+
/// Cross-field validation invoked by [`crate::run`] at startup.
207+
///
208+
/// Clap handles per-field parsing, but several invariants cross multiple
209+
/// fields. Catching these at startup beats surfacing them as strange
210+
/// runtime behaviour (timeouts that can never trigger, retry budgets
211+
/// that exceed the client timeout, etc.).
212+
pub fn validate_cross_field(&self) -> Result<(), String> {
213+
if self.max_concurrent == 0 {
214+
return Err("A2A_RUST_MAX_CONCURRENT must be >= 1".to_string());
215+
}
216+
if self.agent_cache_max_entries == 0 {
217+
return Err("A2A_RUST_AGENT_CACHE_MAX_ENTRIES must be >= 1".to_string());
218+
}
219+
if self.circuit_failure_threshold == 0 {
220+
return Err("A2A_RUST_CIRCUIT_FAILURE_THRESHOLD must be >= 1".to_string());
221+
}
222+
223+
// Retry budget: total backoff (sum of geometric series up to
224+
// max_retries * retry_backoff_ms) should fit within the per-request
225+
// timeout, or the circuit breaker will trip before retries even run.
226+
// Use a conservative upper bound: max_retries * retry_backoff_ms.
227+
let retry_budget_ms = self
228+
.retry_backoff_ms
229+
.saturating_mul(u64::from(self.max_retries));
230+
if retry_budget_ms > self.request_timeout_ms {
231+
return Err(format!(
232+
"retry budget ({}ms = {} retries × {}ms backoff) exceeds request timeout ({}ms); \
233+
retries can never complete",
234+
retry_budget_ms, self.max_retries, self.retry_backoff_ms, self.request_timeout_ms
235+
));
236+
}
237+
238+
// Cache TTL sanity: L2 (Redis, shared) should outlive L1 (in-process)
239+
// so a node restart does not reload a stale-by-comparison entry from
240+
// L2. Allow equality (both set to the same value) for simple
241+
// deployments.
242+
if self.l2_cache_ttl_secs > 0 && self.l2_cache_ttl_secs < self.agent_cache_ttl_secs {
243+
return Err(format!(
244+
"A2A_RUST_L2_CACHE_TTL_SECS ({}) must be >= A2A_RUST_AGENT_CACHE_TTL_SECS ({})",
245+
self.l2_cache_ttl_secs, self.agent_cache_ttl_secs
246+
));
247+
}
248+
249+
// Session TTL: when sessions are enabled, the session must outlive
250+
// a typical request round-trip; otherwise fast-path cache hits will
251+
// never trigger.
252+
if self.session_enabled && self.session_ttl_secs == 0 {
253+
return Err(
254+
"A2A_RUST_SESSION_TTL_SECS must be > 0 when sessions are enabled".to_string(),
255+
);
256+
}
257+
258+
Ok(())
259+
}
205260
}
206261

207262
#[cfg(test)]
@@ -287,4 +342,58 @@ mod tests {
287342
assert!(config.session_enabled);
288343
assert_eq!(config.event_store_max_events, 1000);
289344
}
345+
346+
#[test]
347+
fn validate_cross_field_accepts_defaults() {
348+
let config = test_config();
349+
assert!(config.validate_cross_field().is_ok());
350+
}
351+
352+
#[test]
353+
fn validate_cross_field_rejects_zero_max_concurrent() {
354+
let mut config = test_config();
355+
config.max_concurrent = 0;
356+
let err = config.validate_cross_field().expect_err("should reject");
357+
assert!(err.contains("MAX_CONCURRENT"));
358+
}
359+
360+
#[test]
361+
fn validate_cross_field_rejects_retry_budget_exceeding_timeout() {
362+
// 5 retries × 1000ms backoff = 5000ms retry budget,
363+
// but request timeout is only 2000ms — retries cannot complete.
364+
let mut config = test_config();
365+
config.request_timeout_ms = 2_000;
366+
config.retry_backoff_ms = 1_000;
367+
config.max_retries = 5;
368+
let err = config.validate_cross_field().expect_err("should reject");
369+
assert!(err.contains("retry budget"));
370+
assert!(err.contains("exceeds request timeout"));
371+
}
372+
373+
#[test]
374+
fn validate_cross_field_rejects_l2_ttl_shorter_than_l1() {
375+
// L2 (Redis, shared) should outlive L1 or node restarts reload stale data.
376+
let mut config = test_config();
377+
config.agent_cache_ttl_secs = 600;
378+
config.l2_cache_ttl_secs = 60;
379+
let err = config.validate_cross_field().expect_err("should reject");
380+
assert!(err.contains("L2_CACHE_TTL_SECS"));
381+
}
382+
383+
#[test]
384+
fn validate_cross_field_rejects_zero_session_ttl_when_sessions_enabled() {
385+
let mut config = test_config();
386+
config.session_enabled = true;
387+
config.session_ttl_secs = 0;
388+
let err = config.validate_cross_field().expect_err("should reject");
389+
assert!(err.contains("SESSION_TTL_SECS"));
390+
}
391+
392+
#[test]
393+
fn validate_cross_field_allows_zero_session_ttl_when_sessions_disabled() {
394+
let mut config = test_config();
395+
config.session_enabled = false;
396+
config.session_ttl_secs = 0;
397+
assert!(config.validate_cross_field().is_ok());
398+
}
290399
}

crates/a2a_runtime/src/lib.rs

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,26 @@ fn build_http_client(config: &RuntimeConfig) -> Result<Client, reqwest::Error> {
5353
}
5454

5555
pub async fn run(config: RuntimeConfig) -> Result<(), RuntimeError> {
56+
if config
57+
.auth_secret
58+
.as_deref()
59+
.map(str::is_empty)
60+
.unwrap_or(true)
61+
{
62+
return Err(RuntimeError::Config(
63+
"A2A_RUST_AUTH_SECRET is required: the runtime cannot build trust headers or decrypt \
64+
encrypted auth blobs without a shared secret. Refusing to start."
65+
.to_string(),
66+
));
67+
}
68+
69+
// Surface cross-field config errors at startup rather than as silent
70+
// runtime misbehaviour (timeouts that never trigger, retries that
71+
// exceed their deadline, cache tiers that churn each other, etc.).
72+
config
73+
.validate_cross_field()
74+
.map_err(RuntimeError::Config)?;
75+
5676
let client = build_http_client(&config)?;
5777
let config_arc = Arc::new(config.clone());
5878

@@ -131,7 +151,10 @@ pub async fn run(config: RuntimeConfig) -> Result<(), RuntimeError> {
131151
rx,
132152
client.clone(),
133153
config.backend_base_url.clone(),
134-
config.auth_secret.clone().unwrap_or_default(),
154+
config
155+
.auth_secret
156+
.clone()
157+
.expect("auth_secret validated non-empty in run()"),
135158
Duration::from_millis(config.event_flush_interval_ms),
136159
config.event_flush_batch_size,
137160
);
@@ -288,7 +311,7 @@ mod tests {
288311
max_response_body_bytes: 1024,
289312
max_retries: 0,
290313
retry_backoff_ms: 1,
291-
auth_secret: None,
314+
auth_secret: Some("test-shared-secret".to_string()),
292315
backend_base_url: "http://127.0.0.1:4444".to_string(),
293316
max_concurrent: 1,
294317
max_queued: Some(4),
@@ -353,6 +376,30 @@ mod tests {
353376
.expect("runtime should start and shut down cleanly");
354377
}
355378

379+
#[tokio::test]
380+
async fn run_rejects_missing_auth_secret() {
381+
let mut config = test_config();
382+
config.auth_secret = None; // pragma: allowlist secret
383+
let err = run(config)
384+
.await
385+
.expect_err("missing auth_secret must fail");
386+
assert!(
387+
matches!(err, RuntimeError::Config(_)),
388+
"expected Config error, got {err:?}"
389+
);
390+
}
391+
392+
#[tokio::test]
393+
async fn run_rejects_empty_auth_secret() {
394+
let mut config = test_config();
395+
config.auth_secret = Some(String::new());
396+
let err = run(config).await.expect_err("empty auth_secret must fail");
397+
assert!(
398+
matches!(err, RuntimeError::Config(_)),
399+
"expected Config error, got {err:?}"
400+
);
401+
}
402+
356403
#[tokio::test]
357404
async fn run_serves_uds_until_exit_after_startup() {
358405
let path = temp_socket_path("a2a-run");

0 commit comments

Comments
 (0)