@@ -433,6 +433,15 @@ pub struct ComposeManagerConfig {
433433 pub url : String ,
434434}
435435
436+ /// Owned-lifetime version of `DelegateContext` used by the background
437+ /// cache refresh task (which doesn't have access to the request-scoped
438+ /// `&Config`/`&Client`). Holds a clone of the `reqwest::Client` and an
439+ /// `Arc<Config>` so the spawned task is `'static`.
440+ pub struct DelegateRefreshConfig {
441+ pub config : Arc < crate :: config:: Config > ,
442+ pub http_client : reqwest:: Client ,
443+ }
444+
436445/// Build OHTTP attestation payload for the process-wide OHTTP gateway config.
437446pub fn build_ohttp_attestation (
438447 signing : & crate :: signing:: SigningPair ,
@@ -462,6 +471,7 @@ pub fn spawn_cache_refresh_task(
462471 refresh_interval_secs : u64 ,
463472 compose_manager : Option < ComposeManagerConfig > ,
464473 ohttp_attestation_ed25519 : Option < crate :: types:: OhttpAttestation > ,
474+ delegate_refresh : Option < DelegateRefreshConfig > ,
465475) {
466476 tokio:: spawn ( async move {
467477 // Initial delay to let the server start up.
@@ -492,6 +502,10 @@ pub fn spawn_cache_refresh_task(
492502
493503 // Refresh without TLS fingerprint (most common).
494504 // GPU evidence serialization is handled by the worker Mutex.
505+ let delegate_ctx = delegate_refresh. as_ref ( ) . map ( |d| DelegateContext {
506+ config : & d. config ,
507+ http_client : & d. http_client ,
508+ } ) ;
495509 match generate_attestation_inner (
496510 AttestationParams {
497511 model_name : & model_name,
@@ -504,6 +518,7 @@ pub fn spawn_cache_refresh_task(
504518 tls_cert_fingerprint : None ,
505519 } ,
506520 Some ( & cache) ,
521+ delegate_ctx. as_ref ( ) ,
507522 )
508523 . await
509524 {
@@ -526,6 +541,10 @@ pub fn spawn_cache_refresh_task(
526541
527542 // Also refresh with TLS fingerprint if configured.
528543 if let Some ( ref fp) = tls_cert_fingerprint {
544+ let delegate_ctx = delegate_refresh. as_ref ( ) . map ( |d| DelegateContext {
545+ config : & d. config ,
546+ http_client : & d. http_client ,
547+ } ) ;
529548 match generate_attestation_inner (
530549 AttestationParams {
531550 model_name : & model_name,
@@ -538,6 +557,7 @@ pub fn spawn_cache_refresh_task(
538557 tls_cert_fingerprint : Some ( fp. as_str ( ) ) ,
539558 } ,
540559 Some ( & cache) ,
560+ delegate_ctx. as_ref ( ) ,
541561 )
542562 . await
543563 {
@@ -828,6 +848,17 @@ pub struct AttestationParams<'a> {
828848 pub tls_cert_fingerprint : Option < & ' a str > ,
829849}
830850
851+ /// Context the delegate-dispatch path needs at the call site.
852+ ///
853+ /// Carries the resolved `Config` (for the delegate URL/timeout/auth
854+ /// token) and the shared `reqwest::Client` we use across the proxy.
855+ /// Lifetime-bound to the caller's `AppState` so we don't clone the
856+ /// client per request.
857+ pub struct DelegateContext < ' a > {
858+ pub config : & ' a crate :: config:: Config ,
859+ pub http_client : & ' a reqwest:: Client ,
860+ }
861+
831862/// Maximum attempts for `collect_gpu_evidence_with_nonce_check`.
832863///
833864/// 4 attempts (1 initial + 3 retries) with exponential backoff between
@@ -954,11 +985,12 @@ fn check_evidence_nonce_binding(
954985/// Failures (transport errors, repeated nonce mismatches) bubble up so
955986/// cloud-api can rotate to a different backend instead of submitting
956987/// known-bad evidence to NRAS.
957- async fn collect_gpu_evidence_with_nonce_check (
988+ pub ( crate ) async fn collect_gpu_evidence_with_nonce_check (
958989 nonce_hex : & str ,
959990 nonce_bytes : & [ u8 ; 32 ] ,
960991 gpu_no_hw_mode : bool ,
961992 cache : Option < & AttestationCache > ,
993+ delegate_ctx : Option < & DelegateContext < ' _ > > ,
962994) -> anyhow:: Result < serde_json:: Value > {
963995 let mut last_failure: Option < NonceMismatch > = None ;
964996
@@ -969,13 +1001,28 @@ async fn collect_gpu_evidence_with_nonce_check(
9691001 tokio:: time:: sleep ( std:: time:: Duration :: from_millis ( delay_ms) ) . await ;
9701002 }
9711003
972- // Three backends, in priority order:
973- // 1. nv-attestation-sdk (Rust → C FFI, opt-in via env var)
974- // 2. cache's persistent Python worker (existing default)
975- // 3. one-shot Python subprocess (fallback when no cache)
1004+ // Four backends, in priority order:
1005+ // 1. delegate proxy (HTTP, opt-in via GPU_EVIDENCE_DELEGATE_URL)
1006+ // — used to serialize NVML across multiple proxies sharing a
1007+ // host. Only the delegate touches local NVML.
1008+ // 2. nv-attestation-sdk (Rust → C FFI, opt-in via env var)
1009+ // 3. cache's persistent Python worker (existing default)
1010+ // 4. one-shot Python subprocess (fallback when no cache)
9761011 // The self-check + retry below applies regardless of which one
977- // produced the evidence.
978- let evidence = if crate :: attestation_sdk:: is_active ( ) && !gpu_no_hw_mode {
1012+ // produced the evidence — including evidence returned by the
1013+ // delegate (defense in depth, plus catches the rare "delegate
1014+ // returned 200 but with bytes from a different request").
1015+ let evidence = if let Some ( dctx) = delegate_ctx. filter ( |_| !gpu_no_hw_mode) {
1016+ // Delegate path. `gpu_no_hw_mode` doesn't make sense across
1017+ // an HTTP hop; fall through to local paths if it's set.
1018+ crate :: gpu_evidence_delegate:: collect_via_delegate (
1019+ dctx. config ,
1020+ dctx. http_client ,
1021+ nonce_hex,
1022+ gpu_no_hw_mode,
1023+ )
1024+ . await ?
1025+ } else if crate :: attestation_sdk:: is_active ( ) && !gpu_no_hw_mode {
9791026 // SDK path doesn't support no_gpu_mode (it requires real
9801027 // hardware via NVML); fall back to the Python paths for
9811028 // dev/test environments without GPUs.
@@ -1030,6 +1077,7 @@ async fn collect_gpu_evidence_with_nonce_check(
10301077async fn generate_attestation_inner (
10311078 params : AttestationParams < ' _ > ,
10321079 cache : Option < & AttestationCache > ,
1080+ delegate_ctx : Option < & DelegateContext < ' _ > > ,
10331081) -> Result < AttestationReport , AttestationError > {
10341082 let nonce_bytes = parse_nonce ( params. nonce ) ?;
10351083 let nonce_hex = hex:: encode ( nonce_bytes) ;
@@ -1068,6 +1116,7 @@ async fn generate_attestation_inner(
10681116 & nonce_bytes_for_verify,
10691117 gpu_no_hw_mode,
10701118 cache,
1119+ delegate_ctx,
10711120 )
10721121 . await
10731122 . map_err( AttestationError :: Internal )
@@ -1125,6 +1174,7 @@ pub enum AttestationResult {
11251174pub async fn generate_attestation (
11261175 params : AttestationParams < ' _ > ,
11271176 cache : Option < & AttestationCache > ,
1177+ delegate_ctx : Option < & DelegateContext < ' _ > > ,
11281178) -> Result < AttestationResult , AttestationError > {
11291179 let is_nonceless = params. nonce . is_none ( ) ;
11301180 let include_tls = params. tls_cert_fingerprint . is_some ( ) ;
@@ -1141,7 +1191,7 @@ pub async fn generate_attestation(
11411191
11421192 // Generate fresh report. GPU evidence is serialized by the worker Mutex,
11431193 // but TDX quote runs concurrently.
1144- let report = generate_attestation_inner ( params, cache) . await ?;
1194+ let report = generate_attestation_inner ( params, cache, delegate_ctx ) . await ?;
11451195 // Don't cache here — the caller (route handler) caches after fetching
11461196 // compose-manager attestation so cached responses include the full chain.
11471197
0 commit comments