nearai
diff --git a/‎CLAUDE.md‎
Lines changed: 7 additions & 0 deletions b/‎CLAUDE.md‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎benches/e2e.rs‎
Lines changed: 2 additions & 0 deletions b/‎benches/e2e.rs‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎docs/testing-on-cvm.md‎
Lines changed: 175 additions & 0 deletions b/‎docs/testing-on-cvm.md‎
Lines changed: 175 additions & 0 deletions
diff --git a/‎src/attestation.rs‎
Lines changed: 58 additions & 8 deletions b/‎src/attestation.rs‎
Lines changed: 58 additions & 8 deletions
diff --git a/‎src/config.rs‎
Lines changed: 23 additions & 0 deletions b/‎src/config.rs‎
Lines changed: 23 additions & 0 deletions
@@ -12,6 +12,13 @@ cargo fmt                # Format code
 
 No special env vars needed for tests — integration tests use wiremock and fixed signing keys.
 
+For changes that touch NVML, the libnvat SDK FFI, dstack TDX, or
+proxy-to-proxy contracts (e.g. `/internal/gpu_evidence`), `cargo test`
+isn't enough — see [docs/testing-on-cvm.md](docs/testing-on-cvm.md)
+for the real-CVM smoke-test recipe (build a branch image with
+`gh workflow run build.yml --ref <branch>`, deploy a 2-proxy stack
+inside a gpu0X CVM, probe both happy and leader-down paths).
+
 ## Architecture
 
 This is a Rust rewrite of [nearai/vllm-proxy](https://github.com/nearai/vllm-proxy). It proxies OpenAI-compatible API requests to a vLLM/sglang backend, adding cryptographic signing and TEE attestation.
 
@@ -84,6 +84,8 @@ fn build_test_app(mock_url: &str) -> axum::Router {
         ohttp_enabled: false,
         listen_port: 8000,
         dstack_socket_path: "/var/run/dstack.sock".to_string(),
+        gpu_evidence_delegate_url: None,
+        gpu_evidence_delegate_timeout_secs: 30,
     };
 
     let ecdsa = signing::EcdsaContext::from_key_bytes(&ECDSA_KEY).unwrap();
 
@@ -0,0 +1,175 @@
+# Testing inference-proxy on a real CVM
+
+`cargo test` covers the unit + integration suites with wiremock-mocked
+upstreams. Changes that touch NVML, dstack TDX, the SDK FFI, or a
+proxy-to-proxy contract need a real CVM to validate. This doc is the recipe.
+
+## When to do this
+
+- Changes to `attestation.rs` GPU evidence dispatch
+- Changes to the libnvat SDK call (`attestation_sdk.rs`) or its mutex
+- New env vars that gate which evidence path is taken
+- Anything that adds a new HTTP endpoint to a proxy-to-proxy contract
+  (e.g. `/internal/gpu_evidence`)
+
+If you're only changing pure-Rust logic with no FFI / dstack / NVIDIA
+surface, `cargo test` is enough.
+
+## Where to run
+
+Use a spare GPU CVM with `USE_NV_ATTESTATION_SDK=true` and a working
+`/var/run/dstack.sock`. As of 2026-05-08 that's any `gpu0X` host. Pick
+one that isn't load-bearing — gpu07 is the usual canary. The tester is
+responsible for not disturbing whatever production model is already
+running on the host.
+
+## Build a branch image
+
+The `Build & Deploy` workflow only auto-fires on `main` and tags. For
+branch testing, dispatch it manually:
+
+```bash
+gh workflow run build.yml --ref <branch> --repo nearai/inference-proxy
+gh run list --workflow=build.yml --branch <branch> --repo nearai/inference-proxy --limit 1
+```
+
+The build tags the image `:dev` (shared by all non-main branches — pin
+by digest in your test compose, not by tag) and prints the digest in
+the run log (`IMAGE_DIGEST: sha256:...`).
+
+## 2-proxy delegate smoke test
+
+Validates `GPU_EVIDENCE_DELEGATE_URL` end-to-end: one leader proxy
+owns NVML, the other delegates. Created for [PR #122][pr122].
+
+[pr122]: https://github.com/nearai/inference-proxy/pull/122
+
+### Compose file
+
+```yaml
+# test-delegate.yaml
+x-nvidia: &nvidia
+  runtime: nvidia
+  ipc: host
+  deploy:
+    resources:
+      reservations:
+        devices:
+          - driver: nvidia
+            count: all
+            capabilities: [gpu]
+
+services:
+  delegate-leader:
+    <<: *nvidia
+    image: ${PROXY_IMAGE}
+    container_name: delegate-leader
+    user: root
+    privileged: true
+    ports:
+      - "127.0.0.1:18001:8000"   # CVM-loopback only, no host exposure
+    volumes:
+      - /var/run/dstack.sock:/var/run/dstack.sock
+    environment:
+      - MODEL_NAME=zai-org/GLM-5-FP8
+      - TOKEN=${PROXY_TOKEN}
+      - VLLM_BASE_URL=http://glm:8000
+      - USE_NV_ATTESTATION_SDK=true
+      - LOG_FORMAT=json
+      - OPENAI_CHAT_COMPATIBILITY_CHECK=false   # don't gate on upstream
+    restart: "no"
+
+  delegate-follower:
+    <<: *nvidia
+    image: ${PROXY_IMAGE}
+    container_name: delegate-follower
+    user: root
+    privileged: true
+    ports:
+      - "127.0.0.1:18002:8000"
+    volumes:
+      - /var/run/dstack.sock:/var/run/dstack.sock
+    environment:
+      - MODEL_NAME=zai-org/GLM-5-FP8
+      - TOKEN=${PROXY_TOKEN}
+      - VLLM_BASE_URL=http://glm:8000
+      - USE_NV_ATTESTATION_SDK=true
+      - GPU_EVIDENCE_DELEGATE_URL=http://delegate-leader:8000
+      - LOG_FORMAT=json
+      - OPENAI_CHAT_COMPATIBILITY_CHECK=false
+    depends_on:
+      - delegate-leader
+    restart: "no"
+```
+
+`MODEL_NAME` is just a label here — neither proxy serves real
+inference in this test, so set it to whatever the running model on
+the host is so logs aren't confusing. `VLLM_BASE_URL` only matters if
+you flip `OPENAI_CHAT_COMPATIBILITY_CHECK=true`.
+
+### Deploy and probe
+
+CVM access on gpu0X is via host jump: `ssh gpu0X` then
+`ssh -p 10022 root@localhost`. The CVM's `/tmp` is writable; `/root`
+is not.
+
+```bash
+# scp the file in (two-hop)
+scp test-delegate.yaml gpu07:/tmp/
+ssh gpu07 'scp -P 10022 /tmp/test-delegate.yaml root@localhost:/tmp/'
+
+# run inside the CVM
+ssh gpu07 'ssh -p 10022 root@localhost' <<'CVM'
+mkdir -p /tmp/deltest && cd /tmp/deltest && mv /tmp/test-delegate.yaml .
+PROXY_IMAGE='nearaidev/vllm-proxy-rs@sha256:<digest from build run>' \
+PROXY_TOKEN=delegate-test-token-1234 \
+docker compose -f test-delegate.yaml -p deltest up -d
+CVM
+```
+
+### What to verify
+
+```bash
+# happy path — fresh nonce, leader up
+NONCE=$(openssl rand -hex 32)
+curl -w "code=%{http_code} t=%{time_total}\n" -o /tmp/r.json \
+  "http://127.0.0.1:18002/v1/attestation/report?signing_algo=ed25519&nonce=$NONCE"
+# expect: 200, ~290 KB body, request_nonce matches
+
+# loop-guard / dependency proof — fresh nonce, leader DOWN
+docker stop delegate-leader
+NONCE=$(openssl rand -hex 32)
+curl -w "code=%{http_code} t=%{time_total}\n" \
+  "http://127.0.0.1:18002/v1/attestation/report?signing_algo=ed25519&nonce=$NONCE"
+# expect: 500
+# follower logs: "delegate request to http://delegate-leader:8000/internal/gpu_evidence failed"
+
+# isolation check — leader's logs should have all libnvat output
+docker logs delegate-leader 2>&1 | grep '\[nvat\]' | head    # many lines
+docker logs delegate-follower 2>&1 | grep '\[nvat\]' | head  # zero lines
+```
+
+### Tear down
+
+```bash
+docker compose -f test-delegate.yaml -p deltest down -v
+rm -rf /tmp/deltest
+```
+
+Then on the host: confirm `docker ps --filter name=delegate` is empty
+and the production model (e.g. `glm51`, `qwen3-vl`) shows
+`RestartCount=0` in `docker inspect`.
+
+## CVM gotchas
+
+- The dstack OS is busybox, not Ubuntu. `head -c`, `head -3` etc.
+  don't work — use `dd if=… bs=N count=1` for byte-cap reads.
+- `/root` is read-only at SSH level; use `/tmp/<subdir>` for any test
+  artifacts.
+- `python3` is absent — use `jq` for JSON inspection.
+- `--gpus all` is fine for read-only NVML access; you don't need to
+  unplug the running model. The whole point of PR #122 is that
+  multiple proxies CAN share GPUs as long as only one talks to NVML.
+- The `:dev` image tag is shared across all non-main branches. **Pin
+  by digest** in test compose files so a parallel branch build can't
+  swap the image under you.
@@ -433,6 +433,15 @@ pub struct ComposeManagerConfig {
     pub url: String,
 }
 
+/// Owned counterpart of `DelegateContext` for the background cache refresh
+/// task, which cannot borrow the request-scoped `&Config`/`&Client`. Owns an
+/// `Arc<Config>` and a `reqwest::Client` so the spawned task is `'static`;
+/// each refresh borrows a short-lived `DelegateContext` from these fields.
+pub struct DelegateRefreshConfig {
+    pub config: Arc<crate::config::Config>,
+    pub http_client: reqwest::Client,
+}
+
 /// Build OHTTP attestation payload for the process-wide OHTTP gateway config.
 pub fn build_ohttp_attestation(
     signing: &crate::signing::SigningPair,
@@ -462,6 +471,7 @@ pub fn spawn_cache_refresh_task(
     refresh_interval_secs: u64,
     compose_manager: Option<ComposeManagerConfig>,
     ohttp_attestation_ed25519: Option<crate::types::OhttpAttestation>,
+    delegate_refresh: Option<DelegateRefreshConfig>,
 ) {
     tokio::spawn(async move {
         // Initial delay to let the server start up.
@@ -492,6 +502,10 @@ pub fn spawn_cache_refresh_task(
 
                 // Refresh without TLS fingerprint (most common).
                 // GPU evidence serialization is handled by the worker Mutex.
+                let delegate_ctx = delegate_refresh.as_ref().map(|d| DelegateContext {
+                    config: &d.config,
+                    http_client: &d.http_client,
+                });
                 match generate_attestation_inner(
                     AttestationParams {
                         model_name: &model_name,
@@ -504,6 +518,7 @@ pub fn spawn_cache_refresh_task(
                         tls_cert_fingerprint: None,
                     },
                     Some(&cache),
+                    delegate_ctx.as_ref(),
                 )
                 .await
                 {
@@ -526,6 +541,10 @@ pub fn spawn_cache_refresh_task(
 
                 // Also refresh with TLS fingerprint if configured.
                 if let Some(ref fp) = tls_cert_fingerprint {
+                    let delegate_ctx = delegate_refresh.as_ref().map(|d| DelegateContext {
+                        config: &d.config,
+                        http_client: &d.http_client,
+                    });
                     match generate_attestation_inner(
                         AttestationParams {
                             model_name: &model_name,
@@ -538,6 +557,7 @@ pub fn spawn_cache_refresh_task(
                             tls_cert_fingerprint: Some(fp.as_str()),
                         },
                         Some(&cache),
+                        delegate_ctx.as_ref(),
                     )
                     .await
                     {
@@ -828,6 +848,17 @@ pub struct AttestationParams<'a> {
     pub tls_cert_fingerprint: Option<&'a str>,
 }
 
+/// Borrowed context the delegate-dispatch path needs at the call site.
+///
+/// Carries the resolved `Config` (for the delegate URL/timeout/auth
+/// token) and the shared `reqwest::Client` we use across the proxy.
+/// Borrows from the caller's `AppState` (or from `DelegateRefreshConfig`
+/// in the refresh task) so we don't clone the client per request.
+pub struct DelegateContext<'a> {
+    pub config: &'a crate::config::Config,
+    pub http_client: &'a reqwest::Client,
+}
+
 /// Maximum attempts for `collect_gpu_evidence_with_nonce_check`.
 ///
 /// 4 attempts (1 initial + 3 retries) with exponential backoff between
@@ -954,11 +985,12 @@ fn check_evidence_nonce_binding(
 /// Failures (transport errors, repeated nonce mismatches) bubble up so
 /// cloud-api can rotate to a different backend instead of submitting
 /// known-bad evidence to NRAS.
-async fn collect_gpu_evidence_with_nonce_check(
+pub(crate) async fn collect_gpu_evidence_with_nonce_check(
     nonce_hex: &str,
     nonce_bytes: &[u8; 32],
     gpu_no_hw_mode: bool,
     cache: Option<&AttestationCache>,
+    delegate_ctx: Option<&DelegateContext<'_>>,
 ) -> anyhow::Result<serde_json::Value> {
     let mut last_failure: Option<NonceMismatch> = None;
 
@@ -969,13 +1001,28 @@ async fn collect_gpu_evidence_with_nonce_check(
             tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await;
         }
 
-        // Three backends, in priority order:
-        //   1. nv-attestation-sdk (Rust → C FFI, opt-in via env var)
-        //   2. cache's persistent Python worker (existing default)
-        //   3. one-shot Python subprocess (fallback when no cache)
+        // Four backends, in priority order:
+        //   1. delegate proxy (HTTP, opt-in via GPU_EVIDENCE_DELEGATE_URL)
+        //      — used to serialize NVML across multiple proxies sharing a
+        //      host. Only the delegate touches local NVML.
+        //   2. nv-attestation-sdk (Rust → C FFI, opt-in via env var)
+        //   3. cache's persistent Python worker (existing default)
+        //   4. one-shot Python subprocess (fallback when no cache)
         // The self-check + retry below applies regardless of which one
-        // produced the evidence.
-        let evidence = if crate::attestation_sdk::is_active() && !gpu_no_hw_mode {
+        // produced the evidence — including evidence returned by the
+        // delegate (defense in depth, plus catches the rare "delegate
+        // returned 200 but with bytes from a different request").
+        let evidence = if let Some(dctx) = delegate_ctx.filter(|_| !gpu_no_hw_mode) {
+            // Delegate path. `gpu_no_hw_mode` doesn't make sense across
+            // an HTTP hop; fall through to local paths if it's set.
+            crate::gpu_evidence_delegate::collect_via_delegate(
+                dctx.config,
+                dctx.http_client,
+                nonce_hex,
+                gpu_no_hw_mode,
+            )
+            .await?
+        } else if crate::attestation_sdk::is_active() && !gpu_no_hw_mode {
             // SDK path doesn't support no_gpu_mode (it requires real
             // hardware via NVML); fall back to the Python paths for
             // dev/test environments without GPUs.
@@ -1030,6 +1077,7 @@ async fn collect_gpu_evidence_with_nonce_check(
 async fn generate_attestation_inner(
     params: AttestationParams<'_>,
     cache: Option<&AttestationCache>,
+    delegate_ctx: Option<&DelegateContext<'_>>,
 ) -> Result<AttestationReport, AttestationError> {
     let nonce_bytes = parse_nonce(params.nonce)?;
     let nonce_hex = hex::encode(nonce_bytes);
@@ -1068,6 +1116,7 @@ async fn generate_attestation_inner(
                 &nonce_bytes_for_verify,
                 gpu_no_hw_mode,
                 cache,
+                delegate_ctx,
             )
             .await
             .map_err(AttestationError::Internal)
@@ -1125,6 +1174,7 @@ pub enum AttestationResult {
 pub async fn generate_attestation(
     params: AttestationParams<'_>,
     cache: Option<&AttestationCache>,
+    delegate_ctx: Option<&DelegateContext<'_>>,
 ) -> Result<AttestationResult, AttestationError> {
     let is_nonceless = params.nonce.is_none();
     let include_tls = params.tls_cert_fingerprint.is_some();
@@ -1141,7 +1191,7 @@ pub async fn generate_attestation(
 
     // Generate fresh report. GPU evidence is serialized by the worker Mutex,
     // but TDX quote runs concurrently.
-    let report = generate_attestation_inner(params, cache).await?;
+    let report = generate_attestation_inner(params, cache, delegate_ctx).await?;
     // Don't cache here — the caller (route handler) caches after fetching
     // compose-manager attestation so cached responses include the full chain.
 
 
@@ -93,6 +93,23 @@ pub struct Config {
     // Compose-manager attestation (deployment actions attestation)
     pub compose_manager_url: Option<String>,
 
+    // GPU evidence delegation (host-level NVML serialization)
+    /// HTTP base URL of another inference-proxy on the same host that
+    /// owns NVML evidence collection (e.g. `http://vllm-proxy-leader:8000`).
+    /// When set, this proxy forwards GPU evidence requests to the
+    /// delegate's `POST /internal/gpu_evidence` endpoint instead of
+    /// calling NVML locally. The intent is to serialize NVML access
+    /// across the *host*, not just within one process — multiple
+    /// inference-proxy instances sharing the same physical GPUs were
+    /// observed to race at the firmware level (see #107). When unset,
+    /// the proxy collects evidence locally via the SDK or Python path.
+    pub gpu_evidence_delegate_url: Option<String>,
+    /// Per-attempt timeout, in seconds, for the delegate HTTP call.
+    /// Default 30s — the delegate's own evidence collection plus its
+    /// NVML wait dominate the latency, so leave enough headroom that
+    /// contended load doesn't surface as timeouts.
+    pub gpu_evidence_delegate_timeout_secs: u64,
+
     // OpenAI Chat Compatibility Checks
     // Validates that hosted models (qwen, glm, etc.) send OpenAI-compliant responses:
     // - /v1/models API format
@@ -249,6 +266,12 @@ impl Config {
                 as u64,
             cloud_api_auth_timeout_secs: env_int("CLOUD_API_AUTH_TIMEOUT_SECS", 5) as u64,
             compose_manager_url,
+            gpu_evidence_delegate_url: env::var("GPU_EVIDENCE_DELEGATE_URL")
+                .ok()
+                .filter(|s| !s.is_empty())
+                .map(|s| s.trim_end_matches('/').to_string()),
+            gpu_evidence_delegate_timeout_secs: env_int("GPU_EVIDENCE_DELEGATE_TIMEOUT_SECS", 30)
+                as u64,
             tls_cert_path,
             max_keepalive: env_int("VLLM_PROXY_MAX_KEEPALIVE", 100),
             pool_idle_timeout_secs: env_int("VLLM_PROXY_POOL_IDLE_TIMEOUT_SECS", 60) as u64,