Add startup health checks to validate inference backend before serving traffic (#24)

henrypark133 · claude · web-flow · commit 20d25007e1e3 · 2026-02-12T20:53:55.000-08:00
* Add startup health checks to validate inference backend before serving traffic

Gated behind the OpenAI chat-completion compatibility startup check, this runs three sequential checks against the
backend before the proxy binds its TCP listener: model existence via /v1/models,
non-streaming chat completions with tools, and streaming chat completions with
tools. Validates tool call argument JSON to catch inference engine bugs (e.g.
vLLM producing malformed arguments in streaming finish chunks). Only retries
on transient errors (connection refused, 503); fails fast on validation errors.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/src/config.rs b/src/config.rs
@@ -65,6 +65,20 @@ pub struct Config {
     // Timeouts
     pub timeout_secs: u64,
     pub timeout_tokenize_secs: u64,
+
+    // OpenAI Chat Compatibility Checks
+    // Validates that hosted models (qwen, glm, etc.) send OpenAI-compliant responses:
+    // - /v1/models API format
+    // - /v1/chat/completions with tool_calls (streaming & non-streaming)
+    // Only enable for models serving OpenAI-compatible chat API. Disable for:
+    // - Image generation models (FLUX, etc.)
+    // - Embedding models
+    // - Reranker models
+    // - Cohere or other non-OpenAI-compliant APIs
+    pub openai_chat_compatibility_check_enabled: bool,
+    pub startup_check_retries: usize,
+    pub startup_check_retry_delay_secs: u64,
+    pub startup_check_timeout_secs: u64,
 }
 
 impl Config {
@@ -93,7 +107,7 @@ impl Config {
             .map(|s| s.trim().to_string())
             .unwrap_or_else(|_| "unknown".to_string());
 
-        Ok(Config {
+        let config = Config {
             model_name,
             token,
             vllm_base_url: vllm_base_url.clone(),
@@ -121,7 +135,21 @@ impl Config {
             rate_limit_trust_proxy_headers: !env_bool("RATE_LIMIT_NO_TRUST_PROXY"),
             timeout_secs: 600,
             timeout_tokenize_secs: 10,
-        })
+            openai_chat_compatibility_check_enabled: env_bool("OPENAI_CHAT_COMPATIBILITY_CHECK"),
+            startup_check_retries: env_int("STARTUP_CHECK_RETRIES", 3),
+            startup_check_retry_delay_secs: env_int("STARTUP_CHECK_RETRY_DELAY_SECS", 5) as u64,
+            startup_check_timeout_secs: env_int("STARTUP_CHECK_TIMEOUT_SECS", 30) as u64,
+        };
+
+        // Validate startup check configuration
+        if config.startup_check_retries == 0 {
+            anyhow::bail!("STARTUP_CHECK_RETRIES must be at least 1");
+        }
+        if config.startup_check_timeout_secs == 0 {
+            anyhow::bail!("STARTUP_CHECK_TIMEOUT_SECS must be greater than 0");
+        }
+
+        Ok(config)
     }
 }
 
diff --git a/src/lib.rs b/src/lib.rs
@@ -15,6 +15,7 @@ pub mod proxy;
 pub mod rate_limit;
 pub mod routes;
 pub mod signing;
+pub mod startup_checks;
 pub mod types;
 
 /// Shared application state available to all handlers.
diff --git a/src/main.rs b/src/main.rs
@@ -5,7 +5,8 @@ use tokio::net::TcpListener;
 use tracing::info;
 
 use vllm_proxy_rs::{
-    cache, config, metrics_middleware, rate_limit, request_id_middleware, routes, signing, AppState,
+    cache, config, metrics_middleware, rate_limit, request_id_middleware, routes, signing,
+    startup_checks, AppState,
 };
 
 #[tokio::main]
@@ -75,6 +76,18 @@ async fn main() -> anyhow::Result<()> {
         metrics_handle,
     };
 
+    // Run OpenAI chat compatibility checks if enabled
+    if state.config.openai_chat_compatibility_check_enabled {
+        info!("OpenAI chat compatibility check enabled, verifying backend...");
+        if let Err(e) = startup_checks::run_startup_checks(&state.http_client, &state.config).await
+        {
+            tracing::error!(error = %e, "OpenAI chat compatibility check failed — exiting");
+            return Err(e.into());
+        }
+    } else {
+        info!("OpenAI chat compatibility check disabled (set OPENAI_CHAT_COMPATIBILITY_CHECK=true to enable)");
+    }
+
     // Build rate limiter
     let rate_limiter = rate_limit::build_rate_limiter(
         state.config.rate_limit_per_second,
diff --git a/src/signing.rs b/src/signing.rs
@@ -465,10 +465,8 @@ mod tests {
         // or may panic/fail entirely — either outcome means rejection
         let result = std::panic::catch_unwind(|| recover_ecdsa_address(message, &corrupted_sig));
         if let Ok(recovered) = result {
-            // Recovery may succeed but should yield a different address
             assert_ne!(recovered, ctx.signing_address);
-        }
-        // Err(_) = panicked during recovery = also a rejection
+        } // Panicked during recovery (Err) = also a rejection
     }
 
     #[test]
diff --git a/src/startup_checks.rs b/src/startup_checks.rs
diff --git a/tests/integration.rs b/tests/integration.rs