Skip to content

Commit 20d2500

Browse files
henrypark133 and claude authored
Add startup health checks to validate inference backend before serving traffic (#24)
* Add startup health checks to validate inference backend before serving traffic. Gated behind the OpenAI chat-completion compatibility check flag, this runs three sequential checks against the backend before the proxy binds its TCP listener: model existence via /v1/models, non-streaming chat completions with tools, and streaming chat completions with tools. It validates tool-call argument JSON to catch inference engine bugs (e.g. vLLM producing malformed arguments in streaming finish chunks). It retries only on transient errors (connection refused, 503) and fails fast on validation errors. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 0f09833 commit 20d2500

6 files changed

Lines changed: 1210 additions & 6 deletions

File tree

src/config.rs

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,20 @@ pub struct Config {
6565
// Timeouts
6666
pub timeout_secs: u64,
6767
pub timeout_tokenize_secs: u64,
68+
69+
// OpenAI Chat Compatibility Checks
70+
// Validates that hosted models (qwen, glm, etc.) send OpenAI-compliant responses:
71+
// - /v1/models API format
72+
// - /v1/chat/completions with tool_calls (streaming & non-streaming)
73+
// Only enable for models serving OpenAI-compatible chat API. Disable for:
74+
// - Image generation models (FLUX, etc.)
75+
// - Embedding models
76+
// - Reranker models
77+
// - Cohere or other non-OpenAI-compliant APIs
78+
pub openai_chat_compatibility_check_enabled: bool,
79+
pub startup_check_retries: usize,
80+
pub startup_check_retry_delay_secs: u64,
81+
pub startup_check_timeout_secs: u64,
6882
}
6983

7084
impl Config {
@@ -93,7 +107,7 @@ impl Config {
93107
.map(|s| s.trim().to_string())
94108
.unwrap_or_else(|_| "unknown".to_string());
95109

96-
Ok(Config {
110+
let config = Config {
97111
model_name,
98112
token,
99113
vllm_base_url: vllm_base_url.clone(),
@@ -121,7 +135,21 @@ impl Config {
121135
rate_limit_trust_proxy_headers: !env_bool("RATE_LIMIT_NO_TRUST_PROXY"),
122136
timeout_secs: 600,
123137
timeout_tokenize_secs: 10,
124-
})
138+
openai_chat_compatibility_check_enabled: env_bool("OPENAI_CHAT_COMPATIBILITY_CHECK"),
139+
startup_check_retries: env_int("STARTUP_CHECK_RETRIES", 3),
140+
startup_check_retry_delay_secs: env_int("STARTUP_CHECK_RETRY_DELAY_SECS", 5) as u64,
141+
startup_check_timeout_secs: env_int("STARTUP_CHECK_TIMEOUT_SECS", 30) as u64,
142+
};
143+
144+
// Validate startup check configuration
145+
if config.startup_check_retries == 0 {
146+
anyhow::bail!("STARTUP_CHECK_RETRIES must be at least 1");
147+
}
148+
if config.startup_check_timeout_secs == 0 {
149+
anyhow::bail!("STARTUP_CHECK_TIMEOUT_SECS must be greater than 0");
150+
}
151+
152+
Ok(config)
125153
}
126154
}
127155

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ pub mod proxy;
1515
pub mod rate_limit;
1616
pub mod routes;
1717
pub mod signing;
18+
pub mod startup_checks;
1819
pub mod types;
1920

2021
/// Shared application state available to all handlers.

src/main.rs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ use tokio::net::TcpListener;
55
use tracing::info;
66

77
use vllm_proxy_rs::{
8-
cache, config, metrics_middleware, rate_limit, request_id_middleware, routes, signing, AppState,
8+
cache, config, metrics_middleware, rate_limit, request_id_middleware, routes, signing,
9+
startup_checks, AppState,
910
};
1011

1112
#[tokio::main]
@@ -75,6 +76,18 @@ async fn main() -> anyhow::Result<()> {
7576
metrics_handle,
7677
};
7778

79+
// Run OpenAI chat compatibility checks if enabled
80+
if state.config.openai_chat_compatibility_check_enabled {
81+
info!("OpenAI chat compatibility check enabled, verifying backend...");
82+
if let Err(e) = startup_checks::run_startup_checks(&state.http_client, &state.config).await
83+
{
84+
tracing::error!(error = %e, "OpenAI chat compatibility check failed — exiting");
85+
return Err(e.into());
86+
}
87+
} else {
88+
info!("OpenAI chat compatibility check disabled (set OPENAI_CHAT_COMPATIBILITY_CHECK=true to enable)");
89+
}
90+
7891
// Build rate limiter
7992
let rate_limiter = rate_limit::build_rate_limiter(
8093
state.config.rate_limit_per_second,

src/signing.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -465,10 +465,8 @@ mod tests {
465465
// or may panic/fail entirely — either outcome means rejection
466466
let result = std::panic::catch_unwind(|| recover_ecdsa_address(message, &corrupted_sig));
467467
if let Ok(recovered) = result {
468-
// Recovery may succeed but should yield a different address
469468
assert_ne!(recovered, ctx.signing_address);
470-
}
471-
// Err(_) = panicked during recovery = also a rejection
469+
} // Panicked during recovery (Err) = also a rejection
472470
}
473471

474472
#[test]

0 commit comments

Comments
 (0)