techlab-innov
diff --git a/‎Cargo.lock‎
Lines changed: 1 addition & 0 deletions b/‎Cargo.lock‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎crates/llmtrace-proxy/Cargo.toml‎
Lines changed: 1 addition & 1 deletion b/‎crates/llmtrace-proxy/Cargo.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎crates/llmtrace-proxy/src/config.rs‎
Lines changed: 60 additions & 0 deletions b/‎crates/llmtrace-proxy/src/config.rs‎
Lines changed: 60 additions & 0 deletions
diff --git a/‎crates/llmtrace-proxy/src/main.rs‎
Lines changed: 145 additions & 0 deletions b/‎crates/llmtrace-proxy/src/main.rs‎
Lines changed: 145 additions & 0 deletions
diff --git a/‎deployments/basilica/README.md‎
Lines changed: 51 additions & 0 deletions b/‎deployments/basilica/README.md‎
Lines changed: 51 additions & 0 deletions
diff --git a/‎deployments/basilica/cli.py‎
Lines changed: 20 additions & 0 deletions b/‎deployments/basilica/cli.py‎
Lines changed: 20 additions & 0 deletions
@@ -42,7 +42,7 @@ hex = "0.4"
 sha2 = "0.10"
 rand = "0.8"
 axum = "0.7"
-tower-http = { version = "0.6", features = ["cors"] }
+tower-http = { version = "0.6", features = ["cors", "limit"] }
 hyper = { version = "1", features = ["full"] }
 reqwest = { version = "0.12", default-features = false, features = ["json", "stream", "rustls-tls"] }
 serde_yaml = "0.9"
 
@@ -30,7 +30,15 @@ pub fn load_config(path: &Path) -> anyhow::Result<ProxyConfig> {
 /// - `LLMTRACE_CLICKHOUSE_DATABASE` → `storage.clickhouse_database`
 /// - `LLMTRACE_POSTGRES_URL` → `storage.postgres_url`
 /// - `LLMTRACE_REDIS_URL` → `storage.redis_url`
+/// - `LLMTRACE_AUTH_ENABLED` → `auth.enabled`
+/// - `LLMTRACE_AUTH_ADMIN_KEY` → `auth.admin_key`
+/// - `LLMTRACE_RATE_LIMIT_RPS` → `rate_limiting.requests_per_second`
+/// - `LLMTRACE_RATE_LIMIT_BURST` → `rate_limiting.burst_size`
 /// - `LLMTRACE_ML_MAX_CONCURRENT` → `ml_pipeline.max_concurrent_requests`
+///
+/// Rate-limit overrides only take effect when the value parses as a
+/// `u32` greater than zero. Unparseable or zero values are ignored, leaving
+/// the loaded YAML value (or [`RateLimitConfig::default`]) in place.
 pub fn apply_env_overrides(config: &mut ProxyConfig) {
     if let Ok(val) = std::env::var("LLMTRACE_LISTEN_ADDR") {
         config.listen_addr = val;
@@ -63,6 +71,12 @@ pub fn apply_env_overrides(config: &mut ProxyConfig) {
     if let Ok(val) = std::env::var("LLMTRACE_AUTH_ADMIN_KEY") {
         config.auth.admin_key = Some(val);
     }
+    if let Some(rps) = parse_positive_u32("LLMTRACE_RATE_LIMIT_RPS") {
+        config.rate_limiting.requests_per_second = rps;
+    }
+    if let Some(burst) = parse_positive_u32("LLMTRACE_RATE_LIMIT_BURST") {
+        config.rate_limiting.burst_size = burst;
+    }
     if let Ok(val) = std::env::var("LLMTRACE_ML_MAX_CONCURRENT") {
         match val.parse::<usize>() {
             Ok(n) if n > 0 => config.ml_pipeline.max_concurrent_requests = n,
@@ -74,6 +88,14 @@ pub fn apply_env_overrides(config: &mut ProxyConfig) {
     }
 }
 
+/// Read the env var `name` and interpret it as a strictly positive `u32`.
+///
+/// Yields `None` when the variable cannot be read, or when its trimmed
+/// contents are empty, not a `u32`, or zero, so the caller can keep the
+/// value that is already in the loaded configuration.
+fn parse_positive_u32(name: &str) -> Option<u32> {
+    match std::env::var(name) {
+        Ok(text) => match text.trim().parse::<u32>() {
+            // Zero is treated exactly like a parse failure.
+            Ok(0) | Err(_) => None,
+            Ok(value) => Some(value),
+        },
+        Err(_) => None,
+    }
+}
+
 /// Validate a [`ProxyConfig`] for common configuration errors.
 ///
 /// Returns `Ok(())` when valid, or an error listing all detected issues.
@@ -345,6 +367,44 @@ health_check:
         std::env::remove_var("LLMTRACE_STORAGE_DATABASE_PATH");
     }
 
+    /// Serialises tests that mutate the same rate-limit env vars so they
+    /// do not race when cargo runs them in parallel on separate threads.
+    static RATE_LIMIT_ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
+
+    /// Valid positive values in `LLMTRACE_RATE_LIMIT_RPS` and
+    /// `LLMTRACE_RATE_LIMIT_BURST` must replace the configured limits.
+    #[test]
+    fn test_apply_env_overrides_rate_limit_rps_and_burst() {
+        // Recover the guard from a poisoned lock: a failed assertion in
+        // another test holding this mutex must not cascade into an
+        // unrelated `PoisonError` panic here.
+        let _guard = RATE_LIMIT_ENV_LOCK
+            .lock()
+            .unwrap_or_else(std::sync::PoisonError::into_inner);
+        let mut config = ProxyConfig::default();
+        std::env::set_var("LLMTRACE_RATE_LIMIT_RPS", "250");
+        std::env::set_var("LLMTRACE_RATE_LIMIT_BURST", "500");
+        apply_env_overrides(&mut config);
+        // Clean up before asserting so a failed assertion cannot leak the
+        // variables into later tests.
+        std::env::remove_var("LLMTRACE_RATE_LIMIT_RPS");
+        std::env::remove_var("LLMTRACE_RATE_LIMIT_BURST");
+        assert_eq!(config.rate_limiting.requests_per_second, 250);
+        assert_eq!(config.rate_limiting.burst_size, 500);
+    }
+
+    /// Unparseable and zero rate-limit env values must leave the
+    /// configuration exactly as it was before the overrides ran.
+    #[test]
+    fn test_apply_env_overrides_rate_limit_ignores_invalid() {
+        // Recover the guard from a poisoned lock: a failed assertion in
+        // another test holding this mutex must not cascade into an
+        // unrelated `PoisonError` panic here.
+        let _guard = RATE_LIMIT_ENV_LOCK
+            .lock()
+            .unwrap_or_else(std::sync::PoisonError::into_inner);
+        let baseline = ProxyConfig::default();
+        let mut config = ProxyConfig::default();
+        std::env::set_var("LLMTRACE_RATE_LIMIT_RPS", "not-a-number");
+        std::env::set_var("LLMTRACE_RATE_LIMIT_BURST", "0");
+        apply_env_overrides(&mut config);
+        // Clean up before asserting so a failed assertion cannot leak the
+        // variables into later tests.
+        std::env::remove_var("LLMTRACE_RATE_LIMIT_RPS");
+        std::env::remove_var("LLMTRACE_RATE_LIMIT_BURST");
+        // Invalid / zero values must leave the loaded config untouched.
+        assert_eq!(
+            config.rate_limiting.requests_per_second,
+            baseline.rate_limiting.requests_per_second
+        );
+        assert_eq!(
+            config.rate_limiting.burst_size,
+            baseline.rate_limiting.burst_size
+        );
+    }
+
     #[test]
     fn test_validate_config_valid() {
         let config = ProxyConfig::default();
 
@@ -23,6 +23,7 @@ use llmtrace_storage::StorageProfile;
 use std::path::PathBuf;
 use std::sync::Arc;
 use tower_http::cors::{Any, CorsLayer};
+use tower_http::limit::RequestBodyLimitLayer;
 use tracing::info;
 use utoipa::OpenApi;
 use utoipa_swagger_ui::SwaggerUi;
@@ -246,6 +247,9 @@ async fn run_proxy(
         storage_profile = %config.storage.profile,
         shutdown_timeout_seconds = config.shutdown.timeout_seconds,
         runtime_overlay_path = ?runtime_overlay_path,
+        max_request_bytes = resolve_max_request_bytes(),
+        rate_limit_rps = config.rate_limiting.requests_per_second,
+        rate_limit_burst = config.rate_limiting.burst_size,
         "Starting LLMTrace proxy server"
     );
 
@@ -952,12 +956,36 @@ async fn build_security_analyzer(
     }
 }
 
+/// Default request body cap: 1 MiB (`1024 * 1024` bytes).
+///
+/// The proxy enforces this hard cap before any handler is invoked so a
+/// single oversized payload cannot drive downstream ML detectors or the
+/// trace pipeline arbitrarily hard. Configurable per-deployment via
+/// the `LLMTRACE_MAX_REQUEST_BYTES` env var.
+const DEFAULT_MAX_REQUEST_BYTES: usize = 1024 * 1024;
+
+/// Determine the request body cap, in bytes, from the environment.
+///
+/// Reads `LLMTRACE_MAX_REQUEST_BYTES`. When the variable cannot be read,
+/// or its trimmed contents are empty, not a `usize`, or zero, the result
+/// is [`DEFAULT_MAX_REQUEST_BYTES`].
+fn resolve_max_request_bytes() -> usize {
+    std::env::var("LLMTRACE_MAX_REQUEST_BYTES")
+        .ok()
+        .and_then(|text| text.trim().parse::<usize>().ok())
+        // A zero cap is treated as invalid and falls through to the default.
+        .filter(|&limit| limit > 0)
+        .unwrap_or(DEFAULT_MAX_REQUEST_BYTES)
+}
+
 /// Build the axum [`Router`] with all routes.
 fn build_router(state: Arc<AppState>) -> Router {
     let cors = CorsLayer::new()
         .allow_origin(Any)
         .allow_methods(Any)
         .allow_headers(Any);
+    let body_cap = resolve_max_request_bytes();
 
     let router = Router::new()
         // OpenAPI / Swagger UI (served by the proxy itself, not forwarded upstream).
@@ -1096,6 +1124,10 @@ fn build_router(state: Arc<AppState>) -> Router {
             llmtrace_proxy::auth::auth_middleware,
         ))
         .layer(cors)
+        // Hard request body cap. Rejects oversized requests with HTTP 413
+        // before any handler executes, protecting downstream detectors and
+        // the trace pipeline from arbitrarily large payloads.
+        .layer(RequestBodyLimitLayer::new(body_cap))
         .with_state(state)
 }
 
@@ -1331,6 +1363,119 @@ mod tests {
         assert_eq!(config.logging.format, "json");
     }
 
+    /// Mutex serialising tests that mutate process env (env vars are
+    /// per-process, not per-test, so unguarded parallel mutation races).
+    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
+
+    /// With `LLMTRACE_MAX_REQUEST_BYTES` unset the default cap applies.
+    #[test]
+    fn test_resolve_max_request_bytes_defaults_when_unset() {
+        // Recover the guard from a poisoned lock so one failing env test
+        // does not make this one panic with an unrelated `PoisonError`.
+        let _guard = ENV_LOCK
+            .lock()
+            .unwrap_or_else(std::sync::PoisonError::into_inner);
+        std::env::remove_var("LLMTRACE_MAX_REQUEST_BYTES");
+        assert_eq!(resolve_max_request_bytes(), DEFAULT_MAX_REQUEST_BYTES);
+    }
+
+    /// A positive integer in `LLMTRACE_MAX_REQUEST_BYTES` is used as-is.
+    #[test]
+    fn test_resolve_max_request_bytes_parses_valid_value() {
+        // Recover the guard from a poisoned lock so one failing env test
+        // does not make this one panic with an unrelated `PoisonError`.
+        let _guard = ENV_LOCK
+            .lock()
+            .unwrap_or_else(std::sync::PoisonError::into_inner);
+        std::env::set_var("LLMTRACE_MAX_REQUEST_BYTES", "2048");
+        let cap = resolve_max_request_bytes();
+        // Clean up before asserting so a failure cannot leak the variable.
+        std::env::remove_var("LLMTRACE_MAX_REQUEST_BYTES");
+        assert_eq!(cap, 2048);
+    }
+
+    /// A non-numeric `LLMTRACE_MAX_REQUEST_BYTES` yields the default cap.
+    #[test]
+    fn test_resolve_max_request_bytes_falls_back_on_garbage() {
+        // Recover the guard from a poisoned lock so one failing env test
+        // does not make this one panic with an unrelated `PoisonError`.
+        let _guard = ENV_LOCK
+            .lock()
+            .unwrap_or_else(std::sync::PoisonError::into_inner);
+        std::env::set_var("LLMTRACE_MAX_REQUEST_BYTES", "not-a-number");
+        let cap = resolve_max_request_bytes();
+        // Clean up before asserting so a failure cannot leak the variable.
+        std::env::remove_var("LLMTRACE_MAX_REQUEST_BYTES");
+        assert_eq!(cap, DEFAULT_MAX_REQUEST_BYTES);
+    }
+
+    /// A zero `LLMTRACE_MAX_REQUEST_BYTES` yields the default cap.
+    #[test]
+    fn test_resolve_max_request_bytes_falls_back_on_zero() {
+        // Recover the guard from a poisoned lock so one failing env test
+        // does not make this one panic with an unrelated `PoisonError`.
+        let _guard = ENV_LOCK
+            .lock()
+            .unwrap_or_else(std::sync::PoisonError::into_inner);
+        std::env::set_var("LLMTRACE_MAX_REQUEST_BYTES", "0");
+        let cap = resolve_max_request_bytes();
+        // Clean up before asserting so a failure cannot leak the variable.
+        std::env::remove_var("LLMTRACE_MAX_REQUEST_BYTES");
+        assert_eq!(cap, DEFAULT_MAX_REQUEST_BYTES);
+    }
+
+    /// A body one byte over the configured cap must be answered with
+    /// `413 Payload Too Large`.
+    #[tokio::test]
+    async fn test_request_body_cap_rejects_oversized_payload() {
+        let cap_bytes = 1024usize;
+        // Build the state before taking the env lock so the std mutex
+        // guard is never held across an `.await`.
+        let state = build_app_state(memory_config(), None).await.unwrap();
+        // `build_router` reads the cap when it is called, so only that
+        // synchronous call has to run with the override in place. The
+        // guard is recovered from a poisoned lock so a failure in another
+        // env test does not cascade into this one.
+        let app = {
+            let _guard = ENV_LOCK
+                .lock()
+                .unwrap_or_else(std::sync::PoisonError::into_inner);
+            std::env::set_var("LLMTRACE_MAX_REQUEST_BYTES", cap_bytes.to_string());
+            let app = build_router(state);
+            std::env::remove_var("LLMTRACE_MAX_REQUEST_BYTES");
+            app
+        };
+
+        // Oversized body must be rejected with 413 Payload Too Large
+        // before reaching the upstream proxy handler. We set
+        // Content-Length explicitly so the body-limit layer can refuse
+        // the request without buffering — matching how real clients
+        // present oversized payloads.
+        let oversized = vec![b'x'; cap_bytes + 1];
+        let req = Request::builder()
+            .method("POST")
+            .uri("/v1/chat/completions")
+            .header("content-type", "application/octet-stream")
+            .header("content-length", oversized.len().to_string())
+            .header("authorization", "Bearer sk-test")
+            .body(Body::from(oversized))
+            .unwrap();
+        let response = app.oneshot(req).await.unwrap();
+        assert_eq!(
+            response.status(),
+            StatusCode::PAYLOAD_TOO_LARGE,
+            "request body over the configured cap must be rejected with 413"
+        );
+    }
+
+    /// A small JSON body must pass the body cap layer: whatever status
+    /// comes back, it must not be `413 Payload Too Large`.
+    #[tokio::test]
+    async fn test_request_body_cap_allows_payload_under_default_limit() {
+        // In-memory storage plus an upstream address the test expects to
+        // be unreachable, with short timeouts so the request fails fast
+        // once it is past the cap layer.
+        let storage = llmtrace_core::StorageConfig {
+            profile: "memory".to_string(),
+            database_path: String::new(),
+            ..llmtrace_core::StorageConfig::default()
+        };
+        let config = ProxyConfig {
+            upstream_url: "http://127.0.0.1:1".to_string(),
+            connection_timeout_ms: 100,
+            timeout_ms: 500,
+            storage,
+            ..ProxyConfig::default()
+        };
+        let app = build_router(build_app_state(config, None).await.unwrap());
+
+        let payload = serde_json::to_vec(&serde_json::json!({
+            "model": "gpt-4",
+            "messages": [{"role": "user", "content": "Hello"}]
+        }))
+        .unwrap();
+        let request = Request::builder()
+            .method("POST")
+            .uri("/v1/chat/completions")
+            .header("content-type", "application/json")
+            .header("authorization", "Bearer sk-test")
+            .body(Body::from(payload))
+            .unwrap();
+
+        let status = app.oneshot(request).await.unwrap().status();
+        assert_ne!(
+            status,
+            StatusCode::PAYLOAD_TOO_LARGE,
+            "small body must not be rejected by the body cap layer"
+        );
+    }
+
     #[tokio::test]
     async fn test_metrics_endpoint_returns_prometheus_format() {
         let app = test_app().await;
 
@@ -198,6 +198,7 @@ Optional top-level fields:
 | `dashboard_name_template` | `"llmtrace-dashboard-{tenant_id}"` | Same for dashboard |
 | `inject_proxy_url_into_dashboard` | `true` | If true, the resolved proxy URL is auto-injected into the dashboard's env after the proxy is up |
 | `proxy_url_env_var` | `"LLMTRACE_PROXY_URL"` | Which env var to inject the proxy URL under |
+| `rate_limit` | unset | Optional per-tenant rate-limit override surfaced to the proxy as env vars (see below) |
 
 Optional per-component fields (override the defaults shown):
 
@@ -213,6 +214,56 @@ Optional per-component fields (override the defaults shown):
 | `readiness_period_seconds` | `10` | Readiness probe interval |
 | `readiness_failure_threshold` | `3` | Readiness probe failures before removing from service |
 
+### Per-tenant `rate_limit`
+
+Optional top-level block. When present, the lifecycle library injects two
+env vars into the proxy's `ComponentSpec` at `provision()` and
+`update(..., strategy="recreate")` time:
+
+| Env var | Source | Effect on the proxy |
+|---|---|---|
+| `LLMTRACE_RATE_LIMIT_RPS` | `rate_limit.requests_per_second` | Overrides `rate_limiting.requests_per_second` (default 100) |
+| `LLMTRACE_RATE_LIMIT_BURST` | `rate_limit.burst_size` | Overrides `rate_limiting.burst_size` (default 200) |
+
+Shape:
+
+```yaml
+rate_limit:
+  requests_per_second: 50   # strictly > 0
+  burst_size: 100           # strictly > 0
+```
+
+Behaviour:
+
+- Both fields are **required when the block is present** and must be
+  strictly positive — `RateLimitSpec.__post_init__` raises `ValueError`
+  otherwise.
+- The injected env vars **override** any same-named values the caller
+  put in `proxy.env` (same precedence as the auth-key injection).
+- On the proxy side, `LLMTRACE_RATE_LIMIT_RPS`/`_BURST` are parsed in
+  `crates/llmtrace-proxy/src/config.rs::apply_env_overrides`. Non-positive
+  or unparseable values are silently ignored so a typo cannot disable
+  rate limiting wholesale.
+- Omit the block to keep the proxy's built-in `RateLimitConfig::default()`
+  (100 rps / 200 burst). Per-tenant `tenant_overrides` set via
+  `RateLimitConfig::tenant_overrides` in the proxy's YAML are unaffected.
+
+### Request body cap
+
+The proxy enforces a hard request body cap before any handler executes
+(see `crates/llmtrace-proxy/src/main.rs::resolve_max_request_bytes`).
+The cap protects downstream ML detectors and the trace pipeline from
+a single oversized payload.
+
+- Default: **1 MiB** (1,048,576 bytes).
+- Configurable per-deployment via the `LLMTRACE_MAX_REQUEST_BYTES` env
+  var. Invalid / non-positive values fall back to the default — set this
+  in your tenant's `proxy.env` if you need a different cap.
+- Requests whose declared `Content-Length` exceeds the cap are rejected
+  up front with HTTP `413 Payload Too Large`. Chunked / streamed bodies
+  (no usable `Content-Length`) that exceed the cap are aborted
+  mid-stream and surface as `400 Bad Request` from the proxy handler.
+
 ### `${VAR}` substitution
 
 In any string value (env values, image tags, URLs):
 
@@ -110,6 +110,19 @@ def _component_from_dict(data: dict[str, Any], label: str) -> lifecycle.Componen
     return lifecycle.ComponentSpec(**kwargs)
 
 
+def _rate_limit_from_dict(data: dict[str, Any]) -> lifecycle.RateLimitSpec:
+    required = {"requests_per_second", "burst_size"}
+    missing = required - data.keys()
+    if missing:
+        raise SystemExit(
+            f"rate_limit is missing required keys: {sorted(missing)}"
+        )
+    return lifecycle.RateLimitSpec(
+        requests_per_second=int(data["requests_per_second"]),
+        burst_size=int(data["burst_size"]),
+    )
+
+
 def _tenant_spec_from_config(tenant_id: str, cfg: dict[str, Any]) -> lifecycle.TenantSpec:
     if "proxy" not in cfg or "dashboard" not in cfg:
         raise SystemExit(
@@ -132,6 +145,13 @@ def _tenant_spec_from_config(tenant_id: str, cfg: dict[str, Any]) -> lifecycle.T
     ):
         if key in cfg:
             kwargs[key] = cfg[key]
+    if "rate_limit" in cfg and cfg["rate_limit"] is not None:
+        rate_limit_block = cfg["rate_limit"]
+        if not isinstance(rate_limit_block, dict):
+            raise SystemExit(
+                f"rate_limit must be a mapping, got {type(rate_limit_block).__name__}"
+            )
+        kwargs["rate_limit"] = _rate_limit_from_dict(rate_limit_block)
     return lifecycle.TenantSpec(**kwargs)