Skip to content

Commit 252650f

Browse files
authored
feat(proxy): request body cap + per-tenant rate limit plumbing (#240)
Adds a 1 MiB request body cap on the Axum router (configurable via LLMTRACE_MAX_REQUEST_BYTES) so a single oversized payload cannot drive ML detectors or the trace pipeline arbitrarily hard, and surfaces per-tenant rate-limit knobs through the Basilica tenant config so SaaS tenants can be shaped without rebuilding the proxy YAML. Rust side - crates/llmtrace-proxy/Cargo.toml: enable tower-http "limit" feature. - crates/llmtrace-proxy/src/main.rs: resolve_max_request_bytes() reads LLMTRACE_MAX_REQUEST_BYTES with a 1 MiB fallback on missing / invalid / non-positive values; build_router applies RequestBodyLimitLayer so oversized requests (with honest Content-Length) are rejected with HTTP 413 before any handler executes. New unit + router tests cover defaults, parse fallbacks, the 413 rejection path, and that small bodies still reach the proxy handler. - crates/llmtrace-proxy/src/config.rs: apply_env_overrides now honours LLMTRACE_RATE_LIMIT_RPS and LLMTRACE_RATE_LIMIT_BURST. A parse_positive_u32 helper silently ignores zero / unparseable values so a typo cannot disable rate limiting wholesale. Python / Basilica side - deployments/basilica/lifecycle.py: new frozen RateLimitSpec dataclass with __post_init__ validation; optional TenantSpec.rate_limit field; _apply_rate_limit injects LLMTRACE_RATE_LIMIT_RPS and LLMTRACE_RATE_LIMIT_BURST into the proxy ComponentSpec env at provision time, mirroring _apply_proxy_auth's precedence (spec wins over caller env). - deployments/basilica/cli.py: parse the optional top-level rate_limit block; fail fast on missing required keys or non-mapping shapes. - configs/examples/{starter,pro}.yaml: commented-out rate_limit block so the field is discoverable but optional. - deployments/basilica/README.md: new "Per-tenant rate_limit" and "Request body cap" subsections under "Tenant config format"; table row added for the new top-level field.
1 parent 458fc36 commit 252650f

9 files changed

Lines changed: 350 additions & 1 deletion

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/llmtrace-proxy/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ hex = "0.4"
4242
sha2 = "0.10"
4343
rand = "0.8"
4444
axum = "0.7"
45-
tower-http = { version = "0.6", features = ["cors"] }
45+
tower-http = { version = "0.6", features = ["cors", "limit"] }
4646
hyper = { version = "1", features = ["full"] }
4747
reqwest = { version = "0.12", default-features = false, features = ["json", "stream", "rustls-tls"] }
4848
serde_yaml = "0.9"

crates/llmtrace-proxy/src/config.rs

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,15 @@ pub fn load_config(path: &Path) -> anyhow::Result<ProxyConfig> {
3030
/// - `LLMTRACE_CLICKHOUSE_DATABASE` → `storage.clickhouse_database`
3131
/// - `LLMTRACE_POSTGRES_URL` → `storage.postgres_url`
3232
/// - `LLMTRACE_REDIS_URL` → `storage.redis_url`
33+
/// - `LLMTRACE_AUTH_ENABLED` → `auth.enabled`
34+
/// - `LLMTRACE_AUTH_ADMIN_KEY` → `auth.admin_key`
35+
/// - `LLMTRACE_RATE_LIMIT_RPS` → `rate_limiting.requests_per_second`
36+
/// - `LLMTRACE_RATE_LIMIT_BURST` → `rate_limiting.burst_size`
3337
/// - `LLMTRACE_ML_MAX_CONCURRENT` → `ml_pipeline.max_concurrent_requests`
38+
///
39+
/// Rate-limit overrides only take effect when the parsed value is `> 0`
40+
/// (a `u32`). Invalid or non-positive values are ignored, leaving the
41+
/// loaded YAML value (or [`RateLimitConfig::default`]) in place.
3442
pub fn apply_env_overrides(config: &mut ProxyConfig) {
3543
if let Ok(val) = std::env::var("LLMTRACE_LISTEN_ADDR") {
3644
config.listen_addr = val;
@@ -63,6 +71,12 @@ pub fn apply_env_overrides(config: &mut ProxyConfig) {
6371
if let Ok(val) = std::env::var("LLMTRACE_AUTH_ADMIN_KEY") {
6472
config.auth.admin_key = Some(val);
6573
}
74+
if let Some(rps) = parse_positive_u32("LLMTRACE_RATE_LIMIT_RPS") {
75+
config.rate_limiting.requests_per_second = rps;
76+
}
77+
if let Some(burst) = parse_positive_u32("LLMTRACE_RATE_LIMIT_BURST") {
78+
config.rate_limiting.burst_size = burst;
79+
}
6680
if let Ok(val) = std::env::var("LLMTRACE_ML_MAX_CONCURRENT") {
6781
match val.parse::<usize>() {
6882
Ok(n) if n > 0 => config.ml_pipeline.max_concurrent_requests = n,
@@ -74,6 +88,14 @@ pub fn apply_env_overrides(config: &mut ProxyConfig) {
7488
}
7589
}
7690

91+
/// Read the environment variable `name` and interpret it as a `u32` that
/// must be greater than zero.
///
/// Yields `None` when the variable cannot be read, when its trimmed text
/// does not parse as a `u32`, or when it parses to `0`, so the caller can
/// keep whatever value the loaded configuration already holds.
fn parse_positive_u32(name: &str) -> Option<u32> {
    match std::env::var(name) {
        Ok(text) => match text.trim().parse::<u32>() {
            Ok(value) if value != 0 => Some(value),
            _ => None,
        },
        Err(_) => None,
    }
}
98+
7799
/// Validate a [`ProxyConfig`] for common configuration errors.
78100
///
79101
/// Returns `Ok(())` when valid, or an error listing all detected issues.
@@ -345,6 +367,44 @@ health_check:
345367
std::env::remove_var("LLMTRACE_STORAGE_DATABASE_PATH");
346368
}
347369

370+
/// Serialises tests that mutate the same rate-limit env vars so they
371+
/// do not race when cargo runs them in parallel.
372+
static RATE_LIMIT_ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
373+
374+
#[test]
fn test_apply_env_overrides_rate_limit_rps_and_burst() {
    // Take the lock even if it is poisoned: the guard stays alive across
    // the assertions below, so a failing sibling test would otherwise
    // make this one panic with an unrelated `PoisonError`.
    let _guard = RATE_LIMIT_ENV_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    let mut config = ProxyConfig::default();
    std::env::set_var("LLMTRACE_RATE_LIMIT_RPS", "250");
    std::env::set_var("LLMTRACE_RATE_LIMIT_BURST", "500");
    apply_env_overrides(&mut config);
    // Clean the process env before asserting so a failure cannot leak
    // the overrides into later tests.
    std::env::remove_var("LLMTRACE_RATE_LIMIT_RPS");
    std::env::remove_var("LLMTRACE_RATE_LIMIT_BURST");
    assert_eq!(config.rate_limiting.requests_per_second, 250);
    assert_eq!(config.rate_limiting.burst_size, 500);
}
386+
387+
#[test]
fn test_apply_env_overrides_rate_limit_ignores_invalid() {
    // Take the lock even if it is poisoned: the guard stays alive across
    // the assertions below, so a failing sibling test would otherwise
    // make this one panic with an unrelated `PoisonError`.
    let _guard = RATE_LIMIT_ENV_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    let baseline = ProxyConfig::default();
    let mut config = ProxyConfig::default();
    std::env::set_var("LLMTRACE_RATE_LIMIT_RPS", "not-a-number");
    std::env::set_var("LLMTRACE_RATE_LIMIT_BURST", "0");
    apply_env_overrides(&mut config);
    // Clean the process env before asserting so a failure cannot leak
    // the overrides into later tests.
    std::env::remove_var("LLMTRACE_RATE_LIMIT_RPS");
    std::env::remove_var("LLMTRACE_RATE_LIMIT_BURST");
    // Invalid / zero values must leave the loaded config untouched.
    assert_eq!(
        config.rate_limiting.requests_per_second,
        baseline.rate_limiting.requests_per_second
    );
    assert_eq!(
        config.rate_limiting.burst_size,
        baseline.rate_limiting.burst_size
    );
}
407+
348408
#[test]
349409
fn test_validate_config_valid() {
350410
let config = ProxyConfig::default();

crates/llmtrace-proxy/src/main.rs

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ use llmtrace_storage::StorageProfile;
2323
use std::path::PathBuf;
2424
use std::sync::Arc;
2525
use tower_http::cors::{Any, CorsLayer};
26+
use tower_http::limit::RequestBodyLimitLayer;
2627
use tracing::info;
2728
use utoipa::OpenApi;
2829
use utoipa_swagger_ui::SwaggerUi;
@@ -246,6 +247,9 @@ async fn run_proxy(
246247
storage_profile = %config.storage.profile,
247248
shutdown_timeout_seconds = config.shutdown.timeout_seconds,
248249
runtime_overlay_path = ?runtime_overlay_path,
250+
max_request_bytes = resolve_max_request_bytes(),
251+
rate_limit_rps = config.rate_limiting.requests_per_second,
252+
rate_limit_burst = config.rate_limiting.burst_size,
249253
"Starting LLMTrace proxy server"
250254
);
251255

@@ -952,12 +956,36 @@ async fn build_security_analyzer(
952956
}
953957
}
954958

959+
/// Request body cap used when no valid override is configured: 1 MiB.
///
/// The proxy applies this cap ahead of every handler so that one
/// oversized payload cannot push the downstream ML detectors or the
/// trace pipeline arbitrarily hard. Deployments can change it through
/// the `LLMTRACE_MAX_REQUEST_BYTES` env var.
const DEFAULT_MAX_REQUEST_BYTES: usize = 1024 * 1024;

/// Determine the per-request body cap, in bytes.
///
/// Reads `LLMTRACE_MAX_REQUEST_BYTES`, trims surrounding whitespace and
/// parses it as a `usize`. [`DEFAULT_MAX_REQUEST_BYTES`] is returned when
/// the variable cannot be read, fails to parse, or parses to `0`.
fn resolve_max_request_bytes() -> usize {
    let configured = std::env::var("LLMTRACE_MAX_REQUEST_BYTES")
        .ok()
        .and_then(|text| text.trim().parse::<usize>().ok());
    match configured {
        Some(bytes) if bytes > 0 => bytes,
        _ => DEFAULT_MAX_REQUEST_BYTES,
    }
}
981+
955982
/// Build the axum [`Router`] with all routes.
956983
fn build_router(state: Arc<AppState>) -> Router {
957984
let cors = CorsLayer::new()
958985
.allow_origin(Any)
959986
.allow_methods(Any)
960987
.allow_headers(Any);
988+
let body_cap = resolve_max_request_bytes();
961989

962990
let router = Router::new()
963991
// OpenAPI / Swagger UI (served by the proxy itself, not forwarded upstream).
@@ -1096,6 +1124,10 @@ fn build_router(state: Arc<AppState>) -> Router {
10961124
llmtrace_proxy::auth::auth_middleware,
10971125
))
10981126
.layer(cors)
1127+
// Hard request body cap. Rejects oversized requests with HTTP 413
1128+
// before any handler executes, protecting downstream detectors and
1129+
// the trace pipeline from arbitrarily large payloads.
1130+
.layer(RequestBodyLimitLayer::new(body_cap))
10991131
.with_state(state)
11001132
}
11011133

@@ -1331,6 +1363,119 @@ mod tests {
13311363
assert_eq!(config.logging.format, "json");
13321364
}
13331365

1366+
/// Mutex serialising tests that mutate process env (env vars are
1367+
/// per-process, not per-test, so unguarded parallel mutation races).
1368+
static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
1369+
1370+
#[test]
fn test_resolve_max_request_bytes_defaults_when_unset() {
    // Recover the guard from a poisoned lock: the guard is held while
    // asserting, so one failing env test must not make every other env
    // test panic with an unrelated `PoisonError`.
    let _guard = ENV_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    std::env::remove_var("LLMTRACE_MAX_REQUEST_BYTES");
    assert_eq!(resolve_max_request_bytes(), DEFAULT_MAX_REQUEST_BYTES);
}
1376+
1377+
#[test]
fn test_resolve_max_request_bytes_parses_valid_value() {
    // Recover the guard from a poisoned lock: the guard is held while
    // asserting, so one failing env test must not make every other env
    // test panic with an unrelated `PoisonError`.
    let _guard = ENV_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    std::env::set_var("LLMTRACE_MAX_REQUEST_BYTES", "2048");
    let cap = resolve_max_request_bytes();
    // Restore the env before asserting so a failure cannot leak the
    // override into later tests.
    std::env::remove_var("LLMTRACE_MAX_REQUEST_BYTES");
    assert_eq!(cap, 2048);
}
1385+
1386+
#[test]
fn test_resolve_max_request_bytes_falls_back_on_garbage() {
    // Recover the guard from a poisoned lock: the guard is held while
    // asserting, so one failing env test must not make every other env
    // test panic with an unrelated `PoisonError`.
    let _guard = ENV_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    std::env::set_var("LLMTRACE_MAX_REQUEST_BYTES", "not-a-number");
    let cap = resolve_max_request_bytes();
    // Restore the env before asserting so a failure cannot leak the
    // override into later tests.
    std::env::remove_var("LLMTRACE_MAX_REQUEST_BYTES");
    assert_eq!(cap, DEFAULT_MAX_REQUEST_BYTES);
}
1394+
1395+
#[test]
fn test_resolve_max_request_bytes_falls_back_on_zero() {
    // Recover the guard from a poisoned lock: the guard is held while
    // asserting, so one failing env test must not make every other env
    // test panic with an unrelated `PoisonError`.
    let _guard = ENV_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    std::env::set_var("LLMTRACE_MAX_REQUEST_BYTES", "0");
    let cap = resolve_max_request_bytes();
    // Restore the env before asserting so a failure cannot leak the
    // override into later tests.
    std::env::remove_var("LLMTRACE_MAX_REQUEST_BYTES");
    assert_eq!(cap, DEFAULT_MAX_REQUEST_BYTES);
}
1403+
1404+
#[tokio::test]
1405+
async fn test_request_body_cap_rejects_oversized_payload() {
1406+
// Lock the env mutex so we can override the cap to a small value
1407+
// without racing other env-touching tests. The router captures the
1408+
// value at build time, so we can drop the guard after build_router.
1409+
let cap_bytes = 1024usize;
1410+
let app = {
1411+
let _guard = ENV_LOCK.lock().unwrap();
1412+
std::env::set_var("LLMTRACE_MAX_REQUEST_BYTES", cap_bytes.to_string());
1413+
let state = build_app_state(memory_config(), None).await.unwrap();
1414+
let app = build_router(state);
1415+
std::env::remove_var("LLMTRACE_MAX_REQUEST_BYTES");
1416+
app
1417+
};
1418+
1419+
// Oversized body must be rejected with 413 Payload Too Large
1420+
// before reaching the upstream proxy handler. We set
1421+
// Content-Length explicitly so the body-limit layer can refuse
1422+
// the request without buffering — matching how real clients
1423+
// present oversized payloads.
1424+
let oversized = vec![b'x'; cap_bytes + 1];
1425+
let req = Request::builder()
1426+
.method("POST")
1427+
.uri("/v1/chat/completions")
1428+
.header("content-type", "application/octet-stream")
1429+
.header("content-length", oversized.len().to_string())
1430+
.header("authorization", "Bearer sk-test")
1431+
.body(Body::from(oversized))
1432+
.unwrap();
1433+
let response = app.oneshot(req).await.unwrap();
1434+
assert_eq!(
1435+
response.status(),
1436+
StatusCode::PAYLOAD_TOO_LARGE,
1437+
"request body over the configured cap must be rejected with 413"
1438+
);
1439+
}
1440+
1441+
#[tokio::test]
1442+
async fn test_request_body_cap_allows_payload_under_default_limit() {
1443+
// Default cap is 1 MiB; a small body must route through without
1444+
// 413. The upstream is intentionally unreachable, so a 502 Bad
1445+
// Gateway here confirms the request reached the proxy_handler.
1446+
let config = ProxyConfig {
1447+
upstream_url: "http://127.0.0.1:1".to_string(),
1448+
connection_timeout_ms: 100,
1449+
timeout_ms: 500,
1450+
storage: llmtrace_core::StorageConfig {
1451+
profile: "memory".to_string(),
1452+
database_path: String::new(),
1453+
..llmtrace_core::StorageConfig::default()
1454+
},
1455+
..ProxyConfig::default()
1456+
};
1457+
let state = build_app_state(config, None).await.unwrap();
1458+
let app = build_router(state);
1459+
1460+
let body = serde_json::json!({
1461+
"model": "gpt-4",
1462+
"messages": [{"role": "user", "content": "Hello"}]
1463+
});
1464+
let req = Request::builder()
1465+
.method("POST")
1466+
.uri("/v1/chat/completions")
1467+
.header("content-type", "application/json")
1468+
.header("authorization", "Bearer sk-test")
1469+
.body(Body::from(serde_json::to_vec(&body).unwrap()))
1470+
.unwrap();
1471+
let response = app.oneshot(req).await.unwrap();
1472+
assert_ne!(
1473+
response.status(),
1474+
StatusCode::PAYLOAD_TOO_LARGE,
1475+
"small body must not be rejected by the body cap layer"
1476+
);
1477+
}
1478+
13341479
#[tokio::test]
13351480
async fn test_metrics_endpoint_returns_prometheus_format() {
13361481
let app = test_app().await;

deployments/basilica/README.md

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ Optional top-level fields:
198198
| `dashboard_name_template` | `"llmtrace-dashboard-{tenant_id}"` | Same for dashboard |
199199
| `inject_proxy_url_into_dashboard` | `true` | If true, the resolved proxy URL is auto-injected into the dashboard's env after the proxy is up |
200200
| `proxy_url_env_var` | `"LLMTRACE_PROXY_URL"` | Which env var to inject the proxy URL under |
201+
| `rate_limit` | unset | Optional per-tenant rate-limit override surfaced to the proxy as env vars (see below) |
201202

202203
Optional per-component fields (override the defaults shown):
203204

@@ -213,6 +214,56 @@ Optional per-component fields (override the defaults shown):
213214
| `readiness_period_seconds` | `10` | Readiness probe interval |
214215
| `readiness_failure_threshold` | `3` | Readiness probe failures before removing from service |
215216

217+
### Per-tenant `rate_limit`
218+
219+
Optional top-level block. When present, the lifecycle library injects two
220+
env vars into the proxy's `ComponentSpec` at `provision()` and
221+
`update(..., strategy="recreate")` time:
222+
223+
| Env var | Source | Effect on the proxy |
224+
|---|---|---|
225+
| `LLMTRACE_RATE_LIMIT_RPS` | `rate_limit.requests_per_second` | Overrides `rate_limiting.requests_per_second` (default 100) |
226+
| `LLMTRACE_RATE_LIMIT_BURST` | `rate_limit.burst_size` | Overrides `rate_limiting.burst_size` (default 200) |
227+
228+
Shape:
229+
230+
```yaml
231+
rate_limit:
232+
requests_per_second: 50 # strictly > 0
233+
burst_size: 100 # strictly > 0
234+
```
235+
236+
Behaviour:
237+
238+
- Both fields are **required when the block is present** and must be
239+
strictly positive — `RateLimitSpec.__post_init__` raises `ValueError`
240+
otherwise.
241+
- The injected env vars **override** any same-named values the caller
242+
put in `proxy.env` (same precedence as the auth-key injection).
243+
- On the proxy side, `LLMTRACE_RATE_LIMIT_RPS`/`_BURST` are parsed in
244+
`crates/llmtrace-proxy/src/config.rs::apply_env_overrides`. Non-positive
245+
or unparseable values are silently ignored so a typo cannot disable
246+
rate limiting wholesale.
247+
- Omit the block to keep the proxy's built-in `RateLimitConfig::default()`
248+
(100 rps / 200 burst). Per-tenant `tenant_overrides` set via
249+
`RateLimitConfig::tenant_overrides` in the proxy's YAML are unaffected.
250+
251+
### Request body cap
252+
253+
The proxy enforces a hard request body cap before any handler executes
254+
(see `crates/llmtrace-proxy/src/main.rs::resolve_max_request_bytes`).
255+
The cap protects downstream ML detectors and the trace pipeline from
256+
a single oversized payload.
257+
258+
- Default: **1 MiB** (1,048,576 bytes).
259+
- Configurable per-deployment via the `LLMTRACE_MAX_REQUEST_BYTES` env
260+
var. Invalid / non-positive values fall back to the default — set this
261+
in your tenant's `proxy.env` if you need a different cap.
262+
- Requests over the cap are rejected with HTTP `413 Payload Too Large`
263+
when the `Content-Length` header is honest. Chunked / streamed bodies
264+
that exceed the cap are aborted mid-stream and surface as `400 Bad
265+
Request` from the proxy handler.
266+
216267
### `${VAR}` substitution
217268

218269
In any string value (env values, image tags, URLs):

deployments/basilica/cli.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,19 @@ def _component_from_dict(data: dict[str, Any], label: str) -> lifecycle.Componen
110110
return lifecycle.ComponentSpec(**kwargs)
111111

112112

113+
def _coerce_rate_limit_int(key: str, value: Any) -> int:
    """Return ``value`` as an ``int`` or exit with a config error naming ``key``.

    Accepts integers, integral floats, and strings holding an integer
    (so quoted or substituted YAML values keep working). Booleans,
    fractional floats and non-numeric values are rejected instead of
    being silently coerced or truncated by ``int()``.
    """
    if isinstance(value, bool):
        # bool is an int subclass; int(True) == 1 would hide a config typo.
        raise SystemExit(f"rate_limit.{key} must be an integer, got {value!r}")
    if isinstance(value, int):
        return value
    if isinstance(value, float) and value.is_integer():
        return int(value)
    if isinstance(value, str):
        try:
            return int(value.strip())
        except ValueError:
            pass
    raise SystemExit(f"rate_limit.{key} must be an integer, got {value!r}")


def _rate_limit_from_dict(data: dict[str, Any]) -> lifecycle.RateLimitSpec:
    """Build a ``lifecycle.RateLimitSpec`` from the parsed ``rate_limit`` block.

    Args:
        data: Mapping parsed from the tenant config's ``rate_limit`` key.

    Returns:
        The spec built from ``requests_per_second`` and ``burst_size``.

    Raises:
        SystemExit: If a required key is missing or a value is not an
            integer.

    Any validation error raised by ``RateLimitSpec`` itself is left to
    propagate unchanged.
    """
    required = ("requests_per_second", "burst_size")
    missing = sorted(key for key in required if key not in data)
    if missing:
        raise SystemExit(
            f"rate_limit is missing required keys: {missing}"
        )
    return lifecycle.RateLimitSpec(
        requests_per_second=_coerce_rate_limit_int(
            "requests_per_second", data["requests_per_second"]
        ),
        burst_size=_coerce_rate_limit_int("burst_size", data["burst_size"]),
    )
124+
125+
113126
def _tenant_spec_from_config(tenant_id: str, cfg: dict[str, Any]) -> lifecycle.TenantSpec:
114127
if "proxy" not in cfg or "dashboard" not in cfg:
115128
raise SystemExit(
@@ -132,6 +145,13 @@ def _tenant_spec_from_config(tenant_id: str, cfg: dict[str, Any]) -> lifecycle.T
132145
):
133146
if key in cfg:
134147
kwargs[key] = cfg[key]
148+
if "rate_limit" in cfg and cfg["rate_limit"] is not None:
149+
rate_limit_block = cfg["rate_limit"]
150+
if not isinstance(rate_limit_block, dict):
151+
raise SystemExit(
152+
f"rate_limit must be a mapping, got {type(rate_limit_block).__name__}"
153+
)
154+
kwargs["rate_limit"] = _rate_limit_from_dict(rate_limit_block)
135155
return lifecycle.TenantSpec(**kwargs)
136156

137157

0 commit comments

Comments
 (0)