fix(basilica): raise startup_timeout floor when ML preload is on (closes #243)

epappas · epappas · commit 29130070800b · 2026-05-20T11:24:26.000+02:00
When the resolved proxy env carries both LLMTRACE_ML_ENABLED and
LLMTRACE_ML_PRELOAD truthy, the lifecycle library now floors
ComponentSpec.startup_timeout_seconds at ML_PRELOAD_STARTUP_FLOOR_SECONDS
(1500) before creating the deployment, emitting one WARN line per bump.
Callers who set the field >= floor keep their value silently. Applied in
provision, update(strategy="recreate"), and rotate_admin_key — every
proxy-creating code path.

Live e2e on cpu=2 confirmed the original 600s default could not fit
preload (proxy never reached ready in 600s with ML on; reached ready in
41s with ML off). 1500s covers the observed band with margin. Both
example configs now set the field explicitly to 1500 to silence the
WARN and document the cost. README operational notes spell out the
auto-bump, how to opt out via LLMTRACE_ML_PRELOAD=0 (lazy load), and how
to silence the warning.
diff --git a/deployments/basilica/README.md b/deployments/basilica/README.md
@@ -973,11 +973,32 @@ client doesn't accept that media type.
 ### ML preload behaviour
 
 The proxy image has `LLMTRACE_ML_ENABLED=1` and `LLMTRACE_ML_PRELOAD=1` by
-default in `starter.yaml`. Cold-boot includes model warm-up; in live tests
-the proxy hit `phase=ready` in ~60s. If your tenant config sets a smaller
-`startup_timeout_seconds`, the deployment may legitimately time out — bump
-the timeout or set `LLMTRACE_ML_PRELOAD=0` to defer model load to
-first-request time.
+default in `starter.yaml` and `pro.yaml`. Preload loads the
+`prompt_injection`, `ner`, `injecguard`, and `piguard` weights before
+`/health` flips to ready.
+
+Cost on small CPU shapes: the cold-boot wall clock is dominated by model
+load. On `cpu=2 / memory=4Gi` (starter) this consistently lands in the
+600–1500s range; the same image without preload reaches ready in ~40s.
+Bigger shapes are proportionally faster but the same band of preload cost
+applies on first boot.
+
+**Auto-bump** (`deployments/basilica/lifecycle.py::_apply_ml_preload_startup_floor`):
+when the resolved proxy env carries both `LLMTRACE_ML_ENABLED ∈ {1,true,yes}`
+and `LLMTRACE_ML_PRELOAD ∈ {1,true,yes}` (case-insensitive, surrounding whitespace ignored) AND the caller's
+`startup_timeout_seconds` is below `ML_PRELOAD_STARTUP_FLOOR_SECONDS`
+(currently 1500), the lifecycle library raises the timeout to the floor
+and emits a single WARN. This applies to `provision`,
+`update(strategy="recreate")`, and `rotate_admin_key`. Callers who set
+`startup_timeout_seconds >= 1500` keep their value and see no warning.
+
+**Opt out of preload entirely**: set `LLMTRACE_ML_PRELOAD: "0"` in
+`proxy.env` for lazy load. The first request pays the model-load
+latency; every subsequent request is fast. Useful when readiness latency
+matters more than first-request latency.
+
+**Silence the WARN**: set `startup_timeout_seconds: 1500` (or higher)
+explicitly in your config. Both example configs already do this.
 
 ### Cleanup of orphans
 
diff --git a/deployments/basilica/configs/examples/pro.yaml b/deployments/basilica/configs/examples/pro.yaml
@@ -16,7 +16,13 @@ proxy:
   memory: 8Gi
   replicas: 2
   health_check_path: /health
-  startup_timeout_seconds: 600
+  # ML preload (LLMTRACE_ML_ENABLED=1 + LLMTRACE_ML_PRELOAD=1) loads the
+  # prompt_injection / ner / injecguard / piguard weights before /health flips
+  # to ready. On a cpu=2 shape this regularly exceeds 600s; cpu=4 is faster
+  # but still well over the library's 600s default. The lifecycle library
+  # auto-bumps to 1500s when both flags are on and this value is lower; we set
+  # 1500 explicitly to silence the WARN and document the cost.
+  startup_timeout_seconds: 1500
   env:
     LLMTRACE_UPSTREAM_URL: "${LLMTRACE_UPSTREAM_URL}"
     LLMTRACE_STORAGE_PROFILE: "${LLMTRACE_STORAGE_PROFILE:-sqlite}"
diff --git a/deployments/basilica/configs/examples/starter.yaml b/deployments/basilica/configs/examples/starter.yaml
@@ -32,7 +32,14 @@ proxy:
   memory: 4Gi
   replicas: 1
   health_check_path: /health
-  startup_timeout_seconds: 600
+  # ML preload (LLMTRACE_ML_ENABLED=1 + LLMTRACE_ML_PRELOAD=1) loads the
+  # prompt_injection / ner / injecguard / piguard weights before /health flips
+  # to ready. On this cpu=2 shape that takes 600-1500s in practice. The
+  # lifecycle library auto-bumps to 1500s when both flags are on and this
+  # value is lower; we set 1500 explicitly to silence the WARN and document
+  # the cost. Set LLMTRACE_ML_PRELOAD=0 below for lazy load (first request
+  # pays the model-load latency, subsequent requests are fast).
+  startup_timeout_seconds: 1500
   env:
     LLMTRACE_UPSTREAM_URL: "${LLMTRACE_UPSTREAM_URL}"
     LLMTRACE_STORAGE_PROFILE: memory
diff --git a/deployments/basilica/lifecycle.py b/deployments/basilica/lifecycle.py
@@ -54,6 +54,19 @@
 API_KEY_PREFIX = "llmt_"
 API_KEY_RANDOM_BYTES = 32
 
+# Minimum startup window the lifecycle library will apply when the resolved
+# proxy env requests ML preload (`LLMTRACE_ML_ENABLED=1`,
+# `LLMTRACE_ML_PRELOAD=1`). On a cpu=2 shape, loading the prompt_injection +
+# ner + injecguard + piguard weights regularly exceeds the
+# `ComponentSpec.startup_timeout_seconds` default of 600s, so callers who
+# leave it at the default would silently time out. 1500s covers the observed
+# worst case with margin. Callers who explicitly set a higher value win.
+ML_PRELOAD_STARTUP_FLOOR_SECONDS = 1500
+
+# Env values that count as "enabled" for the ML preload flags. Matches the
+# truthiness rules in `crates/llmtrace-proxy/src/config.rs::env_flag`.
+_ML_FLAG_TRUTHY: frozenset[str] = frozenset({"1", "true", "yes"})
+
 # Proxy admin API endpoints (see `crates/llmtrace-proxy/src/main.rs` routing).
 TENANTS_PATH = "/api/v1/tenants"
 AUTH_KEYS_PATH = "/api/v1/auth/keys"
@@ -426,6 +439,42 @@ def _apply_proxy_auth(
     )
 
 
+def _apply_ml_preload_startup_floor(
+    spec: ComponentSpec,
+    *,
+    tenant_id: Optional[str] = None,
+    floor: int = ML_PRELOAD_STARTUP_FLOOR_SECONDS,
+) -> ComponentSpec:
+    """Raise `startup_timeout_seconds` when the proxy env requests ML preload.
+
+    When both `LLMTRACE_ML_ENABLED` and `LLMTRACE_ML_PRELOAD` resolve to a
+    truthy value in `spec.env`, the proxy will block readiness on model
+    weight load before flipping `/health` to ready. On small CPU shapes that
+    can take well over the library's 600s default, so we floor the timeout
+    at `floor` (default `ML_PRELOAD_STARTUP_FLOOR_SECONDS`).
+
+    Callers who explicitly set `startup_timeout_seconds >= floor` keep their
+    value and no warning is emitted. The bump is a pure function on the spec
+    — no SDK calls — so it's safe to apply in any code path that creates the
+    proxy deployment.
+    """
+    ml_enabled = spec.env.get("LLMTRACE_ML_ENABLED", "").strip().lower() in _ML_FLAG_TRUTHY
+    ml_preload = spec.env.get("LLMTRACE_ML_PRELOAD", "").strip().lower() in _ML_FLAG_TRUTHY
+    if not (ml_enabled and ml_preload):
+        return spec
+    if spec.startup_timeout_seconds >= floor:
+        return spec
+    LOGGER.warning(
+        "ML preload detected (LLMTRACE_ML_ENABLED=1, LLMTRACE_ML_PRELOAD=1) "
+        "on tenant=%s; raising startup_timeout_seconds from %d to %d to "
+        "accommodate model load. Set explicitly to silence.",
+        tenant_id or "<unknown>",
+        spec.startup_timeout_seconds,
+        floor,
+    )
+    return dataclasses.replace(spec, startup_timeout_seconds=floor)
+
+
 def _apply_rate_limit(
     proxy_spec: ComponentSpec, rate_limit: RateLimitSpec
 ) -> ComponentSpec:
@@ -667,6 +716,7 @@ def rotate_admin_key(
     new_env = {**proxy_spec.env, "LLMTRACE_AUTH_ADMIN_KEY": rotated_key}
     new_env.setdefault("LLMTRACE_AUTH_ENABLED", "true")
     rotated_spec = dataclasses.replace(proxy_spec, env=new_env)
+    rotated_spec = _apply_ml_preload_startup_floor(rotated_spec, tenant_id=tenant_id)
 
     proxy_name = proxy_name_template.format(tenant_id=tenant_id)
     LOGGER.info(
@@ -721,6 +771,7 @@ def provision(
         )
     if spec.rate_limit is not None:
         proxy_spec = _apply_rate_limit(proxy_spec, spec.rate_limit)
+    proxy_spec = _apply_ml_preload_startup_floor(proxy_spec, tenant_id=tenant_id)
 
     proxy = _create_component(client, proxy_name, proxy_spec)
 
diff --git a/deployments/basilica/tests/test_ml_preload_startup_floor.py b/deployments/basilica/tests/test_ml_preload_startup_floor.py
@@ -0,0 +1,188 @@
+"""Unit tests for the ML-preload startup-timeout floor.
+
+The bump is implemented as a pure function (`_apply_ml_preload_startup_floor`)
+on `ComponentSpec`, so the tests exercise it directly — no SDK mocking, no
+provision flow stubbing. This matches the production code path: every
+`_create_component` call site for the proxy runs the spec through the same
+helper before handing it to the Basilica SDK.
+"""
+
+from __future__ import annotations
+
+import logging
+import unittest
+from typing import Mapping
+
+from deployments.basilica import lifecycle
+
+
+def _proxy_spec(
+    *, env: Mapping[str, str], startup_timeout_seconds: int
+) -> lifecycle.ComponentSpec:
+    return lifecycle.ComponentSpec(
+        image="ghcr.io/techlab-innov/llmtrace-proxy:latest",
+        port=8080,
+        cpu="2",
+        memory="4Gi",
+        replicas=1,
+        env=env,
+        startup_timeout_seconds=startup_timeout_seconds,
+    )
+
+
+class MLPreloadStartupFloorTests(unittest.TestCase):
+    def test_bumps_when_below_floor_and_ml_preload_on(self) -> None:
+        spec = _proxy_spec(
+            env={"LLMTRACE_ML_ENABLED": "1", "LLMTRACE_ML_PRELOAD": "1"},
+            startup_timeout_seconds=600,
+        )
+        with self.assertLogs(lifecycle.LOGGER, level=logging.WARNING) as captured:
+            bumped = lifecycle._apply_ml_preload_startup_floor(
+                spec, tenant_id="acme"
+            )
+
+        self.assertEqual(
+            bumped.startup_timeout_seconds,
+            lifecycle.ML_PRELOAD_STARTUP_FLOOR_SECONDS,
+        )
+        # All other fields preserved.
+        self.assertEqual(bumped.image, spec.image)
+        self.assertEqual(bumped.env, spec.env)
+        self.assertEqual(bumped.port, spec.port)
+        # Exactly one warning, mentioning the tenant + the floor.
+        self.assertEqual(len(captured.records), 1)
+        message = captured.records[0].getMessage()
+        self.assertIn("acme", message)
+        self.assertIn("600", message)
+        self.assertIn(str(lifecycle.ML_PRELOAD_STARTUP_FLOOR_SECONDS), message)
+
+    def test_no_bump_when_caller_value_meets_floor(self) -> None:
+        explicit = lifecycle.ML_PRELOAD_STARTUP_FLOOR_SECONDS + 300
+        spec = _proxy_spec(
+            env={"LLMTRACE_ML_ENABLED": "true", "LLMTRACE_ML_PRELOAD": "yes"},
+            startup_timeout_seconds=explicit,
+        )
+        # `assertNoLogs` is Python 3.10+. Capture and assert empty as a portable
+        # alternative that still fails loudly if a warning leaks.
+        with self.assertLogs(lifecycle.LOGGER, level=logging.DEBUG) as captured:
+            lifecycle.LOGGER.debug("anchor")  # ensure the context has >=1 record
+            result = lifecycle._apply_ml_preload_startup_floor(
+                spec, tenant_id="acme"
+            )
+
+        self.assertIs(result, spec)
+        self.assertEqual(result.startup_timeout_seconds, explicit)
+        warnings = [r for r in captured.records if r.levelno >= logging.WARNING]
+        self.assertEqual(warnings, [])
+
+    def test_no_bump_when_caller_value_equals_floor(self) -> None:
+        spec = _proxy_spec(
+            env={"LLMTRACE_ML_ENABLED": "1", "LLMTRACE_ML_PRELOAD": "1"},
+            startup_timeout_seconds=lifecycle.ML_PRELOAD_STARTUP_FLOOR_SECONDS,
+        )
+        with self.assertLogs(lifecycle.LOGGER, level=logging.DEBUG) as captured:
+            lifecycle.LOGGER.debug("anchor")
+            result = lifecycle._apply_ml_preload_startup_floor(
+                spec, tenant_id="acme"
+            )
+
+        self.assertIs(result, spec)
+        warnings = [r for r in captured.records if r.levelno >= logging.WARNING]
+        self.assertEqual(warnings, [])
+
+    def test_no_bump_when_ml_preload_off(self) -> None:
+        spec = _proxy_spec(
+            env={"LLMTRACE_ML_ENABLED": "1", "LLMTRACE_ML_PRELOAD": "0"},
+            startup_timeout_seconds=300,
+        )
+        with self.assertLogs(lifecycle.LOGGER, level=logging.DEBUG) as captured:
+            lifecycle.LOGGER.debug("anchor")
+            result = lifecycle._apply_ml_preload_startup_floor(
+                spec, tenant_id="acme"
+            )
+
+        # Spec untouched, no warning — even though startup_timeout is well
+        # below the floor (caller's choice when preload is off).
+        self.assertIs(result, spec)
+        self.assertEqual(result.startup_timeout_seconds, 300)
+        warnings = [r for r in captured.records if r.levelno >= logging.WARNING]
+        self.assertEqual(warnings, [])
+
+    def test_no_bump_when_ml_disabled(self) -> None:
+        spec = _proxy_spec(
+            env={"LLMTRACE_ML_ENABLED": "0", "LLMTRACE_ML_PRELOAD": "1"},
+            startup_timeout_seconds=300,
+        )
+        with self.assertLogs(lifecycle.LOGGER, level=logging.DEBUG) as captured:
+            lifecycle.LOGGER.debug("anchor")
+            result = lifecycle._apply_ml_preload_startup_floor(
+                spec, tenant_id="acme"
+            )
+
+        self.assertIs(result, spec)
+        self.assertEqual(result.startup_timeout_seconds, 300)
+        warnings = [r for r in captured.records if r.levelno >= logging.WARNING]
+        self.assertEqual(warnings, [])
+
+    def test_no_bump_when_both_flags_absent(self) -> None:
+        spec = _proxy_spec(env={}, startup_timeout_seconds=120)
+        with self.assertLogs(lifecycle.LOGGER, level=logging.DEBUG) as captured:
+            lifecycle.LOGGER.debug("anchor")
+            result = lifecycle._apply_ml_preload_startup_floor(
+                spec, tenant_id="acme"
+            )
+
+        self.assertIs(result, spec)
+        self.assertEqual(result.startup_timeout_seconds, 120)
+        warnings = [r for r in captured.records if r.levelno >= logging.WARNING]
+        self.assertEqual(warnings, [])
+
+    def test_custom_floor_override(self) -> None:
+        spec = _proxy_spec(
+            env={"LLMTRACE_ML_ENABLED": "1", "LLMTRACE_ML_PRELOAD": "1"},
+            startup_timeout_seconds=100,
+        )
+        with self.assertLogs(lifecycle.LOGGER, level=logging.WARNING):
+            bumped = lifecycle._apply_ml_preload_startup_floor(
+                spec, tenant_id="acme", floor=2400
+            )
+
+        self.assertEqual(bumped.startup_timeout_seconds, 2400)
+
+    def test_truthy_variants_recognised(self) -> None:
+        # Any value in the truthy set on BOTH flags triggers the bump.
+        for value in ("1", "true", "TRUE", "True", "yes", "YES"):
+            with self.subTest(value=value):
+                spec = _proxy_spec(
+                    env={
+                        "LLMTRACE_ML_ENABLED": value,
+                        "LLMTRACE_ML_PRELOAD": value,
+                    },
+                    startup_timeout_seconds=600,
+                )
+                bumped = lifecycle._apply_ml_preload_startup_floor(
+                    spec, tenant_id="acme"
+                )
+                self.assertEqual(
+                    bumped.startup_timeout_seconds,
+                    lifecycle.ML_PRELOAD_STARTUP_FLOOR_SECONDS,
+                )
+
+    def test_falsy_variants_do_not_trigger(self) -> None:
+        for value in ("0", "false", "no", "", "off", "FALSE"):
+            with self.subTest(value=value):
+                spec = _proxy_spec(
+                    env={
+                        "LLMTRACE_ML_ENABLED": "1",
+                        "LLMTRACE_ML_PRELOAD": value,
+                    },
+                    startup_timeout_seconds=300,
+                )
+                result = lifecycle._apply_ml_preload_startup_floor(
+                    spec, tenant_id="acme"
+                )
+                self.assertIs(result, spec)
+
+
+if __name__ == "__main__":
+    unittest.main()