fix(setup): start genie-ai-runtime before memory-heavy services (#76)

carlos4s · web-flow · commit 9ab2974e6b0a · 2026-05-18T08:20:23.000+02:00
Fixes #75. Issue evidence on Jetson Orin Nano 8 GB shows the runtime's auto-clamp behavior is order-sensitive: the same `-c 4096` request fits only ~1.7k context when the full stack is already resident, but fits 4k / 6k / 8k cleanly when `genie-ai-runtime` loads first. This PR pins the right startup order so the runtime claims its KV cache before memory-heavy services occupy DRAM. Three coordinated changes: 1. Adds `Before=genie-whisper.service genie-whisper-warmup.service homeassistant.service genie-core.service` to `genie-ai-runtime.service`. This is a systemd ordering directive that starts the LLM unit ahead of the other memory-heavy services. Note that the unit is `Type=simple`, so systemd considers it started as soon as the process is spawned; the ordering gives the runtime a head start on claiming memory rather than a hard guarantee that its model load has completed. Combined with PR #72's existing `After=genie-ai-runtime.service genie-llm.service` on `genie-core`, the same ordering is now declared on both sides. `Before=` is a no-op for units that aren't installed on this host, so this is safe for installs that don't ship homeassistant or whisper. 2. `GENIEPOD_AI_RUNTIME_CONTEXT` default bumped from `2048` to `8192` — the largest context the issue verified loads cleanly with `--int8-kv` on Orin Nano 8 GB. The env knob stays settable via systemd drop-in for smaller Jetsons. 3. `deploy/scripts/start_all.sh` reorders the `UNITS=(...)` array so the configured LLM unit + warmup run before `homeassistant`, `genie-whisper`, `genie-whisper-warmup` in the manual lifecycle path too, mirroring the systemd `Before=`. Tests added to `tool_dispatch_test.rs` lock both invariants: - `start_all_uses_configured_llm_backend` asserts `$configured_llm_unit` appears before `homeassistant.service` and `genie-whisper.service` in the `UNITS=` array. - `genie_ai_runtime_service_preserves_model_page_cache` asserts `GENIEPOD_AI_RUNTIME_CONTEXT=8192` and that the new `Before=` clause is present. Compatibility with PR #70 (warm page cache across restart) is preserved — `Before=` only orders units that are started in the same transaction (e.g. at boot), so it does not affect `systemctl restart genie-ai-runtime` alone. End-user verified on the same Jetson the issue was filed against.
Worth a follow-up: PR #74's `GENIE_RUNTIME_MAX_BODY_BYTES = 4 KB` body-compaction threshold is now leaving performance on the table at the new 8192-token runtime context (the client compacts prompts the runtime could now handle). The right path is to make the threshold a function of `GENIEPOD_AI_RUNTIME_CONTEXT` or to probe runtime capacity at connection time. Not blocking this PR. All 7 CI checks green on `c1cae29` (fmt, clippy, test, aarch64 cross-compile, shellcheck, ruff, `--no-default-features`).
diff --git a/crates/genie-core/tests/tool_dispatch_test.rs b/crates/genie-core/tests/tool_dispatch_test.rs
@@ -264,6 +264,24 @@ fn start_all_uses_configured_llm_backend() {
         contents.contains("is_warmup_unit") && contents.contains("start --no-block"),
         "start_all should queue warmup units without blocking the lifecycle script"
     );
+    let units = contents
+        .split("UNITS=(")
+        .nth(1)
+        .and_then(|s| s.split(")").next())
+        .expect("start_all should declare ordered units");
+    let llm_pos = units
+        .find("\"$configured_llm_unit\"")
+        .expect("start_all should include the configured LLM unit");
+    let homeassistant_pos = units
+        .find("homeassistant.service")
+        .expect("start_all should include Home Assistant");
+    let whisper_pos = units
+        .find("genie-whisper.service")
+        .expect("start_all should include Whisper");
+    assert!(
+        llm_pos < homeassistant_pos && llm_pos < whisper_pos,
+        "start_all should start the configured LLM before memory-heavy services"
+    );
 }
 
 /// Verify genie-ai-runtime service preserves warm GGUF pages across restarts.
@@ -285,8 +303,14 @@ fn genie_ai_runtime_service_preserves_model_page_cache() {
         "genie-ai-runtime.service should use INT8 KV to fit enough context under memory pressure"
     );
     assert!(
-        contents.contains("GENIEPOD_AI_RUNTIME_CONTEXT=2048"),
-        "genie-ai-runtime.service should request the GenieClaw web-chat context size"
+        contents.contains("GENIEPOD_AI_RUNTIME_CONTEXT=8192"),
+        "genie-ai-runtime.service should request the Jetson-tested 8k context size"
+    );
+    assert!(
+        contents.contains(
+            "Before=genie-whisper.service genie-whisper-warmup.service homeassistant.service genie-core.service"
+        ),
+        "genie-ai-runtime.service should reserve KV cache before memory-heavy services"
     );
 }
 
diff --git a/deploy/scripts/start_all.sh b/deploy/scripts/start_all.sh
@@ -118,12 +118,12 @@ configured_llm_unit="$(normalize_unit "$raw_llm_unit")"
 configured_warmup_unit="$(warmup_unit_for "$configured_llm_unit")"
 
 UNITS=(
-    homeassistant.service
     genie-audio.service
-    genie-whisper.service
-    genie-whisper-warmup.service
     "$configured_llm_unit"
     "$configured_warmup_unit"
+    homeassistant.service
+    genie-whisper.service
+    genie-whisper-warmup.service
     genie-core.service
     genie-governor.service
     genie-health.service
diff --git a/deploy/systemd/genie-ai-runtime.service b/deploy/systemd/genie-ai-runtime.service
@@ -2,6 +2,11 @@
 Description=GeniePod AI Runtime (Jetson-tuned LLM, OpenAI-compatible)
 Documentation=https://github.com/GeniePod/genie-ai-runtime
 After=network.target
+# Claim the LLM KV cache before memory-heavy voice/container services start.
+# Jetson testing for issue #75 showed the same `-c 4096` request fitting only
+# ~1.7k ctx after the full stack was resident, but fitting 4k/6k/8k ctx when
+# genie-ai-runtime loaded first.
+Before=genie-whisper.service genie-whisper-warmup.service homeassistant.service genie-core.service
 ConditionPathExists=/opt/geniepod/bin/jetson-llm-server
 # Conflicts with genie-llm.service: both bind :8080. systemd will refuse
 # to start the second one while the first is running, so a misconfigured
@@ -12,17 +17,16 @@ Conflicts=genie-llm.service
 Type=simple
 # Keep the GGUF in page cache across restarts when the kernel can. Clearing
 # VM caches here made every runtime restart cold-load Qwen3 again (issue #69).
-# Use INT8 KV so the Jetson service reliably gets enough context for
-# GenieClaw's web prompt even under memory pressure. `-c` is still clamped
-# by runtime memory budget, but INT8 KV roughly doubles the fitted context
-# versus the server's FP16 default.
+# Use INT8 KV so the Jetson service can reserve an 8k context on Orin Nano
+# when systemd starts it before memory-heavy services. `-c` is still clamped
+# by runtime memory budget, so boot/start ordering matters.
 ExecStart=/opt/geniepod/bin/jetson-llm-server \
     -m ${GENIEPOD_LLM_MODEL} \
     -p 8080 \
     -c ${GENIEPOD_AI_RUNTIME_CONTEXT} \
     --int8-kv
 Environment=GENIEPOD_LLM_MODEL=/opt/geniepod/models/Qwen3-4B-Q4_K_M.gguf
-Environment=GENIEPOD_AI_RUNTIME_CONTEXT=2048
+Environment=GENIEPOD_AI_RUNTIME_CONTEXT=8192
 Restart=on-failure
 RestartSec=5
 TimeoutStartSec=120