fix llama-cpp: add `--fit on` arg, fix GGML comment, update qwen3.5 preset with override-tensor

Copilot · mitchross · Copilot · commit 9ddc6b97d954 · 2026-02-23T16:47:44.000Z
Co-authored-by: mitchross <6330506+mitchross@users.noreply.github.com>
diff --git a/my-apps/ai/llama-cpp/configmap.yaml b/my-apps/ai/llama-cpp/configmap.yaml
@@ -54,18 +54,18 @@ data:
     # 397B total / 17B active (MoE) - Unsloth Dynamic Q4_K_XL
     # WARNING: ~5-15 tok/s due to cpu-moe offloading. Quality over speed.
     # Natively multimodal (vision + language), 256K context native
-    # cpu-moe keeps attention on GPU, experts on CPU - MUCH faster than
+    # override-tensor keeps attention on GPU, experts on CPU - MUCH faster than
     # unified memory swapping (targeted offload vs indiscriminate CUDA paging)
     model = /models/UD-Q4_K_XL/Qwen3.5-397B-A17B-UD-Q4_K_XL-00001-of-00006.gguf
     alias = qwen3.5, qwen 3.5, general, experimental slow
     ctx-size = 32768
     n-gpu-layers = 99
     tensor-split = 1,1
+    override-tensor = exps=CPU
     cache-type-k = q8_0
     cache-type-v = q4_0
-    cpu-moe = 1
-    temp = 0.6
+    temp = 0.7
     top-p = 0.95
-    top-k = 20
+    top-k = 40
     min-p = 0.0
     jinja = 1
diff --git a/my-apps/ai/llama-cpp/deployment.yaml b/my-apps/ai/llama-cpp/deployment.yaml
@@ -48,6 +48,8 @@ spec:
             - "-fa"
             - "on"        # Explicitly set to 'on' so --jinja is read correctly
             - "--jinja"
+            - "--fit"           # Auto-fit dense layers to available VRAM
+            - "on"
             - "--no-mmap"       # Prevent page fault stalls - we have 400GB RAM to spare
             - "-b"
             - "4096"            # Larger logical batch for faster prompt processing
@@ -67,7 +69,7 @@ spec:
             - name: NVIDIA_DRIVER_CAPABILITIES
               value: "compute,utility"
             - name: GGML_CUDA_ENABLE_UNIFIED_MEMORY
-              value: "1" # Vital for Kimi-K2 1T model to bridge VRAM and 400GB RAM
+              value: "1" # Lets CUDA allocations spill into the 400GB system RAM when VRAM runs out (safety net; Qwen3.5-397B MoE experts are placed on CPU by override-tensor)
             - name: GGML_CUDA_PEER_MAX_BATCH_SIZE
               value: "128"
             - name: CUDA_SCALE_LAUNCH_QUEUES