perf: add DeepGEMM cache, multithreaded loading, and context length limit

Evrard-Nil · claude · Evrard-Nil · commit 4d945fd5fc2c · 2026-03-01T10:43:24.000+01:00
- Mount deepgemm_cache volume to persist JIT-compiled kernels across restarts
- Add --model-loader-extra-config for multithreaded model loading (64 threads)
- Set --context-length 202000 to avoid the EAGLE off-by-two crash near the max position embeddings limit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/GLM-5.yaml b/GLM-5.yaml
@@ -83,11 +83,14 @@ services:
       --speculative-num-draft-tokens 4
       --mem-fraction-static 0.90
       --max-running-requests 16
+      --context-length 202000
+      --model-loader-extra-config '{"enable_multithread_load": "true", "num_threads": 64}'
       --port 8000
       --host 0.0.0.0
       --enable-cache-report
     volumes:
       - hugginface_cache:/root/.cache/huggingface
+      - deepgemm_cache:/root/.deep_gemm
     environment:
       - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
       - NVIDIA_DRIVER_CAPABILITIES=compute,utility
@@ -111,6 +114,7 @@ networks:
 
 volumes:
   hugginface_cache:
+  deepgemm_cache:
   certs:
     external: true
     name: certs