fix: limit GLM-5 max running requests and update sglang image

Evrard-Nil · claude · Evrard-Nil · commit 6f44b7e2ee55 · 2026-02-28T10:44:36.000+01:00
Add --max-running-requests 16 to prevent the server from hanging under load
(the EAGLE speculative decoding default of 48 is too aggressive with
--mem-fraction-static 0.90).
Update sglang image from glm5-hopper to glm5-hopper-patched (Feb 25).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/GLM-5.yaml b/GLM-5.yaml
@@ -65,7 +65,7 @@ services:
 
   glm:
     <<: *vllm-common
-    image: lmsysorg/sglang:glm5-hopper@sha256:e1876a9b43494fa8e0205f420db71e0e263081ed6da7173b30647d238a429bac
+    image: lmsysorg/sglang:glm5-hopper-patched@sha256:abf8deb5e81cd7f942be8be10b1a92d4360d2f0a245b50ca8d9e27e9c05a98d6
     container_name: glm
     command: >
       sglang serve
@@ -79,6 +79,7 @@ services:
       --speculative-eagle-topk 1
       --speculative-num-draft-tokens 4
       --mem-fraction-static 0.90
+      --max-running-requests 16
       --port 8000
       --host 0.0.0.0
       --enable-cache-report