fix: disable EAGLE3 speculative decoding for gpt-oss-120b

Evrard-Nil · Evrard-Nil · commit dfcbb3279f10 · 2026-03-10T12:51:06.000+01:00
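Remove the --speculative-config flag from the shared gpt-oss vLLM
arguments (x-gpt-oss-common). Streaming responses were consistently
dropping the last 1-2 tokens due to an EAGLE3 bug in vLLM v0.12.0.
Non-streaming responses were unaffected.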
diff --git a/small-models.yaml b/small-models.yaml
@@ -74,7 +74,6 @@ x-gpt-oss-common: &gpt-oss-common
       --enable-auto-tool-choice
       --max-model-len 128K
       --max-num-batched-tokens 8192
-      --speculative-config '{"model":"nvidia/gpt-oss-120b-Eagle3-v2","num_speculative_tokens":3,"method":"eagle3","draft_tensor_parallel_size":1}'
       --load-format runai_streamer
       --model-loader-extra-config '{"distributed":true, "concurrency":48}'
   volumes: