[docker] fix sglang upgrade bug (#1639)

zhuzilin · web-flow · commit bd70add6ab27 · 2026-02-27T14:16:13.000+08:00
diff --git a/docker/patch/latest/sglang.patch b/docker/patch/latest/sglang.patch
@@ -589,7 +589,7 @@ index 1cdf65b91..4783cd18f 100644
      buf_numel_per_page: tl.constexpr,
      index_head_dim: tl.constexpr,
 diff --git a/python/sglang/srt/layers/attention/nsa/nsa_indexer.py b/python/sglang/srt/layers/attention/nsa/nsa_indexer.py
-index ca54a931b..258407c71 100644
+index ca54a931b..961d5f62a 100644
 --- a/python/sglang/srt/layers/attention/nsa/nsa_indexer.py
 +++ b/python/sglang/srt/layers/attention/nsa/nsa_indexer.py
 @@ -4,6 +4,7 @@ import contextlib
@@ -612,7 +612,17 @@ index ca54a931b..258407c71 100644
              device=get_global_server_args().device,
          )
          self.block_size = block_size
-@@ -982,6 +986,9 @@ class Indexer(MultiPlatformOp):
+@@ -244,6 +248,9 @@ class Indexer(MultiPlatformOp):
+             x = x.to(self.weights_proj.weight.dtype)
+         weights, _ = self.weights_proj(x)
+         weights = weights.float()
++        if weights.shape[1] < q_scale.shape[1]:
++            assert q_scale.shape[1] % weights.shape[1] == 0
++            weights = weights.repeat_interleave(q_scale.shape[1] // weights.shape[1], dim=1)
+         weights = weights * self.n_heads**-0.5
+         weights = weights.unsqueeze(-1) * q_scale * self.softmax_scale
+         return weights
+@@ -982,15 +989,24 @@ class Indexer(MultiPlatformOp):
              query, key = self._get_q_k_bf16(
                  q_lora, x, positions, enable_dual_stream, forward_batch=forward_batch
              )
@@ -622,7 +632,12 @@ index ca54a931b..258407c71 100644
              q_fp8, q_scale = act_quant(query, self.block_size, self.scale_fmt)
              with torch.cuda.stream(self.alt_stream):
                  k_fp8, k_scale = act_quant(key, self.block_size, self.scale_fmt)
-@@ -991,6 +998,9 @@ class Indexer(MultiPlatformOp):
+             current_stream.wait_stream(self.alt_stream)
++            if weights.shape[1] < q_scale.shape[1]:
++                assert q_scale.shape[1] % weights.shape[1] == 0
++                weights = weights.repeat_interleave(q_scale.shape[1] // weights.shape[1], dim=1)
+             weights = weights.unsqueeze(-1) * q_scale * self.softmax_scale
+         else:
              query, key = self._get_q_k_bf16(
                  q_lora, x, positions, enable_dual_stream, forward_batch=forward_batch
              )
@@ -1593,7 +1608,7 @@ index f2ffa9909..6e4d1d460 100644
          self,
          obj: InitWeightsSendGroupForRemoteInstanceReqInput,
 diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
-index 0914a5230..d637041b0 100644
+index 0914a5230..2a5819856 100644
 --- a/python/sglang/srt/managers/tokenizer_manager.py
 +++ b/python/sglang/srt/managers/tokenizer_manager.py
 @@ -324,8 +324,12 @@ class TokenizerManager(TokenizerCommunicatorMixin, TokenizerManagerMultiItemMixi
@@ -1641,6 +1656,24 @@ index 0914a5230..d637041b0 100644
          trace_slice_start(RequestStage.TOKENIZER_DISPATCH, obj.rid)
          tokenized_obj.trace_context = trace_get_proc_propagate_context(obj.rid)
          tokenized_obj = wrap_shm_features(tokenized_obj)
+@@ -1327,7 +1348,7 @@ class TokenizerManager(TokenizerCommunicatorMixin, TokenizerManagerMultiItemMixi
+         async with self.is_pause_cond:
+             self.is_pause = True
+             if obj.mode != "abort":
+-                await self.send_to_scheduler.send_pyobj(obj)
++                self.send_to_scheduler.send_pyobj(obj)
+             else:
+                 # we are using the model_update_lock to check if there is still on-going requests.
+                 while True:
+@@ -1341,7 +1362,7 @@ class TokenizerManager(TokenizerCommunicatorMixin, TokenizerManagerMultiItemMixi
+     async def continue_generation(self, obj: ContinueGenerationReqInput):
+         async with self.is_pause_cond:
+             self.is_pause = False
+-            await self.send_to_scheduler.send_pyobj(obj)
++            self.send_to_scheduler.send_pyobj(obj)
+             self.is_pause_cond.notify_all()
+ 
+     async def update_weights_from_disk(
 diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py
 index 86b009df4..16ebd52ae 100644
 --- a/python/sglang/srt/managers/tp_worker.py
diff --git a/docker/version.txt b/docker/version.txt
@@ -1 +1 @@
-nightly-dev-20260226a
+nightly-dev-20260227a
diff --git a/requirements.txt b/requirements.txt
@@ -4,6 +4,7 @@ datasets
 httpx[http2]
 mcp[cli]
 memray  # needed for debugging (but is lightweight), we can put it to dev mode when using pyproject.toml
+numba
 omegaconf
 pillow
 pylatexenc

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-nightly-dev-20260226a`
	`1`	`+nightly-dev-20260227a`