[BugFix] fix reduce_sampling (vllm-project#9545)

hzx55906 · zzzzzmeng · commit 4a99c90d7cfd · 2026-05-28T15:59:07.000+08:00
### What this PR does / why we need it?

Fix an issue in reduce_sampling where, with speculative sampling enabled, a single curl request causes an error. See vllm-project#8308.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.20.2
- vLLM main: vllm-project/vllm@1ac10f1

---------

Signed-off-by: hzx55906 <513464215@qq.com>
diff --git a/vllm_ascend/ops/triton/reject_sample.py b/vllm_ascend/ops/triton/reject_sample.py
@@ -329,7 +329,7 @@ def sample_recovered_tokens_kernel(
 
             qv = tl.load(q_ptr + req_idx * C + offs, mask=mask, other=1.0).to(tl.float32)
 
-            bad_q = (qv <= 0) | tl.math.isinf(qv)
+            bad_q = (qv <= 0) | (qv != qv) | (qv == float("inf")) | (qv == -float("inf"))
             score = tl.where(bad_q, float("-inf"), prob / qv)
             score = tl.where(mask, score, float("-inf"))
 
diff --git a/vllm_ascend/patch/worker/patch_llama_eagle3.py b/vllm_ascend/patch/worker/patch_llama_eagle3.py
@@ -6,7 +6,7 @@
 def compute_logits(
     self,
     hidden_states: torch.Tensor,
-    enable_reduce_sample: bool = True,
+    enable_reduce_sample: bool = False,
 ) -> torch.Tensor | None:
     if enable_reduce_sample:
         logits = self.logits_processor(self.lm_head, hidden_states)
diff --git a/vllm_ascend/sample/rejection_sampler.py b/vllm_ascend/sample/rejection_sampler.py
@@ -444,7 +444,7 @@ def rejection_sample(
                     global_vocab_size,
                     batch_size,
                     NO_DRAFT_PROBS=draft_probs is None,
-                    enable_reduce_sampling=True,
+                    ENABLE_REDUCE_SAMPLING=True,
                     BLOCK_SIZE=block_size,
                 )
             else:
@@ -482,7 +482,7 @@ def rejection_sample(
                     global_vocab_size,
                     batch_size,
                     NO_DRAFT_PROBS=draft_probs is None,
-                    enable_reduce_sampling=True,
+                    ENABLE_REDUCE_SAMPLING=True,
                     BLOCK_SIZE=block_size,
                 )
             else:
@@ -553,7 +553,7 @@ def rejection_sample(
                     vocab_size,  # global_vocab_size
                     batch_size,
                     NO_DRAFT_PROBS=draft_probs is None,
-                    enable_reduce_sampling=False,
+                    ENABLE_REDUCE_SAMPLING=False,
                     BLOCK_SIZE=block_size,
                 )
             else:
@@ -591,7 +591,7 @@ def rejection_sample(
                     vocab_size,  # global_vocab_size
                     batch_size,
                     NO_DRAFT_PROBS=draft_probs is None,
-                    enable_reduce_sampling=False,
+                    ENABLE_REDUCE_SAMPLING=False,
                     BLOCK_SIZE=block_size,
                 )
             else:
@@ -704,7 +704,7 @@ def sample_recovered_tokens(
             global_vocab_size if global_vocab_size is not None else vocab_size,
             NO_DRAFT_PROBS=draft_probs is None,
             BLOCK_VERIFY=use_block_verify,
-            enable_reduce_sampling=enable_reduce_sampling,
+            ENABLE_REDUCE_SAMPLING=enable_reduce_sampling,
             SUB_BLOCK=512,
             # TODO: enable multibuffer when accuracy problem is solved.
             multibuffer=False,
diff --git a/vllm_ascend/spec_decode/llm_base_proposer.py b/vllm_ascend/spec_decode/llm_base_proposer.py
@@ -958,7 +958,7 @@ def _run_merged_draft(
                 draft_token_ids = draft_token_ids[:num_indices]
                 token_indices_to_sample = token_indices_to_sample[:num_indices]
         else:
-            logits = self.model.compute_logits(sample_hidden_states, get_ascend_config().enable_reduce_sample)
+            logits = self.model.compute_logits(sample_hidden_states)
             if lmhead_tp_enable() and num_indices < logits.shape[0]:
                 logits = logits[:num_indices]
                 token_indices_to_sample = token_indices_to_sample[:num_indices]
@@ -1089,7 +1089,9 @@ def _run_merged_draft(
 
             sample_hidden_states = last_hidden_states[token_indices_to_sample]
             if get_ascend_config().enable_reduce_sample:
-                draft_token_ids = self.model.compute_logits(sample_hidden_states)
+                draft_token_ids = self.model.compute_logits(
+                    sample_hidden_states, get_ascend_config().enable_reduce_sample
+                )
                 if lmhead_tp_enable() and num_indices < draft_token_ids.shape[0]:
                     draft_token_ids = draft_token_ids[:num_indices]
                     token_indices_to_sample = token_indices_to_sample[:num_indices]
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
@@ -2287,7 +2287,7 @@ def _sample(self, logits, spec_decode_metadata):
         if spec_decode_metadata is None:
             if lmhead_tp_enable() and logits is not None:
                 logits = logits[: self.input_batch.num_reqs]
-            if self.input_batch.top_k_cpu is not None and get_ascend_config().enable_reduce_sample:
+            if self.input_batch.sampling_metadata.top_k is not None and get_ascend_config().enable_reduce_sample:
                 max_topk = self.input_batch.top_k_cpu[self.input_batch.top_k_cpu < logits.shape[1]].max()
                 self.sampler.prepare_sampling(max_topk)
             return self.sampler(
@@ -2297,7 +2297,7 @@ def _sample(self, logits, spec_decode_metadata):
 
         if lmhead_tp_enable() and logits is not None:
             logits = logits[: len(spec_decode_metadata.logits_indices)]
-        if self.input_batch.top_k_cpu is not None and get_ascend_config().enable_reduce_sample:
+        if self.input_batch.sampling_metadata.top_k is not None and get_ascend_config().enable_reduce_sample:
             max_topk = self.input_batch.top_k_cpu[self.input_batch.top_k_cpu < logits.shape[1]].max()
             self.rejection_sampler.prepare_sampling(max_topk)
         sampler_output = self.rejection_sampler(