Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion vllm_ascend/spec_decode/eagle_proposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def load_model(self, model: nn.Module) -> None:
def dummy_run(self,
num_tokens: int,
with_prefill: bool = False,
- skip_attn: bool = False,
+ in_graph_capturing: bool = False,
num_reqs: int = 0,
num_tokens_across_dp: Optional[torch.Tensor] = None,
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
Expand Down
2 changes: 1 addition & 1 deletion vllm_ascend/spec_decode/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def load_model(self, model):
def dummy_run(self,
num_tokens: int,
with_prefill: bool = False,
- skip_attn: bool = False,
+ in_graph_capturing: bool = False,
num_reqs: int = 0,
num_tokens_across_dp: Optional[torch.Tensor] = None,
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
Expand Down
8 changes: 3 additions & 5 deletions vllm_ascend/spec_decode/mtp_proposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def load_model(self, model) -> None:
def dummy_run(self,
num_tokens: int,
with_prefill: bool = False,
- skip_attn: bool = False,
+ in_graph_capturing: bool = False,
num_reqs: int = 0,
num_tokens_across_dp=None,
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
Expand All @@ -251,9 +251,7 @@ def dummy_run(self,
moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type
== MoECommType.FUSED_ALLTOALL else moe_comm_type)

- if skip_attn:
- attn_metadata = None
- elif aclgraph_runtime_mode == CUDAGraphMode.FULL:
+ if aclgraph_runtime_mode == CUDAGraphMode.FULL:
if len(self.runner.attn_groups) > 0:
num_computed_tokens_cpu = (
self.runner.input_batch.
Expand Down Expand Up @@ -298,7 +296,7 @@ def dummy_run(self,
positions = self.positions[:num_tokens]
previous_hidden_states = self.hidden_states[:num_tokens]
for i in range(self.num_speculative_tokens):
- if i > 0 and not skip_attn and aclgraph_runtime_mode == CUDAGraphMode.FULL:
+ if i > 0 and not in_graph_capturing and aclgraph_runtime_mode == CUDAGraphMode.FULL:
aclgraph_runtime_mode = CUDAGraphMode.NONE
with set_ascend_forward_context(
attn_metadata,
Expand Down
2 changes: 1 addition & 1 deletion vllm_ascend/spec_decode/ngram_proposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def load_model(self, *args, **kwargs):
def dummy_run(self,
num_tokens,
with_prefill=None,
- skip_attn=None,
+ in_graph_capturing=None,
num_reqs=None,
num_tokens_across_dp=None,
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
Expand Down
2 changes: 1 addition & 1 deletion vllm_ascend/spec_decode/suffix_proposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def load_model(self, *args, **kwargs):
def dummy_run(self,
num_tokens,
with_prefill=None,
- skip_attn=None,
+ in_graph_capturing=None,
num_reqs=None,
num_tokens_across_dp=None,
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
Expand Down
2 changes: 1 addition & 1 deletion vllm_ascend/worker/model_runner_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -2296,7 +2296,7 @@ def dummy_drafter_compute_logits(hidden_states):
aclgraph_runtime_mode=aclgraph_runtime_mode,
batch_descriptor=batch_descriptor,
dummy_compute_logits=dummy_drafter_compute_logits,
- skip_attn=not force_attention)
+ in_graph_capturing=not force_attention)
if self.in_profile_run and self.dynamic_eplb:
self.model.clear_all_moe_loads()
if not self.in_profile_run and self.dynamic_eplb:
Expand Down
Loading