Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 18 additions & 10 deletions vllm/v1/worker/gpu/cudagraph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@

cg_mode: CUDAGraphMode
num_tokens: int
num_tokens_for_attn: int | None
num_reqs: int | None # None means no request padding is needed (PIECEWISE graphs)
uniform_token_count: int | None = None
Comment on lines 36 to 39
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should add more documentation to these fields, including what `None` means for each of the other optional ones too.


Expand Down Expand Up @@ -120,27 +121,31 @@
and decode_mode
and self.decode_query_len <= num_tokens <= max_decode_tokens
):
num_reqs = num_tokens // self.decode_query_len
desc = BatchExecutionDescriptor(
cg_mode=decode_mode,
num_tokens=num_tokens,
num_reqs=num_tokens // self.decode_query_len,
num_tokens_for_attn=num_reqs * self.decode_query_len,
num_reqs=num_reqs,
uniform_token_count=self.decode_query_len,
)
descs_by_mode[decode_mode].append(desc)
descs_by_token_count[num_tokens].append(desc)

if mixed_mode:
# for PIECEWISE graphs there is no limit on requests when replaying
# i.e. no request padding is needed
# so we leave it as None
num_reqs = (
min(num_tokens, self.max_num_reqs)
if mixed_mode == CUDAGraphMode.FULL
else None
)
if mixed_mode == CUDAGraphMode.FULL:
num_reqs = min(num_tokens, self.max_num_reqs)
num_tokens_for_attn = num_tokens
else:
# for PIECEWISE graphs there is no limit on requests when replaying
# i.e. no request padding is needed
# so we leave it as None
Comment on lines +141 to +142
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we save a line by merging these two comment lines into one?

Suggested change
# i.e. no request padding is needed
# so we leave it as None
# i.e. no request padding is needed, so we leave it as None

num_reqs = None

Check failure on line 143 in vllm/v1/worker/gpu/cudagraph_utils.py

View workflow job for this annotation

GitHub Actions / pre-commit

Incompatible types in assignment (expression has type "None", variable has type "int") [assignment]

Check failure on line 143 in vllm/v1/worker/gpu/cudagraph_utils.py

View workflow job for this annotation

GitHub Actions / pre-commit

Incompatible types in assignment (expression has type "None", variable has type "int") [assignment]

Check failure on line 143 in vllm/v1/worker/gpu/cudagraph_utils.py

View workflow job for this annotation

GitHub Actions / pre-commit

Incompatible types in assignment (expression has type "None", variable has type "int") [assignment]

Check failure on line 143 in vllm/v1/worker/gpu/cudagraph_utils.py

View workflow job for this annotation

GitHub Actions / pre-commit

Incompatible types in assignment (expression has type "None", variable has type "int") [assignment]
num_tokens_for_attn = None
desc = BatchExecutionDescriptor(
cg_mode=mixed_mode,
num_tokens=num_tokens,
num_tokens_for_attn=num_tokens_for_attn,
num_reqs=num_reqs,
)
descs_by_mode[mixed_mode].append(desc)
Expand Down Expand Up @@ -233,7 +238,10 @@
if _is_compatible(desc, num_reqs, num_tokens, uniform_token_count):
return desc
return BatchExecutionDescriptor(
cg_mode=CUDAGraphMode.NONE, num_tokens=num_tokens, num_reqs=num_reqs
cg_mode=CUDAGraphMode.NONE,
num_tokens=num_tokens,
num_tokens_for_attn=None,
num_reqs=num_reqs,
)

def run_fullgraph(self, desc: BatchExecutionDescriptor):
Expand Down
3 changes: 2 additions & 1 deletion vllm/v1/worker/gpu/dp_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def sync_cudagraph_and_dp_padding(

if torch.all(num_tokens_across_dp == 0).item():
synced_desc = BatchExecutionDescriptor(
cg_mode=CUDAGraphMode.NONE, num_tokens=0, num_reqs=0
cg_mode=CUDAGraphMode.NONE, num_tokens=0, num_tokens_for_attn=0, num_reqs=0
)
return synced_desc, None

Expand All @@ -58,6 +58,7 @@ def sync_cudagraph_and_dp_padding(
return BatchExecutionDescriptor(
cg_mode=CUDAGraphMode.NONE,
num_tokens=num_tokens,
num_tokens_for_attn=None,
num_reqs=num_reqs,
), num_tokens_across_dp

Expand Down
2 changes: 2 additions & 0 deletions vllm/v1/worker/gpu/input_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class InputBatch:
# sum(num_scheduled_tokens)
num_tokens: int
num_tokens_after_padding: int
num_tokens_for_attn: int
num_draft_tokens: int

# [num_reqs + 1]
Expand Down Expand Up @@ -132,6 +133,7 @@ def make_dummy(
num_scheduled_tokens=num_scheduled_tokens,
num_tokens=num_tokens,
num_tokens_after_padding=num_tokens,
num_tokens_for_attn=num_tokens,
num_draft_tokens=0,
query_start_loc=query_start_loc,
query_start_loc_np=query_start_loc_np,
Expand Down
7 changes: 6 additions & 1 deletion vllm/v1/worker/gpu/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -599,8 +599,12 @@ def prepare_inputs(
self, scheduler_output: SchedulerOutput, batch_desc: BatchExecutionDescriptor
) -> InputBatch:
num_tokens = scheduler_output.total_num_scheduled_tokens
num_tokens_after_padding = batch_desc.num_tokens
assert num_tokens > 0
num_tokens_after_padding = batch_desc.num_tokens
if batch_desc.num_tokens_for_attn is not None:
num_tokens_for_attn = batch_desc.num_tokens_for_attn
else:
num_tokens_for_attn = num_tokens
Comment on lines +604 to +607
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

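This could be simplified as suggested below. Note that `or` also falls back to `num_tokens` when `num_tokens_for_attn` is `0`, not only when it is `None`, so the two forms are equivalent only if a value of `0` never needs to be preserved here.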

Suggested change
if batch_desc.num_tokens_for_attn is not None:
num_tokens_for_attn = batch_desc.num_tokens_for_attn
else:
num_tokens_for_attn = num_tokens
num_tokens_for_attn = batch_desc.num_tokens_for_attn or num_tokens

num_tokens_per_req = scheduler_output.num_scheduled_tokens
num_reqs = len(num_tokens_per_req)

Expand Down Expand Up @@ -721,6 +725,7 @@ def prepare_inputs(
num_scheduled_tokens=num_scheduled_tokens,
num_tokens=num_tokens,
num_tokens_after_padding=num_tokens_after_padding,
num_tokens_for_attn=num_tokens_for_attn,
num_draft_tokens=total_num_draft_tokens,
query_start_loc=query_start_loc,
query_start_loc_np=query_start_loc_np,
Expand Down
3 changes: 1 addition & 2 deletions vllm/v1/worker/gpu/model_states/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,10 @@ def prepare_attn(
if cudagraph_mode == CUDAGraphMode.FULL:
# Use padded sizes - padding is handled by model_runner.prepare_attn.
num_reqs = input_batch.num_reqs_after_padding
num_tokens = input_batch.num_tokens_after_padding
else:
# For piecewise cudagraphs and eager, use unpadded sizes.
num_reqs = input_batch.num_reqs
num_tokens = input_batch.num_tokens
num_tokens = input_batch.num_tokens_for_attn
query_start_loc_cpu = torch.from_numpy(input_batch.query_start_loc_np)
max_query_len = input_batch.num_scheduled_tokens.max().item()
attn_metadata = build_attn_metadata(
Expand Down
Loading