# [Model Runner V2] Introduce num_tokens_for_attn #36815
Base: `main`
**`vllm/v1/worker/gpu/cudagraph_utils.py`**

```diff
@@ -34,6 +34,7 @@
     cg_mode: CUDAGraphMode
     num_tokens: int
+    num_tokens_for_attn: int | None
     num_reqs: int | None  # None means no request padding is needed (PIECEWISE graphs)
     uniform_token_count: int | None = None
```
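A review comment at the end of this page asks for more documentation on these fields. For context, here is a minimal sketch of how the descriptor could be annotated; the field names and types come from the diff, while the enum stand-in and the comments are an interpretation of how the fields are used below, not the actual vLLM source:

```python
from dataclasses import dataclass
from enum import Enum


class CUDAGraphMode(Enum):
    # Stand-in for vLLM's real CUDAGraphMode enum, reduced to the modes
    # that appear in this diff.
    NONE = 0
    PIECEWISE = 1
    FULL = 2


@dataclass
class BatchExecutionDescriptor:
    cg_mode: CUDAGraphMode
    # Padded token count the batch is executed (and the graph was captured) with.
    num_tokens: int
    # Token count the attention kernels should treat as real work. None means
    # "fall back to the actual number of scheduled tokens" (PIECEWISE and
    # eager/NONE paths); for uniform decode graphs it is
    # num_reqs * decode_query_len, which can be smaller than num_tokens.
    num_tokens_for_attn: int | None
    # None means no request padding is needed (PIECEWISE graphs).
    num_reqs: int | None
    uniform_token_count: int | None = None
```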
```diff
@@ -120,27 +121,31 @@
             and decode_mode
             and self.decode_query_len <= num_tokens <= max_decode_tokens
         ):
+            num_reqs = num_tokens // self.decode_query_len
             desc = BatchExecutionDescriptor(
                 cg_mode=decode_mode,
                 num_tokens=num_tokens,
-                num_reqs=num_tokens // self.decode_query_len,
+                num_tokens_for_attn=num_reqs * self.decode_query_len,
+                num_reqs=num_reqs,
                 uniform_token_count=self.decode_query_len,
             )
             descs_by_mode[decode_mode].append(desc)
             descs_by_token_count[num_tokens].append(desc)

         if mixed_mode:
-            # for PIECEWISE graphs there is no limit on requests when replaying
-            # i.e. no request padding is needed
-            # so we leave it as None
-            num_reqs = (
-                min(num_tokens, self.max_num_reqs)
-                if mixed_mode == CUDAGraphMode.FULL
-                else None
-            )
+            if mixed_mode == CUDAGraphMode.FULL:
+                num_reqs = min(num_tokens, self.max_num_reqs)
+                num_tokens_for_attn = num_tokens
+            else:
+                # for PIECEWISE graphs there is no limit on requests when replaying
+                # i.e. no request padding is needed
+                # so we leave it as None
+                num_reqs = None
+                num_tokens_for_attn = None
             desc = BatchExecutionDescriptor(
                 cg_mode=mixed_mode,
                 num_tokens=num_tokens,
+                num_tokens_for_attn=num_tokens_for_attn,
                 num_reqs=num_reqs,
             )
             descs_by_mode[mixed_mode].append(desc)
```

> **Review comment** (Member, on lines +141 to +142): save a line?
>
> Suggested change

> Check failure on line 143 in `vllm/v1/worker/gpu/cudagraph_utils.py`
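Why can `num_tokens_for_attn` differ from `num_tokens` in the uniform-decode branch? The padded capture size need not be an exact multiple of `decode_query_len`, so the trailing token slots belong to no request and are pure padding. A quick illustration with hypothetical numbers (e.g. speculative decoding with two draft tokens per request):

```python
# Hypothetical values, not taken from the PR.
decode_query_len = 3   # e.g. 1 target token + 2 draft tokens per request
num_tokens = 512       # padded capture size of the CUDA graph

num_reqs = num_tokens // decode_query_len          # 170 whole requests fit
num_tokens_for_attn = num_reqs * decode_query_len  # 510 tokens carry real work

assert num_reqs == 170
assert num_tokens_for_attn == 510  # the trailing 2 token slots are padding
```

In the mixed FULL branch, by contrast, attention covers the whole padded batch (`num_tokens_for_attn = num_tokens`), while for PIECEWISE graphs both `num_reqs` and `num_tokens_for_attn` stay `None`, since attention runs outside the captured graph there.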
```diff
@@ -233,7 +238,10 @@
             if _is_compatible(desc, num_reqs, num_tokens, uniform_token_count):
                 return desc
         return BatchExecutionDescriptor(
-            cg_mode=CUDAGraphMode.NONE, num_tokens=num_tokens, num_reqs=num_reqs
+            cg_mode=CUDAGraphMode.NONE,
+            num_tokens=num_tokens,
+            num_tokens_for_attn=None,
+            num_reqs=num_reqs,
         )

     def run_fullgraph(self, desc: BatchExecutionDescriptor):
```
**`prepare_inputs` (GPU model runner):**

```diff
@@ -599,8 +599,12 @@ def prepare_inputs(
         self, scheduler_output: SchedulerOutput, batch_desc: BatchExecutionDescriptor
     ) -> InputBatch:
         num_tokens = scheduler_output.total_num_scheduled_tokens
-        num_tokens_after_padding = batch_desc.num_tokens
         assert num_tokens > 0
+        num_tokens_after_padding = batch_desc.num_tokens
+        if batch_desc.num_tokens_for_attn is not None:
+            num_tokens_for_attn = batch_desc.num_tokens_for_attn
+        else:
+            num_tokens_for_attn = num_tokens

         num_tokens_per_req = scheduler_output.num_scheduled_tokens
         num_reqs = len(num_tokens_per_req)
```

> **Review comment** (Member, on lines +604 to +607): could simplify
>
> Suggested change
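The body of that suggestion is not shown above, but a natural way to condense the four-line fallback into one statement (a guess at the spirit of the comment, not the reviewer's actual proposal) is a conditional expression over the same names:

```python
# Equivalent to the if/else above; the explicit `is not None` check is kept
# so that a legitimate value of 0 (if one ever occurred) would not be
# overridden, as a plain `or` would do.
num_tokens_for_attn = (
    batch_desc.num_tokens_for_attn
    if batch_desc.num_tokens_for_attn is not None
    else num_tokens
)
```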
```diff
@@ -721,6 +725,7 @@ def prepare_inputs(
             num_scheduled_tokens=num_scheduled_tokens,
             num_tokens=num_tokens,
             num_tokens_after_padding=num_tokens_after_padding,
+            num_tokens_for_attn=num_tokens_for_attn,
             num_draft_tokens=total_num_draft_tokens,
             query_start_loc=query_start_loc,
             query_start_loc_np=query_start_loc_np,
```
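Taken together, the two files give `num_tokens_for_attn` a simple contract: the descriptor may record an attention-specific token count, and `prepare_inputs` falls back to the real scheduled count when it doesn't. A self-contained sketch of the three cases with hypothetical numbers (only the fallback rule itself is taken from the diff):

```python
# (mode, num_tokens, num_tokens_for_attn) — numbers are hypothetical.
cases = [
    ("FULL uniform decode", 512, 510),   # whole requests only; tail is padding
    ("FULL mixed",          512, 512),   # attention sees the entire padded batch
    ("PIECEWISE / eager",   512, None),  # no override recorded in the descriptor
]

num_scheduled = 500  # actual scheduled tokens before padding, hypothetical

for mode, num_tokens, num_tokens_for_attn in cases:
    # Mirrors the fallback in prepare_inputs above.
    effective = (
        num_tokens_for_attn if num_tokens_for_attn is not None else num_scheduled
    )
    print(f"{mode}: attention runs over {effective} of {num_tokens} padded tokens")
```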
> **Review comment:** I think we should add more doc to these fields, including the meaning of `None` for the other ones too.