-
Notifications
You must be signed in to change notification settings - Fork 2.2k
[TRTLLM-11210][feat] Allow different TP config for draft/target models #11838
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -438,16 +438,19 @@ def __init__( | |
| assert self.mapping.has_cp_helix( | ||
| ), f"CP type must be HELIX for Attention, but got {self.mapping.cp_config['cp_type']}." | ||
|
|
||
| mapping = Mapping( | ||
| world_size=dp_size * tp_size * pp_size * cp_size, | ||
| tp_size=tp_size, | ||
| pp_size=pp_size * dp_size, | ||
| cp_size=cp_size, | ||
| cp_config=self.mapping.cp_config, | ||
| rank=self.mapping.rank, | ||
| gpus_per_node=self.mapping.gpus_per_node, | ||
| enable_attention_dp=self.mapping.enable_attention_dp, | ||
| ) | ||
| if dp_size == 1 and cp_size == 1: | ||
| mapping = self.mapping | ||
| else: | ||
| mapping = Mapping( | ||
| world_size=dp_size * tp_size * pp_size * cp_size, | ||
| tp_size=tp_size, | ||
| pp_size=pp_size * dp_size, | ||
| cp_size=cp_size, | ||
| cp_config=self.mapping.cp_config, | ||
| rank=self.mapping.rank, | ||
| gpus_per_node=self.mapping.gpus_per_node, | ||
| enable_attention_dp=self.mapping.enable_attention_dp, | ||
| ) | ||
|
Comment on lines
+441
to
+453
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Preserve the draft mapping in the CP/attention-DP branches. The fast path now keeps `self.mapping`. Also applies to: 501-512, 1108-1120, 1216-1227. 🤖 Prompt for AI Agents |
||
| self.tp_size = tp_size | ||
| self.cp_size = cp_size | ||
| self.tp_rank = mapping.tp_rank | ||
|
|
@@ -495,15 +498,18 @@ def __init__( | |
|
|
||
| # For Helix CP, combine TP and CP for the output projection so each | ||
| # rank's o_proj input is num_heads_tp_cp * head_dim. | ||
| mapping_o = Mapping( | ||
| world_size=dp_size * tp_size * pp_size * cp_size, | ||
| tp_size=tp_size * cp_size, | ||
| pp_size=pp_size * dp_size, | ||
| cp_size=1, | ||
| rank=self.mapping.rank, | ||
| gpus_per_node=self.mapping.gpus_per_node, | ||
| enable_attention_dp=self.mapping.enable_attention_dp, | ||
| ) | ||
| if dp_size == 1 and cp_size == 1: | ||
| mapping_o = self.mapping | ||
| else: | ||
| mapping_o = Mapping( | ||
| world_size=dp_size * tp_size * pp_size * cp_size, | ||
| tp_size=tp_size * cp_size, | ||
| pp_size=pp_size * dp_size, | ||
| cp_size=1, | ||
| rank=self.mapping.rank, | ||
| gpus_per_node=self.mapping.gpus_per_node, | ||
| enable_attention_dp=self.mapping.enable_attention_dp, | ||
| ) | ||
| self.mapping_o = mapping_o | ||
|
|
||
| self.o_proj = Linear( | ||
|
|
@@ -1099,16 +1105,19 @@ def __init__( | |
| assert self.mapping.has_cp_helix( | ||
| ), f"CP type must be HELIX for MLA, but got {self.mapping.cp_config['cp_type']}." | ||
|
|
||
| mapping = Mapping( | ||
| world_size=pp_size * dp_size * tp_size * cp_size, | ||
| tp_size=tp_size, | ||
| pp_size=pp_size * dp_size, | ||
| cp_size=cp_size, | ||
| cp_config=self.mapping.cp_config, | ||
| rank=self.mapping.rank, | ||
| gpus_per_node=self.mapping.gpus_per_node, | ||
| enable_attention_dp=self.mapping.enable_attention_dp, | ||
| ) | ||
| if dp_size == 1 and cp_size == 1: | ||
| mapping = self.mapping | ||
| else: | ||
| mapping = Mapping( | ||
| world_size=pp_size * dp_size * tp_size * cp_size, | ||
| tp_size=tp_size, | ||
| pp_size=pp_size * dp_size, | ||
| cp_size=cp_size, | ||
| cp_config=self.mapping.cp_config, | ||
| rank=self.mapping.rank, | ||
| gpus_per_node=self.mapping.gpus_per_node, | ||
| enable_attention_dp=self.mapping.enable_attention_dp, | ||
| ) | ||
|
|
||
| assert self.num_heads % (tp_size * cp_size) == 0 | ||
| self.num_heads_tp = self.num_heads // tp_size | ||
|
|
@@ -1204,15 +1213,18 @@ def __init__( | |
| requires_grad=False, | ||
| ) | ||
|
|
||
| mapping_o = Mapping( | ||
| world_size=pp_size * dp_size * tp_size * cp_size, | ||
| tp_size=tp_size * cp_size, | ||
| pp_size=pp_size * dp_size, | ||
| cp_size=1, | ||
| rank=self.mapping.rank, | ||
| gpus_per_node=self.mapping.gpus_per_node, | ||
| enable_attention_dp=self.mapping.enable_attention_dp, | ||
| ) | ||
| if dp_size == 1 and cp_size == 1: | ||
| mapping_o = self.mapping | ||
| else: | ||
| mapping_o = Mapping( | ||
| world_size=pp_size * dp_size * tp_size * cp_size, | ||
| tp_size=tp_size * cp_size, | ||
| pp_size=pp_size * dp_size, | ||
| cp_size=1, | ||
| rank=self.mapping.rank, | ||
| gpus_per_node=self.mapping.gpus_per_node, | ||
| enable_attention_dp=self.mapping.enable_attention_dp, | ||
| ) | ||
| self.mapping_o = mapping_o | ||
| self.o_proj = Linear( | ||
| self.num_key_value_heads * self.v_head_dim, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The synthetic MTP draft config still carries the target layer count.
`replace(model_config, mapping=draft_mapping)` only swaps the mapping. The cloned config still has `pretrained_config.num_hidden_layers == target_num_layers`, but `tensorrt_llm/_torch/pyexecutor/_util.py` now sizes the separate draft KV cache from `effective_draft_config`. In MTP one-model mode that makes KV estimation count all target layers instead of just the draft layers, which can drastically shrink `max_tokens` or raise false OOMs for the new `draft_tp_size` path. Please either synthesize a draft config with the draft layer count, or keep the layer count explicit in the KV-size path. 🤖 Prompt for AI Agents