-
Notifications
You must be signed in to change notification settings - Fork 3.7k
QMoE CPU Performance Update (Up to 4x on 4-bit) #27364
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
ed0478a
2c83829
2b25601
e9dcd41
dec8867
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -56,37 +56,63 @@ Status CheckInputs(MoEParameters& parameters, | |||||||||
| const int64_t block_size = 0) { // block size for block-wise quantization | ||||||||||
| // Check dimensions of input to avoid input_dims index out of range. CHECK_TENSOR_SHAPE will verify each tensor later. | ||||||||||
| ASSERT_TENSOR_2D_OR_3D(input); | ||||||||||
| ASSERT_TENSOR_3D(fc1_experts_weights); | ||||||||||
| ASSERT_TENSOR_3D(fc2_experts_weights); | ||||||||||
| if (fc1_experts_weights) ASSERT_TENSOR_3D(fc1_experts_weights); | ||||||||||
| if (fc2_experts_weights) ASSERT_TENSOR_3D(fc2_experts_weights); | ||||||||||
| ASSERT_TENSOR_2D(router_probs); | ||||||||||
|
|
||||||||||
| const auto& input_dims = input->Shape().GetDims(); | ||||||||||
| const auto& router_probs_dims = router_probs->Shape().GetDims(); | ||||||||||
| const auto& fc1_experts_weights_dims = fc1_experts_weights->Shape().GetDims(); | ||||||||||
| const auto& fc2_experts_weights_dims = fc2_experts_weights->Shape().GetDims(); | ||||||||||
|
|
||||||||||
| int64_t num_rows = input_dims.size() == 2 ? input_dims[0] : input_dims[0] * input_dims[1]; | ||||||||||
| int64_t hidden_size = input_dims[input_dims.size() - 1]; | ||||||||||
| int64_t local_num_experts = fc1_experts_weights_dims[0]; | ||||||||||
| int64_t num_experts = router_probs_dims[1]; | ||||||||||
| int64_t inter_size = (fc2_experts_weights_dims[1] * fc2_experts_weights_dims[2] * pack_size) / hidden_size; | ||||||||||
|
|
||||||||||
| const bool legacy_shape = (hidden_size != inter_size && fc2_experts_weights_dims[1] == inter_size) || | ||||||||||
| (hidden_size == inter_size && is_fused_swiglu && fc1_experts_weights_dims[1] == hidden_size); | ||||||||||
| int64_t local_num_experts; | ||||||||||
| if (fc1_experts_weights != nullptr) { | ||||||||||
| local_num_experts = fc1_experts_weights->Shape().GetDims()[0]; | ||||||||||
| } else if (fc1_experts_scales != nullptr) { | ||||||||||
| local_num_experts = fc1_experts_scales->Shape().GetDims()[0]; | ||||||||||
| } else { | ||||||||||
| // Fallback for non-quantized MoE without weights (should not happen in current code paths) | ||||||||||
| // or if only bias is provided? | ||||||||||
| local_num_experts = num_experts; | ||||||||||
| } | ||||||||||
|
|
||||||||||
| int64_t inter_size; | ||||||||||
| if (fc2_experts_weights != nullptr) { | ||||||||||
| const auto& dims = fc2_experts_weights->Shape().GetDims(); | ||||||||||
| inter_size = (dims[1] * dims[2] * pack_size) / hidden_size; | ||||||||||
| } else if (fc3_experts_scales != nullptr) { | ||||||||||
| inter_size = fc3_experts_scales->Shape().GetDims()[1]; | ||||||||||
| } else if (fc1_experts_scales != nullptr) { | ||||||||||
| int64_t fc1_inter_size = fc1_experts_scales->Shape().GetDims()[1]; | ||||||||||
| inter_size = is_fused_swiglu ? fc1_inter_size / 2 : fc1_inter_size; | ||||||||||
| } else { | ||||||||||
| // Should not happen for valid QMoE calls | ||||||||||
| inter_size = 0; | ||||||||||
|
Comment on lines
+91
to
+92
|
||||||||||
| // Should not happen for valid QMoE calls | |
| inter_size = 0; | |
| ORT_THROW("Invalid MoE configuration: unable to infer inter_size because " | |
| "fc2_experts_weights, fc3_experts_scales, and fc1_experts_scales are all null."); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The fallback at lines 76-78 sets `local_num_experts = num_experts` when both `fc1_experts_weights` and `fc1_experts_scales` are null. That is an invalid configuration (neither weights nor scales were provided), so the function should return an error status instead of silently falling back to a potentially incorrect value. Consider adding a validation check that returns an error when both are null.