
Commit 4af1315

refactor CheckInputs
1 parent dec8867 commit 4af1315

3 files changed: 166 additions & 50 deletions

onnxruntime/contrib_ops/cpu/moe/moe_helper.h

Lines changed: 106 additions & 36 deletions
@@ -35,31 +35,68 @@ struct MoEParameters {
 };
 namespace moe_helper {

+// Helper to check shape dimensions
+#define ASSERT_SHAPE_DIMENSION(shape_ptr, dim, name)                              \
+  if (shape_ptr != nullptr) {                                                     \
+    if (shape_ptr->NumDimensions() != dim) {                                      \
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input '", name,      \
+                             "' is expected to have ", dim, " dimensions, got ",  \
+                             shape_ptr->NumDimensions());                         \
+    }                                                                             \
+  }
+
+#define ASSERT_SHAPE_3D(shape_ptr, name) ASSERT_SHAPE_DIMENSION(shape_ptr, 3, name)
+
+#define CHECK_SHAPE(shape_ptr, name, ...)                                         \
+  if (shape_ptr != nullptr) {                                                     \
+    const TensorShape& expected_shape = make_shape(__VA_ARGS__);                  \
+    if (*shape_ptr != expected_shape) {                                           \
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input '", name,      \
+                             "' is expected to have shape ", expected_shape,      \
+                             ", got ", *shape_ptr);                               \
+    }                                                                             \
+  }
+
 template <typename Tensor>
 Status CheckInputs(MoEParameters& parameters,
-                   const Tensor* input,                // required
-                   const Tensor* router_probs,         // required
-                   const Tensor* fc1_experts_weights,  // required
-                   const Tensor* fc1_experts_bias,     // optional
-                   const Tensor* fc1_experts_scales,   // required for qMoE; NULL for MOE
-                   const Tensor* fc1_zero_points,      // optional, for qMoE
-                   const Tensor* fc2_experts_weights,  // required
-                   const Tensor* fc2_experts_bias,     // optional
-                   const Tensor* fc2_experts_scales,   // required for qMoE; NULL for MOE
-                   const Tensor* fc2_zero_points,      // optional, for qMoE
-                   const Tensor* fc3_experts_weights,  // optional
-                   const Tensor* fc3_experts_bias,     // optional
-                   const Tensor* fc3_experts_scales,   // required for qMoE; NULL for MOE
-                   const Tensor* fc3_zero_points,      // optional, for qMoE
-                   const int64_t pack_size,  // number of weights packed together (like 2 for uint4 packed to uint8)
+                   const Tensor* input,         // required
+                   const Tensor* router_probs,  // required
+                   const TensorShape* fc1_experts_weights_shape,
+                   const Tensor* fc1_experts_bias,    // optional
+                   const Tensor* fc1_experts_scales,  // required for qMoE; NULL for MOE
+                   const Tensor* fc1_zero_points,     // optional, for qMoE
+                   const TensorShape* fc2_experts_weights_shape,
+                   const Tensor* fc2_experts_bias,    // optional
+                   const Tensor* fc2_experts_scales,  // required for qMoE; NULL for MOE
+                   const Tensor* fc2_zero_points,     // optional, for qMoE
+                   const TensorShape* fc3_experts_weights_shape,
+                   const Tensor* fc3_experts_bias,    // optional
+                   const Tensor* fc3_experts_scales,  // required for qMoE; NULL for MOE
+                   const Tensor* fc3_zero_points,     // optional, for qMoE
+                   const int64_t pack_size,  // number of weights packed together (like 2 for uint4 packed to uint8)
                    const bool is_fused_swiglu,
                    const int64_t block_size = 0) {  // block size for block-wise quantization
   // Check dimensions of input to avoid input_dims index out of range. CHECK_TENSOR_SHAPE will verify each tensor later.
+  if (input == nullptr) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'input' is required.");
+  }
   ASSERT_TENSOR_2D_OR_3D(input);
-  if (fc1_experts_weights) ASSERT_TENSOR_3D(fc1_experts_weights);
-  if (fc2_experts_weights) ASSERT_TENSOR_3D(fc2_experts_weights);
+
+  if (router_probs == nullptr) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'router_probs' is required.");
+  }
   ASSERT_TENSOR_2D(router_probs);

+  if (fc1_experts_weights_shape == nullptr) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'fc1_experts_weights' is required.");
+  }
+  ASSERT_SHAPE_3D(fc1_experts_weights_shape, "fc1_experts_weights");
+
+  if (fc2_experts_weights_shape == nullptr) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'fc2_experts_weights' is required.");
+  }
+  ASSERT_SHAPE_3D(fc2_experts_weights_shape, "fc2_experts_weights");
+
   const auto& input_dims = input->Shape().GetDims();
   const auto& router_probs_dims = router_probs->Shape().GetDims();

@@ -68,34 +105,35 @@ Status CheckInputs(MoEParameters& parameters,
   int64_t num_experts = router_probs_dims[1];

   int64_t local_num_experts;
-  if (fc1_experts_weights != nullptr) {
-    local_num_experts = fc1_experts_weights->Shape().GetDims()[0];
+  if (fc1_experts_weights_shape != nullptr) {
+    local_num_experts = fc1_experts_weights_shape->GetDims()[0];
   } else if (fc1_experts_scales != nullptr) {
     local_num_experts = fc1_experts_scales->Shape().GetDims()[0];
   } else {
-    // Fallback for non-quantized MoE without weights (should not happen in current code paths)
-    // or if only bias is provided?
-    local_num_experts = num_experts;
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "Invalid MoE configuration: both fc1_experts_weights and fc1_experts_scales are null. "
+                           "At least one must be provided.");
   }

   int64_t inter_size;
-  if (fc2_experts_weights != nullptr) {
-    const auto& dims = fc2_experts_weights->Shape().GetDims();
+  if (fc2_experts_weights_shape != nullptr) {
+    const auto& dims = fc2_experts_weights_shape->GetDims();
     inter_size = (dims[1] * dims[2] * pack_size) / hidden_size;
   } else if (fc3_experts_scales != nullptr) {
     inter_size = fc3_experts_scales->Shape().GetDims()[1];
   } else if (fc1_experts_scales != nullptr) {
     int64_t fc1_inter_size = fc1_experts_scales->Shape().GetDims()[1];
     inter_size = is_fused_swiglu ? fc1_inter_size / 2 : fc1_inter_size;
   } else {
-    // Should not happen for valid QMoE calls
-    inter_size = 0;
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "Invalid MoE configuration: unable to infer inter_size because "
+                           "fc2_experts_weights, fc3_experts_scales, and fc1_experts_scales are all null.");
   }

   bool legacy_shape = false;
-  if (fc2_experts_weights != nullptr && fc1_experts_weights != nullptr) {
-    const auto& fc2_experts_weights_dims = fc2_experts_weights->Shape().GetDims();
-    const auto& fc1_experts_weights_dims = fc1_experts_weights->Shape().GetDims();
+  if (fc2_experts_weights_shape != nullptr && fc1_experts_weights_shape != nullptr) {
+    const auto& fc2_experts_weights_dims = fc2_experts_weights_shape->GetDims();
+    const auto& fc1_experts_weights_dims = fc1_experts_weights_shape->GetDims();
     legacy_shape = (hidden_size != inter_size && fc2_experts_weights_dims[1] == inter_size) ||
                    (hidden_size == inter_size && is_fused_swiglu && fc1_experts_weights_dims[1] == hidden_size);
   }
@@ -106,13 +144,13 @@ Status CheckInputs(MoEParameters& parameters,

   if (legacy_shape) {
     // legacy shape does not match column major memory layout. This is for backward compatibility.
-    if (fc1_experts_weights) CHECK_TENSOR_SHAPE(fc1_experts_weights, num_experts, hidden_size, fc1_inter_size / pack_size);
-    if (fc2_experts_weights) CHECK_TENSOR_SHAPE(fc2_experts_weights, num_experts, inter_size, hidden_size / pack_size);
-    if (fc3_experts_weights) CHECK_TENSOR_SHAPE(fc3_experts_weights, num_experts, hidden_size, inter_size / pack_size);
+    CHECK_SHAPE(fc1_experts_weights_shape, "fc1_experts_weights", num_experts, hidden_size, fc1_inter_size / pack_size);
+    CHECK_SHAPE(fc2_experts_weights_shape, "fc2_experts_weights", num_experts, inter_size, hidden_size / pack_size);
+    CHECK_SHAPE(fc3_experts_weights_shape, "fc3_experts_weights", num_experts, hidden_size, inter_size / pack_size);
   } else {
-    if (fc1_experts_weights) CHECK_TENSOR_SHAPE(fc1_experts_weights, num_experts, fc1_inter_size, hidden_size / pack_size);
-    if (fc2_experts_weights) CHECK_TENSOR_SHAPE(fc2_experts_weights, num_experts, hidden_size, inter_size / pack_size);
-    if (fc3_experts_weights) CHECK_TENSOR_SHAPE(fc3_experts_weights, num_experts, inter_size, hidden_size / pack_size);
+    CHECK_SHAPE(fc1_experts_weights_shape, "fc1_experts_weights", num_experts, fc1_inter_size, hidden_size / pack_size);
+    CHECK_SHAPE(fc2_experts_weights_shape, "fc2_experts_weights", num_experts, hidden_size, inter_size / pack_size);
+    CHECK_SHAPE(fc3_experts_weights_shape, "fc3_experts_weights", num_experts, inter_size, hidden_size / pack_size);
   }

   CHECK_TENSOR_SHAPE(router_probs, num_rows, num_experts);
@@ -194,9 +232,11 @@ Status CheckInputs(MoEParameters& parameters,
     }
   }

-  if (fc3_experts_weights == nullptr) {
+  if (fc3_experts_weights_shape == nullptr) {
+    // If fc3 weights are not provided, ensure no other fc3 parameters are provided
     ORT_ENFORCE(fc3_experts_bias == nullptr && fc3_experts_scales == nullptr && fc3_zero_points == nullptr);
   } else {
+    // If fc3 weights are provided, ensure scales logic is consistent
     ORT_ENFORCE(fc1_experts_scales == nullptr || fc3_experts_scales != nullptr);  // MOE no scale, or qMOE need scales
   }

@@ -226,6 +266,36 @@ Status CheckInputs(MoEParameters& parameters,
   return Status::OK();
 }

+template <typename Tensor>
+Status CheckInputs(MoEParameters& parameters,
+                   const Tensor* input,                // required
+                   const Tensor* router_probs,         // required
+                   const Tensor* fc1_experts_weights,  // required
+                   const Tensor* fc1_experts_bias,     // optional
+                   const Tensor* fc1_experts_scales,   // required for qMoE; NULL for MOE
+                   const Tensor* fc1_zero_points,      // optional, for qMoE
+                   const Tensor* fc2_experts_weights,  // required
+                   const Tensor* fc2_experts_bias,     // optional
+                   const Tensor* fc2_experts_scales,   // required for qMoE; NULL for MOE
+                   const Tensor* fc2_zero_points,      // optional, for qMoE
+                   const Tensor* fc3_experts_weights,  // optional
+                   const Tensor* fc3_experts_bias,     // optional
+                   const Tensor* fc3_experts_scales,   // required for qMoE; NULL for MOE
+                   const Tensor* fc3_zero_points,      // optional, for qMoE
+                   const int64_t pack_size,  // number of weights packed together (like 2 for uint4 packed to uint8)
+                   const bool is_fused_swiglu,
+                   const int64_t block_size = 0) {  // block size for block-wise quantization
+
+  const TensorShape* fc1_shape = (fc1_experts_weights != nullptr) ? &fc1_experts_weights->Shape() : nullptr;
+  const TensorShape* fc2_shape = (fc2_experts_weights != nullptr) ? &fc2_experts_weights->Shape() : nullptr;
+  const TensorShape* fc3_shape = (fc3_experts_weights != nullptr) ? &fc3_experts_weights->Shape() : nullptr;
+
+  return CheckInputs(parameters, input, router_probs, fc1_shape, fc1_experts_bias, fc1_experts_scales, fc1_zero_points,
+                     fc2_shape, fc2_experts_bias, fc2_experts_scales, fc2_zero_points,
+                     fc3_shape, fc3_experts_bias, fc3_experts_scales, fc3_zero_points,
+                     pack_size, is_fused_swiglu, block_size);
+}
+
 }  // namespace moe_helper
 }  // namespace contrib
 }  // namespace onnxruntime
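
Note: CHECK_SHAPE builds its expected shape through a make_shape helper that is not part of this hunk (CHECK_TENSOR_SHAPE presumably relies on the same helper elsewhere in moe_helper.h). A minimal sketch of what such a variadic helper could look like, assuming TensorShape is constructible from an initializer list of int64_t values; this is illustrative only, not the repository's definition:

// Illustrative sketch only; the real make_shape is defined elsewhere in moe_helper.h.
// Builds a TensorShape from a variadic list of dimension values.
template <typename... Dims>
inline TensorShape make_shape(Dims... dims) {
  return TensorShape({static_cast<int64_t>(dims)...});
}

With such a helper, CHECK_SHAPE(fc2_experts_weights_shape, "fc2_experts_weights", num_experts, hidden_size, inter_size / pack_size) expands its trailing arguments into a TensorShape and returns INVALID_ARGUMENT when the runtime shape differs, mirroring the Tensor-based CHECK_TENSOR_SHAPE.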

onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc

Lines changed: 56 additions & 11 deletions
@@ -471,11 +471,9 @@ Status QMoECPU<T>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr all

   // If scales are prepacked, they are constant initializers.
   if (input_idx == 3) {
-    has_prepacked_fc1_scales_ = true;
     return Status::OK();
   }
   if (input_idx == 6) {
-    has_prepacked_fc2_scales_ = true;
     return Status::OK();
   }

@@ -511,11 +509,33 @@ Status QMoECPU<T>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr all
     }
   }

+  if (input_idx == 2) {
+    fc1_shape_ = shape;
+  } else if (input_idx == 5) {
+    fc2_shape_ = shape;
+  } else if (input_idx == 8) {
+    fc3_shape_ = shape;
+  }
+
   if (prepacked_weights) {
     prepacked_weights->buffers_.push_back(std::move(packed_buffer));
     prepacked_weights->buffer_sizes_.push_back(packed_size);
     is_packed = true;

+    // Pack Shape (Buffer 1)
+    auto dims = shape.GetDims();
+    size_t rank_bytes = sizeof(int64_t);
+    size_t dims_bytes = dims.size() * sizeof(int64_t);
+    size_t shape_size = rank_bytes + dims_bytes;
+
+    auto shape_buffer = IAllocator::MakeUniquePtr<void>(alloc, shape_size);
+    int64_t* buffer_data = static_cast<int64_t*>(shape_buffer.get());
+    *buffer_data = static_cast<int64_t>(dims.size());
+    memcpy(buffer_data + 1, dims.data(), dims_bytes);
+
+    prepacked_weights->buffers_.push_back(std::move(shape_buffer));
+    prepacked_weights->buffer_sizes_.push_back(shape_size);
+
     // Try build MLAS Q4 cache if scales are available
     if (use_mlas_q4_gemm_) {
       const Tensor* scales_tensor = nullptr;
@@ -550,7 +570,7 @@ Status QMoECPU<T>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr all
                               alloc, cache_buffer)
                 .IsOK()) {
           // Store the size so we can verify later? Container holds size.
-          // We push it as a SECOND buffer.
+          // We push it as a THIRD buffer (Buffer 2) now.
           size_t cache_size = MlasQ4GemmPackBSize(qtype, static_cast<size_t>(rows), static_cast<size_t>(cols)) * static_cast<size_t>(num_experts);
           prepacked_weights->buffers_.push_back(std::move(cache_buffer));
           prepacked_weights->buffer_sizes_.push_back(cache_size);
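
The prepacked-shape handoff above serializes each weight's TensorShape into a flat int64_t buffer laid out as [rank, dim0, dim1, ...]: PrePack appends it as prepacked buffer 1 (after the packed weights at index 0 and before the optional MLAS Q4 cache at index 2), and the UseSharedPrePackedBuffers hunk below reads it back to restore fc1_shape_/fc2_shape_/fc3_shape_. A self-contained sketch of that round trip, using plain std::vector in place of the allocator-backed buffers, for illustration only:

#include <cstdint>
#include <cstring>
#include <vector>

// Serialize a shape as [rank, dim0, dim1, ...] in a flat int64_t buffer.
std::vector<int64_t> SerializeShape(const std::vector<int64_t>& dims) {
  std::vector<int64_t> buffer(1 + dims.size());
  buffer[0] = static_cast<int64_t>(dims.size());  // rank goes first
  std::memcpy(buffer.data() + 1, dims.data(), dims.size() * sizeof(int64_t));
  return buffer;
}

// Deserialize: read the rank, then copy that many dimensions back out.
std::vector<int64_t> DeserializeShape(const int64_t* buffer_data) {
  const int64_t rank = buffer_data[0];
  std::vector<int64_t> dims(static_cast<size_t>(rank));
  std::memcpy(dims.data(), buffer_data + 1, static_cast<size_t>(rank) * sizeof(int64_t));
  return dims;
}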
@@ -576,17 +596,38 @@ Status QMoECPU<T>::UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepa
   if (input_idx == 2 && !prepacked_buffers.empty()) {
     packed_fc1_ = std::move(prepacked_buffers[0]);
     if (prepacked_buffers.size() > 1) {
-      packed_fc1_mlas_cache_ = std::move(prepacked_buffers[1]);
+      int64_t* buffer_data = static_cast<int64_t*>(prepacked_buffers[1].get());
+      int64_t rank = buffer_data[0];
+      std::vector<int64_t> dims(rank);
+      memcpy(dims.data(), buffer_data + 1, rank * sizeof(int64_t));
+      fc1_shape_ = TensorShape(dims);
+    }
+    if (prepacked_buffers.size() > 2) {
+      packed_fc1_mlas_cache_ = std::move(prepacked_buffers[2]);
     }
     used_shared_buffers = true;
   } else if (input_idx == 5 && !prepacked_buffers.empty()) {
     packed_fc2_ = std::move(prepacked_buffers[0]);
     if (prepacked_buffers.size() > 1) {
-      packed_fc2_mlas_cache_ = std::move(prepacked_buffers[1]);
+      int64_t* buffer_data = static_cast<int64_t*>(prepacked_buffers[1].get());
+      int64_t rank = buffer_data[0];
+      std::vector<int64_t> dims(rank);
+      memcpy(dims.data(), buffer_data + 1, rank * sizeof(int64_t));
+      fc2_shape_ = TensorShape(dims);
+    }
+    if (prepacked_buffers.size() > 2) {
+      packed_fc2_mlas_cache_ = std::move(prepacked_buffers[2]);
     }
     used_shared_buffers = true;
   } else if (input_idx == 8 && !prepacked_buffers.empty()) {
     packed_fc3_ = std::move(prepacked_buffers[0]);
+    if (prepacked_buffers.size() > 1) {
+      int64_t* buffer_data = static_cast<int64_t*>(prepacked_buffers[1].get());
+      int64_t rank = buffer_data[0];
+      std::vector<int64_t> dims(rank);
+      memcpy(dims.data(), buffer_data + 1, rank * sizeof(int64_t));
+      fc3_shape_ = TensorShape(dims);
+    }
     used_shared_buffers = true;
   }

@@ -635,17 +676,21 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
   const auto* fc2_zero_points = context->Input<Tensor>(12);
   const auto* fc3_zero_points = context->Input<Tensor>(13);

+  const TensorShape* fc1_shape_ptr = packed_fc1_ ? &fc1_shape_ : (fc1_experts_weights ? &fc1_experts_weights->Shape() : nullptr);
+  const TensorShape* fc2_shape_ptr = packed_fc2_ ? &fc2_shape_ : (fc2_experts_weights ? &fc2_experts_weights->Shape() : nullptr);
+  const TensorShape* fc3_shape_ptr = packed_fc3_ ? &fc3_shape_ : (fc3_experts_weights ? &fc3_experts_weights->Shape() : nullptr);
+
   MoEParameters moe_params;
   ORT_RETURN_IF_ERROR(moe_helper::CheckInputs<Tensor>(
       moe_params, input, router_probs,
-      fc1_experts_weights, fc1_experts_bias, fc1_scales, fc1_zero_points,
-      fc2_experts_weights, fc2_experts_bias, fc2_scales, fc2_zero_points,
-      fc3_experts_weights, fc3_experts_bias, fc3_scales, fc3_zero_points,
+      fc1_shape_ptr, fc1_experts_bias, fc1_scales, fc1_zero_points,
+      fc2_shape_ptr, fc2_experts_bias, fc2_scales, fc2_zero_points,
+      fc3_shape_ptr, fc3_experts_bias, fc3_scales, fc3_zero_points,
      expert_weight_bits_ == 4 ? 2 : 1,
      true,
      block_size_));

-  if (fc3_experts_weights || fc3_experts_bias || fc3_scales || fc3_zero_points) {
+  if (fc3_shape_ptr || fc3_experts_bias || fc3_scales || fc3_zero_points) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "FC3 gating is not yet implemented on CPU for QMoE");
   }

@@ -808,8 +853,8 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
   const bool is_fc1_block_wise = (fc1_scales_dims.size() == 3 && fc1_scales_dims[2] > 1);
   const bool is_fc2_block_wise = (fc2_scales_dims.size() == 3 && fc2_scales_dims[2] > 1);

-  const uint8_t* fc1_weights_data = (packed_fc1_ != nullptr) ? nullptr : fc1_experts_weights->Data<uint8_t>();
-  const uint8_t* fc2_weights_data = (packed_fc2_ != nullptr) ? nullptr : fc2_experts_weights->Data<uint8_t>();
+  const uint8_t* fc1_weights_data = (packed_fc1_ != nullptr) ? nullptr : fc1_experts_weights->template Data<uint8_t>();
+  const uint8_t* fc2_weights_data = (packed_fc2_ != nullptr) ? nullptr : fc2_experts_weights->template Data<uint8_t>();
   const T* fc1_scales_data = fc1_scales->Data<T>();
   const T* fc2_scales_data = fc2_scales->Data<T>();
   const T* fc1_bias_data = fc1_experts_bias ? fc1_experts_bias->Data<T>() : nullptr;
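
The Compute changes pick a shape source per FC layer: the cached fc*_shape_ when the weights were prepacked (the original tensor may no longer be available), otherwise the live tensor's shape, otherwise nullptr so the TensorShape-based CheckInputs reports the input as missing. A hypothetical helper expressing that selection, not part of this commit, with Tensor/TensorShape as in ONNX Runtime:

// Hypothetical convenience helper mirroring the selection logic in Compute:
// prefer the shape cached at PrePack time, fall back to the live tensor, else null.
inline const TensorShape* SelectWeightShape(bool is_prepacked,
                                            const TensorShape& cached_shape,
                                            const Tensor* live_tensor) {
  if (is_prepacked) {
    return &cached_shape;
  }
  return (live_tensor != nullptr) ? &live_tensor->Shape() : nullptr;
}

Usage would then read, for example, SelectWeightShape(packed_fc1_ != nullptr, fc1_shape_, fc1_experts_weights) in place of the inline ternaries above.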

onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.h

Lines changed: 4 additions & 3 deletions
@@ -7,7 +7,6 @@
 #include "core/framework/op_kernel.h"
 #include "core/mlas/inc/mlas_q4.h"
 #include "contrib_ops/cpu/moe/moe_base_cpu.h"
-#include <mutex>
 #include <vector>

 namespace onnxruntime {
@@ -42,13 +41,15 @@ class QMoECPU final : public OpKernel, public MoEBaseCPU {
   int64_t block_size_;
   bool use_mlas_q4_gemm_{false};
   bool use_mlas_q4_gemm_overridden_{false};
-  bool has_prepacked_fc1_scales_{false};
-  bool has_prepacked_fc2_scales_{false};

   IAllocatorUniquePtr<void> packed_fc1_;
   IAllocatorUniquePtr<void> packed_fc2_;
   IAllocatorUniquePtr<void> packed_fc3_;

+  TensorShape fc1_shape_;
+  TensorShape fc2_shape_;
+  TensorShape fc3_shape_;
+
   IAllocatorUniquePtr<void> packed_fc1_mlas_cache_;
   IAllocatorUniquePtr<void> packed_fc2_mlas_cache_;
 };
