
Commit ba11af4

[QNN-EP] Add MatMulNBits translation for GPU (#26340)
### Description

Add support for translating the MatMulNBits contrib op to the QNN FullyConnected operation with INT4 block-quantized weights.

Implementation details:
- Translate MatMulNBits to FullyConnected in the OpBuilder
- Support QNN_QUANTIZATION_ENCODING_BLOCK for INT4 weights
- Pass INT4 weights and quantization parameters to QNN as BlockQuantization encoding params

Testing:
- Added new unit tests for MatMulNBits -> QNN-GPU
- Validated all OnnxRuntime tests
- Validated the following LLMs through the Olive and ORT-GenAI execution flow:
  - Llama 3.2 1B
  - Qwen2.5
  - DeepSeek-R1-Qwen 1.5b
  - Phi3.5-mini-instruct

### Motivation and Context

An INT4 quantization pass in Olive generates LLM models containing MatMulNBits contrib ops. To run these ops via QNN-EP, MatMulNBits is translated to the QNN FullyConnected op with INT4 weights.

---------

Co-authored-by: tirupath-qti <tirupath@qti.qualcomm.com>
1 parent b6ed7f3 commit ba11af4
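
For context, the numeric relationship the translation relies on can be sketched as follows. This is an illustrative reference only, not code from this commit: names, the `(quant - offset) * scale` dequantization convention, and the unpacked one-int8-per-weight layout are assumptions for clarity. Each output element of QNN's FullyConnected is a dot product between the activations and one row of the [N, K] weight matrix, where every weight is dequantized with the scale/offset of its block along K.

```cpp
// Illustrative sketch (not code from this commit): one output element of
// FullyConnected over a block-quantized [N, K] weight. INT4 values are shown
// unpacked to one int8 per element; real = (quant - offset) * scale is assumed.
#include <cstddef>
#include <cstdint>
#include <vector>

float FullyConnectedRow(const std::vector<float>& x,          // K activations
                        const std::vector<int8_t>& w_row,     // K int4 values of row n
                        const std::vector<float>& scales,     // K / block_size scales for row n
                        const std::vector<int32_t>& offsets,  // matching per-block offsets
                        size_t block_size) {
  float acc = 0.0f;
  for (size_t k = 0; k < x.size(); ++k) {
    const size_t block = k / block_size;  // block_size consecutive weights share scale/offset
    const float w = (w_row[k] - offsets[block]) * scales[block];
    acc += w * x[k];
  }
  return acc;  // y[n] of MatMulNBits(x, W) == FullyConnected(x, W) with block-dequantized W
}
```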

File tree

11 files changed: +568 −16 lines


onnxruntime/core/providers/qnn/builder/op_builder_factory.cc

Lines changed: 4 additions & 0 deletions
@@ -227,6 +227,10 @@ OpBuilderRegistrations::OpBuilderRegistrations() {
   {
     CreateInverseOpBuilder("Inverse", *this);
   }
+
+  {
+    CreateMatMulNBitsOpBuilder("MatMulNBits", *this);
+  }
 }

 const IOpBuilder* GetOpBuilder(const std::string& onnx_op_type) {
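
With this registration in place, lookups by ONNX op type resolve to the new builder. A minimal sketch of the expected call, based only on the `GetOpBuilder` declaration visible above (not code from this commit):

```cpp
// Sketch: resolving the builder registered above by its ONNX op type.
const IOpBuilder* builder = GetOpBuilder("MatMulNBits");
if (builder != nullptr) {
  // QNN-EP can translate this node; the builder lowers it to QNN FullyConnected.
}
```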

onnxruntime/core/providers/qnn/builder/op_builder_factory.h

Lines changed: 1 addition & 0 deletions
@@ -127,6 +127,7 @@ void CreateSTFTOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_

 void CreateInverseOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);

+void CreateMatMulNBitsOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
 void CreateConcatOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);

 }  // namespace qnn

onnxruntime/core/providers/qnn/builder/opbuilder/matmulnbits_op_builder.cc

Lines changed: 381 additions & 0 deletions
Large diffs are not rendered by default.

onnxruntime/core/providers/qnn/builder/qnn_def.cc

Lines changed: 1 addition & 1 deletion
@@ -456,7 +456,7 @@ bool CreateTensorInQnnGraph(const QNN_INTERFACE_VER_TYPE& qnn_interface,
     return false;
   }
   // verify size expressed by the dims matches the raw tensor size
-  uint32_t qnn_tensor_size = CalcQnnTensorNumElems(qnn_tensor) * gsl::narrow_cast<uint32_t>(data_size);
+  const auto qnn_tensor_size = utils::GetQnnTensorDataSizeInBytes(qnn_tensor);
   auto qnn_tensor_buf_size = GetQnnTensorClientBuf(qnn_tensor).dataSize;
   if (qnn_tensor_size != qnn_tensor_buf_size) {
     ss << "Data length mismatch for static tensor. node_name: " << node_name

onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc

Lines changed: 6 additions & 4 deletions
@@ -881,7 +881,8 @@ void QnnModelWrapper::GetGraphInputOutputTensorWrapper(const std::vector<std::st
 }

 Status QnnModelWrapper::UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& initializer,
-                                              std::vector<uint8_t>& unpacked_tensor) const {
+                                              std::vector<uint8_t>& unpacked_tensor,
+                                              const bool unpack_4_bit_to_8_bit) const {
   if (initializer.data_location() == onnx::TensorProto_DataLocation_EXTERNAL) {
     ORT_RETURN_IF_ERROR(onnxruntime::utils::UnpackInitializerData(initializer, graph_viewer_.ModelPath(),
                                                                   unpacked_tensor));
@@ -891,12 +892,13 @@ Status QnnModelWrapper::UnpackInitializerData(const ONNX_NAMESPACE::TensorProto&

   int32_t onnx_data_type = initializer.data_type();

-  // If this is an int4, we need to unpack it because QNN treats int4 as a full int8.
-  if (onnx_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT4) {
+  // If this is an int4 and unpack_4_bit_to_8_bit is true, we need to unpack it
+  // because QNN HTP treats int4 as a full int8.
+  if (unpack_4_bit_to_8_bit && onnx_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT4) {
     TensorShape shape(qnn::utils::GetInitializerShape<int64_t>(initializer));
     const size_t num_int4_elems = shape.Size();
     ORT_RETURN_IF_ERROR(qnn::utils::UnpackInt4ToInt8<true>(num_int4_elems, unpacked_tensor));
-  } else if (onnx_data_type == ONNX_NAMESPACE::TensorProto_DataType_UINT4) {
+  } else if (unpack_4_bit_to_8_bit && onnx_data_type == ONNX_NAMESPACE::TensorProto_DataType_UINT4) {
     TensorShape shape(qnn::utils::GetInitializerShape<int64_t>(initializer));
     const size_t num_uint4_elems = shape.Size();
     ORT_RETURN_IF_ERROR(qnn::utils::UnpackInt4ToInt8<false>(num_uint4_elems, unpacked_tensor));
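
A hedged sketch of how the two kinds of call sites are expected to differ. The signature and the HTP comment come from the diff above; `wrapper` and `weight_proto` are hypothetical names, and the GPU rationale follows from this PR's description of passing packed INT4 block-quantized weights to QNN:

```cpp
// Hypothetical call sites (not code from this commit) illustrating the new flag.
std::vector<uint8_t> unpacked;

// HTP path (default): int4 data is widened to one int8 per element,
// because QNN HTP treats int4 as a full int8.
ORT_RETURN_IF_ERROR(wrapper.UnpackInitializerData(weight_proto, unpacked));

// GPU block-quantized path: keep the two-nibbles-per-byte packing so the raw
// bytes can be handed to QNN as QNN_DATATYPE_SFIXED_POINT_4 tensor data.
ORT_RETURN_IF_ERROR(wrapper.UnpackInitializerData(weight_proto, unpacked,
                                                  /*unpack_4_bit_to_8_bit=*/false));
```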

onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h

Lines changed: 2 additions & 1 deletion
@@ -245,7 +245,8 @@ class QnnModelWrapper {
   }

   Status UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& initializer,
-                               std::vector<uint8_t>& unpacked_tensor) const;
+                               std::vector<uint8_t>& unpacked_tensor,
+                               const bool unpack_4_bit_to_8_bit = true) const;

   QnnBackendType GetQnnBackendType() const { return qnn_backend_type_; }

onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc

Lines changed: 79 additions & 6 deletions
@@ -20,8 +20,11 @@ QnnQuantParamsWrapper::QnnQuantParamsWrapper(const QnnQuantParamsWrapper& other)
   size_t num_scaleoffsets = 0;
   if (other.IsLPBQ()) {
     num_scaleoffsets = other.per_channel_scales_size_;
+  } else if (other.IsBlockQuantized()) {
+    block_encoding_tensor_rank_ = other.block_encoding_tensor_rank_;
+    num_scaleoffsets = other.num_blocks_;
   }
-  Status status = Init(other.params_, num_scaleoffsets);
+  Status status = Init(other.params_, num_scaleoffsets, block_encoding_tensor_rank_);
   assert(status.IsOK());  // Expect other QnnQuantParamsWrapper to always have a supported quantization encoding.
 }

@@ -30,8 +33,11 @@ QnnQuantParamsWrapper& QnnQuantParamsWrapper::operator=(const QnnQuantParamsWrap
   size_t num_scaleoffsets = 0;
   if (other.IsLPBQ()) {
     num_scaleoffsets = other.per_channel_scales_size_;
+  } else if (other.IsBlockQuantized()) {
+    block_encoding_tensor_rank_ = other.block_encoding_tensor_rank_;
+    num_scaleoffsets = other.num_blocks_;
   }
-  Status status = Init(other.params_, num_scaleoffsets);
+  Status status = Init(other.params_, num_scaleoffsets, block_encoding_tensor_rank_);
   assert(status.IsOK());  // Expect other QnnQuantParamsWrapper to always have a supported quantization encoding.
 }

@@ -156,6 +162,39 @@ QnnQuantParamsWrapper::QnnQuantParamsWrapper(gsl::span<const float> per_channel_
   params_.blockwiseExpansion = lpbqPtr;
 }

+// Construct a BlockEncoding BQ quantization param.
+QnnQuantParamsWrapper::QnnQuantParamsWrapper(
+    gsl::span<const float> scales,
+    gsl::span<const int32_t> offsets,
+    gsl::span<const uint32_t> block_sizes,
+    Qnn_DataType_t tensor_data_type) {
+  ORT_UNUSED_PARAMETER(tensor_data_type);
+  assert(block_sizes.size() > 0);
+  assert(scales.size() > 0);
+  assert(scales.size() == offsets.size());  // Logic error if sizes don't match.
+
+  num_blocks_ = static_cast<uint32_t>(scales.size());
+  params_.encodingDefinition = QNN_DEFINITION_DEFINED;
+  params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_BLOCK;
+
+  block_encoding_tensor_rank_ = static_cast<uint32_t>(block_sizes.size());
+  block_encoding_axis_data_ = std::make_unique<uint32_t[]>(block_encoding_tensor_rank_);
+  std::memcpy(block_encoding_axis_data_.get(),
+              block_sizes.data(),
+              static_cast<size_t>(block_encoding_tensor_rank_) * sizeof(uint32_t));
+  params_.blockEncoding.blockSize = block_encoding_axis_data_.get();
+
+  // Deep copy the scale offsets
+  if (num_blocks_ > 0) {
+    block_encoding_scale_offsets_data_ = std::make_unique<Qnn_ScaleOffset_t[]>(num_blocks_);
+    for (size_t i = 0; i < num_blocks_; ++i) {
+      block_encoding_scale_offsets_data_[i].offset = offsets[i];
+      block_encoding_scale_offsets_data_[i].scale = scales[i];
+    }
+    params_.blockEncoding.scaleOffset = block_encoding_scale_offsets_data_.get();
+  }
+}
+
 // Get a copy of scales. Works for both per-tensor and per-channel.
 Status QnnQuantParamsWrapper::GetScales(/*out*/ std::vector<float>& scales) const {
   ORT_RETURN_IF_NOT(params_.encodingDefinition == QNN_DEFINITION_DEFINED, "Unquantized qparams does not have scales");

@@ -195,6 +234,18 @@ Status QnnQuantParamsWrapper::GetScales(/*out*/ std::vector<float>& scales) cons
       }
       break;
     }
+    case QNN_QUANTIZATION_ENCODING_BLOCK: {
+      scales.resize(num_blocks_);
+
+      if (num_blocks_ > 0) {
+        gsl::span<const Qnn_ScaleOffset_t> scale_offsets(params_.blockEncoding.scaleOffset, num_blocks_);
+
+        for (size_t i = 0; i < num_blocks_; i++) {
+          scales[i] = scale_offsets[i].scale;
+        }
+      }
+      break;
+    }
     default:
       return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported QNN quantization encoding: ",
                              params_.quantizationEncoding);

@@ -208,7 +259,7 @@ QnnQuantParamsWrapper QnnQuantParamsWrapper::Copy() const {
 }

 // Initializes by copying from a Qnn_QuantizeParams_t.
-Status QnnQuantParamsWrapper::Init(const Qnn_QuantizeParams_t& params, const size_t lpbq_num_scaleoffsets) {
+Status QnnQuantParamsWrapper::Init(const Qnn_QuantizeParams_t& params, const size_t num_scaleoffsets, const size_t tensor_rank) {
   if (per_channel_data_) {
     per_channel_data_.reset(nullptr);
     params_ = QNN_QUANTIZE_PARAMS_INIT;

@@ -278,7 +329,7 @@ Status QnnQuantParamsWrapper::Init(const Qnn_QuantizeParams_t& params, const siz
       break;
     }
     case QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION: {
-      assert(lpbq_num_scaleoffsets && "Can't create BlockwiseExpansion encoding object with zero ScaleOffsets");
+      assert(num_scaleoffsets && "Can't create BlockwiseExpansion encoding object with zero ScaleOffsets");
       params_.encodingDefinition = params.encodingDefinition;
       params_.quantizationEncoding = params.quantizationEncoding;

@@ -291,7 +342,7 @@ Status QnnQuantParamsWrapper::Init(const Qnn_QuantizeParams_t& params, const siz
       params_.blockwiseExpansion = bwe_aligned_dst;

       // Deep copy the scaleoffsets
-      const size_t so_num_elems = lpbq_num_scaleoffsets;
+      const size_t so_num_elems = num_scaleoffsets;
       const size_t so_num_bytes = so_num_elems * sizeof(Qnn_ScaleOffset_t);
       constexpr std::uintptr_t so_align = alignof(Qnn_ScaleOffset_t);
       per_channel_data_ = std::make_unique<char[]>(so_num_bytes + so_align);

@@ -301,7 +352,7 @@ Status QnnQuantParamsWrapper::Init(const Qnn_QuantizeParams_t& params, const siz
       params_.blockwiseExpansion->scaleOffsets = so_aligned_dst;

       // Deep copy blockscales
-      const size_t bs_num_elems = lpbq_num_scaleoffsets * params.blockwiseExpansion->numBlocksPerAxis;
+      const size_t bs_num_elems = num_scaleoffsets * params.blockwiseExpansion->numBlocksPerAxis;
       const size_t bs_num_bytes = bs_num_elems * sizeof(uint8_t);
       constexpr std::uintptr_t bs_align = alignof(uint8_t);
       block_scales_data_ = std::make_unique<uint8_t[]>(bs_num_bytes + bs_align);

@@ -310,6 +361,28 @@ Status QnnQuantParamsWrapper::Init(const Qnn_QuantizeParams_t& params, const siz
       params_.blockwiseExpansion->blocksScale8 = bs_aligned_dst;
       break;
     }
+    case QNN_QUANTIZATION_ENCODING_BLOCK: {
+      assert(num_scaleoffsets && "Can't create Block encoding object with zero ScaleOffsets");
+      params_.encodingDefinition = params.encodingDefinition;
+      params_.quantizationEncoding = params.quantizationEncoding;
+
+      block_encoding_tensor_rank_ = static_cast<uint32_t>(tensor_rank);
+      block_encoding_axis_data_ = std::make_unique<uint32_t[]>(block_encoding_tensor_rank_);
+      std::memcpy(block_encoding_axis_data_.get(),
+                  params.blockEncoding.blockSize,
+                  static_cast<size_t>(block_encoding_tensor_rank_) * sizeof(uint32_t));
+      params_.blockEncoding.blockSize = block_encoding_axis_data_.get();
+
+      // Deep copy the scale offsets
+      block_encoding_scale_offsets_data_ = std::make_unique<Qnn_ScaleOffset_t[]>(num_scaleoffsets);
+      for (size_t i = 0; i < num_scaleoffsets; ++i) {
+        block_encoding_scale_offsets_data_[i].scale = params.blockEncoding.scaleOffset[i].scale;
+        block_encoding_scale_offsets_data_[i].offset = params.blockEncoding.scaleOffset[i].offset;
+      }
+      params_.blockEncoding.scaleOffset = block_encoding_scale_offsets_data_.get();
+
+      break;
+    }
     default:
       return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported QNN quantization encoding: ", params.quantizationEncoding);
   }
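
A minimal sketch of constructing block-encoded quant params with the new constructor. The signature comes from this diff; the concrete values and the `{1, 32}` per-dimension block shape (rows unblocked, 32-element blocks along K) are illustrative assumptions:

```cpp
// Sketch (not code from this commit): block quantization params for a [2, 64]
// INT4 weight blocked along K with block_size = 32.
const uint32_t num_blocks = 2 * (64 / 32);    // one (scale, offset) pair per block
std::vector<float> scales(num_blocks, 0.05f);
std::vector<int32_t> offsets(num_blocks, 0);
std::vector<uint32_t> block_sizes = {1, 32};  // assumed per-dimension block shape
qnn::QnnQuantParamsWrapper quant_params(scales, offsets, block_sizes,
                                        QNN_DATATYPE_SFIXED_POINT_4);
assert(quant_params.IsBlockQuantized());      // QNN_QUANTIZATION_ENCODING_BLOCK is set
```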

onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h

Lines changed: 17 additions & 1 deletion
@@ -34,11 +34,16 @@ class QnnQuantParamsWrapper {
   QnnQuantParamsWrapper(gsl::span<const float> per_channel_float_scales, gsl::span<const uint8_t> per_block_int_scales,
                         gsl::span<const int32_t> offsets, int64_t axis, int64_t block_size, bool is_int4);

+  // Construct a BQ quantization param.
+  QnnQuantParamsWrapper(
+      gsl::span<const float> scales, gsl::span<const int32_t> offsets,
+      gsl::span<const uint32_t> block_size, Qnn_DataType_t tensor_data_type);
+
   Qnn_QuantizeParams_t& Get() { return params_; }
   const Qnn_QuantizeParams_t& Get() const { return params_; }

   // Initialize this object from a raw Qnn_QuantizeParam_t object.
-  Status Init(const Qnn_QuantizeParams_t& params, const size_t lpbq_num_scaleoffsets = 0);
+  Status Init(const Qnn_QuantizeParams_t& params, const size_t num_scaleoffsets = 0, const size_t tensor_rank = 0);

   // Initialize this object from a (potentially) quantized ONNX tensor.
   // QnnModelWrapper provides utilities for unpacking scale and zero-point ONNX initializers.

@@ -67,6 +72,11 @@ class QnnQuantParamsWrapper {
            (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION);
   }

+  bool IsBlockQuantized() const {
+    return params_.encodingDefinition == QNN_DEFINITION_DEFINED &&
+           (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BLOCK);
+  }
+
   // Get a copy of scales. Works for both per-tensor and per-channel.
   Status GetScales(/*out*/ std::vector<float>& scales) const;

@@ -163,6 +173,12 @@ class QnnQuantParamsWrapper {
   uint32_t per_channel_scales_size_;
   std::unique_ptr<uint8_t[]> block_scales_data_;
   std::unique_ptr<char[]> blockwise_expansion_data_;
+
+  // Stores BlockEncoding axis and scale offset data
+  uint32_t block_encoding_tensor_rank_ = 0;
+  uint32_t num_blocks_ = 0;
+  std::unique_ptr<uint32_t[]> block_encoding_axis_data_;
+  std::unique_ptr<Qnn_ScaleOffset_t[]> block_encoding_scale_offsets_data_;
 };

 }  // namespace qnn

onnxruntime/core/providers/qnn/builder/qnn_utils.cc

Lines changed: 19 additions & 3 deletions
@@ -36,9 +36,11 @@ size_t GetElementSizeByType(const Qnn_DataType_t& data_type) {
       {QNN_DATATYPE_FLOAT_32, 4},
       {QNN_DATATYPE_BFLOAT_16, 2},
       {QNN_DATATYPE_BOOL_8, 1},
+      {QNN_DATATYPE_SFIXED_POINT_4, sizeof(Int4x2)},
       {QNN_DATATYPE_SFIXED_POINT_8, 1},
       {QNN_DATATYPE_SFIXED_POINT_16, 2},
       {QNN_DATATYPE_SFIXED_POINT_32, 4},
+      {QNN_DATATYPE_UFIXED_POINT_4, sizeof(Int4x2)},
       {QNN_DATATYPE_UFIXED_POINT_8, 1},
       {QNN_DATATYPE_UFIXED_POINT_16, 2},
       {QNN_DATATYPE_UFIXED_POINT_32, 4},

@@ -105,11 +107,25 @@ size_t GetElementSizeByType(ONNX_NAMESPACE::TensorProto_DataType onnx_type) {
   }
   // Unreachable
 }
+size_t GetQnnTensorDataSizeInBytes(size_t num_elements, Qnn_DataType_t element_type) {
+  SafeInt<size_t> safe_num_elements = num_elements;
+  if (element_type == QNN_DATATYPE_SFIXED_POINT_4 || element_type == QNN_DATATYPE_UFIXED_POINT_4) {
+    return (safe_num_elements + 1) / 2;
+  }
+  return (safe_num_elements * GetElementSizeByType(element_type));
+}

 size_t GetQnnTensorDataSizeInBytes(gsl::span<const uint32_t> shape, Qnn_DataType_t element_type) {
   ORT_ENFORCE(!shape.empty(), "Empty shape not allowed.");  // TODO can we just treat empty shape as a scalar?
-  SafeInt<size_t> data_length = GetElementSizeByType(element_type);
-  return std::accumulate(shape.begin(), shape.end(), data_length, std::multiplies<>{});
+  SafeInt<size_t> num_elements = std::accumulate(shape.begin(), shape.end(), SafeInt<size_t>{1}, std::multiplies<>{});
+  return GetQnnTensorDataSizeInBytes(num_elements, element_type);
+}
+
+size_t GetQnnTensorDataSizeInBytes(const Qnn_Tensor_t& tensor) {
+  uint32_t rank = GetQnnTensorRank(tensor);
+  uint32_t* dims = GetQnnTensorDims(tensor);
+  gsl::span<const uint32_t> shape{dims, static_cast<size_t>(rank)};
+  return GetQnnTensorDataSizeInBytes(shape, GetQnnTensorDataType(tensor));
 }

 bool QnnTensorHasDynamicShape(const Qnn_Tensor_t& tensor) {

@@ -999,7 +1015,7 @@ Status QuantizeData(gsl::span<const float> data, gsl::span<const uint32_t> shape
   const size_t num_dims = shape.size();
   const size_t num_elems = ShapeSizeCalc(shape, 0, num_dims);
   ORT_RETURN_IF_NOT(num_elems == data.size(), "Shape mismatch with data to quantize");
-  size_t expected_num_quant_bytes = GetElementSizeByType(data_type) * data.size();
+  size_t expected_num_quant_bytes = GetQnnTensorDataSizeInBytes(data.size(), data_type);
   ORT_RETURN_IF_NOT(quant_bytes.size() == expected_num_quant_bytes,
                     "Cannot quantize data because output buffer is not the correct size");
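
The packed 4-bit size math above can be spot-checked with a few values. This is a sketch, not code from this commit; the expectations follow directly from the `(n + 1) / 2` branch and the 1-byte `Int4x2` entries, and the `qnn::utils` namespace for these helpers is assumed:

```cpp
// Sketch: expected results of the new overload for packed 4-bit vs. 8-bit types.
using namespace qnn::utils;  // assumed namespace for these helpers
assert(GetQnnTensorDataSizeInBytes(64, QNN_DATATYPE_SFIXED_POINT_4) == 32);  // 64 nibbles -> 32 bytes
assert(GetQnnTensorDataSizeInBytes(7, QNN_DATATYPE_UFIXED_POINT_4) == 4);    // (7 + 1) / 2 rounds up
assert(GetQnnTensorDataSizeInBytes(64, QNN_DATATYPE_SFIXED_POINT_8) == 64);  // unchanged for 8-bit
```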

onnxruntime/core/providers/qnn/builder/qnn_utils.h

Lines changed: 2 additions & 0 deletions
@@ -78,7 +78,9 @@ class QnnJSONGraph {

 size_t GetElementSizeByType(ONNX_NAMESPACE::TensorProto_DataType onnx_type);

+size_t GetQnnTensorDataSizeInBytes(size_t num_elements, Qnn_DataType_t element_data_type);
 size_t GetQnnTensorDataSizeInBytes(gsl::span<const uint32_t> shape, Qnn_DataType_t element_data_type);
+size_t GetQnnTensorDataSizeInBytes(const Qnn_Tensor_t& tensor);

 bool QnnTensorHasDynamicShape(const Qnn_Tensor_t& tensor);
