Update MoE GEMM to support int8 asymmetric weight quantization

e-ddykim · e-ddykim · commit f3e95d0d2e28 · 2026-03-04T03:02:13.000+09:00
diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/moe_gemm_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/moe_gemm_onednn.cpp
@@ -65,16 +65,18 @@ struct moe_gemm_onednn : typed_primitive_onednn_impl<moe_gemm> {
             dnnl::memory::dim d0 = wei_scales_shape[0];
             dnnl::memory::dim d1 = wei_scales_shape[1];
             dnnl::memory::dim d2 = wei_scales_shape[2];
+            dnnl::memory::dims wei_scales_dims = (moe_cfg.weight_group_size == -1) ? dnnl::memory::dims{d0, d2} : dnnl::memory::dims{d0, d1, d2};
+            dnnl::memory::format_tag wei_scales_fmt = (moe_cfg.weight_group_size == -1) ? dnnl::memory::format_tag::ab : dnnl::memory::format_tag::abc;
             dnnl::memory::desc wei_scales_md(
-                    {d0, d1, d2}, convert_data_type(wei_scales.get_layout().data_type), dnnl::memory::format_tag::abc);
+                    wei_scales_dims, convert_data_type(wei_scales.get_layout().data_type), wei_scales_fmt);
             dnnl::memory wei_scales_mem = dnnl::ocl_interop::make_memory(wei_scales_md, onednn_engine, dnnl::ocl_interop::memory_kind::usm,
                 reinterpret_cast<uint8_t*>(wei_scales.buffer_ptr()));
             args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, wei_scales_mem});
 
             if (!moe_cfg.is_weight_symmetric_quantized) {
                 auto& wei_zp = instance.input_memory(moe_cfg.weight_zp_idx);
                 dnnl::memory::desc wei_zp_md(
-                        {d0, d1, d2}, convert_data_type(wei_zp.get_layout().data_type), dnnl::memory::format_tag::abc);
+                        wei_scales_dims, convert_data_type(wei_zp.get_layout().data_type), wei_scales_fmt);
                 dnnl::memory wei_zp_mem = dnnl::ocl_interop::make_memory(wei_zp_md, onednn_engine, dnnl::ocl_interop::memory_kind::usm,
                     reinterpret_cast<uint8_t*>(wei_zp.buffer_ptr()));
                 args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS, wei_zp_mem});
@@ -157,16 +159,30 @@ struct moe_gemm_onednn : typed_primitive_onednn_impl<moe_gemm> {
         auto moe_cfg = MoEGemmImplementationManager::get_moe_cfg(impl_params);
 
         if (moe_cfg.is_weight_quantized) {
-            attr->set_scales(DNNL_ARG_WEIGHTS,
-                             (1 << 0) | (1 << 1) | (1 << 2),
-                             {moe_cfg.weight_group_size, 1},
-                             convert_data_type(impl_params.get_input_layout(moe_cfg.weight_scale_idx).data_type));
+            if (moe_cfg.weight_group_size == -1) {
+                attr->set_scales(DNNL_ARG_WEIGHTS,
+                                 (1 << 0) | (1 << 2),
+                                 {},
+                                 convert_data_type(impl_params.get_input_layout(moe_cfg.weight_scale_idx).data_type));
+            } else {
+                attr->set_scales(DNNL_ARG_WEIGHTS,
+                                 (1 << 0) | (1 << 1) | (1 << 2),
+                                 {moe_cfg.weight_group_size, 1},
+                                 convert_data_type(impl_params.get_input_layout(moe_cfg.weight_scale_idx).data_type));
+            }
 
             if (!moe_cfg.is_weight_symmetric_quantized) {
-                attr->set_zero_points(DNNL_ARG_WEIGHTS,
-                                      (1 << 0) | (1 << 1) | (1 << 2),
-                                      {moe_cfg.weight_group_size, 1},
-                                      convert_data_type(impl_params.get_input_layout(moe_cfg.weight_zp_idx).data_type));
+                if (moe_cfg.weight_group_size == -1) {
+                    attr->set_zero_points(DNNL_ARG_WEIGHTS,
+                                         (1 << 0) | (1 << 2),
+                                         {},
+                                         convert_data_type(impl_params.get_input_layout(moe_cfg.weight_zp_idx).data_type));
+                } else {
+                    attr->set_zero_points(DNNL_ARG_WEIGHTS,
+                                          (1 << 0) | (1 << 1) | (1 << 2),
+                                          {moe_cfg.weight_group_size, 1},
+                                          convert_data_type(impl_params.get_input_layout(moe_cfg.weight_zp_idx).data_type));
+                }
             }
         }
 
diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/moe_gemm_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/moe_gemm_onednn.hpp
@@ -160,7 +160,7 @@ struct MoEGemmImplementationManager : public ImplementationManager {
             // weight scales : [#experts, num_groups, ofm, 1]
             auto scale_group_dim = 1;
             auto num_scale_groups = (weight_shape.size() == 4) ? params.input_layouts[moe_cfg.weight_scale_idx].get_shape()[scale_group_dim] : 1;
-            moe_cfg.weight_group_size = k / num_scale_groups;
+            moe_cfg.weight_group_size = (num_scale_groups == 1) ? -1 : (k / num_scale_groups);
             if (static_cast<int32_t>(params.input_layouts.size()) > moe_cfg.weight_zp_idx) {
                 moe_cfg.is_weight_symmetric_quantized = false;
             } else {
diff --git a/src/plugins/intel_gpu/src/plugin/transformations/convert_moe_to_compressed.cpp b/src/plugins/intel_gpu/src/plugin/transformations/convert_moe_to_compressed.cpp
@@ -262,7 +262,7 @@ ConvertMOEToMOECompressed::ConvertMOEToMOECompressed(bool is_pa) {
             config.hidden_size = weight_shape[2];
             if (weight_shape.size() == 4) config.hidden_size *= weight_shape[3];
             config.inter_size = weight_shape[1];
-            config.group_size = (weight_shape.size() == 3) ? config.hidden_size : scale_shape[3];
+            config.group_size = (scale_shape.size() == 3) ? std::numeric_limits<size_t>::max() : scale_shape[3];
             config.top_k = topk_shape.rbegin()->get_length();
             config.out_type = ov::element::dynamic;
             config.has_batch_dim = is_pa ? 0 : 1;
@@ -272,7 +272,9 @@ ConvertMOEToMOECompressed::ConvertMOEToMOECompressed(bool is_pa) {
             args.push_back(pattern_map.at(topk_indices_gemm2_m));
             // params for up
             args.push_back(pattern_map.at(compressed_weights_m_up));
-            auto transposed_index = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{0, 2, 1, 3});
+            auto transposed_index = (config.group_size == std::numeric_limits<size_t>::max()) ?
+                                    std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{0, 2, 1}) :
+                                    std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{0, 2, 1, 3});
             {
                 auto scale = std::make_shared<ov::op::v1::Transpose>(pattern_map.at(mul_const_m_up), transposed_index);
                 args.push_back(scale);
diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu
@@ -1 +1 @@
-Subproject commit 98bc83d7b5cf9592ad7719767ad5ddab3694d679
+Subproject commit 3e48e1f324199c61d935478e0787ebbc7e7616b6