@@ -540,6 +540,49 @@ TEST(MatMul2BitsWebGpu, Float32_ZeroPoint_LargerK) {
540540 RunWebGpu2BitsTest<float >(1 , 32 , 256 , 32 , true , 0 .3f , 0 .05f );
541541}
542542
543+ // DP4A path tests (accuracy_level=4) — exercises the 1024-entry LUT / dequantization
544+ // path for 2-bit weights with zero_points.
545+ // DP4A constraints: accuracy_level==4, block_size%32==0, K%128==0, N%16==0.
546+ TEST (MatMul2BitsWebGpu, Float32_ZeroPoint_DP4A) {
547+ TestOptions2Bits opts{};
548+ opts.accuracy_level = 4 ;
549+ opts.has_zero_point = true ;
550+ opts.output_abs_error = 0 .1f ;
551+ opts.output_rel_error = 0 .02f ;
552+
553+ // M=1, N=16, K=128, block_size=32 — minimal DP4A-eligible shape
554+ opts.M = 1 ;
555+ opts.N = 16 ;
556+ opts.K = 128 ;
557+ opts.block_size = 32 ;
558+ RunTest2Bits<float >(opts);
559+
560+ // M=1, N=32, K=256, block_size=32 — larger K
561+ opts.M = 1 ;
562+ opts.N = 32 ;
563+ opts.K = 256 ;
564+ opts.block_size = 32 ;
565+ opts.output_abs_error = 0 .3f ;
566+ opts.output_rel_error = 0 .05f ;
567+ RunTest2Bits<float >(opts);
568+
569+ // M=4 (batch), N=32, K=128, block_size=32
570+ opts.M = 4 ;
571+ opts.N = 32 ;
572+ opts.K = 128 ;
573+ opts.block_size = 32 ;
574+ opts.output_abs_error = 0 .1f ;
575+ opts.output_rel_error = 0 .02f ;
576+ RunTest2Bits<float >(opts);
577+
578+ // M=1, N=16, K=128, block_size=128 — full-block
579+ opts.M = 1 ;
580+ opts.N = 16 ;
581+ opts.K = 128 ;
582+ opts.block_size = 128 ;
583+ RunTest2Bits<float >(opts);
584+ }
585+
543586#endif // USE_WEBGPU
544587
545588} // namespace test
0 commit comments