*improve AMX-BF16 optimizations of class SynetConvolution16bNhwcGemm (part 1: Convolution16bNhwcGemm_32x32).

ermig1979 · ermig1979 · commit 639c1aae6222 · 2025-02-03T18:21:14.000+03:00
diff --git a/docs/2025.html b/docs/2025.html
@@ -48,6 +48,7 @@ <h5>Improving</h5>
  <li>SSE4.1 optimizations of class ResizerBf16Bilinear.</li>
  <li>SSE4.1, AVX2, AVX-512BW optimizations of class ResizerFloatBilinear.</li>
  <li>AMX-BF16 optimizations of class SynetConvolution16bNchwGemm.</li>
+ <li>AMX-BF16 optimizations of class SynetConvolution16bNhwcGemm.</li>
 </ul>
 
 <h4>Test framework</h4>
diff --git a/src/Simd/SimdAmxBf16SynetConvolution16bNhwcGemm.cpp b/src/Simd/SimdAmxBf16SynetConvolution16bNhwcGemm.cpp
@@ -228,33 +228,16 @@ namespace Simd
 
         //-----------------------------------------------------------------------------------------
 
-        template<Term16bType term, SimdConvolutionActivationType type> void Convolution16bNhwcGemm_32x32(const uint16_t* src0, const ConvParam& p, const AlgParam& a,
+        template<Term16bType term, SimdConvolutionActivationType type, int cfg> void Convolution16bNhwcGemm_32x32(const uint16_t* src0, const ConvParam& p, const AlgParam& a,
             size_t srcC, size_t dstS, size_t dstC, int zero, const uint16_t* weight0, const __m512* bias, const __m512* params, float* buf, uint8_t* dst)
         {
             int dB = (int)a.dB, dD = int(p.dstC * a.elem), dS = (int)a.bufK, strideB = dB * 4, strideW = 64;
             int stepS = a.reorderType ? 512 : 32, strideS = a.reorderType ? 64 : dS * 2;
             const uint16_t* src1 = src0 + 16 * dS;
             const uint16_t* weight1 = weight0 + a.bufK * F;
 
-            TileConf conf;
-            conf.rows[0] = 16;
-            conf.rows[1] = 16;
-            conf.rows[2] = uint8_t(dstS - 16);
-            conf.rows[3] = uint8_t(dstS - 16);
-            conf.rows[4] = 16;
-            conf.rows[5] = uint8_t(dstS - 16);
-            conf.rows[6] = 16;
-            conf.rows[7] = 16;
-            conf.colsb[0] = 64;
-            conf.colsb[1] = uint16_t((dstC - 16) * 4);
-            conf.colsb[2] = 64;
-            conf.colsb[3] = uint16_t((dstC - 16) * 4);
-            conf.colsb[4] = 64;
-            conf.colsb[5] = 64;
-            conf.colsb[6] = 64;
-            conf.colsb[7] = uint16_t((dstC - 16) * 4);
-            _tile_loadconfig(&conf);
-
+            if (cfg)
+                SetTileConf2x2(dstS, dstC);
             if (zero)
             {
                 _tile_zero(0);
@@ -269,17 +252,30 @@ namespace Simd
                 _tile_stream_loadd(2, buf + 16 * dB + 0, strideB);
                 _tile_stream_loadd(3, buf + 16 * dB + F, strideB);
             }
-            for (size_t sc = 0; sc < srcC; sc += 32, src0 += stepS, src1 += stepS)
+
+            size_t srcC32 = srcC - 32, sc = 0;
+            _tile_stream_loadd(4, src0, strideS);
+            _tile_loadd(6, weight0 + sc * 16, strideW);
+            for (; sc < srcC32; src1 += stepS)
             {
-                _tile_stream_loadd(4, src0, strideS);
-                _tile_loadd(6, weight0 + sc * 16, strideW);
-                _tile_dpbf16ps(0, 4, 6);
                 _tile_loadd(7, weight1 + sc * 16, strideW);
-                _tile_dpbf16ps(1, 4, 7);
                 _tile_stream_loadd(5, src1, strideS);
+                _tile_dpbf16ps(0, 4, 6);
+                _tile_dpbf16ps(1, 4, 7);
+                src0 += stepS;
+                _tile_stream_loadd(4, src0, strideS);
                 _tile_dpbf16ps(2, 5, 6);
+                sc += 32;
+                _tile_loadd(6, weight0 + sc * 16, strideW);
                 _tile_dpbf16ps(3, 5, 7);
             }
+            _tile_loadd(7, weight1 + sc * 16, strideW);
+            _tile_stream_loadd(5, src1, strideS);
+            _tile_dpbf16ps(0, 4, 6);
+            _tile_dpbf16ps(1, 4, 7);
+            _tile_dpbf16ps(2, 5, 6);
+            _tile_dpbf16ps(3, 5, 7);
+
             _tile_stored(0, buf + 0, strideB);
             _tile_stored(1, buf + F, strideB);
             _tile_stored(2, buf + 16 * dB + 0, strideB);
@@ -470,8 +466,8 @@ namespace Simd
         {
             size_t n = 32, n1 = dstH * p.dstW, nn = AlignLoAny(n1, n), m = n1 - nn, dW = a.bufK * DF;
             size_t dB = a.macroK < a.bufK ? a.dB : 0, dD = p.dstC * a.elem, dS = a.bufK;
-            Convolution16bNhwcGemmPtr body_2 = Convolution16bNhwcGemm_32x32<term, type>;
-            Convolution16bNhwcGemmPtr tail_2 = m > 16 ? Convolution16bNhwcGemm_32x32<term, type> : Convolution16bNhwcGemm_16x32<term, type>;
+            Convolution16bNhwcGemmPtr body_2 = Convolution16bNhwcGemm_32x32<term, type, 0>;
+            Convolution16bNhwcGemmPtr tail_2 = m > 16 ? Convolution16bNhwcGemm_32x32<term, type, 1> : Convolution16bNhwcGemm_16x32<term, type>;
             Convolution16bNhwcGemmPtr body_1 = Convolution16bNhwcGemm_32x16<term, type>;
             Convolution16bNhwcGemmPtr tail_1 = m > 16 ? Convolution16bNhwcGemm_32x16<term, type> : Convolution16bNhwcGemm_16x16<term, type>;
 
@@ -482,6 +478,7 @@ namespace Simd
                 type == SimdConvolutionActivationHardSigmoid)
                 _params[1] = _mm512_set1_ps(params[1]);
 
+            SetTileConfFull();
             for (size_t dc = 0; dc < dstC; dc += DF)
             {
                 size_t dC = Simd::Min(DF, dstC - dc);
@@ -498,6 +495,8 @@ namespace Simd
                 size_t i = 0;
                 if (dC > F)
                 {
+                    if(m)
+                        SetTileConfFull();
                     for (; i < nn; i += n, s += n * dS, b += n * dB, d += n * dD)
                         body_2(s, p, a, srcC, n, dC, zero, weight, _bias, _params, b, d);
                     if (m)
diff --git a/src/Test/TestSynetConvolution16b.cpp b/src/Test/TestSynetConvolution16b.cpp
@@ -316,16 +316,21 @@ namespace Test
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 32, 192, 256, 256, _1, _1, _1, _0, _0, 1, aRe, tT, f32, f32), c, f1, f2);
 #endif
 #if 1
+        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 384, 3, 3, 1152, _1, _1, _1, _0, _0, 1, aRe, tT, b16, b16), c, f1, f2);
+        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 384, 5, 5, 1152, _1, _1, _1, _0, _0, 1, aRe, tT, b16, b16), c, f1, f2);
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 384, 13, 13, 1152, _1, _1, _1, _0, _0, 1, aRe, tT, b16, b16), c, f1, f2);
+        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 384, 16, 16, 1152, _1, _1, _1, _0, _0, 1, aRe, tT, b16, b16), c, f1, f2);
+        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 383, 13, 13, 1150, _1, _1, _1, _0, _0, 1, aRe, tT, b16, b16), c, f1, f2);
+        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 384, 13, 14, 1155, _1, _1, _1, _0, _0, 1, aRe, tT, b16, b16), c, f1, f2);
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 384, 13, 13, 1152, _1, _1, _1, _0, _0, 1, aRe, tT, f32, f32), c, f1, f2);
-
-#if 1
+        result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 1024, 13, 13, 1152, _1, _1, _1, _0, _0, 1, aRe, tT, b16, b16), c, f1, f2);
+#endif
+#if 0
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 384, 16, 16, 1152, _1, _1, _1, _0, _0, 1, aRe, tF, b16, b16), c, f1, f2);
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 384, 13, 14, 1150, _1, _1, _1, _0, _0, 1, aRe, tF, b16, b16), c, f1, f2);
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 384, 13, 13, 1152, _1, _1, _1, _0, _0, 1, aRe, tF, f32, f32), c, f1, f2);
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 383, 13, 13, 1155, _1, _1, _1, _0, _0, 1, aRe, tF, b16, b16), c, f1, f2);
 #endif
-#endif
 #else
         result = result && SynetConvolution16bForwardAutoTest(eps, Param(1, 384, 13, 13, 1152, _1, _1, _1, _0, _0, 1, aRe, tF, b16, b16), c, f1, f2);
 #endif