Skip to content

Commit c28ce78

Browse files
dsharlet authored and xnnpack-bot committed
Add basic HVX reduce kernels.
Kernels that should be reasonably good: - min, max, min_max for all types - sum, sum_squared for int8 and uint8 for k1 > 1 Kernels that are not good and need work: - sum, sum_squared for int8 and uint8 for k1 = 1. These are currently naively implemented with conversions, and wide arithmetic (instead of widening arithmetic). - In general k1 = 1 is not good because we unroll the accumulator by 2x/4x, so we can load whole vectors, which makes the accumulators really large (e.g. 128). This means that we're very likely to hit tail case code. Example inner loop (k1 > 1 uint8 sum, sum_squared is almost identical): ``` .LBB30_116: // %while.body14.i // Parent Loop BB30_110 Depth=1 // Parent Loop BB30_112 Depth=2 // Parent Loop BB30_114 Depth=3 // => This Inner Loop Header: Depth=4 { v27 = vmemu(r5++#1) } { v28 = vmemu(r6++#1) } { v10.w += vrmpy(v27.ub,r9.b) v23 = vmemu(r0++#1) } { v9.w += vrmpy(v28.ub,r9.b) v15 = vmemu(r7++#1) } { v5.w += vrmpy(v23.ub,r9.b) } { v6.w += vrmpy(v15.ub,r9.b) r3 = add(r3,#-128) } { p3 = cmp.gtu(r3,#127) if (p3.new) jump:t .LBB30_116 } ``` PiperOrigin-RevId: 874380564
1 parent af8ea33 commit c28ce78

File tree

9 files changed

+229
-6
lines changed

9 files changed

+229
-6
lines changed

ynnpack/kernels/reduce/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ ynn_cc_library(
4242
"arm_neonfma": ["arm_neonfma.cc"],
4343
"arm_neondot": ["arm_neondot.cc"],
4444
"arm_neon": ["arm_neon.cc"],
45+
"hexagon_hvx": ["hexagon_hvx.cc"],
4546
"x86_ssse3": ["x86_ssse3.cc"],
4647
"x86_sse2": ["x86_sse2.cc"],
4748
"x86_sse41": ["x86_sse41.cc"],
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
// Copyright 2025 Google LLC
2+
//
3+
// This source code is licensed under the BSD-style license found in the
4+
// LICENSE file in the root directory of this source tree.
5+
6+
#include "ynnpack/base/simd/hexagon_hvx.h"
7+
8+
#include <hexagon_protos.h>
9+
#include <hexagon_types.h>
10+
#include <hvx_hexagon_protos.h>
11+
12+
#include <cstddef>
13+
#include <cstdint>
14+
#include <type_traits>
15+
16+
#include "ynnpack/base/base.h"
17+
#include "ynnpack/base/bfloat16.h"
18+
#include "ynnpack/base/half.h"
19+
#include "ynnpack/base/simd/vec.h"
20+
#include "ynnpack/kernels/reduce/generic.h"
21+
#include "ynnpack/kernels/reduce/min_max_accumulator.h"
22+
#include "ynnpack/kernels/reduce/sum_accumulator.h"
23+
24+
namespace ynn {
25+
26+
namespace simd {
27+
28+
static s32x32 reduce_add(
29+
s32x32 a, u8x128 b, Identity /*map_fn*/,
30+
std::integral_constant<size_t, 4> /*horizontal_factor*/) {
31+
a.v = Q6_Vw_vrmpyacc_VwVubRb(a.v, b.v, 0x01010101);
32+
return a;
33+
}
34+
35+
static s32x32 reduce_add(
36+
s32x32 a, u8x128 b, Square /*map_fn*/,
37+
std::integral_constant<size_t, 4> /*horizontal_factor*/) {
38+
a.v = Q6_Vuw_vrmpyacc_VuwVubVub(a.v, b.v, b.v);
39+
return a;
40+
}
41+
42+
static s32x32 reduce_add(
43+
s32x32 a, s8x128 b, Identity /*map_fn*/,
44+
std::integral_constant<size_t, 4> /*horizontal_factor*/) {
45+
const auto ones = Q6_V_vsplat_R(0x01010101);
46+
a.v = Q6_Vw_vrmpyacc_VwVbVb(a.v, b.v, ones);
47+
return a;
48+
}
49+
50+
static s32x32 reduce_add(
51+
s32x32 a, s8x128 b, Square /*map_fn*/,
52+
std::integral_constant<size_t, 4> /*horizontal_factor*/) {
53+
a.v = Q6_Vw_vrmpyacc_VwVbVb(a.v, b.v, b.v);
54+
return a;
55+
}
56+
57+
} // namespace simd
58+
59+
using simd::bf16x64;
60+
using simd::f16x64;
61+
using simd::f32x32;
62+
using simd::s16x64;
63+
using simd::s32x32;
64+
using simd::s8x128;
65+
using simd::u8x128;
66+
using s32x128 = simd::vec<int32_t, 128>;
67+
using f32x128 = simd::vec<float, 128>;
68+
69+
using bf16x64_rvar = float16_wrapper<bf16x64, s16x64>;
70+
71+
MIN_MAX_KERNEL(min_max_fp32_4x32_hvx, f32x32, f32x32, float, 32);
72+
MIN_MAX_KERNEL(min_max_fp16_4x64_hvx, f16x64, f16x64, half, 64);
73+
MIN_MAX_KERNEL(min_max_bf16_4x64_hvx, bf16x64_rvar, bf16x64_rvar, bfloat16, 64);
74+
MIN_MAX_KERNEL(min_max_uint8_4x128_hvx, u8x128, u8x128, uint8_t, 128);
75+
MIN_MAX_KERNEL(min_max_int8_4x128_hvx, s8x128, s8x128, int8_t, 128);
76+
77+
MIN_MAX_KERNEL(min_fp32_4x32_hvx, f32x32, dummy_t, float, 32);
78+
MIN_MAX_KERNEL(min_fp16_4x64_hvx, f16x64, dummy_t, half, 64);
79+
MIN_MAX_KERNEL(min_bf16_4x64_hvx, bf16x64_rvar, dummy_t, bfloat16, 64);
80+
MIN_MAX_KERNEL(min_uint8_4x128_hvx, u8x128, dummy_t, uint8_t, 128);
81+
MIN_MAX_KERNEL(min_int8_4x128_hvx, s8x128, dummy_t, int8_t, 128);
82+
83+
MIN_MAX_KERNEL(max_fp32_4x32_hvx, dummy_t, f32x32, float, 32);
84+
MIN_MAX_KERNEL(max_fp16_4x64_hvx, dummy_t, f16x64, half, 64);
85+
MIN_MAX_KERNEL(max_bf16_4x64_hvx, dummy_t, bf16x64_rvar, bfloat16, 64);
86+
MIN_MAX_KERNEL(max_uint8_4x128_hvx, dummy_t, u8x128, uint8_t, 128);
87+
MIN_MAX_KERNEL(max_int8_4x128_hvx, dummy_t, s8x128, int8_t, 128);
88+
89+
void sum_uint8_int32_hvx(size_t n, size_t k3, size_t k2, size_t k1,
90+
size_t a_stride_n, size_t a_stride_k3,
91+
size_t a_stride_k2, const void* a, size_t, void* c) {
92+
if (k1 == 1 && a_stride_n == sizeof(uint8_t)) {
93+
// TODO(b/482435301): This case is poorly optimized. It naively converts to
94+
// int32 and does a 32-bit add. We should be using a widening op, and
95+
// storing the accumulators interleaved until `sum_rows`.
96+
stream_reduce<sum_accumulator_k1_1<s32x128>, uint8_t, int32_t>(
97+
n, k3, k2, a_stride_k3, a_stride_k2,
98+
reinterpret_cast<const uint8_t*>(a),
99+
/*C_stride_m=*/0, reinterpret_cast<int32_t*>(c));
100+
} else {
101+
tiled_reduce<sum_accumulator_x32<s32x32, 128, Identity>, uint8_t, int32_t>(
102+
n, k3, k2, k1, a_stride_n, a_stride_k3, a_stride_k2,
103+
reinterpret_cast<const uint8_t*>(a), /*C_stride_m=*/0,
104+
reinterpret_cast<int32_t*>(c));
105+
}
106+
}
107+
108+
void sum_squared_uint8_int32_hvx(size_t n, size_t k3, size_t k2, size_t k1,
109+
size_t a_stride_n, size_t a_stride_k3,
110+
size_t a_stride_k2, const void* a, size_t,
111+
void* c) {
112+
if (k1 == 1 && a_stride_n == sizeof(uint8_t)) {
113+
// TODO(b/482435301): This case is poorly optimized. It naively converts to
114+
// int32 and does a 32-bit add. We should be using a widening op, and
115+
// storing the accumulators interleaved until `sum_rows`.
116+
stream_reduce<sum_accumulator_k1_1<s32x128, Square>, uint8_t, int32_t>(
117+
n, k3, k2, a_stride_k3, a_stride_k2,
118+
reinterpret_cast<const uint8_t*>(a),
119+
/*C_stride_m=*/0, reinterpret_cast<int32_t*>(c));
120+
} else {
121+
tiled_reduce<sum_accumulator_x32<s32x32, 128, Square>, uint8_t, int32_t>(
122+
n, k3, k2, k1, a_stride_n, a_stride_k3, a_stride_k2,
123+
reinterpret_cast<const uint8_t*>(a), /*C_stride_m=*/0,
124+
reinterpret_cast<int32_t*>(c));
125+
}
126+
}
127+
128+
void sum_int8_int32_hvx(size_t n, size_t k3, size_t k2, size_t k1,
129+
size_t a_stride_n, size_t a_stride_k3,
130+
size_t a_stride_k2, const void* a, size_t, void* c) {
131+
if (k1 == 1 && a_stride_n == sizeof(int8_t)) {
132+
// TODO(b/482435301): This case is poorly optimized. It naively converts to
133+
// int32 and does a 32-bit add. We should be using a widening op, and
134+
// storing the accumulators interleaved until `sum_rows`.
135+
stream_reduce<sum_accumulator_k1_1<s32x128>, int8_t, int32_t>(
136+
n, k3, k2, a_stride_k3, a_stride_k2, reinterpret_cast<const int8_t*>(a),
137+
/*C_stride_m=*/0, reinterpret_cast<int32_t*>(c));
138+
} else {
139+
tiled_reduce<sum_accumulator_x32<s32x32, 128, Identity>, int8_t, int32_t>(
140+
n, k3, k2, k1, a_stride_n, a_stride_k3, a_stride_k2,
141+
reinterpret_cast<const int8_t*>(a), /*C_stride_m=*/0,
142+
reinterpret_cast<int32_t*>(c));
143+
}
144+
}
145+
146+
void sum_squared_int8_int32_hvx(size_t n, size_t k3, size_t k2, size_t k1,
147+
size_t a_stride_n, size_t a_stride_k3,
148+
size_t a_stride_k2, const void* a, size_t,
149+
void* c) {
150+
if (k1 == 1 && a_stride_n == sizeof(int8_t)) {
151+
// TODO(b/482435301): This case is poorly optimized. It naively converts to
152+
// int32 and does a 32-bit add. We should be using a widening op, and
153+
// storing the accumulators interleaved until `sum_rows`.
154+
stream_reduce<sum_accumulator_k1_1<s32x128, Square>, int8_t, int32_t>(
155+
n, k3, k2, a_stride_k3, a_stride_k2, reinterpret_cast<const int8_t*>(a),
156+
/*C_stride_m=*/0, reinterpret_cast<int32_t*>(c));
157+
} else {
158+
tiled_reduce<sum_accumulator_x32<s32x32, 128, Square>, int8_t, int32_t>(
159+
n, k3, k2, k1, a_stride_n, a_stride_k3, a_stride_k2,
160+
reinterpret_cast<const int8_t*>(a), /*C_stride_m=*/0,
161+
reinterpret_cast<int32_t*>(c));
162+
}
163+
}
164+
165+
} // namespace ynn

ynnpack/kernels/reduce/max.inc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,14 @@ YNN_UNARY_REDUCE_KERNEL(arch_flag::neon, max_int8_4x16_neon, int8_t, int8_t)
88
YNN_UNARY_REDUCE_KERNEL(arch_flag::neon, max_uint8_4x16_neon, uint8_t, uint8_t)
99
#endif
1010

11+
#ifdef YNN_ARCH_HEXAGON_HVX
12+
YNN_UNARY_REDUCE_KERNEL(arch_flag::hvx, max_fp32_4x32_hvx, float, float)
13+
YNN_UNARY_REDUCE_KERNEL(arch_flag::hvx, max_bf16_4x64_hvx, bfloat16, bfloat16)
14+
YNN_UNARY_REDUCE_KERNEL(arch_flag::hvx, max_fp16_4x64_hvx, half, half)
15+
YNN_UNARY_REDUCE_KERNEL(arch_flag::hvx, max_int8_4x128_hvx, int8_t, int8_t)
16+
YNN_UNARY_REDUCE_KERNEL(arch_flag::hvx, max_uint8_4x128_hvx, uint8_t, uint8_t)
17+
#endif // YNN_ARCH_HEXAGON_HVX
18+
1119
#ifdef YNN_ARCH_X86_AVX512
1220
YNN_UNARY_REDUCE_KERNEL(arch_flag::avx512bw, max_bf16_4x32_avx512bw, bfloat16, bfloat16)
1321
YNN_UNARY_REDUCE_KERNEL(arch_flag::avx512bw, max_fp16_4x32_avx512bw, half, half)

ynnpack/kernels/reduce/min.inc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,14 @@ YNN_UNARY_REDUCE_KERNEL(arch_flag::neon, min_int8_4x16_neon, int8_t, int8_t)
88
YNN_UNARY_REDUCE_KERNEL(arch_flag::neon, min_uint8_4x16_neon, uint8_t, uint8_t)
99
#endif
1010

11+
#ifdef YNN_ARCH_HEXAGON_HVX
12+
YNN_UNARY_REDUCE_KERNEL(arch_flag::hvx, min_fp32_4x32_hvx, float, float)
13+
YNN_UNARY_REDUCE_KERNEL(arch_flag::hvx, min_bf16_4x64_hvx, bfloat16, bfloat16)
14+
YNN_UNARY_REDUCE_KERNEL(arch_flag::hvx, min_fp16_4x64_hvx, half, half)
15+
YNN_UNARY_REDUCE_KERNEL(arch_flag::hvx, min_int8_4x128_hvx, int8_t, int8_t)
16+
YNN_UNARY_REDUCE_KERNEL(arch_flag::hvx, min_uint8_4x128_hvx, uint8_t, uint8_t)
17+
#endif // YNN_ARCH_HEXAGON_HVX
18+
1119
#ifdef YNN_ARCH_X86_AVX512
1220
YNN_UNARY_REDUCE_KERNEL(arch_flag::avx512bw, min_bf16_4x32_avx512bw, bfloat16, bfloat16)
1321
YNN_UNARY_REDUCE_KERNEL(arch_flag::avx512bw, min_fp16_4x32_avx512bw, bfloat16, bfloat16)

ynnpack/kernels/reduce/min_max.inc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,14 @@ YNN_UNARY_REDUCE_KERNEL(arch_flag::neon, min_max_int8_4x16_neon, int8_t, int8_t)
88
YNN_UNARY_REDUCE_KERNEL(arch_flag::neon, min_max_uint8_4x16_neon, uint8_t, uint8_t)
99
#endif
1010

11+
#ifdef YNN_ARCH_HEXAGON_HVX
12+
YNN_UNARY_REDUCE_KERNEL(arch_flag::hvx, min_max_fp32_4x32_hvx, float, float)
13+
YNN_UNARY_REDUCE_KERNEL(arch_flag::hvx, min_max_bf16_4x64_hvx, bfloat16, bfloat16)
14+
YNN_UNARY_REDUCE_KERNEL(arch_flag::hvx, min_max_fp16_4x64_hvx, half, half)
15+
YNN_UNARY_REDUCE_KERNEL(arch_flag::hvx, min_max_int8_4x128_hvx, int8_t, int8_t)
16+
YNN_UNARY_REDUCE_KERNEL(arch_flag::hvx, min_max_uint8_4x128_hvx, uint8_t, uint8_t)
17+
#endif // YNN_ARCH_HEXAGON_HVX
18+
1119
#ifdef YNN_ARCH_X86_AVX512
1220
YNN_UNARY_REDUCE_KERNEL(arch_flag::avx512bw, min_max_bf16_4x32_avx512bw, bfloat16, bfloat16)
1321
YNN_UNARY_REDUCE_KERNEL(arch_flag::avx512bw, min_max_fp16_4x32_avx512bw, half, half)

ynnpack/kernels/reduce/min_max_accumulator.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ struct min_max_accumulator {
136136
}
137137

138138
template <typename AccT>
139-
void accumulate_min(T* __restrict C, size_t n, const AccT* acc) {
139+
void accumulate_min(T* __restrict C, size_t n, const AccT* __restrict acc) {
140140
switch (n) {
141141
case 4:
142142
C[3] = min(C[3], horizontal_min(acc[3]));
@@ -153,7 +153,7 @@ struct min_max_accumulator {
153153
}
154154

155155
template <typename AccT>
156-
void accumulate_max(T* __restrict C, size_t n, const AccT* acc) {
156+
void accumulate_max(T* __restrict C, size_t n, const AccT* __restrict acc) {
157157
switch (n) {
158158
case 4:
159159
C[3] = max(C[3], horizontal_max(acc[3]));

ynnpack/kernels/reduce/sum.inc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@ YNN_UNARY_REDUCE_KERNEL(arch_flag::neon, sum_fp32_neon, float, float)
1515
YNN_UNARY_REDUCE_KERNEL(arch_flag::neon, sum_bf16_fp32_neon, bfloat16, float)
1616
#endif // YNN_ARCH_ARM_NEON
1717

18+
#ifdef YNN_ARCH_HEXAGON_HVX
19+
YNN_UNARY_REDUCE_KERNEL(arch_flag::hvx, sum_int8_int32_hvx, int8_t, int32_t)
20+
YNN_UNARY_REDUCE_KERNEL(arch_flag::hvx, sum_uint8_int32_hvx, uint8_t, int32_t)
21+
#endif // YNN_ARCH_HEXAGON_HVX
22+
1823
#ifdef YNN_ARCH_X86_AVX512BF16
1924
YNN_UNARY_REDUCE_KERNEL(arch_flag::avx512bf16, sum_bf16_fp32_avx512bf16, bfloat16, float)
2025
#endif // YNN_ARCH_X86_AVX512BF16

ynnpack/kernels/reduce/sum_accumulator.h

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@ YNN_ALWAYS_INLINE auto sum_rows(const AccT* acc,
6969
auto v_1 = (extract<0>(acc[1], cols) + extract<1>(acc[1], cols)) +
7070
(extract<2>(acc[1], cols) + extract<3>(acc[1], cols));
7171

72+
// TODO(dsharlet): This returns a vector of 4 values, when it should return
73+
// a vector of 2 values.
7274
auto zero = decltype(v_0)(0);
7375
auto t = transpose<typename AccT::value_type>({{v_0, v_1, zero, zero}});
7476
return (t[0] + t[1]) + (t[2] + t[3]);
@@ -98,6 +100,25 @@ YNN_ALWAYS_INLINE auto sum_rows(const AccT* acc,
98100
return (t[0] + t[1]) + (t[2] + t[3]);
99101
}
100102

103+
#ifndef YNN_ARCH_X86
104+
// This is not numerically consistent, don't let it be used on x86.
105+
template <typename AccT, size_t K, size_t N>
106+
YNN_ALWAYS_INLINE auto sum_rows(const AccT* __restrict acc,
107+
std::integral_constant<size_t, K> /*K*/,
108+
std::integral_constant<size_t, N> /*N*/) {
109+
using scalar = typename AccT::value_type;
110+
scalar result[N];
111+
YNN_UNROLL
112+
for (size_t i = 0; i < N; ++i) {
113+
result[i] = simd::horizontal_sum(acc[i]);
114+
}
115+
// TODO(dsharlet): This returns a vector of 4 values to meet the assumptions
116+
// of the callers below. It should return a vector of N values.
117+
static_assert(N <= 4);
118+
return simd::load(result, N, simd::vec<scalar, 4>{});
119+
}
120+
#endif // YNN_ARCH_X86
121+
101122
template <typename AccT, size_t K_, typename MapFn = Identity, size_t N_ = 4>
102123
struct sum_accumulator_x32 {
103124
static constexpr std::integral_constant<size_t, N_> N = {};
@@ -123,16 +144,18 @@ struct sum_accumulator_x32 {
123144
NT n, KT k) {
124145
const simd::vec<AT, K> zero(0);
125146
auto a_0 = load(offset_bytes(A, 0 * A_stride_n), k, zero);
126-
auto a_1 = 1 < n ? load(offset_bytes(A, 1 * A_stride_n), k, zero) : zero;
127147
acc[0] = reduce_add(acc[0], a_0, map_fn, horizontal_factor);
128-
acc[1] = reduce_add(acc[1], a_1, map_fn, horizontal_factor);
129-
130-
if constexpr (N == 4) {
148+
if constexpr (N >= 2) {
149+
auto a_1 = 1 < n ? load(offset_bytes(A, 1 * A_stride_n), k, zero) : zero;
150+
acc[1] = reduce_add(acc[1], a_1, map_fn, horizontal_factor);
151+
}
152+
if constexpr (N >= 4) {
131153
auto a_2 = 2 < n ? load(offset_bytes(A, 2 * A_stride_n), k, zero) : zero;
132154
auto a_3 = 3 < n ? load(offset_bytes(A, 3 * A_stride_n), k, zero) : zero;
133155
acc[2] = reduce_add(acc[2], a_2, map_fn, horizontal_factor);
134156
acc[3] = reduce_add(acc[3], a_3, map_fn, horizontal_factor);
135157
}
158+
static_assert(N <= 4, "");
136159
}
137160

138161
template <typename T, typename NT>

ynnpack/kernels/reduce/sum_squared.inc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,11 @@ YNN_UNARY_REDUCE_KERNEL(arch_flag::neon, sum_squared_fp32_neon, float, float)
1919
YNN_UNARY_REDUCE_KERNEL(arch_flag::neon, sum_squared_bf16_fp32_neon, bfloat16, float)
2020
#endif // YNN_ARCH_ARM_NEON
2121

22+
#ifdef YNN_ARCH_HEXAGON_HVX
23+
YNN_UNARY_REDUCE_KERNEL(arch_flag::hvx, sum_squared_int8_int32_hvx, int8_t, int32_t)
24+
YNN_UNARY_REDUCE_KERNEL(arch_flag::hvx, sum_squared_uint8_int32_hvx, uint8_t, int32_t)
25+
#endif // YNN_ARCH_HEXAGON_HVX
26+
2227
#ifdef YNN_ARCH_X86_AVX512BF16
2328
YNN_UNARY_REDUCE_KERNEL(arch_flag::avx512bf16, sum_squared_bf16_fp32_avx512bf16, bfloat16, float)
2429
#endif // YNN_ARCH_X86_AVX512BF16

0 commit comments

Comments
 (0)