Skip to content

Commit eb3c5a8

Browse files
committed
[SW] Add SpMV kernel.
1 parent 898d5af commit eb3c5a8

File tree

9 files changed

+924
-0
lines changed

9 files changed

+924
-0
lines changed

sw/spatzBenchmarks/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ add_library(hp-fmatmul hp-fmatmul/kernel/hp-fmatmul.c)
111111

112112
add_spatz_test_twoParam_type(dp-gemv gemv/main.c 64 128 64)
113113
add_spatz_test_twoParam_type(sp-gemv gemv/main.c 128 128 32)
114+
add_spatz_test_threeParam_type(dp-spmv spmv/main.c 64 64 512 64)
114115

115116
# add_library(widening-hp-fmatmul widening-hp-fmatmul/kernel/widening-fmatmul.c)
116117
# add_library(widening-bp-fmatmul widening-bp-fmatmul/kernel/widening-fmatmul.c)
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
// Copyright 2025 ETH Zurich and University of Bologna.
2+
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#pragma once

#include <stdint.h>

// Element width of the matrix/vector data, expressed in bytes per element.
typedef enum {
  FP64 = 8,
  FP32 = 4,
  FP16 = 2,
  FP8 = 1
} precision_t;

// Dimensions and element type of one SpMV problem instance.
typedef struct spmv_layer_struct {
  uint32_t M;         // number of matrix rows (length of y)
  uint32_t N;         // number of matrix columns (length of x)
  uint32_t K;         // number of non-zeros (nnz) in the CSR matrix
  precision_t dtype;  // element precision, in bytes per element
} spmv_layer;
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
// Copyright 2025 ETH Zurich and University of Bologna.
2+
//
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#include "spmv.h"
6+
7+
#define SPMV_SMALL_ROW_THRESHOLD 4
8+
9+
// Reduce one CSR row worth of products for fp64.
10+
// x_off contains 32-bit byte offsets into x. On this rv32 target, vluxei64 is
11+
// not available, so the gather indices must remain 32-bit even though all data
12+
// arrays are 8-byte aligned.
13+
static inline double spmv_row_v64b(const double *val, const uint32_t *x_off,
14+
const double *x, uint32_t avl) {
15+
if (avl == 0) return 0.0;
16+
17+
const uint32_t orig_avl = avl;
18+
uint32_t vl;
19+
double red = 0.0;
20+
const uint32_t cid = snrt_cluster_core_idx();
21+
22+
asm volatile("vsetvli %0, %1, e64, m1, ta, ma" : "=r"(vl) : "r"(avl));
23+
asm volatile("vmv.s.x v0, zero");
24+
25+
do {
26+
// Stripmine the remaining non-zeros in this row.
27+
asm volatile("vsetvli %0, %1, e64, m1, ta, ma" : "=r"(vl) : "r"(avl));
28+
29+
// v8 <- val[k : k+vl]
30+
// v16 <- 32-bit byte offsets for x[col_idx[k : k+vl]]
31+
asm volatile("vle64.v v8, (%0)" ::"r"(val));
32+
asm volatile("vle32.v v16, (%0)" ::"r"(x_off));
33+
34+
// v24 <- gathered x values using the per-entry byte offsets.
35+
asm volatile("vluxei32.v v24, (%0), v16" ::"r"(x));
36+
37+
if (avl == orig_avl) {
38+
// First chunk initializes the accumulation vector.
39+
asm volatile("vfmul.vv v28, v8, v24");
40+
} else {
41+
// Later chunks accumulate into the same vector accumulator.
42+
asm volatile("vfmacc.vv v28, v8, v24");
43+
}
44+
45+
// Advance the stripmined row window.
46+
val += vl;
47+
x_off += vl;
48+
avl -= vl;
49+
} while (avl > 0);
50+
51+
// Reduce the accumulated products in v28 to one scalar sum in v0[0].
52+
asm volatile("vsetvli zero, %0, e64, m1, ta, ma" ::"r"(orig_avl));
53+
asm volatile("vfredusum.vs v0, v28, v0");
54+
asm volatile("vfmv.f.s %0, v0" : "=f"(red));
55+
56+
return red;
57+
}
58+
59+
// Top-level fp64 SpMV: scalar fallback for very short rows, vector path
60+
// otherwise.
61+
void spmv_v64b(const uint32_t *row_ptr, const uint32_t *x_off, const double *val,
62+
const double *x, double *y, uint32_t row_start,
63+
uint32_t row_end) {
64+
for (uint32_t row = row_start; row < row_end; ++row) {
65+
const uint32_t start = row_ptr[row];
66+
const uint32_t end = row_ptr[row + 1];
67+
const uint32_t nnz = end - start;
68+
69+
if (nnz < SPMV_SMALL_ROW_THRESHOLD) {
70+
double sum = 0.0;
71+
for (uint32_t k = start; k < end; ++k) {
72+
sum += val[k] * x[x_off[k] / sizeof(double)];
73+
}
74+
y[row] = sum;
75+
} else {
76+
y[row] = spmv_row_v64b(val + start, x_off + start, x, nnz);
77+
}
78+
}
79+
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// Copyright 2025 ETH Zurich and University of Bologna.
2+
//
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
// Include guard renamed from _SPMV_H: identifiers starting with an underscore
// followed by an uppercase letter are reserved for the implementation (C11
// 7.1.3), so user code must not define them.
#ifndef SPMV_H_
#define SPMV_H_

#include <stdint.h>

// Compute y[row_start:row_end] = A * x for an fp64 CSR matrix.
// row_ptr delimits each row's slice of val/x_off; x_off holds per-non-zero
// byte offsets into x (column index scaled by sizeof(double)).
void spmv_v64b(const uint32_t *row_ptr, const uint32_t *x_off, const double *val,
               const double *x, double *y, uint32_t row_start,
               uint32_t row_end);

#endif  // SPMV_H_

sw/spatzBenchmarks/spmv/main.c

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
// Copyright 2025 ETH Zurich and University of Bologna.
2+
//
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#include <benchmark.h>
6+
#include <snrt.h>
7+
#include <stdint.h>
8+
#include <stdio.h>
9+
10+
#include DATAHEADER
11+
#include "kernel/spmv.c"
12+
13+
#if (PREC != 64)
14+
#error "spmv currently supports double precision only"
15+
#endif
16+
17+
#define T double
18+
19+
// Use all cluster cores by default. Set SPMV_NUM_CORES to a positive value to
20+
// limit the kernel to a smaller number of worker cores.
21+
#ifndef SPMV_NUM_CORES
22+
#define SPMV_NUM_CORES 0
23+
#endif
24+
25+
// Shared cluster buffers, set up by core 0 before the first barrier.
static T *row_val;         // CSR non-zero values
static T *x_vec;           // dense input vector x
static T *result;          // output vector y
static uint32_t *row_ptr;  // CSR row pointers (M + 1 entries)
static uint32_t *col_idx;  // CSR column indices (K = nnz entries)
static uint32_t *x_off;    // per-non-zero byte offsets into x_vec

// Allocate `size` bytes from L1 with at least `alignment` bytes of alignment
// (alignment must be a power of two). Returns NULL on allocation failure;
// the previous version would round a NULL result up to a bogus non-NULL
// pointer (0 + alignment - 1 masked), hiding the failure from callers.
static inline void *l1alloc_aligned(size_t size, size_t alignment) {
  uintptr_t raw = (uintptr_t)snrt_l1alloc(size + alignment - 1);
  if (raw == 0) return NULL;  // propagate allocation failure
  uintptr_t aligned = (raw + alignment - 1) & ~(uintptr_t)(alignment - 1);
  return (void *)aligned;
}
37+
38+
// Absolute difference |a - b|, computed without pulling in libm.
static inline double abs_diff(double a, double b) {
  return (a > b) ? (a - b) : (b - a);
}
42+
43+
// Compare a computed value against its golden reference: returns nonzero
// when the absolute error exceeds the 1e-3 tolerance.
static inline int fp_check(const T *a, const T *b) {
  const double tolerance = 0.001;
  return abs_diff(*a, *b) > tolerance;
}
46+
47+
static inline void build_offsets(uint32_t *dst, const uint32_t *src,
48+
uint32_t nnz) {
49+
for (uint32_t i = 0; i < nnz; ++i) dst[i] = src[i] * sizeof(T);
50+
}
51+
52+
// Benchmark driver: runs multi-core CSR SpMV (y = A * x), times it over one
// or two iterations, and verifies the result against golden data from
// DATAHEADER (spmv_l, spmv_*_dram, spmv_result, spmv_checksum).
int main() {
  const uint32_t num_cores_hw = snrt_cluster_core_num();
  const uint32_t cid = snrt_cluster_core_idx();
  // Optionally cap the number of worker cores via SPMV_NUM_CORES (0 = all).
  const uint32_t num_cores =
      (SPMV_NUM_CORES > 0 && SPMV_NUM_CORES < num_cores_hw) ? SPMV_NUM_CORES
                                                            : num_cores_hw;

// Scratchpad size passed to l1d_init: smaller when the L1 is mostly used as
// a cache (USE_CACHE), larger when data is staged into SPM explicitly.
#if USE_CACHE == 1
  uint32_t spm_size = 16;
#else
  uint32_t spm_size = 120;
#endif

  // FPU lanes per core used in the utilization formula below
  // (4 for T=double; presumably FLEN/16 — confirm against benchmark.h).
  const uint32_t num_fpu = sizeof(T) / 2;

  if (cid == 0) {
    // NOTE(review): presumably configures the L1 cache/SPM split — confirm
    // against the benchmark runtime.
    l1d_init(spm_size);
  }

// One measured iteration, or two (first = warm-up, second = "best").
#if MEAS_1ITER == 1
  const int measure_iter = 1;
#else
  const int measure_iter = 2;
#endif

  unsigned int timer = (unsigned int)-1;
  unsigned int timer_best = (unsigned int)-1;
  unsigned int timer_1iter = (unsigned int)-1;
  int ret = 0;

  // Static row partitioning: core cid handles rows [row_start, row_end);
  // cores beyond num_cores get an empty range but still hit the barriers.
  const uint32_t row_start = (cid < num_cores) ? (spmv_l.M * cid) / num_cores : 0;
  const uint32_t row_end =
      (cid < num_cores) ? (spmv_l.M * (cid + 1)) / num_cores : 0;

#if USE_CACHE == 1
  // Cached mode: read CSR arrays directly from DRAM; only the derived byte
  // offsets and the output need L1 storage.
  if (cid == 0) {
    x_off = (uint32_t *)l1alloc_aligned(spmv_l.K * sizeof(uint32_t), 8);
    result = (T *)l1alloc_aligned(spmv_l.M * sizeof(T), 8);
    build_offsets(x_off, spmv_col_idx_dram, spmv_l.K);
  }

  row_ptr = spmv_row_ptr_dram;
  col_idx = spmv_col_idx_dram;
  row_val = spmv_val_dram;
  x_vec = spmv_x_dram;
#else
  // SPM mode: core 0 allocates L1 buffers and DMA-copies all inputs in,
  // then derives the gather byte offsets from the column indices.
  if (cid == 0) {
    row_ptr = (uint32_t *)l1alloc_aligned((spmv_l.M + 1) * sizeof(uint32_t), 8);
    col_idx = (uint32_t *)l1alloc_aligned(spmv_l.K * sizeof(uint32_t), 8);
    x_off = (uint32_t *)l1alloc_aligned(spmv_l.K * sizeof(uint32_t), 8);
    row_val = (T *)l1alloc_aligned(spmv_l.K * sizeof(T), 8);
    x_vec = (T *)l1alloc_aligned(spmv_l.N * sizeof(T), 8);
    result = (T *)l1alloc_aligned(spmv_l.M * sizeof(T), 8);

    snrt_dma_start_1d(row_ptr, spmv_row_ptr_dram,
                      (spmv_l.M + 1) * sizeof(uint32_t));
    snrt_dma_start_1d(col_idx, spmv_col_idx_dram, spmv_l.K * sizeof(uint32_t));
    snrt_dma_start_1d(row_val, spmv_val_dram, spmv_l.K * sizeof(T));
    snrt_dma_start_1d(x_vec, spmv_x_dram, spmv_l.N * sizeof(T));
    snrt_dma_wait_all();
    build_offsets(x_off, col_idx, spmv_l.K);
  }
#endif

  // Ensure all cores see the buffers published by core 0 before computing.
  snrt_cluster_hw_barrier();

  for (int iter = 0; iter < measure_iter; ++iter) {
    // Core 0 owns the timers; the barrier after the kernel makes the
    // measured span cover the slowest core.
    if (cid == 0) {
      start_kernel();
      timer = benchmark_get_cycle();
    }

    spmv_v64b(row_ptr, x_off, row_val, x_vec, result, row_start, row_end);

    snrt_cluster_hw_barrier();

    if (cid == 0) {
      stop_kernel();
      timer = benchmark_get_cycle() - timer;
      if (iter == 0) {
        timer_1iter = timer;
      } else {
        // Keep the fastest of the non-first iterations.
        timer_best = (timer_best > timer) ? timer : timer_best;
      }
    }

    snrt_cluster_hw_barrier();
  }

  // With a single iteration there is no "best"; reuse the first measurement.
  if (measure_iter == 1) timer_best = timer_1iter;

  if (cid == 0) {
    double checksum = 0.0;
    int errors = 0;

    // Element-wise check against the golden vector, plus a running checksum.
    for (uint32_t i = 0; i < spmv_l.M; ++i) {
      checksum += result[i];
      if (fp_check(&result[i], &spmv_result[i])) {
        ++errors;
        printf("Error: row %u result=%f golden=%f\n", i, result[i],
               spmv_result[i]);
      }
    }

    // Whole-vector checksum guards against compensating per-row errors.
    if (abs_diff(checksum, spmv_checksum) > 0.001) {
      ++errors;
      printf("Error: checksum=%f golden=%f\n", checksum, spmv_checksum);
    }

    write_cyc(timer_best);

    {
      // SpMV performs 2*nnz FLOPs (one mul + one add per non-zero);
      // performance is scaled to ops per 1000 cycles, utilization to
      // per-mille of the cluster's peak (2 ops/FPU/cycle assumed).
      const unsigned long performance = 1000UL * 2UL * spmv_l.K / timer_best;
      const unsigned long utilization =
          performance / (2 * num_cores * num_fpu * 8 / sizeof(T));

      printf("\n----- (%u x %u, nnz=%u) spmv -----\n", spmv_l.M, spmv_l.N,
             spmv_l.K);
      printf("Active cores: %u / %u\n", num_cores, num_cores_hw);
      printf("The first iter takes %u cycles.\n", timer_1iter);
      printf("The best execution took %u cycles.\n", timer_best);
      printf("Checksum: %f\n", checksum);
      printf("The performance is %lu OP/1000cycle (%lu%%o utilization).\n",
             performance, utilization);
    }

    if (errors) ret = -1;
  }

  // All cores leave together; only core 0's ret can be nonzero.
  snrt_cluster_hw_barrier();
  set_eoc();
  return ret;
}

sw/spatzBenchmarks/spmv/prompt.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
Implement CSR SpMV (y=A*x) with fp16/fp32/fp64 kernels similar to the existing GEMV style.
2+
3+
CSR:
4+
row_ptr[nrows+1] u32, col_idx[nnz] u32, val[nnz] fp16/32/64, x[ncols], y[nrows].
5+
6+
Scalar reference:
7+
for i: sum=0; for k=row_ptr[i]..row_ptr[i+1]-1: sum += val[k]*x[col_idx[k]]; y[i]=sum.
8+
9+
Vector kernel (within each row, strip-mine nnz):
10+
for i:
11+
sum_scalar = 0
12+
k = row_ptr[i]
13+
while k < row_ptr[i+1]:
14+
vl = vsetvli(min(VLMAX, row_ptr[i+1]-k), e16/e32/e64, m1, ta, ma)
15+
v_val = vleXX.v(&val[k])
16+
v_idx = vle32.v(&col_idx[k])
17+
v_off = v_idx << shift (shift=1 for fp16, 2 for fp32, 3 for fp64)
18+
v_x = vluxei32.v(x_base, v_off)
19+
v_p = v_val * v_x
20+
reduce v_p to a scalar with vfredusum (RVV 1.0 name; formerly vfredsum); add into sum_scalar
21+
k += vl
22+
y[i] = sum_scalar
23+
24+
Add small-row fallback: if nnz < N (e.g., 4 or 8) do scalar loop.
25+
26+
Keep same multi-core row partitioning scheme as GEMV main.c and add timing + checksum.

0 commit comments

Comments
 (0)