Skip to content

Commit eb3c5a8

Browse files
committed
[SW] Add SpMV kernel.
1 parent 898d5af commit eb3c5a8

File tree

9 files changed

+924
-0
lines changed

9 files changed

+924
-0
lines changed

sw/spatzBenchmarks/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ add_library(hp-fmatmul hp-fmatmul/kernel/hp-fmatmul.c)
111111

112112
add_spatz_test_twoParam_type(dp-gemv gemv/main.c 64 128 64)
113113
add_spatz_test_twoParam_type(sp-gemv gemv/main.c 128 128 32)
114+
add_spatz_test_threeParam_type(dp-spmv spmv/main.c 64 64 512 64)
114115

115116
# add_library(widening-hp-fmatmul widening-hp-fmatmul/kernel/widening-fmatmul.c)
116117
# add_library(widening-bp-fmatmul widening-bp-fmatmul/kernel/widening-fmatmul.c)
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
// Copyright 2025 ETH Zurich and University of Bologna.
2+
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#pragma once

#include <stdint.h>

// Element width of the matrix/vector data, expressed in bytes per element.
typedef enum {
  FP64 = 8,
  FP32 = 4,
  FP16 = 2,
  FP8 = 1
} precision_t;

// Dimensions and element type of one SpMV problem instance.
typedef struct spmv_layer_struct {
  uint32_t M;         // number of matrix rows (length of y)
  uint32_t N;         // number of matrix columns (length of x)
  uint32_t K;         // number of non-zeros (nnz) in the CSR matrix
  precision_t dtype;  // element precision, in bytes per element
} spmv_layer;
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
// Copyright 2025 ETH Zurich and University of Bologna.
2+
//
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#include "spmv.h"
6+
7+
#define SPMV_SMALL_ROW_THRESHOLD 4
8+
9+
// Reduce one CSR row worth of products for fp64.
10+
// x_off contains 32-bit byte offsets into x. On this rv32 target, vluxei64 is
11+
// not available, so the gather indices must remain 32-bit even though all data
12+
// arrays are 8-byte aligned.
13+
static inline double spmv_row_v64b(const double *val, const uint32_t *x_off,
14+
const double *x, uint32_t avl) {
15+
if (avl == 0) return 0.0;
16+
17+
const uint32_t orig_avl = avl;
18+
uint32_t vl;
19+
double red = 0.0;
20+
const uint32_t cid = snrt_cluster_core_idx();
21+
22+
asm volatile("vsetvli %0, %1, e64, m1, ta, ma" : "=r"(vl) : "r"(avl));
23+
asm volatile("vmv.s.x v0, zero");
24+
25+
do {
26+
// Stripmine the remaining non-zeros in this row.
27+
asm volatile("vsetvli %0, %1, e64, m1, ta, ma" : "=r"(vl) : "r"(avl));
28+
29+
// v8 <- val[k : k+vl]
30+
// v16 <- 32-bit byte offsets for x[col_idx[k : k+vl]]
31+
asm volatile("vle64.v v8, (%0)" ::"r"(val));
32+
asm volatile("vle32.v v16, (%0)" ::"r"(x_off));
33+
34+
// v24 <- gathered x values using the per-entry byte offsets.
35+
asm volatile("vluxei32.v v24, (%0), v16" ::"r"(x));
36+
37+
if (avl == orig_avl) {
38+
// First chunk initializes the accumulation vector.
39+
asm volatile("vfmul.vv v28, v8, v24");
40+
} else {
41+
// Later chunks accumulate into the same vector accumulator.
42+
asm volatile("vfmacc.vv v28, v8, v24");
43+
}
44+
45+
// Advance the stripmined row window.
46+
val += vl;
47+
x_off += vl;
48+
avl -= vl;
49+
} while (avl > 0);
50+
51+
// Reduce the accumulated products in v28 to one scalar sum in v0[0].
52+
asm volatile("vsetvli zero, %0, e64, m1, ta, ma" ::"r"(orig_avl));
53+
asm volatile("vfredusum.vs v0, v28, v0");
54+
asm volatile("vfmv.f.s %0, v0" : "=f"(red));
55+
56+
return red;
57+
}
58+
59+
// Top-level fp64 SpMV: scalar fallback for very short rows, vector path
60+
// otherwise.
61+
void spmv_v64b(const uint32_t *row_ptr, const uint32_t *x_off, const double *val,
62+
const double *x, double *y, uint32_t row_start,
63+
uint32_t row_end) {
64+
for (uint32_t row = row_start; row < row_end; ++row) {
65+
const uint32_t start = row_ptr[row];
66+
const uint32_t end = row_ptr[row + 1];
67+
const uint32_t nnz = end - start;
68+
69+
if (nnz < SPMV_SMALL_ROW_THRESHOLD) {
70+
double sum = 0.0;
71+
for (uint32_t k = start; k < end; ++k) {
72+
sum += val[k] * x[x_off[k] / sizeof(double)];
73+
}
74+
y[row] = sum;
75+
} else {
76+
y[row] = spmv_row_v64b(val + start, x_off + start, x, nnz);
77+
}
78+
}
79+
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// Copyright 2025 ETH Zurich and University of Bologna.
2+
//
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
// Include guard renamed from _SPMV_H: identifiers starting with an underscore
// followed by an uppercase letter are reserved for the implementation (C11
// 7.1.3), so user code must not define them.
#ifndef SPMV_H_
#define SPMV_H_

#include <stdint.h>

// Compute y[row_start:row_end] = A * x for an fp64 CSR matrix.
// row_ptr delimits each row's slice of val/x_off; x_off holds per-non-zero
// byte offsets into x (column index scaled by sizeof(double)).
void spmv_v64b(const uint32_t *row_ptr, const uint32_t *x_off, const double *val,
               const double *x, double *y, uint32_t row_start,
               uint32_t row_end);

#endif  // SPMV_H_

sw/spatzBenchmarks/spmv/main.c

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
// Copyright 2025 ETH Zurich and University of Bologna.
2+
//
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#include <benchmark.h>
6+
#include <snrt.h>
7+
#include <stdint.h>
8+
#include <stdio.h>
9+
10+
#include DATAHEADER
11+
#include "kernel/spmv.c"
12+
13+
#if (PREC != 64)
14+
#error "spmv currently supports double precision only"
15+
#endif
16+
17+
#define T double
18+
19+
// Use all cluster cores by default. Set SPMV_NUM_CORES to a positive value to
20+
// limit the kernel to a smaller number of worker cores.
21+
#ifndef SPMV_NUM_CORES
22+
#define SPMV_NUM_CORES 0
23+
#endif
24+
25+
// Shared cluster buffers, set up by core 0 before the first barrier.
static T *row_val;         // CSR non-zero values
static T *x_vec;           // dense input vector x
static T *result;          // output vector y
static uint32_t *row_ptr;  // CSR row pointers (M + 1 entries)
static uint32_t *col_idx;  // CSR column indices (K = nnz entries)
static uint32_t *x_off;    // per-non-zero byte offsets into x_vec

// Allocate `size` bytes from L1 with at least `alignment` bytes of alignment
// (alignment must be a power of two). Returns NULL on allocation failure;
// the previous version would round a NULL result up to a bogus non-NULL
// pointer (0 + alignment - 1 masked), hiding the failure from callers.
static inline void *l1alloc_aligned(size_t size, size_t alignment) {
  uintptr_t raw = (uintptr_t)snrt_l1alloc(size + alignment - 1);
  if (raw == 0) return NULL;  // propagate allocation failure
  uintptr_t aligned = (raw + alignment - 1) & ~(uintptr_t)(alignment - 1);
  return (void *)aligned;
}
37+
38+
// Absolute difference |a - b|, computed without pulling in libm.
static inline double abs_diff(double a, double b) {
  return (a > b) ? (a - b) : (b - a);
}
42+
43+
// Compare a computed value against its golden reference: returns nonzero
// when the absolute error exceeds the 1e-3 tolerance.
static inline int fp_check(const T *a, const T *b) {
  const double tolerance = 0.001;
  return abs_diff(*a, *b) > tolerance;
}
46+
47+
static inline void build_offsets(uint32_t *dst, const uint32_t *src,
48+
uint32_t nnz) {
49+
for (uint32_t i = 0; i < nnz; ++i) dst[i] = src[i] * sizeof(T);
50+
}
51+
52+
// Benchmark driver: runs multi-core CSR SpMV (y = A * x), times it over one
// or two iterations, and verifies the result against golden data from
// DATAHEADER (spmv_l, spmv_*_dram, spmv_result, spmv_checksum).
int main() {
  const uint32_t num_cores_hw = snrt_cluster_core_num();
  const uint32_t cid = snrt_cluster_core_idx();
  // Optionally cap the number of worker cores via SPMV_NUM_CORES (0 = all).
  const uint32_t num_cores =
      (SPMV_NUM_CORES > 0 && SPMV_NUM_CORES < num_cores_hw) ? SPMV_NUM_CORES
                                                            : num_cores_hw;

// Scratchpad size passed to l1d_init: smaller when the L1 is mostly used as
// a cache (USE_CACHE), larger when data is staged into SPM explicitly.
#if USE_CACHE == 1
  uint32_t spm_size = 16;
#else
  uint32_t spm_size = 120;
#endif

  // FPU lanes per core used in the utilization formula below
  // (4 for T=double; presumably FLEN/16 — confirm against benchmark.h).
  const uint32_t num_fpu = sizeof(T) / 2;

  if (cid == 0) {
    // NOTE(review): presumably configures the L1 cache/SPM split — confirm
    // against the benchmark runtime.
    l1d_init(spm_size);
  }

// One measured iteration, or two (first = warm-up, second = "best").
#if MEAS_1ITER == 1
  const int measure_iter = 1;
#else
  const int measure_iter = 2;
#endif

  unsigned int timer = (unsigned int)-1;
  unsigned int timer_best = (unsigned int)-1;
  unsigned int timer_1iter = (unsigned int)-1;
  int ret = 0;

  // Static row partitioning: core cid handles rows [row_start, row_end);
  // cores beyond num_cores get an empty range but still hit the barriers.
  const uint32_t row_start = (cid < num_cores) ? (spmv_l.M * cid) / num_cores : 0;
  const uint32_t row_end =
      (cid < num_cores) ? (spmv_l.M * (cid + 1)) / num_cores : 0;

#if USE_CACHE == 1
  // Cached mode: read CSR arrays directly from DRAM; only the derived byte
  // offsets and the output need L1 storage.
  if (cid == 0) {
    x_off = (uint32_t *)l1alloc_aligned(spmv_l.K * sizeof(uint32_t), 8);
    result = (T *)l1alloc_aligned(spmv_l.M * sizeof(T), 8);
    build_offsets(x_off, spmv_col_idx_dram, spmv_l.K);
  }

  row_ptr = spmv_row_ptr_dram;
  col_idx = spmv_col_idx_dram;
  row_val = spmv_val_dram;
  x_vec = spmv_x_dram;
#else
  // SPM mode: core 0 allocates L1 buffers and DMA-copies all inputs in,
  // then derives the gather byte offsets from the column indices.
  if (cid == 0) {
    row_ptr = (uint32_t *)l1alloc_aligned((spmv_l.M + 1) * sizeof(uint32_t), 8);
    col_idx = (uint32_t *)l1alloc_aligned(spmv_l.K * sizeof(uint32_t), 8);
    x_off = (uint32_t *)l1alloc_aligned(spmv_l.K * sizeof(uint32_t), 8);
    row_val = (T *)l1alloc_aligned(spmv_l.K * sizeof(T), 8);
    x_vec = (T *)l1alloc_aligned(spmv_l.N * sizeof(T), 8);
    result = (T *)l1alloc_aligned(spmv_l.M * sizeof(T), 8);

    snrt_dma_start_1d(row_ptr, spmv_row_ptr_dram,
                      (spmv_l.M + 1) * sizeof(uint32_t));
    snrt_dma_start_1d(col_idx, spmv_col_idx_dram, spmv_l.K * sizeof(uint32_t));
    snrt_dma_start_1d(row_val, spmv_val_dram, spmv_l.K * sizeof(T));
    snrt_dma_start_1d(x_vec, spmv_x_dram, spmv_l.N * sizeof(T));
    snrt_dma_wait_all();
    build_offsets(x_off, col_idx, spmv_l.K);
  }
#endif

  // Ensure all cores see the buffers published by core 0 before computing.
  snrt_cluster_hw_barrier();

  for (int iter = 0; iter < measure_iter; ++iter) {
    // Core 0 owns the timers; the barrier after the kernel makes the
    // measured span cover the slowest core.
    if (cid == 0) {
      start_kernel();
      timer = benchmark_get_cycle();
    }

    spmv_v64b(row_ptr, x_off, row_val, x_vec, result, row_start, row_end);

    snrt_cluster_hw_barrier();

    if (cid == 0) {
      stop_kernel();
      timer = benchmark_get_cycle() - timer;
      if (iter == 0) {
        timer_1iter = timer;
      } else {
        // Keep the fastest of the non-first iterations.
        timer_best = (timer_best > timer) ? timer : timer_best;
      }
    }

    snrt_cluster_hw_barrier();
  }

  // With a single iteration there is no "best"; reuse the first measurement.
  if (measure_iter == 1) timer_best = timer_1iter;

  if (cid == 0) {
    double checksum = 0.0;
    int errors = 0;

    // Element-wise check against the golden vector, plus a running checksum.
    for (uint32_t i = 0; i < spmv_l.M; ++i) {
      checksum += result[i];
      if (fp_check(&result[i], &spmv_result[i])) {
        ++errors;
        printf("Error: row %u result=%f golden=%f\n", i, result[i],
               spmv_result[i]);
      }
    }

    // Whole-vector checksum guards against compensating per-row errors.
    if (abs_diff(checksum, spmv_checksum) > 0.001) {
      ++errors;
      printf("Error: checksum=%f golden=%f\n", checksum, spmv_checksum);
    }

    write_cyc(timer_best);

    {
      // SpMV performs 2*nnz FLOPs (one mul + one add per non-zero);
      // performance is scaled to ops per 1000 cycles, utilization to
      // per-mille of the cluster's peak (2 ops/FPU/cycle assumed).
      const unsigned long performance = 1000UL * 2UL * spmv_l.K / timer_best;
      const unsigned long utilization =
          performance / (2 * num_cores * num_fpu * 8 / sizeof(T));

      printf("\n----- (%u x %u, nnz=%u) spmv -----\n", spmv_l.M, spmv_l.N,
             spmv_l.K);
      printf("Active cores: %u / %u\n", num_cores, num_cores_hw);
      printf("The first iter takes %u cycles.\n", timer_1iter);
      printf("The best execution took %u cycles.\n", timer_best);
      printf("Checksum: %f\n", checksum);
      printf("The performance is %lu OP/1000cycle (%lu%%o utilization).\n",
             performance, utilization);
    }

    if (errors) ret = -1;
  }

  // All cores leave together; only core 0's ret can be nonzero.
  snrt_cluster_hw_barrier();
  set_eoc();
  return ret;
}

sw/spatzBenchmarks/spmv/prompt.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
Implement CSR SpMV (y=A*x) with fp16/fp32/fp64 kernels similar to the existing GEMV style.
2+
3+
CSR:
4+
row_ptr[nrows+1] u32, col_idx[nnz] u32, val[nnz] fp16/32/64, x[ncols], y[nrows].
5+
6+
Scalar reference:
7+
for i: sum=0; for k=row_ptr[i]..row_ptr[i+1]-1: sum += val[k]*x[col_idx[k]]; y[i]=sum.
8+
9+
Vector kernel (within each row, strip-mine nnz):
10+
for i:
11+
sum_scalar = 0
12+
k = row_ptr[i]
13+
while k < row_ptr[i+1]:
14+
vl = vsetvli(min(VLMAX, row_ptr[i+1]-k), e16/e32/e64, m1, ta, ma)
15+
v_val = vleXX.v(&val[k])
16+
v_idx = vle32.v(&col_idx[k])
17+
v_off = v_idx << shift (shift=1 for fp16, 2 for fp32, 3 for fp64)
18+
v_x = vluxei32.v(x_base, v_off)
19+
v_p = v_val * v_x
20+
reduce v_p to a scalar with vfredusum (RVV 1.0 name; formerly vfredsum); add into sum_scalar
21+
k += vl
22+
y[i] = sum_scalar
23+
24+
Add small-row fallback: if nnz < N (e.g., 4 or 8) do scalar loop.
25+
26+
Keep same multi-core row partitioning scheme as GEMV main.c and add timing + checksum.

0 commit comments

Comments
 (0)