Skip to content

Commit bb697ec

Browse files
Wondertan and claude authored
perf(rsema1d): vectorize batched RLC for Coder.Encode (#7143)
Adds `computeRLCVectorized`, an RLC computation that uses a hardware-accelerated matrix multiply built on the fused `GF16MulSliceXor8` kernel from klauspost/reedsolomon#334. `Coder.Encode` switches to this path. The RLC compute was the main CPU consumer in the profiles, accounting for ~80% of the total CPU profile. This change makes RLC effectively invisible in profiles, leaving hashing as the number-one CPU consumer. The benchmarks below are bottlenecked by the memory throughput of my laptop. On the server's hardware, the increase should be more substantial. benchstat BenchmarkCoderEncode/1024x1024x131072 on Ryzen 9 7940HS: ``` scalar vectorized vs base workers=1 49.75 MiB/s 363.07 MiB/s +629.82% (~7.3x) workers=default 236.20 MiB/s 627.00 MiB/s +165.48% (~2.7x) ``` The subsequent PR will also migrate the verification to the hardware-optimized version. Closes the original celestiaorg/rsema1d#15 (Closes #6738) <!-- devin-review-badge-begin --> --- <a href="https://app.devin.ai/review/celestiaorg/celestia-app/pull/7143" target="_blank"> <picture> <source media="(prefers-color-scheme: dark)" srcset="https://static.devin.ai/assets/gh-open-in-devin-review-dark.svg?v=1"> <img src="https://static.devin.ai/assets/gh-open-in-devin-review-light.svg?v=1" alt="Open in Devin Review"> </picture> </a> <!-- devin-review-badge-end --> Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent acdbd27 commit bb697ec

10 files changed

Lines changed: 308 additions & 12 deletions

File tree

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ require (
5555
github.com/grpc-ecosystem/grpc-gateway v1.16.0
5656
github.com/hashicorp/go-metrics v0.5.4
5757
github.com/joho/godotenv v1.5.1
58-
github.com/klauspost/reedsolomon v1.13.3
58+
github.com/klauspost/reedsolomon v1.13.4-0.20260420101718-f7e5efe6123a
5959
github.com/pelletier/go-toml/v2 v2.3.0
6060
github.com/prometheus/client_golang v1.23.2
6161
github.com/rs/zerolog v1.35.0

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -851,8 +851,8 @@ github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBF
851851
github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ=
852852
github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y=
853853
github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0=
854-
github.com/klauspost/reedsolomon v1.13.3 h1:01GwnO2xoCSaM0ShP4qwl+FsHg3csFShC6Tu/RS1ji0=
855-
github.com/klauspost/reedsolomon v1.13.3/go.mod h1:yjqqjgMTQkBUHSG97/rm4zipffCNbCiZcB3kTqr++sQ=
854+
github.com/klauspost/reedsolomon v1.13.4-0.20260420101718-f7e5efe6123a h1:aP94idRf0yhG07gBSIyW3sy/cd+XNLWnghSp11y0oIc=
855+
github.com/klauspost/reedsolomon v1.13.4-0.20260420101718-f7e5efe6123a/go.mod h1:yjqqjgMTQkBUHSG97/rm4zipffCNbCiZcB3kTqr++sQ=
856856
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
857857
github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
858858
github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=

pkg/rsema1d/coder.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ func (c *Coder) commit(extendedRows [][]byte) *ExtendedData {
6767

6868
// derive RLC coefficients and compute RLC results for original rows
6969
coeffs := deriveCoefficients(rowRoot, len(extendedRows[0]))
70-
rlcOrig := computeRLCOrig(extendedRows[:c.config.K], coeffs, c.config)
70+
rlcOrig := computeRLCVectorized(extendedRows[:c.config.K], coeffs, c.config)
7171

7272
// build padded RLC Merkle tree
7373
rlcOrigTree := BuildPaddedRLCTree(rlcOrig, c.config)

pkg/rsema1d/coder_bench_test.go

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package rsema1d
22

33
import (
44
"math/rand/v2"
5+
"runtime"
56
"testing"
67
)
78

@@ -10,16 +11,27 @@ func BenchmarkCoderEncode(b *testing.B) {
1011
name string
1112
k, n int
1213
rowSize int
14+
workers int // 0 means runtime.NumCPU()
1315
}{
14-
{"4x4x64", 4, 4, 64},
15-
{"64x64x512", 64, 64, 512},
16-
{"1024x1024x1024", 1024, 1024, 1024},
17-
{"4096x12288x8192", 4096, 12288, 8192},
16+
{"4x4x64", 4, 4, 64, 1},
17+
{"64x64x512", 64, 64, 512, 1},
18+
{"1024x1024x1024", 1024, 1024, 1024, 1},
19+
{"4096x12288x8192", 4096, 12288, 8192, 1},
20+
// 128 MB original / K=1024 / 128 KB rows — the largest single-row size
21+
// in the wider bench matrix. Covers both workers=1 and the default
22+
// multi-worker path so the RLC SIMD win vs. the rest of the pipeline
23+
// (Leopard extend + row-Merkle) is visible.
24+
{"1024x1024x131072", 1024, 1024, 131072, 1},
25+
{"1024x1024x131072/workers=default", 1024, 1024, 131072, 0},
1826
}
1927

2028
for _, sz := range sizes {
2129
b.Run(sz.name, func(b *testing.B) {
22-
coder, err := NewCoder(&Config{K: sz.k, N: sz.n, WorkerCount: 1})
30+
workers := sz.workers
31+
if workers == 0 {
32+
workers = runtime.NumCPU()
33+
}
34+
coder, err := NewCoder(&Config{K: sz.k, N: sz.n, WorkerCount: workers})
2335
if err != nil {
2436
b.Fatal(err)
2537
}

pkg/rsema1d/field/gf128.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,3 +78,41 @@ func HashToGF128(data []byte) GF128 {
7878
// XOR the two halves for final result
7979
return Add128(firstHalf, secondHalf)
8080
}
81+
82+
// GF128Width is the number of GF16 components that make up one GF128 element.
83+
const GF128Width = 8
84+
85+
// LeopardGF128BufSize returns the byte length of a buffer that holds k GF128
86+
// values laid out as GF128Width concatenated Leopard-formatted regions
87+
// (one per GF128 component).
88+
func LeopardGF128BufSize(k int) int { return GF128Width * 2 * k }
89+
90+
// LeopardGF128Views partitions a GF128Width-by-k Leopard-formatted byte
91+
// buffer into GF128Width per-component slice headers. len(buf) must equal
92+
// LeopardGF128BufSize(k). The views share the underlying buffer and are
93+
// suitable as MulSliceXor8 destinations; recover the GF128s with
94+
// GF128sFromLeopard(buf, k). The returned array stays on the caller's
95+
// stack — no heap allocation.
96+
func LeopardGF128Views(buf []byte, k int) [GF128Width][]byte {
97+
stride := 2 * k
98+
var views [GF128Width][]byte
99+
for i := range views {
100+
views[i] = buf[i*stride : (i+1)*stride]
101+
}
102+
return views
103+
}
104+
105+
// GF128sFromLeopard reads a GF128Width-by-k Leopard-formatted byte buffer
106+
// into a freshly allocated []GF128 of length k, one GF128 value per row.
107+
// The only heap allocation is the returned slice itself.
108+
func GF128sFromLeopard(buf []byte, k int) []GF128 {
109+
out := make([]GF128, k)
110+
stride := 2 * k
111+
for comp := range GF128Width {
112+
view := buf[comp*stride : (comp+1)*stride]
113+
for r := range k {
114+
out[r][comp] = GF16FromLeopard(view, r)
115+
}
116+
}
117+
return out
118+
}

pkg/rsema1d/field/gf16.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@ import (
77
// GF16 represents a GF(2^16) field element
88
type GF16 uint16
99

10+
// GF16FromLeopard extracts the r-th GF16 element from a byte slice laid out
11+
// in Leopard format, i.e. one or more 64-byte chunks each containing 32 low
12+
// bytes followed by 32 high bytes. Caller must ensure r*2 < len(slab).
13+
func GF16FromLeopard(slab []byte, r int) GF16 {
14+
rb, rr := r/32, r%32
15+
return GF16(uint16(slab[rb*64+32+rr])<<8 | uint16(slab[rb*64+rr]))
16+
}
17+
1018
var ll reedsolomon.LowLevel
1119

1220
// Mul16 multiplies two GF(2^16) elements
@@ -18,3 +26,18 @@ func Mul16(a, b GF16) GF16 {
1826
func Add16(a, b GF16) GF16 {
1927
return a ^ b
2028
}
29+
30+
// MulSliceXor8 applies the 8 GF16 components of `coeff` as scalars to one
31+
// shared input slice, XOR-accumulating each into a distinct output slice:
32+
//
33+
// out[k][i] ^= coeff[k] * in[i] for k in [0, 8)
34+
//
35+
// All slices must have equal length, a multiple of 64 bytes, in Leopard
36+
// format (32 low bytes + 32 high bytes per 64-byte chunk).
37+
func MulSliceXor8(coeff *GF128, in []byte, outs *[8][]byte) {
38+
var s [8]uint16
39+
for k, v := range coeff {
40+
s[k] = uint16(v)
41+
}
42+
ll.GF16MulSliceXor8(&s, in, outs)
43+
}

pkg/rsema1d/rlc_vectorized.go

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
package rsema1d
2+
3+
import (
4+
"crypto/subtle"
5+
"sync"
6+
7+
"github.com/celestiaorg/celestia-app/v9/pkg/rsema1d/field"
8+
)
9+
10+
// The batched RLC over K rows is a matrix multiply over GF(2^16):
11+
// Result[r][k] = Σ_i Rows[r][i] * Coeffs[i][k]. We run it as an outer-product
12+
// accumulate so every inner step is a scalar-broadcast multiply — the shape
13+
// reedsolomon's SIMD kernel handles natively — and fuse the 8 GF128 components
14+
// into one kernel call per transposed column.
15+
const symbolsPerChunk = 32 // GF(2^16) symbols stored per 64-byte Leopard chunk
16+
17+
// computeRLCVectorized computes the RLC of len(rows) rows against `coeffs`
18+
// using the vectorized GF(2^16) SIMD kernel. Rows must be Leopard-sized
19+
// (a positive multiple of chunkSize bytes, equal across rows); K is padded
20+
// up to a multiple of symbolsPerChunk internally when needed.
21+
func computeRLCVectorized(rows [][]byte, coeffs []field.GF128, config *Config) []field.GF128 {
22+
origK := len(rows)
23+
if origK == 0 {
24+
return nil
25+
}
26+
27+
rows, K := padToSymbolsPerChunk(rows)
28+
numChunks := len(rows[0]) / chunkSize
29+
workers := min(max(config.WorkerCount, 1), numChunks)
30+
if workers == 1 {
31+
return field.GF128sFromLeopard(accumulateRLC(rows, coeffs, K, 0, numChunks), K)[:origK]
32+
}
33+
34+
partials := make([][]byte, workers)
35+
step, rem := numChunks/workers, numChunks%workers
36+
var wg sync.WaitGroup
37+
wg.Add(workers)
38+
for w := range workers {
39+
cStart := w*step + min(w, rem)
40+
cEnd := cStart + step
41+
if w < rem {
42+
cEnd++
43+
}
44+
go func() {
45+
defer wg.Done()
46+
partials[w] = accumulateRLC(rows, coeffs, K, cStart, cEnd)
47+
}()
48+
}
49+
wg.Wait()
50+
51+
for _, p := range partials[1:] {
52+
subtle.XORBytes(partials[0], partials[0], p)
53+
}
54+
return field.GF128sFromLeopard(partials[0], K)[:origK]
55+
}
56+
57+
// accumulateRLC processes chunk indices [cStart, cEnd) across every row and
58+
// returns a Leopard GF128-buffer (see field.LeopardGF128BufSize) holding the
59+
// partial RLC sums.
60+
func accumulateRLC(rows [][]byte, coeffs []field.GF128, k, cStart, cEnd int) []byte {
61+
buf := make([]byte, field.LeopardGF128BufSize(k))
62+
outs := field.LeopardGF128Views(buf, k)
63+
stride := 2 * k
64+
cols := make([]byte, symbolsPerChunk*stride)
65+
66+
rowBlocks := k / symbolsPerChunk
67+
for c := cStart; c < cEnd; c++ {
68+
transposeChunk(cols, k, rows, c, rowBlocks)
69+
for j := range symbolsPerChunk {
70+
col := cols[j*stride : (j+1)*stride]
71+
field.MulSliceXor8(&coeffs[c*symbolsPerChunk+j], col, &outs)
72+
}
73+
}
74+
return buf
75+
}
76+
77+
// transposeChunk gathers the 64-byte Leopard chunk at row offset c from each
78+
// of k rows and redistributes it into symbolsPerChunk column buffers, each of
79+
// stride 2k bytes and itself in Leopard format. Reading a block of 32 rows
80+
// at a time keeps each row's cache line hot while we scatter into the column
81+
// buffers.
82+
func transposeChunk(cols []byte, k int, rows [][]byte, c, rowBlocks int) {
83+
stride := 2 * k
84+
rowOff := c * chunkSize
85+
var block [symbolsPerChunk * chunkSize]byte
86+
for rb := range rowBlocks {
87+
rowBase := rb * symbolsPerChunk
88+
for rr := range symbolsPerChunk {
89+
copy(block[rr*chunkSize:(rr+1)*chunkSize], rows[rowBase+rr][rowOff:rowOff+chunkSize])
90+
}
91+
colOff := rb * chunkSize
92+
for j := range symbolsPerChunk {
93+
dst := cols[j*stride+colOff : j*stride+colOff+chunkSize]
94+
for rr := range symbolsPerChunk {
95+
src := block[rr*chunkSize:]
96+
dst[rr] = src[j] // low byte
97+
dst[symbolsPerChunk+rr] = src[symbolsPerChunk+j] // high byte
98+
}
99+
}
100+
}
101+
}
102+
103+
// padToSymbolsPerChunk returns rows with its length rounded up to a multiple
104+
// of symbolsPerChunk by appending a single shared zero row, along with the
105+
// (possibly padded) length. rows is not mutated; a new slice header is
106+
// returned only when padding is needed.
107+
func padToSymbolsPerChunk(rows [][]byte) ([][]byte, int) {
108+
K := len(rows)
109+
rem := K % symbolsPerChunk
110+
if rem == 0 {
111+
return rows, K
112+
}
113+
paddedK := K + symbolsPerChunk - rem
114+
padded := make([][]byte, paddedK)
115+
copy(padded, rows)
116+
zero := make([]byte, len(rows[0]))
117+
for i := K; i < paddedK; i++ {
118+
padded[i] = zero
119+
}
120+
return padded, paddedK
121+
}

pkg/rsema1d/rlc_vectorized_test.go

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
package rsema1d
2+
3+
import (
4+
"math/rand/v2"
5+
"testing"
6+
7+
"github.com/celestiaorg/celestia-app/v9/pkg/rsema1d/field"
8+
)
9+
10+
// TestComputeRLCVectorizedMatchesScalar verifies the vectorized SIMD kernel
11+
// produces the same []GF128 as the per-row scalar loop across a range of
12+
// eligible K/rowSize combinations and both worker counts.
13+
func TestComputeRLCVectorizedMatchesScalar(t *testing.T) {
14+
cases := []struct{ k, rowSize int }{
15+
{32, 64},
16+
{32, 128},
17+
{64, 64},
18+
{256, 256},
19+
{1024, 1024},
20+
{1024, 8192},
21+
{4096, 4096},
22+
// K values that are not a multiple of symbolsPerChunk to exercise
23+
// the internal zero-row padding path.
24+
{1, 64},
25+
{17, 128},
26+
{33, 64},
27+
{100, 256},
28+
{1023, 1024},
29+
}
30+
for _, tc := range cases {
31+
rows := make([][]byte, tc.k)
32+
r := rand.New(rand.NewPCG(uint64(tc.k), uint64(tc.rowSize)))
33+
for i := range rows {
34+
rows[i] = make([]byte, tc.rowSize)
35+
for j := range rows[i] {
36+
rows[i][j] = byte(r.IntN(256))
37+
}
38+
}
39+
var rowRoot [32]byte
40+
for i := range rowRoot {
41+
rowRoot[i] = byte(r.IntN(256))
42+
}
43+
coeffs := deriveCoefficients(rowRoot, tc.rowSize)
44+
cfg := &Config{K: tc.k, N: tc.k, RowSize: tc.rowSize, WorkerCount: 1}
45+
46+
want := computeRLCOrig(rows, coeffs, cfg)
47+
for _, workers := range []int{1, 4} {
48+
cfg.WorkerCount = workers
49+
got := computeRLCVectorized(rows, coeffs, cfg)
50+
if len(want) != len(got) {
51+
t.Fatalf("k=%d rs=%d workers=%d length mismatch", tc.k, tc.rowSize, workers)
52+
}
53+
for i := range want {
54+
if !field.Equal128(want[i], got[i]) {
55+
t.Fatalf("k=%d rs=%d workers=%d row %d mismatch: want %v got %v",
56+
tc.k, tc.rowSize, workers, i, want[i], got[i])
57+
}
58+
}
59+
}
60+
}
61+
}
62+
63+
// BenchmarkComputeRLCVectorized measures the vectorized SIMD RLC kernel at
64+
// the largest single-row size in the matrix — 128MB total, K=1024 → 128KB
65+
// per row. Both single-worker and default-worker variants are covered.
66+
func BenchmarkComputeRLCVectorized(b *testing.B) {
67+
configs := []struct {
68+
name string
69+
bytes, k, n int
70+
workers int
71+
}{
72+
{"size=128MB/k=1024/n=1024", 128 << 20, 1024, 1024, 1},
73+
{"size=128MB/k=1024/n=1024/workers=16", 128 << 20, 1024, 1024, 16},
74+
{"size=128MB/k=1024/n=3072", 128 << 20, 1024, 3072, 1},
75+
{"size=128MB/k=1024/n=3072/workers=16", 128 << 20, 1024, 3072, 16},
76+
}
77+
for _, cfg := range configs {
78+
b.Run(cfg.name, func(b *testing.B) {
79+
rowSize := cfg.bytes / cfg.k
80+
codecConfig := &Config{
81+
K: cfg.k, N: cfg.n, RowSize: rowSize, WorkerCount: cfg.workers,
82+
}
83+
rowRoot := [32]byte{1, 2, 3, 4}
84+
coeffs := deriveCoefficients(rowRoot, rowSize)
85+
86+
data := make([][]byte, cfg.k)
87+
r := rand.New(rand.NewPCG(uint64(cfg.k), uint64(rowSize)))
88+
for i := range data {
89+
data[i] = make([]byte, rowSize)
90+
for j := range data[i] {
91+
data[i][j] = byte(r.IntN(256))
92+
}
93+
}
94+
95+
b.SetBytes(int64(cfg.bytes))
96+
b.ResetTimer()
97+
for range b.N {
98+
_ = computeRLCVectorized(data, coeffs, codecConfig)
99+
}
100+
})
101+
}
102+
}

test/docker-e2e/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ require (
193193
github.com/jmhodges/levigo v1.0.0 // indirect
194194
github.com/klauspost/compress v1.18.5 // indirect
195195
github.com/klauspost/cpuid/v2 v2.3.0 // indirect
196-
github.com/klauspost/reedsolomon v1.13.3 // indirect
196+
github.com/klauspost/reedsolomon v1.13.4-0.20260420101718-f7e5efe6123a // indirect
197197
github.com/kr/pretty v0.3.1 // indirect
198198
github.com/kr/text v0.2.0 // indirect
199199
github.com/lib/pq v1.12.3 // indirect

test/docker-e2e/go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -643,8 +643,8 @@ github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBF
643643
github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ=
644644
github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y=
645645
github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0=
646-
github.com/klauspost/reedsolomon v1.13.3 h1:01GwnO2xoCSaM0ShP4qwl+FsHg3csFShC6Tu/RS1ji0=
647-
github.com/klauspost/reedsolomon v1.13.3/go.mod h1:yjqqjgMTQkBUHSG97/rm4zipffCNbCiZcB3kTqr++sQ=
646+
github.com/klauspost/reedsolomon v1.13.4-0.20260420101718-f7e5efe6123a h1:aP94idRf0yhG07gBSIyW3sy/cd+XNLWnghSp11y0oIc=
647+
github.com/klauspost/reedsolomon v1.13.4-0.20260420101718-f7e5efe6123a/go.mod h1:yjqqjgMTQkBUHSG97/rm4zipffCNbCiZcB3kTqr++sQ=
648648
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
649649
github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
650650
github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=

0 commit comments

Comments
 (0)