perf(rsema1d): vectorize batched RLC for Coder.Encode (#7143)

Wondertan · claude · web-flow · commit bb697ecaadbe · 2026-04-21T18:34:03.000Z
Adds `computeRLCVectorized`, an RLC computation that uses a hardware-accelerated matrix multiply built on the fused `GF16MulSliceXor8` kernel from klauspost/reedsolomon#334. `Coder.Encode` switches to this path.

The RLC compute was the main CPU user in the profiles, accounting for ~80% of the total CPU profile. This change makes RLC effectively invisible in profiles, leaving hashing as the number one CPU consumer.

The benchmarks below are bottlenecked by the memory throughput of my laptop. On server hardware, the increase should be more substantial.

benchstat BenchmarkCoderEncode/1024x1024x131072 on Ryzen 9 7940HS:

```
                  scalar         vectorized     vs base
workers=1         49.75 MiB/s    363.07 MiB/s   +629.82% (~7.3x)
workers=default   236.20 MiB/s   627.00 MiB/s   +165.48% (~2.7x)
```

The subsequent PR will also migrate the verification to the hardware-optimized version.

Closes the original celestiaorg/rsema1d#15 (Closes #6738)

---

<a href="https://app.devin.ai/review/celestiaorg/celestia-app/pull/7143" target="_blank"> <picture> <source media="(prefers-color-scheme: dark)" srcset="https://static.devin.ai/assets/gh-open-in-devin-review-dark.svg?v=1"> <img src="https://static.devin.ai/assets/gh-open-in-devin-review-light.svg?v=1" alt="Open in Devin Review"> </picture> </a>

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/go.mod b/go.mod
@@ -55,7 +55,7 @@ require (
 	github.com/grpc-ecosystem/grpc-gateway v1.16.0
 	github.com/hashicorp/go-metrics v0.5.4
 	github.com/joho/godotenv v1.5.1
-	github.com/klauspost/reedsolomon v1.13.3
+	github.com/klauspost/reedsolomon v1.13.4-0.20260420101718-f7e5efe6123a
 	github.com/pelletier/go-toml/v2 v2.3.0
 	github.com/prometheus/client_golang v1.23.2
 	github.com/rs/zerolog v1.35.0
diff --git a/go.sum b/go.sum
@@ -851,8 +851,8 @@ github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBF
 github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ=
 github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y=
 github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0=
-github.com/klauspost/reedsolomon v1.13.3 h1:01GwnO2xoCSaM0ShP4qwl+FsHg3csFShC6Tu/RS1ji0=
-github.com/klauspost/reedsolomon v1.13.3/go.mod h1:yjqqjgMTQkBUHSG97/rm4zipffCNbCiZcB3kTqr++sQ=
+github.com/klauspost/reedsolomon v1.13.4-0.20260420101718-f7e5efe6123a h1:aP94idRf0yhG07gBSIyW3sy/cd+XNLWnghSp11y0oIc=
+github.com/klauspost/reedsolomon v1.13.4-0.20260420101718-f7e5efe6123a/go.mod h1:yjqqjgMTQkBUHSG97/rm4zipffCNbCiZcB3kTqr++sQ=
 github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
diff --git a/pkg/rsema1d/coder.go b/pkg/rsema1d/coder.go
@@ -67,7 +67,7 @@ func (c *Coder) commit(extendedRows [][]byte) *ExtendedData {
 
 	// derive RLC coefficients and compute RLC results for original rows
 	coeffs := deriveCoefficients(rowRoot, len(extendedRows[0]))
-	rlcOrig := computeRLCOrig(extendedRows[:c.config.K], coeffs, c.config)
+	rlcOrig := computeRLCVectorized(extendedRows[:c.config.K], coeffs, c.config)
 
 	// build padded RLC Merkle tree
 	rlcOrigTree := BuildPaddedRLCTree(rlcOrig, c.config)
diff --git a/pkg/rsema1d/coder_bench_test.go b/pkg/rsema1d/coder_bench_test.go
@@ -2,6 +2,7 @@ package rsema1d
 
 import (
 	"math/rand/v2"
+	"runtime"
 	"testing"
 )
 
@@ -10,16 +11,27 @@ func BenchmarkCoderEncode(b *testing.B) {
 		name    string
 		k, n    int
 		rowSize int
+		workers int // 0 means runtime.NumCPU()
 	}{
-		{"4x4x64", 4, 4, 64},
-		{"64x64x512", 64, 64, 512},
-		{"1024x1024x1024", 1024, 1024, 1024},
-		{"4096x12288x8192", 4096, 12288, 8192},
+		{"4x4x64", 4, 4, 64, 1},
+		{"64x64x512", 64, 64, 512, 1},
+		{"1024x1024x1024", 1024, 1024, 1024, 1},
+		{"4096x12288x8192", 4096, 12288, 8192, 1},
+		// 128 MB original / K=1024 / 128 KB rows — the largest single-row size
+		// in the wider bench matrix. Covers both workers=1 and the default
+		// multi-worker path so the RLC SIMD win vs. the rest of the pipeline
+		// (Leopard extend + row-Merkle) is visible.
+		{"1024x1024x131072", 1024, 1024, 131072, 1},
+		{"1024x1024x131072/workers=default", 1024, 1024, 131072, 0},
 	}
 
 	for _, sz := range sizes {
 		b.Run(sz.name, func(b *testing.B) {
-			coder, err := NewCoder(&Config{K: sz.k, N: sz.n, WorkerCount: 1})
+			workers := sz.workers
+			if workers == 0 {
+				workers = runtime.NumCPU()
+			}
+			coder, err := NewCoder(&Config{K: sz.k, N: sz.n, WorkerCount: workers})
 			if err != nil {
 				b.Fatal(err)
 			}
diff --git a/pkg/rsema1d/field/gf128.go b/pkg/rsema1d/field/gf128.go
@@ -78,3 +78,41 @@ func HashToGF128(data []byte) GF128 {
 	// XOR the two halves for final result
 	return Add128(firstHalf, secondHalf)
 }
+
+// GF128Width is the number of GF16 components that make up one GF128 element.
+const GF128Width = 8
+
+// LeopardGF128BufSize returns the byte length of a buffer that holds k GF128
+// values laid out as GF128Width concatenated Leopard-formatted regions
+// (one per GF128 component).
+func LeopardGF128BufSize(k int) int { return GF128Width * 2 * k }
+
+// LeopardGF128Views partitions a GF128Width-by-k Leopard-formatted byte
+// buffer into GF128Width per-component slice headers. len(buf) must equal
+// LeopardGF128BufSize(k). The views share the underlying buffer and are
+// suitable as MulSliceXor8 destinations; recover the GF128s with
+// GF128sFromLeopard(buf, k). The returned array stays on the caller's
+// stack — no heap allocation.
+func LeopardGF128Views(buf []byte, k int) [GF128Width][]byte {
+	stride := 2 * k
+	var views [GF128Width][]byte
+	for i := range views {
+		views[i] = buf[i*stride : (i+1)*stride]
+	}
+	return views
+}
+
+// GF128sFromLeopard reads a GF128Width-by-k Leopard-formatted byte buffer
+// into a freshly allocated []GF128 of length k, one GF128 value per row.
+// The only heap allocation is the returned slice itself.
+func GF128sFromLeopard(buf []byte, k int) []GF128 {
+	out := make([]GF128, k)
+	stride := 2 * k
+	for comp := range GF128Width {
+		view := buf[comp*stride : (comp+1)*stride]
+		for r := range k {
+			out[r][comp] = GF16FromLeopard(view, r)
+		}
+	}
+	return out
+}
diff --git a/pkg/rsema1d/field/gf16.go b/pkg/rsema1d/field/gf16.go
@@ -7,6 +7,14 @@ import (
 // GF16 represents a GF(2^16) field element
 type GF16 uint16
 
+// GF16FromLeopard extracts the r-th GF16 element from a byte slice laid out
+// in Leopard format, i.e. one or more 64-byte chunks each containing 32 low
+// bytes followed by 32 high bytes. Caller must ensure r*2 < len(slab).
+func GF16FromLeopard(slab []byte, r int) GF16 {
+	rb, rr := r/32, r%32
+	return GF16(uint16(slab[rb*64+32+rr])<<8 | uint16(slab[rb*64+rr]))
+}
+
 var ll reedsolomon.LowLevel
 
 // Mul16 multiplies two GF(2^16) elements
@@ -18,3 +26,18 @@ func Mul16(a, b GF16) GF16 {
 func Add16(a, b GF16) GF16 {
 	return a ^ b
 }
+
+// MulSliceXor8 applies the 8 GF16 components of `coeff` as scalars to one
+// shared input slice, XOR-accumulating each into a distinct output slice:
+//
+//	out[k][i] ^= coeff[k] * in[i]   for k in [0, 8)
+//
+// All slices must have equal length, a multiple of 64 bytes, in Leopard
+// format (32 low bytes + 32 high bytes per 64-byte chunk).
+func MulSliceXor8(coeff *GF128, in []byte, outs *[8][]byte) {
+	var s [8]uint16
+	for k, v := range coeff {
+		s[k] = uint16(v)
+	}
+	ll.GF16MulSliceXor8(&s, in, outs)
+}
diff --git a/pkg/rsema1d/rlc_vectorized.go b/pkg/rsema1d/rlc_vectorized.go
@@ -0,0 +1,121 @@
+package rsema1d
+
+import (
+	"crypto/subtle"
+	"sync"
+
+	"github.com/celestiaorg/celestia-app/v9/pkg/rsema1d/field"
+)
+
+// The batched RLC over K rows is a matrix multiply over GF(2^16):
+// Result[r][k] = Σ_i Rows[r][i] * Coeffs[i][k]. We run it as an outer-product
+// accumulate so every inner step is a scalar-broadcast multiply — the shape
+// reedsolomon's SIMD kernel handles natively — and fuse the 8 GF128 components
+// into one kernel call per transposed column.
+const symbolsPerChunk = 32 // GF(2^16) symbols stored per 64-byte Leopard chunk
+
+// computeRLCVectorized computes the RLC of len(rows) rows against `coeffs`
+// using the vectorized GF(2^16) SIMD kernel. Rows must be Leopard-sized
+// (a positive multiple of chunkSize bytes, equal across rows); K is padded
+// up to a multiple of symbolsPerChunk internally when needed.
+func computeRLCVectorized(rows [][]byte, coeffs []field.GF128, config *Config) []field.GF128 {
+	origK := len(rows)
+	if origK == 0 {
+		return nil
+	}
+
+	rows, K := padToSymbolsPerChunk(rows)
+	numChunks := len(rows[0]) / chunkSize
+	workers := min(max(config.WorkerCount, 1), numChunks)
+	if workers == 1 {
+		return field.GF128sFromLeopard(accumulateRLC(rows, coeffs, K, 0, numChunks), K)[:origK]
+	}
+
+	partials := make([][]byte, workers)
+	step, rem := numChunks/workers, numChunks%workers
+	var wg sync.WaitGroup
+	wg.Add(workers)
+	for w := range workers {
+		cStart := w*step + min(w, rem)
+		cEnd := cStart + step
+		if w < rem {
+			cEnd++
+		}
+		go func() {
+			defer wg.Done()
+			partials[w] = accumulateRLC(rows, coeffs, K, cStart, cEnd)
+		}()
+	}
+	wg.Wait()
+
+	for _, p := range partials[1:] {
+		subtle.XORBytes(partials[0], partials[0], p)
+	}
+	return field.GF128sFromLeopard(partials[0], K)[:origK]
+}
+
+// accumulateRLC processes chunk indices [cStart, cEnd) across every row and
+// returns a Leopard GF128-buffer (see field.LeopardGF128BufSize) holding the
+// partial RLC sums.
+func accumulateRLC(rows [][]byte, coeffs []field.GF128, k, cStart, cEnd int) []byte {
+	buf := make([]byte, field.LeopardGF128BufSize(k))
+	outs := field.LeopardGF128Views(buf, k)
+	stride := 2 * k
+	cols := make([]byte, symbolsPerChunk*stride)
+
+	rowBlocks := k / symbolsPerChunk
+	for c := cStart; c < cEnd; c++ {
+		transposeChunk(cols, k, rows, c, rowBlocks)
+		for j := range symbolsPerChunk {
+			col := cols[j*stride : (j+1)*stride]
+			field.MulSliceXor8(&coeffs[c*symbolsPerChunk+j], col, &outs)
+		}
+	}
+	return buf
+}
+
+// transposeChunk gathers the 64-byte Leopard chunk at chunk index c (byte
+// offset c*chunkSize) from each of k rows and redistributes it into
+// symbolsPerChunk column buffers, each of stride 2k bytes and itself in
+// Leopard format. Reading a block of 32 rows at a time keeps each row's cache
+// line hot while we scatter into the column buffers.
+func transposeChunk(cols []byte, k int, rows [][]byte, c, rowBlocks int) {
+	stride := 2 * k
+	rowOff := c * chunkSize
+	var block [symbolsPerChunk * chunkSize]byte
+	for rb := range rowBlocks {
+		rowBase := rb * symbolsPerChunk
+		for rr := range symbolsPerChunk {
+			copy(block[rr*chunkSize:(rr+1)*chunkSize], rows[rowBase+rr][rowOff:rowOff+chunkSize])
+		}
+		colOff := rb * chunkSize
+		for j := range symbolsPerChunk {
+			dst := cols[j*stride+colOff : j*stride+colOff+chunkSize]
+			for rr := range symbolsPerChunk {
+				src := block[rr*chunkSize:]
+				dst[rr] = src[j]                                 // low byte
+				dst[symbolsPerChunk+rr] = src[symbolsPerChunk+j] // high byte
+			}
+		}
+	}
+}
+
+// padToSymbolsPerChunk returns rows with its length rounded up to a multiple
+// of symbolsPerChunk by filling the tail with one shared zero row, along with
+// the (possibly padded) length. rows is not mutated; a new outer slice is
+// allocated only when padding is needed.
+func padToSymbolsPerChunk(rows [][]byte) ([][]byte, int) {
+	K := len(rows)
+	rem := K % symbolsPerChunk
+	if rem == 0 {
+		return rows, K
+	}
+	paddedK := K + symbolsPerChunk - rem
+	padded := make([][]byte, paddedK)
+	copy(padded, rows)
+	zero := make([]byte, len(rows[0]))
+	for i := K; i < paddedK; i++ {
+		padded[i] = zero
+	}
+	return padded, paddedK
+}
diff --git a/pkg/rsema1d/rlc_vectorized_test.go b/pkg/rsema1d/rlc_vectorized_test.go
@@ -0,0 +1,102 @@
+package rsema1d
+
+import (
+	"math/rand/v2"
+	"testing"
+
+	"github.com/celestiaorg/celestia-app/v9/pkg/rsema1d/field"
+)
+
+// TestComputeRLCVectorizedMatchesScalar verifies the vectorized SIMD kernel
+// produces the same []GF128 as the per-row scalar loop across a range of
+// eligible K/rowSize combinations and both worker counts.
+func TestComputeRLCVectorizedMatchesScalar(t *testing.T) {
+	cases := []struct{ k, rowSize int }{
+		{32, 64},
+		{32, 128},
+		{64, 64},
+		{256, 256},
+		{1024, 1024},
+		{1024, 8192},
+		{4096, 4096},
+		// K values that are not a multiple of symbolsPerChunk to exercise
+		// the internal zero-row padding path.
+		{1, 64},
+		{17, 128},
+		{33, 64},
+		{100, 256},
+		{1023, 1024},
+	}
+	for _, tc := range cases {
+		rows := make([][]byte, tc.k)
+		r := rand.New(rand.NewPCG(uint64(tc.k), uint64(tc.rowSize)))
+		for i := range rows {
+			rows[i] = make([]byte, tc.rowSize)
+			for j := range rows[i] {
+				rows[i][j] = byte(r.IntN(256))
+			}
+		}
+		var rowRoot [32]byte
+		for i := range rowRoot {
+			rowRoot[i] = byte(r.IntN(256))
+		}
+		coeffs := deriveCoefficients(rowRoot, tc.rowSize)
+		cfg := &Config{K: tc.k, N: tc.k, RowSize: tc.rowSize, WorkerCount: 1}
+
+		want := computeRLCOrig(rows, coeffs, cfg)
+		for _, workers := range []int{1, 4} {
+			cfg.WorkerCount = workers
+			got := computeRLCVectorized(rows, coeffs, cfg)
+			if len(want) != len(got) {
+				t.Fatalf("k=%d rs=%d workers=%d length mismatch", tc.k, tc.rowSize, workers)
+			}
+			for i := range want {
+				if !field.Equal128(want[i], got[i]) {
+					t.Fatalf("k=%d rs=%d workers=%d row %d mismatch: want %v got %v",
+						tc.k, tc.rowSize, workers, i, want[i], got[i])
+				}
+			}
+		}
+	}
+}
+
+// BenchmarkComputeRLCVectorized measures the vectorized SIMD RLC kernel at
+// the largest single-row size in the matrix — 128MB total, K=1024 → 128KB
+// per row. Both single-worker and 16-worker variants are covered.
+func BenchmarkComputeRLCVectorized(b *testing.B) {
+	configs := []struct {
+		name        string
+		bytes, k, n int
+		workers     int
+	}{
+		{"size=128MB/k=1024/n=1024", 128 << 20, 1024, 1024, 1},
+		{"size=128MB/k=1024/n=1024/workers=16", 128 << 20, 1024, 1024, 16},
+		{"size=128MB/k=1024/n=3072", 128 << 20, 1024, 3072, 1},
+		{"size=128MB/k=1024/n=3072/workers=16", 128 << 20, 1024, 3072, 16},
+	}
+	for _, cfg := range configs {
+		b.Run(cfg.name, func(b *testing.B) {
+			rowSize := cfg.bytes / cfg.k
+			codecConfig := &Config{
+				K: cfg.k, N: cfg.n, RowSize: rowSize, WorkerCount: cfg.workers,
+			}
+			rowRoot := [32]byte{1, 2, 3, 4}
+			coeffs := deriveCoefficients(rowRoot, rowSize)
+
+			data := make([][]byte, cfg.k)
+			r := rand.New(rand.NewPCG(uint64(cfg.k), uint64(rowSize)))
+			for i := range data {
+				data[i] = make([]byte, rowSize)
+				for j := range data[i] {
+					data[i][j] = byte(r.IntN(256))
+				}
+			}
+
+			b.SetBytes(int64(cfg.bytes))
+			b.ResetTimer()
+			for range b.N {
+				_ = computeRLCVectorized(data, coeffs, codecConfig)
+			}
+		})
+	}
+}
diff --git a/test/docker-e2e/go.mod b/test/docker-e2e/go.mod
@@ -193,7 +193,7 @@ require (
 	github.com/jmhodges/levigo v1.0.0 // indirect
 	github.com/klauspost/compress v1.18.5 // indirect
 	github.com/klauspost/cpuid/v2 v2.3.0 // indirect
-	github.com/klauspost/reedsolomon v1.13.3 // indirect
+	github.com/klauspost/reedsolomon v1.13.4-0.20260420101718-f7e5efe6123a // indirect
 	github.com/kr/pretty v0.3.1 // indirect
 	github.com/kr/text v0.2.0 // indirect
 	github.com/lib/pq v1.12.3 // indirect
diff --git a/test/docker-e2e/go.sum b/test/docker-e2e/go.sum
@@ -643,8 +643,8 @@ github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBF
 github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ=
 github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y=
 github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0=
-github.com/klauspost/reedsolomon v1.13.3 h1:01GwnO2xoCSaM0ShP4qwl+FsHg3csFShC6Tu/RS1ji0=
-github.com/klauspost/reedsolomon v1.13.3/go.mod h1:yjqqjgMTQkBUHSG97/rm4zipffCNbCiZcB3kTqr++sQ=
+github.com/klauspost/reedsolomon v1.13.4-0.20260420101718-f7e5efe6123a h1:aP94idRf0yhG07gBSIyW3sy/cd+XNLWnghSp11y0oIc=
+github.com/klauspost/reedsolomon v1.13.4-0.20260420101718-f7e5efe6123a/go.mod h1:yjqqjgMTQkBUHSG97/rm4zipffCNbCiZcB3kTqr++sQ=
 github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=