|
| 1 | +package rsema1d |
| 2 | + |
| 3 | +import ( |
| 4 | + "crypto/subtle" |
| 5 | + "sync" |
| 6 | + |
| 7 | + "github.com/celestiaorg/celestia-app/v9/pkg/rsema1d/field" |
| 8 | +) |
| 9 | + |
// The batched RLC over K rows is a matrix multiply over GF(2^16):
// Result[r][k] = Σ_i Rows[r][i] * Coeffs[i][k]. We run it as an outer-product
// accumulate so every inner step is a scalar-broadcast multiply — the shape
// reedsolomon's SIMD kernel handles natively — and fuse the 8 GF128 components
// into one kernel call per transposed column.
//
// With 2-byte symbols, one 64-byte Leopard chunk holds 32 of them, stored
// split-plane: 32 low bytes followed by 32 high bytes (see transposeChunk).
const symbolsPerChunk = 32 // GF(2^16) symbols stored per 64-byte Leopard chunk
| 16 | + |
| 17 | +// computeRLCVectorized computes the RLC of len(rows) rows against `coeffs` |
| 18 | +// using the vectorized GF(2^16) SIMD kernel. Rows must be Leopard-sized |
| 19 | +// (a positive multiple of chunkSize bytes, equal across rows); K is padded |
| 20 | +// up to a multiple of symbolsPerChunk internally when needed. |
| 21 | +func computeRLCVectorized(rows [][]byte, coeffs []field.GF128, config *Config) []field.GF128 { |
| 22 | + origK := len(rows) |
| 23 | + if origK == 0 { |
| 24 | + return nil |
| 25 | + } |
| 26 | + |
| 27 | + rows, K := padToSymbolsPerChunk(rows) |
| 28 | + numChunks := len(rows[0]) / chunkSize |
| 29 | + workers := min(max(config.WorkerCount, 1), numChunks) |
| 30 | + if workers == 1 { |
| 31 | + return field.GF128sFromLeopard(accumulateRLC(rows, coeffs, K, 0, numChunks), K)[:origK] |
| 32 | + } |
| 33 | + |
| 34 | + partials := make([][]byte, workers) |
| 35 | + step, rem := numChunks/workers, numChunks%workers |
| 36 | + var wg sync.WaitGroup |
| 37 | + wg.Add(workers) |
| 38 | + for w := range workers { |
| 39 | + cStart := w*step + min(w, rem) |
| 40 | + cEnd := cStart + step |
| 41 | + if w < rem { |
| 42 | + cEnd++ |
| 43 | + } |
| 44 | + go func() { |
| 45 | + defer wg.Done() |
| 46 | + partials[w] = accumulateRLC(rows, coeffs, K, cStart, cEnd) |
| 47 | + }() |
| 48 | + } |
| 49 | + wg.Wait() |
| 50 | + |
| 51 | + for _, p := range partials[1:] { |
| 52 | + subtle.XORBytes(partials[0], partials[0], p) |
| 53 | + } |
| 54 | + return field.GF128sFromLeopard(partials[0], K)[:origK] |
| 55 | +} |
| 56 | + |
| 57 | +// accumulateRLC processes chunk indices [cStart, cEnd) across every row and |
| 58 | +// returns a Leopard GF128-buffer (see field.LeopardGF128BufSize) holding the |
| 59 | +// partial RLC sums. |
| 60 | +func accumulateRLC(rows [][]byte, coeffs []field.GF128, k, cStart, cEnd int) []byte { |
| 61 | + buf := make([]byte, field.LeopardGF128BufSize(k)) |
| 62 | + outs := field.LeopardGF128Views(buf, k) |
| 63 | + stride := 2 * k |
| 64 | + cols := make([]byte, symbolsPerChunk*stride) |
| 65 | + |
| 66 | + rowBlocks := k / symbolsPerChunk |
| 67 | + for c := cStart; c < cEnd; c++ { |
| 68 | + transposeChunk(cols, k, rows, c, rowBlocks) |
| 69 | + for j := range symbolsPerChunk { |
| 70 | + col := cols[j*stride : (j+1)*stride] |
| 71 | + field.MulSliceXor8(&coeffs[c*symbolsPerChunk+j], col, &outs) |
| 72 | + } |
| 73 | + } |
| 74 | + return buf |
| 75 | +} |
| 76 | + |
| 77 | +// transposeChunk gathers the 64-byte Leopard chunk at row offset c from each |
| 78 | +// of k rows and redistributes it into symbolsPerChunk column buffers, each of |
| 79 | +// stride 2k bytes and itself in Leopard format. Reading a block of 32 rows |
| 80 | +// at a time keeps each row's cache line hot while we scatter into the column |
| 81 | +// buffers. |
| 82 | +func transposeChunk(cols []byte, k int, rows [][]byte, c, rowBlocks int) { |
| 83 | + stride := 2 * k |
| 84 | + rowOff := c * chunkSize |
| 85 | + var block [symbolsPerChunk * chunkSize]byte |
| 86 | + for rb := range rowBlocks { |
| 87 | + rowBase := rb * symbolsPerChunk |
| 88 | + for rr := range symbolsPerChunk { |
| 89 | + copy(block[rr*chunkSize:(rr+1)*chunkSize], rows[rowBase+rr][rowOff:rowOff+chunkSize]) |
| 90 | + } |
| 91 | + colOff := rb * chunkSize |
| 92 | + for j := range symbolsPerChunk { |
| 93 | + dst := cols[j*stride+colOff : j*stride+colOff+chunkSize] |
| 94 | + for rr := range symbolsPerChunk { |
| 95 | + src := block[rr*chunkSize:] |
| 96 | + dst[rr] = src[j] // low byte |
| 97 | + dst[symbolsPerChunk+rr] = src[symbolsPerChunk+j] // high byte |
| 98 | + } |
| 99 | + } |
| 100 | + } |
| 101 | +} |
| 102 | + |
| 103 | +// padToSymbolsPerChunk returns rows with its length rounded up to a multiple |
| 104 | +// of symbolsPerChunk by appending a single shared zero row, along with the |
| 105 | +// (possibly padded) length. rows is not mutated; a new slice header is |
| 106 | +// returned only when padding is needed. |
| 107 | +func padToSymbolsPerChunk(rows [][]byte) ([][]byte, int) { |
| 108 | + K := len(rows) |
| 109 | + rem := K % symbolsPerChunk |
| 110 | + if rem == 0 { |
| 111 | + return rows, K |
| 112 | + } |
| 113 | + paddedK := K + symbolsPerChunk - rem |
| 114 | + padded := make([][]byte, paddedK) |
| 115 | + copy(padded, rows) |
| 116 | + zero := make([]byte, len(rows[0])) |
| 117 | + for i := K; i < paddedK; i++ { |
| 118 | + padded[i] = zero |
| 119 | + } |
| 120 | + return padded, paddedK |
| 121 | +} |
0 commit comments