Skip to content

Commit f744e0c

Browse files
committed
use tiled pippenger if pool is available
1 parent 56ef6b6 commit f744e0c

File tree

2 files changed

+156
-43
lines changed

2 files changed

+156
-43
lines changed

bindings/napi/blst.zig

Lines changed: 49 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -879,50 +879,56 @@ fn asyncAggregateExecute(_: napi.Env, data: *AsyncAggregateData) void {
879879
}
880880
}
881881

882-
// Build pointer arrays for Pippenger API
883-
var pk_ptrs: [MAX_AGGREGATE_PER_JOB]*const bls.c.blst_p1_affine = undefined;
884-
var sig_ptrs: [MAX_AGGREGATE_PER_JOB]*const bls.c.blst_p2_affine = undefined;
885-
var sca_ptrs: [MAX_AGGREGATE_PER_JOB]*const u8 = undefined;
886-
for (0..n) |i| {
887-
pk_ptrs[i] = &data.pks[i].point;
888-
sig_ptrs[i] = &data.sigs[i].point;
889-
sca_ptrs[i] = &scalars[i * nbytes];
890-
}
882+
// Use thread pool for parallel tiled Pippenger when available.
883+
if (thread_pool) |pool| {
884+
const p1_ret = pool.multP1(@ptrCast(data.pks.ptr), n, &scalars, nbits);
885+
bls.c.blst_p1_to_affine(&data.result_pk.point, &p1_ret);
891886

892-
// Per-call scratch allocation
893-
const scratch_size = @max(
894-
bls.c.blst_p1s_mult_pippenger_scratch_sizeof(n),
895-
bls.c.blst_p2s_mult_pippenger_scratch_sizeof(n),
896-
);
897-
const scratch = allocator.alloc(u64, scratch_size) catch {
898-
data.err = true;
899-
return;
900-
};
901-
defer allocator.free(scratch);
902-
903-
// Pippenger multi-scalar multiplication on G1 (pubkeys)
904-
var p1_ret: bls.c.blst_p1 = std.mem.zeroes(bls.c.blst_p1);
905-
bls.c.blst_p1s_mult_pippenger(
906-
&p1_ret,
907-
@ptrCast(&pk_ptrs),
908-
n,
909-
@ptrCast(&sca_ptrs),
910-
nbits,
911-
scratch.ptr,
912-
);
913-
bls.c.blst_p1_to_affine(&data.result_pk.point, &p1_ret);
914-
915-
// Pippenger multi-scalar multiplication on G2 (signatures)
916-
var p2_ret: bls.c.blst_p2 = std.mem.zeroes(bls.c.blst_p2);
917-
bls.c.blst_p2s_mult_pippenger(
918-
&p2_ret,
919-
@ptrCast(&sig_ptrs),
920-
n,
921-
@ptrCast(&sca_ptrs),
922-
nbits,
923-
scratch.ptr,
924-
);
925-
bls.c.blst_p2_to_affine(&data.result_sig.point, &p2_ret);
887+
const p2_ret = pool.multP2(@ptrCast(data.sigs.ptr), n, &scalars, nbits);
888+
bls.c.blst_p2_to_affine(&data.result_sig.point, &p2_ret);
889+
} else {
890+
// Fallback: single-threaded Pippenger (no thread pool initialized)
891+
var pk_ptrs: [MAX_AGGREGATE_PER_JOB]*const bls.c.blst_p1_affine = undefined;
892+
var sig_ptrs: [MAX_AGGREGATE_PER_JOB]*const bls.c.blst_p2_affine = undefined;
893+
var sca_ptrs: [MAX_AGGREGATE_PER_JOB]*const u8 = undefined;
894+
for (0..n) |i| {
895+
pk_ptrs[i] = &data.pks[i].point;
896+
sig_ptrs[i] = &data.sigs[i].point;
897+
sca_ptrs[i] = &scalars[i * nbytes];
898+
}
899+
900+
const scratch_size = @max(
901+
bls.c.blst_p1s_mult_pippenger_scratch_sizeof(n),
902+
bls.c.blst_p2s_mult_pippenger_scratch_sizeof(n),
903+
);
904+
const scratch = allocator.alloc(u64, scratch_size) catch {
905+
data.err = true;
906+
return;
907+
};
908+
defer allocator.free(scratch);
909+
910+
var p1_ret: bls.c.blst_p1 = std.mem.zeroes(bls.c.blst_p1);
911+
bls.c.blst_p1s_mult_pippenger(
912+
&p1_ret,
913+
@ptrCast(&pk_ptrs),
914+
n,
915+
@ptrCast(&sca_ptrs),
916+
nbits,
917+
scratch.ptr,
918+
);
919+
bls.c.blst_p1_to_affine(&data.result_pk.point, &p1_ret);
920+
921+
var p2_ret: bls.c.blst_p2 = std.mem.zeroes(bls.c.blst_p2);
922+
bls.c.blst_p2s_mult_pippenger(
923+
&p2_ret,
924+
@ptrCast(&sig_ptrs),
925+
n,
926+
@ptrCast(&sca_ptrs),
927+
nbits,
928+
scratch.ptr,
929+
);
930+
bls.c.blst_p2_to_affine(&data.result_sig.point, &p2_ret);
931+
}
926932
}
927933

928934
fn asyncAggregateComplete(env: napi.Env, _: napi.status.Status, data: *AsyncAggregateData) void {

src/bls/ThreadPool.zig

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -593,6 +593,113 @@ fn buildTileGrid(npoints: usize, nbits: usize, ncpus: usize, tiles: []Tile) usiz
593593
return total;
594594
}
595595

/// Multi-scalar multiplication on G1 (pubkeys) using tiled Pippenger.
/// Falls back to single-threaded Pippenger for small inputs or when the pool
/// has fewer than two workers. Returns the point at infinity for zero points.
pub fn multP1(pool: *ThreadPool, points: [*]const c.blst_p1_affine, npoints: usize, scalars: [*]const u8, nbits: usize) c.blst_p1 {
    const nbytes = (nbits + 7) / 8;
    const ncpus = pool.n_workers;

    // Empty input: the multi-scalar sum over zero terms is the additive
    // identity (point at infinity); also avoids indexing `points[0]` below.
    if (npoints == 0) return std.mem.zeroes(c.blst_p1);

    // Single-threaded fallback for small inputs or a single worker.
    if (ncpus < 2 or npoints < 32) {
        const scratch_size = c.blst_p1s_mult_pippenger_scratch_sizeof(npoints);
        const scratch = pool.allocator.alloc(u64, scratch_size) catch {
            // OOM fallback: blst's Pippenger expects a caller-provided scratch
            // buffer (passing null is not documented as supported), so instead
            // do a simple serial scalar-multiply-and-accumulate that needs no
            // scratch memory at all. Slower, but always safe.
            var acc: c.blst_p1 = std.mem.zeroes(c.blst_p1);
            for (0..npoints) |i| {
                var tmp: c.blst_p1 = undefined;
                c.blst_p1_from_affine(&tmp, &points[i]);
                c.blst_p1_mult(&tmp, &tmp, scalars + i * nbytes, nbits);
                c.blst_p1_add_or_double(&acc, &acc, &tmp);
            }
            return acc;
        };
        defer pool.allocator.free(scratch);

        var ret: c.blst_p1 = std.mem.zeroes(c.blst_p1);
        // blst takes null-terminated arrays of pointers; a single entry
        // pointing at the contiguous input means "walk the array in place".
        const pts: [2]?*const c.blst_p1_affine = .{ &points[0], null };
        const sca: [2]?*const u8 = .{ &scalars[0], null };
        c.blst_p1s_mult_pippenger(&ret, @ptrCast(&pts), npoints, @ptrCast(&sca), nbits, scratch.ptr);
        return ret;
    }

    // Tiled parallel Pippenger: partition the (points x scalar-windows) space
    // into tiles, farm tiles out to the pool, then reduce the partial sums.
    var tiles: [MAX_TILES]Tile = undefined;
    const total = buildTileGrid(npoints, nbits, ncpus, &tiles);
    const bd = breakdown(nbits, pippenger_window_size(npoints), ncpus);

    var results: [MAX_TILES]c.blst_p1 = undefined;
    var work_items: [MAX_WORKERS]TileP1WorkItem = undefined;
    var work_ptrs: [MAX_WORKERS]*WorkItem = undefined;

    // Shared job description; workers claim tiles via the atomic counter.
    var job = TileP1Job{
        .points = points,
        .scalars = scalars,
        .nbytes = nbytes,
        .nbits = nbits,
        .tiles = tiles[0..total],
        .results = results[0..total],
        .counter = std.atomic.Value(usize).init(0),
    };

    // Never submit more work items than there are tiles to claim.
    const n_work = @min(ncpus, total);
    for (0..n_work) |i| {
        work_items[i] = .{ .base = .{ .exec_fn = TileP1WorkItem.exec }, .job = &job };
        work_ptrs[i] = &work_items[i].base;
    }
    // Blocks until all tiles are done, so the stack-allocated job/tiles/results
    // stay valid for the workers' entire lifetime.
    pool.submitAndWait(work_ptrs[0..n_work]);

    return reduceTilesP1(tiles[0..total], results[0..total], bd.nx, bd.ny, bd.wnd);
}
650+
/// Multi-scalar multiplication on G2 (signatures) using tiled Pippenger.
/// Falls back to single-threaded Pippenger for small inputs or when the pool
/// has fewer than two workers. Returns the point at infinity for zero points.
pub fn multP2(pool: *ThreadPool, points: [*]const c.blst_p2_affine, npoints: usize, scalars: [*]const u8, nbits: usize) c.blst_p2 {
    const nbytes = (nbits + 7) / 8;
    const ncpus = pool.n_workers;

    // Empty input: the multi-scalar sum over zero terms is the additive
    // identity (point at infinity); also avoids indexing `points[0]` below.
    if (npoints == 0) return std.mem.zeroes(c.blst_p2);

    // Single-threaded fallback for small inputs or a single worker.
    if (ncpus < 2 or npoints < 32) {
        const scratch_size = c.blst_p2s_mult_pippenger_scratch_sizeof(npoints);
        const scratch = pool.allocator.alloc(u64, scratch_size) catch {
            // OOM fallback: blst's Pippenger expects a caller-provided scratch
            // buffer (passing null is not documented as supported), so instead
            // do a simple serial scalar-multiply-and-accumulate that needs no
            // scratch memory at all. Slower, but always safe.
            var acc: c.blst_p2 = std.mem.zeroes(c.blst_p2);
            for (0..npoints) |i| {
                var tmp: c.blst_p2 = undefined;
                c.blst_p2_from_affine(&tmp, &points[i]);
                c.blst_p2_mult(&tmp, &tmp, scalars + i * nbytes, nbits);
                c.blst_p2_add_or_double(&acc, &acc, &tmp);
            }
            return acc;
        };
        defer pool.allocator.free(scratch);

        var ret: c.blst_p2 = std.mem.zeroes(c.blst_p2);
        // blst takes null-terminated arrays of pointers; a single entry
        // pointing at the contiguous input means "walk the array in place".
        const pts: [2]?*const c.blst_p2_affine = .{ &points[0], null };
        const sca: [2]?*const u8 = .{ &scalars[0], null };
        c.blst_p2s_mult_pippenger(&ret, @ptrCast(&pts), npoints, @ptrCast(&sca), nbits, scratch.ptr);
        return ret;
    }

    // Tiled parallel Pippenger: partition the (points x scalar-windows) space
    // into tiles, farm tiles out to the pool, then reduce the partial sums.
    var tiles: [MAX_TILES]Tile = undefined;
    const total = buildTileGrid(npoints, nbits, ncpus, &tiles);
    const bd = breakdown(nbits, pippenger_window_size(npoints), ncpus);

    var results: [MAX_TILES]c.blst_p2 = undefined;
    var work_items: [MAX_WORKERS]TileP2WorkItem = undefined;
    var work_ptrs: [MAX_WORKERS]*WorkItem = undefined;

    // Shared job description; workers claim tiles via the atomic counter.
    var job = TileP2Job{
        .points = points,
        .scalars = scalars,
        .nbytes = nbytes,
        .nbits = nbits,
        .tiles = tiles[0..total],
        .results = results[0..total],
        .counter = std.atomic.Value(usize).init(0),
    };

    // Never submit more work items than there are tiles to claim.
    const n_work = @min(ncpus, total);
    for (0..n_work) |i| {
        work_items[i] = .{ .base = .{ .exec_fn = TileP2WorkItem.exec }, .job = &job };
        work_ptrs[i] = &work_items[i].base;
    }
    // Blocks until all tiles are done, so the stack-allocated job/tiles/results
    // stay valid for the workers' entire lifetime.
    pool.submitAndWait(work_ptrs[0..n_work]);

    return reduceTilesP2(tiles[0..total], results[0..total], bd.nx, bd.ny, bd.wnd);
}
702+
596703
/// Reduce tile results: for each row (same y), add across x; then double-and-add across rows.
597704
fn reduceTilesP1(tiles: []const Tile, results: []c.blst_p1, nx: usize, ny: usize, window: usize) c.blst_p1 {
598705
var ret: c.blst_p1 = std.mem.zeroes(c.blst_p1);

0 commit comments

Comments
 (0)