Skip to content

Commit f744e0c

Browse files
committed
use tiled pippenger if pool is available
1 parent 56ef6b6 commit f744e0c

File tree

2 files changed

+156
-43
lines changed

2 files changed

+156
-43
lines changed

bindings/napi/blst.zig

Lines changed: 49 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -879,50 +879,56 @@ fn asyncAggregateExecute(_: napi.Env, data: *AsyncAggregateData) void {
879879
}
880880
}
881881

882-
// Build pointer arrays for Pippenger API
883-
var pk_ptrs: [MAX_AGGREGATE_PER_JOB]*const bls.c.blst_p1_affine = undefined;
884-
var sig_ptrs: [MAX_AGGREGATE_PER_JOB]*const bls.c.blst_p2_affine = undefined;
885-
var sca_ptrs: [MAX_AGGREGATE_PER_JOB]*const u8 = undefined;
886-
for (0..n) |i| {
887-
pk_ptrs[i] = &data.pks[i].point;
888-
sig_ptrs[i] = &data.sigs[i].point;
889-
sca_ptrs[i] = &scalars[i * nbytes];
890-
}
882+
// Use thread pool for parallel tiled Pippenger when available.
883+
if (thread_pool) |pool| {
884+
const p1_ret = pool.multP1(@ptrCast(data.pks.ptr), n, &scalars, nbits);
885+
bls.c.blst_p1_to_affine(&data.result_pk.point, &p1_ret);
891886

892-
// Per-call scratch allocation
893-
const scratch_size = @max(
894-
bls.c.blst_p1s_mult_pippenger_scratch_sizeof(n),
895-
bls.c.blst_p2s_mult_pippenger_scratch_sizeof(n),
896-
);
897-
const scratch = allocator.alloc(u64, scratch_size) catch {
898-
data.err = true;
899-
return;
900-
};
901-
defer allocator.free(scratch);
902-
903-
// Pippenger multi-scalar multiplication on G1 (pubkeys)
904-
var p1_ret: bls.c.blst_p1 = std.mem.zeroes(bls.c.blst_p1);
905-
bls.c.blst_p1s_mult_pippenger(
906-
&p1_ret,
907-
@ptrCast(&pk_ptrs),
908-
n,
909-
@ptrCast(&sca_ptrs),
910-
nbits,
911-
scratch.ptr,
912-
);
913-
bls.c.blst_p1_to_affine(&data.result_pk.point, &p1_ret);
914-
915-
// Pippenger multi-scalar multiplication on G2 (signatures)
916-
var p2_ret: bls.c.blst_p2 = std.mem.zeroes(bls.c.blst_p2);
917-
bls.c.blst_p2s_mult_pippenger(
918-
&p2_ret,
919-
@ptrCast(&sig_ptrs),
920-
n,
921-
@ptrCast(&sca_ptrs),
922-
nbits,
923-
scratch.ptr,
924-
);
925-
bls.c.blst_p2_to_affine(&data.result_sig.point, &p2_ret);
887+
const p2_ret = pool.multP2(@ptrCast(data.sigs.ptr), n, &scalars, nbits);
888+
bls.c.blst_p2_to_affine(&data.result_sig.point, &p2_ret);
889+
} else {
890+
// Fallback: single-threaded Pippenger (no thread pool initialized)
891+
var pk_ptrs: [MAX_AGGREGATE_PER_JOB]*const bls.c.blst_p1_affine = undefined;
892+
var sig_ptrs: [MAX_AGGREGATE_PER_JOB]*const bls.c.blst_p2_affine = undefined;
893+
var sca_ptrs: [MAX_AGGREGATE_PER_JOB]*const u8 = undefined;
894+
for (0..n) |i| {
895+
pk_ptrs[i] = &data.pks[i].point;
896+
sig_ptrs[i] = &data.sigs[i].point;
897+
sca_ptrs[i] = &scalars[i * nbytes];
898+
}
899+
900+
const scratch_size = @max(
901+
bls.c.blst_p1s_mult_pippenger_scratch_sizeof(n),
902+
bls.c.blst_p2s_mult_pippenger_scratch_sizeof(n),
903+
);
904+
const scratch = allocator.alloc(u64, scratch_size) catch {
905+
data.err = true;
906+
return;
907+
};
908+
defer allocator.free(scratch);
909+
910+
var p1_ret: bls.c.blst_p1 = std.mem.zeroes(bls.c.blst_p1);
911+
bls.c.blst_p1s_mult_pippenger(
912+
&p1_ret,
913+
@ptrCast(&pk_ptrs),
914+
n,
915+
@ptrCast(&sca_ptrs),
916+
nbits,
917+
scratch.ptr,
918+
);
919+
bls.c.blst_p1_to_affine(&data.result_pk.point, &p1_ret);
920+
921+
var p2_ret: bls.c.blst_p2 = std.mem.zeroes(bls.c.blst_p2);
922+
bls.c.blst_p2s_mult_pippenger(
923+
&p2_ret,
924+
@ptrCast(&sig_ptrs),
925+
n,
926+
@ptrCast(&sca_ptrs),
927+
nbits,
928+
scratch.ptr,
929+
);
930+
bls.c.blst_p2_to_affine(&data.result_sig.point, &p2_ret);
931+
}
926932
}
927933

928934
fn asyncAggregateComplete(env: napi.Env, _: napi.status.Status, data: *AsyncAggregateData) void {

src/bls/ThreadPool.zig

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -593,6 +593,113 @@ fn buildTileGrid(npoints: usize, nbits: usize, ncpus: usize, tiles: []Tile) usiz
593593
return total;
594594
}
595595

/// Multi-scalar multiplication on G1 (pubkeys) using tiled Pippenger.
/// Falls back to single-threaded Pippenger for small inputs or when the pool
/// has fewer than two workers. Returns the point at infinity for zero points.
pub fn multP1(pool: *ThreadPool, points: [*]const c.blst_p1_affine, npoints: usize, scalars: [*]const u8, nbits: usize) c.blst_p1 {
    const nbytes = (nbits + 7) / 8;
    const ncpus = pool.n_workers;

    // Empty input: the multi-scalar sum over zero terms is the additive
    // identity (point at infinity); also avoids indexing `points[0]` below.
    if (npoints == 0) return std.mem.zeroes(c.blst_p1);

    // Single-threaded fallback for small inputs or a single worker.
    if (ncpus < 2 or npoints < 32) {
        const scratch_size = c.blst_p1s_mult_pippenger_scratch_sizeof(npoints);
        const scratch = pool.allocator.alloc(u64, scratch_size) catch {
            // OOM fallback: blst's Pippenger expects a caller-provided scratch
            // buffer (passing null is not documented as supported), so instead
            // do a simple serial scalar-multiply-and-accumulate that needs no
            // scratch memory at all. Slower, but always safe.
            var acc: c.blst_p1 = std.mem.zeroes(c.blst_p1);
            for (0..npoints) |i| {
                var tmp: c.blst_p1 = undefined;
                c.blst_p1_from_affine(&tmp, &points[i]);
                c.blst_p1_mult(&tmp, &tmp, scalars + i * nbytes, nbits);
                c.blst_p1_add_or_double(&acc, &acc, &tmp);
            }
            return acc;
        };
        defer pool.allocator.free(scratch);

        var ret: c.blst_p1 = std.mem.zeroes(c.blst_p1);
        // blst takes null-terminated arrays of pointers; a single entry
        // pointing at the contiguous input means "walk the array in place".
        const pts: [2]?*const c.blst_p1_affine = .{ &points[0], null };
        const sca: [2]?*const u8 = .{ &scalars[0], null };
        c.blst_p1s_mult_pippenger(&ret, @ptrCast(&pts), npoints, @ptrCast(&sca), nbits, scratch.ptr);
        return ret;
    }

    // Tiled parallel Pippenger: partition the (points x scalar-windows) space
    // into tiles, farm tiles out to the pool, then reduce the partial sums.
    var tiles: [MAX_TILES]Tile = undefined;
    const total = buildTileGrid(npoints, nbits, ncpus, &tiles);
    const bd = breakdown(nbits, pippenger_window_size(npoints), ncpus);

    var results: [MAX_TILES]c.blst_p1 = undefined;
    var work_items: [MAX_WORKERS]TileP1WorkItem = undefined;
    var work_ptrs: [MAX_WORKERS]*WorkItem = undefined;

    // Shared job description; workers claim tiles via the atomic counter.
    var job = TileP1Job{
        .points = points,
        .scalars = scalars,
        .nbytes = nbytes,
        .nbits = nbits,
        .tiles = tiles[0..total],
        .results = results[0..total],
        .counter = std.atomic.Value(usize).init(0),
    };

    // Never submit more work items than there are tiles to claim.
    const n_work = @min(ncpus, total);
    for (0..n_work) |i| {
        work_items[i] = .{ .base = .{ .exec_fn = TileP1WorkItem.exec }, .job = &job };
        work_ptrs[i] = &work_items[i].base;
    }
    // Blocks until all tiles are done, so the stack-allocated job/tiles/results
    // stay valid for the workers' entire lifetime.
    pool.submitAndWait(work_ptrs[0..n_work]);

    return reduceTilesP1(tiles[0..total], results[0..total], bd.nx, bd.ny, bd.wnd);
}
650+
/// Multi-scalar multiplication on G2 (signatures) using tiled Pippenger.
/// Falls back to single-threaded Pippenger for small inputs or when the pool
/// has fewer than two workers. Returns the point at infinity for zero points.
pub fn multP2(pool: *ThreadPool, points: [*]const c.blst_p2_affine, npoints: usize, scalars: [*]const u8, nbits: usize) c.blst_p2 {
    const nbytes = (nbits + 7) / 8;
    const ncpus = pool.n_workers;

    // Empty input: the multi-scalar sum over zero terms is the additive
    // identity (point at infinity); also avoids indexing `points[0]` below.
    if (npoints == 0) return std.mem.zeroes(c.blst_p2);

    // Single-threaded fallback for small inputs or a single worker.
    if (ncpus < 2 or npoints < 32) {
        const scratch_size = c.blst_p2s_mult_pippenger_scratch_sizeof(npoints);
        const scratch = pool.allocator.alloc(u64, scratch_size) catch {
            // OOM fallback: blst's Pippenger expects a caller-provided scratch
            // buffer (passing null is not documented as supported), so instead
            // do a simple serial scalar-multiply-and-accumulate that needs no
            // scratch memory at all. Slower, but always safe.
            var acc: c.blst_p2 = std.mem.zeroes(c.blst_p2);
            for (0..npoints) |i| {
                var tmp: c.blst_p2 = undefined;
                c.blst_p2_from_affine(&tmp, &points[i]);
                c.blst_p2_mult(&tmp, &tmp, scalars + i * nbytes, nbits);
                c.blst_p2_add_or_double(&acc, &acc, &tmp);
            }
            return acc;
        };
        defer pool.allocator.free(scratch);

        var ret: c.blst_p2 = std.mem.zeroes(c.blst_p2);
        // blst takes null-terminated arrays of pointers; a single entry
        // pointing at the contiguous input means "walk the array in place".
        const pts: [2]?*const c.blst_p2_affine = .{ &points[0], null };
        const sca: [2]?*const u8 = .{ &scalars[0], null };
        c.blst_p2s_mult_pippenger(&ret, @ptrCast(&pts), npoints, @ptrCast(&sca), nbits, scratch.ptr);
        return ret;
    }

    // Tiled parallel Pippenger: partition the (points x scalar-windows) space
    // into tiles, farm tiles out to the pool, then reduce the partial sums.
    var tiles: [MAX_TILES]Tile = undefined;
    const total = buildTileGrid(npoints, nbits, ncpus, &tiles);
    const bd = breakdown(nbits, pippenger_window_size(npoints), ncpus);

    var results: [MAX_TILES]c.blst_p2 = undefined;
    var work_items: [MAX_WORKERS]TileP2WorkItem = undefined;
    var work_ptrs: [MAX_WORKERS]*WorkItem = undefined;

    // Shared job description; workers claim tiles via the atomic counter.
    var job = TileP2Job{
        .points = points,
        .scalars = scalars,
        .nbytes = nbytes,
        .nbits = nbits,
        .tiles = tiles[0..total],
        .results = results[0..total],
        .counter = std.atomic.Value(usize).init(0),
    };

    // Never submit more work items than there are tiles to claim.
    const n_work = @min(ncpus, total);
    for (0..n_work) |i| {
        work_items[i] = .{ .base = .{ .exec_fn = TileP2WorkItem.exec }, .job = &job };
        work_ptrs[i] = &work_items[i].base;
    }
    // Blocks until all tiles are done, so the stack-allocated job/tiles/results
    // stay valid for the workers' entire lifetime.
    pool.submitAndWait(work_ptrs[0..n_work]);

    return reduceTilesP2(tiles[0..total], results[0..total], bd.nx, bd.ny, bd.wnd);
}
702+
596703
/// Reduce tile results: for each row (same y), add across x; then double-and-add across rows.
597704
fn reduceTilesP1(tiles: []const Tile, results: []c.blst_p1, nx: usize, ny: usize, window: usize) c.blst_p1 {
598705
var ret: c.blst_p1 = std.mem.zeroes(c.blst_p1);

0 commit comments

Comments
 (0)