Skip to content

Commit 62d9041

Browse files
committed
Add benchmark
1 parent fee0538 commit 62d9041

File tree

6 files changed

+640
-105
lines changed

6 files changed

+640
-105
lines changed

.gitignore

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,10 @@
99
# Doc outputs
1010
/doc/html
1111

12-
## AI agents
12+
# AI agents
1313
.claude/**/*.local.*
14+
/doc/plans
15+
16+
# Benchmarks
17+
perf.data
18+
perf.data.old

AGENTS.md

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,5 @@
1-
# System Instructions
2-
3-
You are an expert CUDA developer.
4-
5-
## Coding
6-
71
- Follow Google C++ Style Guide
82
- Start error messages with a lowercase letter unless the first word is a proper noun
93
- Never reorder `#include`
104
- Use global `memcpy` and `memset` instead of `std::` prefixed ones
11-
12-
## Running Code
13-
145
- Build to the `build/` directory at the repository root
15-
- For the error message `cd: no such file or directory: ...`, check your current working directory first

CMakeLists.txt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,24 @@ if(BUILD_TESTING)
6666
gtest_discover_tests(dcf_u128_le_test)
6767
endif()
6868

69+
option(BUILD_BENCH "Build benchmarks" OFF)
70+
if(BUILD_BENCH)
71+
FetchContent_Declare(benchmark
72+
GIT_REPOSITORY https://github.com/google/benchmark.git
73+
GIT_TAG v1.9.5)
74+
set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE)
75+
set(BENCHMARK_ENABLE_GTEST_TESTS OFF CACHE BOOL "" FORCE)
76+
FetchContent_MakeAvailable(benchmark)
77+
78+
find_package(OpenSSL REQUIRED)
79+
80+
add_executable(bench_cpu src/bench_cpu.cu)
81+
target_link_libraries(bench_cpu benchmark::benchmark_main fss OpenSSL::Crypto)
82+
83+
add_executable(bench_gpu src/bench_gpu.cu)
84+
target_link_libraries(bench_gpu benchmark::benchmark_main fss)
85+
endif()
86+
6987
install(TARGETS fss EXPORT fssTargets)
7088
install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
7189
install(EXPORT fssTargets

README.md

Lines changed: 128 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -93,70 +93,70 @@ This walks through using DPF and DCF on the CPU with AES-128 MMO PRG. This PRG r
9393

9494
1. Include the headers and set up type aliases:
9595

96-
```cpp
97-
#include <fss/dpf.cuh>
98-
#include <fss/dcf.cuh>
99-
#include <fss/group/bytes.cuh>
100-
#include <fss/prg/aes128_mmo.cuh>
101-
102-
constexpr int kInBits = 8; // Input domain: 2^8 = 256 values
103-
using In = uint8_t;
104-
using Group = fss::group::Bytes;
105-
106-
// DPF uses mul=2, DCF uses mul=4
107-
using DpfPrg = fss::prg::Aes128Mmo<2>;
108-
using DcfPrg = fss::prg::Aes128Mmo<4>;
109-
using Dpf = fss::Dpf<kInBits, Group, DpfPrg, In>;
110-
using Dcf = fss::Dcf<kInBits, Group, DcfPrg, In>;
111-
```
96+
```cpp
97+
#include <fss/dpf.cuh>
98+
#include <fss/dcf.cuh>
99+
#include <fss/group/bytes.cuh>
100+
#include <fss/prg/aes128_mmo.cuh>
101+
102+
constexpr int kInBits = 8; // Input domain: 2^8 = 256 values
103+
using In = uint8_t;
104+
using Group = fss::group::Bytes;
105+
106+
// DPF uses mul=2, DCF uses mul=4
107+
using DpfPrg = fss::prg::Aes128Mmo<2>;
108+
using DcfPrg = fss::prg::Aes128Mmo<4>;
109+
using Dpf = fss::Dpf<kInBits, Group, DpfPrg, In>;
110+
using Dcf = fss::Dcf<kInBits, Group, DcfPrg, In>;
111+
```
112112

113113
2. Create the PRG with AES keys and instantiate DPF/DCF:
114114

115-
```cpp
116-
// DPF PRG needs 2 AES keys
117-
unsigned char key0[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
118-
unsigned char key1[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
119-
const unsigned char *keys[2] = {key0, key1};
120-
auto ctxs = DpfPrg::CreateCtxs(keys);
115+
```cpp
116+
// DPF PRG needs 2 AES keys
117+
unsigned char key0[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
118+
unsigned char key1[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
119+
const unsigned char *keys[2] = {key0, key1};
120+
auto ctxs = DpfPrg::CreateCtxs(keys);
121121

122-
DpfPrg prg(ctxs);
123-
Dpf dpf{prg};
124-
```
122+
DpfPrg prg(ctxs);
123+
Dpf dpf{prg};
124+
```
125125

126126
3. Run `Gen` to generate correction words (keys) from secret inputs:
127127

128-
```cpp
129-
In alpha = 42; // Secret point / threshold
130-
int4 beta = {7, 0, 0, 0}; // Secret payload (LSB of .w must be 0)
128+
```cpp
129+
In alpha = 42; // Secret point / threshold
130+
int4 beta = {7, 0, 0, 0}; // Secret payload (LSB of .w must be 0)
131131

132-
// Random seeds for the two parties (LSB of .w must be 0)
133-
int4 seeds[2] = {
134-
{0x11111111, 0x22222222, 0x33333333, 0x44444440},
135-
{0x55555555, 0x66666666, 0x77777777, static_cast<int>(0x88888880u)},
136-
};
132+
// Random seeds for the two parties (LSB of .w must be 0)
133+
int4 seeds[2] = {
134+
{0x11111111, 0x22222222, 0x33333333, 0x44444440},
135+
{0x55555555, 0x66666666, 0x77777777, static_cast<int>(0x88888880u)},
136+
};
137137

138-
Dpf::Cw cws[kInBits + 1];
139-
dpf.Gen(cws, seeds, alpha, beta);
140-
```
138+
Dpf::Cw cws[kInBits + 1];
139+
dpf.Gen(cws, seeds, alpha, beta);
140+
```
141141

142142
4. Run `Eval` on each party and reconstruct using the group:
143143

144-
```cpp
145-
// Each party evaluates independently
146-
int4 y0 = dpf.Eval(false, seeds[0], cws, alpha);
147-
int4 y1 = dpf.Eval(true, seeds[1], cws, alpha);
144+
```cpp
145+
// Each party evaluates independently
146+
int4 y0 = dpf.Eval(false, seeds[0], cws, alpha);
147+
int4 y1 = dpf.Eval(true, seeds[1], cws, alpha);
148148

149-
// Reconstruct via the group: convert to group elements, add, convert back
150-
// For Bytes group this is XOR; for Uint group this is arithmetic addition
151-
int4 sum = (Group::From(y0) + Group::From(y1)).Into();
152-
// sum == beta at x == alpha, 0 otherwise
153-
```
149+
// Reconstruct via the group: convert to group elements, add, convert back
150+
// For Bytes group this is XOR; for Uint group this is arithmetic addition
151+
int4 sum = (Group::From(y0) + Group::From(y1)).Into();
152+
// sum == beta at x == alpha, 0 otherwise
153+
```
154154

155155
5. Free the AES contexts when done:
156156

157-
```cpp
158-
DpfPrg::FreeCtxs(ctxs);
159-
```
157+
```cpp
158+
DpfPrg::FreeCtxs(ctxs);
159+
```
160160

161161
DCF follows the same pattern — use `DcfPrg` (mul=4, needs 4 AES keys), `Dcf`, and `Dcf::Cw`. The reconstructed output equals `beta` when `x < alpha` and `0` otherwise.
162162

@@ -175,79 +175,113 @@ This walks through using DPF and DCF on the GPU with ChaCha PRG.
175175

176176
1. Include the headers and set up type aliases:
177177

178-
```cpp
179-
#include <fss/dpf.cuh>
180-
#include <fss/dcf.cuh>
181-
#include <fss/group/bytes.cuh>
182-
#include <fss/prg/chacha.cuh>
178+
```cpp
179+
#include <fss/dpf.cuh>
180+
#include <fss/dcf.cuh>
181+
#include <fss/group/bytes.cuh>
182+
#include <fss/prg/chacha.cuh>
183183

184-
constexpr int kInBits = 8;
185-
using In = uint8_t;
186-
using Group = fss::group::Bytes;
184+
constexpr int kInBits = 8;
185+
using In = uint8_t;
186+
using Group = fss::group::Bytes;
187187

188-
// DPF uses mul=2, DCF uses mul=4
189-
using DpfPrg = fss::prg::ChaCha<2>;
190-
using DcfPrg = fss::prg::ChaCha<4>;
191-
using Dpf = fss::Dpf<kInBits, Group, DpfPrg, In>;
192-
using Dcf = fss::Dcf<kInBits, Group, DcfPrg, In>;
193-
```
188+
// DPF uses mul=2, DCF uses mul=4
189+
using DpfPrg = fss::prg::ChaCha<2>;
190+
using DcfPrg = fss::prg::ChaCha<4>;
191+
using Dpf = fss::Dpf<kInBits, Group, DpfPrg, In>;
192+
using Dcf = fss::Dcf<kInBits, Group, DcfPrg, In>;
193+
```
194194

195195
2. Set up a nonce in constant memory and create the PRG in a kernel:
196196

197-
```cpp
198-
__constant__ int kNonce[2] = {0x12345678, 0x9abcdef0};
197+
```cpp
198+
__constant__ int kNonce[2] = {0x12345678, 0x9abcdef0};
199199

200-
__global__ void GenKernel(Dpf::Cw *cws, const int4 *seeds, const In *alphas, const int4 *betas) {
201-
int tid = blockIdx.x * blockDim.x + threadIdx.x;
200+
__global__ void GenKernel(Dpf::Cw *cws, const int4 *seeds, const In *alphas, const int4 *betas) {
201+
int tid = blockIdx.x * blockDim.x + threadIdx.x;
202202

203-
DpfPrg prg(kNonce);
204-
Dpf dpf{prg};
203+
DpfPrg prg(kNonce);
204+
Dpf dpf{prg};
205205

206-
int4 s[2] = {seeds[tid * 2], seeds[tid * 2 + 1]};
207-
dpf.Gen(cws + tid * (kInBits + 1), s, alphas[tid], betas[tid]);
208-
}
209-
```
206+
int4 s[2] = {seeds[tid * 2], seeds[tid * 2 + 1]};
207+
dpf.Gen(cws + tid * (kInBits + 1), s, alphas[tid], betas[tid]);
208+
}
209+
```
210210

211211
3. Prepare host data, copy to device, and launch the `Gen` kernel:
212212

213-
```cpp
214-
int4 *d_seeds = /* cudaMalloc + cudaMemcpy seeds to device */;
215-
In *d_alphas = /* cudaMalloc + cudaMemcpy alphas to device */;
216-
int4 *d_betas = /* cudaMalloc + cudaMemcpy betas to device */;
213+
```cpp
214+
int4 *d_seeds = /* cudaMalloc + cudaMemcpy seeds to device */;
215+
In *d_alphas = /* cudaMalloc + cudaMemcpy alphas to device */;
216+
int4 *d_betas = /* cudaMalloc + cudaMemcpy betas to device */;
217217

218-
Dpf::Cw *d_cws;
219-
cudaMalloc(&d_cws, sizeof(Dpf::Cw) * (kInBits + 1) * N);
218+
Dpf::Cw *d_cws;
219+
cudaMalloc(&d_cws, sizeof(Dpf::Cw) * (kInBits + 1) * N);
220220

221-
GenKernel<<<blocks, threads>>>(d_cws, d_seeds, d_alphas, d_betas);
222-
```
221+
GenKernel<<<blocks, threads>>>(d_cws, d_seeds, d_alphas, d_betas);
222+
```
223223

224224
4. Write and launch an `Eval` kernel for each party, then copy results back:
225225

226-
```cpp
227-
__global__ void EvalKernel(int4 *ys, bool party, const int4 *seeds, const Dpf::Cw *cws, const In *xs) {
228-
int tid = blockIdx.x * blockDim.x + threadIdx.x;
226+
```cpp
227+
__global__ void EvalKernel(int4 *ys, bool party, const int4 *seeds, const Dpf::Cw *cws, const In *xs) {
228+
int tid = blockIdx.x * blockDim.x + threadIdx.x;
229229

230-
DpfPrg prg(kNonce);
231-
Dpf dpf{prg};
230+
DpfPrg prg(kNonce);
231+
Dpf dpf{prg};
232232

233-
ys[tid] = dpf.Eval(party, seeds[tid], cws + tid * (kInBits + 1), xs[tid]);
234-
}
233+
ys[tid] = dpf.Eval(party, seeds[tid], cws + tid * (kInBits + 1), xs[tid]);
234+
}
235235

236-
// Launch for party 0 and party 1, then copy d_ys back to host
237-
EvalKernel<<<blocks, threads>>>(d_ys, false, d_seeds0, d_cws, d_xs);
238-
EvalKernel<<<blocks, threads>>>(d_ys, true, d_seeds1, d_cws, d_xs);
239-
```
236+
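// Launch for party 0 and party 1; both launches write to d_ys, so copy d_ys back to host after each launch (into h_y0s, then h_y1s) or pass separate output buffers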
237+
EvalKernel<<<blocks, threads>>>(d_ys, false, d_seeds0, d_cws, d_xs);
238+
EvalKernel<<<blocks, threads>>>(d_ys, true, d_seeds1, d_cws, d_xs);
239+
```
240240

241241
5. Reconstruct on the host using the group, same as the CPU case:
242242

243-
```cpp
244-
int4 sum = (Group::From(h_y0s[i]) + Group::From(h_y1s[i])).Into();
245-
```
243+
```cpp
244+
int4 sum = (Group::From(h_y0s[i]) + Group::From(h_y1s[i])).Into();
245+
```
246246

247247
DCF follows the same pattern — use `DcfPrg` (mul=4), `Dcf`, and `Dcf::Cw`.
248248

249249
See `samples/dpf_dcf_gpu.cu` for the complete working example.
250250

251+
## Benchmarks
252+
253+
Microbenchmarks for DPF/DCF `Gen`/`Eval` using [Google Benchmark](https://github.com/google/benchmark), covering both CPU (AES-128 MMO PRG) and GPU (ChaCha PRG) paths.
254+
255+
Configure with `BUILD_BENCH=ON` and build the targets:
256+
257+
```bash
258+
cmake -B build -DBUILD_BENCH=ON -DCMAKE_BUILD_TYPE=RelWithDebInfo
259+
cmake --build build --target bench_cpu bench_gpu
260+
```
261+
262+
Run all benchmarks:
263+
264+
```bash
265+
./build/bench_cpu
266+
./build/bench_gpu
267+
```
268+
269+
Run a subset using `--benchmark_filter` (regex):
270+
271+
```bash
272+
./build/bench_cpu --benchmark_filter=BM_DcfGen
273+
./build/bench_cpu --benchmark_filter=BM_DpfEval_Uint/20
274+
```
275+
276+
Generate a CPU flamegraph with `perf` and [FlameGraph](https://github.com/brendangregg/FlameGraph):
277+
278+
```bash
279+
perf record -g ./build/bench_cpu --benchmark_filter=BM_DpfEval_Uint/20
280+
perf script | /path/to/FlameGraph/stackcollapse-perf.pl | /path/to/FlameGraph/flamegraph.pl > build/flamegraph.svg
281+
```
282+
283+
Open `build/flamegraph.svg` in a browser. The graph is interactive: click a frame to zoom in.
284+
251285
## License
252286

253287
Apache License, Version 2.0

0 commit comments

Comments
 (0)