Skip to content
This repository was archived by the owner on Jan 13, 2025. It is now read-only.

Commit ec8a282

Browse files
committed
Add signing GPU impl
1 parent 78a5a9a commit ec8a282

File tree

15 files changed

+454
-187
lines changed

15 files changed

+454
-187
lines changed

src/Makefile

Lines changed: 45 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#
2-
# Makefile to build libcuda-crypt.a
2+
# Makefile to build libcuda-crypt.so
33
# From sources in /cuda-crypt and /ed25519-verify
44
#
5-
# nvcc inserts conflicting driver symbols into a static lib (.a)
6-
# so more than one .a cannot be linked into a single program.
5+
# nvcc inserts conflicting driver symbols into a static lib (.so)
6+
# so more than one .so cannot be linked into a single program.
77
# Shared objects with device kernels also did not seem to work--
88
# it can build, but the kernel launch is not successful. (with cuda 9.2)
99
# Hence, build ed25519 ecdsa verify and chacha encryption device
@@ -24,12 +24,41 @@ CFLAGS+=-DENDIAN_NEUTRAL -DLTC_NO_ASM -I$(CUDA_HEADER_DIR) -I$(CUDA_SHA256_DIR)
2424
all: $V/$(CHACHA_TEST_BIN) $V/$(ECC_TEST_BIN) $(V)/lib$(LIB).so
2525

2626
ECC_DIR:=cuda-ecc-ed25519
27-
VERIFY_SRCS:=$(addprefix $(ECC_DIR)/,verify.cu seed.cu sha512.cu ge.cu sc.cu fe.cu sign.cu keypair.cu common.cu ed25519.h)
2827

28+
SC_SRCS:=$(addprefix $(ECC_DIR)/,sc.cu ed25519.h ge.h)
29+
$V/sc.o: $(SC_SRCS)
30+
@mkdir -p $(@D)
31+
$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
32+
33+
KEYPAIR_SRCS:=$(addprefix $(ECC_DIR)/,keypair.cu ed25519.h ge.h)
34+
$V/keypair.o: $(KEYPAIR_SRCS)
35+
@mkdir -p $(@D)
36+
$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
37+
38+
SEED_SRCS:=$(addprefix $(ECC_DIR)/,seed.cu ed25519.h)
39+
$V/seed.o: $(SEED_SRCS)
40+
@mkdir -p $(@D)
41+
$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
42+
43+
GE_SRCS:=$(addprefix $(ECC_DIR)/,ge.cu ge.h precomp_data.h)
44+
$V/ge.o: $(GE_SRCS)
45+
@mkdir -p $(@D)
46+
$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
47+
48+
SIGN_SRCS:=$(addprefix $(ECC_DIR)/,sign.cu sha512.h ge.h sc.h fe.cu ../$(CUDA_HEADER_DIR)/gpu_common.h ed25519.h)
49+
$V/sign.o: $(SIGN_SRCS)
50+
@mkdir -p $(@D)
51+
$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
52+
53+
VERIFY_SRCS:=$(addprefix $(ECC_DIR)/,verify.cu seed.cu sha512.cu ge.h sc.cu fe.cu keypair.cu common.cu ed25519.h)
2954
$V/verify.o: $(VERIFY_SRCS)
3055
@mkdir -p $(@D)
3156
$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
3257

58+
$V/gpu_ctx.o: $(addprefix $(ECC_DIR)/,gpu_ctx.cu gpu_ctx.h)
59+
@mkdir -p $(@D)
60+
$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
61+
3362
CHACHA_DIR:=cuda-crypt
3463
CHACHA_SRCS:=$(addprefix $(CHACHA_DIR)/,chacha_cbc.cu chacha.h common.cu)
3564

@@ -50,25 +79,30 @@ $V/poh_verify.o: $(POH_SRCS)
5079
@mkdir -p $(@D)
5180
$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
5281

53-
$V/crypt-dlink.o: $V/chacha_cbc.o $V/aes_cbc.o $V/verify.o $V/poh_verify.o
54-
$(NVCC) -Xcompiler "-fPIC" --gpu-architecture=compute_61 --device-link $^ --output-file $@
82+
CPU_GPU_OBJS=$(addprefix $V/,chacha_cbc.o aes_cbc.o verify.o poh_verify.o gpu_ctx.o sign.o ge.o seed.o keypair.o sc.o)
5583

56-
$V/lib$(LIB).a: $V/crypt-dlink.o $V/chacha_cbc.o $V/aes_cbc.o $V/verify.o $V/poh_verify.o
57-
$(NVCC) -Xcompiler "-fPIC" --lib --output-file $@ $^
84+
$V/crypt-dlink.o: $(CPU_GPU_OBJS)
85+
$(NVCC) -Xcompiler "-fPIC" --gpu-architecture=compute_61 --device-link $^ --output-file $@
5886

59-
$V/lib$(LIB).so: $V/crypt-dlink.o $V/chacha_cbc.o $V/aes_cbc.o $V/verify.o $V/poh_verify.o
87+
$V/lib$(LIB).so: $V/crypt-dlink.o $(CPU_GPU_OBJS)
6088
$(NVCC) -Xcompiler "-fPIC" --shared --output-file $@ $^
6189

62-
$V/$(CHACHA_TEST_BIN): $(CHACHA_DIR)/test.cu $V/lib$(LIB).a
90+
$V/$(CHACHA_TEST_BIN): $(CHACHA_DIR)/test.cu $V/lib$(LIB).so
6391
$(NVCC) $(CFLAGS) -L$V -l$(LIB) $< -o $@
6492

6593
$V/ecc_main.o: $(addprefix $(ECC_DIR)/,main.cu ed25519.h)
6694
@mkdir -p $(@D)
6795
$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
6896

69-
$V/$(ECC_TEST_BIN): $V/ecc_main.o $V/lib$(LIB).a
97+
$V/$(ECC_TEST_BIN): $V/ecc_main.o $V/lib$(LIB).so
7098
$(NVCC) $(CFLAGS) -L$V -l$(LIB) $< -o $@
7199

72100
.PHONY:clean
73101
clean:
74102
rm -rf $V
103+
104+
test: $V/$(ECC_TEST_BIN) $V/$(CHACHA_TEST_BIN)
105+
cd $(V) && ./$(CHACHA_TEST_BIN) 64 \
106+
cd $(V) && ./$(ECC_TEST_BIN) 1 1 1 1 1 1
107+
cd $(V) && ./$(ECC_TEST_BIN) 64 1 1 1 1 0
108+
cd $(V) && ./$(ECC_TEST_BIN) 100201 1 1 4 10 1

src/cuda-ecc-ed25519/ed25519.h

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,35 @@ typedef struct {
3838

3939
void ED25519_DECLSPEC ed25519_create_keypair(unsigned char *public_key, unsigned char *private_key, const unsigned char *seed);
4040
void ED25519_DECLSPEC ed25519_sign(unsigned char *signature, const unsigned char *message, size_t message_len, const unsigned char *public_key, const unsigned char *private_key);
41+
42+
void ED25519_DECLSPEC ed25519_sign_many(const gpu_Elems* elems,
43+
uint32_t num_elems,
44+
uint32_t message_size,
45+
uint32_t total_packets,
46+
uint32_t total_signatures,
47+
const uint32_t* message_lens,
48+
const uint32_t* public_key_offsets,
49+
const uint32_t* private_key_offsets,
50+
const uint32_t* message_start_offsets,
51+
uint8_t* signatures_out,
52+
uint8_t use_non_default_stream);
53+
4154
int ED25519_DECLSPEC ed25519_verify(const unsigned char *signature, const unsigned char *message, uint32_t message_len, const unsigned char *public_key);
42-
void ED25519_DECLSPEC ed25519_verify_many(const gpu_Elems* elems, uint32_t num_elems, uint32_t message_size, uint32_t total_packets, uint32_t total_signatures, const uint32_t* message_lens, const uint32_t* public_key_offset, const uint32_t* signature_offset, const uint32_t* message_start_offset, uint8_t* out, uint8_t use_non_default_stream);
55+
56+
void ED25519_DECLSPEC ed25519_verify_many(const gpu_Elems* elems,
57+
uint32_t num_elems,
58+
uint32_t message_size,
59+
uint32_t total_packets,
60+
uint32_t total_signatures,
61+
const uint32_t* message_lens,
62+
const uint32_t* public_key_offsets,
63+
const uint32_t* private_key_offsets,
64+
const uint32_t* message_start_offsets,
65+
uint8_t* out,
66+
uint8_t use_non_default_stream);
67+
4368
void ED25519_DECLSPEC ed25519_add_scalar(unsigned char *public_key, unsigned char *private_key, const unsigned char *scalar);
4469
void ED25519_DECLSPEC ed25519_key_exchange(unsigned char *shared_secret, const unsigned char *public_key, const unsigned char *private_key);
45-
void ED25519_DECLSPEC ed25519_free_gpu_mem();
4670
void ED25519_DECLSPEC ed25519_set_verbose(bool val);
4771

4872
const char* ED25519_DECLSPEC ed25519_license();

src/cuda-ecc-ed25519/fe.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ void __device__ __host__ fe_add(fe h, const fe f, const fe g) {
109109
Preconditions: b in {0,1}.
110110
*/
111111

112-
void fe_cmov(fe f, const fe g, unsigned int b) {
112+
void __host__ __device__ fe_cmov(fe f, const fe g, unsigned int b) {
113113
int32_t f0 = f[0];
114114
int32_t f1 = f[1];
115115
int32_t f2 = f[2];

src/cuda-ecc-ed25519/fe.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ void __device__ __host__ fe_tobytes(unsigned char *s, const fe h);
2525
void __host__ __device__ fe_copy(fe h, const fe f);
2626
int __host__ __device__ fe_isnegative(const fe f);
2727
int __device__ __host__ fe_isnonzero(const fe f);
28-
void fe_cmov(fe f, const fe g, unsigned int b);
28+
void __host__ __device__ fe_cmov(fe f, const fe g, unsigned int b);
2929
void fe_cswap(fe f, fe g, unsigned int b);
3030

3131
void __device__ __host__ fe_neg(fe h, const fe f);

src/cuda-ecc-ed25519/ge.cu

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,7 @@ void __host__ __device__ ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
271271
}
272272

273273

274-
void ge_p3_0(ge_p3 *h) {
274+
void __host__ __device__ ge_p3_0(ge_p3 *h) {
275275
fe_0(h->X);
276276
fe_1(h->Y);
277277
fe_1(h->Z);
@@ -330,7 +330,7 @@ void ge_p3_tobytes(unsigned char *s, const ge_p3 *h) {
330330
}
331331

332332

333-
static unsigned char equal(signed char b, signed char c) {
333+
static unsigned char __host__ __device__ equal(signed char b, signed char c) {
334334
unsigned char ub = b;
335335
unsigned char uc = c;
336336
unsigned char x = ub ^ uc; /* 0: yes; 1..255: no */
@@ -340,20 +340,20 @@ static unsigned char equal(signed char b, signed char c) {
340340
return (unsigned char) y;
341341
}
342342

343-
static unsigned char negative(signed char b) {
343+
static unsigned char __host__ __device__ negative(signed char b) {
344344
uint64_t x = b; /* 18446744073709551361..18446744073709551615: yes; 0..255: no */
345345
x >>= 63; /* 1: yes; 0: no */
346346
return (unsigned char) x;
347347
}
348348

349-
static void cmov(ge_precomp *t, const ge_precomp *u, unsigned char b) {
349+
static void __host__ __device__ cmov(ge_precomp *t, const ge_precomp *u, unsigned char b) {
350350
fe_cmov(t->yplusx, u->yplusx, b);
351351
fe_cmov(t->yminusx, u->yminusx, b);
352352
fe_cmov(t->xy2d, u->xy2d, b);
353353
}
354354

355355

356-
static void select(ge_precomp *t, int pos, signed char b) {
356+
static void __host__ __device__ select(ge_precomp *t, int pos, signed char b) {
357357
ge_precomp minust;
358358
unsigned char bnegative = negative(b);
359359
unsigned char babs = b - (((-bnegative) & b) << 1);
@@ -383,7 +383,7 @@ Preconditions:
383383
a[31] <= 127
384384
*/
385385

386-
void ge_scalarmult_base(ge_p3 *h, const unsigned char *a) {
386+
void __device__ __host__ ge_scalarmult_base(ge_p3 *h, const unsigned char *a) {
387387
signed char e[64];
388388
signed char carry;
389389
ge_p1p1 r;

src/cuda-ecc-ed25519/ge.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,20 +53,20 @@ typedef struct {
5353

5454
void __host__ __device__ ge_p3_tobytes(unsigned char *s, const ge_p3 *h);
5555
void __host__ __device__ ge_tobytes(unsigned char *s, const ge_p2 *h);
56-
int __device__ __host__ ge_frombytes_negate_vartime(ge_p3 *h, const unsigned char *s);
56+
int __host__ __device__ ge_frombytes_negate_vartime(ge_p3 *h, const unsigned char *s);
5757

5858
void __host__ __device__ ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q);
5959
void __host__ __device__ ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q);
6060
void __host__ __device__ ge_double_scalarmult_vartime(ge_p2 *r, const unsigned char *a, const ge_p3 *A, const unsigned char *b);
6161
void __host__ __device__ ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q);
6262
void __host__ __device__ ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q);
63-
void ge_scalarmult_base(ge_p3 *h, const unsigned char *a);
63+
void __host__ __device__ ge_scalarmult_base(ge_p3 *h, const unsigned char *a);
6464

6565
void __host__ __device__ ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p);
6666
void __host__ __device__ ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p);
6767
void __host__ __device__ ge_p2_0(ge_p2 *h);
6868
void __host__ __device__ ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p);
69-
void ge_p3_0(ge_p3 *h);
69+
void __host__ __device__ ge_p3_0(ge_p3 *h);
7070
void __host__ __device__ ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p);
7171
void __host__ __device__ ge_p3_to_cached(ge_cached *r, const ge_p3 *p);
7272
void __host__ __device__ ge_p3_to_p2(ge_p2 *r, const ge_p3 *p);

src/cuda-ecc-ed25519/gpu_ctx.cu

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
#include "ed25519.h"
2+
#include "gpu_ctx.h"
3+
#include <pthread.h>
4+
#include "gpu_common.h"
5+
6+
static pthread_mutex_t g_ctx_mutex = PTHREAD_MUTEX_INITIALIZER;
7+
8+
#define MAX_NUM_GPUS 8
9+
#define MAX_QUEUE_SIZE 8
10+
11+
static gpu_ctx_t g_gpu_ctx[MAX_NUM_GPUS][MAX_QUEUE_SIZE] = {0};
12+
static uint32_t g_cur_gpu = 0;
13+
static uint32_t g_cur_queue[MAX_NUM_GPUS] = {0};
14+
static int32_t g_total_gpus = -1;
15+
16+
static bool cuda_crypt_init_locked() {
17+
if (g_total_gpus == -1) {
18+
cudaGetDeviceCount(&g_total_gpus);
19+
g_total_gpus = min(MAX_NUM_GPUS, g_total_gpus);
20+
LOG("total_gpus: %d\n", g_total_gpus);
21+
for (int gpu = 0; gpu < g_total_gpus; gpu++) {
22+
CUDA_CHK(cudaSetDevice(gpu));
23+
for (int queue = 0; queue < MAX_QUEUE_SIZE; queue++) {
24+
int err = pthread_mutex_init(&g_gpu_ctx[gpu][queue].mutex, NULL);
25+
if (err != 0) {
26+
fprintf(stderr, "pthread_mutex_init error %d gpu: %d queue: %d\n",
27+
err, gpu, queue);
28+
g_total_gpus = 0;
29+
return false;
30+
}
31+
CUDA_CHK(cudaStreamCreate(&g_gpu_ctx[gpu][queue].stream));
32+
}
33+
}
34+
}
35+
return g_total_gpus > 0;
36+
}
37+
38+
bool ed25519_init() {
39+
cudaFree(0);
40+
pthread_mutex_lock(&g_ctx_mutex);
41+
bool success = cuda_crypt_init_locked();
42+
pthread_mutex_unlock(&g_ctx_mutex);
43+
return success;
44+
}
45+
46+
gpu_ctx_t* get_gpu_ctx() {
47+
int32_t cur_gpu, cur_queue;
48+
49+
LOG("locking global mutex");
50+
pthread_mutex_lock(&g_ctx_mutex);
51+
if (!cuda_crypt_init_locked()) {
52+
pthread_mutex_unlock(&g_ctx_mutex);
53+
LOG("No GPUs, exiting...\n");
54+
return NULL;
55+
}
56+
cur_gpu = g_cur_gpu;
57+
g_cur_gpu++;
58+
g_cur_gpu %= g_total_gpus;
59+
cur_queue = g_cur_queue[cur_gpu];
60+
g_cur_queue[cur_gpu]++;
61+
g_cur_queue[cur_gpu] %= MAX_QUEUE_SIZE;
62+
pthread_mutex_unlock(&g_ctx_mutex);
63+
64+
gpu_ctx_t* cur_ctx = &g_gpu_ctx[cur_gpu][cur_queue];
65+
LOG("locking contex mutex queue: %d gpu: %d", cur_queue, cur_gpu);
66+
pthread_mutex_lock(&cur_ctx->mutex);
67+
68+
CUDA_CHK(cudaSetDevice(cur_gpu));
69+
70+
LOG("selecting gpu: %d queue: %d\n", cur_gpu, cur_queue);
71+
72+
return cur_ctx;
73+
}
74+
75+
void setup_gpu_ctx(verify_ctx_t* cur_ctx,
76+
const gpu_Elems* elems,
77+
uint32_t num_elems,
78+
uint32_t message_size,
79+
uint32_t total_packets,
80+
uint32_t total_packets_size,
81+
uint32_t total_signatures,
82+
const uint32_t* message_lens,
83+
const uint32_t* public_key_offsets,
84+
const uint32_t* signature_offsets,
85+
const uint32_t* message_start_offsets,
86+
size_t out_size,
87+
cudaStream_t stream
88+
) {
89+
size_t offsets_size = total_signatures * sizeof(uint32_t);
90+
91+
LOG("device allocate. packets: %d out: %d offsets_size: %zu\n",
92+
total_packets_size, (int)out_size, offsets_size);
93+
94+
if (cur_ctx->packets == NULL ||
95+
total_packets_size > cur_ctx->packets_size_bytes) {
96+
CUDA_CHK(cudaFree(cur_ctx->packets));
97+
CUDA_CHK(cudaMalloc(&cur_ctx->packets, total_packets_size));
98+
99+
cur_ctx->packets_size_bytes = total_packets_size;
100+
}
101+
102+
if (cur_ctx->out == NULL || cur_ctx->out_size_bytes < out_size) {
103+
CUDA_CHK(cudaFree(cur_ctx->out));
104+
CUDA_CHK(cudaMalloc(&cur_ctx->out, out_size));
105+
106+
cur_ctx->out_size_bytes = total_signatures;
107+
}
108+
109+
if (cur_ctx->public_key_offsets == NULL || cur_ctx->offsets_len < total_signatures) {
110+
CUDA_CHK(cudaFree(cur_ctx->public_key_offsets));
111+
CUDA_CHK(cudaMalloc(&cur_ctx->public_key_offsets, offsets_size));
112+
113+
CUDA_CHK(cudaFree(cur_ctx->signature_offsets));
114+
CUDA_CHK(cudaMalloc(&cur_ctx->signature_offsets, offsets_size));
115+
116+
CUDA_CHK(cudaFree(cur_ctx->message_start_offsets));
117+
CUDA_CHK(cudaMalloc(&cur_ctx->message_start_offsets, offsets_size));
118+
119+
CUDA_CHK(cudaFree(cur_ctx->message_lens));
120+
CUDA_CHK(cudaMalloc(&cur_ctx->message_lens, offsets_size));
121+
122+
cur_ctx->offsets_len = total_signatures;
123+
}
124+
125+
LOG("Done alloc");
126+
127+
CUDA_CHK(cudaMemcpyAsync(cur_ctx->public_key_offsets, public_key_offsets, offsets_size, cudaMemcpyHostToDevice, stream));
128+
CUDA_CHK(cudaMemcpyAsync(cur_ctx->signature_offsets, signature_offsets, offsets_size, cudaMemcpyHostToDevice, stream));
129+
CUDA_CHK(cudaMemcpyAsync(cur_ctx->message_start_offsets, message_start_offsets, offsets_size, cudaMemcpyHostToDevice, stream));
130+
CUDA_CHK(cudaMemcpyAsync(cur_ctx->message_lens, message_lens, offsets_size, cudaMemcpyHostToDevice, stream));
131+
132+
size_t cur = 0;
133+
for (size_t i = 0; i < num_elems; i++) {
134+
LOG("i: %zu size: %d\n", i, elems[i].num * message_size);
135+
CUDA_CHK(cudaMemcpyAsync(&cur_ctx->packets[cur * message_size], elems[i].elems, elems[i].num * message_size, cudaMemcpyHostToDevice, stream));
136+
cur += elems[i].num;
137+
}
138+
}
139+
140+
141+
void release_gpu_ctx(gpu_ctx_t* cur_ctx) {
142+
pthread_mutex_unlock(&cur_ctx->mutex);
143+
}
144+
145+
void ed25519_free_gpu_mem() {
146+
for (size_t gpu = 0; gpu < MAX_NUM_GPUS; gpu++) {
147+
for (size_t queue = 0; queue < MAX_QUEUE_SIZE; queue++) {
148+
gpu_ctx_t* cur_ctx = &g_gpu_ctx[gpu][queue];
149+
CUDA_CHK(cudaFree(cur_ctx->verify_ctx.packets));
150+
CUDA_CHK(cudaFree(cur_ctx->verify_ctx.out));
151+
CUDA_CHK(cudaFree(cur_ctx->verify_ctx.message_lens));
152+
CUDA_CHK(cudaFree(cur_ctx->verify_ctx.public_key_offsets));
153+
CUDA_CHK(cudaFree(cur_ctx->verify_ctx.private_key_offsets));
154+
CUDA_CHK(cudaFree(cur_ctx->verify_ctx.signature_offsets));
155+
CUDA_CHK(cudaFree(cur_ctx->verify_ctx.message_start_offsets));
156+
if (cur_ctx->stream != 0) {
157+
CUDA_CHK(cudaStreamDestroy(cur_ctx->stream));
158+
}
159+
}
160+
}
161+
}

0 commit comments

Comments
 (0)