Skip to content

Commit 42a59a6

Browse files
authored
feat: add distribute function (#14)
# What ❔ This PR adds the distribute function. ## Checklist - [x] PR title corresponds to the body of PR (we generate changelog entries from PRs). - [x] Documentation comments have been added / updated.
1 parent b521275 commit 42a59a6

File tree

4 files changed

+32
-0
lines changed

4 files changed

+32
-0
lines changed

src/bellman-cuda.cu

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,11 @@ bc_error pn_set_values_from_packed_bits(void *values, const void *packet_bits, c
280280
static_cast<cudaStream_t>(stream.handle)));
281281
}
282282

283+
bc_error pn_distribute_values(const void *src, void *dst, const unsigned count, const unsigned stride, bc_stream stream) {
284+
return static_cast<bc_error>(pn::distribute_values(static_cast<const fd_q::storage *>(src), static_cast<fd_q::storage *>(dst), count, stride,
285+
static_cast<cudaStream_t>(stream.handle)));
286+
}
287+
283288
bc_error pn_tear_down() { return static_cast<bc_error>(pn::tear_down()); };
284289

285290
bc_error msm_set_up() { return static_cast<bc_error>(msm::set_up()); }

src/bellman-cuda.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,14 @@ bc_error pn_generate_permutation_polynomials(generate_permutation_polynomials_co
458458
// stream - Stream on which this operation will be scheduled
459459
bc_error pn_set_values_from_packed_bits(void *values, const void *packet_bits, unsigned count, bc_stream stream);
460460

461+
// Distribute field element values with a stride
462+
// src - device pointer to the vector of field elements from where the values will be read
463+
// dst - device pointer to the vector of field elements to where the results will be written
464+
// count - number of values to distribute
465+
// stride - stride with which the values will be distributed
466+
// stream - Stream on which this operation will be scheduled
467+
bc_error pn_distribute_values(const void *src, void *dst, unsigned count, unsigned stride, bc_stream stream);
468+
461469
// release all resources associated with the internal state for polynomial computations
462470
bc_error pn_tear_down();
463471

src/pn_kernels.cu

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,4 +141,21 @@ cudaError_t set_values_from_packed_bits(fd_q::storage *values, const unsigned *p
141141
return cudaGetLastError();
142142
}
143143

144+
__global__ void distribute_values_kernel(const fd_q::storage *src, fd_q::storage *dst, const unsigned count, const unsigned stride) {
145+
typedef fd_q::storage storage;
146+
const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x;
147+
if (gid >= count)
148+
return;
149+
const auto value = memory::load<storage, memory::ld_modifier::cs>(src + gid);
150+
memory::store<storage, memory::st_modifier::cs>(dst + gid * stride, value);
151+
}
152+
153+
cudaError_t distribute_values(const fd_q::storage *src, fd_q::storage *dst, const unsigned count, const unsigned stride, cudaStream_t stream) {
154+
const unsigned threads_per_block = 128;
155+
const dim3 block_dim = count < threads_per_block ? count : threads_per_block;
156+
const dim3 grid_dim = (count - 1) / block_dim.x + 1;
157+
distribute_values_kernel<<<grid_dim, block_dim, 0, stream>>>(src, dst, count, stride);
158+
return cudaGetLastError();
159+
}
160+
144161
} // namespace pn

src/pn_kernels.cuh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,6 @@ cudaError_t generate_permutation_matrix(fd_q::storage *values, const fd_q::stora
1818

1919
cudaError_t set_values_from_packed_bits(fd_q::storage *values, const unsigned *packet_bits, unsigned count, cudaStream_t stream);
2020

21+
cudaError_t distribute_values(const fd_q::storage *src, fd_q::storage *dst, unsigned count, unsigned stride, cudaStream_t stream);
22+
2123
} // namespace pn

0 commit comments

Comments
 (0)