Skip to content

[WIP] Generate CUDA graphs with CUDASTF #2247

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion backends/tfhe-cuda-backend/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,31 @@ else()
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O3")
endif()

# Fetch CPM.cmake directly from GitHub if not already present
include(FetchContent)
FetchContent_Declare(
CPM
GIT_REPOSITORY https://github.com/cpm-cmake/CPM.cmake
GIT_TAG v0.38.5 # pin to a released CPM.cmake version; change to `main` only if the latest unreleased version is required
)
FetchContent_MakeAvailable(CPM)

include(${cpm_SOURCE_DIR}/cmake/CPM.cmake)

# This will automatically clone CCCL from GitHub and make the exported cmake targets available
CPMAddPackage(
NAME CCCL
GITHUB_REPOSITORY "nvidia/cccl"
GIT_TAG "main"
# The following is required to make the `CCCL::cudax` target available:
OPTIONS "CCCL_ENABLE_UNSTABLE ON"
)

# In production, compile with -arch=sm_70. Add --ptxas-options=-v to report register usage/spills, and -lineinfo for better debugging.
set(CMAKE_CUDA_FLAGS
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
--use_fast_math -Xcompiler -fPIC")
--use_fast_math -Xcompiler -fPIC -DCCCL_DISABLE_EXCEPTIONS -DCUDASTF_DISABLE_CODE_GENERATION")

set(INCLUDE_DIR include)

Expand All @@ -101,6 +121,8 @@ enable_testing()
add_subdirectory(tests_and_benchmarks)
target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})

target_link_libraries(tfhe_cuda_backend PRIVATE CCCL::CCCL CCCL::cudax cuda)

# This is required for rust cargo build
install(TARGETS tfhe_cuda_backend DESTINATION .)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
#include "types/complex/operations.cuh"
#include <vector>

#include <cuda/experimental/stf.cuh>

namespace cudastf = cuda::experimental::stf;

template <typename Torus, class params, sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_multi_bit_programmable_bootstrap_cg_accumulate(
Expand Down Expand Up @@ -383,26 +387,45 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {
// Generate a CUDA graph if the USE_CUDA_GRAPH environment variable is set to a non-zero value
const char *use_graph_env = getenv("USE_CUDA_GRAPH");

cudastf::context ctx(stream);
if (use_graph_env && atoi(use_graph_env) != 0) {
ctx = cudastf::graph_ctx(stream);
}


auto lwe_chunk_size = buffer->lwe_chunk_size;

auto buffer_token = ctx.logical_token();

for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {

auto key_token = ctx.logical_token();
auto result_token = ctx.logical_token();

// Compute a keybundle
execute_compute_keybundle<Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
ctx.task(key_token.write(), buffer_token.write()).set_symbol("compute_keybundle")->*[&](cudaStream_t stf_stream) {
execute_compute_keybundle<Torus, params>(
stf_stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
};

// Accumulate
execute_cg_external_product_loop<Torus, params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
lut_stride);
ctx.task(key_token.read(), buffer_token.rw(), result_token.write()).set_symbol("accumulate")->*[&](cudaStream_t stf_stream) {
execute_cg_external_product_loop<Torus, params>(
stf_stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
lut_stride);
};
}

ctx.finalize();
}

// Verify if the grid size satisfies the cooperative group constraints
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
#include "types/complex/operations.cuh"
#include <vector>

#include <cuda/experimental/stf.cuh>

namespace cudastf = cuda::experimental::stf;

template <typename Torus, class params>
__device__ uint32_t calculates_monomial_degree(const Torus *lwe_array_group,
uint32_t ggsw_idx,
Expand Down Expand Up @@ -682,48 +686,68 @@ __host__ void host_multi_bit_programmable_bootstrap(
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {
// Generate a CUDA graph if the USE_CUDA_GRAPH environment variable is set to a non-zero value
const char *use_graph_env = getenv("USE_CUDA_GRAPH");

cudastf::context ctx(stream);
if (use_graph_env && atoi(use_graph_env) != 0) {
ctx = cudastf::graph_ctx(stream);
}

auto buffer_token = ctx.logical_token();

auto lwe_chunk_size = buffer->lwe_chunk_size;

for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {

auto key_token = ctx.logical_token();
auto result_token = ctx.logical_token();

// Compute a keybundle
execute_compute_keybundle<Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
ctx.task(key_token.write(), buffer_token.write()).set_symbol("compute_keybundle")->*[&](cudaStream_t stf_stream) {
execute_compute_keybundle<Torus, params>(
stf_stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
};

// Accumulate
uint32_t chunk_size = std::min(
lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
for (uint32_t j = 0; j < chunk_size; j++) {
bool is_first_iter = (j + lwe_offset) == 0;
bool is_last_iter =
(j + lwe_offset) + 1 == (lwe_dimension / grouping_factor);
if (is_first_iter) {
execute_step_one<Torus, params, true>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count);
} else {
execute_step_one<Torus, params, false>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count);
}

if (is_last_iter) {
execute_step_two<Torus, params, true>(
stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
num_samples, glwe_dimension, polynomial_size, level_count, j,
num_many_lut, lut_stride);
} else {
execute_step_two<Torus, params, false>(
stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
num_samples, glwe_dimension, polynomial_size, level_count, j,
num_many_lut, lut_stride);
}
ctx.task(key_token.read(), buffer_token.rw(), result_token.rw()).set_symbol("step_one_two")->*[&](cudaStream_t stf_stream) {
bool is_first_iter = (j + lwe_offset) == 0;
bool is_last_iter =
(j + lwe_offset) + 1 == (lwe_dimension / grouping_factor);
if (is_first_iter) {
execute_step_one<Torus, params, true>(
stf_stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count);
} else {
execute_step_one<Torus, params, false>(
stf_stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count);
}

if (is_last_iter) {
execute_step_two<Torus, params, true>(
stf_stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
num_samples, glwe_dimension, polynomial_size, level_count, j,
num_many_lut, lut_stride);
} else {
execute_step_two<Torus, params, false>(
stf_stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
num_samples, glwe_dimension, polynomial_size, level_count, j,
num_many_lut, lut_stride);
}
};
}
}

ctx.finalize();
}
#endif // MULTIBIT_PBS_H