Skip to content

[WIP] Generate CUDA graphs with CUDASTF #2247

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion backends/tfhe-cuda-backend/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,31 @@ else()
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O3")
endif()

# Fetch CPM.cmake directly from GitHub if not already present
include(FetchContent)
FetchContent_Declare(
CPM
GIT_REPOSITORY https://github.com/cpm-cmake/CPM.cmake
GIT_TAG v0.38.5 # pin to a released CPM.cmake version; change to `main` only if the latest unreleased version is required
)
FetchContent_MakeAvailable(CPM)

include(${cpm_SOURCE_DIR}/cmake/CPM.cmake)

# This will automatically clone CCCL from GitHub and make the exported cmake targets available
CPMAddPackage(
NAME CCCL
GITHUB_REPOSITORY "nvidia/cccl"
GIT_TAG "main"
# The following is required to make the `CCCL::cudax` target available:
OPTIONS "CCCL_ENABLE_UNSTABLE ON"
)

# In production, compile with -arch=sm_70. Add --ptxas-options=-v to report register usage/spills, and -lineinfo for better debugging.
set(CMAKE_CUDA_FLAGS
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
--use_fast_math -Xcompiler -fPIC")
--use_fast_math -Xcompiler -fPIC -DCCCL_DISABLE_EXCEPTIONS -DCUDASTF_DISABLE_CODE_GENERATION")

set(INCLUDE_DIR include)

Expand All @@ -101,6 +121,8 @@ enable_testing()
add_subdirectory(tests_and_benchmarks)
target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})

target_link_libraries(tfhe_cuda_backend PRIVATE CCCL::CCCL CCCL::cudax cuda)

# This is required for rust cargo build
install(TARGETS tfhe_cuda_backend DESTINATION .)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
#include "types/complex/operations.cuh"
#include <vector>

#include <cuda/experimental/stf.cuh>

namespace cudastf = cuda::experimental::stf;

template <typename Torus, class params, sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_multi_bit_programmable_bootstrap_cg_accumulate(
Expand Down Expand Up @@ -383,26 +387,45 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {
// Generate a CUDA graph if the USE_CUDA_GRAPH environment variable is set to a non-zero value
const char *use_graph_env = getenv("USE_CUDA_GRAPH");

cudastf::context ctx(stream);
if (use_graph_env && atoi(use_graph_env) != 0) {
ctx = cudastf::graph_ctx(stream);
}


auto lwe_chunk_size = buffer->lwe_chunk_size;

auto buffer_token = ctx.logical_token();

for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {

auto key_token = ctx.logical_token();
auto result_token = ctx.logical_token();

// Compute a keybundle
execute_compute_keybundle<Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
ctx.task(key_token.write(), buffer_token.write()).set_symbol("compute_keybundle")->*[&](cudaStream_t stf_stream) {
execute_compute_keybundle<Torus, params>(
stf_stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
};

// Accumulate
execute_cg_external_product_loop<Torus, params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
lut_stride);
ctx.task(key_token.read(), buffer_token.rw(), result_token.write()).set_symbol("accumulate")->*[&](cudaStream_t stf_stream) {
execute_cg_external_product_loop<Torus, params>(
stf_stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
lut_stride);
};
}

ctx.finalize();
}

// Verify if the grid size satisfies the cooperative group constraints
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
#include "types/complex/operations.cuh"
#include <vector>

#include <cuda/experimental/stf.cuh>

namespace cudastf = cuda::experimental::stf;

template <typename Torus, class params>
__device__ uint32_t calculates_monomial_degree(const Torus *lwe_array_group,
uint32_t ggsw_idx,
Expand Down Expand Up @@ -682,48 +686,68 @@ __host__ void host_multi_bit_programmable_bootstrap(
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {
// Generate a CUDA graph if the USE_CUDA_GRAPH environment variable is set to a non-zero value
const char *use_graph_env = getenv("USE_CUDA_GRAPH");

cudastf::context ctx(stream);
if (use_graph_env && atoi(use_graph_env) != 0) {
ctx = cudastf::graph_ctx(stream);
}

auto buffer_token = ctx.logical_token();

auto lwe_chunk_size = buffer->lwe_chunk_size;

for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {

auto key_token = ctx.logical_token();
auto result_token = ctx.logical_token();

// Compute a keybundle
execute_compute_keybundle<Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
ctx.task(key_token.write(), buffer_token.write()).set_symbol("compute_keybundle")->*[&](cudaStream_t stf_stream) {
execute_compute_keybundle<Torus, params>(
stf_stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
};

// Accumulate
uint32_t chunk_size = std::min(
lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
for (uint32_t j = 0; j < chunk_size; j++) {
bool is_first_iter = (j + lwe_offset) == 0;
bool is_last_iter =
(j + lwe_offset) + 1 == (lwe_dimension / grouping_factor);
if (is_first_iter) {
execute_step_one<Torus, params, true>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count);
} else {
execute_step_one<Torus, params, false>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count);
}

if (is_last_iter) {
execute_step_two<Torus, params, true>(
stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
num_samples, glwe_dimension, polynomial_size, level_count, j,
num_many_lut, lut_stride);
} else {
execute_step_two<Torus, params, false>(
stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
num_samples, glwe_dimension, polynomial_size, level_count, j,
num_many_lut, lut_stride);
}
ctx.task(key_token.read(), buffer_token.rw(), result_token.rw()).set_symbol("step_one_two")->*[&](cudaStream_t stf_stream) {
bool is_first_iter = (j + lwe_offset) == 0;
bool is_last_iter =
(j + lwe_offset) + 1 == (lwe_dimension / grouping_factor);
if (is_first_iter) {
execute_step_one<Torus, params, true>(
stf_stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count);
} else {
execute_step_one<Torus, params, false>(
stf_stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count);
}

if (is_last_iter) {
execute_step_two<Torus, params, true>(
stf_stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
num_samples, glwe_dimension, polynomial_size, level_count, j,
num_many_lut, lut_stride);
} else {
execute_step_two<Torus, params, false>(
stf_stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
num_samples, glwe_dimension, polynomial_size, level_count, j,
num_many_lut, lut_stride);
}
};
}
}

ctx.finalize();
}
#endif // MULTIBIT_PBS_H