Skip to content

Commit 0064436

Browse files
facebook-github-bot authored and committed
Add kernel execution timing to the KernelLauncher class (pytorch#4201)
Summary: Pull Request resolved: pytorch#4201 - Add kernel execution timing to the `KernelLauncher` class Reviewed By: jiawenliu64 Differential Revision: D75382325 fbshipit-source-id: 216a5d57d8f0410e5d58ffddcf509255057a5d50
1 parent 3a8ab50 commit 0064436

File tree

3 files changed

+121
-28
lines changed

3 files changed

+121
-28
lines changed

fbgemm_gpu/bench/verify_fp16_stochastic_benchmark.cu

Lines changed: 9 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
#include "fbgemm_gpu/utils/device_cache_flusher.cuh"
2020
#include "fbgemm_gpu/utils/host_device_buffer_pair.cuh"
21+
#include "fbgemm_gpu/utils/kernel_launcher.cuh"
2122
#include "fbgemm_gpu/utils/stochastic_rounding.cuh"
2223

2324
namespace fbgemm_gpu {
@@ -165,31 +166,14 @@ void time_kernel_run(
165166
Args&&... args) {
166167
std::cout << "[" << description << "] starting kernel run ..." << std::endl;
167168

168-
// Create CUDA events to time the kernel
169-
cudaEvent_t start, stop;
170-
cudaEventCreate(&start);
171-
cudaEventCreate(&stop);
172-
173-
// Execute the kernel, while recording the start and end times
174-
cudaEventRecord(start);
175-
kernel<<<grid, block>>>(std::forward<Args>(args)...);
176-
cudaEventRecord(stop);
177-
178-
// Synchronize to ensure that the kernel has completed
179-
C10_CUDA_KERNEL_LAUNCH_CHECK();
180-
cudaEventSynchronize(stop);
181-
182-
// Check for kernel execution errors
183-
const auto e = cudaGetLastError();
184-
if (e != cudaSuccess) {
185-
std::cout << "[" << description
186-
<< "] CUDA Failure: " << cudaGetErrorString(e) << std::endl;
187-
std::exit(-1);
188-
}
189-
190-
// Calculate the elapsed time in milliseconds
191-
float milliseconds = 0;
192-
cudaEventElapsedTime(&milliseconds, start, stop);
169+
const auto kernel_ = kernel;
170+
const auto milliseconds = FBGEMM_TIME_KERNEL_RUN(
171+
kernel_,
172+
grid,
173+
block,
174+
0,
175+
at::cuda::getCurrentCUDAStream(),
176+
std::forward<Args>(args)...);
193177

194178
std::cout << "[" << description << "] " << milliseconds << " ms\n"
195179
<< std::endl;
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#pragma once
10+
11+
#include <ATen/ATen.h>
12+
#include <ATen/cuda/CUDAContext.h>
13+
#include <c10/cuda/CUDAStream.h>
14+
15+
#include <stdexcept>
16+
17+
namespace fbgemm_gpu::utils {
18+
19+
// Times work submitted to a single CUDA stream using a pair of CUDA events.
//
// Usage: call start() immediately before the kernel launch, stop() right
// after, then elapsedMillis() to retrieve the measured time.  Each instance
// is single-use: start() and stop() may each be called exactly once, and the
// calls are order-enforced (violations throw std::logic_error).
//
// NOTE(review): timing is scoped to the stream passed at construction; work
// launched on other streams is not measured.  Instances are not thread-safe
// (started_/stopped_ are unsynchronized) — confirm single-threaded use.
class KernelExecutionTimer {
 public:
  explicit KernelExecutionTimer(const c10::cuda::CUDAStream stream)
      : stream_(stream) {
    C10_CUDA_CHECK(cudaEventCreate(&start_));
    // If creating the second event fails, the constructor throws and the
    // destructor will never run for this partially-constructed object, so
    // release the first event here to avoid leaking it.
    const auto err = cudaEventCreate(&stop_);
    if (err != cudaSuccess) {
      cudaEventDestroy(start_);
      C10_CUDA_CHECK(err);
    }
  }

  // Non-copyable and non-movable: each instance owns its two events 1:1.
  KernelExecutionTimer(const KernelExecutionTimer&) = delete;
  KernelExecutionTimer& operator=(const KernelExecutionTimer&) = delete;
  KernelExecutionTimer(KernelExecutionTimer&&) = delete;
  KernelExecutionTimer& operator=(KernelExecutionTimer&&) = delete;

  ~KernelExecutionTimer() {
    // Destructors must not throw (throwing during stack unwinding calls
    // std::terminate), so warn on failure instead of using C10_CUDA_CHECK.
    C10_CUDA_CHECK_WARN(cudaEventDestroy(start_));
    C10_CUDA_CHECK_WARN(cudaEventDestroy(stop_));
  }

  // Records the start event on the stream.
  // Throws std::logic_error if called more than once.
  void start() {
    if (started_) {
      throw std::logic_error("Cannot call start() more than once.");
    }
    C10_CUDA_CHECK(cudaEventRecord(start_, stream_));
    started_ = true;
  }

  // Records the stop event on the stream.
  // Throws std::logic_error if start() has not been called yet, or if
  // stop() has already been called.
  void stop() {
    if (!started_) {
      throw std::logic_error("Must call start() before stop().");
    }
    if (stopped_) {
      throw std::logic_error("Cannot call stop() more than once.");
    }
    C10_CUDA_CHECK(cudaEventRecord(stop_, stream_));
    stopped_ = true;
  }

  // Blocks until the stop event has completed, then returns the elapsed
  // time between the two recorded events in milliseconds.
  // Throws std::logic_error if stop() has not been called.
  float elapsedMillis() const {
    if (!stopped_) {
      throw std::logic_error(
          "Must call stop() before retrieving elapsed time.");
    }
    float milliseconds = 0;
    C10_CUDA_CHECK(cudaEventSynchronize(stop_)); // Ensure timing is complete
    C10_CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start_, stop_));
    return milliseconds;
  }

 private:
  cudaEvent_t start_;
  cudaEvent_t stop_;
  const c10::cuda::CUDAStream stream_;
  bool started_ = false;
  bool stopped_ = false;
};
74+
75+
} // namespace fbgemm_gpu::utils

fbgemm_gpu/include/fbgemm_gpu/utils/kernel_launcher.cuh

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,11 @@
1313
#include <c10/cuda/CUDAStream.h>
1414

1515
#include "fbgemm_gpu/utils/device_properties.cuh"
16+
#include "fbgemm_gpu/utils/kernel_execution_timer.cuh"
1617
#include "fbgemm_gpu/utils/source_context.h"
1718
#include "fbgemm_gpu/utils/tensor_accessor_builder.h"
1819

20+
#include <memory>
1921
#include <type_traits>
2022

2123
namespace fbgemm_gpu::utils {
@@ -91,7 +93,8 @@ decltype(auto) check_kernel_arg(const SourceContext& context, T&& arg) {
9193
template <
9294
bool EnableDSA = false,
9395
bool EnableBarrierIsolation = false,
94-
bool EnableNaNChecks = false>
96+
bool EnableNaNChecks = false,
97+
bool EnableExecutionTimer = false>
9598
struct KernelLauncher {
9699
const SourceContext context;
97100

@@ -263,17 +266,19 @@ struct KernelLauncher {
263266
}
264267

265268
template <typename KernelFunc, typename... Args>
266-
inline void launch_kernel(
269+
inline auto launch_kernel(
267270
const KernelFunc& kernel,
268271
const dim3 grid,
269272
const dim3 block,
270273
const size_t shared_mem_per_block,
271274
const c10::cuda::CUDAStream stream,
272-
Args&&... args) const {
275+
Args&&... args) const
276+
-> std::conditional_t<EnableExecutionTimer, float, void> {
273277
// Fetch device properties from the stream information
274278
const auto device = stream.device_index();
275279
const auto properties = *at::cuda::getDeviceProperties(device);
276280
const auto streamId = stream.id();
281+
[[maybe_unused]] std::unique_ptr<KernelExecutionTimer> timer = nullptr;
277282

278283
// Check that the grid sizes are within the range per the device associated
279284
// with the compute stream
@@ -305,6 +310,13 @@ struct KernelLauncher {
305310
cudaDeviceSynchronize();
306311
}
307312

313+
// If execution timer is enabled, initialize and start the CUDAEvents-based
314+
// timer prior to kernel launch
315+
if constexpr (EnableExecutionTimer) {
316+
timer = std::make_unique<KernelExecutionTimer>(stream);
317+
timer->start();
318+
}
319+
308320
if constexpr (EnableDSA) {
309321
// This launch code here is essentially the same as the contents of
310322
// TORCH_USE_CUDA_DSA macro, but with the addition of kernel argument
@@ -332,6 +344,11 @@ struct KernelLauncher {
332344
transform_kernel_arg(context, std::forward<Args>(args))...);
333345
}
334346

347+
// If execution timer is enabled, stop the CUDAEvents-based timer
348+
if constexpr (EnableExecutionTimer) {
349+
timer->stop();
350+
}
351+
335352
// If barrier isolation is enabled, synchronize the stream again to wait for
336353
// kernel execution to complete
337354
if constexpr (EnableBarrierIsolation) {
@@ -350,6 +367,11 @@ struct KernelLauncher {
350367
(check_kernel_arg(context.withSummary(summary), std::forward<Args>(args)),
351368
...);
352369
}
370+
371+
// If execution timer is enabled, return the elapsed time in milliseconds
372+
if constexpr (EnableExecutionTimer) {
373+
return timer->elapsedMillis();
374+
}
353375
}
354376
};
355377

@@ -420,3 +442,15 @@ struct KernelLauncher {
420442
location, #KERNEL, _FKL_TFILE_) \
421443
.launch_kernel(kernel, GRID, BLOCK, SMEM, STREAM, __VA_ARGS__); \
422444
}())
445+
446+
// Launches KERNEL through a KernelLauncher configured with the execution
// timer enabled (last template argument = true), and yields the measured
// kernel execution time in milliseconds as the expression's value.
#define FBGEMM_TIME_KERNEL_RUN(KERNEL, GRID, BLOCK, SMEM, STREAM, ...)      \
  ([&] {                                                                    \
    using source_location = fbgemm_gpu::utils::source_location;             \
    constexpr auto location = source_location::current();                   \
    decltype(KERNEL)& kernel_ref = KERNEL;                                  \
                                                                            \
    return fbgemm_gpu::utils::                                              \
        KernelLauncher<false, _FKL_BLOCKING_, _FKL_TENSORCHECK_, true>(     \
            location, #KERNEL, _FKL_TFILE_)                                 \
            .launch_kernel(                                                 \
                kernel_ref, GRID, BLOCK, SMEM, STREAM, __VA_ARGS__);        \
  }())

0 commit comments

Comments (0)