Skip to content

Commit 60accc8

Browse files
ermilovmaxim and Google-ML-Automation
authored and committed
implement optional threadpool support in CubinCustomKernelCompiler
PiperOrigin-RevId: 903290909
1 parent adea5dc commit 60accc8

4 files changed

Lines changed: 56 additions & 10 deletions

File tree

xla/backends/gpu/codegen/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,7 @@ cc_library(
387387
"//xla/codegen/emitters:kernel_arguments",
388388
"//xla/service/gpu:launch_dimensions",
389389
"//xla/stream_executor:device_description",
390+
"//xla/tsl/platform:env",
390391
"//xla/tsl/platform:status_macros",
391392
"@com_google_absl//absl/functional:any_invocable",
392393
"@com_google_absl//absl/status:statusor",

xla/backends/gpu/codegen/cubin_custom_kernel_compiler.cc

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ limitations under the License.
2121
#include <utility>
2222
#include <vector>
2323

24+
#include "absl/status/statusor.h"
2425
#include "xla/tsl/platform/status_macros.h"
2526
#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h"
2627
#include "llvm/IR/Module.h"
@@ -40,6 +41,27 @@ xla::Future<std::unique_ptr<Thunk>> CubinCustomKernelCompiler::Compile(
4041
const std::string& sanitized_kernel_name,
4142
const emitters::KernelArguments& kernel_arguments,
4243
const LaunchDimensions& launch_dimensions) {
44+
if (!thread_pool_) {
45+
return CompileImpl(std::move(thunk_info), std::move(kernel_source),
46+
sanitized_kernel_name, kernel_arguments,
47+
launch_dimensions);
48+
}
49+
return tsl::MakeFutureOn(
50+
*thread_pool_->AsExecutor(),
51+
[this, thunk_info = std::move(thunk_info),
52+
kernel_source = std::move(kernel_source), sanitized_kernel_name,
53+
kernel_arguments, launch_dimensions]() mutable {
54+
return CompileImpl(std::move(thunk_info), std::move(kernel_source),
55+
sanitized_kernel_name, kernel_arguments,
56+
launch_dimensions);
57+
});
58+
}
59+
60+
absl::StatusOr<std::unique_ptr<Thunk>> CubinCustomKernelCompiler::CompileImpl(
61+
Thunk::ThunkInfo thunk_info, LlvmKernelSource kernel_source,
62+
const std::string& sanitized_kernel_name,
63+
const emitters::KernelArguments& kernel_arguments,
64+
const LaunchDimensions& launch_dimensions) {
4365
llvm::orc::ThreadSafeModule thread_safe_module =
4466
std::move(kernel_source).thread_safe_module();
4567
llvm::Module* llvm_module = thread_safe_module.getModuleUnlocked();

xla/backends/gpu/codegen/cubin_custom_kernel_compiler.h

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ limitations under the License.
3232
#include "xla/future.h"
3333
#include "xla/service/gpu/launch_dimensions.h"
3434
#include "xla/stream_executor/device_description.h"
35+
#include "xla/tsl/platform/threadpool.h"
3536

3637
namespace xla::gpu {
3738

@@ -45,17 +46,20 @@ using LlvmIrCompiler = absl::AnyInvocable<absl::StatusOr<std::vector<uint8_t>>(
4546
// Implementation of KernelCompiler that compiles LLVM IR to CUBIN format using
4647
// a provided compilation function.
4748
//
48-
// Note: This implementation is currently synchronous. The compilation happens
49+
// Note: CubinCustomKernelCompiler uses the provided thread pool.
50+
// If no thread pool is provided, the compilation happens
4951
// fully within this call, and the result is returned as an immediately ready
5052
// Future.
5153
class CubinCustomKernelCompiler : public KernelCompiler {
5254
public:
5355
CubinCustomKernelCompiler(LlvmIrCompiler compiler,
5456
const se::DeviceDescription& gpu_device_info,
55-
const DebugOptions& debug_options)
57+
const DebugOptions& debug_options,
58+
tsl::thread::ThreadPool* thread_pool = nullptr)
5659
: compiler_(std::move(compiler)),
5760
device_info_(gpu_device_info),
58-
debug_options_(debug_options) {}
61+
debug_options_(debug_options),
62+
thread_pool_(thread_pool) {}
5963

6064
xla::Future<std::unique_ptr<Thunk>> Compile(
6165
Thunk::ThunkInfo thunk_info, LlvmKernelSource kernel_source,
@@ -64,9 +68,16 @@ class CubinCustomKernelCompiler : public KernelCompiler {
6468
const LaunchDimensions& launch_dimensions) override;
6569

6670
private:
71+
absl::StatusOr<std::unique_ptr<Thunk>> CompileImpl(
72+
Thunk::ThunkInfo thunk_info, LlvmKernelSource kernel_source,
73+
const std::string& sanitized_kernel_name,
74+
const emitters::KernelArguments& kernel_arguments,
75+
const LaunchDimensions& launch_dimensions);
76+
6777
LlvmIrCompiler compiler_;
6878
const se::DeviceDescription device_info_;
6979
const DebugOptions debug_options_;
80+
tsl::thread::ThreadPool* thread_pool_;
7081
};
7182

7283
} // namespace xla::gpu

xla/service/gpu/gpu_compiler.cc

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717

1818
#include <algorithm>
1919
#include <array>
20+
#include <atomic>
2021
#include <cstdint>
2122
#include <functional>
2223
#include <memory>
@@ -2519,6 +2520,13 @@ GpuCompiler::CompileToBackendResult(
25192520
module, schedule_metadata.scheduler_mem_limit,
25202521
gpu_topology.gpu_target_config().device_description, alias_info.get()));
25212522

2523+
MaybeOwningThreadPool thread_pool = CreateMaybeOwningThreadPool(
2524+
/*parallelism=*/module->config()
2525+
.debug_options()
2526+
.xla_gpu_force_compilation_parallelism(),
2527+
/*default_thread_pool=*/options.thread_pool,
2528+
/*default_parallelism=*/tsl::port::MaxParallelism());
2529+
25222530
ASSIGN_OR_RETURN(
25232531
bool can_use_link_modules,
25242532
CanUseLinkModules(module->config(),
@@ -2531,6 +2539,7 @@ GpuCompiler::CompileToBackendResult(
25312539
.xla_gpu_enable_llvm_module_compilation_parallelism();
25322540

25332541
CompileModuleResults compile_module_results;
2542+
std::atomic<int> shard_number = 0;
25342543

25352544
{
25362545
xla::llvm_ir::LLVMCommandLineOptionsReleasableLock llvm_options_lock(
@@ -2541,15 +2550,16 @@ GpuCompiler::CompileToBackendResult(
25412550
auto llvm_compiler =
25422551
[&](llvm::Module& llvm_module, const se::DeviceDescription& descr,
25432552
const DebugOptions& opts) -> absl::StatusOr<std::vector<uint8_t>> {
2544-
ASSIGN_OR_RETURN(BackendCompileResult result,
2545-
CompileSingleModule(module->config(), descr, module,
2546-
&llvm_module, false, std::nullopt));
2553+
ASSIGN_OR_RETURN(
2554+
BackendCompileResult result,
2555+
CompileSingleModule(module->config(), descr, module, &llvm_module,
2556+
false, shard_number.fetch_add(1)));
25472557
return std::move(result.binary);
25482558
};
25492559
CubinCustomKernelCompiler kernel_compiler(
25502560
std::move(llvm_compiler),
25512561
gpu_topology.gpu_target_config().device_description,
2552-
module->config().debug_options());
2562+
module->config().debug_options(), thread_pool.get_mutable());
25532563
kernel_compiler.SetPreOptimizationHook([&](const llvm::Module& module) {
25542564
CallUserPreOptimizationHook(module);
25552565
});
@@ -2571,7 +2581,8 @@ GpuCompiler::CompileToBackendResult(
25712581
for (const std::unique_ptr<llvm::Module>& llvm_module :
25722582
compile_module_results.llvm_modules) {
25732583
llvm_ir::DumpIrIfEnabled(*module, *llvm_module,
2574-
/*optimized=*/false);
2584+
/*optimized=*/false,
2585+
std::to_string(shard_number.fetch_add(1)));
25752586
CallUserPreOptimizationHook(*llvm_module);
25762587
}
25772588
if (compile_module_results.llvm_module_constants != nullptr) {
@@ -2613,7 +2624,7 @@ GpuCompiler::CompileToBackendResult(
26132624
gpu_topology.gpu_target_config().device_description,
26142625
module, &*compile_module_results.llvm_modules[0],
26152626
/*relocatable=*/false,
2616-
/*shard_number=*/std::nullopt));
2627+
/*shard_number=*/shard_number.fetch_add(1)));
26172628
}
26182629

26192630
if (!backend_result.asm_text.empty()) {
@@ -3198,13 +3209,14 @@ GpuCompiler::LoadExecutableFromAotResult(
31983209
BufferAssignment::FromProto(proto.buffer_assignment(), hlo_module.get(),
31993210
BufferSizeBytesFunction(), alias_info.get()));
32003211

3212+
std::atomic<int> shard_number = 0;
32013213
auto llvm_compiler =
32023214
[&](llvm::Module& llvm_module, const se::DeviceDescription& descr,
32033215
const DebugOptions& opts) -> absl::StatusOr<std::vector<uint8_t>> {
32043216
ASSIGN_OR_RETURN(
32053217
BackendCompileResult result,
32063218
CompileSingleModule(hlo_module->config(), descr, hlo_module.get(),
3207-
&llvm_module, false, std::nullopt));
3219+
&llvm_module, false, shard_number.fetch_add(1)));
32083220
return std::move(result.binary);
32093221
};
32103222
CubinCustomKernelCompiler kernel_compiler(

0 commit comments

Comments
 (0)