[CIR][CUDA] Register __global__ functions

AdUhTkJm · AdUhTkJm · commit 014a7185e2ce · 2025-03-05T22:47:36.000Z
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -123,13 +123,16 @@ struct LoweringPreparePass : public LoweringPrepareBase<LoweringPreparePass> {
   /// CUDA related
   /// ------------
 
-  // Maps CUDA device stub name to kernel name.
-  llvm::DenseMap<llvm::StringRef, std::string> cudaKernelMap;
+  // Maps CUDA kernel name to device stub function.
+  std::unordered_map<std::string, FuncOp> cudaKernelMap;
+  llvm::StringRef cudaPrefix;
 
   void buildCUDAModuleCtor();
   void buildCUDAModuleDtor();
   std::optional<FuncOp> buildCUDARegisterGlobals();
 
+  std::string addUnderscoredPrefix(llvm::StringRef cudaFunctionName);
+
   ///
   /// AST related
   /// -----------
@@ -184,6 +187,8 @@ struct LoweringPreparePass : public LoweringPrepareBase<LoweringPreparePass> {
   llvm::SmallVector<mlir::Attribute, 4> globalDtorList;
   /// List of annotations in the module
   llvm::SmallVector<mlir::Attribute, 4> globalAnnotations;
+
+  TypeSizeInfoAttr typeSizeInfo;
 };
 } // namespace
 
@@ -983,6 +988,11 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   if (astCtx->getLangOpts().GPURelocatableDeviceCode)
     llvm_unreachable("NYI");
 
+  // For CUDA without -fgpu-rdc, it's safe to stop generating ctor
+  // if there's nothing to register.
+  if (cudaKernelMap.empty())
+    return;
+
   // There's no device-side binary, so no need to proceed for CUDA.
   // HIP has to create an external symbol in this case, which is NYI.
   auto cudaBinaryHandleAttr =
@@ -995,18 +1005,14 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   std::string cudaGPUBinaryName =
       cast<CUDABinaryHandleAttr>(cudaBinaryHandleAttr).getName();
 
-  llvm::StringRef prefix = "cuda";
+  cudaPrefix = "cuda";
 
   constexpr unsigned cudaFatMagic = 0x466243b1;
   constexpr unsigned hipFatMagic = 0x48495046; // "HIPF"
 
   const unsigned fatMagic =
       astCtx->getLangOpts().HIP ? hipFatMagic : cudaFatMagic;
 
-  auto addUnderscoredPrefix = [&](llvm::StringRef name) -> std::string {
-    return ("__" + prefix + name).str();
-  };
-
   // MAC OS X needs special care, but we haven't supported that in CIR yet.
   assert(!cir::MissingFeatures::checkMacOSXTriple());
 
@@ -1015,15 +1021,11 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
 
   mlir::Location loc = theModule.getLoc();
 
-  // Extract types from the module.
-  auto typeSizesAttr = cast<TypeSizeInfoAttr>(
-      theModule->getAttr(CIRDialect::getTypeSizeInfoAttrName()));
-
   auto voidTy = VoidType::get(&getContext());
   auto voidPtrTy = PointerType::get(voidTy);
   auto voidPtrPtrTy = PointerType::get(voidPtrTy);
-  auto intTy = typeSizesAttr.getIntType(&getContext());
-  auto charTy = typeSizesAttr.getCharType(&getContext());
+  auto intTy = typeSizeInfo.getIntType(&getContext());
+  auto charTy = typeSizeInfo.getCharType(&getContext());
 
   // Read the GPU binary and create a constant array for it.
   llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> cudaGPUBinaryOrErr =
@@ -1066,22 +1068,30 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
 
   std::string fatbinWrapperName = addUnderscoredPrefix("_fatbin_wrapper");
   GlobalOp fatbinWrapper = builder.create<GlobalOp>(
-      loc, fatbinWrapperName, fatbinWrapperType, /*isConstant=*/false,
+      loc, fatbinWrapperName, fatbinWrapperType, /*isConstant=*/true,
       /*linkage=*/cir::GlobalLinkageKind::InternalLinkage);
   fatbinWrapper.setPrivate();
   fatbinWrapper.setSection(fatbinSectionName);
 
   auto magicInit = IntAttr::get(intTy, fatMagic);
   auto versionInit = IntAttr::get(intTy, 1);
-  // `fatbinInit` is only a placeholder. The value will be initialized at the
-  // beginning of module ctor.
-  auto fatbinInit = builder.getConstNullPtrAttr(voidPtrTy);
+  auto fatbinStrSymbol =
+      mlir::FlatSymbolRefAttr::get(fatbinStr.getSymNameAttr());
+  auto fatbinInit = GlobalViewAttr::get(voidPtrTy, fatbinStrSymbol);
   auto unusedInit = builder.getConstNullPtrAttr(voidPtrTy);
   fatbinWrapper.setInitialValueAttr(cir::ConstStructAttr::get(
       fatbinWrapperType,
       ArrayAttr::get(&getContext(),
                      {magicInit, versionInit, fatbinInit, unusedInit})));
 
+  // GPU fat binary handle is also a global variable in OG.
+  std::string gpubinHandleName = addUnderscoredPrefix("_gpubin_handle");
+  auto gpubinHandle = builder.create<GlobalOp>(
+      loc, gpubinHandleName, voidPtrPtrTy,
+      /*isConstant=*/false, /*linkage=*/GlobalLinkageKind::InternalLinkage);
+  gpubinHandle.setInitialValueAttr(builder.getConstNullPtrAttr(voidPtrPtrTy));
+  gpubinHandle.setPrivate();
+
   // Declare this function:
   //    void **__{cuda|hip}RegisterFatBinary(void *);
 
@@ -1098,25 +1108,131 @@ void LoweringPreparePass::buildCUDAModuleCtor() {
   globalCtorList.push_back(GlobalCtorAttr::get(&getContext(), moduleCtorName));
   builder.setInsertionPointToStart(moduleCtor.addEntryBlock());
 
-  auto wrapper = builder.createGetGlobal(fatbinWrapper);
-  // Put fatbinStr inside fatbinWrapper.
-  mlir::Value fatbinStrValue = builder.createGetGlobal(fatbinStr);
-  mlir::Value fatbinField = builder.createGetMemberOp(loc, wrapper, "", 2);
-  builder.createStore(loc, fatbinStrValue, fatbinField);
-
   // Register binary with CUDA runtime. This is substantially different in
   // default mode vs. separate compilation.
   // Corresponding code:
   //     gpuBinaryHandle = __cudaRegisterFatBinary(&fatbinWrapper);
+  auto wrapper = builder.createGetGlobal(fatbinWrapper);
   auto fatbinVoidPtr = builder.createBitcast(wrapper, voidPtrTy);
-  auto gpuBinaryHandle = builder.createCallOp(loc, regFunc, fatbinVoidPtr);
+  auto gpuBinaryHandleCall = builder.createCallOp(loc, regFunc, fatbinVoidPtr);
+  auto gpuBinaryHandle = gpuBinaryHandleCall.getResult();
+  // Store the value back to the global `__cuda_gpubin_handle`.
+  auto gpuBinaryHandleGlobal = builder.createGetGlobal(gpubinHandle);
+  builder.createStore(loc, gpuBinaryHandle, gpuBinaryHandleGlobal);
+
+  // Generate __cuda_register_globals and call it.
+  std::optional<FuncOp> regGlobal = buildCUDARegisterGlobals();
+  if (regGlobal) {
+    builder.createCallOp(loc, *regGlobal, gpuBinaryHandle);
+  }
 
-  // This is currently incomplete.
-  // TODO(cir): create __cuda_register_globals(), and call it here.
+  // From CUDA 10.1 onwards, we must call this function to end registration:
+  //      void __cudaRegisterFatBinaryEnd(void **fatbinHandle);
+  // This is CUDA-specific, so no need to use `addUnderscoredPrefix`.
+  if (clang::CudaFeatureEnabled(
+          astCtx->getTargetInfo().getSDKVersion(),
+          clang::CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
+    cir::CIRBaseBuilderTy globalBuilder(getContext());
+    globalBuilder.setInsertionPointToStart(theModule.getBody());
+    FuncOp endFunc =
+        buildRuntimeFunction(globalBuilder, "__cudaRegisterFatBinaryEnd", loc,
+                             FuncType::get({voidPtrPtrTy}, voidTy));
+    builder.createCallOp(loc, endFunc, gpuBinaryHandle);
+  }
 
   builder.create<cir::ReturnOp>(loc);
 }
 
+/// Returns `cudaFunctionName` prefixed with "__" followed by `cudaPrefix`;
+/// with a prefix of "cuda", "_fatbin_wrapper" becomes
+/// "__cuda_fatbin_wrapper".
+std::string
+LoweringPreparePass::addUnderscoredPrefix(llvm::StringRef cudaFunctionName) {
+  std::string prefixed = "__";
+  prefixed += cudaPrefix;
+  prefixed += cudaFunctionName;
+  return prefixed;
+}
+
+/// Builds the internal function `__<prefix>_register_globals(void **)`, whose
+/// body registers every entry of `cudaKernelMap` with the runtime by calling
+/// `__<prefix>RegisterFunction` once per kernel.
+///
+/// \returns the generated function, or std::nullopt when `cudaKernelMap` is
+/// empty, in which case nothing is created.
+std::optional<FuncOp> LoweringPreparePass::buildCUDARegisterGlobals() {
+  // There is nothing to register.
+  if (cudaKernelMap.empty())
+    return std::nullopt;
+
+  cir::CIRBaseBuilderTy builder(getContext());
+  builder.setInsertionPointToStart(theModule.getBody());
+
+  auto loc = theModule.getLoc();
+
+  auto voidTy = cir::VoidType::get(&getContext());
+  auto voidPtrTy = cir::PointerType::get(voidTy);
+  auto voidPtrPtrTy = cir::PointerType::get(voidPtrTy);
+  auto intTy = typeSizeInfo.getIntType(&getContext());
+  auto charTy = typeSizeInfo.getCharType(&getContext());
+
+  // Create the function:
+  //      void __cuda_register_globals(void **fatbinHandle)
+  std::string regGlobalFuncName = addUnderscoredPrefix("_register_globals");
+  auto regGlobalFuncTy = FuncType::get({voidPtrPtrTy}, voidTy);
+  FuncOp regGlobalFunc =
+      buildRuntimeFunction(builder, regGlobalFuncName, loc, regGlobalFuncTy,
+                           /*linkage=*/GlobalLinkageKind::InternalLinkage);
+  // From here on `builder` emits into the body of the new function.
+  builder.setInsertionPointToStart(regGlobalFunc.addEntryBlock());
+
+  // Extract the GPU binary handle argument.
+  mlir::Value fatbinHandle = *regGlobalFunc.args_begin();
+
+  // Declare CUDA internal functions:
+  // int __cudaRegisterFunction(
+  //   void **fatbinHandle,
+  //   const char *hostFunc,
+  //   char *deviceFunc,
+  //   const char *deviceName,
+  //   int threadLimit,
+  //   uint3 *tid, uint3 *bid, dim3 *bDim, dim3 *gDim,
+  //   int *wsize
+  // )
+  // OG doesn't care about the types at all. They're treated as void*.
+  // A separate builder keeps module-level declarations and string globals
+  // at the start of the module while `builder` stays inside the function.
+  cir::CIRBaseBuilderTy globalBuilder(getContext());
+  globalBuilder.setInsertionPointToStart(theModule.getBody());
+
+  FuncOp cudaRegisterFunction = buildRuntimeFunction(
+      globalBuilder, addUnderscoredPrefix("RegisterFunction"), loc,
+      FuncType::get({voidPtrPtrTy, voidPtrTy, voidPtrTy, voidPtrTy, intTy,
+                     voidPtrTy, voidPtrTy, voidPtrTy, voidPtrTy, voidPtrTy},
+                    intTy));
+
+  // Creates a private constant global named ".str<str>" that holds `str`.
+  auto makeConstantString = [&](llvm::StringRef str) -> GlobalOp {
+    // One element more than the string itself, leaving room for the
+    // terminating zero.
+    auto strType = ArrayType::get(&getContext(), charTy, 1 + str.size());
+
+    auto tmpString = globalBuilder.create<GlobalOp>(
+        loc, (".str" + str).str(), strType, /*isConstant=*/true,
+        /*linkage=*/cir::GlobalLinkageKind::PrivateLinkage);
+
+    // The bytes are passed as-is: appending "\0" through a Twine adds
+    // nothing, because a C string literal ends at its first NUL. The zero
+    // terminator comes from the extra array element reserved above.
+    tmpString.setInitialValueAttr(
+        ConstArrayAttr::get(strType, StringAttr::get(&getContext(), str)));
+    tmpString.setPrivate();
+    return tmpString;
+  };
+
+  auto cirNullPtr = builder.getNullPtr(voidPtrTy, loc);
+  // Bind by reference so the kernel name string is not copied on every
+  // iteration.
+  // NOTE(review): `cudaKernelMap` is a std::unordered_map, so the order of
+  // the emitted registration calls follows its iteration order; confirm
+  // this is acceptable for reproducible output.
+  for (auto &[kernelName, deviceStub] : cudaKernelMap) {
+    GlobalOp deviceFuncStr = makeConstantString(kernelName);
+    mlir::Value deviceFunc = builder.createBitcast(
+        builder.createGetGlobal(deviceFuncStr), voidPtrTy);
+    // The host-side handle is the address of the device stub function.
+    mlir::Value hostFunc = builder.createBitcast(
+        builder.create<GetGlobalOp>(
+            loc, PointerType::get(deviceStub.getFunctionType()),
+            mlir::FlatSymbolRefAttr::get(deviceStub.getSymNameAttr())),
+        voidPtrTy);
+    // The kernel name string serves as both `deviceFunc` and `deviceName`;
+    // `threadLimit` is -1 and the remaining pointer arguments are null.
+    builder.createCallOp(
+        loc, cudaRegisterFunction,
+        {fatbinHandle, hostFunc, deviceFunc, deviceFunc,
+         builder.create<ConstantOp>(loc, IntAttr::get(intTy, -1)), cirNullPtr,
+         cirNullPtr, cirNullPtr, cirNullPtr, cirNullPtr});
+  }
+
+  // TODO(cir): registration for global variables.
+
+  builder.create<ReturnOp>(loc);
+  return regGlobalFunc;
+}
+
 void LoweringPreparePass::lowerDynamicCastOp(DynamicCastOp op) {
   CIRBaseBuilderTy builder(getContext());
   builder.setInsertionPointAfter(op);
@@ -1378,11 +1494,10 @@ void LoweringPreparePass::runOnOp(Operation *op) {
       globalDtorList.push_back(globalDtor);
     }
     if (auto attr = fnOp.getExtraAttrs().getElements().get(
-            CIRDialect::getCUDABinaryHandleAttrName())) {
-      auto cudaBinaryAttr = dyn_cast<CUDABinaryHandleAttr>(attr);
-      std::string kernelName = cudaBinaryAttr.getName();
-      llvm::StringRef stubName = fnOp.getSymName();
-      cudaKernelMap[stubName] = kernelName;
+            CUDAKernelNameAttr::getMnemonic())) {
+      auto cudaBinaryAttr = dyn_cast<CUDAKernelNameAttr>(attr);
+      std::string kernelName = cudaBinaryAttr.getKernelName();
+      cudaKernelMap[kernelName] = fnOp;
     }
     if (std::optional<mlir::ArrayAttr> annotations = fnOp.getAnnotations())
       addGlobalAnnotations(fnOp, annotations.value());
@@ -1399,6 +1514,13 @@ void LoweringPreparePass::runOnOperation() {
     datalayout.emplace(theModule);
   }
 
+  if (astCtx->getLangOpts().CUDA) {
+    cudaPrefix = "cuda";
+  }
+
+  typeSizeInfo = cast<TypeSizeInfoAttr>(
+      theModule->getAttr(CIRDialect::getTypeSizeInfoAttrName()));
+
   llvm::SmallVector<Operation *> opsToTransform;
 
   op->walk([&](Operation *op) {
diff --git a/clang/test/CIR/CodeGen/CUDA/registration.cu b/clang/test/CIR/CodeGen/CUDA/registration.cu
@@ -13,56 +13,82 @@
 // RUN:            %s -o %t.ll
 // RUN: FileCheck --check-prefix=LLVM-HOST --input-file=%t.ll %s
 
-// COM: OG doesn't emit anything if there is nothing to register.
-// COM: Here we still emit the template for test purposes,
-// COM: and the behaviour will be fixed later.
-
 // CIR-HOST: module @"{{.*}}" attributes {
 // CIR-HOST:   cir.cu.binary_handle = #cir.cu.binary_handle<{{.*}}.fatbin>,
 // CIR-HOST:   cir.global_ctors = [#cir.global_ctor<"__cuda_module_ctor", {{[0-9]+}}>]
 // CIR-HOST: }
 
+// CIR-HOST: cir.global "private" constant cir_private @".str_Z2fnv" =
+// CIR-HOST-SAME: #cir.const_array<"_Z2fnv", trailing_zeros>
+
+// COM: In OG this variable has an `unnamed_addr` attribute.
+// LLVM-HOST: @.str_Z2fnv = private constant [7 x i8] c"_Z2fnv\00"
+
+// The corresponding CIR checks for these three variables are further down.
+// They are here because LLVM IR puts global variables at the front of file.
+
+// LLVM-HOST: @__cuda_fatbin_str = private constant [14 x i8] c"sample fatbin\0A", section ".nv_fatbin"
+// LLVM-HOST: @__cuda_fatbin_wrapper = internal constant {
+// LLVM-HOST:   i32 1180844977, i32 1, ptr @__cuda_fatbin_str, ptr null
+// LLVM-HOST: }
+// LLVM-HOST: @llvm.global_ctors = {{.*}}ptr @__cuda_module_ctor
+
+__global__ void fn() {} // Kernel under test; the checks refer to it as _Z2fnv.
+
+// CIR-HOST: cir.func internal private @__cuda_register_globals(%[[FatbinHandle:[a-zA-Z0-9]+]]{{.*}}) {
+// CIR-HOST:   %[[#NULL:]] = cir.const #cir.ptr<null>
+// CIR-HOST:   %[[#T1:]] = cir.get_global @".str_Z2fnv"
+// CIR-HOST:   %[[#DeviceFn:]] = cir.cast(bitcast, %[[#T1]]
+// CIR-HOST:   %[[#T2:]] = cir.get_global @_Z17__device_stub__fnv
+// CIR-HOST:   %[[#HostFn:]] = cir.cast(bitcast, %[[#T2]]
+// CIR-HOST:   %[[#MinusOne:]] = cir.const #cir.int<-1>
+// CIR-HOST:   cir.call @__cudaRegisterFunction(
+// CIR-HOST-SAME: %[[FatbinHandle]],
+// CIR-HOST-SAME: %[[#HostFn]],
+// CIR-HOST-SAME: %[[#DeviceFn]],
+// CIR-HOST-SAME: %[[#DeviceFn]],
+// CIR-HOST-SAME: %[[#MinusOne]],
+// CIR-HOST-SAME: %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]], %[[#NULL]])
+// CIR-HOST: }
+
+// LLVM-HOST: define internal void @__cuda_register_globals(ptr %[[#LLVMFatbin:]]) {
+// LLVM-HOST:   call i32 @__cudaRegisterFunction(
+// LLVM-HOST-SAME: ptr %[[#LLVMFatbin]],
+// LLVM-HOST-SAME: ptr @_Z17__device_stub__fnv,
+// LLVM-HOST-SAME: ptr @.str_Z2fnv,
+// LLVM-HOST-SAME: ptr @.str_Z2fnv,
+// LLVM-HOST-SAME: i32 -1,
+// LLVM-HOST-SAME: ptr null, ptr null, ptr null, ptr null, ptr null)
+// LLVM-HOST: }
+
 // The content in const array should be the same as echoed above,
 // with a trailing line break ('\n', 0x0A).
 // CIR-HOST: cir.global "private" constant cir_private @__cuda_fatbin_str =
 // CIR-HOST-SAME: #cir.const_array<"sample fatbin\0A">
 // CIR-HOST-SAME: {{.*}}section = ".nv_fatbin"
 
-// LLVM-HOST: @__cuda_fatbin_str = private constant [14 x i8] c"sample fatbin\0A", section ".nv_fatbin"
-
 // The first value is CUDA file head magic number.
-// CIR-HOST: cir.global "private" internal @__cuda_fatbin_wrapper
+// CIR-HOST: cir.global "private" constant internal @__cuda_fatbin_wrapper
 // CIR-HOST: = #cir.const_struct<{
 // CIR-HOST:   #cir.int<1180844977> : !s32i,
 // CIR-HOST:   #cir.int<1> : !s32i,
-// CIR-HOST:   #cir.ptr<null> : !cir.ptr<!void>,
+// CIR-HOST:   #cir.global_view<@__cuda_fatbin_str> : !cir.ptr<!void>,
 // CIR-HOST:   #cir.ptr<null> : !cir.ptr<!void>
 // CIR-HOST: }>
 // CIR-HOST-SAME: {{.*}}section = ".nvFatBinSegment"
 
-// COM: @__cuda_fatbin_wrapper is constant for OG.
-// COM: However, as we don't have a way to put @__cuda_fatbin_str directly
-// COM: to its third field in Clang IR, we can't mark this variable as 
-// COM: constant: we need to initialize it later, at the beginning
-// COM: of @__cuda_module_ctor.
-
-// LLVM-HOST: @__cuda_fatbin_wrapper = internal global {
-// LLVM-HOST:   i32 1180844977, i32 1, ptr null, ptr null
-// LLVM-HOST: }
-
-// LLVM-HOST: @llvm.global_ctors = {{.*}}ptr @__cuda_module_ctor
-
 // CIR-HOST: cir.func private @__cudaRegisterFatBinary
 // CIR-HOST: cir.func {{.*}} @__cuda_module_ctor() {
-// CIR-HOST:   %[[#F0:]] = cir.get_global @__cuda_fatbin_wrapper
-// CIR-HOST:   %[[#F1:]] = cir.get_global @__cuda_fatbin_str
-// CIR-HOST:   %[[#F2:]] = cir.get_member %[[#F0]][2]
-// CIR-HOST:   %[[#F3:]] = cir.cast(bitcast, %[[#F2]]
-// CIR-HOST:   cir.store %[[#F1]], %[[#F3]]
-// CIR-HOST:   cir.call @__cudaRegisterFatBinary
+// CIR-HOST:   %[[#Fatbin:]] = cir.call @__cudaRegisterFatBinary
+// CIR-HOST:   %[[#FatbinGlobal:]] = cir.get_global @__cuda_gpubin_handle
+// CIR-HOST:   cir.store %[[#Fatbin]], %[[#FatbinGlobal]]
+// CIR-HOST:   cir.call @__cuda_register_globals
+// CIR-HOST:   cir.call @__cudaRegisterFatBinaryEnd
 // CIR-HOST: }
 
 // LLVM-HOST: define internal void @__cuda_module_ctor() {
-// LLVM-HOST:   store ptr @__cuda_fatbin_str, ptr getelementptr {{.*}}, ptr @__cuda_fatbin_wrapper
-// LLVM-HOST:   call ptr @__cudaRegisterFatBinary(ptr @__cuda_fatbin_wrapper)
+// LLVM-HOST:  %[[#LLVMFatbin:]] = call ptr @__cudaRegisterFatBinary(ptr @__cuda_fatbin_wrapper)
+// LLVM-HOST:  store ptr %[[#LLVMFatbin]], ptr @__cuda_gpubin_handle
+// LLVM-HOST:  call void @__cuda_register_globals
+// LLVM-HOST:  call void @__cudaRegisterFatBinaryEnd
 // LLVM-HOST: }