[CIR][CUDA] Miscellaneous bugfixes (#1462)

AdUhTkJm · web-flow · commit 182c680e9c2a · 2025-03-11T14:53:27.000-07:00
This PR deals with several issues currently present in CUDA CodeGen. Each of them requires only a few lines to fix, so they're combined in a single PR. **Bug 1.** Suppose we write ```cpp __global__ void kernel(int a, int b); ``` Then when we call this kernel with `cudaLaunchKernel`, the 4th argument to that function is something of the form `void *kernel_args[2] = {&a, &b}`. OG allocates the space for it with `alloca ptr, i32 2`, but that doesn't seem to be feasible in CIR, so we allocated `alloca [2 x ptr], i32 1`. This means there must be an extra GEP compared to OG. In CIR, it means we must add an `array_to_ptrdecay` cast before trying to access the array elements. I missed that in #1332. **Bug 2.** We missed a load instruction for the 6th argument to `cudaLaunchKernel`. It's added back in this PR. **Bug 3.** When we launch a kernel, we first retrieve the return value of `__cudaPushCallConfiguration`. If it's zero, then the call succeeds and we should proceed to call the device stub. In #1348 we did exactly the opposite, calling the device stub only if it's not zero. It's fixed here. **Issue 4.** CallConvLowering is required to make `cudaLaunchKernel` correct. The codepath is unblocked by adding a `getIndirectResult` call at the same place as OG does -- the function is already implemented so we can just call it. After this (and other pending PRs), CIR is now able to compile real CUDA programs. There are still missing features, which will be followed up later.
diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp
@@ -69,11 +69,16 @@ void CIRGenCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
       loc, cir::PointerType::get(voidPtrArrayTy), voidPtrArrayTy, "kernel_args",
       CharUnits::fromQuantity(16));
 
+  mlir::Value kernelArgsDecayed =
+      builder.createCast(cir::CastKind::array_to_ptrdecay, kernelArgs,
+                         cir::PointerType::get(cgm.VoidPtrTy));
+
   // Store arguments into kernelArgs
   for (auto [i, arg] : llvm::enumerate(args)) {
     mlir::Value index =
         builder.getConstInt(loc, llvm::APInt(/*numBits=*/32, i));
-    mlir::Value storePos = builder.createPtrStride(loc, kernelArgs, index);
+    mlir::Value storePos =
+        builder.createPtrStride(loc, kernelArgsDecayed, index);
     builder.CIRBaseBuilderTy::createStore(
         loc, cgf.GetAddrOfLocalVar(arg).getPointer(), storePos);
   }
@@ -166,10 +171,6 @@ void CIRGenCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
   // mlir::Value func = builder.createBitcast(kernel, cgm.VoidPtrTy);
   CallArgList launchArgs;
 
-  mlir::Value kernelArgsDecayed =
-      builder.createCast(cir::CastKind::array_to_ptrdecay, kernelArgs,
-                         cir::PointerType::get(cgm.VoidPtrTy));
-
   launchArgs.add(RValue::get(kernel), launchFD->getParamDecl(0)->getType());
   launchArgs.add(
       RValue::getAggregate(Address(gridDim, CharUnits::fromQuantity(8))),
@@ -182,7 +183,8 @@ void CIRGenCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
   launchArgs.add(
       RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, sharedMem)),
       launchFD->getParamDecl(4)->getType());
-  launchArgs.add(RValue::get(stream), launchFD->getParamDecl(5)->getType());
+  launchArgs.add(RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, stream)),
+                 launchFD->getParamDecl(5)->getType());
 
   mlir::Type launchTy = cgm.getTypes().convertType(launchFD->getType());
   mlir::Operation *launchFn =
@@ -219,13 +221,16 @@ RValue CIRGenCUDARuntime::emitCUDAKernelCallExpr(CIRGenFunction &cgf,
 
   cgf.emitIfOnBoolExpr(
       expr->getConfig(),
+      [&](mlir::OpBuilder &b, mlir::Location l) {
+        b.create<cir::YieldOp>(loc);
+      },
+      loc,
       [&](mlir::OpBuilder &b, mlir::Location l) {
         CIRGenCallee callee = cgf.emitCallee(expr->getCallee());
         cgf.emitCall(expr->getCallee()->getType(), callee, expr, retValue);
         b.create<cir::YieldOp>(loc);
       },
-      loc, [](mlir::OpBuilder &b, mlir::Location l) {},
-      std::optional<mlir::Location>());
+      loc);
 
   return RValue::get(nullptr);
 }
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp
@@ -751,7 +751,7 @@ void X86_64ABIInfo::computeInfo(LowerFunctionInfo &FI) const {
       if (cir::MissingFeatures::vectorType())
         cir_cconv_unreachable("NYI");
     } else {
-      cir_cconv_unreachable("Indirect results are NYI");
+      it->info = getIndirectResult(it->type, FreeIntRegs);
     }
   }
 }
diff --git a/clang/test/CIR/CodeGen/CUDA/simple.cu b/clang/test/CIR/CodeGen/CUDA/simple.cu
@@ -28,12 +28,16 @@ __global__ void global_fn(int a) {}
 // Check for device stub emission.
 
 // CIR-HOST: @_Z24__device_stub__global_fni{{.*}}extra([[Kernel]])
-// CIR-HOST: cir.alloca {{.*}}"kernel_args"
+// CIR-HOST: %[[#CIRKernelArgs:]] = cir.alloca {{.*}}"kernel_args"
+// CIR-HOST: %[[#Decayed:]] = cir.cast(array_to_ptrdecay, %[[#CIRKernelArgs]]
 // CIR-HOST: cir.call @__cudaPopCallConfiguration
 // CIR-HOST: cir.get_global @_Z24__device_stub__global_fni
 // CIR-HOST: cir.call @cudaLaunchKernel
 
 // LLVM-HOST: void @_Z24__device_stub__global_fni
+// LLVM-HOST: %[[#KernelArgs:]] = alloca [1 x ptr], i64 1, align 16
+// LLVM-HOST: %[[#GEP1:]] = getelementptr ptr, ptr %[[#KernelArgs]], i32 0
+// LLVM-HOST: %[[#GEP2:]] = getelementptr ptr, ptr %[[#GEP1]], i64 0
 // LLVM-HOST: call i32 @__cudaPopCallConfiguration
 // LLVM-HOST: call i32 @cudaLaunchKernel(ptr @_Z24__device_stub__global_fni
 
@@ -48,6 +52,7 @@ int main() {
 // CIR-HOST: [[Push:%[0-9]+]] = cir.call @__cudaPushCallConfiguration
 // CIR-HOST: [[ConfigOK:%[0-9]+]] = cir.cast(int_to_bool, [[Push]]
 // CIR-HOST: cir.if [[ConfigOK]] {
+// CIR-HOST: } else {
 // CIR-HOST:   [[Arg:%[0-9]+]] = cir.const #cir.int<1>
 // CIR-HOST:   cir.call @_Z24__device_stub__global_fni([[Arg]])
 // CIR-HOST: }
@@ -58,9 +63,9 @@ int main() {
 // LLVM-HOST: call void @_ZN4dim3C1Ejjj
 // LLVM-HOST: call void @_ZN4dim3C1Ejjj
 // LLVM-HOST: [[LLVMConfigOK:%[0-9]+]] = call i32 @__cudaPushCallConfiguration
-// LLVM-HOST: br [[LLVMConfigOK]], label %[[Good:[0-9]+]], label [[Bad:[0-9]+]]
-// LLVM-HOST: [[Good]]:
+// LLVM-HOST: br [[LLVMConfigOK]], label %[[#Good:]], label [[#Bad:]]
+// LLVM-HOST: [[#Good]]:
+// LLVM-HOST:   br label [[#End:]]
+// LLVM-HOST: [[#Bad]]:
 // LLVM-HOST:   call void @_Z24__device_stub__global_fni
-// LLVM-HOST:   br label [[Bad]]
-// LLVM-HOST: [[Bad]]:
-// LLVM-HOST:   ret i32
+// LLVM-HOST:   br label [[#End]]

Original file line number	Diff line number	Diff line change
`@@ -751,7 +751,7 @@ void X86_64ABIInfo::computeInfo(LowerFunctionInfo &FI) const {`
`751`	`751`	`if (cir::MissingFeatures::vectorType())`
`752`	`752`	`cir_cconv_unreachable("NYI");`
`753`	`753`	`} else {`
`754`		`- cir_cconv_unreachable("Indirect results are NYI");`
	`754`	`+ it->info = getIndirectResult(it->type, FreeIntRegs);`
`755`	`755`	`}`
`756`	`756`	`}`
`757`	`757`	`}`