Skip to content

Commit cdef6c6

Browse files
committed
[CIR][CUDA] Lowering device and shared variables
1 parent 5a75305 commit cdef6c6

File tree

6 files changed

+84
-12
lines changed

6 files changed

+84
-12
lines changed

clang/include/clang/CIR/Dialect/IR/CIRCUDAAttrs.td

+31
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,24 @@ def CUDAKernelNameAttr : CIR_Attr<"CUDAKernelName",
3535
let assemblyFormat = "`<` $kernel_name `>`";
3636
}
3737

38+
def CUDAShadowNameAttr : CIR_Attr<"CUDAShadowName",
39+
"cu.shadow_name"> {
40+
let summary = "Device-side global variable name for this shadow.";
41+
let description =
42+
[{
43+
This attribute is attached to global variable definitions and records the
44+
mangled name of the global variable used on the device.
45+
46+
In CUDA, __device__, __constant__ and __shared__ variables, as well as
47+
surface and texture variables, will generate a shadow symbol on host.
48+
We must preserve the correspodence in order to generate registration
49+
functions.
50+
}];
51+
52+
let parameters = (ins "std::string":$device_side_name);
53+
let assemblyFormat = "`<` $device_side_name `>`";
54+
}
55+
3856
def CUDABinaryHandleAttr : CIR_Attr<"CUDABinaryHandle",
3957
"cu.binary_handle"> {
4058
let summary = "Fat binary handle for device code.";
@@ -52,4 +70,17 @@ def CUDABinaryHandleAttr : CIR_Attr<"CUDABinaryHandle",
5270
let assemblyFormat = "`<` $name `>`";
5371
}
5472

73+
def CUDAExternallyInitializedAttr : CIR_Attr<"CUDAExternallyInitialized",
74+
"cu.externally_initialized"> {
75+
let summary = "The marked variable is externally initialized.";
76+
let description =
77+
[{
78+
CUDA __device__ and __constant__ variables, along with surface and
79+
textures, might be initialized by host, hence "externally initialized".
80+
Therefore they must be emitted even if they are not referenced.
81+
82+
The attribute corresponds to the attribute on LLVM with the same name.
83+
}];
84+
}
85+
5586
#endif // MLIR_CIR_DIALECT_CIR_CUDA_ATTRS

clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp

+20
Original file line numberDiff line numberDiff line change
@@ -283,3 +283,23 @@ mlir::Operation *CIRGenCUDARuntime::getKernelHandle(cir::FuncOp fn,
283283

284284
return globalOp;
285285
}
286+
287+
void CIRGenCUDARuntime::internalizeDeviceSideVar(
288+
const VarDecl *d, cir::GlobalLinkageKind &linkage) {
289+
if (cgm.getLangOpts().GPURelocatableDeviceCode)
290+
llvm_unreachable("NYI");
291+
292+
// __shared__ variables are odd. Shadows do get created, but
293+
// they are not registered with the CUDA runtime, so they
294+
// can't really be used to access their device-side
295+
// counterparts. It's not clear yet whether it's nvcc's bug or
296+
// a feature, but we've got to do the same for compatibility.
297+
if (d->hasAttr<CUDADeviceAttr>() || d->hasAttr<CUDAConstantAttr>() ||
298+
d->hasAttr<CUDASharedAttr>()) {
299+
linkage = cir::GlobalLinkageKind::InternalLinkage;
300+
}
301+
302+
if (d->getType()->isCUDADeviceBuiltinSurfaceType() ||
303+
d->getType()->isCUDADeviceBuiltinTextureType())
304+
llvm_unreachable("NYI");
305+
}

clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h

+2
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ class CIRGenCUDARuntime {
5858
const CUDAKernelCallExpr *expr,
5959
ReturnValueSlot retValue);
6060
virtual mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl GD);
61+
virtual void internalizeDeviceSideVar(const VarDecl *d,
62+
cir::GlobalLinkageKind &linkage);
6163
};
6264

6365
} // namespace clang::CIRGen

clang/lib/CIR/CodeGen/CIRGenModule.cpp

+16-11
Original file line numberDiff line numberDiff line change
@@ -569,13 +569,13 @@ bool CIRGenModule::shouldEmitCUDAGlobalVar(const VarDecl *global) const {
569569
// their device-side incarnations.
570570

571571
if (global->hasAttr<CUDAConstantAttr>() ||
572-
global->hasAttr<CUDASharedAttr>() ||
573572
global->getType()->isCUDADeviceBuiltinSurfaceType() ||
574573
global->getType()->isCUDADeviceBuiltinTextureType()) {
575574
llvm_unreachable("NYI");
576575
}
577576

578-
return !langOpts.CUDAIsDevice || global->hasAttr<CUDADeviceAttr>();
577+
return !langOpts.CUDAIsDevice || global->hasAttr<CUDADeviceAttr>() ||
578+
global->hasAttr<CUDASharedAttr>();
579579
}
580580

581581
void CIRGenModule::emitGlobal(GlobalDecl gd) {
@@ -598,8 +598,10 @@ void CIRGenModule::emitGlobal(GlobalDecl gd) {
598598
assert(!global->hasAttr<CPUDispatchAttr>() && "NYI");
599599

600600
if (langOpts.CUDA || langOpts.HIP) {
601-
// clang uses the same flag when building HIP code
602-
if (langOpts.CUDAIsDevice) {
601+
if (const auto *vd = dyn_cast<VarDecl>(global)) {
602+
if (!shouldEmitCUDAGlobalVar(vd))
603+
return;
604+
} else if (langOpts.CUDAIsDevice) {
603605
// This will implicitly mark templates and their
604606
// specializations as __host__ __device__.
605607
if (langOpts.OffloadImplicitHostDeviceTemplates)
@@ -621,11 +623,6 @@ void CIRGenModule::emitGlobal(GlobalDecl gd) {
621623
return;
622624
}
623625
}
624-
625-
if (const auto *vd = dyn_cast<VarDecl>(global)) {
626-
if (!shouldEmitCUDAGlobalVar(vd))
627-
return;
628-
}
629626
}
630627

631628
if (langOpts.OpenMP) {
@@ -1394,7 +1391,7 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *d,
13941391
d->getType()->isCUDADeviceBuiltinTextureType());
13951392
if (getLangOpts().CUDA &&
13961393
(isCudaSharedVar || isCudaShadowVar || isCudaDeviceShadowVar))
1397-
assert(0 && "not implemented");
1394+
init = UndefAttr::get(&getMLIRContext(), convertType(d->getType()));
13981395
else if (d->hasAttr<LoaderUninitializedAttr>())
13991396
assert(0 && "not implemented");
14001397
else if (!initExpr) {
@@ -1490,11 +1487,19 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *d,
14901487
cir::GlobalLinkageKind linkage =
14911488
getCIRLinkageVarDefinition(d, /*IsConstant=*/false);
14921489

1493-
// TODO(cir):
14941490
// CUDA B.2.1 "The __device__ qualifier declares a variable that resides on
14951491
// the device. [...]"
14961492
// CUDA B.2.2 "The __constant__ qualifier, optionally used together with
14971493
// __device__, declares a variable that: [...]
1494+
if (langOpts.CUDA && langOpts.CUDAIsDevice) {
1495+
// __shared__ variables is not marked as externally initialized,
1496+
// because they must not be initialized.
1497+
if (linkage != cir::GlobalLinkageKind::InternalLinkage &&
1498+
(d->hasAttr<CUDADeviceAttr>())) {
1499+
gv->setAttr(CUDAExternallyInitializedAttr::getMnemonic(),
1500+
CUDAExternallyInitializedAttr::get(&getMLIRContext()));
1501+
}
1502+
}
14981503

14991504
// Set initializer and finalize emission
15001505
CIRGenModule::setInitializer(gv, init);

clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp

+6
Original file line numberDiff line numberDiff line change
@@ -2397,6 +2397,12 @@ mlir::LogicalResult CIRToLLVMGlobalOpLowering::matchAndRewrite(
23972397

23982398
attributes.push_back(rewriter.getNamedAttr("visibility_", visibility));
23992399

2400+
if (auto extInit =
2401+
op->getAttr(CUDAExternallyInitializedAttr::getMnemonic())) {
2402+
attributes.push_back(rewriter.getNamedAttr("externally_initialized",
2403+
rewriter.getUnitAttr()));
2404+
}
2405+
24002406
if (init.has_value()) {
24012407
if (mlir::isa<cir::FPAttr, cir::IntAttr, cir::BoolAttr>(init.value())) {
24022408
// If a directly equivalent attribute is available, use it.

clang/test/CIR/CodeGen/CUDA/global-vars.cu

+9-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,15 @@
55
// RUN: %s -o %t.cir
66
// RUN: FileCheck --check-prefix=CIR-DEVICE --input-file=%t.cir %s
77

8+
// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir \
9+
// RUN: -fcuda-is-device -emit-llvm -target-sdk-version=12.3 \
10+
// RUN: %s -o %t.cir
11+
// RUN: FileCheck --check-prefix=LLVM-DEVICE --input-file=%t.cir %s
812

913
__device__ int a;
14+
// CIR-DEVICE: cir.global external addrspace(offload_global) @a = #cir.int<0>
15+
// LLVM-DEVICE: @a = addrspace(1) externally_initialized global i32 0, align 4
1016

11-
// CIR-DEVICE: cir.global external addrspace(offload_global) @a = #cir.int<0> : !s32i {alignment = 4 : i64} loc(#loc3)
17+
__shared__ int shared;
18+
// CIR-DEVICE: cir.global external addrspace(offload_local) @shared = #cir.undef
19+
// LLVM-DEVICE: @shared = addrspace(3) global i32 undef, align 4

0 commit comments

Comments
 (0)