Skip to content

Commit 0ae9dac

Browse files
authored
[OpenACC][CIR] Lower 'num_workers' for parallel/kernels (llvm#136578)
This patch also includes the first one to handle 'device_type' properly, which is where most of the 'challenge' here comes from. From the best I can tell: we must keep two lists of the same size, 1 of all of the 'num_workers' items, and 1 of the 'device_type' value for that 'num_workers'. Additionally, the 'device_type' list can only handle single 'device_type' values, so we have to duplicate the 'num_workers' items in cases where there are multiple applicable 'device_type' values. This patch accomplishes this by keeping the two in sync, and saving the current 'device_type' in the visitor.
1 parent e0c1e23 commit 0ae9dac

File tree

3 files changed

+168
-1
lines changed

3 files changed

+168
-1
lines changed

clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp

+64-1
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,27 @@ class OpenACCClauseCIREmitter final
4646
// diagnostics are gone.
4747
SourceLocation dirLoc;
4848

49+
const OpenACCDeviceTypeClause *lastDeviceTypeClause = nullptr;
50+
4951
void clauseNotImplemented(const OpenACCClause &c) {
5052
cgf.cgm.errorNYI(c.getSourceRange(), "OpenACC Clause", c.getClauseKind());
5153
}
5254

55+
mlir::Value createIntExpr(const Expr *intExpr) {
56+
mlir::Value expr = cgf.emitScalarExpr(intExpr);
57+
mlir::Location exprLoc = cgf.cgm.getLoc(intExpr->getBeginLoc());
58+
59+
mlir::IntegerType targetType = mlir::IntegerType::get(
60+
&cgf.getMLIRContext(), cgf.getContext().getIntWidth(intExpr->getType()),
61+
intExpr->getType()->isSignedIntegerOrEnumerationType()
62+
? mlir::IntegerType::SignednessSemantics::Signed
63+
: mlir::IntegerType::SignednessSemantics::Unsigned);
64+
65+
auto conversionOp = builder.create<mlir::UnrealizedConversionCastOp>(
66+
exprLoc, targetType, expr);
67+
return conversionOp.getResult(0);
68+
}
69+
5370
// 'condition' as an OpenACC grammar production is used for 'if' and (some
5471
// variants of) 'self'. It needs to be emitted as a signless-1-bit value, so
5572
// this function emits the expression, then sets the unrealized conversion
@@ -109,14 +126,15 @@ class OpenACCClauseCIREmitter final
109126
}
110127

111128
void VisitDeviceTypeClause(const OpenACCDeviceTypeClause &clause) {
129+
lastDeviceTypeClause = &clause;
112130
if constexpr (isOneOfTypes<OpTy, InitOp, ShutdownOp>) {
113131
llvm::SmallVector<mlir::Attribute> deviceTypes;
114132
std::optional<mlir::ArrayAttr> existingDeviceTypes =
115133
operation.getDeviceTypes();
116134

117135
// Ensure we keep the existing ones, and in the correct 'new' order.
118136
if (existingDeviceTypes) {
119-
for (const mlir::Attribute &Attr : *existingDeviceTypes)
137+
for (mlir::Attribute Attr : *existingDeviceTypes)
120138
deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
121139
builder.getContext(),
122140
cast<mlir::acc::DeviceTypeAttr>(Attr).getValue()));
@@ -136,6 +154,51 @@ class OpenACCClauseCIREmitter final
136154
if (!clause.getArchitectures().empty())
137155
operation.setDeviceType(
138156
decodeDeviceType(clause.getArchitectures()[0].getIdentifierInfo()));
157+
} else if constexpr (isOneOfTypes<OpTy, ParallelOp, SerialOp, KernelsOp>) {
158+
// Nothing to do here, these constructs don't have any IR for these, as
159+
// they just modify the other clauses IR. So setting of `lastDeviceType`
160+
// (done above) is all we need.
161+
} else {
162+
return clauseNotImplemented(clause);
163+
}
164+
}
165+
166+
void VisitNumWorkersClause(const OpenACCNumWorkersClause &clause) {
167+
if constexpr (isOneOfTypes<OpTy, ParallelOp, KernelsOp>) {
168+
// Collect the 'existing' device-type attributes so we can re-create them
169+
// and insert them.
170+
llvm::SmallVector<mlir::Attribute> deviceTypes;
171+
mlir::ArrayAttr existingDeviceTypes =
172+
operation.getNumWorkersDeviceTypeAttr();
173+
174+
if (existingDeviceTypes) {
175+
for (mlir::Attribute Attr : existingDeviceTypes)
176+
deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
177+
builder.getContext(),
178+
cast<mlir::acc::DeviceTypeAttr>(Attr).getValue()));
179+
}
180+
181+
// Insert 1 version of the 'int-expr' to the NumWorkers list per-current
182+
// device type.
183+
mlir::Value intExpr = createIntExpr(clause.getIntExpr());
184+
if (lastDeviceTypeClause) {
185+
for (const DeviceTypeArgument &arg :
186+
lastDeviceTypeClause->getArchitectures()) {
187+
deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
188+
builder.getContext(), decodeDeviceType(arg.getIdentifierInfo())));
189+
operation.getNumWorkersMutable().append(intExpr);
190+
}
191+
} else {
192+
// Else, we just add a single for 'none'.
193+
deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
194+
builder.getContext(), mlir::acc::DeviceType::None));
195+
operation.getNumWorkersMutable().append(intExpr);
196+
}
197+
198+
operation.setNumWorkersDeviceTypeAttr(
199+
mlir::ArrayAttr::get(builder.getContext(), deviceTypes));
200+
} else if constexpr (isOneOfTypes<OpTy, SerialOp>) {
201+
llvm_unreachable("num_workers not valid on serial");
139202
} else {
140203
return clauseNotImplemented(clause);
141204
}

clang/test/CIR/CodeGenOpenACC/kernels.c

+52
Original file line numberDiff line numberDiff line change
@@ -106,5 +106,57 @@ void acc_kernels(int cond) {
106106
// CHECK-NEXT: acc.terminator
107107
// CHECK-NEXT: } loc
108108

109+
#pragma acc kernels num_workers(cond)
110+
{}
111+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
112+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
113+
// CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32) {
114+
// CHECK-NEXT: acc.terminator
115+
// CHECK-NEXT: } loc
116+
117+
#pragma acc kernels num_workers(cond) device_type(nvidia) num_workers(2u)
118+
{}
119+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
120+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
121+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !u32i
122+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !u32i to ui32
123+
// CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : ui32 [#acc.device_type<nvidia>]) {
124+
// CHECK-NEXT: acc.terminator
125+
// CHECK-NEXT: } loc
126+
127+
#pragma acc kernels num_workers(cond) device_type(nvidia, host) num_workers(2) device_type(radeon) num_workers(3)
128+
{}
129+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
130+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
131+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
132+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
133+
// CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
134+
// CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
135+
// CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[TWO_CAST]] : si32 [#acc.device_type<host>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
136+
// CHECK-NEXT: acc.terminator
137+
// CHECK-NEXT: } loc
138+
139+
#pragma acc kernels num_workers(cond) device_type(nvidia) num_workers(2) device_type(radeon, multicore) num_workers(3)
140+
{}
141+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
142+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
143+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
144+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
145+
// CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
146+
// CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
147+
// CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>], %[[THREE_CAST]] : si32 [#acc.device_type<multicore>]) {
148+
// CHECK-NEXT: acc.terminator
149+
// CHECK-NEXT: } loc
150+
151+
#pragma acc kernels device_type(nvidia) num_workers(2) device_type(radeon) num_workers(3)
152+
{}
153+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
154+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
155+
// CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
156+
// CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
157+
// CHECK-NEXT: acc.kernels num_workers(%[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
158+
// CHECK-NEXT: acc.terminator
159+
// CHECK-NEXT: } loc
160+
109161
// CHECK-NEXT: cir.return
110162
}

clang/test/CIR/CodeGenOpenACC/parallel.c

+52
Original file line numberDiff line numberDiff line change
@@ -105,5 +105,57 @@ void acc_parallel(int cond) {
105105
// CHECK-NEXT: acc.yield
106106
// CHECK-NEXT: } loc
107107

108+
#pragma acc parallel num_workers(cond)
109+
{}
110+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
111+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
112+
// CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32) {
113+
// CHECK-NEXT: acc.yield
114+
// CHECK-NEXT: } loc
115+
116+
#pragma acc parallel num_workers(cond) device_type(nvidia) num_workers(2u)
117+
{}
118+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
119+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
120+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !u32i
121+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !u32i to ui32
122+
// CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : ui32 [#acc.device_type<nvidia>]) {
123+
// CHECK-NEXT: acc.yield
124+
// CHECK-NEXT: } loc
125+
126+
#pragma acc parallel num_workers(cond) device_type(nvidia, host) num_workers(2) device_type(radeon) num_workers(3)
127+
{}
128+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
129+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
130+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
131+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
132+
// CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
133+
// CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
134+
// CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[TWO_CAST]] : si32 [#acc.device_type<host>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
135+
// CHECK-NEXT: acc.yield
136+
// CHECK-NEXT: } loc
137+
138+
#pragma acc parallel num_workers(cond) device_type(nvidia) num_workers(2) device_type(radeon, multicore) num_workers(4)
139+
{}
140+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
141+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
142+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
143+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
144+
// CHECK-NEXT: %[[FOUR_LITERAL:.*]] = cir.const #cir.int<4> : !s32i
145+
// CHECK-NEXT: %[[FOUR_CAST:.*]] = builtin.unrealized_conversion_cast %[[FOUR_LITERAL]] : !s32i to si32
146+
// CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[FOUR_CAST]] : si32 [#acc.device_type<radeon>], %[[FOUR_CAST]] : si32 [#acc.device_type<multicore>]) {
147+
// CHECK-NEXT: acc.yield
148+
// CHECK-NEXT: } loc
149+
150+
#pragma acc parallel device_type(nvidia) num_workers(2) device_type(radeon) num_workers(3)
151+
{}
152+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
153+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
154+
// CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
155+
// CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
156+
// CHECK-NEXT: acc.parallel num_workers(%[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
157+
// CHECK-NEXT: acc.yield
158+
// CHECK-NEXT: } loc
159+
108160
// CHECK-NEXT: cir.return
109161
}

0 commit comments

Comments
 (0)