Skip to content

Commit eae4d72

Browse files
[openacc] Attach Parallelism Levels to Auto Loops (llvm#200884)
Auto loops are analyzed by the compiler in later compilation stages to determine whether they can be parallelized. These loops may carry parallelism levels; this does not guarantee that they are parallelizable, and the compiler should still analyze them. However, if a loop is parallelized, the parallelism levels specified in the source should be respected. This change attaches the parallelism levels to auto loops, which enables their propagation through subsequent compilation steps.
1 parent 8c30cb2 commit eae4d72

2 files changed

Lines changed: 36 additions & 0 deletions

File tree

mlir/lib/Dialect/OpenACC/Transforms/ACCComputeLowering.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,13 @@ class ACCLoopConversion : public OpRewritePattern<LoopOp> {
314314
convertACCLoopToSCFFor(loopOp, rewriter, /*enableCollapse=*/true);
315315
if (!forOp)
316316
return failure();
317+
SmallVector<GPUParallelDimAttr> parDims =
318+
getParallelDimensions(loopOp, policy, deviceType);
319+
if (!parDims.empty()) {
320+
auto parDimsAttr =
321+
GPUParallelDimsAttr::get(loopOp->getContext(), parDims);
322+
setParDimsAttr(forOp, parDimsAttr);
323+
}
317324
rewriter.replaceOp(loopOp, forOp);
318325
} else if (!isOpInComputeRegion(loopOp) &&
319326
!isSpecializedAccRoutine(

mlir/test/Dialect/OpenACC/acc-compute-lowering-loop.mlir

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,3 +174,32 @@ func.func @device_routine_vector_with_loop(%buf: memref<8xi32>) attributes {acc.
174174
} attributes {independent = [#acc.device_type<none>], vector = [#acc.device_type<none>]}
175175
return
176176
}
177+
178+
// -----
179+
180+
// Auto loop with gang: lowered to scf.for with predetermined par_dims.
181+
// CHECK-LABEL: func.func @parallel_loop_auto_gang
182+
func.func @parallel_loop_auto_gang(%buf: memref<1xi32>) {
183+
%c0 = arith.constant 0 : index
184+
%c1_i32 = arith.constant 1 : i32
185+
%c10_i32 = arith.constant 10 : i32
186+
%c100_i32 = arith.constant 100 : i32
187+
188+
%dev = acc.copyin varPtr(%buf : memref<1xi32>) -> memref<1xi32>
189+
// CHECK-NOT: acc.parallel
190+
// CHECK: acc.kernel_environment
191+
// CHECK: acc.par_width {{.*}} {par_dim = #acc.par_dim<block_x>}
192+
// CHECK: acc.compute_region launch(
193+
// CHECK: scf.for
194+
// CHECK-NOT: scf.parallel
195+
// CHECK: acc.par_dims = #acc<par_dims[block_x]>
196+
acc.parallel num_gangs({%c10_i32 : i32}) dataOperands(%dev : memref<1xi32>) {
197+
acc.loop gang control(%arg0 : i32) = (%c1_i32 : i32) to (%c100_i32 : i32) step (%c1_i32 : i32) {
198+
memref.store %arg0, %dev[%c0] : memref<1xi32>
199+
acc.yield
200+
} attributes {auto_ = [#acc.device_type<none>]}
201+
acc.yield
202+
}
203+
acc.copyout accPtr(%dev : memref<1xi32>) to varPtr(%buf : memref<1xi32>)
204+
return
205+
}

0 commit comments

Comments
 (0)