Skip to content

Commit 6f6f386

Browse files
iwwuigcbot
authored andcommitted
Enable PropagateCmpUniformity pass by default
This pass improves scalarization performance by propagating uniform values through conditional branches. The optimization is applied to non-RT shaders. Bug fixes in some special cases.
1 parent c732987 commit 6f6f386

File tree

4 files changed

+164
-31
lines changed

4 files changed

+164
-31
lines changed

IGC/Compiler/CISACodeGen/PropagateCmpUniformity.cpp

Lines changed: 49 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ using namespace IGC;
2828
IGC_INITIALIZE_PASS_BEGIN(PropagateCmpUniformity, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
2929
IGC_INITIALIZE_PASS_DEPENDENCY(WIAnalysis)
3030
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
31+
IGC_INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
3132
IGC_INITIALIZE_PASS_END(PropagateCmpUniformity, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
3233

3334
char PropagateCmpUniformity::ID = 0;
@@ -125,50 +126,71 @@ bool PropagateCmpUniformity::getUniformNonUniformPair(Value *op0, Value *op1, Va
125126
}
126127
}
127128

128-
bool PropagateCmpUniformity::canReplaceUse(Use &U, BasicBlock *trueBranch, BasicBlock *cmpBB, DominatorTree &DT) {
129+
bool PropagateCmpUniformity::canReplaceUse(Use &U, BasicBlock *trueBranchBB, BasicBlock *cmpBB, DominatorTree &DT) {
129130
User *user = U.getUser();
130131
BasicBlock *useBB = nullptr;
131132
BasicBlock *incomingBB = nullptr;
132133

133134
if (PHINode *phi = dyn_cast<PHINode>(user)) {
134135
useBB = phi->getParent();
135136
incomingBB = phi->getIncomingBlock(U);
136-
137-
// Special case: PHI in trueBranch with incoming edge directly from cmpBB.
138-
// Equality holds on the cmpBB trueBranch edge, so replacing the incoming
139-
// nonUniform value with uniform is valid — but only when falseBranch (the
140-
// other successor of cmpBB) is NOT a direct predecessor of trueBranch.
141-
// If falseBranch also feeds trueBranch, CFGSimplification may later merge
142-
// an empty falseBranch into cmpBB, producing two incoming edges from the
143-
// same block and collapsing the PHI incorrectly (e.g. to a zero constant).
144-
if (incomingBB == cmpBB && useBB == trueBranch) {
145-
BranchInst *cmpBr = cast<BranchInst>(cmpBB->getTerminator());
146-
BasicBlock *falseBranch =
147-
(cmpBr->getSuccessor(0) == trueBranch) ? cmpBr->getSuccessor(1) : cmpBr->getSuccessor(0);
148-
for (BasicBlock *pred : predecessors(trueBranch))
149-
if (pred == falseBranch)
150-
return false;
151-
return true;
152-
}
153-
154137
} else if (Instruction *inst = dyn_cast<Instruction>(user)) {
155138
useBB = inst->getParent();
156139
incomingBB = useBB;
157140
} else {
158141
return false;
159142
}
160143

161-
// trueBranch must have cmpBB as its only predecessor. If trueBranch is a
162-
// join point with multiple predecessors, it is reachable without going
163-
// through the equality-proven edge, so the equality may not hold on all
164-
// paths to the use even if trueBranch dominates it.
165-
if (trueBranch->getSinglePredecessor() != cmpBB)
144+
// Compute falseBranchBB once for PHI uses; nullptr for non-PHI.
145+
BasicBlock *falseBranchBB = nullptr;
146+
if (isa<PHINode>(user)) {
147+
BranchInst *cmpBr = cast<BranchInst>(cmpBB->getTerminator());
148+
falseBranchBB = (cmpBr->getSuccessor(0) == trueBranchBB) ? cmpBr->getSuccessor(1) : cmpBr->getSuccessor(0);
149+
}
150+
151+
// Special case: PHI in trueBranchBB with incoming edge directly from cmpBB.
152+
// Equality holds on cmpBB->trueBranchBB, so the replacement is valid — unless
153+
// falseBranchBB can also reach trueBranchBB, either directly or via intermediate
154+
// blocks (e.g. created by JumpThreading). In that case a later CFGSimplification
155+
// would collapse the intermediate blocks back and corrupt the PHI.
156+
if (incomingBB == cmpBB && useBB == trueBranchBB) {
157+
// Sub-case: cmpBB's two successors are the same block (both edges to trueBranchBB).
158+
if (falseBranchBB == trueBranchBB)
159+
return false;
160+
// Sub-case: falseBranchBB is a direct predecessor of trueBranchBB.
161+
for (BasicBlock *pred : predecessors(trueBranchBB))
162+
if (pred == falseBranchBB)
163+
return false;
164+
// Sub-case: falseBranchBB reaches trueBranchBB via intermediate blocks that are
165+
// dominated by cmpBB (i.e. they lie on the false-branch path). After a later
166+
// CFGSimplification those intermediates would be merged away, corrupting the PHI.
167+
for (BasicBlock *pred : predecessors(trueBranchBB))
168+
if (pred != cmpBB && DT.dominates(cmpBB, pred))
169+
return false;
170+
return true;
171+
}
172+
173+
// trueBranchBB must have cmpBB as its only predecessor; otherwise equality may
174+
// not hold on all paths to the use.
175+
if (trueBranchBB->getSinglePredecessor() != cmpBB)
166176
return false;
167177

168-
// Only replace if trueBranch dominates the use's incoming BB.
178+
// For PHI uses: reject if falseBranchBB or cmpBB is also a direct predecessor
179+
// of useBB. CFGSimplification would later collapse those edges back to cmpBB,
180+
// overwriting the non-equality incoming value with the uniform constant.
181+
// (The cmpBB check covers the case where an earlier CFGSimplification pass
182+
// already merged trueBranchBB away, leaving cmpBB with a direct false edge to
183+
// useBB where the equality guarantee does not hold.)
184+
if (falseBranchBB) {
185+
for (BasicBlock *pred : predecessors(useBB))
186+
if (pred == falseBranchBB || pred == cmpBB)
187+
return false;
188+
}
189+
190+
// Only replace if trueBranchBB dominates the use's incomingBB.
169191
// Together with the single-predecessor check above, this guarantees every
170-
// path to the use went through the equality-proven cmpBB→trueBranch edge.
171-
return DT.dominates(trueBranch, incomingBB);
192+
// path to the use went through the equality-proven cmpBB->trueBranchBB edge.
193+
return DT.dominates(trueBranchBB, incomingBB);
172194
}
173195

174196
bool PropagateCmpUniformity::replaceNonUniformWithUniform(CmpInst *cmp, Value *nonUniform, Value *uniform,

IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1614,7 +1614,10 @@ void OptimizeIR(CodeGenContext *const pContext) {
16141614
mpm.add(llvm::createJumpThreadingPass(false, BBDuplicateThreshold));
16151615
#endif // LLVM_VERSION_MAJOR
16161616
}
1617-
if (IGC_IS_FLAG_ENABLED(EnablePropagateCmpUniformity)) {
1617+
// RT shaders have many short equality branches (dispatch patterns)
1618+
// where live range extension costs outweigh scalarization benefits at SIMD8,
1619+
// and FunctionMultiversioning is disabled for RT (no compounding benefit).
1620+
if (IGC_IS_FLAG_ENABLED(EnablePropagateCmpUniformity) && pContext->type != ShaderType::RAYTRACING_SHADER) {
16181621
mpm.add(createPropagateCmpUniformityPass());
16191622
}
16201623
mpm.add(llvm::createCFGSimplificationPass());

IGC/Compiler/tests/PropagateCmpUniformity/cmpUniform.ll

Lines changed: 110 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -478,15 +478,113 @@ exit:
478478
ret void
479479
}
480480

481+
; ============================================================================
482+
; Test 15: falseBranch is also a direct predecessor of the PHI's parent block
483+
; (useBB) — no replacement allowed.
484+
;
485+
; CFG produced by SROA when a dead ternary alloca is promoted:
486+
;
487+
; cmpBB: fcmp oeq %val, 0.0
488+
; - ternary.true (trueBranch, empty: br endBB)
489+
; - ternary.false (falseBranch, empty: br endBB)
490+
; |
491+
; V
492+
; endBB: phi [%val, ternary.true], [%val, ternary.false]
493+
;
494+
; Must NOT replace %val in the [%val, %ternary.true] incoming because
495+
; falseBranch is also a direct predecessor of endBB (useBB). If it did,
496+
; CFGSimplification would later remove both empty blocks, collapsing the
497+
; two incoming edges back to cmpBB and discarding the ternary.false value,
498+
; turning the PHI into the constant 0.0. This caused incorrect rendering in
499+
; a GS shader where the diagonal faceNormal components were zeroed out.
500+
; ============================================================================
501+
; CHECK-LABEL: @test_falsebranch_also_pred_of_usebb(
502+
define spir_kernel void @test_falsebranch_also_pred_of_usebb(ptr addrspace(1) %out) {
503+
entry:
504+
%tid = call i32 @llvm.genx.GenISA.DCL.SystemValue.i32(i32 17)
505+
%val = uitofp i32 %tid to float
506+
%cmp = fcmp oeq float %val, 0.000000e+00
507+
br i1 %cmp, label %ternary.true, label %ternary.false
508+
509+
ternary.true:
510+
br label %endBB
511+
512+
ternary.false:
513+
br label %endBB
514+
515+
endBB:
516+
; falseBranch (%ternary.false) is a direct predecessor of endBB, so
517+
; the [%val, %ternary.true] arm must not be replaced with 0.0.
518+
; CHECK-LABEL: endBB:
519+
; CHECK: %phi_val = phi float [ %val, %ternary.true ], [ %val, %ternary.false ]
520+
%phi_val = phi float [ %val, %ternary.true ], [ %val, %ternary.false ]
521+
store float %phi_val, ptr addrspace(1) %out, align 4
522+
ret void
523+
}
524+
525+
; ============================================================================
526+
; Test 16: falseBranch reaches trueBranchBB (useBB) via an intermediate block
527+
; that is dominated by cmpBB — no replacement allowed.
528+
;
529+
; This models the CFG produced by JumpThreading when it threads the true path of
530+
; a ternary-style conditional directly to the downstream join block (endBB),
531+
; while the false path still goes through intermediate blocks:
532+
;
533+
; cmpBB: fcmp oeq %val, 0.0
534+
; true ------------------------------> endBB
535+
; false --> ternary.false --> ternary.end --> endBB
536+
;
537+
; PCU's Check 1 fires: incomingBB==cmpBB and useBB==trueBranchBB==endBB.
538+
; The direct-predecessor guard (sub-case A, Test 15) does not help because
539+
; ternary.false is NOT a direct predecessor of endBB. The new guard catches
540+
; it: ternary.end is a predecessor of endBB, is not cmpBB, and IS dominated
541+
; by cmpBB — indicating it lies on the false-branch path.
542+
;
543+
; Must NOT replace %val in [%val, %entry] because ternary.end reaches endBB
544+
; without the equality guarantee, so after CFGSimplification collapses
545+
; ternary.false->ternary.end the PHI would incorrectly receive the constant 0.0.
546+
; This was the root cause of a GS shader rendering bug (IGC-13513) where the
547+
; diagonal faceNormal component was zeroed out.
548+
; ============================================================================
549+
; CHECK-LABEL: @test_indirect_falsebranch(
550+
define spir_kernel void @test_indirect_falsebranch(ptr addrspace(1) %out) {
551+
entry:
552+
%tid = call i32 @llvm.genx.GenISA.DCL.SystemValue.i32(i32 17)
553+
%val = uitofp i32 %tid to float
554+
%cmp = fcmp oeq float %val, 0.000000e+00
555+
; True edge goes directly to endBB (as if JumpThreading threaded ternary.true away).
556+
; False edge goes to ternary.false, which eventually also reaches endBB.
557+
br i1 %cmp, label %endBB, label %ternary.false
558+
559+
ternary.false:
560+
; Intermediate block dominated by entry (=cmpBB); no equality guarantee here.
561+
br label %ternary.end
562+
563+
ternary.end:
564+
; Also dominated by entry (=cmpBB); direct predecessor of endBB.
565+
br label %endBB
566+
567+
endBB:
568+
; [%val, %entry] — incoming from cmpBB on the true (equality) edge.
569+
; [%val, %ternary.end] — incoming from the false-path intermediate block.
570+
; The [%val, %entry] arm must NOT be replaced with 0.0: ternary.end reaches
571+
; here without equality being guaranteed.
572+
; CHECK-LABEL: endBB:
573+
; CHECK: %phi_val = phi float [ %val, %entry ], [ %val, %ternary.end ]
574+
%phi_val = phi float [ %val, %entry ], [ %val, %ternary.end ]
575+
store float %phi_val, ptr addrspace(1) %out, align 4
576+
ret void
577+
}
578+
481579
declare i32 @llvm.genx.GenISA.DCL.SystemValue.i32(i32) #0
482580

483581
attributes #0 = { nounwind readnone }
484582

485583
!IGCMetadata = !{!2}
486-
!igc.functions = !{!13, !22, !32, !42, !52, !62, !72, !82, !92, !102, !112, !122, !132, !142}
584+
!igc.functions = !{!13, !22, !32, !42, !52, !62, !72, !82, !92, !102, !112, !122, !132, !142, !152, !162}
487585

488586
!2 = !{!"ModuleMD", !3}
489-
!3 = !{!"FuncMD", !4, !5, !20, !21, !30, !31, !40, !41, !50, !51, !60, !61, !70, !71, !80, !81, !90, !91, !100, !101, !110, !111, !120, !121, !130, !131, !140, !141}
587+
!3 = !{!"FuncMD", !4, !5, !20, !21, !30, !31, !40, !41, !50, !51, !60, !61, !70, !71, !80, !81, !90, !91, !100, !101, !110, !111, !120, !121, !130, !131, !140, !141, !150, !151, !160, !161}
490588

491589
; Shared metadata nodes reused by all kernels
492590
!6 = !{!"localOffsets"}
@@ -569,3 +667,13 @@ attributes #0 = { nounwind readnone }
569667
!141 = !{!"FuncMDValue[13]", !6, !7, !11, !12}
570668
!142 = !{ptr @test_multi_cmpbb_truebb, !14}
571669

670+
; test_falsebranch_also_pred_of_usebb
671+
!150 = !{!"FuncMDMap[14]", ptr @test_falsebranch_also_pred_of_usebb}
672+
!151 = !{!"FuncMDValue[14]", !6, !7, !11, !12}
673+
!152 = !{ptr @test_falsebranch_also_pred_of_usebb, !14}
674+
675+
; test_indirect_falsebranch
676+
!160 = !{!"FuncMDMap[15]", ptr @test_indirect_falsebranch}
677+
!161 = !{!"FuncMDValue[15]", !6, !7, !11, !12}
678+
!162 = !{ptr @test_indirect_falsebranch, !14}
679+

IGC/common/igc_flags.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -544,7 +544,7 @@ DECLARE_IGC_REGKEY(bool, DisableIPConstantPropagation, false, "Disable Inter-pro
544544
DECLARE_IGC_REGKEY(bool, EnableSplitIndirectEEtoSel, true, "Enable the split indirect extractelement to icmp+sel pass",
545545
false)
546546
DECLARE_IGC_REGKEY(DWORD, SplitIndirectEEtoSelThreshold, 8, "Split indirect extractelement cost threshold", false)
547-
DECLARE_IGC_REGKEY(DWORD, EnablePropagateCmpUniformity, 0,
547+
DECLARE_IGC_REGKEY(DWORD, EnablePropagateCmpUniformity, 1,
548548
"Enable propagation of compare-based uniformity: replace non-uniform/divergent values with uniform "
549549
"values in dominated BBs as determined by WIAnalysis (1 enable, 2 enable and print)",
550550
false)

0 commit comments

Comments
 (0)