[InstCombine] Combine or-disjoint (and->mul), (and->mul) to and->mul #136013
Conversation
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-llvm-transforms

Author: Jeffrey Byrnes (jrbyrnes)

Changes

The canonical pattern for a bitmasked mul is currently

  %val = and %x, %bitMask // where %bitMask is some constant

In certain cases, where we are combining multiple of these bitmasked muls with common factors, we are able to optimize into and->mul (see #135274). This optimization lends itself to further optimizations, and this PR addresses one such optimization. In cases where we have

  or-disjoint ( mul(and (X, C1), D) , mul (and (X, C2), D) )

we can combine into

  mul( and (X, (C1 + C2)), D )

provided C1 and C2 are disjoint.

Generalized proof: https://alive2.llvm.org/ce/z/MQYMui

Full diff: https://github.com/llvm/llvm-project/pull/136013.diff

2 Files Affected:
- llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
- llvm/test/Transforms/InstCombine/or.ll
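For intuition, the rewrite can be sketched step by step (this derivation is an editorial addition; the formal proof is the Alive2 link above):

  or-disjoint ( mul(and (X, C1), D) , mul (and (X, C2), D) )
    = mul(and (X, C1), D) + mul(and (X, C2), D)   // the or is disjoint, so it acts as an add
    = mul( and (X, C1) + and (X, C2), D )         // factor out the common multiplier D
    = mul( and (X, (C1 + C2)), D )                // C1 and C2 share no set bits, so the adds cannot carry

The disjointness of C1 and C2 is what makes the last step valid: and (X, C1) and and (X, C2) have no set bits in common, so their sum equals their or, which is and (X, C1 | C2) = and (X, C1 + C2).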
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 6cc241781d112..206131ab4a6a7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -3643,6 +3643,27 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
foldAddLikeCommutative(I.getOperand(1), I.getOperand(0),
/*NSW=*/true, /*NUW=*/true))
return R;
+
+ Value *LHSOp = nullptr, *RHSOp = nullptr;
+ const APInt *LHSConst = nullptr, *RHSConst = nullptr;
+
+  // ((X & C1) * D) + ((X & C2) * D) -> ((X & (C1 + C2)) * D)
+ if (match(I.getOperand(0), m_Mul(m_Value(LHSOp), m_APInt(LHSConst))) &&
+ match(I.getOperand(1), m_Mul(m_Value(RHSOp), m_APInt(RHSConst))) &&
+ LHSConst == RHSConst) {
+ Value *LHSBase = nullptr, *RHSBase = nullptr;
+ const APInt *LHSMask = nullptr, *RHSMask = nullptr;
+ if (match(LHSOp, m_And(m_Value(LHSBase), m_APInt(LHSMask))) &&
+ match(RHSOp, m_And(m_Value(RHSBase), m_APInt(RHSMask))) &&
+ LHSBase == RHSBase &&
+ ((*LHSMask & *RHSMask) == APInt::getZero(LHSMask->getBitWidth()))) {
+ auto NewAnd = Builder.CreateAnd(
+ LHSBase, ConstantInt::get(LHSOp->getType(), (*LHSMask + *RHSMask)));
+
+ return BinaryOperator::CreateMul(
+ NewAnd, ConstantInt::get(NewAnd->getType(), *LHSConst));
+ }
+ }
}
Value *X, *Y;
diff --git a/llvm/test/Transforms/InstCombine/or.ll b/llvm/test/Transforms/InstCombine/or.ll
index 95f89e4ce11cd..777387cc662d6 100644
--- a/llvm/test/Transforms/InstCombine/or.ll
+++ b/llvm/test/Transforms/InstCombine/or.ll
@@ -1281,10 +1281,10 @@ define <16 x i1> @test51(<16 x i1> %arg, <16 x i1> %arg1) {
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i1> [[ARG:%.*]], <16 x i1> [[ARG1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 23, i32 24, i32 9, i32 10, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: ret <16 x i1> [[TMP3]]
;
- %tmp = and <16 x i1> %arg, <i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>
- %tmp2 = and <16 x i1> %arg1, <i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true>
- %tmp3 = or <16 x i1> %tmp, %tmp2
- ret <16 x i1> %tmp3
+ %out = and <16 x i1> %arg, <i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false>
+ %out2 = and <16 x i1> %arg1, <i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true>
+ %out3 = or <16 x i1> %out, %out2
+ ret <16 x i1> %out3
}
; This would infinite loop because it reaches a transform
@@ -2035,3 +2035,82 @@ define i32 @or_xor_and_commuted3(i32 %x, i32 %y, i32 %z) {
%or1 = or i32 %xor, %yy
ret i32 %or1
}
+
+define i32 @or_combine_mul_and1(i32 %in) {
+; CHECK-LABEL: @or_combine_mul_and1(
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IN:%.*]], 6
+; CHECK-NEXT: [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72
+; CHECK-NEXT: ret i32 [[OUT]]
+;
+ %bitop0 = and i32 %in, 2
+ %out0 = mul i32 %bitop0, 72
+ %bitop1 = and i32 %in, 4
+ %out1 = mul i32 %bitop1, 72
+ %out = or disjoint i32 %out0, %out1
+ ret i32 %out
+}
+
+define i32 @or_combine_mul_and2(i32 %in) {
+; CHECK-LABEL: @or_combine_mul_and2(
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IN:%.*]], 10
+; CHECK-NEXT: [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72
+; CHECK-NEXT: ret i32 [[OUT]]
+;
+ %bitop0 = and i32 %in, 2
+ %out0 = mul i32 %bitop0, 72
+ %bitop1 = and i32 %in, 8
+ %out1 = mul i32 %bitop1, 72
+ %out = or disjoint i32 %out0, %out1
+ ret i32 %out
+}
+
+define i32 @or_combine_mul_and_diff_factor(i32 %in) {
+; CHECK-LABEL: @or_combine_mul_and_diff_factor(
+; CHECK-NEXT: [[BITOP0:%.*]] = and i32 [[IN:%.*]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = mul nuw nsw i32 [[BITOP0]], 36
+; CHECK-NEXT: [[BITOP1:%.*]] = and i32 [[IN]], 4
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i32 [[BITOP1]], 72
+; CHECK-NEXT: [[OUT:%.*]] = or disjoint i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: ret i32 [[OUT]]
+;
+ %bitop0 = and i32 %in, 2
+ %out0 = mul i32 %bitop0, 36
+ %bitop1 = and i32 %in, 4
+ %out1 = mul i32 %bitop1, 72
+ %out = or disjoint i32 %out0, %out1
+ ret i32 %out
+}
+
+define i32 @or_combine_mul_and_diff_base(i32 %in, i32 %in1) {
+; CHECK-LABEL: @or_combine_mul_and_diff_base(
+; CHECK-NEXT: [[BITOP0:%.*]] = and i32 [[IN:%.*]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = mul nuw nsw i32 [[BITOP0]], 72
+; CHECK-NEXT: [[BITOP1:%.*]] = and i32 [[IN1:%.*]], 4
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i32 [[BITOP1]], 72
+; CHECK-NEXT: [[OUT:%.*]] = or disjoint i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: ret i32 [[OUT]]
+;
+ %bitop0 = and i32 %in, 2
+ %out0 = mul i32 %bitop0, 72
+ %bitop1 = and i32 %in1, 4
+ %out1 = mul i32 %bitop1, 72
+ %out = or disjoint i32 %out0, %out1
+ ret i32 %out
+}
+
+define i32 @or_combine_mul_and_decomposed(i32 %in) {
+; CHECK-LABEL: @or_combine_mul_and_decomposed(
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[IN:%.*]] to i1
+; CHECK-NEXT: [[OUT0:%.*]] = select i1 [[TMP2]], i32 72, i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IN]], 4
+; CHECK-NEXT: [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72
+; CHECK-NEXT: [[OUT1:%.*]] = or disjoint i32 [[OUT0]], [[OUT]]
+; CHECK-NEXT: ret i32 [[OUT1]]
+;
+ %bitop0 = and i32 %in, 1
+ %out0 = mul i32 %bitop0, 72
+ %bitop1 = and i32 %in, 4
+ %out1 = mul i32 %bitop1, 72
+ %out = or disjoint i32 %out0, %out1
+ ret i32 %out
+}
I plan to have 3 or so extensions on #135274 -- the plan is to resolve merge conflicts by extracting this code into a separate function.
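A minimal sketch of what that extraction might look like (an editorial illustration, not code from this PR; the helper name foldDisjointBitmaskedMul is hypothetical, and unlike the hunk above it compares the two multiplier values directly rather than the matched APInt pointers):

// Editorial sketch only -- not part of this PR. Shows how the combine added
// in visitOr() above could live in a standalone helper, so the caller would
// just do: if (Instruction *R = foldDisjointBitmaskedMul(I, Builder)) return R;
#include "llvm/ADT/APInt.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// (or disjoint ((X & C1) * D), ((X & C2) * D)) -> ((X & (C1 + C2)) * D)
// when the masks C1 and C2 have no set bits in common.
static Instruction *foldDisjointBitmaskedMul(BinaryOperator &I,
                                             InstCombiner::BuilderTy &Builder) {
  Value *LHSOp, *RHSOp;
  const APInt *LHSConst, *RHSConst;
  // Both sides must be a mul by a constant, and the multipliers must match.
  if (!match(I.getOperand(0), m_Mul(m_Value(LHSOp), m_APInt(LHSConst))) ||
      !match(I.getOperand(1), m_Mul(m_Value(RHSOp), m_APInt(RHSConst))) ||
      *LHSConst != *RHSConst)
    return nullptr;

  Value *LHSBase, *RHSBase;
  const APInt *LHSMask, *RHSMask;
  // Both mul operands must mask the same base value with disjoint constants.
  if (!match(LHSOp, m_And(m_Value(LHSBase), m_APInt(LHSMask))) ||
      !match(RHSOp, m_And(m_Value(RHSBase), m_APInt(RHSMask))) ||
      LHSBase != RHSBase || !(*LHSMask & *RHSMask).isZero())
    return nullptr;

  // Merge the disjoint masks into a single and, keeping the shared multiplier.
  Value *NewAnd = Builder.CreateAnd(
      LHSBase, ConstantInt::get(LHSBase->getType(), *LHSMask + *RHSMask));
  return BinaryOperator::CreateMul(
      NewAnd, ConstantInt::get(NewAnd->getType(), *LHSConst));
}

The matching logic mirrors the hunk above; only the packaging differs.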
Converted to draft / planned changes: on second thought, it makes more sense to integrate this with #135274.