-
Notifications
You must be signed in to change notification settings - Fork 11.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SimplifyCFG] Simplify switch instruction that has duplicate arms #114262
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-transforms Author: Michael Maitland (michaelmaitland) ChangesI noticed that the two C functions emitted different IR:
For
For the equivalent
For On RISC-V, prior to this patch, we generate the following code:
After this patch, the O3 code is optimized to the icmp + select pair, which This may help with both code size and further switch simplification. I found Full diff: https://github.com/llvm/llvm-project/pull/114262.diff 5 Files Affected:
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 72228b445a8b6e..f44364ea507b7b 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -276,6 +276,7 @@ class SimplifyCFGOpt {
bool simplifyCleanupReturn(CleanupReturnInst *RI);
bool simplifyUnreachable(UnreachableInst *UI);
bool simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder);
+ bool simplifyDuplicateSwitchArms(SwitchInst *SI);
bool simplifyIndirectBr(IndirectBrInst *IBI);
bool simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder);
bool simplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder);
@@ -7436,6 +7437,94 @@ static bool simplifySwitchOfCmpIntrinsic(SwitchInst *SI, IRBuilderBase &Builder,
return true;
}
+bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI) {
+ // Simplify the case where multiple arms contain only a terminator, the
+ // terminators are the same, and their sucessor PHIS incoming values are the
+ // same.
+
+ // Find BBs that are candidates for simplification.
+ SmallPtrSet<BasicBlock *, 8> BBs;
+ for (auto &Case : SI->cases()) {
+ BasicBlock *BB = Case.getCaseSuccessor();
+
+ // FIXME: This case needs some extra care because the terminators other than
+ // SI need to be updated.
+ if (!BB->hasNPredecessors(1))
+ continue;
+
+ // FIXME: Relax that the terminator is a BranchInst by checking for equality
+ // on other kinds of terminators.
+ Instruction *T = BB->getTerminator();
+ if (T && BB->size() == 1 && isa<BranchInst>(T))
+ BBs.insert(BB);
+ }
+
+ auto IsBranchEq = [](BranchInst *A, BranchInst *B) {
+ if (A->isConditional() != B->isConditional())
+ return false;
+
+ if (A->isConditional() && A->getCondition() != B->getCondition())
+ return false;
+
+ if (A->getNumSuccessors() != B->getNumSuccessors())
+ return false;
+
+ for (unsigned I = 0; I < A->getNumSuccessors(); ++I)
+ if (A->getSuccessor(I) != B->getSuccessor(I))
+ return false;
+
+ // Need to check that PHIs in sucessors have matching values
+ for (auto *Succ : A->successors()) {
+ for (PHINode &Phi : Succ->phis())
+ if (Phi.getIncomingValueForBlock(A->getParent()) !=
+ Phi.getIncomingValueForBlock(B->getParent()))
+ return false;
+ }
+
+ return true;
+ };
+
+ // Construct a map from candidate basic block to an equivalent basic block
+ // to replace it with. All equivalent basic blocks should be replaced with
+ // the same basic block. To do this, if there is no equivalent BB in the map,
+ // then insert into the map BB -> BB. Otherwise, we should check only elements
+ // in the map for equivalence to ensure that all equivalent BB get replaced
+ // by the BB in the map. Replacing BB with BB has no impact, so we skip
+ // a call to setSuccessor when we do the actual replacement.
+ DenseMap<BasicBlock *, BasicBlock *> ReplaceWith;
+ for (BasicBlock *BB : BBs) {
+ bool Inserted = false;
+ for (auto KV : ReplaceWith) {
+ if (IsBranchEq(cast<BranchInst>(BB->getTerminator()),
+ cast<BranchInst>(KV.first->getTerminator()))) {
+ ReplaceWith[BB] = KV.first;
+ Inserted = true;
+ break;
+ }
+ }
+ if (!Inserted)
+ ReplaceWith[BB] = BB;
+ }
+
+ // Do the replacement in SI.
+ bool MadeChange = false;
+ // There is no fast lookup of BasicBlock -> Cases, so we iterate over cases
+ // and check that the case was a candidate. BBs is already filtered, so
+ // hopefully calling contains on it is not too expensive.
+ for (auto &Case : SI->cases()) {
+ BasicBlock *OldSucc = Case.getCaseSuccessor();
+ if (!BBs.contains(OldSucc))
+ continue;
+ BasicBlock *NewSucc = ReplaceWith[OldSucc];
+ if (OldSucc != NewSucc) {
+ Case.setSuccessor(NewSucc);
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
+
bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
BasicBlock *BB = SI->getParent();
@@ -7496,6 +7585,9 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
hoistCommonCodeFromSuccessors(SI, !Options.HoistCommonInsts))
return requestResimplify();
+ if (simplifyDuplicateSwitchArms(SI))
+ return requestResimplify();
+
return false;
}
diff --git a/llvm/test/Transforms/SimplifyCFG/ForwardSwitchConditionToPHI.ll b/llvm/test/Transforms/SimplifyCFG/ForwardSwitchConditionToPHI.ll
index 8ad455eb9e7f22..4623eb2c5dd3c1 100644
--- a/llvm/test/Transforms/SimplifyCFG/ForwardSwitchConditionToPHI.ll
+++ b/llvm/test/Transforms/SimplifyCFG/ForwardSwitchConditionToPHI.ll
@@ -139,16 +139,14 @@ define i32 @PR34471(i32 %x) {
; NO_FWD-NEXT: switch i32 [[X:%.*]], label [[ELSE3:%.*]] [
; NO_FWD-NEXT: i32 17, label [[RETURN:%.*]]
; NO_FWD-NEXT: i32 19, label [[IF19:%.*]]
-; NO_FWD-NEXT: i32 42, label [[IF42:%.*]]
+; NO_FWD-NEXT: i32 42, label [[IF19]]
; NO_FWD-NEXT: ]
; NO_FWD: if19:
; NO_FWD-NEXT: br label [[RETURN]]
-; NO_FWD: if42:
-; NO_FWD-NEXT: br label [[RETURN]]
; NO_FWD: else3:
; NO_FWD-NEXT: br label [[RETURN]]
; NO_FWD: return:
-; NO_FWD-NEXT: [[R:%.*]] = phi i32 [ [[X]], [[IF19]] ], [ [[X]], [[IF42]] ], [ 0, [[ELSE3]] ], [ 17, [[ENTRY:%.*]] ]
+; NO_FWD-NEXT: [[R:%.*]] = phi i32 [ [[X]], [[IF19]] ], [ 0, [[ELSE3]] ], [ 17, [[ENTRY:%.*]] ]
; NO_FWD-NEXT: ret i32 [[R]]
;
; FWD-LABEL: @PR34471(
diff --git a/llvm/test/Transforms/SimplifyCFG/HoistCode.ll b/llvm/test/Transforms/SimplifyCFG/HoistCode.ll
index fe0b48028a3b62..fbe41d891c1ec5 100644
--- a/llvm/test/Transforms/SimplifyCFG/HoistCode.ll
+++ b/llvm/test/Transforms/SimplifyCFG/HoistCode.ll
@@ -65,14 +65,12 @@ define float @PR39535min_switch(i64 %i, float %x) {
; CHECK-NEXT: entry:
; CHECK-NEXT: switch i64 [[I:%.*]], label [[END:%.*]] [
; CHECK-NEXT: i64 1, label [[BB1:%.*]]
-; CHECK-NEXT: i64 2, label [[BB2:%.*]]
+; CHECK-NEXT: i64 2, label [[BB1]]
; CHECK-NEXT: ]
; CHECK: bb1:
; CHECK-NEXT: br label [[END]]
-; CHECK: bb2:
-; CHECK-NEXT: br label [[END]]
; CHECK: end:
-; CHECK-NEXT: [[COND:%.*]] = phi fast float [ [[X:%.*]], [[BB1]] ], [ [[X]], [[BB2]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[COND:%.*]] = phi fast float [ [[X:%.*]], [[BB1]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT: ret float [[COND]]
;
entry:
diff --git a/llvm/test/Transforms/SimplifyCFG/switch-dup-bbs.ll b/llvm/test/Transforms/SimplifyCFG/switch-dup-bbs.ll
new file mode 100644
index 00000000000000..b12db656fdf681
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/switch-dup-bbs.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=simplifycfg -S | FileCheck %s -check-prefix=SIMPLIFY-CFG
+; RUN: opt < %s -O3 -S | FileCheck %s -check-prefix=O3
+
+define i32 @switch_all_duplicate_arms(i32 %0, i32 %1, i32 %2, i32 %3) {
+; SIMPLIFY-CFG-LABEL: define i32 @switch_all_duplicate_arms(
+; SIMPLIFY-CFG-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], i32 [[TMP3:%.*]]) {
+; SIMPLIFY-CFG-NEXT: switch i32 [[TMP1]], label %[[BB6:.*]] [
+; SIMPLIFY-CFG-NEXT: i32 0, label %[[BB5:.*]]
+; SIMPLIFY-CFG-NEXT: i32 1, label %[[BB5]]
+; SIMPLIFY-CFG-NEXT: ]
+; SIMPLIFY-CFG: [[BB5]]:
+; SIMPLIFY-CFG-NEXT: br label %[[BB6]]
+; SIMPLIFY-CFG: [[BB6]]:
+; SIMPLIFY-CFG-NEXT: [[TMP8:%.*]] = phi i32 [ [[TMP3]], [[TMP4:%.*]] ], [ [[TMP2]], %[[BB5]] ]
+; SIMPLIFY-CFG-NEXT: ret i32 [[TMP8]]
+;
+; O3-LABEL: define i32 @switch_all_duplicate_arms(
+; O3-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], i32 [[TMP3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; O3-NEXT: [[SWITCH:%.*]] = icmp ult i32 [[TMP1]], 2
+; O3-NEXT: [[TMP8:%.*]] = select i1 [[SWITCH]], i32 [[TMP2]], i32 [[TMP3]]
+; O3-NEXT: ret i32 [[TMP8]]
+;
+ switch i32 %1, label %7 [
+ i32 0, label %5
+ i32 1, label %6
+ ]
+
+5:
+ br label %7
+
+6:
+ br label %7
+
+7:
+ %8 = phi i32 [ %3, %4 ], [ %2, %6 ], [ %2, %5 ]
+ ret i32 %8
+}
+
+define i32 @switch_some_duplicate_arms(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4) {
+; SIMPLIFY-CFG-LABEL: define i32 @switch_some_duplicate_arms(
+; SIMPLIFY-CFG-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], i32 [[TMP3:%.*]], i32 [[TMP4:%.*]]) {
+; SIMPLIFY-CFG-NEXT: switch i32 [[TMP1]], label %[[BB8:.*]] [
+; SIMPLIFY-CFG-NEXT: i32 0, label %[[BB6:.*]]
+; SIMPLIFY-CFG-NEXT: i32 1, label %[[BB6]]
+; SIMPLIFY-CFG-NEXT: i32 2, label %[[BB7:.*]]
+; SIMPLIFY-CFG-NEXT: ]
+; SIMPLIFY-CFG: [[BB6]]:
+; SIMPLIFY-CFG-NEXT: br label %[[BB8]]
+; SIMPLIFY-CFG: [[BB7]]:
+; SIMPLIFY-CFG-NEXT: br label %[[BB8]]
+; SIMPLIFY-CFG: [[BB8]]:
+; SIMPLIFY-CFG-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP3]], [[TMP5:%.*]] ], [ [[TMP4]], %[[BB7]] ], [ [[TMP2]], %[[BB6]] ]
+; SIMPLIFY-CFG-NEXT: ret i32 [[TMP10]]
+;
+; O3-LABEL: define i32 @switch_some_duplicate_arms(
+; O3-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], i32 [[TMP3:%.*]], i32 [[TMP4:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; O3-NEXT: switch i32 [[TMP1]], label %[[BB8:.*]] [
+; O3-NEXT: i32 0, label %[[BB6:.*]]
+; O3-NEXT: i32 1, label %[[BB6]]
+; O3-NEXT: i32 2, label %[[BB7:.*]]
+; O3-NEXT: ]
+; O3: [[BB6]]:
+; O3-NEXT: br label %[[BB8]]
+; O3: [[BB7]]:
+; O3-NEXT: br label %[[BB8]]
+; O3: [[BB8]]:
+; O3-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP3]], [[TMP5:%.*]] ], [ [[TMP4]], %[[BB7]] ], [ [[TMP2]], %[[BB6]] ]
+; O3-NEXT: ret i32 [[TMP10]]
+;
+ switch i32 %1, label %9 [
+ i32 0, label %6
+ i32 1, label %7
+ i32 2, label %8
+ ]
+
+6:
+ br label %9
+
+7:
+ br label %9
+
+8:
+ br label %9
+
+9:
+ %10 = phi i32 [ %3, %5 ], [ %4, %8 ], [ %2, %7 ], [ %2, %6 ]
+ ret i32 %10
+}
+
+define i32 @switch_duplicate_arms_multipred(i1 %0, i32 %1, i32 %2, i32 %3, i32 %4) {
+; SIMPLIFY-CFG-LABEL: define i32 @switch_duplicate_arms_multipred(
+; SIMPLIFY-CFG-SAME: i1 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], i32 [[TMP3:%.*]], i32 [[TMP4:%.*]]) {
+; SIMPLIFY-CFG-NEXT: br i1 [[TMP0]], label %[[BB6:.*]], label %[[BB7:.*]]
+; SIMPLIFY-CFG: [[BB6]]:
+; SIMPLIFY-CFG-NEXT: switch i32 [[TMP2]], label %[[BB9:.*]] [
+; SIMPLIFY-CFG-NEXT: i32 0, label %[[BB7]]
+; SIMPLIFY-CFG-NEXT: i32 1, label %[[BB8:.*]]
+; SIMPLIFY-CFG-NEXT: ]
+; SIMPLIFY-CFG: [[BB7]]:
+; SIMPLIFY-CFG-NEXT: br label %[[BB9]]
+; SIMPLIFY-CFG: [[BB8]]:
+; SIMPLIFY-CFG-NEXT: br label %[[BB9]]
+; SIMPLIFY-CFG: [[BB9]]:
+; SIMPLIFY-CFG-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP4]], %[[BB6]] ], [ [[TMP3]], %[[BB8]] ], [ [[TMP3]], %[[BB7]] ]
+; SIMPLIFY-CFG-NEXT: ret i32 [[TMP10]]
+;
+; O3-LABEL: define i32 @switch_duplicate_arms_multipred(
+; O3-SAME: i1 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], i32 [[TMP3:%.*]], i32 [[TMP4:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; O3-NEXT: br i1 [[TMP0]], label %[[BB6:.*]], label %[[BB7:.*]]
+; O3: [[BB6]]:
+; O3-NEXT: switch i32 [[TMP2]], label %[[BB9:.*]] [
+; O3-NEXT: i32 0, label %[[BB7]]
+; O3-NEXT: i32 1, label %[[BB8:.*]]
+; O3-NEXT: ]
+; O3: [[BB7]]:
+; O3-NEXT: br label %[[BB9]]
+; O3: [[BB8]]:
+; O3-NEXT: br label %[[BB9]]
+; O3: [[BB9]]:
+; O3-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP4]], %[[BB6]] ], [ [[TMP3]], %[[BB8]] ], [ [[TMP3]], %[[BB7]] ]
+; O3-NEXT: ret i32 [[TMP10]]
+;
+ br i1 %0, label %6, label %7
+6:
+ switch i32 %2, label %9 [
+ i32 0, label %7
+ i32 1, label %8
+ ]
+
+7:
+ br label %9
+
+8:
+ br label %9
+
+9:
+ %10 = phi i32 [ %4, %6 ], [ %3, %8 ], [ %3, %7 ]
+ ret i32 %10
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll b/llvm/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll
index 1e2f18b3f339d4..50998e447b71dc 100644
--- a/llvm/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll
+++ b/llvm/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll
@@ -272,16 +272,14 @@ define i8 @switch_to_select_two_case_results_no_default(i32 %i) {
; CHECK-NEXT: i32 0, label [[END:%.*]]
; CHECK-NEXT: i32 2, label [[END]]
; CHECK-NEXT: i32 4, label [[CASE3:%.*]]
-; CHECK-NEXT: i32 6, label [[CASE4:%.*]]
+; CHECK-NEXT: i32 6, label [[CASE3]]
; CHECK-NEXT: ]
; CHECK: case3:
; CHECK-NEXT: br label [[END]]
-; CHECK: case4:
-; CHECK-NEXT: br label [[END]]
; CHECK: default:
; CHECK-NEXT: unreachable
; CHECK: end:
-; CHECK-NEXT: [[T0:%.*]] = phi i8 [ 44, [[CASE3]] ], [ 44, [[CASE4]] ], [ 42, [[ENTRY:%.*]] ], [ 42, [[ENTRY]] ]
+; CHECK-NEXT: [[T0:%.*]] = phi i8 [ 44, [[CASE3]] ], [ 42, [[ENTRY:%.*]] ], [ 42, [[ENTRY]] ]
; CHECK-NEXT: ret i8 [[T0]]
;
entry:
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
I noticed that the two C functions emitted different IR: ``` int switch_duplicate_arms(int switch_val, int v, int w) { switch (switch_val) { default: break; case 0: w = v; break; case 1: w = v; break; } return w; } int if_duplicate_arms(int switch_val, int v, int w) { if (switch_val == 0) w = v; else if (switch_val == 1) w = v; return v0; } ``` For `switch_duplicate_arms`, we generate IR that looks like this: ``` define i32 @switch_duplicate_arms(i32 %0, i32 %1, i32 %2, i32 %3) { switch i32 %1, label %7 [ i32 0, label %5 i32 1, label %6 ] 5: br label %7 6: br label %7 7: %8 = phi i32 [ %3, %4 ], [ %2, %6 ], [ %2, %5 ] ret i32 %8 } ``` For the equivalent `if_duplicate_arms`, we generate: ``` define i32 @if_duplicate_arms(i32 %0, i32 %1, i32 %2, i32 %3) { %5 = icmp ult i32 %1, 2 %6 = select i1 %5, i32 %2, i32 %3 ret i32 %6 } ``` For `switch_duplicate_arms`, taking case 0 and 1 are the same since %5 and %6 branch to the same location and the incoming values for %8 are the same from those blocks. We could remove one on the duplicate switch targets and update the switch with the single target. On RISC-V, prior to this patch, we generate the following code: ``` switch_duplicate_arms: li a4, 1 beq a1, a4, .LBB0_2 mv a0, a3 bnez a1, .LBB0_3 .LBB0_2: mv a0, a2 .LBB0_3: ret if_duplicate_arms: li a4, 2 mv a0, a2 bltu a1, a4, .LBB1_2 mv a0, a3 .LBB1_2: ret ``` After this patch, the O3 code is optimized to the icmp + select pair, which gives us the same code gen as `if_duplicate_arms` as desired. This may help with both code size and further switch simplification. I found that this patch causes no significant impact to spec2006/int/ref and spec2017/intrate/ref.
eb7d34d
to
2db9f00
Compare
if (A->isConditional() != B->isConditional()) | ||
return false; | ||
|
||
if (A->isConditional() && A->getCondition() != B->getCondition()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should probably be areIdenticalUpToCommutativity(A->getCondition(), B->getCondition())
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated
for (auto *Succ : A->successors()) { | ||
for (PHINode &Phi : Succ->phis()) | ||
if (Phi.getIncomingValueForBlock(A->getParent()) != | ||
Phi.getIncomingValueForBlock(B->getParent())) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This could be in the above loop.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
addressed
b15eceb
to
4e56067
Compare
As a general feedback, I think the current implementation is a bit too constrained for my liking, in that it will make it difficult to extend to generally handle equal basic blocks with more than just one branch without entirely rewriting. I would be more in favor of implementing this by hashing the BBs and arbitrarily de-duplicating equal BBs. |
I noticed that the two C functions emitted different IR:
We generate IR that looks like this:
For
switch_duplicate_arms
, taking case 0 and 1 are the same since %5 and %6branch to the same location and the incoming values for %8 are the same from
those blocks. We could remove one on the duplicate switch targets and update
the switch with the single target.
On RISC-V, prior to this patch, we generate the following code:
After this patch, the O3 code is optimized to the icmp + select pair, which
gives us the same code gen as
if_duplicate_arms
, as desired. This resultsis one less branch instruction in the final assembly.
This may help with both code size and further switch simplification. I found
that this patch causes no significant impact to spec2006/int/ref and
spec2017/intrate/ref.