-
Notifications
You must be signed in to change notification settings - Fork 11.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[VectorCombine] Add a cost model for shuffleToIdentity. #93937
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-transforms Author: David Green (davemgreen) ChangesAs with every combine that has ever touched identites, apparently converting to identities can also cause performance issues. This adds a simple cost model which helps when the cost of concat's might be high and will hopefully be useful if more types of shuffles are supported. Full diff: https://github.com/llvm/llvm-project/pull/93937.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 7ecfe5218ef67..8b94e6653f559 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1670,8 +1670,12 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
using InstLane = std::pair<Value *, int>;
-static InstLane lookThroughShuffles(Value *V, int Lane) {
+static InstLane
+lookThroughShuffles(Value *V, int Lane,
+ SmallPtrSetImpl<Instruction *> *VisitedShuffles) {
while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
+ if (VisitedShuffles)
+ VisitedShuffles->insert(SV);
unsigned NumElts =
cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
int M = SV->getMaskValue(Lane);
@@ -1688,13 +1692,15 @@ static InstLane lookThroughShuffles(Value *V, int Lane) {
return InstLane{V, Lane};
}
-static SmallVector<InstLane>
-generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
+static SmallVector<InstLane> generateInstLaneVectorFromOperand(
+ ArrayRef<InstLane> Item, int Op,
+ SmallPtrSetImpl<Instruction *> *VisitedShuffles) {
SmallVector<InstLane> NItem;
for (InstLane IL : Item) {
auto [V, Lane] = IL;
InstLane OpLane =
- V ? lookThroughShuffles(cast<Instruction>(V)->getOperand(Op), Lane)
+ V ? lookThroughShuffles(cast<Instruction>(V)->getOperand(Op), Lane,
+ VisitedShuffles)
: InstLane{nullptr, PoisonMaskElem};
NItem.emplace_back(OpLane);
}
@@ -1733,8 +1739,9 @@ static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
Ops[Idx] = II->getOperand(Idx);
continue;
}
- Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
- Ty, IdentityLeafs, SplatLeafs, Builder);
+ Ops[Idx] =
+ generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx, nullptr),
+ Ty, IdentityLeafs, SplatLeafs, Builder);
}
Builder.SetInsertPoint(I);
Type *DstTy =
@@ -1763,13 +1770,14 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
if (!Ty)
return false;
+ SmallPtrSet<Instruction *, 4> VisitedShuffles;
SmallVector<InstLane> Start(Ty->getNumElements());
for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
- Start[M] = lookThroughShuffles(&I, M);
+ Start[M] = lookThroughShuffles(&I, M, &VisitedShuffles);
SmallVector<SmallVector<InstLane>> Worklist;
Worklist.push_back(Start);
- SmallPtrSet<Value *, 4> IdentityLeafs, SplatLeafs;
+ SmallPtrSet<Value *, 4> IdentityLeafs, SplatLeafs, ConstLeafs;
unsigned NumVisited = 0;
while (!Worklist.empty()) {
@@ -1803,7 +1811,7 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
Value *V = IL.first;
return !V || V == FrontV;
})) {
- SplatLeafs.insert(FrontV);
+ ConstLeafs.insert(FrontV);
continue;
}
// Look for a splat value.
@@ -1847,14 +1855,20 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
if ((isa<BinaryOperator>(FrontV) &&
!cast<BinaryOperator>(FrontV)->isIntDivRem()) ||
isa<CmpInst>(FrontV)) {
- Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
- Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
+ Worklist.push_back(
+ generateInstLaneVectorFromOperand(Item, 0, &VisitedShuffles));
+ Worklist.push_back(
+ generateInstLaneVectorFromOperand(Item, 1, &VisitedShuffles));
} else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst>(FrontV)) {
- Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
+ Worklist.push_back(
+ generateInstLaneVectorFromOperand(Item, 0, &VisitedShuffles));
} else if (isa<SelectInst>(FrontV)) {
- Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
- Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
- Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
+ Worklist.push_back(
+ generateInstLaneVectorFromOperand(Item, 0, &VisitedShuffles));
+ Worklist.push_back(
+ generateInstLaneVectorFromOperand(Item, 1, &VisitedShuffles));
+ Worklist.push_back(
+ generateInstLaneVectorFromOperand(Item, 2, &VisitedShuffles));
} else if (auto *II = dyn_cast<IntrinsicInst>(FrontV);
II && isTriviallyVectorizable(II->getIntrinsicID())) {
for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
@@ -1868,7 +1882,8 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
return false;
continue;
}
- Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
+ Worklist.push_back(
+ generateInstLaneVectorFromOperand(Item, Op, &VisitedShuffles));
}
} else {
return false;
@@ -1878,6 +1893,31 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
if (NumVisited <= 1)
return false;
+ LLVM_DEBUG(dbgs() << "Found a set of shuffles that can be removed:\n");
+ InstructionCost OldShuffleCost;
+ for (auto *I : VisitedShuffles) {
+ InstructionCost C = TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
+ LLVM_DEBUG(dbgs() << C << *I << "\n");
+ OldShuffleCost += C;
+ }
+ LLVM_DEBUG(dbgs() << " total cost " << OldShuffleCost << "\n");
+ SmallVector<int, 16> ExtractMask(Ty->getNumElements());
+ std::iota(ExtractMask.begin(), ExtractMask.end(), 0);
+ InstructionCost IdentityCost = TTI.getShuffleCost(
+ TTI::SK_PermuteSingleSrc, Ty, ExtractMask, TTI::TCK_RecipThroughput);
+ InstructionCost SplatCost = TTI.getShuffleCost(
+ TTI::SK_Broadcast, Ty, std::nullopt, TTI::TCK_RecipThroughput);
+ InstructionCost NewShuffleCost =
+ IdentityCost * IdentityLeafs.size() + SplatCost * SplatLeafs.size();
+ LLVM_DEBUG(dbgs() << " vs " << NewShuffleCost << " (" << IdentityCost
+ << " * " << IdentityLeafs.size() << " + " << SplatCost
+ << " * " << SplatLeafs.size() << ")\n");
+
+ if (OldShuffleCost < NewShuffleCost)
+ return false;
+
+ SplatLeafs.insert(ConstLeafs.begin(), ConstLeafs.end());
+
// If we got this far, we know the shuffles are superfluous and can be
// removed. Scan through again and generate the new tree of instructions.
Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs, Builder);
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffleToIdentityCost.ll b/llvm/test/Transforms/VectorCombine/X86/shuffleToIdentityCost.ll
new file mode 100644
index 0000000000000..47cc97ddd32ff
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffleToIdentityCost.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=vector-combine -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64--linux-gnu"
+
+define void @test_op_pblend_b_0_avx(ptr %l8, ptr %lop_pblend_b_0) "target-cpu"="corei7-avx" {
+; CHECK-LABEL: define void @test_op_pblend_b_0_avx(
+; CHECK-SAME: ptr [[L8:%.*]], ptr [[LOP_PBLEND_B_0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[LT162:%.*]] = load <32 x i8>, ptr [[L8]], align 16
+; CHECK-NEXT: [[L9:%.*]] = shufflevector <32 x i8> [[LT162]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[L10:%.*]] = icmp ugt <16 x i8> [[L9]], <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; CHECK-NEXT: [[L11:%.*]] = shufflevector <32 x i8> [[LT162]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[L12:%.*]] = icmp ugt <16 x i8> [[L11]], <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; CHECK-NEXT: [[L13:%.*]] = getelementptr inbounds i8, ptr [[L8]], i64 16
+; CHECK-NEXT: [[L14:%.*]] = load <32 x i8>, ptr [[L13]], align 16
+; CHECK-NEXT: [[L15:%.*]] = shufflevector <32 x i8> [[L14]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[L16:%.*]] = select <16 x i1> [[L10]], <16 x i8> [[L9]], <16 x i8> [[L15]]
+; CHECK-NEXT: [[L17:%.*]] = shufflevector <32 x i8> [[L14]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[L18:%.*]] = select <16 x i1> [[L12]], <16 x i8> [[L11]], <16 x i8> [[L17]]
+; CHECK-NEXT: [[L19:%.*]] = shufflevector <16 x i8> [[L16]], <16 x i8> [[L18]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: store <32 x i8> [[L19]], ptr [[LOP_PBLEND_B_0]], align 32
+; CHECK-NEXT: ret void
+;
+ %lt162 = load <32 x i8>, ptr %l8, align 16
+ %l9 = shufflevector <32 x i8> %lt162, <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %l10 = icmp ugt <16 x i8> %l9, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ %l11 = shufflevector <32 x i8> %lt162, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %l12 = icmp ugt <16 x i8> %l11, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ %l13 = getelementptr inbounds i8, ptr %l8, i64 16
+ %l14 = load <32 x i8>, ptr %l13, align 16
+ %l15 = shufflevector <32 x i8> %l14, <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %l16 = select <16 x i1> %l10, <16 x i8> %l9, <16 x i8> %l15
+ %l17 = shufflevector <32 x i8> %l14, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %l18 = select <16 x i1> %l12, <16 x i8> %l11, <16 x i8> %l17
+ %l19 = shufflevector <16 x i8> %l16, <16 x i8> %l18, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ store <32 x i8> %l19, ptr %lop_pblend_b_0, align 32
+ ret void
+}
+
+define void @test_op_pblend_b_0_nocpu(ptr %l8, ptr %lop_pblend_b_0) {
+; CHECK-LABEL: define void @test_op_pblend_b_0_nocpu(
+; CHECK-SAME: ptr [[L8:%.*]], ptr [[LOP_PBLEND_B_0:%.*]]) {
+; CHECK-NEXT: [[LT162:%.*]] = load <32 x i8>, ptr [[L8]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt <32 x i8> [[LT162]], <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; CHECK-NEXT: [[L13:%.*]] = getelementptr inbounds i8, ptr [[L8]], i64 16
+; CHECK-NEXT: [[L14:%.*]] = load <32 x i8>, ptr [[L13]], align 16
+; CHECK-NEXT: [[L19:%.*]] = select <32 x i1> [[TMP1]], <32 x i8> [[LT162]], <32 x i8> [[L14]]
+; CHECK-NEXT: store <32 x i8> [[L19]], ptr [[LOP_PBLEND_B_0]], align 32
+; CHECK-NEXT: ret void
+;
+ %lt162 = load <32 x i8>, ptr %l8, align 16
+ %l9 = shufflevector <32 x i8> %lt162, <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %l10 = icmp ugt <16 x i8> %l9, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ %l11 = shufflevector <32 x i8> %lt162, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %l12 = icmp ugt <16 x i8> %l11, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ %l13 = getelementptr inbounds i8, ptr %l8, i64 16
+ %l14 = load <32 x i8>, ptr %l13, align 16
+ %l15 = shufflevector <32 x i8> %l14, <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %l16 = select <16 x i1> %l10, <16 x i8> %l9, <16 x i8> %l15
+ %l17 = shufflevector <32 x i8> %l14, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %l18 = select <16 x i1> %l12, <16 x i8> %l11, <16 x i8> %l17
+ %l19 = shufflevector <16 x i8> %l16, <16 x i8> %l18, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ store <32 x i8> %l19, ptr %lop_pblend_b_0, align 32
+ ret void
+}
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
8dfc683
to
bdcff14
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Some issues need to be fixed.
llvm/test/Transforms/VectorCombine/X86/shuffleToIdentityCost.ll
Outdated
Show resolved
Hide resolved
llvm/test/Transforms/VectorCombine/X86/shuffleToIdentityCost.ll
Outdated
Show resolved
Hide resolved
llvm/test/Transforms/VectorCombine/X86/shuffleToIdentityCost.ll
Outdated
Show resolved
Hide resolved
llvm/test/Transforms/VectorCombine/X86/shuffleToIdentityCost.ll
Outdated
Show resolved
Hide resolved
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" | ||
target triple = "x86_64--linux-gnu" | ||
|
||
define void @test_op_pblend_b_0_avx(ptr %l8, ptr %lop_pblend_b_0) "target-cpu"="corei7-avx" { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Move the target triple and target cpu into the RUN lines and add core-avx2 as a second RUN
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've added them. The core-avx2 is also not transformed. The identity concat's still cost 4 (it was 8 for corei7-avx, free without a -mcpu).
As with every combine that has ever touched identites, apparently converting to identities can also cause performance issues. This adds a simple cost model which helps when the cost of concat's might be high and will hopefully be useful if more types of shuffles are supported.
bdcff14
to
10e8ed4
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am hoping this won't be needed, as the regression from #92794 has been fixed in the backend. It might depend upon what happens in #93543. And I have updated the code with a few changes and some modifications to the tests in case it is.
The other thing I was looking at (which turned out to be piece-wise identities, not splats) can check the costs it needs elsewhere, and the difficult part is trying to make sure there is a valid place to insert the new instruction.
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" | ||
target triple = "x86_64--linux-gnu" | ||
|
||
define void @test_op_pblend_b_0_avx(ptr %l8, ptr %lop_pblend_b_0) "target-cpu"="corei7-avx" { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've added them. The core-avx2 is also not transformed. The identity concat's still cost 4 (it was 8 for corei7-avx, free without a -mcpu).
llvm/test/Transforms/VectorCombine/X86/shuffleToIdentityCost.ll
Outdated
Show resolved
Hide resolved
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 | ||
; RUN: opt -passes=vector-combine -mtriple=x86_64-- -S %s | FileCheck %s --check-prefix=CHECK | ||
; RUN: opt -passes=vector-combine -mtriple=x86_64-- -mcpu=corei7-avx -S %s | FileCheck %s --check-prefix=CHECK-AVX | ||
; RUN: opt -passes=vector-combine -mtriple=x86_64-- -mcpu=core-avx2 -S %s | FileCheck %s --check-prefix=CHECK-AVX2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
By the looks of it you can test both under CHECK-AVX and remove the CHECK-AVX2 duplicates
Let us know if this is still needed. It adds complexity, and if it's unnecessary, it can be dropped. |
As with every combine that has ever touched identites, apparently converting to identities can also cause performance issues. This adds a simple cost model which helps when the cost of concat's might be high and will hopefully be useful if more types of shuffles are supported.
This is one possible fix for the regression reported in #92794. It only currently costs the shuffles, not the other operations.