Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[VectorCombine] Add a cost model for shuffleToIdentity. #93937

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 56 additions & 16 deletions llvm/lib/Transforms/Vectorize/VectorCombine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1670,8 +1670,12 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {

using InstLane = std::pair<Value *, int>;

static InstLane lookThroughShuffles(Value *V, int Lane) {
static InstLane
lookThroughShuffles(Value *V, int Lane,
SmallPtrSetImpl<Instruction *> *VisitedShuffles) {
artagnon marked this conversation as resolved.
Show resolved Hide resolved
while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
if (VisitedShuffles)
VisitedShuffles->insert(SV);
unsigned NumElts =
cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
int M = SV->getMaskValue(Lane);
Expand All @@ -1688,13 +1692,15 @@ static InstLane lookThroughShuffles(Value *V, int Lane) {
return InstLane{V, Lane};
}

static SmallVector<InstLane>
generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
static SmallVector<InstLane> generateInstLaneVectorFromOperand(
ArrayRef<InstLane> Item, int Op,
SmallPtrSetImpl<Instruction *> *VisitedShuffles) {
SmallVector<InstLane> NItem;
for (InstLane IL : Item) {
auto [V, Lane] = IL;
InstLane OpLane =
V ? lookThroughShuffles(cast<Instruction>(V)->getOperand(Op), Lane)
V ? lookThroughShuffles(cast<Instruction>(V)->getOperand(Op), Lane,
VisitedShuffles)
: InstLane{nullptr, PoisonMaskElem};
NItem.emplace_back(OpLane);
}
Expand Down Expand Up @@ -1733,8 +1739,9 @@ static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
Ops[Idx] = II->getOperand(Idx);
continue;
}
Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
Ty, IdentityLeafs, SplatLeafs, Builder);
Ops[Idx] = generateNewInstTree(
generateInstLaneVectorFromOperand(Item, Idx, nullptr), Ty,
IdentityLeafs, SplatLeafs, Builder);
}
Builder.SetInsertPoint(I);
Type *DstTy =
Expand Down Expand Up @@ -1763,13 +1770,14 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
if (!Ty)
return false;

SmallPtrSet<Instruction *, 4> VisitedShuffles;
SmallVector<InstLane> Start(Ty->getNumElements());
for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
Start[M] = lookThroughShuffles(&I, M);
Start[M] = lookThroughShuffles(&I, M, &VisitedShuffles);

SmallVector<SmallVector<InstLane>> Worklist;
Worklist.push_back(Start);
SmallPtrSet<Value *, 4> IdentityLeafs, SplatLeafs;
SmallPtrSet<Value *, 4> IdentityLeafs, SplatLeafs, ConstLeafs;
unsigned NumVisited = 0;

while (!Worklist.empty()) {
Expand Down Expand Up @@ -1803,7 +1811,7 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
Value *V = IL.first;
return !V || V == FrontV;
})) {
SplatLeafs.insert(FrontV);
ConstLeafs.insert(FrontV);
artagnon marked this conversation as resolved.
Show resolved Hide resolved
continue;
}
// Look for a splat value.
Expand Down Expand Up @@ -1847,14 +1855,20 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
if ((isa<BinaryOperator>(FrontV) &&
!cast<BinaryOperator>(FrontV)->isIntDivRem()) ||
isa<CmpInst>(FrontV)) {
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
Worklist.push_back(
generateInstLaneVectorFromOperand(Item, 0, &VisitedShuffles));
Worklist.push_back(
generateInstLaneVectorFromOperand(Item, 1, &VisitedShuffles));
} else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst>(FrontV)) {
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
Worklist.push_back(
generateInstLaneVectorFromOperand(Item, 0, &VisitedShuffles));
} else if (isa<SelectInst>(FrontV)) {
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
Worklist.push_back(
generateInstLaneVectorFromOperand(Item, 0, &VisitedShuffles));
Worklist.push_back(
generateInstLaneVectorFromOperand(Item, 1, &VisitedShuffles));
Worklist.push_back(
generateInstLaneVectorFromOperand(Item, 2, &VisitedShuffles));
} else if (auto *II = dyn_cast<IntrinsicInst>(FrontV);
II && isTriviallyVectorizable(II->getIntrinsicID())) {
for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
Expand All @@ -1868,7 +1882,8 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
return false;
continue;
}
Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
Worklist.push_back(
generateInstLaneVectorFromOperand(Item, Op, &VisitedShuffles));
}
} else {
return false;
Expand All @@ -1878,6 +1893,31 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
if (NumVisited <= 1)
return false;

LLVM_DEBUG(dbgs() << "Found a set of shuffles that can be removed:\n");
InstructionCost OldShuffleCost;
for (auto *I : VisitedShuffles) {
InstructionCost C = TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
LLVM_DEBUG(dbgs() << C << *I << "\n");
OldShuffleCost += C;
}
LLVM_DEBUG(dbgs() << " total cost " << OldShuffleCost << "\n");
SmallVector<int, 16> IdentityMask(Ty->getNumElements());
std::iota(IdentityMask.begin(), IdentityMask.end(), 0);
InstructionCost IdentityCost = TTI.getShuffleCost(
TTI::SK_PermuteSingleSrc, Ty, IdentityMask, TTI::TCK_RecipThroughput);
InstructionCost SplatCost = TTI.getShuffleCost(
TTI::SK_Broadcast, Ty, std::nullopt, TTI::TCK_RecipThroughput);
InstructionCost NewShuffleCost =
IdentityCost * IdentityLeafs.size() + SplatCost * SplatLeafs.size();
LLVM_DEBUG(dbgs() << " vs " << NewShuffleCost << " (" << IdentityCost
<< " * " << IdentityLeafs.size() << " + " << SplatCost
<< " * " << SplatLeafs.size() << ")\n");
artagnon marked this conversation as resolved.
Show resolved Hide resolved

if (OldShuffleCost < NewShuffleCost)
return false;

SplatLeafs.insert(ConstLeafs.begin(), ConstLeafs.end());

// If we got this far, we know the shuffles are superfluous and can be
// removed. Scan through again and generate the new tree of instructions.
Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs, Builder);
Expand Down
66 changes: 66 additions & 0 deletions llvm/test/Transforms/VectorCombine/X86/shuffleToIdentityCost.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=vector-combine -mtriple=x86_64-- -S %s | FileCheck %s --check-prefix=CHECK
; RUN: opt -passes=vector-combine -mtriple=x86_64-- -mcpu=corei7-avx -S %s | FileCheck %s --check-prefix=CHECK-AVX
; RUN: opt -passes=vector-combine -mtriple=x86_64-- -mcpu=core-avx2 -S %s | FileCheck %s --check-prefix=CHECK-AVX2
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By the looks of it, you can test both RUN lines under the CHECK-AVX prefix and remove the duplicated CHECK-AVX2 lines.



; Test input: loads a <32 x i8> from %l8 and a second <32 x i8> from %l8 + 16
; bytes, splits each load into its low and high 16-lane halves with
; shufflevector, compares both halves of the first load against a splat of 7
; (icmp ugt), selects per half between the first and second load, then
; concatenates the two 16-lane results into a <32 x i8> that is stored to
; %lop_pblend_b_0.
;
; Expected output per FileCheck prefix (as recorded in the assertions below):
;   CHECK      - the split/concatenate shuffles are gone; the icmp and select
;                operate directly on the full <32 x i8> values.
;   CHECK-AVX  - every shufflevector from the input is still present.
;   CHECK-AVX2 - same instruction sequence as CHECK-AVX.
define void @test_op_pblend_b_0(ptr %l8, ptr %lop_pblend_b_0) {
; CHECK-LABEL: define void @test_op_pblend_b_0(
; CHECK-SAME: ptr [[L8:%.*]], ptr [[LOP_PBLEND_B_0:%.*]]) {
; CHECK-NEXT: [[LT162:%.*]] = load <32 x i8>, ptr [[L8]], align 16
; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt <32 x i8> [[LT162]], <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
; CHECK-NEXT: [[L13:%.*]] = getelementptr inbounds i8, ptr [[L8]], i64 16
; CHECK-NEXT: [[L14:%.*]] = load <32 x i8>, ptr [[L13]], align 16
; CHECK-NEXT: [[L19:%.*]] = select <32 x i1> [[TMP1]], <32 x i8> [[LT162]], <32 x i8> [[L14]]
; CHECK-NEXT: store <32 x i8> [[L19]], ptr [[LOP_PBLEND_B_0]], align 32
; CHECK-NEXT: ret void
;
; CHECK-AVX-LABEL: define void @test_op_pblend_b_0(
; CHECK-AVX-SAME: ptr [[L8:%.*]], ptr [[LOP_PBLEND_B_0:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-AVX-NEXT: [[LT162:%.*]] = load <32 x i8>, ptr [[L8]], align 16
; CHECK-AVX-NEXT: [[L9:%.*]] = shufflevector <32 x i8> [[LT162]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-AVX-NEXT: [[L10:%.*]] = icmp ugt <16 x i8> [[L9]], <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
; CHECK-AVX-NEXT: [[L11:%.*]] = shufflevector <32 x i8> [[LT162]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-AVX-NEXT: [[L12:%.*]] = icmp ugt <16 x i8> [[L11]], <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
; CHECK-AVX-NEXT: [[L13:%.*]] = getelementptr inbounds i8, ptr [[L8]], i64 16
; CHECK-AVX-NEXT: [[L14:%.*]] = load <32 x i8>, ptr [[L13]], align 16
; CHECK-AVX-NEXT: [[L15:%.*]] = shufflevector <32 x i8> [[L14]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-AVX-NEXT: [[L16:%.*]] = select <16 x i1> [[L10]], <16 x i8> [[L9]], <16 x i8> [[L15]]
; CHECK-AVX-NEXT: [[L17:%.*]] = shufflevector <32 x i8> [[L14]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-AVX-NEXT: [[L18:%.*]] = select <16 x i1> [[L12]], <16 x i8> [[L11]], <16 x i8> [[L17]]
; CHECK-AVX-NEXT: [[L19:%.*]] = shufflevector <16 x i8> [[L16]], <16 x i8> [[L18]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-AVX-NEXT: store <32 x i8> [[L19]], ptr [[LOP_PBLEND_B_0]], align 32
; CHECK-AVX-NEXT: ret void
;
; CHECK-AVX2-LABEL: define void @test_op_pblend_b_0(
; CHECK-AVX2-SAME: ptr [[L8:%.*]], ptr [[LOP_PBLEND_B_0:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-AVX2-NEXT: [[LT162:%.*]] = load <32 x i8>, ptr [[L8]], align 16
; CHECK-AVX2-NEXT: [[L9:%.*]] = shufflevector <32 x i8> [[LT162]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-AVX2-NEXT: [[L10:%.*]] = icmp ugt <16 x i8> [[L9]], <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
; CHECK-AVX2-NEXT: [[L11:%.*]] = shufflevector <32 x i8> [[LT162]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-AVX2-NEXT: [[L12:%.*]] = icmp ugt <16 x i8> [[L11]], <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
; CHECK-AVX2-NEXT: [[L13:%.*]] = getelementptr inbounds i8, ptr [[L8]], i64 16
; CHECK-AVX2-NEXT: [[L14:%.*]] = load <32 x i8>, ptr [[L13]], align 16
; CHECK-AVX2-NEXT: [[L15:%.*]] = shufflevector <32 x i8> [[L14]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-AVX2-NEXT: [[L16:%.*]] = select <16 x i1> [[L10]], <16 x i8> [[L9]], <16 x i8> [[L15]]
; CHECK-AVX2-NEXT: [[L17:%.*]] = shufflevector <32 x i8> [[L14]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-AVX2-NEXT: [[L18:%.*]] = select <16 x i1> [[L12]], <16 x i8> [[L11]], <16 x i8> [[L17]]
; CHECK-AVX2-NEXT: [[L19:%.*]] = shufflevector <16 x i8> [[L16]], <16 x i8> [[L18]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-AVX2-NEXT: store <32 x i8> [[L19]], ptr [[LOP_PBLEND_B_0]], align 32
; CHECK-AVX2-NEXT: ret void
;
; First source vector: split into low (%l9) and high (%l11) halves and compare
; each half against a splat of 7.
%lt162 = load <32 x i8>, ptr %l8, align 16
%l9 = shufflevector <32 x i8> %lt162, <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%l10 = icmp ugt <16 x i8> %l9, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
%l11 = shufflevector <32 x i8> %lt162, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%l12 = icmp ugt <16 x i8> %l11, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
; Second source vector, loaded 16 bytes past %l8: split the same way and use
; the comparison results to select per half between the two loads.
%l13 = getelementptr inbounds i8, ptr %l8, i64 16
%l14 = load <32 x i8>, ptr %l13, align 16
%l15 = shufflevector <32 x i8> %l14, <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%l16 = select <16 x i1> %l10, <16 x i8> %l9, <16 x i8> %l15
%l17 = shufflevector <32 x i8> %l14, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%l18 = select <16 x i1> %l12, <16 x i8> %l11, <16 x i8> %l17
; Concatenate the two 16-lane results back into one 32-lane vector and store it.
%l19 = shufflevector <16 x i8> %l16, <16 x i8> %l18, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
store <32 x i8> %l19, ptr %lop_pblend_b_0, align 32
ret void
}
Loading