From bdcff1432b4dd9f69cbebded9199422b590c1924 Mon Sep 17 00:00:00 2001 From: David Green Date: Fri, 31 May 2024 09:46:17 +0100 Subject: [PATCH] [VectorCombine] Add a cost model for shuffleToIdentity. As with every combine that has ever touched identites, apparently converting to identities can also cause performance issues. This adds a simple cost model which helps when the cost of concat's might be high and will hopefully be useful if more types of shuffles are supported. --- .../Transforms/Vectorize/VectorCombine.cpp | 72 ++++++++++++++----- .../X86/shuffleToIdentityCost.ll | 66 +++++++++++++++++ 2 files changed, 122 insertions(+), 16 deletions(-) create mode 100644 llvm/test/Transforms/VectorCombine/X86/shuffleToIdentityCost.ll diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 7ecfe5218ef67c..0d0337bfd4077a 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1670,8 +1670,12 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) { using InstLane = std::pair; -static InstLane lookThroughShuffles(Value *V, int Lane) { +static InstLane +lookThroughShuffles(Value *V, int Lane, + SmallPtrSetImpl *VisitedShuffles) { while (auto *SV = dyn_cast(V)) { + if (VisitedShuffles) + VisitedShuffles->insert(SV); unsigned NumElts = cast(SV->getOperand(0)->getType())->getNumElements(); int M = SV->getMaskValue(Lane); @@ -1688,13 +1692,15 @@ static InstLane lookThroughShuffles(Value *V, int Lane) { return InstLane{V, Lane}; } -static SmallVector -generateInstLaneVectorFromOperand(ArrayRef Item, int Op) { +static SmallVector generateInstLaneVectorFromOperand( + ArrayRef Item, int Op, + SmallPtrSetImpl *VisitedShuffles) { SmallVector NItem; for (InstLane IL : Item) { auto [V, Lane] = IL; InstLane OpLane = - V ? lookThroughShuffles(cast(V)->getOperand(Op), Lane) + V ? lookThroughShuffles(cast(V)->getOperand(Op), Lane, + VisitedShuffles) : InstLane{nullptr, PoisonMaskElem}; NItem.emplace_back(OpLane); } @@ -1733,8 +1739,9 @@ static Value *generateNewInstTree(ArrayRef Item, FixedVectorType *Ty, Ops[Idx] = II->getOperand(Idx); continue; } - Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx), - Ty, IdentityLeafs, SplatLeafs, Builder); + Ops[Idx] = generateNewInstTree( + generateInstLaneVectorFromOperand(Item, Idx, nullptr), Ty, + IdentityLeafs, SplatLeafs, Builder); } Builder.SetInsertPoint(I); Type *DstTy = @@ -1763,13 +1770,14 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { if (!Ty) return false; + SmallPtrSet VisitedShuffles; SmallVector Start(Ty->getNumElements()); for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M) - Start[M] = lookThroughShuffles(&I, M); + Start[M] = lookThroughShuffles(&I, M, &VisitedShuffles); SmallVector> Worklist; Worklist.push_back(Start); - SmallPtrSet IdentityLeafs, SplatLeafs; + SmallPtrSet IdentityLeafs, SplatLeafs, ConstLeafs; unsigned NumVisited = 0; while (!Worklist.empty()) { @@ -1803,7 +1811,7 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { Value *V = IL.first; return !V || V == FrontV; })) { - SplatLeafs.insert(FrontV); + ConstLeafs.insert(FrontV); continue; } // Look for a splat value. @@ -1847,14 +1855,20 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { if ((isa(FrontV) && !cast(FrontV)->isIntDivRem()) || isa(FrontV)) { - Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); - Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1)); + Worklist.push_back( + generateInstLaneVectorFromOperand(Item, 0, &VisitedShuffles)); + Worklist.push_back( + generateInstLaneVectorFromOperand(Item, 1, &VisitedShuffles)); } else if (isa(FrontV)) { - Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); + Worklist.push_back( + generateInstLaneVectorFromOperand(Item, 0, &VisitedShuffles)); } else if (isa(FrontV)) { - Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0)); - Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1)); - Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2)); + Worklist.push_back( + generateInstLaneVectorFromOperand(Item, 0, &VisitedShuffles)); + Worklist.push_back( + generateInstLaneVectorFromOperand(Item, 1, &VisitedShuffles)); + Worklist.push_back( + generateInstLaneVectorFromOperand(Item, 2, &VisitedShuffles)); } else if (auto *II = dyn_cast(FrontV); II && isTriviallyVectorizable(II->getIntrinsicID())) { for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) { @@ -1868,7 +1882,8 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { return false; continue; } - Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op)); + Worklist.push_back( + generateInstLaneVectorFromOperand(Item, Op, &VisitedShuffles)); } } else { return false; @@ -1878,6 +1893,31 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) { if (NumVisited <= 1) return false; + LLVM_DEBUG(dbgs() << "Found a set of shuffles that can be removed:\n"); + InstructionCost OldShuffleCost; + for (auto *I : VisitedShuffles) { + InstructionCost C = TTI.getInstructionCost(I, TTI::TCK_RecipThroughput); + LLVM_DEBUG(dbgs() << C << *I << "\n"); + OldShuffleCost += C; + } + LLVM_DEBUG(dbgs() << " total cost " << OldShuffleCost << "\n"); + SmallVector ExtractMask(Ty->getNumElements()); + std::iota(ExtractMask.begin(), ExtractMask.end(), 0); + InstructionCost IdentityCost = TTI.getShuffleCost( + TTI::SK_PermuteSingleSrc, Ty, ExtractMask, TTI::TCK_RecipThroughput); + InstructionCost SplatCost = TTI.getShuffleCost( + TTI::SK_Broadcast, Ty, std::nullopt, TTI::TCK_RecipThroughput); + InstructionCost NewShuffleCost = + IdentityCost * IdentityLeafs.size() + SplatCost * SplatLeafs.size(); + LLVM_DEBUG(dbgs() << " vs " << NewShuffleCost << " (" << IdentityCost + << " * " << IdentityLeafs.size() << " + " << SplatCost + << " * " << SplatLeafs.size() << ")\n"); + + if (OldShuffleCost < NewShuffleCost) + return false; + + SplatLeafs.insert(ConstLeafs.begin(), ConstLeafs.end()); + // If we got this far, we know the shuffles are superfluous and can be // removed. Scan through again and generate the new tree of instructions. Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs, Builder); diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffleToIdentityCost.ll b/llvm/test/Transforms/VectorCombine/X86/shuffleToIdentityCost.ll new file mode 100644 index 00000000000000..47cc97ddd32ffe --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/X86/shuffleToIdentityCost.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=vector-combine -S %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64--linux-gnu" + +define void @test_op_pblend_b_0_avx(ptr %l8, ptr %lop_pblend_b_0) "target-cpu"="corei7-avx" { +; CHECK-LABEL: define void @test_op_pblend_b_0_avx( +; CHECK-SAME: ptr [[L8:%.*]], ptr [[LOP_PBLEND_B_0:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[LT162:%.*]] = load <32 x i8>, ptr [[L8]], align 16 +; CHECK-NEXT: [[L9:%.*]] = shufflevector <32 x i8> [[LT162]], <32 x i8> poison, <16 x i32> +; CHECK-NEXT: [[L10:%.*]] = icmp ugt <16 x i8> [[L9]], +; CHECK-NEXT: [[L11:%.*]] = shufflevector <32 x i8> [[LT162]], <32 x i8> poison, <16 x i32> +; CHECK-NEXT: [[L12:%.*]] = icmp ugt <16 x i8> [[L11]], +; CHECK-NEXT: [[L13:%.*]] = getelementptr inbounds i8, ptr [[L8]], i64 16 +; CHECK-NEXT: [[L14:%.*]] = load <32 x i8>, ptr [[L13]], align 16 +; CHECK-NEXT: [[L15:%.*]] = shufflevector <32 x i8> [[L14]], <32 x i8> poison, <16 x i32> +; CHECK-NEXT: [[L16:%.*]] = select <16 x i1> [[L10]], <16 x i8> [[L9]], <16 x i8> [[L15]] +; CHECK-NEXT: [[L17:%.*]] = shufflevector <32 x i8> [[L14]], <32 x i8> poison, <16 x i32> +; CHECK-NEXT: [[L18:%.*]] = select <16 x i1> [[L12]], <16 x i8> [[L11]], <16 x i8> [[L17]] +; CHECK-NEXT: [[L19:%.*]] = shufflevector <16 x i8> [[L16]], <16 x i8> [[L18]], <32 x i32> +; CHECK-NEXT: store <32 x i8> [[L19]], ptr [[LOP_PBLEND_B_0]], align 32 +; CHECK-NEXT: ret void +; + %lt162 = load <32 x i8>, ptr %l8, align 16 + %l9 = shufflevector <32 x i8> %lt162, <32 x i8> poison, <16 x i32> + %l10 = icmp ugt <16 x i8> %l9, + %l11 = shufflevector <32 x i8> %lt162, <32 x i8> poison, <16 x i32> + %l12 = icmp ugt <16 x i8> %l11, + %l13 = getelementptr inbounds i8, ptr %l8, i64 16 + %l14 = load <32 x i8>, ptr %l13, align 16 + %l15 = shufflevector <32 x i8> %l14, <32 x i8> poison, <16 x i32> + %l16 = select <16 x i1> %l10, <16 x i8> %l9, <16 x i8> %l15 + %l17 = shufflevector <32 x i8> %l14, <32 x i8> poison, <16 x i32> + %l18 = select <16 x i1> %l12, <16 x i8> %l11, <16 x i8> %l17 + %l19 = shufflevector <16 x i8> %l16, <16 x i8> %l18, <32 x i32> + store <32 x i8> %l19, ptr %lop_pblend_b_0, align 32 + ret void +} + +define void @test_op_pblend_b_0_nocpu(ptr %l8, ptr %lop_pblend_b_0) { +; CHECK-LABEL: define void @test_op_pblend_b_0_nocpu( +; CHECK-SAME: ptr [[L8:%.*]], ptr [[LOP_PBLEND_B_0:%.*]]) { +; CHECK-NEXT: [[LT162:%.*]] = load <32 x i8>, ptr [[L8]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt <32 x i8> [[LT162]], +; CHECK-NEXT: [[L13:%.*]] = getelementptr inbounds i8, ptr [[L8]], i64 16 +; CHECK-NEXT: [[L14:%.*]] = load <32 x i8>, ptr [[L13]], align 16 +; CHECK-NEXT: [[L19:%.*]] = select <32 x i1> [[TMP1]], <32 x i8> [[LT162]], <32 x i8> [[L14]] +; CHECK-NEXT: store <32 x i8> [[L19]], ptr [[LOP_PBLEND_B_0]], align 32 +; CHECK-NEXT: ret void +; + %lt162 = load <32 x i8>, ptr %l8, align 16 + %l9 = shufflevector <32 x i8> %lt162, <32 x i8> poison, <16 x i32> + %l10 = icmp ugt <16 x i8> %l9, + %l11 = shufflevector <32 x i8> %lt162, <32 x i8> poison, <16 x i32> + %l12 = icmp ugt <16 x i8> %l11, + %l13 = getelementptr inbounds i8, ptr %l8, i64 16 + %l14 = load <32 x i8>, ptr %l13, align 16 + %l15 = shufflevector <32 x i8> %l14, <32 x i8> poison, <16 x i32> + %l16 = select <16 x i1> %l10, <16 x i8> %l9, <16 x i8> %l15 + %l17 = shufflevector <32 x i8> %l14, <32 x i8> poison, <16 x i32> + %l18 = select <16 x i1> %l12, <16 x i8> %l11, <16 x i8> %l17 + %l19 = shufflevector <16 x i8> %l16, <16 x i8> %l18, <32 x i32> + store <32 x i8> %l19, ptr %lop_pblend_b_0, align 32 + ret void +}