diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 54f6de34a76c93..3e7118091c8e5e 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -115,6 +115,7 @@ class VectorCombine {
   bool foldShuffleOfBinops(Instruction &I);
   bool foldShuffleOfCastops(Instruction &I);
   bool foldShuffleOfShuffles(Instruction &I);
+  bool foldShuffleOfIntrinsics(Instruction &I);
   bool foldShuffleToIdentity(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
   bool foldCastFromReductions(Instruction &I);
@@ -1674,6 +1675,89 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
   return true;
 }
 
+/// Try to convert
+/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
+bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
+  Value *V0, *V1;
+  ArrayRef<int> OldMask;
+  if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
+                           m_Mask(OldMask))))
+    return false;
+
+  auto *II0 = dyn_cast<IntrinsicInst>(V0);
+  auto *II1 = dyn_cast<IntrinsicInst>(V1);
+  if (!II0 || !II1)
+    return false;
+
+  Intrinsic::ID IID = II0->getIntrinsicID();
+  if (IID != II1->getIntrinsicID())
+    return false;
+
+  auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
+  auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
+  if (!ShuffleDstTy || !II0Ty)
+    return false;
+
+  if (!isTriviallyVectorizable(IID))
+    return false;
+
+  for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
+    if (isVectorIntrinsicWithScalarOpAtArg(IID, I) &&
+        II0->getArgOperand(I) != II1->getArgOperand(I))
+      return false;
+
+  InstructionCost OldCost =
+      TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0),
+                                TTI::TCK_RecipThroughput) +
+      TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1),
+                                TTI::TCK_RecipThroughput) +
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, II0Ty, OldMask,
+                         TTI::TCK_RecipThroughput, 0, nullptr, {II0, II1}, &I);
+
+  SmallVector<Type *> NewArgsTy;
+  InstructionCost NewCost = 0;
+  for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
+    if (isVectorIntrinsicWithScalarOpAtArg(IID, I)) {
+      NewArgsTy.push_back(II0->getArgOperand(I)->getType());
+    } else {
+      auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
+      NewArgsTy.push_back(FixedVectorType::get(VecTy->getElementType(),
+                                               VecTy->getNumElements() * 2));
+      NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
+                                    VecTy, OldMask, TTI::TCK_RecipThroughput);
+    }
+  IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
+  NewCost += TTI.getIntrinsicInstrCost(NewAttr, TTI::TCK_RecipThroughput);
+
+  LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
+                    << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
+                    << "\n");
+
+  if (NewCost > OldCost)
+    return false;
+
+  SmallVector<Value *> NewArgs;
+  for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
+    if (isVectorIntrinsicWithScalarOpAtArg(IID, I)) {
+      NewArgs.push_back(II0->getArgOperand(I));
+    } else {
+      Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
+                                                II1->getArgOperand(I), OldMask);
+      NewArgs.push_back(Shuf);
+      Worklist.pushValue(Shuf);
+    }
+  Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
+
+  // Intersect flags from the old intrinsics.
+  if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
+    NewInst->copyIRFlags(II0);
+    NewInst->andIRFlags(II1);
+  }
+
+  replaceValue(I, *NewIntrinsic);
+  return true;
+}
+
 using InstLane = std::pair<Use *, int>;
 
 static InstLane lookThroughShuffles(Use *U, int Lane) {
@@ -2645,6 +2729,7 @@ bool VectorCombine::run() {
       MadeChange |= foldShuffleOfBinops(I);
       MadeChange |= foldShuffleOfCastops(I);
       MadeChange |= foldShuffleOfShuffles(I);
+      MadeChange |= foldShuffleOfIntrinsics(I);
      MadeChange |= foldSelectShuffle(I);
       MadeChange |= foldShuffleToIdentity(I);
       break;
diff --git a/llvm/test/Transforms/VectorCombine/RISCV/shuffle-of-intrinsics.ll b/llvm/test/Transforms/VectorCombine/RISCV/shuffle-of-intrinsics.ll
new file mode 100644
index 00000000000000..7ccc14cc0b125e
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/RISCV/shuffle-of-intrinsics.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=riscv64 -mattr=+v -passes=vector-combine -S %s | FileCheck %s
+
+define <8 x i32> @test1(<4 x i32> %0, <4 x i32> %1) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP2]], i1 false)
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+;
+entry:
+  %2 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
+  %3 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
+  %4 = shufflevector <4 x i32> %2, <4 x i32> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %4
+}
+
+define <8 x i32> @test2(<4 x i32> %0, <4 x i32> %1) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[TMP4]]
+;
+entry:
+  %2 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 true)
+  %3 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
+  %4 = shufflevector <4 x i32> %2, <4 x i32> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %4
+}
+
+define <8 x i32> @test3(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP2:%.*]], <4 x i32> [[TMP3:%.*]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[TMP6]]
+;
+entry:
+  %4 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %0, <4 x i32> %1)
+  %5 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %2, <4 x i32> %3)
+  %6 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %6
+}
+
+define <8 x i1> @test4(<4 x float> %0, <4 x float> %1) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP0:%.*]], i32 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP1:%.*]], i32 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i1> [[TMP4]]
+;
+entry:
+  %2 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %0, i32 0)
+  %3 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %1, i32 0)
+  %4 = shufflevector <4 x i1> %2, <4 x i1> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i1> %4
+}
+
+define <8 x float> @test5(<4 x float> %0, i32 %1, <4 x float> %2, <4 x i32> %3) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP0:%.*]], i32 [[TMP1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.powi.v4f32.v4i32(<4 x float> [[TMP2:%.*]], <4 x i32> [[TMP3:%.*]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[TMP6]]
+;
+entry:
+  %4 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> %0, i32 %1)
+  %5 = call <4 x float> @llvm.powi.v4f32.v4i32(<4 x float> %2, <4 x i32> %3)
+  %6 = shufflevector <4 x float> %4, <4 x float> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %6
+}
+
+declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
+declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i1> @llvm.is.fpclass.v4f32(<4 x float>, i32)
+declare <4 x float> @llvm.powi.v4f32.i32(<4 x float>, i32)
+declare <4 x float> @llvm.powi.v4f32.v4i32(<4 x float>, <4 x i32>)
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
new file mode 100644
index 00000000000000..e012683b08f9b8
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -passes=vector-combine -S %s | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes=vector-combine -S %s | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -passes=vector-combine -S %s | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v4 -passes=vector-combine -S %s | FileCheck %s --check-prefixes=CHECK,AVX
+
+define <8 x i32> @test1(<4 x i32> %0, <4 x i32> %1) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP2]], i1 false)
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+;
+entry:
+  %2 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
+  %3 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
+  %4 = shufflevector <4 x i32> %2, <4 x i32> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %4
+}
+
+define <8 x i32> @test2(<4 x i32> %0, <4 x i32> %1) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[TMP4]]
+;
+entry:
+  %2 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 true)
+  %3 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
+  %4 = shufflevector <4 x i32> %2, <4 x i32> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %4
+}
+
+define <8 x i32> @test3(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
+; SSE-LABEL: @test3(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]])
+; SSE-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP2:%.*]], <4 x i32> [[TMP3:%.*]])
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    ret <8 x i32> [[TMP6]]
+;
+; AVX-LABEL: @test3(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1:%.*]], <4 x i32> [[TMP3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    [[TMP6:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
+; AVX-NEXT:    ret <8 x i32> [[TMP6]]
+;
+entry:
+  %4 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %0, <4 x i32> %1)
+  %5 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %2, <4 x i32> %3)
+  %6 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %6
+}
+
+define <8 x i1> @test4(<4 x float> %0, <4 x float> %1) {
+; SSE-LABEL: @test4(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.is.fpclass.v8f32(<8 x float> [[TMP2]], i32 0)
+; SSE-NEXT:    ret <8 x i1> [[TMP3]]
+;
+; AVX-LABEL: @test4(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP0:%.*]], i32 0)
+; AVX-NEXT:    [[TMP3:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP1:%.*]], i32 0)
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    ret <8 x i1> [[TMP4]]
+;
+entry:
+  %2 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %0, i32 0)
+  %3 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %1, i32 0)
+  %4 = shufflevector <4 x i1> %2, <4 x i1> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i1> %4
+}
+
+define <8 x float> @test5(<4 x float> %0, i32 %1, <4 x float> %2, <4 x i32> %3) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP0:%.*]], i32 [[TMP1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.powi.v4f32.v4i32(<4 x float> [[TMP2:%.*]], <4 x i32> [[TMP3:%.*]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x float> [[TMP6]]
+;
+entry:
+  %4 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> %0, i32 %1)
+  %5 = call <4 x float> @llvm.powi.v4f32.v4i32(<4 x float> %2, <4 x i32> %3)
+  %6 = shufflevector <4 x float> %4, <4 x float> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %6
+}
+
+declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
+declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i1> @llvm.is.fpclass.v4f32(<4 x float>, i32)
+declare <4 x float> @llvm.powi.v4f32.i32(<4 x float>, i32)
+declare <4 x float> @llvm.powi.v4f32.v4i32(<4 x float>, <4 x i32>)