Skip to content

Commit

Permalink
[VectorCombine] Add foldShuffleOfIntrinsics. (#106502)
Browse files Browse the repository at this point in the history
  • Loading branch information
HanKuanChen authored Sep 10, 2024
1 parent bda9474 commit 0ccc609
Show file tree
Hide file tree
Showing 3 changed files with 265 additions and 0 deletions.
85 changes: 85 additions & 0 deletions llvm/lib/Transforms/Vectorize/VectorCombine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ class VectorCombine {
bool foldShuffleOfBinops(Instruction &I);
bool foldShuffleOfCastops(Instruction &I);
bool foldShuffleOfShuffles(Instruction &I);
bool foldShuffleOfIntrinsics(Instruction &I);
bool foldShuffleToIdentity(Instruction &I);
bool foldShuffleFromReductions(Instruction &I);
bool foldCastFromReductions(Instruction &I);
Expand Down Expand Up @@ -1674,6 +1675,89 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
return true;
}

/// Try to convert
/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
Value *V0, *V1;
ArrayRef<int> OldMask;
if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
m_Mask(OldMask))))
return false;

auto *II0 = dyn_cast<IntrinsicInst>(V0);
auto *II1 = dyn_cast<IntrinsicInst>(V1);
if (!II0 || !II1)
return false;

Intrinsic::ID IID = II0->getIntrinsicID();
if (IID != II1->getIntrinsicID())
return false;

auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
if (!ShuffleDstTy || !II0Ty)
return false;

if (!isTriviallyVectorizable(IID))
return false;

for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
if (isVectorIntrinsicWithScalarOpAtArg(IID, I) &&
II0->getArgOperand(I) != II1->getArgOperand(I))
return false;

InstructionCost OldCost =
TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0),
TTI::TCK_RecipThroughput) +
TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1),
TTI::TCK_RecipThroughput) +
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, II0Ty, OldMask,
TTI::TCK_RecipThroughput, 0, nullptr, {II0, II1}, &I);

SmallVector<Type *> NewArgsTy;
InstructionCost NewCost = 0;
for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
if (isVectorIntrinsicWithScalarOpAtArg(IID, I)) {
NewArgsTy.push_back(II0->getArgOperand(I)->getType());
} else {
auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
NewArgsTy.push_back(FixedVectorType::get(VecTy->getElementType(),
VecTy->getNumElements() * 2));
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
VecTy, OldMask, TTI::TCK_RecipThroughput);
}
IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
NewCost += TTI.getIntrinsicInstrCost(NewAttr, TTI::TCK_RecipThroughput);

LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
<< "\n");

if (NewCost > OldCost)
return false;

SmallVector<Value *> NewArgs;
for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
if (isVectorIntrinsicWithScalarOpAtArg(IID, I)) {
NewArgs.push_back(II0->getArgOperand(I));
} else {
Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
II1->getArgOperand(I), OldMask);
NewArgs.push_back(Shuf);
Worklist.pushValue(Shuf);
}
Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);

// Intersect flags from the old intrinsics.
if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
NewInst->copyIRFlags(II0);
NewInst->andIRFlags(II1);
}

replaceValue(I, *NewIntrinsic);
return true;
}

using InstLane = std::pair<Use *, int>;

static InstLane lookThroughShuffles(Use *U, int Lane) {
Expand Down Expand Up @@ -2645,6 +2729,7 @@ bool VectorCombine::run() {
MadeChange |= foldShuffleOfBinops(I);
MadeChange |= foldShuffleOfCastops(I);
MadeChange |= foldShuffleOfShuffles(I);
MadeChange |= foldShuffleOfIntrinsics(I);
MadeChange |= foldSelectShuffle(I);
MadeChange |= foldShuffleToIdentity(I);
break;
Expand Down
82 changes: 82 additions & 0 deletions llvm/test/Transforms/VectorCombine/RISCV/shuffle-of-intrinsics.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=riscv64 -mattr=+v -passes=vector-combine -S %s | FileCheck %s

define <8 x i32> @test1(<4 x i32> %0, <4 x i32> %1) {
; CHECK-LABEL: @test1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP2]], i1 false)
; CHECK-NEXT: ret <8 x i32> [[TMP3]]
;
entry:
%2 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
%3 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
%4 = shufflevector <4 x i32> %2, <4 x i32> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %4
}

define <8 x i32> @test2(<4 x i32> %0, <4 x i32> %1) {
; CHECK-LABEL: @test2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0:%.*]], i1 true)
; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1:%.*]], i1 false)
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: ret <8 x i32> [[TMP4]]
;
entry:
%2 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 true)
%3 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
%4 = shufflevector <4 x i32> %2, <4 x i32> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %4
}

define <8 x i32> @test3(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; CHECK-LABEL: @test3(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]])
; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP2:%.*]], <4 x i32> [[TMP3:%.*]])
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: ret <8 x i32> [[TMP6]]
;
entry:
%4 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %0, <4 x i32> %1)
%5 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %2, <4 x i32> %3)
%6 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %6
}

define <8 x i1> @test4(<4 x float> %0, <4 x float> %1) {
; CHECK-LABEL: @test4(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP0:%.*]], i32 0)
; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP1:%.*]], i32 0)
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: ret <8 x i1> [[TMP4]]
;
entry:
%2 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %0, i32 0)
%3 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %1, i32 0)
%4 = shufflevector <4 x i1> %2, <4 x i1> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i1> %4
}

define <8 x float> @test5(<4 x float> %0, i32 %1, <4 x float> %2, <4 x i32> %3) {
; CHECK-LABEL: @test5(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP0:%.*]], i32 [[TMP1:%.*]])
; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.powi.v4f32.v4i32(<4 x float> [[TMP2:%.*]], <4 x i32> [[TMP3:%.*]])
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: ret <8 x float> [[TMP6]]
;
entry:
%4 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> %0, i32 %1)
%5 = call <4 x float> @llvm.powi.v4f32.v4i32(<4 x float> %2, <4 x i32> %3)
%6 = shufflevector <4 x float> %4, <4 x float> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %6
}

declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i1> @llvm.is.fpclass.v4f32(<4 x float>, i32)
declare <4 x float> @llvm.powi.v4f32.i32(<4 x float>, i32)
declare <4 x float> @llvm.powi.v4f32.v4i32(<4 x float>, <4 x i32>)
98 changes: 98 additions & 0 deletions llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -passes=vector-combine -S %s | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes=vector-combine -S %s | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -passes=vector-combine -S %s | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v4 -passes=vector-combine -S %s | FileCheck %s --check-prefixes=CHECK,AVX

define <8 x i32> @test1(<4 x i32> %0, <4 x i32> %1) {
; CHECK-LABEL: @test1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP2]], i1 false)
; CHECK-NEXT: ret <8 x i32> [[TMP3]]
;
entry:
%2 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
%3 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
%4 = shufflevector <4 x i32> %2, <4 x i32> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %4
}

define <8 x i32> @test2(<4 x i32> %0, <4 x i32> %1) {
; CHECK-LABEL: @test2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP0:%.*]], i1 true)
; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1:%.*]], i1 false)
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: ret <8 x i32> [[TMP4]]
;
entry:
%2 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 true)
%3 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %1, i1 false)
%4 = shufflevector <4 x i32> %2, <4 x i32> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %4
}

define <8 x i32> @test3(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSE-LABEL: @test3(
; SSE-NEXT: entry:
; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]])
; SSE-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP2:%.*]], <4 x i32> [[TMP3:%.*]])
; SSE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; SSE-NEXT: ret <8 x i32> [[TMP6]]
;
; AVX-LABEL: @test3(
; AVX-NEXT: entry:
; AVX-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP2:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1:%.*]], <4 x i32> [[TMP3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]])
; AVX-NEXT: ret <8 x i32> [[TMP6]]
;
entry:
%4 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %0, <4 x i32> %1)
%5 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %2, <4 x i32> %3)
%6 = shufflevector <4 x i32> %4, <4 x i32> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %6
}

define <8 x i1> @test4(<4 x float> %0, <4 x float> %1) {
; SSE-LABEL: @test4(
; SSE-NEXT: entry:
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; SSE-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.is.fpclass.v8f32(<8 x float> [[TMP2]], i32 0)
; SSE-NEXT: ret <8 x i1> [[TMP3]]
;
; AVX-LABEL: @test4(
; AVX-NEXT: entry:
; AVX-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP0:%.*]], i32 0)
; AVX-NEXT: [[TMP3:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP1:%.*]], i32 0)
; AVX-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; AVX-NEXT: ret <8 x i1> [[TMP4]]
;
entry:
%2 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %0, i32 0)
%3 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %1, i32 0)
%4 = shufflevector <4 x i1> %2, <4 x i1> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i1> %4
}

define <8 x float> @test5(<4 x float> %0, i32 %1, <4 x float> %2, <4 x i32> %3) {
; CHECK-LABEL: @test5(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP0:%.*]], i32 [[TMP1:%.*]])
; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.powi.v4f32.v4i32(<4 x float> [[TMP2:%.*]], <4 x i32> [[TMP3:%.*]])
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: ret <8 x float> [[TMP6]]
;
entry:
%4 = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> %0, i32 %1)
%5 = call <4 x float> @llvm.powi.v4f32.v4i32(<4 x float> %2, <4 x i32> %3)
%6 = shufflevector <4 x float> %4, <4 x float> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %6
}

declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i1> @llvm.is.fpclass.v4f32(<4 x float>, i32)
declare <4 x float> @llvm.powi.v4f32.i32(<4 x float>, i32)
declare <4 x float> @llvm.powi.v4f32.v4i32(<4 x float>, <4 x i32>)

0 comments on commit 0ccc609

Please sign in to comment.