diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 7b981bead6bb89..722590a840a541 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15502,8 +15502,21 @@ void BoUpSLP::computeMinimumValueSizes() { auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot, unsigned Opcode, unsigned Limit, bool IsTruncRoot, - bool IsSignedCmp) { + bool IsSignedCmp) -> unsigned { ToDemote.clear(); + // Check if the root is trunc and the next node is gather/buildvector, then + // keep trunc in scalars, which is free in most cases. + if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 && + E.Idx > (IsStoreOrInsertElt ? 2 : 1)) { + ToDemote.push_back(E.Idx); + const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE; + auto It = MinBWs.find(UserTE); + if (It != MinBWs.end()) + return It->second.first; + return DL->getTypeSizeInBits( + E.UserTreeIndices.back().UserTE->Scalars.front()->getType()); + } + unsigned VF = E.getVectorFactor(); auto *TreeRootIT = dyn_cast(E.Scalars.front()->getType()); if (!TreeRootIT || !Opcode) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll index 789d73947d1c73..97e505f4319c6b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll @@ -5,9 +5,9 @@ define void @t(i64 %v) { ; CHECK-LABEL: define void @t( ; CHECK-SAME: i64 [[V:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i16> +; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[V]] to i16 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i16> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP3]]) ; CHECK-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i32 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll index 032625a1199f9f..57b5d2af48ee60 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-transformed-operand.ll @@ -5,20 +5,16 @@ define void @test(i64 %d.promoted.i) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: i64 [[D_PROMOTED_I:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[AND_1_I:%.*]] = and i64 0, [[D_PROMOTED_I]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i64> , i64 [[AND_1_I]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = trunc <8 x i64> [[TMP0]] to <8 x i1> -; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i1> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[AND_1_I_1:%.*]] = and i64 0, 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i64> , i64 [[AND_1_I_1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = mul <8 x i1> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP2]]) -; CHECK-NEXT: [[TMP7:%.*]] = zext i1 [[TMP6]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> , i64 0, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[D_PROMOTED_I]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i1> [[TMP3]], <2 x i1> poison, <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i1> , <16 x i1> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul <16 x i1> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP6]]) ; CHECK-NEXT: [[TMP9:%.*]] = zext i1 [[TMP8]] to i32 -; CHECK-NEXT: [[OP_RDX:%.*]] = or i32 [[TMP7]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[OP_RDX]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP9]], 0 ; CHECK-NEXT: store i32 [[TMP10]], ptr null, align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll index b7237cbb02bb32..4ed52247c2ef37 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -11,26 +11,31 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv() ; CHECK: if.then22.i: ; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1 ; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], -; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5 -; CHECK-NEXT: [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6 -; CHECK-NEXT: [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_4_I_I]], i32 5 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_5_I_I]], i32 6 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[SHR_6_I_I]], i32 7 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = trunc <16 x i32> [[TMP13]] to <16 x i8> -; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i8> [[TMP14]], -; CHECK-NEXT: store <16 x i8> [[TMP15]], ptr undef, align 1 +; CHECK-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[CONV31_I]], 1 +; CHECK-NEXT: [[SHR_1_I_I:%.*]] = lshr i32 [[CONV31_I]], 2 +; CHECK-NEXT: [[SHR_2_I_I:%.*]] = lshr i32 [[CONV31_I]], 3 +; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[SUB_I]] to i8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[SHR_I_I]] to i8 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[SHR_1_I_I]] to i8 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[SHR_2_I_I]] to i8 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i32> [[TMP10]] to <4 x i8> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP12]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP14]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = lshr <8 x i32> [[TMP15]], +; CHECK-NEXT: [[TMP17:%.*]] = trunc <8 x i32> [[TMP16]] to <8 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP17]], <8 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP18]], <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i8> [[TMP19]], +; CHECK-NEXT: store <16 x i8> [[TMP20]], ptr undef, align 1 ; CHECK-NEXT: unreachable ; CHECK: if.end50.i: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll index 143052a3d9cd07..c2555889f59816 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-160 | FileCheck %s +; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-163 | FileCheck %s define void @test1(i128 %p0, i128 %p1, i128 %p2, i128 %p3, <4 x i128> %vec) { ; CHECK-LABEL: @test1( @@ -14,13 +14,14 @@ define void @test1(i128 %p0, i128 %p1, i128 %p2, i128 %p3, <4 x i128> %vec) { ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[T5:%.*]] = trunc i128 [[P1]] to i32 ; CHECK-NEXT: [[TMP8:%.*]] = sdiv <4 x i32> [[TMP3]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i128> [[TMP1]], <2 x i128> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i128> [[VEC:%.*]], <4 x i128> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i128> [[TMP10]] to <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = sdiv <4 x i32> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i128> [[VEC:%.*]] to <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i128> [[VEC]] to <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = sdiv <4 x i32> [[TMP8]], [[TMP12]] ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP12]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP13]], [[ENTRY:%.*]] ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/orig-btiwidth-les-projected.ll b/llvm/test/Transforms/SLPVectorizer/orig-btiwidth-les-projected.ll index 531e964053482e..88503aeb6071ff 100644 --- a/llvm/test/Transforms/SLPVectorizer/orig-btiwidth-les-projected.ll +++ b/llvm/test/Transforms/SLPVectorizer/orig-btiwidth-les-projected.ll @@ -5,10 +5,10 @@ define i32 @test(i4 %0) { ; CHECK-LABEL: define i32 @test( ; CHECK-SAME: i4 [[TMP0:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = trunc i8 0 to i4 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i8 0 to i4 -; CHECK-NEXT: [[ADD_R:%.*]] = or i4 [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[ADD_R14:%.*]] = or i4 0, [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i4> , i4 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i4> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[ADD_R:%.*]] = extractelement <2 x i4> [[TMP2]], i32 0 +; CHECK-NEXT: [[ADD_R14:%.*]] = extractelement <2 x i4> [[TMP2]], i32 1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i4 [[ADD_R]], [[ADD_R14]] ; CHECK-NEXT: ret i32 0 ;