diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 51387a33daadfd..d30f16838a8573 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1131,50 +1131,51 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, const Instruction *CxtI) { if (!isa(VT)) return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp); - + Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp); // Larger vector widths may require additional instructions, but are // typically cheaper than scalarized versions. unsigned NumVectorElts = cast(VT)->getNumElements(); - if (ST->hasVOP3PInsts()) { - if (DL.getTypeSizeInBits(VT->getElementType()) == 16) { - unsigned RequestedElts = - count_if(Mask, [](int MaskElt) { return MaskElt != -1; }); - if (RequestedElts == 0) + if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && + DL.getTypeSizeInBits(VT->getElementType()) == 16) { + bool HasVOP3P = ST->hasVOP3PInsts(); + unsigned RequestedElts = + count_if(Mask, [](int MaskElt) { return MaskElt != -1; }); + if (RequestedElts == 0) + return 0; + switch (Kind) { + case TTI::SK_Broadcast: + case TTI::SK_Reverse: + case TTI::SK_PermuteSingleSrc: { + // With op_sel VOP3P instructions freely can access the low half or high + // half of a register, so any swizzle of two elements is free. + if (HasVOP3P && NumVectorElts == 2) return 0; - switch (Kind) { - case TTI::SK_Broadcast: - case TTI::SK_Reverse: - case TTI::SK_PermuteSingleSrc: { - // With op_sel VOP3P instructions freely can access the low half or high - // half of a register, so any swizzle of two elements is free. - if (NumVectorElts == 2) - return 0; - unsigned NumPerms = alignTo(RequestedElts,2) / 2; - // SK_Broadcast just reuses the same mask - unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms; - return NumPerms + NumPermMasks; - } - case TTI::SK_ExtractSubvector: - case TTI::SK_InsertSubvector: { - if (NumVectorElts == 2) - return 0; - // Insert/extract subvectors require only shifts / extract code to get the relevant bits - return alignTo(RequestedElts,2) / 2; - } - case TTI::SK_PermuteTwoSrc: - case TTI::SK_Splice: - case TTI::SK_Select: { - unsigned NumPerms = alignTo(RequestedElts,2) / 2; - // SK_Select just reuses the same mask - unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms; - return NumPerms + NumPermMasks; - } + unsigned NumPerms = alignTo(RequestedElts, 2) / 2; + // SK_Broadcast just reuses the same mask + unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms; + return NumPerms + NumPermMasks; + } + case TTI::SK_ExtractSubvector: + case TTI::SK_InsertSubvector: { + if (HasVOP3P && NumVectorElts == 2) + return 0; + // Insert/extract subvectors require only shifts / extract code to get the + // relevant bits + return alignTo(RequestedElts, 2) / 2; + } + case TTI::SK_PermuteTwoSrc: + case TTI::SK_Splice: + case TTI::SK_Select: { + unsigned NumPerms = alignTo(RequestedElts, 2) / 2; + // SK_Select just reuses the same mask + unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms; + return NumPerms + NumPermMasks; + } - default: - break; - } + default: + break; } } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll index 6684c125aa8343..bb637cc57b2854 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll @@ -86,13 +86,13 @@ define amdgpu_kernel void @shufflevector_i16(<2 x i16> %vec1, <2 x i16> %vec2) { ; GFX9-10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; VI-LABEL: 'shufflevector_i16' -; VI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf00 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> zeroinitializer +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf00 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> zeroinitializer ; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf22 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> @@ -123,13 +123,13 @@ define amdgpu_kernel void @shufflevector_i16(<2 x i16> %vec1, <2 x i16> %vec2) { ; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf00_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> zeroinitializer +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf00_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> zeroinitializer ; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf22_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> @@ -240,13 +240,13 @@ define amdgpu_kernel void @shufflevector_i16(<2 x i16> %vec1, <2 x i16> %vec2) { ; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; VI-SIZE-LABEL: 'shufflevector_i16' -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf00 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> zeroinitializer +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf00 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> zeroinitializer ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf22 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> @@ -277,13 +277,13 @@ define amdgpu_kernel void @shufflevector_i16(<2 x i16> %vec1, <2 x i16> %vec2) { ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf00_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> zeroinitializer +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf00_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> zeroinitializer ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf22_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> @@ -916,10 +916,10 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> ; VI-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i16_4_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <4 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> +; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <2 x i32> ; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <4 x i32> @@ -990,10 +990,10 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i16_4_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <4 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> +; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <2 x i32> ; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <4 x i32> @@ -1046,137 +1046,71 @@ define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> } define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i8> %i8v8, <8 x i8> %i8v8_2, <2 x half> %halfv2, <2 x half> %halfv2_2, <4 x half> %halfv4, <4 x half> %halfv4_2, <8 x half> %halfv8, <8 x half> %halfv8_2, <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i16> %i16v8, <8 x i16> %i16v8_2, <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> %i32v4, <4 x i32> %i32v4_2, <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x float> %floatv4, <4 x float> %floatv4_2,<2 x i64> %i64v2, <2 x i64> %i64v2_2,<2 x double> %doublev2, <2 x double> %doublev2_2) { -; GFX9-10-LABEL: 'concat' -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8, <16 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_2 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2_2, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16_2 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4_2, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16_2 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8_2, <16 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <8 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <4 x i32> -; GFX9-10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void -; -; VI-LABEL: 'concat' -; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8f16 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16f16 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8, <16 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_2 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2_2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8f16_2 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4_2, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16f16_2 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8_2, <16 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <8 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <4 x i32> -; VI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void -; -; GFX9-10-SIZE-LABEL: 'concat' -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8, <16 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_2 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2_2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16_2 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4_2, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16_2 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8_2, <16 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <8 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <4 x i32> -; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; ALL-LABEL: 'concat' +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <4 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <8 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <4 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2, <4 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4, <8 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8, <16 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <4 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <8 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <4 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_2 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2_2, <4 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16_2 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4_2, <8 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16_2 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8_2, <16 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <8 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <4 x i32> +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; -; VI-SIZE-LABEL: 'concat' -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8f16 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16f16 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8, <16 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16_2 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2_2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8f16_2 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4_2, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16f16_2 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8_2, <16 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <8 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <4 x i32> -; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; ALL-SIZE-LABEL: 'concat' +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <4 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <8 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <4 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2, <4 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4, <8 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8, <16 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <4 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <8 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <4 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16_2 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2_2, <4 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16_2 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4_2, <8 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16_2 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8_2, <16 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <8 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <4 x i32> +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll index 4effd183505577..3749bdf1bba394 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll @@ -323,19 +323,14 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) { ; ; GFX8-LABEL: @uadd_sat_v4i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 2 -; GFX8-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3 -; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 2 -; GFX8-NEXT: [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3 -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> ; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) -; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX8-NEXT: [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]]) -; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> -; GFX8-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[ADD_2]], i64 2 -; GFX8-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3 -; GFX8-NEXT: ret <4 x i16> [[INS_3]] +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]]) +; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> +; GFX8-NEXT: ret <4 x i16> [[INS_31]] ; ; GFX9-LABEL: @uadd_sat_v4i16( ; GFX9-NEXT: bb: diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll index 6465508d3610ec..0bb641371825b5 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll @@ -323,19 +323,14 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) { ; ; GFX8-LABEL: @uadd_sat_v4i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 2 -; GFX8-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3 -; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 2 -; GFX8-NEXT: [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3 -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> ; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) -; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX8-NEXT: [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]]) -; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> -; GFX8-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[ADD_2]], i64 2 -; GFX8-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3 -; GFX8-NEXT: ret <4 x i16> [[INS_3]] +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]]) +; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> +; GFX8-NEXT: ret <4 x i16> [[INS_31]] ; ; GFX9-LABEL: @uadd_sat_v4i16( ; GFX9-NEXT: bb: diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll index b34b9a3525363f..dfa8be97417792 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll @@ -3,21 +3,10 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,dce < %s | FileCheck -check-prefixes=GCN,VI %s define half @reduction_half4(<4 x half> %a) { -; GFX9-LABEL: @reduction_half4( -; GFX9-NEXT: entry: -; GFX9-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> [[A:%.*]]) -; GFX9-NEXT: ret half [[TMP0]] -; -; VI-LABEL: @reduction_half4( -; VI-NEXT: entry: -; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0 -; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[A]], i64 1 -; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[A]], i64 2 -; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[A]], i64 3 -; VI-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]] -; VI-NEXT: [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]] -; VI-NEXT: [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]] -; VI-NEXT: ret half [[ADD3]] +; GCN-LABEL: @reduction_half4( +; GCN-NEXT: entry: +; GCN-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> [[A:%.*]]) +; GCN-NEXT: ret half [[TMP0]] ; entry: %elt0 = extractelement <4 x half> %a, i64 0 @@ -33,29 +22,10 @@ entry: } define half @reduction_half8(<8 x half> %vec8) { -; GFX9-LABEL: @reduction_half8( -; GFX9-NEXT: entry: -; GFX9-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[VEC8:%.*]]) -; GFX9-NEXT: ret half [[TMP0]] -; -; VI-LABEL: @reduction_half8( -; VI-NEXT: entry: -; VI-NEXT: [[ELT0:%.*]] = extractelement <8 x half> [[VEC8:%.*]], i64 0 -; VI-NEXT: [[ELT1:%.*]] = extractelement <8 x half> [[VEC8]], i64 1 -; VI-NEXT: [[ELT2:%.*]] = extractelement <8 x half> [[VEC8]], i64 2 -; VI-NEXT: [[ELT3:%.*]] = extractelement <8 x half> [[VEC8]], i64 3 -; VI-NEXT: [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4 -; VI-NEXT: [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5 -; VI-NEXT: [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6 -; VI-NEXT: [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7 -; VI-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]] -; VI-NEXT: [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]] -; VI-NEXT: [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]] -; VI-NEXT: [[ADD4:%.*]] = fadd fast half [[ELT4]], [[ADD3]] -; VI-NEXT: [[ADD5:%.*]] = fadd fast half [[ELT5]], [[ADD4]] -; VI-NEXT: [[ADD6:%.*]] = fadd fast half [[ELT6]], [[ADD5]] -; VI-NEXT: [[ADD7:%.*]] = fadd fast half [[ELT7]], [[ADD6]] -; VI-NEXT: ret half [[ADD7]] +; GCN-LABEL: @reduction_half8( +; GCN-NEXT: entry: +; GCN-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[VEC8:%.*]]) +; GCN-NEXT: ret half [[TMP0]] ; entry: %elt0 = extractelement <8 x half> %vec8, i64 0 @@ -86,15 +56,7 @@ define half @reduction_half16(<16 x half> %vec16) { ; ; VI-LABEL: @reduction_half16( ; VI-NEXT: entry: -; VI-NEXT: [[ELT0:%.*]] = extractelement <16 x half> [[VEC16:%.*]], i64 0 -; VI-NEXT: [[ELT1:%.*]] = extractelement <16 x half> [[VEC16]], i64 1 -; VI-NEXT: [[ELT2:%.*]] = extractelement <16 x half> [[VEC16]], i64 2 -; VI-NEXT: [[ELT3:%.*]] = extractelement <16 x half> [[VEC16]], i64 3 -; VI-NEXT: [[ELT4:%.*]] = extractelement <16 x half> [[VEC16]], i64 4 -; VI-NEXT: [[ELT5:%.*]] = extractelement <16 x half> [[VEC16]], i64 5 -; VI-NEXT: [[ELT6:%.*]] = extractelement <16 x half> [[VEC16]], i64 6 -; VI-NEXT: [[ELT7:%.*]] = extractelement <16 x half> [[VEC16]], i64 7 -; VI-NEXT: [[ELT8:%.*]] = extractelement <16 x half> [[VEC16]], i64 8 +; VI-NEXT: [[ELT8:%.*]] = extractelement <16 x half> [[VEC16:%.*]], i64 8 ; VI-NEXT: [[ELT9:%.*]] = extractelement <16 x half> [[VEC16]], i64 9 ; VI-NEXT: [[ELT10:%.*]] = extractelement <16 x half> [[VEC16]], i64 10 ; VI-NEXT: [[ELT11:%.*]] = extractelement <16 x half> [[VEC16]], i64 11 @@ -102,22 +64,17 @@ define half @reduction_half16(<16 x half> %vec16) { ; VI-NEXT: [[ELT13:%.*]] = extractelement <16 x half> [[VEC16]], i64 13 ; VI-NEXT: [[ELT14:%.*]] = extractelement <16 x half> [[VEC16]], i64 14 ; VI-NEXT: [[ELT15:%.*]] = extractelement <16 x half> [[VEC16]], i64 15 -; VI-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]] -; VI-NEXT: [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]] -; VI-NEXT: [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]] -; VI-NEXT: [[ADD4:%.*]] = fadd fast half [[ELT4]], [[ADD3]] -; VI-NEXT: [[ADD5:%.*]] = fadd fast half [[ELT5]], [[ADD4]] -; VI-NEXT: [[ADD6:%.*]] = fadd fast half [[ELT6]], [[ADD5]] -; VI-NEXT: [[ADD7:%.*]] = fadd fast half [[ELT7]], [[ADD6]] -; VI-NEXT: [[ADD8:%.*]] = fadd fast half [[ELT8]], [[ADD7]] -; VI-NEXT: [[ADD9:%.*]] = fadd fast half [[ELT9]], [[ADD8]] -; VI-NEXT: [[ADD10:%.*]] = fadd fast half [[ELT10]], [[ADD9]] -; VI-NEXT: [[ADD11:%.*]] = fadd fast half [[ELT11]], [[ADD10]] -; VI-NEXT: [[ADD12:%.*]] = fadd fast half [[ELT12]], [[ADD11]] -; VI-NEXT: [[ADD13:%.*]] = fadd fast half [[ELT13]], [[ADD12]] -; VI-NEXT: [[ADD14:%.*]] = fadd fast half [[ELT14]], [[ADD13]] -; VI-NEXT: [[ADD15:%.*]] = fadd fast half [[ELT15]], [[ADD14]] -; VI-NEXT: ret half [[ADD15]] +; VI-NEXT: [[TMP0:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <8 x i32> +; VI-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[TMP0]]) +; VI-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[ELT8]] +; VI-NEXT: [[OP_RDX1:%.*]] = fadd fast half [[ELT9]], [[ELT10]] +; VI-NEXT: [[OP_RDX2:%.*]] = fadd fast half [[ELT11]], [[ELT12]] +; VI-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[ELT13]], [[ELT14]] +; VI-NEXT: [[OP_RDX4:%.*]] = fadd fast half [[OP_RDX]], [[OP_RDX1]] +; VI-NEXT: [[OP_RDX5:%.*]] = fadd fast half [[OP_RDX2]], [[OP_RDX3]] +; VI-NEXT: [[OP_RDX6:%.*]] = fadd fast half [[OP_RDX4]], [[OP_RDX5]] +; VI-NEXT: [[OP_RDX7:%.*]] = fadd fast half [[OP_RDX6]], [[ELT15]] +; VI-NEXT: ret half [[OP_RDX7]] ; entry: %elt0 = extractelement <16 x half> %vec16, i64 0 @@ -183,21 +140,10 @@ entry: } define i16 @reduction_v4i16(<4 x i16> %a) { -; GFX9-LABEL: @reduction_v4i16( -; GFX9-NEXT: entry: -; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[A:%.*]]) -; GFX9-NEXT: ret i16 [[TMP0]] -; -; VI-LABEL: @reduction_v4i16( -; VI-NEXT: entry: -; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x i16> [[A:%.*]], i64 0 -; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x i16> [[A]], i64 1 -; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x i16> [[A]], i64 2 -; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x i16> [[A]], i64 3 -; VI-NEXT: [[ADD1:%.*]] = add i16 [[ELT1]], [[ELT0]] -; VI-NEXT: [[ADD2:%.*]] = add i16 [[ELT2]], [[ADD1]] -; VI-NEXT: [[ADD3:%.*]] = add i16 [[ELT3]], [[ADD2]] -; VI-NEXT: ret i16 [[ADD3]] +; GCN-LABEL: @reduction_v4i16( +; GCN-NEXT: entry: +; GCN-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[A:%.*]]) +; GCN-NEXT: ret i16 [[TMP0]] ; entry: %elt0 = extractelement <4 x i16> %a, i64 0 @@ -213,29 +159,10 @@ entry: } define i16 @reduction_v8i16(<8 x i16> %vec8) { -; GFX9-LABEL: @reduction_v8i16( -; GFX9-NEXT: entry: -; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[VEC8:%.*]]) -; GFX9-NEXT: ret i16 [[TMP0]] -; -; VI-LABEL: @reduction_v8i16( -; VI-NEXT: entry: -; VI-NEXT: [[ELT0:%.*]] = extractelement <8 x i16> [[VEC8:%.*]], i64 0 -; VI-NEXT: [[ELT1:%.*]] = extractelement <8 x i16> [[VEC8]], i64 1 -; VI-NEXT: [[ELT2:%.*]] = extractelement <8 x i16> [[VEC8]], i64 2 -; VI-NEXT: [[ELT3:%.*]] = extractelement <8 x i16> [[VEC8]], i64 3 -; VI-NEXT: [[ELT4:%.*]] = extractelement <8 x i16> [[VEC8]], i64 4 -; VI-NEXT: [[ELT5:%.*]] = extractelement <8 x i16> [[VEC8]], i64 5 -; VI-NEXT: [[ELT6:%.*]] = extractelement <8 x i16> [[VEC8]], i64 6 -; VI-NEXT: [[ELT7:%.*]] = extractelement <8 x i16> [[VEC8]], i64 7 -; VI-NEXT: [[ADD1:%.*]] = add i16 [[ELT1]], [[ELT0]] -; VI-NEXT: [[ADD2:%.*]] = add i16 [[ELT2]], [[ADD1]] -; VI-NEXT: [[ADD3:%.*]] = add i16 [[ELT3]], [[ADD2]] -; VI-NEXT: [[ADD4:%.*]] = add i16 [[ELT4]], [[ADD3]] -; VI-NEXT: [[ADD5:%.*]] = add i16 [[ELT5]], [[ADD4]] -; VI-NEXT: [[ADD6:%.*]] = add i16 [[ELT6]], [[ADD5]] -; VI-NEXT: [[ADD7:%.*]] = add i16 [[ELT7]], [[ADD6]] -; VI-NEXT: ret i16 [[ADD7]] +; GCN-LABEL: @reduction_v8i16( +; GCN-NEXT: entry: +; GCN-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[VEC8:%.*]]) +; GCN-NEXT: ret i16 [[TMP0]] ; entry: %elt0 = extractelement <8 x i16> %vec8, i64 0