From 9b9405621bcc55b74d2177c960c21f62cc95e6fd Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Mon, 1 Jul 2024 08:41:19 +0800 Subject: [PATCH] [X86] Relax VPERMV3 to VPERMV combine for more types (#97206) This is a follow up of #96414 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 8 +++---- llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 24 ++++++++++++++++++++ 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1d4af62c3227d7..8eadf079d4f2f3 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -41334,15 +41334,13 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, return SDValue(); } case X86ISD::VPERMV3: { - // VPERM[I,T]2[B,W] are 3 uops on Skylake and Icelake so we try to use - // VPERMV. + // Combine VPERMV3 to widened VPERMV if the two source operands are split + // from the same vector. SDValue V1 = peekThroughBitcasts(N.getOperand(0)); SDValue V2 = peekThroughBitcasts(N.getOperand(2)); MVT SVT = V1.getSimpleValueType(); - MVT EVT = VT.getVectorElementType(); MVT NVT = VT.getDoubleNumVectorElementsVT(); - if ((EVT == MVT::i8 || EVT == MVT::i16) && - (NVT.is256BitVector() || + if ((NVT.is256BitVector() || (NVT.is512BitVector() && Subtarget.hasEVEX512())) && V1.getOpcode() == ISD::EXTRACT_SUBVECTOR && V1.getConstantOperandVal(1) == 0 && diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index fc7c8facb9d5e2..f1c70378b1eb34 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -7008,6 +7008,30 @@ define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> ret <4 x double> %1 } +define <8 x i32> @combine_vpermi2d_vpermps(<16 x i32> noundef %a) { +; X86-LABEL: combine_vpermi2d_vpermps: +; X86: # %bb.0: +; X86-NEXT: vmovaps {{.*#+}} ymm1 = [14,13,6,3,5,15,0,1] +; X86-NEXT: # 
EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x0d,A,A,A,A] +; X86-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-NEXT: vpermps %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x16,0xc0] +; X86-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: combine_vpermi2d_vpermps: +; X64: # %bb.0: +; X64-NEXT: vmovaps {{.*#+}} ymm1 = [14,13,6,3,5,15,0,1] +; X64-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x0d,A,A,A,A] +; X64-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vpermps %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x16,0xc0] +; X64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; X64-NEXT: retq # encoding: [0xc3] + %1 = shufflevector <16 x i32> %a, <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %2 = shufflevector <16 x i32> %a, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %1, <8 x i32> <i32 14, i32 13, i32 6, i32 3, i32 5, i32 15, i32 0, i32 1>, <8 x i32> %2) + ret <8 x i32> %3 +} + declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>)