From 6c5277baf558c0f3f17043b1adbed54679191779 Mon Sep 17 00:00:00 2001 From: David Majnemer Date: Mon, 30 Sep 2024 20:26:54 +0000 Subject: [PATCH] [X86] Decode VPTERNLOG truth tables when disassembling Alongside something like: vpternlogq zmm0, zmm2, zmm1, 64 We will now have a comment on the right like: # zmm0 = zmm0 & zmm2 & ~zmm1 This makes it easy to tell at a glance what sort of truth table the instruction will provide. --- .../X86/MCTargetDesc/X86InstComments.cpp | 92 +++ .../CodeGen/X86/avx512-gfni-intrinsics.ll | 74 ++ .../CodeGen/X86/avx512-intrinsics-upgrade.ll | 10 + llvm/test/CodeGen/X86/avx512-vec-cmp.ll | 7 + .../X86/avx512vl-intrinsics-upgrade.ll | 20 + llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 20 + .../CodeGen/X86/sse-intrinsics-fast-isel.ll | 1 + .../CodeGen/X86/sse2-intrinsics-fast-isel.ll | 2 + .../CodeGen/X86/stack-folding-int-avx512.ll | 18 +- .../vector-interleaved-load-i16-stride-5.ll | 92 +-- .../vector-interleaved-load-i16-stride-6.ll | 244 ++++--- .../vector-interleaved-load-i16-stride-7.ll | 158 ++-- .../vector-interleaved-load-i8-stride-6.ll | 492 ++++++------- .../vector-interleaved-store-i16-stride-5.ll | 308 ++++---- .../vector-interleaved-store-i16-stride-6.ll | 123 ++-- .../vector-interleaved-store-i16-stride-7.ll | 672 +++++++++--------- .../vector-interleaved-store-i8-stride-8.ll | 58 +- 17 files changed, 1370 insertions(+), 1021 deletions(-) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp index a4b72515252a08..534717a4bea4ea 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp @@ -39,6 +39,11 @@ using namespace llvm; CASE_MASK_INS_COMMON(Inst, Suffix, src) \ CASE_MASKZ_INS_COMMON(Inst, Suffix, src) +#define CASE_PTERNLOG(Inst, src) \ + CASE_AVX512_INS_COMMON(Inst, Z, r##src##i) \ + CASE_AVX512_INS_COMMON(Inst, Z256, r##src##i) \ + CASE_AVX512_INS_COMMON(Inst, Z128, r##src##i) + 
#define CASE_MOVDUP(Inst, src) \ CASE_AVX512_INS_COMMON(Inst, Z, r##src) \ CASE_AVX512_INS_COMMON(Inst, Z256, r##src) \ @@ -617,6 +622,90 @@ static bool printFMAComments(const MCInst *MI, raw_ostream &OS, return true; } +static bool printPTERNLOGComments(const MCInst *MI, raw_ostream &OS, + const MCInstrInfo &MCII) { + unsigned NumOperands = MI->getNumOperands(); + + int Src2Idx; + int Src3Idx; + switch (MI->getOpcode()) { + // dest, src1, src2, src3, tbl + // dest, src1, mask, src2, src3, tbl + CASE_PTERNLOG(PTERNLOGD, r) + CASE_PTERNLOG(PTERNLOGQ, r) + Src2Idx = NumOperands - 3; + Src3Idx = NumOperands - 2; + break; + + // dest, src1, src2, memory, tbl + // dest, src1, mask, src2, memory, tbl + CASE_PTERNLOG(PTERNLOGD, m) + CASE_PTERNLOG(PTERNLOGQ, m) + CASE_PTERNLOG(PTERNLOGD, mb) + CASE_PTERNLOG(PTERNLOGQ, mb) + Src2Idx = NumOperands - 7; + Src3Idx = -1; + break; + + default: + return false; + } + const char *DestName = getRegName(MI->getOperand(0).getReg()); + const char *Src1Name = getRegName(MI->getOperand(1).getReg()); + const char *Src2Name = getRegName(MI->getOperand(Src2Idx).getReg()); + const char *Src3Name = + Src3Idx != -1 ? getRegName(MI->getOperand(Src3Idx).getReg()) : "mem"; + uint8_t TruthTable = MI->getOperand(NumOperands - 1).getImm(); + + OS << DestName; + printMasking(OS, MI, MCII); + OS << " = "; + + constexpr unsigned kNumVariables = 3; + constexpr unsigned kNumTruthTableEntries = 1 << kNumVariables; + int NumMinterms = llvm::popcount(TruthTable); + if (NumMinterms == 0) { + OS << '0'; + } else if (NumMinterms == kNumTruthTableEntries) { + OS << "-1"; + } else { + while (TruthTable != 0) { + // Index of the lowest bit set. + unsigned I = llvm::countr_zero(TruthTable); + // Clear the lowest bit set. + TruthTable &= TruthTable - 1; + // Our index tells us which sources are and are not complemented. Note + // that the indexing goes left-to-right. 
+ bool Src1 = I & 0b100; + bool Src2 = I & 0b010; + bool Src3 = I & 0b001; + + // Group in parentheses to make the output more obvious but only if there + // are multiple terms. + if (NumMinterms > 1) + OS << '('; + + if (!Src1) + OS << '~'; + OS << Src1Name << " & "; + if (!Src2) + OS << '~'; + OS << Src2Name << " & "; + if (!Src3) + OS << '~'; + OS << Src3Name; + + if (NumMinterms > 1) + OS << ')'; + + // Output an OR if there is another term in the table. + if (TruthTable != 0) + OS << " | "; + } + } + OS << '\n'; + return true; +} //===----------------------------------------------------------------------===// // Top Level Entrypoint @@ -636,6 +725,9 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, if (printFMAComments(MI, OS, MCII)) return true; + if (printPTERNLOGComments(MI, OS, MCII)) + return true; + switch (MI->getOpcode()) { default: // Not an instruction for which we can decode comments. diff --git a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll index bafa33ff9a1c8a..432d27ac04eda9 100644 --- a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll @@ -33,9 +33,11 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineinvqb_128(<16 x i8> ; X86NOBW-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xe1,0x04] ; X86NOBW-NEXT: vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xc1,0x05] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X86NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xcd] ; X86NOBW-NEXT: vpand %xmm4, %xmm5, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xd1,0xdb,0xcc] ; X86NOBW-NEXT: vpternlogq $184, %xmm0, %xmm5, %xmm2 # encoding: [0x62,0xf3,0xd5,0x08,0x25,0xd0,0xb8] +; 
X86NOBW-NEXT: # xmm2 = (~xmm2 & xmm5 & xmm0) | (xmm2 & ~xmm5 & ~xmm0) | (xmm2 & ~xmm5 & xmm0) | (xmm2 & xmm5 & xmm0) ; X86NOBW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] ; X86NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86NOBW-NEXT: retl # encoding: [0xc3] @@ -47,9 +49,11 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineinvqb_128(<16 x i8> ; X64NOBW-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xe1,0x04] ; X64NOBW-NEXT: vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xc1,0x05] ; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X64NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm1, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xcd] ; X64NOBW-NEXT: vpand %xmm4, %xmm5, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xd1,0xdb,0xcc] ; X64NOBW-NEXT: vpternlogq $184, %xmm0, %xmm5, %xmm2 # encoding: [0x62,0xf3,0xd5,0x08,0x25,0xd0,0xb8] +; X64NOBW-NEXT: # xmm2 = (~xmm2 & xmm5 & xmm0) | (xmm2 & ~xmm5 & ~xmm0) | (xmm2 & ~xmm5 & xmm0) | (xmm2 & xmm5 & xmm0) ; X64NOBW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] ; X64NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64NOBW-NEXT: retq # encoding: [0xc3] @@ -95,12 +99,15 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineinvqb_256(<32 x i8> ; X86NOBW-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xe1,0x04] ; X86NOBW-NEXT: vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xc1,0x05] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X86NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; 
X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff] +; X86NOBW-NEXT: # zmm5 {%k2} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X86NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xed,0x01] ; X86NOBW-NEXT: vpand %ymm4, %ymm5, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xd5,0xdb,0xcc] ; X86NOBW-NEXT: vpternlogq $184, %ymm0, %ymm5, %ymm2 # encoding: [0x62,0xf3,0xd5,0x28,0x25,0xd0,0xb8] +; X86NOBW-NEXT: # ymm2 = (~ymm2 & ymm5 & ymm0) | (ymm2 & ~ymm5 & ~ymm0) | (ymm2 & ~ymm5 & ymm0) | (ymm2 & ymm5 & ymm0) ; X86NOBW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] ; X86NOBW-NEXT: retl # encoding: [0xc3] ; @@ -113,12 +120,15 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineinvqb_256(<32 x i8> ; X64NOBW-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xe1,0x04] ; X64NOBW-NEXT: vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xc1,0x05] ; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X64NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X64NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff] +; X64NOBW-NEXT: # zmm5 {%k2} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X64NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xed,0x01] ; X64NOBW-NEXT: vpand %ymm4, %ymm5, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xd5,0xdb,0xcc] ; X64NOBW-NEXT: vpternlogq $184, %ymm0, %ymm5, %ymm2 # encoding: [0x62,0xf3,0xd5,0x28,0x25,0xd0,0xb8] +; X64NOBW-NEXT: # ymm2 = (~ymm2 & ymm5 & ymm0) | 
(ymm2 & ~ymm5 & ~ymm0) | (ymm2 & ~ymm5 & ymm0) | (ymm2 & ymm5 & ymm0) ; X64NOBW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] ; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i32 %mask to <32 x i1> @@ -166,18 +176,23 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineinvqb_512(<64 x i8> ; X86NOBW-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm3, %zmm4 # encoding: [0x62,0xf3,0xe5,0x48,0xcf,0xe1,0x04] ; X86NOBW-NEXT: vgf2p8affineinvqb $5, %zmm1, %zmm3, %zmm3 # encoding: [0x62,0xf3,0xe5,0x48,0xcf,0xd9,0x05] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff] +; X86NOBW-NEXT: # zmm1 {%k4} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} # encoding: [0x62,0xf3,0x55,0xcb,0x25,0xed,0xff] +; X86NOBW-NEXT: # zmm5 {%k3} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X86NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xcd,0x01] ; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff] +; X86NOBW-NEXT: # zmm5 {%k2} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X86NOBW-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} # encoding: [0x62,0xf3,0x4d,0xc9,0x25,0xf6,0xff] +; X86NOBW-NEXT: # zmm6 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm6, %xmm6 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xf6] ; X86NOBW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x55,0x38,0xee,0x01] ; X86NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm5 # encoding: [0x62,0xf3,0xd5,0x48,0x3a,0xe9,0x01] ; X86NOBW-NEXT: vpandq %zmm4, %zmm5, %zmm1 # encoding: [0x62,0xf1,0xd5,0x48,0xdb,0xcc] ; X86NOBW-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm2 # encoding: 
[0x62,0xf3,0xd5,0x48,0x25,0xd3,0xb8] +; X86NOBW-NEXT: # zmm2 = (~zmm2 & zmm5 & zmm3) | (zmm2 & ~zmm5 & ~zmm3) | (zmm2 & ~zmm5 & zmm3) | (zmm2 & zmm5 & zmm3) ; X86NOBW-NEXT: retl # encoding: [0xc3] ; ; X64NOBW-LABEL: test_vgf2p8affineinvqb_512: @@ -195,18 +210,23 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineinvqb_512(<64 x i8> ; X64NOBW-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xe1,0x04] ; X64NOBW-NEXT: vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xc1,0x05] ; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff] +; X64NOBW-NEXT: # zmm1 {%k4} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X64NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} # encoding: [0x62,0xf3,0x55,0xcb,0x25,0xed,0xff] +; X64NOBW-NEXT: # zmm5 {%k3} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X64NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xcd,0x01] ; X64NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} # encoding: [0x62,0xf3,0x55,0xc9,0x25,0xed,0xff] +; X64NOBW-NEXT: # zmm5 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X64NOBW-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} # encoding: [0x62,0xf3,0x4d,0xca,0x25,0xf6,0xff] +; X64NOBW-NEXT: # zmm6 {%k2} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm6, %xmm6 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xf6] ; X64NOBW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x55,0x38,0xee,0x01] ; X64NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm5 # encoding: [0x62,0xf3,0xd5,0x48,0x3a,0xe9,0x01] ; X64NOBW-NEXT: vpandq %zmm4, %zmm5, %zmm1 # encoding: [0x62,0xf1,0xd5,0x48,0xdb,0xcc] ; X64NOBW-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm2 # encoding: 
[0x62,0xf3,0xd5,0x48,0x25,0xd0,0xb8] +; X64NOBW-NEXT: # zmm2 = (~zmm2 & zmm5 & zmm0) | (zmm2 & ~zmm5 & ~zmm0) | (zmm2 & ~zmm5 & zmm0) | (zmm2 & zmm5 & zmm0) ; X64NOBW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] ; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i64 %mask to <64 x i1> @@ -250,9 +270,11 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineqb_128(<16 x i8> %s ; X86NOBW-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xe1,0x04] ; X86NOBW-NEXT: vgf2p8affineqb $5, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xc1,0x05] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X86NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xcd] ; X86NOBW-NEXT: vpand %xmm4, %xmm5, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xd1,0xdb,0xcc] ; X86NOBW-NEXT: vpternlogq $184, %xmm0, %xmm5, %xmm2 # encoding: [0x62,0xf3,0xd5,0x08,0x25,0xd0,0xb8] +; X86NOBW-NEXT: # xmm2 = (~xmm2 & xmm5 & xmm0) | (xmm2 & ~xmm5 & ~xmm0) | (xmm2 & ~xmm5 & xmm0) | (xmm2 & xmm5 & xmm0) ; X86NOBW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] ; X86NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86NOBW-NEXT: retl # encoding: [0xc3] @@ -264,9 +286,11 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineqb_128(<16 x i8> %s ; X64NOBW-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xe1,0x04] ; X64NOBW-NEXT: vgf2p8affineqb $5, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xc1,0x05] ; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X64NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm1, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xcd] ; 
X64NOBW-NEXT: vpand %xmm4, %xmm5, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xd1,0xdb,0xcc] ; X64NOBW-NEXT: vpternlogq $184, %xmm0, %xmm5, %xmm2 # encoding: [0x62,0xf3,0xd5,0x08,0x25,0xd0,0xb8] +; X64NOBW-NEXT: # xmm2 = (~xmm2 & xmm5 & xmm0) | (xmm2 & ~xmm5 & ~xmm0) | (xmm2 & ~xmm5 & xmm0) | (xmm2 & xmm5 & xmm0) ; X64NOBW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] ; X64NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64NOBW-NEXT: retq # encoding: [0xc3] @@ -312,12 +336,15 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineqb_256(<32 x i8> %s ; X86NOBW-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xe1,0x04] ; X86NOBW-NEXT: vgf2p8affineqb $5, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xc1,0x05] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X86NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff] +; X86NOBW-NEXT: # zmm5 {%k2} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X86NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xed,0x01] ; X86NOBW-NEXT: vpand %ymm4, %ymm5, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xd5,0xdb,0xcc] ; X86NOBW-NEXT: vpternlogq $184, %ymm0, %ymm5, %ymm2 # encoding: [0x62,0xf3,0xd5,0x28,0x25,0xd0,0xb8] +; X86NOBW-NEXT: # ymm2 = (~ymm2 & ymm5 & ymm0) | (ymm2 & ~ymm5 & ~ymm0) | (ymm2 & ~ymm5 & ymm0) | (ymm2 & ymm5 & ymm0) ; X86NOBW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] ; X86NOBW-NEXT: retl # encoding: [0xc3] ; @@ -330,12 +357,15 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineqb_256(<32 x 
i8> %s ; X64NOBW-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xe1,0x04] ; X64NOBW-NEXT: vgf2p8affineqb $5, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xc1,0x05] ; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X64NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X64NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff] +; X64NOBW-NEXT: # zmm5 {%k2} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X64NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xed,0x01] ; X64NOBW-NEXT: vpand %ymm4, %ymm5, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xd5,0xdb,0xcc] ; X64NOBW-NEXT: vpternlogq $184, %ymm0, %ymm5, %ymm2 # encoding: [0x62,0xf3,0xd5,0x28,0x25,0xd0,0xb8] +; X64NOBW-NEXT: # ymm2 = (~ymm2 & ymm5 & ymm0) | (ymm2 & ~ymm5 & ~ymm0) | (ymm2 & ~ymm5 & ymm0) | (ymm2 & ymm5 & ymm0) ; X64NOBW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] ; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i32 %mask to <32 x i1> @@ -383,18 +413,23 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineqb_512(<64 x i8> %s ; X86NOBW-NEXT: vgf2p8affineqb $4, %zmm1, %zmm3, %zmm4 # encoding: [0x62,0xf3,0xe5,0x48,0xce,0xe1,0x04] ; X86NOBW-NEXT: vgf2p8affineqb $5, %zmm1, %zmm3, %zmm3 # encoding: [0x62,0xf3,0xe5,0x48,0xce,0xd9,0x05] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff] +; X86NOBW-NEXT: # zmm1 {%k4} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} # encoding: [0x62,0xf3,0x55,0xcb,0x25,0xed,0xff] +; 
X86NOBW-NEXT: # zmm5 {%k3} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X86NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xcd,0x01] ; X86NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} # encoding: [0x62,0xf3,0x55,0xca,0x25,0xed,0xff] +; X86NOBW-NEXT: # zmm5 {%k2} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X86NOBW-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} # encoding: [0x62,0xf3,0x4d,0xc9,0x25,0xf6,0xff] +; X86NOBW-NEXT: # zmm6 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm6, %xmm6 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xf6] ; X86NOBW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x55,0x38,0xee,0x01] ; X86NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm5 # encoding: [0x62,0xf3,0xd5,0x48,0x3a,0xe9,0x01] ; X86NOBW-NEXT: vpandq %zmm4, %zmm5, %zmm1 # encoding: [0x62,0xf1,0xd5,0x48,0xdb,0xcc] ; X86NOBW-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm2 # encoding: [0x62,0xf3,0xd5,0x48,0x25,0xd3,0xb8] +; X86NOBW-NEXT: # zmm2 = (~zmm2 & zmm5 & zmm3) | (zmm2 & ~zmm5 & ~zmm3) | (zmm2 & ~zmm5 & zmm3) | (zmm2 & zmm5 & zmm3) ; X86NOBW-NEXT: retl # encoding: [0xc3] ; ; X64NOBW-LABEL: test_vgf2p8affineqb_512: @@ -412,18 +447,23 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineqb_512(<64 x i8> %s ; X64NOBW-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xe1,0x04] ; X64NOBW-NEXT: vgf2p8affineqb $5, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xc1,0x05] ; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff] +; X64NOBW-NEXT: # zmm1 {%k4} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X64NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} # encoding: [0x62,0xf3,0x55,0xcb,0x25,0xed,0xff] +; X64NOBW-NEXT: # zmm5 
{%k3} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X64NOBW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xcd,0x01] ; X64NOBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} # encoding: [0x62,0xf3,0x55,0xc9,0x25,0xed,0xff] +; X64NOBW-NEXT: # zmm5 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm5, %xmm5 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xed] ; X64NOBW-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} # encoding: [0x62,0xf3,0x4d,0xca,0x25,0xf6,0xff] +; X64NOBW-NEXT: # zmm6 {%k2} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm6, %xmm6 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xf6] ; X64NOBW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x55,0x38,0xee,0x01] ; X64NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm5 # encoding: [0x62,0xf3,0xd5,0x48,0x3a,0xe9,0x01] ; X64NOBW-NEXT: vpandq %zmm4, %zmm5, %zmm1 # encoding: [0x62,0xf1,0xd5,0x48,0xdb,0xcc] ; X64NOBW-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm2 # encoding: [0x62,0xf3,0xd5,0x48,0x25,0xd0,0xb8] +; X64NOBW-NEXT: # zmm2 = (~zmm2 & zmm5 & zmm0) | (zmm2 & ~zmm5 & ~zmm0) | (zmm2 & ~zmm5 & zmm0) | (zmm2 & zmm5 & zmm0) ; X64NOBW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] ; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i64 %mask to <64 x i1> @@ -468,8 +508,10 @@ define <16 x i8> @test_vgf2p8mulb_128_mask(<16 x i8> %src1, <16 x i8> %src2, <16 ; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86NOBW-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xc9] ; X86NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; X86NOBW-NEXT: # zmm0 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0] ; X86NOBW-NEXT: vpternlogq $202, %xmm2, %xmm1, %xmm0 # encoding: 
[0x62,0xf3,0xf5,0x08,0x25,0xc2,0xca] +; X86NOBW-NEXT: # xmm0 = (~xmm0 & ~xmm1 & xmm2) | (~xmm0 & xmm1 & xmm2) | (xmm0 & xmm1 & ~xmm2) | (xmm0 & xmm1 & xmm2) ; X86NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86NOBW-NEXT: retl # encoding: [0xc3] ; @@ -478,8 +520,10 @@ define <16 x i8> @test_vgf2p8mulb_128_mask(<16 x i8> %src1, <16 x i8> %src2, <16 ; X64NOBW-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64NOBW-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xc9] ; X64NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; X64NOBW-NEXT: # zmm0 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0] ; X64NOBW-NEXT: vpternlogq $202, %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf3,0xf5,0x08,0x25,0xc2,0xca] +; X64NOBW-NEXT: # xmm0 = (~xmm0 & ~xmm1 & xmm2) | (~xmm0 & xmm1 & xmm2) | (xmm0 & xmm1 & ~xmm2) | (xmm0 & xmm1 & xmm2) ; X64NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i16 %mask to <16 x i1> @@ -506,6 +550,7 @@ define <16 x i8> @test_vgf2p8mulb_128_maskz(<16 x i8> %src1, <16 x i8> %src2, i1 ; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86NOBW-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xc1] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X86NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X86NOBW-NEXT: vpand %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xdb,0xc0] ; X86NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] @@ -516,6 +561,7 @@ define <16 x i8> @test_vgf2p8mulb_128_maskz(<16 x i8> %src1, <16 x i8> %src2, i1 ; X64NOBW-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64NOBW-NEXT: vgf2p8mulb 
%xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xc1] ; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X64NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X64NOBW-NEXT: vpand %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xdb,0xc0] ; X64NOBW-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] @@ -557,11 +603,14 @@ define <32 x i8> @test_vgf2p8mulb_256_mask(<32 x i8> %src1, <32 x i8> %src2, <32 ; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06] ; X86NOBW-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xc9] ; X86NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; X86NOBW-NEXT: # zmm0 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0] ; X86NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} # encoding: [0x62,0xf3,0x65,0xca,0x25,0xdb,0xff] +; X86NOBW-NEXT: # zmm3 {%k2} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] ; X86NOBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc3,0x01] ; X86NOBW-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf3,0xf5,0x28,0x25,0xc2,0xca] +; X86NOBW-NEXT: # ymm0 = (~ymm0 & ~ymm1 & ymm2) | (~ymm0 & ymm1 & ymm2) | (ymm0 & ymm1 & ~ymm2) | (ymm0 & ymm1 & ymm2) ; X86NOBW-NEXT: retl # encoding: [0xc3] ; ; X64NOBW-LABEL: test_vgf2p8mulb_256_mask: @@ -571,11 +620,14 @@ define <32 x i8> @test_vgf2p8mulb_256_mask(<32 x i8> %src1, <32 x i8> %src2, <32 ; X64NOBW-NEXT: kmovw %edi, %k2 # encoding: [0xc5,0xf8,0x92,0xd7] ; X64NOBW-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xc9] ; X64NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 
{%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; X64NOBW-NEXT: # zmm0 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0] ; X64NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} # encoding: [0x62,0xf3,0x65,0xca,0x25,0xdb,0xff] +; X64NOBW-NEXT: # zmm3 {%k2} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] ; X64NOBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc3,0x01] ; X64NOBW-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf3,0xf5,0x28,0x25,0xc2,0xca] +; X64NOBW-NEXT: # ymm0 = (~ymm0 & ~ymm1 & ymm2) | (~ymm0 & ymm1 & ymm2) | (ymm0 & ymm1 & ~ymm2) | (ymm0 & ymm1 & ymm2) ; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i32 %mask to <32 x i1> %2 = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> %src1, <32 x i8> %src2) @@ -602,8 +654,10 @@ define <32 x i8> @test_vgf2p8mulb_256_maskz(<32 x i8> %src1, <32 x i8> %src2, i3 ; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k2 # encoding: [0xc5,0xf8,0x90,0x54,0x24,0x06] ; X86NOBW-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xc1] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X86NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X86NOBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} # encoding: [0x62,0xf3,0x6d,0xca,0x25,0xd2,0xff] +; X86NOBW-NEXT: # zmm2 {%k2} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm2, %xmm2 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xd2] ; X86NOBW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xca,0x01] ; X86NOBW-NEXT: vpand %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xdb,0xc0] @@ -616,8 +670,10 @@ define <32 x i8> @test_vgf2p8mulb_256_maskz(<32 x i8> %src1, <32 x i8> %src2, i3 ; 
X64NOBW-NEXT: kmovw %edi, %k2 # encoding: [0xc5,0xf8,0x92,0xd7] ; X64NOBW-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xc1] ; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc9,0xff] +; X64NOBW-NEXT: # zmm1 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X64NOBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} # encoding: [0x62,0xf3,0x6d,0xca,0x25,0xd2,0xff] +; X64NOBW-NEXT: # zmm2 {%k2} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm2, %xmm2 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xd2] ; X64NOBW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xca,0x01] ; X64NOBW-NEXT: vpand %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xdb,0xc0] @@ -661,17 +717,22 @@ define <64 x i8> @test_vgf2p8mulb_512_mask(<64 x i8> %src1, <64 x i8> %src2, <64 ; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k4 # encoding: [0xc5,0xf8,0x90,0x64,0x24,0x08] ; X86NOBW-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xc9] ; X86NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} # encoding: [0x62,0xf3,0x7d,0xcc,0x25,0xc0,0xff] +; X86NOBW-NEXT: # zmm0 {%k4} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0] ; X86NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z} # encoding: [0x62,0xf3,0x65,0xcb,0x25,0xdb,0xff] +; X86NOBW-NEXT: # zmm3 {%k3} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] ; X86NOBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc3,0x01] ; X86NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x65,0xc9,0x25,0xdb,0xff] +; X86NOBW-NEXT: # zmm3 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] ; X86NOBW-NEXT: vpternlogd $255, %zmm4, 
%zmm4, %zmm4 {%k2} {z} # encoding: [0x62,0xf3,0x5d,0xca,0x25,0xe4,0xff] +; X86NOBW-NEXT: # zmm4 {%k2} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm4, %xmm4 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xe4] ; X86NOBW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x65,0x38,0xdc,0x01] ; X86NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 # encoding: [0x62,0xf3,0xe5,0x48,0x3a,0xc0,0x01] ; X86NOBW-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf3,0xf5,0x48,0x25,0xc2,0xca] +; X86NOBW-NEXT: # zmm0 = (~zmm0 & ~zmm1 & zmm2) | (~zmm0 & zmm1 & zmm2) | (zmm0 & zmm1 & ~zmm2) | (zmm0 & zmm1 & zmm2) ; X86NOBW-NEXT: retl # encoding: [0xc3] ; ; X64NOBW-LABEL: test_vgf2p8mulb_512_mask: @@ -687,17 +748,22 @@ define <64 x i8> @test_vgf2p8mulb_512_mask(<64 x i8> %src1, <64 x i8> %src2, <64 ; X64NOBW-NEXT: kmovw %edi, %k4 # encoding: [0xc5,0xf8,0x92,0xe7] ; X64NOBW-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xc9] ; X64NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} # encoding: [0x62,0xf3,0x7d,0xcc,0x25,0xc0,0xff] +; X64NOBW-NEXT: # zmm0 {%k4} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0] ; X64NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z} # encoding: [0x62,0xf3,0x65,0xcb,0x25,0xdb,0xff] +; X64NOBW-NEXT: # zmm3 {%k3} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] ; X64NOBW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xc3,0x01] ; X64NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x65,0xc9,0x25,0xdb,0xff] +; X64NOBW-NEXT: # zmm3 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] ; X64NOBW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} # encoding: [0x62,0xf3,0x5d,0xca,0x25,0xe4,0xff] +; X64NOBW-NEXT: # zmm4 {%k2} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm4, %xmm4 # encoding: 
[0x62,0xf2,0x7e,0x48,0x31,0xe4] ; X64NOBW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x65,0x38,0xdc,0x01] ; X64NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 # encoding: [0x62,0xf3,0xe5,0x48,0x3a,0xc0,0x01] ; X64NOBW-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf3,0xf5,0x48,0x25,0xc2,0xca] +; X64NOBW-NEXT: # zmm0 = (~zmm0 & ~zmm1 & zmm2) | (~zmm0 & zmm1 & zmm2) | (zmm0 & zmm1 & ~zmm2) | (zmm0 & zmm1 & zmm2) ; X64NOBW-NEXT: retq # encoding: [0xc3] %1 = bitcast i64 %mask to <64 x i1> %2 = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> %src1, <64 x i8> %src2) @@ -726,13 +792,17 @@ define <64 x i8> @test_vgf2p8mulb_512_maskz(<64 x i8> %src1, <64 x i8> %src2, i6 ; X86NOBW-NEXT: kmovw {{[0-9]+}}(%esp), %k4 # encoding: [0xc5,0xf8,0x90,0x64,0x24,0x08] ; X86NOBW-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xc1] ; X86NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff] +; X86NOBW-NEXT: # zmm1 {%k4} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X86NOBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z} # encoding: [0x62,0xf3,0x6d,0xcb,0x25,0xd2,0xff] +; X86NOBW-NEXT: # zmm2 {%k3} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm2, %xmm2 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xd2] ; X86NOBW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xca,0x01] ; X86NOBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} # encoding: [0x62,0xf3,0x6d,0xc9,0x25,0xd2,0xff] +; X86NOBW-NEXT: # zmm2 {%k1} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm2, %xmm2 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xd2] ; X86NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} # encoding: [0x62,0xf3,0x65,0xca,0x25,0xdb,0xff] +; X86NOBW-NEXT: # zmm3 {%k2} {z} = -1 ; X86NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] ; X86NOBW-NEXT: vinserti128 $1, %xmm3, 
%ymm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x6d,0x38,0xd3,0x01] ; X86NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 # encoding: [0x62,0xf3,0xed,0x48,0x3a,0xc9,0x01] @@ -752,13 +822,17 @@ define <64 x i8> @test_vgf2p8mulb_512_maskz(<64 x i8> %src1, <64 x i8> %src2, i6 ; X64NOBW-NEXT: kmovw %edi, %k4 # encoding: [0xc5,0xf8,0x92,0xe7] ; X64NOBW-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xc1] ; X64NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} # encoding: [0x62,0xf3,0x75,0xcc,0x25,0xc9,0xff] +; X64NOBW-NEXT: # zmm1 {%k4} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc9] ; X64NOBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z} # encoding: [0x62,0xf3,0x6d,0xcb,0x25,0xd2,0xff] +; X64NOBW-NEXT: # zmm2 {%k3} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm2, %xmm2 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xd2] ; X64NOBW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xca,0x01] ; X64NOBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} # encoding: [0x62,0xf3,0x6d,0xc9,0x25,0xd2,0xff] +; X64NOBW-NEXT: # zmm2 {%k1} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm2, %xmm2 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xd2] ; X64NOBW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} # encoding: [0x62,0xf3,0x65,0xca,0x25,0xdb,0xff] +; X64NOBW-NEXT: # zmm3 {%k2} {z} = -1 ; X64NOBW-NEXT: vpmovdb %zmm3, %xmm3 # encoding: [0x62,0xf2,0x7e,0x48,0x31,0xdb] ; X64NOBW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x6d,0x38,0xd3,0x01] ; X64NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 # encoding: [0x62,0xf3,0xed,0x48,0x3a,0xc9,0x01] diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 6c9c28bc9e55e1..9b94a9cf45ddfc 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -7682,6 
+7682,7 @@ define <16 x i32>@test_int_x86_avx512_pternlog_d_512(<16 x i32> %x0, <16 x i32> ; CHECK-LABEL: test_int_x86_avx512_pternlog_d_512: ; CHECK: ## %bb.0: ; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf3,0x75,0x48,0x25,0xc2,0x21] +; CHECK-NEXT: ## zmm0 = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1) ret <16 x i32> %res @@ -7692,12 +7693,14 @@ define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x ; X86: ## %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x49,0x25,0xc2,0x21] +; X86-NEXT: ## zmm0 {%k1} = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_d_512: ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x49,0x25,0xc2,0x21] +; X64-NEXT: ## zmm0 {%k1} = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4) ret <16 x i32> %res @@ -7710,12 +7713,14 @@ define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x ; X86: ## %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc2,0x21] +; X86-NEXT: ## zmm0 {%k1} {z} = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_d_512: ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: 
[0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc2,0x21] +; X64-NEXT: ## zmm0 {%k1} {z} = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4) ret <16 x i32> %res @@ -7727,6 +7732,7 @@ define <8 x i64>@test_int_x86_avx512_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1 ; CHECK-LABEL: test_int_x86_avx512_pternlog_q_512: ; CHECK: ## %bb.0: ; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf3,0xf5,0x48,0x25,0xc2,0x21] +; CHECK-NEXT: ## zmm0 = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1) ret <8 x i64> %res @@ -7738,12 +7744,14 @@ define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x49,0x25,0xc2,0x21] +; X86-NEXT: ## zmm0 {%k1} = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_q_512: ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x49,0x25,0xc2,0x21] +; X64-NEXT: ## zmm0 {%k1} = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4) ret <8 x i64> %res @@ -7757,12 +7765,14 @@ define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i6 ; 
X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xc9,0x25,0xc2,0x21] +; X86-NEXT: ## zmm0 {%k1} {z} = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_q_512: ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xc9,0x25,0xc2,0x21] +; X64-NEXT: ## zmm0 {%k1} {z} = (~zmm0 & ~zmm1 & ~zmm2) | (zmm0 & ~zmm1 & zmm2) ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4) ret <8 x i64> %res diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll index 86ebb1e40870f8..832e55a8355252 100644 --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -301,6 +301,7 @@ define <16 x i32> @test13(<16 x float>%a, <16 x float>%b) ; AVX512: ## %bb.0: ; AVX512-NEXT: vcmpeqps %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x48,0xc2,0xc9,0x00] ; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; AVX512-NEXT: ## zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vpsrld $31, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0x72,0xd0,0x1f] ; AVX512-NEXT: retq ## encoding: [0xc3] ; @@ -520,6 +521,7 @@ define <8 x i32>@test28(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1 ; AVX512-NEXT: vpcmpgtq %zmm3, %zmm2, %k1 ## encoding: [0x62,0xf2,0xed,0x48,0x37,0xcb] ; AVX512-NEXT: kxnorw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x46,0xc9] ; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; AVX512-NEXT: ## zmm0 {%k1} {z} = -1 ; 
AVX512-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: retq ## encoding: [0xc3] ; @@ -544,6 +546,7 @@ define <16 x i8>@test29(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> ; KNL-NEXT: vpcmpgtd %zmm3, %zmm2, %k1 ## encoding: [0x62,0xf1,0x6d,0x48,0x66,0xcb] ; KNL-NEXT: kxorw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x47,0xc9] ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; KNL-NEXT: ## zmm0 {%k1} {z} = -1 ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0] ; KNL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; KNL-NEXT: retq ## encoding: [0xc3] @@ -1233,6 +1236,7 @@ define <16 x i8> @test47(<16 x i32> %a, <16 x i8> %b, <16 x i8> %c) { ; KNL: ## %bb.0: ; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k1 ## encoding: [0x62,0xf2,0x7e,0x48,0x27,0xc8] ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; KNL-NEXT: ## zmm0 {%k1} {z} = -1 ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x48,0x31,0xc0] ; KNL-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x4c,0xc1,0x00] ; KNL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] @@ -1264,6 +1268,7 @@ define <16 x i16> @test48(<16 x i32> %a, <16 x i16> %b, <16 x i16> %c) { ; KNL: ## %bb.0: ; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k1 ## encoding: [0x62,0xf2,0x7e,0x48,0x27,0xc8] ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; KNL-NEXT: ## zmm0 {%k1} {z} = -1 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x48,0x33,0xc0] ; KNL-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ## encoding: [0xc4,0xe3,0x6d,0x4c,0xc1,0x00] ; KNL-NEXT: retq ## encoding: [0xc3] @@ -1292,6 +1297,7 @@ define <8 x i16> @test49(<8 x i64> %a, <8 x i16> %b, <8 x i16> %c) { ; KNL: ## %bb.0: ; KNL-NEXT: vptestnmq %zmm0, %zmm0, %k1 ## encoding: [0x62,0xf2,0xfe,0x48,0x27,0xc8] ; KNL-NEXT: vpternlogd 
$255, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; KNL-NEXT: ## zmm0 {%k1} {z} = -1 ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x48,0x33,0xc0] ; KNL-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x4c,0xc1,0x00] ; KNL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] @@ -1408,6 +1414,7 @@ define <4 x i32> @zext_bool_logic(<4 x i64> %cond1, <4 x i64> %cond2, <4 x i32> ; AVX512-NEXT: vptestnmq %zmm1, %zmm1, %k1 ## encoding: [0x62,0xf2,0xf6,0x48,0x27,0xc9] ; AVX512-NEXT: korw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x45,0xc9] ; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x25,0xc0,0xff] +; AVX512-NEXT: ## zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vpsubd %xmm0, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xfa,0xc0] ; AVX512-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; AVX512-NEXT: retq ## encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index c0bb0037923dce..519f19740ab250 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -12346,6 +12346,7 @@ define <4 x i32>@test_int_x86_avx512_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1 ; CHECK-LABEL: test_int_x86_avx512_pternlog_d_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf3,0x75,0x08,0x25,0xc2,0x21] +; CHECK-NEXT: # xmm0 = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 -1) ret <4 x i32> %res @@ -12357,12 +12358,14 @@ define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; 
X86-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf3,0x75,0x09,0x25,0xc2,0x21] +; X86-NEXT: # xmm0 {%k1} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_d_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf3,0x75,0x09,0x25,0xc2,0x21] +; X64-NEXT: # xmm0 {%k1} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 %x4) ret <4 x i32> %res @@ -12376,12 +12379,14 @@ define <4 x i32>@test_int_x86_avx512_maskz_pternlog_d_128(<4 x i32> %x0, <4 x i3 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x75,0x89,0x25,0xc2,0x21] +; X86-NEXT: # xmm0 {%k1} {z} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_d_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x75,0x89,0x25,0xc2,0x21] +; X64-NEXT: # xmm0 {%k1} {z} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 %x4) ret <4 x i32> %res @@ -12393,6 +12398,7 @@ define <8 x i32>@test_int_x86_avx512_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1 ; CHECK-LABEL: test_int_x86_avx512_pternlog_d_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf3,0x75,0x28,0x25,0xc2,0x21] +; CHECK-NEXT: # ymm0 = (~ymm0 & ~ymm1 & ~ymm2) | 
(ymm0 & ~ymm1 & ymm2) ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 -1) ret <8 x i32> %res @@ -12404,12 +12410,14 @@ define <8 x i32>@test_int_x86_avx512_mask_pternlog_d_256(<8 x i32> %x0, <8 x i32 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf3,0x75,0x29,0x25,0xc2,0x21] +; X86-NEXT: # ymm0 {%k1} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_d_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf3,0x75,0x29,0x25,0xc2,0x21] +; X64-NEXT: # ymm0 {%k1} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 %x4) ret <8 x i32> %res @@ -12423,12 +12431,14 @@ define <8 x i32>@test_int_x86_avx512_maskz_pternlog_d_256(<8 x i32> %x0, <8 x i3 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xa9,0x25,0xc2,0x21] +; X86-NEXT: # ymm0 {%k1} {z} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_d_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xa9,0x25,0xc2,0x21] +; X64-NEXT: # ymm0 {%k1} {z} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X64-NEXT: retq # encoding: [0xc3] %res = 
call <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 %x4) ret <8 x i32> %res @@ -12440,6 +12450,7 @@ define <2 x i64>@test_int_x86_avx512_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1 ; CHECK-LABEL: test_int_x86_avx512_pternlog_q_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf3,0xf5,0x08,0x25,0xc2,0x21] +; CHECK-NEXT: # xmm0 = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 -1) ret <2 x i64> %res @@ -12451,12 +12462,14 @@ define <2 x i64>@test_int_x86_avx512_mask_pternlog_q_128(<2 x i64> %x0, <2 x i64 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf3,0xf5,0x09,0x25,0xc2,0x21] +; X86-NEXT: # xmm0 {%k1} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_q_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf3,0xf5,0x09,0x25,0xc2,0x21] +; X64-NEXT: # xmm0 {%k1} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 %x4) ret <2 x i64> %res @@ -12470,12 +12483,14 @@ define <2 x i64>@test_int_x86_avx512_maskz_pternlog_q_128(<2 x i64> %x0, <2 x i6 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0x89,0x25,0xc2,0x21] +; X86-NEXT: # xmm0 {%k1} 
{z} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_q_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0x89,0x25,0xc2,0x21] +; X64-NEXT: # xmm0 {%k1} {z} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 %x4) ret <2 x i64> %res @@ -12487,6 +12502,7 @@ define <4 x i64>@test_int_x86_avx512_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1 ; CHECK-LABEL: test_int_x86_avx512_pternlog_q_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf3,0xf5,0x28,0x25,0xc2,0x21] +; CHECK-NEXT: # ymm0 = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 -1) ret <4 x i64> %res @@ -12498,12 +12514,14 @@ define <4 x i64>@test_int_x86_avx512_mask_pternlog_q_256(<4 x i64> %x0, <4 x i64 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf3,0xf5,0x29,0x25,0xc2,0x21] +; X86-NEXT: # ymm0 {%k1} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_q_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf3,0xf5,0x29,0x25,0xc2,0x21] +; X64-NEXT: # ymm0 {%k1} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64> 
%x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 %x4) ret <4 x i64> %res @@ -12517,12 +12535,14 @@ define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i6 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0xa9,0x25,0xc2,0x21] +; X86-NEXT: # ymm0 {%k1} {z} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_q_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0xa9,0x25,0xc2,0x21] +; X64-NEXT: # ymm0 {%k1} {z} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 %x4) ret <4 x i64> %res diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index f1c70378b1eb34..6c7a5d2f863412 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -4150,6 +4150,7 @@ define <4 x i32>@test_int_x86_avx512_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1 ; CHECK-LABEL: test_int_x86_avx512_pternlog_d_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf3,0x75,0x08,0x25,0xc2,0x21] +; CHECK-NEXT: # xmm0 = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %1 = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33) ret <4 x i32> %1 @@ -4161,12 +4162,14 @@ define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; 
X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf3,0x75,0x09,0x25,0xc2,0x21] +; X86-NEXT: # xmm0 {%k1} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_d_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf3,0x75,0x09,0x25,0xc2,0x21] +; X64-NEXT: # xmm0 {%k1} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33) %2 = bitcast i8 %x4 to <8 x i1> @@ -4183,12 +4186,14 @@ define <4 x i32>@test_int_x86_avx512_maskz_pternlog_d_128(<4 x i32> %x0, <4 x i3 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x75,0x89,0x25,0xc2,0x21] +; X86-NEXT: # xmm0 {%k1} {z} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_d_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x75,0x89,0x25,0xc2,0x21] +; X64-NEXT: # xmm0 {%k1} {z} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33) %2 = bitcast i8 %x4 to <8 x i1> @@ -4203,6 +4208,7 @@ define <8 x i32>@test_int_x86_avx512_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1 ; CHECK-LABEL: test_int_x86_avx512_pternlog_d_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf3,0x75,0x28,0x25,0xc2,0x21] 
+; CHECK-NEXT: # ymm0 = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %1 = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33) ret <8 x i32> %1 @@ -4214,12 +4220,14 @@ define <8 x i32>@test_int_x86_avx512_mask_pternlog_d_256(<8 x i32> %x0, <8 x i32 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf3,0x75,0x29,0x25,0xc2,0x21] +; X86-NEXT: # ymm0 {%k1} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_d_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf3,0x75,0x29,0x25,0xc2,0x21] +; X64-NEXT: # ymm0 {%k1} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X64-NEXT: retq # encoding: [0xc3] %1 = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33) %2 = bitcast i8 %x4 to <8 x i1> @@ -4235,12 +4243,14 @@ define <8 x i32>@test_int_x86_avx512_maskz_pternlog_d_256(<8 x i32> %x0, <8 x i3 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xa9,0x25,0xc2,0x21] +; X86-NEXT: # ymm0 {%k1} {z} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_d_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x75,0xa9,0x25,0xc2,0x21] +; X64-NEXT: # ymm0 {%k1} {z} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X64-NEXT: retq 
# encoding: [0xc3] %1 = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33) %2 = bitcast i8 %x4 to <8 x i1> @@ -4254,6 +4264,7 @@ define <2 x i64>@test_int_x86_avx512_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1 ; CHECK-LABEL: test_int_x86_avx512_pternlog_q_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf3,0xf5,0x08,0x25,0xc2,0x21] +; CHECK-NEXT: # xmm0 = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %1 = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33) ret <2 x i64> %1 @@ -4265,12 +4276,14 @@ define <2 x i64>@test_int_x86_avx512_mask_pternlog_q_128(<2 x i64> %x0, <2 x i64 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf3,0xf5,0x09,0x25,0xc2,0x21] +; X86-NEXT: # xmm0 {%k1} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_q_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf3,0xf5,0x09,0x25,0xc2,0x21] +; X64-NEXT: # xmm0 {%k1} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X64-NEXT: retq # encoding: [0xc3] %1 = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33) %2 = bitcast i8 %x4 to <8 x i1> @@ -4285,12 +4298,14 @@ define <2 x i64>@test_int_x86_avx512_maskz_pternlog_q_128(<2 x i64> %x0, <2 x i6 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0x89,0x25,0xc2,0x21] +; X86-NEXT: # xmm0 {%k1} 
{z} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_q_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0x89,0x25,0xc2,0x21] +; X64-NEXT: # xmm0 {%k1} {z} = (~xmm0 & ~xmm1 & ~xmm2) | (xmm0 & ~xmm1 & xmm2) ; X64-NEXT: retq # encoding: [0xc3] %1 = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33) %2 = bitcast i8 %x4 to <8 x i1> @@ -4305,6 +4320,7 @@ define <4 x i64>@test_int_x86_avx512_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1 ; CHECK-LABEL: test_int_x86_avx512_pternlog_q_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf3,0xf5,0x28,0x25,0xc2,0x21] +; CHECK-NEXT: # ymm0 = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %1 = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33) ret <4 x i64> %1 @@ -4316,12 +4332,14 @@ define <4 x i64>@test_int_x86_avx512_mask_pternlog_q_256(<4 x i64> %x0, <4 x i64 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf3,0xf5,0x29,0x25,0xc2,0x21] +; X86-NEXT: # ymm0 {%k1} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_q_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf3,0xf5,0x29,0x25,0xc2,0x21] +; X64-NEXT: # ymm0 {%k1} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> 
%x2, i32 33) %2 = bitcast i8 %x4 to <8 x i1> @@ -4336,12 +4354,14 @@ define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i6 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0xa9,0x25,0xc2,0x21] +; X86-NEXT: # ymm0 {%k1} {z} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_q_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xf5,0xa9,0x25,0xc2,0x21] +; X64-NEXT: # ymm0 {%k1} {z} = (~ymm0 & ~ymm1 & ~ymm2) | (ymm0 & ~ymm1 & ymm2) ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33) %2 = bitcast i8 %x4 to <8 x i1> diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll index f2e48c7f308e5c..86b8121f21cff2 100644 --- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -96,6 +96,7 @@ define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind ; AVX512-LABEL: test_mm_andnot_ps: ; AVX512: # %bb.0: ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x25,0xc0,0x0f] +; AVX512-NEXT: # xmm0 = (~xmm0 & ~xmm0 & ~xmm0) | (~xmm0 & ~xmm0 & xmm0) | (~xmm0 & xmm0 & ~xmm0) | (~xmm0 & xmm0 & xmm0) ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index adf4fc28208e78..853bb6367fe4d0 100644 --- 
a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -288,6 +288,7 @@ define <2 x double> @test_mm_andnot_pd(<2 x double> %a0, <2 x double> %a1) nounw ; AVX512-LABEL: test_mm_andnot_pd: ; AVX512: # %bb.0: ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x25,0xc0,0x0f] +; AVX512-NEXT: # xmm0 = (~xmm0 & ~xmm0 & ~xmm0) | (~xmm0 & ~xmm0 & xmm0) | (~xmm0 & xmm0 & ~xmm0) | (~xmm0 & xmm0 & xmm0) ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x double> %a0 to <4 x i32> @@ -316,6 +317,7 @@ define <2 x i64> @test_mm_andnot_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; AVX512-LABEL: test_mm_andnot_si128: ; AVX512: # %bb.0: ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x25,0xc0,0x0f] +; AVX512-NEXT: # xmm0 = (~xmm0 & ~xmm0 & ~xmm0) | (~xmm0 & ~xmm0 & xmm0) | (~xmm0 & xmm0 & ~xmm0) | (~xmm0 & xmm0 & xmm0) ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %not = xor <2 x i64> %a0, diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll index fb3d57e5953075..e8b0facf534b07 100644 --- a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll @@ -442,7 +442,7 @@ define <16 x i32> @stack_fold_inserti32x8(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -461,7 +461,7 @@ define <8 x i64> @stack_fold_inserti64x4(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -2210,7 +2210,7 @@ define <16 x i32> @stack_fold_permd(<16 x i32> %a0, <16 x i32> %a1) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -2286,7 +2286,7 @@ define <8 x i64> @stack_fold_permq(<8 x i64> %a0) { ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 = mem[3,2,2,3,7,6,6,7] -; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 
= -1 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -2307,7 +2307,7 @@ define <8 x i64> @stack_fold_permq_mask(ptr %passthru, <8 x i64> %a0, i8 %mask) ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 ; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,2,3,7,6,6,7] -; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -2347,7 +2347,7 @@ define <8 x i64> @stack_fold_permqvar(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -2368,7 +2368,7 @@ define <8 x i64> @stack_fold_permqvar_mask(ptr %passthru, <8 x i64> %a0, <8 x i6 ; CHECK-NEXT: kmovd %esi, %k1 ; 
CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 ; CHECK-NEXT: vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload -; CHECK-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm0 = -1 ; CHECK-NEXT: vpsubq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -6091,7 +6091,7 @@ define <16 x i32> @stack_fold_pshufd_zmm(<16 x i32> %a0) { ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: # zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() @@ -7047,6 +7047,7 @@ define <16 x i32> @stack_fold_ternlogd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32 ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vpternlogd $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 = (~zmm0 & ~zmm1 & ~mem) | (zmm0 & ~zmm1 & mem) ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = 
call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33) @@ -7062,6 +7063,7 @@ define <8 x i64> @stack_fold_ternlogq(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vpternlogq $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; CHECK-NEXT: # zmm0 = (~zmm0 & ~zmm1 & ~mem) | (zmm0 & ~zmm1 & mem) ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index b18f08b62f0d4c..fd9ba68d5707a7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -3926,7 +3926,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufb %xmm7, %xmm13, %xmm7 ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $184, %zmm15, %zmm18, %zmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm18 & zmm15) | (zmm7 & ~zmm18 & ~zmm15) | (zmm7 & ~zmm18 & zmm15) | (zmm7 & zmm18 & zmm15) ; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm16 ; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5],ymm9[6],ymm3[7,8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13],ymm9[14],ymm3[15] ; AVX512-NEXT: vextracti128 $1, 
%ymm7, %xmm14 @@ -3959,7 +3959,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm11[2],xmm12[3] ; AVX512-NEXT: vpshufb %xmm14, %xmm13, %xmm13 ; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm13 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm18 & zmm0) | (zmm13 & ~zmm18 & ~zmm0) | (zmm13 & ~zmm18 & zmm0) | (zmm13 & zmm18 & zmm0) ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm19 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm7 @@ -4007,7 +4007,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa %ymm2, %ymm9 ; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7] -; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm13 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm0 & ~mem) | (zmm13 & ~zmm0 & mem) | (zmm13 & zmm0 & ~mem) | (zmm13 & zmm0 & mem) ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm13, %ymm14 @@ -4032,7 +4032,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %ymm21, %ymm2 ; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm11 ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512-NEXT: vpternlogq $184, %zmm13, %zmm18, %zmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm18 & zmm13) | (zmm11 & ~zmm18 & ~zmm13) | (zmm11 & ~zmm18 & zmm13) | (zmm11 & zmm18 & zmm13) ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm7 ; AVX512-NEXT: vmovdqa64 %ymm20, %ymm2 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15] @@ -4117,7 +4117,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm13, %ymm13 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm12, %zmm18, %zmm13 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm18 & zmm12) | (zmm13 & ~zmm18 & ~zmm12) | (zmm13 & ~zmm18 & zmm12) | (zmm13 & zmm18 & zmm12) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm13, %zmm16 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 @@ -4133,7 +4133,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm15 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm15 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm12, %zmm18, %zmm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & zmm18 & zmm12) | (zmm15 & ~zmm18 & ~zmm12) | (zmm15 & ~zmm18 & zmm12) | (zmm15 & zmm18 & zmm12) ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm6 ; AVX512-FCP-NEXT: vpsrlq $48, %xmm2, %xmm12 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] @@ -4177,7 +4177,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 ; 
AVX512-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm14 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm12, %zmm18, %zmm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & zmm18 & zmm12) | (zmm14 & ~zmm18 & ~zmm12) | (zmm14 & ~zmm18 & zmm12) | (zmm14 & zmm18 & zmm12) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm14 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12 @@ -4201,7 +4201,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm13 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm12 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm6 & mem) | (zmm12 & ~zmm6 & ~mem) | (zmm12 & zmm6 & ~mem) | (zmm12 & zmm6 & mem) ; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 @@ -4298,7 +4298,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb %xmm7, %xmm13, %xmm7 ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, %zmm15, %zmm18, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm18 & zmm15) | (zmm7 & ~zmm18 & ~zmm15) | (zmm7 & ~zmm18 & zmm15) | (zmm7 & zmm18 & zmm15) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm16 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5],ymm9[6],ymm3[7,8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13],ymm9[14],ymm3[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm14 @@ -4331,7 +4331,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm11[2],xmm12[3] ; AVX512DQ-NEXT: vpshufb %xmm14, %xmm13, %xmm13 ; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm13 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm18 & zmm0) | (zmm13 & ~zmm18 & ~zmm0) | (zmm13 & ~zmm18 & zmm0) | (zmm13 & zmm18 & zmm0) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm19 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm7 @@ -4379,7 +4379,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm9 ; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7] -; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm13 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm0 & ~mem) | (zmm13 & ~zmm0 & mem) | (zmm13 & zmm0 & ~mem) | (zmm13 & zmm0 & mem) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm13, %ymm14 @@ -4404,7 +4404,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm2 ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm11 ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-NEXT: vpternlogq $184, %zmm13, %zmm18, %zmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm18 & zmm13) | (zmm11 & ~zmm18 & ~zmm13) | 
(zmm11 & ~zmm18 & zmm13) | (zmm11 & zmm18 & zmm13) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15] @@ -4489,7 +4489,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm13, %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm12, %zmm18, %zmm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm18 & zmm12) | (zmm13 & ~zmm18 & ~zmm12) | (zmm13 & ~zmm18 & zmm12) | (zmm13 & zmm18 & zmm12) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm13, %zmm16 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 @@ -4505,7 +4505,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm15 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm15 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm12, %zmm18, %zmm15 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & zmm18 & zmm12) | (zmm15 & ~zmm18 & ~zmm12) | (zmm15 & ~zmm18 & zmm12) | (zmm15 & zmm18 & zmm12) ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm6 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm2, %xmm12 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = 
xmm6[0],xmm12[0],xmm6[1],xmm12[1] @@ -4549,7 +4549,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm14 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm12, %zmm18, %zmm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & zmm18 & zmm12) | (zmm14 & ~zmm18 & ~zmm12) | (zmm14 & ~zmm18 & zmm12) | (zmm14 & zmm18 & zmm12) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm14 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12 @@ -4573,7 +4573,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm13 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm6 & mem) | (zmm12 & ~zmm6 & ~mem) | (zmm12 & zmm6 & ~mem) | (zmm12 & zmm6 & mem) ; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 @@ -8060,7 +8060,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13],ymm15[14],ymm12[15] ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] -; AVX512-NEXT: vpternlogq $228, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm0 & ~mem) | (zmm1 & ~zmm0 & mem) | (zmm1 & zmm0 & ~mem) | (zmm1 & zmm0 & mem) ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2 @@ -8118,7 +8118,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $184, %zmm19, %zmm27, %zmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm27 & zmm19) | (zmm10 & ~zmm27 & ~zmm19) | (zmm10 & ~zmm27 & zmm19) | (zmm10 & zmm27 & zmm19) ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15] @@ -8146,6 +8146,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: # zmm2 = (~zmm2 & zmm27 & mem) | (zmm2 & ~zmm27 & ~mem) | (zmm2 & ~zmm27 & mem) | (zmm2 & zmm27 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %xmm22, %xmm4 @@ -8177,6 +8178,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; 
AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7] ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: # zmm2 = (~zmm2 & zmm27 & mem) | (zmm2 & ~zmm27 & ~mem) | (zmm2 & ~zmm27 & mem) | (zmm2 & zmm27 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] @@ -8204,7 +8206,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %xmm6, %xmm23 ; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogq $184, %zmm28, %zmm27, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm27 & zmm28) | (zmm0 & ~zmm27 & ~zmm28) | (zmm0 & ~zmm27 & zmm28) | (zmm0 & zmm27 & zmm28) ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload @@ -8229,7 +8231,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %xmm8, %xmm28 ; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] ; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512-NEXT: vpternlogq $184, %zmm2, %zmm27, %zmm13 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm27 & zmm2) | (zmm13 & ~zmm27 & ~zmm2) | (zmm13 & ~zmm27 & zmm2) | (zmm13 & zmm27 & zmm2) ; AVX512-NEXT: vmovdqa64 %ymm26, %ymm8 ; AVX512-NEXT: vmovdqa64 %ymm22, %ymm4 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10],ymm4[11],ymm8[12,13],ymm4[14],ymm8[15] @@ -8293,7 +8295,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] ; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm27 & zmm0) | (zmm10 & ~zmm27 & ~zmm0) | (zmm10 & ~zmm27 & zmm0) | (zmm10 & zmm27 & zmm0) ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm10, %zmm23 ; AVX512-NEXT: vmovdqa64 %ymm18, %ymm13 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] @@ -8326,7 +8328,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & zmm0 & mem) | (zmm8 & ~zmm0 & ~mem) | (zmm8 & zmm0 & ~mem) | (zmm8 & zmm0 & mem) ; AVX512-NEXT: vmovdqa %ymm9, %ymm2 ; AVX512-NEXT: vmovdqa64 %ymm21, %ymm10 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] @@ -8494,7 +8496,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm7 ; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm14 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm4, %zmm8, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm8 & zmm4) | (zmm7 & ~zmm8 & ~zmm4) | (zmm7 & ~zmm8 & zmm4) | (zmm7 & zmm8 & zmm4) ; 
AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8536,7 +8538,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 ; AVX512-FCP-NEXT: vpermd %ymm23, %ymm18, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm16, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm16 & zmm1) | (zmm2 & ~zmm16 & ~zmm1) | (zmm2 & ~zmm16 & zmm1) | (zmm2 & zmm16 & zmm1) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15] @@ -8581,7 +8583,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermd %ymm23, %ymm25, %ymm4 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm10, %zmm16, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm16 & zmm10) | (zmm4 & ~zmm16 & ~zmm10) | (zmm4 & ~zmm16 & zmm10) | (zmm4 & zmm16 & zmm10) ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm10 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8621,7 +8623,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload ; AVX512-FCP-NEXT: vpermd %ymm17, %ymm25, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, %zmm10, %zmm2 +; AVX512-FCP-NEXT: vpternlogq 
{{.*#+}} zmm2 = (~zmm2 & zmm10 & zmm0) | (zmm2 & ~zmm10 & ~zmm0) | (zmm2 & ~zmm10 & zmm0) | (zmm2 & zmm10 & zmm0) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15] @@ -8666,7 +8668,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm14 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, %zmm15, %zmm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & zmm15 & zmm0) | (zmm14 & ~zmm15 & ~zmm0) | (zmm14 & ~zmm15 & zmm0) | (zmm14 & zmm15 & zmm0) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15] @@ -8700,7 +8702,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX512-FCP-NEXT: vpermd %ymm17, %ymm5, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm15, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm15 & zmm1) | (zmm2 & ~zmm15 & ~zmm1) | (zmm2 & ~zmm15 & zmm1) | (zmm2 & zmm15 & zmm1) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm28 ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm15 ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm7 
@@ -8769,7 +8771,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm5 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm25 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = (~zmm25 & zmm5 & zmm3) | (zmm25 & ~zmm5 & ~zmm3) | (zmm25 & ~zmm5 & zmm3) | (zmm25 & zmm5 & zmm3) ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] @@ -8794,7 +8796,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm24 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = (~zmm24 & zmm5 & zmm4) | (zmm24 & ~zmm5 & ~zmm4) | (zmm24 & ~zmm5 & zmm4) | (zmm24 & zmm5 & zmm4) ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] @@ -8992,7 +8994,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13],ymm15[14],ymm12[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] -; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%zmm0, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm0 & ~mem) | (zmm1 & ~zmm0 & mem) | (zmm1 & zmm0 & ~mem) | (zmm1 & zmm0 & mem) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 @@ -9050,7 +9052,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, %zmm19, %zmm27, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm27 & zmm19) | (zmm10 & ~zmm27 & ~zmm19) | (zmm10 & ~zmm27 & zmm19) | (zmm10 & zmm27 & zmm19) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15] @@ -9078,6 +9080,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm2 = (~zmm2 & zmm27 & mem) | (zmm2 & ~zmm27 & ~mem) | (zmm2 & ~zmm27 & mem) | (zmm2 & zmm27 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm4 @@ -9109,6 +9112,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, 
%ymm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7] ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm2 = (~zmm2 & zmm27 & mem) | (zmm2 & ~zmm27 & ~mem) | (zmm2 & ~zmm27 & mem) | (zmm2 & zmm27 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] @@ -9136,7 +9140,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm23 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpternlogq $184, %zmm28, %zmm27, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm27 & zmm28) | (zmm0 & ~zmm27 & ~zmm28) | (zmm0 & ~zmm27 & zmm28) | (zmm0 & zmm27 & zmm28) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload @@ -9161,7 +9165,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm28 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] ; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-NEXT: vpternlogq $184, %zmm2, %zmm27, %zmm13 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm27 & zmm2) | (zmm13 & ~zmm27 & ~zmm2) | (zmm13 & ~zmm27 & zmm2) | (zmm13 & zmm27 & zmm2) ; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm8 ; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm4 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10],ymm4[11],ymm8[12,13],ymm4[14],ymm8[15] @@ -9225,7 +9229,7 @@ define void 
@load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] ; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm27 & zmm0) | (zmm10 & ~zmm27 & ~zmm0) | (zmm10 & ~zmm27 & zmm0) | (zmm10 & zmm27 & zmm0) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm10, %zmm23 ; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm13 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] @@ -9258,7 +9262,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & zmm0 & mem) | (zmm8 & ~zmm0 & ~mem) | (zmm8 & zmm0 & ~mem) | (zmm8 & zmm0 & mem) ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2 ; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm10 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] @@ -9426,7 +9430,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm4, %zmm8, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = 
(~zmm7 & zmm8 & zmm4) | (zmm7 & ~zmm8 & ~zmm4) | (zmm7 & ~zmm8 & zmm4) | (zmm7 & zmm8 & zmm4) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9468,7 +9472,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 ; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm18, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm16, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm16 & zmm1) | (zmm2 & ~zmm16 & ~zmm1) | (zmm2 & ~zmm16 & zmm1) | (zmm2 & zmm16 & zmm1) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15] @@ -9513,7 +9517,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm25, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm10, %zmm16, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm16 & zmm10) | (zmm4 & ~zmm16 & ~zmm10) | (zmm4 & ~zmm16 & zmm10) | (zmm4 & zmm16 & zmm10) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm10 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9553,7 +9557,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm25, %ymm2 
; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm10, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm10 & zmm0) | (zmm2 & ~zmm10 & ~zmm0) | (zmm2 & ~zmm10 & zmm0) | (zmm2 & zmm10 & zmm0) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15] @@ -9598,7 +9602,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm15, %zmm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & zmm15 & zmm0) | (zmm14 & ~zmm15 & ~zmm0) | (zmm14 & ~zmm15 & zmm0) | (zmm14 & zmm15 & zmm0) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15] @@ -9632,7 +9636,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm5, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm15, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm15 & zmm1) | (zmm2 & ~zmm15 & ~zmm1) | (zmm2 & ~zmm15 
& zmm1) | (zmm2 & zmm15 & zmm1) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm7 @@ -9701,7 +9705,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm5 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm25 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = (~zmm25 & zmm5 & zmm3) | (zmm25 & ~zmm5 & ~zmm3) | (zmm25 & ~zmm5 & zmm3) | (zmm25 & zmm5 & zmm3) ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] @@ -9726,7 +9730,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm24 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = (~zmm24 & zmm5 & zmm4) | (zmm24 & ~zmm5 & ~zmm4) | (zmm24 & ~zmm5 & zmm4) | (zmm24 & zmm5 & zmm4) ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index 605deed6536bf3..a37b8e33ceffe2 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ 
b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -2497,7 +2497,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $236, %ymm11, %ymm7, %ymm10 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ymm7 & ~ymm11) | (~ymm10 & ymm7 & ymm11) | (ymm10 & ~ymm7 & ymm11) | (ymm10 & ymm7 & ~ymm11) | (ymm10 & ymm7 & ymm11) ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] @@ -2512,7 +2512,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq $236, %ymm11, %ymm4, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm4 & ~ymm11) | (~ymm3 & ymm4 & ymm11) | (ymm3 & ~ymm4 & ymm11) | (ymm3 & ymm4 & ~ymm11) | (ymm3 & ymm4 & ymm11) ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] @@ -2608,7 +2608,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = 
xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $236, %ymm11, %ymm7, %ymm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ymm7 & ~ymm11) | (~ymm10 & ymm7 & ymm11) | (ymm10 & ~ymm7 & ymm11) | (ymm10 & ymm7 & ~ymm11) | (ymm10 & ymm7 & ymm11) ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 @@ -2623,7 +2623,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq $248, %ymm11, %ymm2, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm2 & ymm11) | (ymm3 & ~ymm2 & ~ymm11) | (ymm3 & ~ymm2 & ymm11) | (ymm3 & ymm2 & ~ymm11) | (ymm3 & ymm2 & ymm11) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm6 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 @@ -2721,7 +2721,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $236, %ymm11, %ymm7, %ymm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ymm7 & ~ymm11) | 
(~ymm10 & ymm7 & ymm11) | (ymm10 & ~ymm7 & ymm11) | (ymm10 & ymm7 & ~ymm11) | (ymm10 & ymm7 & ymm11) ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] @@ -2736,7 +2736,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq $236, %ymm11, %ymm4, %ymm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm4 & ~ymm11) | (~ymm3 & ymm4 & ymm11) | (ymm3 & ~ymm4 & ymm11) | (ymm3 & ymm4 & ~ymm11) | (ymm3 & ymm4 & ymm11) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] @@ -2832,7 +2832,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm11, %ymm7, %ymm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ymm7 & ~ymm11) | (~ymm10 & ymm7 & ymm11) | (ymm10 & ~ymm7 & ymm11) | (ymm10 & ymm7 & ~ymm11) | (ymm10 & ymm7 & ymm11) ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = 
[0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 @@ -2847,7 +2847,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm11, %ymm2, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm2 & ymm11) | (ymm3 & ~ymm2 & ~ymm11) | (ymm3 & ~ymm2 & ymm11) | (ymm3 & ymm2 & ~ymm11) | (ymm3 & ymm2 & ymm11) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm6 ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 @@ -5147,9 +5147,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm0 & zmm17) | (zmm2 & ~zmm0 & zmm17) | (zmm2 & zmm0 & ~zmm17) | (zmm2 & zmm0 & zmm17) ; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm16 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm17 & zmm2) | (zmm16 & ~zmm17 & ~zmm2) | (zmm16 & ~zmm17 & zmm2) | (zmm16 & zmm17 & zmm2) ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[3,1,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw 
{{.*#+}} xmm10 = xmm11[0,1,3,3,4,5,6,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7] @@ -5178,9 +5178,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm0 & zmm2) | (zmm5 & ~zmm0 & zmm2) | (zmm5 & zmm0 & ~zmm2) | (zmm5 & zmm0 & zmm2) ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512-NEXT: vpternlogq $184, %zmm5, %zmm17, %zmm20 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm17 & zmm5) | (zmm20 & ~zmm17 & ~zmm5) | (zmm20 & ~zmm17 & zmm5) | (zmm20 & zmm17 & zmm5) ; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512-NEXT: vmovdqa64 %ymm31, %ymm0 ; AVX512-NEXT: vmovdqa64 %ymm21, %ymm1 @@ -5211,7 +5211,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $236, %ymm11, %ymm10, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ymm10 & ~ymm11) | (~ymm8 & ymm10 & ymm11) | (ymm8 & ~ymm10 & ymm11) | (ymm8 & ymm10 & ~ymm11) | (ymm8 & ymm10 & ymm11) ; AVX512-NEXT: movw $31, %ax ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} @@ -5248,7 +5248,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} 
xmm4 = xmm4[0,1,2,3,5,5,5,5] ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq $236, %ymm11, %ymm4, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm4 & ~ymm11) | (~ymm3 & ymm4 & ymm11) | (ymm3 & ~ymm4 & ymm11) | (ymm3 & ymm4 & ~ymm11) | (ymm3 & ymm4 & ymm11) ; AVX512-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} ; AVX512-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,1,3,4,5,6,7] @@ -5261,16 +5261,18 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: # zmm4 = (~zmm4 & ~zmm3 & mem) | (zmm4 & ~zmm3 & mem) | (zmm4 & zmm3 & ~mem) | (zmm4 & zmm3 & mem) ; AVX512-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm30, %zmm4 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm15 # 64-byte Folded Reload +; AVX512-NEXT: # zmm15 = (~zmm15 & ~zmm3 & mem) | (zmm15 & ~zmm3 & mem) | (zmm15 & zmm3 & ~mem) | (zmm15 & zmm3 & mem) ; AVX512-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm15, (%rdx) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm10 -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm17 & zmm2) | (zmm10 & ~zmm17 & ~zmm2) | (zmm10 & ~zmm17 & zmm2) | (zmm10 & zmm17 & zmm2) +; 
AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm17 & zmm0) | (zmm1 & ~zmm17 & ~zmm0) | (zmm1 & ~zmm17 & zmm0) | (zmm1 & zmm17 & zmm0) ; AVX512-NEXT: vmovdqa64 %zmm16, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm20, (%r8) ; AVX512-NEXT: vmovdqa64 %zmm10, (%r9) @@ -5426,9 +5428,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & ~zmm0 & zmm17) | (zmm11 & ~zmm0 & zmm17) | (zmm11 & zmm0 & ~zmm17) | (zmm11 & zmm0 & zmm17) ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm11, %zmm17, %zmm16 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm17 & zmm11) | (zmm16 & ~zmm17 & ~zmm11) | (zmm16 & ~zmm17 & zmm11) | (zmm16 & zmm17 & zmm11) ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7] @@ -5454,9 +5456,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm0 & zmm7) | (zmm4 
& ~zmm0 & zmm7) | (zmm4 & zmm0 & ~zmm7) | (zmm4 & zmm0 & zmm7) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm20 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm17 & zmm4) | (zmm20 & ~zmm17 & ~zmm4) | (zmm20 & ~zmm17 & zmm4) | (zmm20 & zmm17 & zmm4) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm1 @@ -5486,7 +5488,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $236, %ymm11, %ymm10, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ymm10 & ~ymm11) | (~ymm5 & ymm10 & ymm11) | (ymm5 & ~ymm10 & ymm11) | (ymm5 & ymm10 & ~ymm11) | (ymm5 & ymm10 & ymm11) ; AVX512-FCP-NEXT: movw $31, %ax ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k1} @@ -5509,7 +5511,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq $248, %ymm11, %ymm7, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm7 & ymm11) | (ymm0 & ~ymm7 & ~ymm11) | (ymm0 & ~ymm7 & ymm11) | (ymm0 & ymm7 & ~ymm11) | (ymm0 & ymm7 & 
ymm11) ; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4 ; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7] @@ -5530,17 +5532,19 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 = (~zmm4 & ~zmm3 & mem) | (zmm4 & ~zmm3 & mem) | (zmm4 & zmm3 & ~mem) | (zmm4 & zmm3 & mem) ; AVX512-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 = (~zmm4 & ~zmm3 & mem) | (zmm4 & ~zmm3 & mem) | (zmm4 & zmm3 & ~mem) | (zmm4 & zmm3 & mem) ; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm8 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & zmm17 & zmm2) | (zmm8 & ~zmm17 & ~zmm2) | (zmm8 & ~zmm17 & zmm2) | (zmm8 & zmm17 & zmm2) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm17 & zmm1) | (zmm0 & ~zmm17 & ~zmm1) | (zmm0 & ~zmm17 & zmm1) | (zmm0 & zmm17 & zmm1) ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r9) @@ -5612,7 +5616,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & ~zmm17 & zmm16) | (zmm10 & ~zmm17 & zmm16) | (zmm10 & zmm17 & ~zmm16) | (zmm10 & zmm17 & zmm16) ; AVX512DQ-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1} @@ -5635,7 +5639,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb %xmm9, %xmm5, %xmm0 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512DQ-NEXT: vpternlogq $226, %zmm3, %zmm17, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm17 & zmm3) | (zmm2 & ~zmm17 & zmm3) | (zmm2 & zmm17 & ~zmm3) | (zmm2 & zmm17 & zmm3) ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] @@ -5702,9 +5706,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm9 +; 
AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & ~zmm0 & zmm16) | (zmm9 & ~zmm0 & zmm16) | (zmm9 & zmm0 & ~zmm16) | (zmm9 & zmm0 & zmm16) ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512DQ-NEXT: vpternlogq $184, %zmm9, %zmm17, %zmm18 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = (~zmm18 & zmm17 & zmm9) | (zmm18 & ~zmm17 & ~zmm9) | (zmm18 & ~zmm17 & zmm9) | (zmm18 & zmm17 & zmm9) ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7] @@ -5731,9 +5735,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm0 & zmm7) | (zmm4 & ~zmm0 & zmm7) | (zmm4 & zmm0 & ~zmm7) | (zmm4 & zmm0 & zmm7) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512DQ-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm16 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm17 & zmm4) | (zmm16 & ~zmm17 & ~zmm4) | (zmm16 & ~zmm17 & zmm4) | (zmm16 & zmm17 & zmm4) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm0 ; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm1 @@ -5763,7 +5767,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $236, %ymm14, %ymm8, %ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm8 & ~ymm14) | (~ymm6 & ymm8 & ymm14) | (ymm6 & ~ymm8 & ymm14) | (ymm6 & ymm8 & ~ymm14) | (ymm6 & ymm8 & ymm14) ; AVX512DQ-NEXT: movw $31, %ax ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vinserti32x8 $0, %ymm6, %zmm0, %zmm4 {%k1} @@ -5800,7 +5804,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq $236, %ymm14, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm3 & ~ymm14) | (~ymm2 & ymm3 & ymm14) | (ymm2 & ~ymm3 & ymm14) | (ymm2 & ymm3 & ~ymm14) | (ymm2 & ymm3 & ymm14) ; AVX512DQ-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm0 {%k1} ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7] @@ -5815,8 +5819,8 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm2, (%rdx) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm8 -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & zmm17 & zmm4) | (zmm8 & ~zmm17 & ~zmm4) | (zmm8 & ~zmm17 & zmm4) | (zmm8 & zmm17 & zmm4) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 
= (~zmm1 & zmm17 & zmm0) | (zmm1 & ~zmm17 & ~zmm0) | (zmm1 & ~zmm17 & zmm0) | (zmm1 & zmm17 & zmm0) ; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%r9) @@ -5888,7 +5892,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & ~zmm17 & zmm16) | (zmm10 & ~zmm17 & zmm16) | (zmm10 & zmm17 & ~zmm16) | (zmm10 & zmm17 & zmm16) ; AVX512DQ-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1} @@ -5911,7 +5915,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm5, %zmm17, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm17 & zmm5) | (zmm2 & ~zmm17 & zmm5) | (zmm2 & zmm17 & ~zmm5) | (zmm2 & zmm17 & zmm5) ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] @@ -5975,9 +5979,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & ~zmm0 & zmm16) | (zmm9 & ~zmm0 & zmm16) | (zmm9 & zmm0 & ~zmm16) | (zmm9 & zmm0 & zmm16) ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm9, %zmm17, %zmm18 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = (~zmm18 & zmm17 & zmm9) | (zmm18 & ~zmm17 & ~zmm9) | (zmm18 & ~zmm17 & zmm9) | (zmm18 & zmm17 & zmm9) ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7] @@ -6001,9 +6005,9 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & ~zmm0 & zmm7) | (zmm3 & ~zmm0 & zmm7) | (zmm3 & zmm0 & ~zmm7) | (zmm3 & zmm0 & zmm7) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm16 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm17 & zmm3) | (zmm16 & ~zmm17 & ~zmm3) | (zmm16 & ~zmm17 & zmm3) | (zmm16 & zmm17 & zmm3) ; AVX512DQ-FCP-NEXT: vmovdqa 
{{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm1 @@ -6032,7 +6036,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm13, %ymm11, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ymm11 & ~ymm13) | (~ymm5 & ymm11 & ymm13) | (ymm5 & ~ymm11 & ymm13) | (ymm5 & ymm11 & ~ymm13) | (ymm5 & ymm11 & ymm13) ; AVX512DQ-FCP-NEXT: movw $31, %ax ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm5, %zmm0, %zmm0 {%k1} @@ -6056,7 +6060,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3],xmm9[4],xmm8[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm13, %ymm8, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm8 & ymm13) | (ymm2 & ~ymm8 & ~ymm13) | (ymm2 & ~ymm8 & ymm13) | (ymm2 & ymm8 & ~ymm13) | (ymm2 & ymm8 & ymm13) ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6,7] @@ -6079,8 +6083,8 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rdx) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm7 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm17 & zmm0) | (zmm7 & ~zmm17 & ~zmm0) | (zmm7 & ~zmm17 & zmm0) | (zmm7 & zmm17 & zmm0) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm17 & zmm1) | (zmm2 & ~zmm17 & ~zmm1) | (zmm2 & ~zmm17 & zmm1) | (zmm2 & zmm17 & zmm1) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%r9) @@ -10680,9 +10684,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $226, %zmm2, %zmm29, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm29 & zmm2) | (zmm1 & ~zmm29 & zmm2) | (zmm1 & zmm29 & ~zmm2) | (zmm1 & zmm29 & zmm2) ; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm2 & zmm1) | (zmm3 & ~zmm2 & ~zmm1) | (zmm3 & ~zmm2 & zmm1) | (zmm3 & zmm2 & zmm1) ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm22 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -10743,8 +10747,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw 
{{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogq $226, %zmm23, %zmm29, %zmm2 -; AVX512-NEXT: vpternlogq $184, %zmm2, %zmm22, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm29 & zmm23) | (zmm2 & ~zmm29 & zmm23) | (zmm2 & zmm29 & ~zmm23) | (zmm2 & zmm29 & zmm23) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm22 & zmm2) | (zmm0 & ~zmm22 & ~zmm2) | (zmm0 & ~zmm22 & zmm2) | (zmm0 & zmm22 & zmm2) ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %xmm21, %xmm0 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] @@ -10790,8 +10794,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm13 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm25 -; AVX512-NEXT: vpternlogq $226, %zmm28, %zmm29, %zmm0 -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm25 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm29 & zmm28) | (zmm0 & ~zmm29 & zmm28) | (zmm0 & zmm29 & ~zmm28) | (zmm0 & zmm29 & zmm28) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = (~zmm25 & zmm22 & zmm0) | (zmm25 & ~zmm22 & ~zmm0) | (zmm25 & ~zmm22 & zmm0) | (zmm25 & zmm22 & zmm0) ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] @@ -10819,8 +10823,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 -; AVX512-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm2 -; AVX512-NEXT: vpternlogq $184, %zmm2, %zmm22, %zmm28 
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm29 & zmm0) | (zmm2 & ~zmm29 & zmm0) | (zmm2 & zmm29 & ~zmm0) | (zmm2 & zmm29 & zmm0) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm28 = (~zmm28 & zmm22 & zmm2) | (zmm28 & ~zmm22 & ~zmm2) | (zmm28 & ~zmm22 & zmm2) | (zmm28 & zmm22 & zmm2) ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] @@ -10859,7 +10863,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm0, %ymm5, %ymm4 ; AVX512-NEXT: vmovdqa64 %ymm5, %ymm26 -; AVX512-NEXT: vpternlogq $236, %ymm29, %ymm4, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm4 & ~ymm29) | (~ymm3 & ymm4 & ymm29) | (ymm3 & ~ymm4 & ymm29) | (ymm3 & ymm4 & ~ymm29) | (ymm3 & ymm4 & ymm29) ; AVX512-NEXT: movw $31, %ax ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm3, %zmm30 {%k1} @@ -10896,7 +10900,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX512-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512-NEXT: vmovdqa64 %ymm5, %ymm17 -; AVX512-NEXT: vpternlogq $236, %ymm29, %ymm0, %ymm4 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ymm0 & ~ymm29) | (~ymm4 & ymm0 & ymm29) | (ymm4 & ~ymm0 & ymm29) | (ymm4 & ymm0 & ~ymm29) | (ymm4 & ymm0 & ymm29) ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX512-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] @@ -10954,7 +10958,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-NEXT: vmovdqa64 %ymm26, %ymm0 ; AVX512-NEXT: vpshufb %ymm10, %ymm0, %ymm13 -; AVX512-NEXT: vpternlogq $236, %ymm29, %ymm13, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm13 & ~ymm29) | (~ymm3 & ymm13 & ymm29) | (ymm3 & ~ymm13 & ymm29) | (ymm3 & ymm13 & ~ymm29) | (ymm3 & ymm13 & ymm29) ; AVX512-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} ; AVX512-NEXT: vmovdqa64 %xmm18, %xmm0 ; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm3 @@ -10973,7 +10977,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,2,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] ; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7] -; AVX512-NEXT: vpternlogq $236, %ymm29, %ymm3, %ymm10 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ymm3 & ~ymm29) | (~ymm10 & ymm3 & ymm29) | (ymm10 & ~ymm3 & ymm29) | (ymm10 & ymm3 & ~ymm29) | (ymm10 & ymm3 & ymm29) ; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] @@ -10999,6 +11003,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload +; AVX512-NEXT: # zmm7 = (~zmm7 & ~zmm6 & mem) | (zmm7 & ~zmm6 & mem) | (zmm7 & zmm6 & ~mem) | (zmm7 & zmm6 & mem) ; AVX512-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -11006,24 +11011,27 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload +; AVX512-NEXT: # zmm7 = (~zmm7 & ~zmm6 & mem) | (zmm7 & ~zmm6 & mem) | (zmm7 & zmm6 & ~mem) | (zmm7 & zmm6 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rsi) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload +; AVX512-NEXT: # zmm7 = (~zmm7 & ~zmm6 & mem) | (zmm7 & ~zmm6 & mem) | (zmm7 & zmm6 & ~mem) | (zmm7 & zmm6 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rdx) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload +; AVX512-NEXT: # zmm7 = (~zmm7 & ~zmm6 & mem) | (zmm7 & ~zmm6 & mem) | (zmm7 & zmm6 & ~mem) | (zmm7 & zmm6 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm7, (%rdx) ; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm6 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512-NEXT: vpternlogq $184, %zmm30, %zmm6, %zmm3 -; AVX512-NEXT: vpternlogq $184, %zmm31, %zmm6, %zmm5 -; AVX512-NEXT: vpternlogq $184, %zmm2, %zmm6, %zmm0 -; AVX512-NEXT: vpternlogq $184, %zmm1, %zmm6, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm6 & zmm30) | (zmm3 & ~zmm6 & ~zmm30) | (zmm3 & 
~zmm6 & zmm30) | (zmm3 & zmm6 & zmm30) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm6 & zmm31) | (zmm5 & ~zmm6 & ~zmm31) | (zmm5 & ~zmm6 & zmm31) | (zmm5 & zmm6 & zmm31) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm6 & zmm2) | (zmm0 & ~zmm6 & ~zmm2) | (zmm0 & ~zmm6 & zmm2) | (zmm0 & zmm6 & zmm2) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm6 & zmm1) | (zmm4 & ~zmm6 & ~zmm1) | (zmm4 & ~zmm6 & zmm1) | (zmm4 & zmm6 & zmm1) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm1, 64(%rcx) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -11306,9 +11314,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm29 & zmm3) | (zmm4 & ~zmm29 & zmm3) | (zmm4 & zmm29 & ~zmm3) | (zmm4 & zmm29 & zmm3) ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm26 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm4, %zmm26, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm26 & zmm4) | (zmm5 & ~zmm26 & ~zmm4) | (zmm5 & ~zmm26 & zmm4) | (zmm5 & zmm26 & zmm4) ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -11364,8 +11372,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm11, %zmm29, %zmm1 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm25 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm29 & zmm11) | (zmm1 & ~zmm29 & zmm11) | (zmm1 & zmm29 & ~zmm11) | (zmm1 & zmm29 & zmm11) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = (~zmm25 & zmm26 & zmm1) | (zmm25 & ~zmm26 & ~zmm1) | (zmm25 & ~zmm26 & zmm1) | (zmm25 & zmm26 & zmm1) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] ; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm0 ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 @@ -11404,8 +11412,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,4,5] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm23 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm24, %zmm29, %zmm0 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm23 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm29 & zmm24) | (zmm0 & ~zmm29 & zmm24) | (zmm0 & zmm29 & ~zmm24) | (zmm0 & zmm29 & zmm24) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = (~zmm23 & zmm26 & zmm0) | (zmm23 & ~zmm26 & ~zmm0) | (zmm23 & ~zmm26 & zmm0) | (zmm23 & zmm26 & zmm0) ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm0 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload @@ -11439,8 +11447,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 
-; AVX512-FCP-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm5 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm5, %zmm26, %zmm28 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm29 & zmm0) | (zmm5 & ~zmm29 & zmm0) | (zmm5 & zmm29 & ~zmm0) | (zmm5 & zmm29 & zmm0) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = (~zmm28 & zmm26 & zmm5) | (zmm28 & ~zmm26 & ~zmm5) | (zmm28 & ~zmm26 & zmm5) | (zmm28 & zmm26 & zmm5) ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm11 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] @@ -11477,7 +11485,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm3 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512-FCP-NEXT: vpternlogq $236, %ymm29, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm3 & ~ymm29) | (~ymm2 & ymm3 & ymm29) | (ymm2 & ~ymm3 & ymm29) | (ymm2 & ymm3 & ~ymm29) | (ymm2 & ymm3 & ymm29) ; AVX512-FCP-NEXT: movw $31, %ax ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm30 {%k1} @@ -11512,7 +11520,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[2,2,2,2,4,5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0 -; AVX512-FCP-NEXT: vpternlogq $236, %ymm29, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm0 & ~ymm29) | (~ymm3 & ymm0 & ymm29) | (ymm3 & ~ymm0 & ymm29) | (ymm3 & ymm0 & ~ymm29) | (ymm3 & ymm0 & ymm29) ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-FCP-NEXT: vpblendd 
$219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] @@ -11555,7 +11563,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm10 ; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm17 -; AVX512-FCP-NEXT: vpternlogq $236, %ymm29, %ymm10, %ymm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ymm10 & ~ymm29) | (~ymm14 & ymm10 & ymm29) | (ymm14 & ~ymm10 & ymm29) | (ymm14 & ymm10 & ~ymm29) | (ymm14 & ymm10 & ymm29) ; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm0 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm10 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm11 @@ -11584,7 +11592,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm13, %xmm13 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5,6,7] -; AVX512-FCP-NEXT: vpternlogq $236, %ymm29, %ymm14, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm14 & ~ymm29) | (~ymm6 & ymm14 & ymm29) | (ymm6 & ~ymm14 & ymm29) | (ymm6 & ymm14 & ~ymm29) | (ymm6 & ymm14 & ymm29) ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm7 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3],xmm7[4],xmm2[5,6,7] @@ -11607,6 +11615,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = (~zmm5 & ~zmm3 & 
mem) | (zmm5 & ~zmm3 & mem) | (zmm5 & zmm3 & ~mem) | (zmm5 & zmm3 & mem) ; AVX512-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload @@ -11614,23 +11623,26 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = (~zmm5 & ~zmm3 & mem) | (zmm5 & ~zmm3 & mem) | (zmm5 & zmm3 & ~mem) | (zmm5 & zmm3 & mem) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rsi) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = (~zmm5 & ~zmm3 & mem) | (zmm5 & ~zmm3 & mem) | (zmm5 & zmm3 & ~mem) | (zmm5 & zmm3 & mem) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = (~zmm5 & ~zmm3 & mem) | (zmm5 & ~zmm3 & mem) | (zmm5 & zmm3 & ~mem) | (zmm5 & zmm3 & mem) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm5 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512-FCP-NEXT: vpternlogq $184, %zmm30, %zmm26, %zmm4 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm31, %zmm26, %zmm7 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm11 -; AVX512-FCP-NEXT: vpternlogq $184, 
%zmm2, %zmm26, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm26 & zmm30) | (zmm4 & ~zmm26 & ~zmm30) | (zmm4 & ~zmm26 & zmm30) | (zmm4 & zmm26 & zmm30) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm26 & zmm31) | (zmm7 & ~zmm26 & ~zmm31) | (zmm7 & ~zmm26 & zmm31) | (zmm7 & zmm26 & zmm31) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm26 & zmm0) | (zmm11 & ~zmm26 & ~zmm0) | (zmm11 & ~zmm26 & zmm0) | (zmm11 & zmm26 & zmm0) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm26 & zmm2) | (zmm1 & ~zmm26 & ~zmm2) | (zmm1 & ~zmm26 & zmm2) | (zmm1 & zmm26 & zmm2) ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 64(%rcx) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rcx) @@ -11719,7 +11731,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & ~zmm16 & zmm0) | (zmm3 & ~zmm16 & zmm0) | (zmm3 & zmm16 & ~zmm0) | (zmm3 & zmm16 & zmm0) ; AVX512DQ-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} @@ -11784,7 +11796,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = 
(~zmm7 & ~zmm16 & zmm17) | (zmm7 & ~zmm16 & zmm17) | (zmm7 & zmm16 & ~zmm17) | (zmm7 & zmm16 & zmm17) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15] @@ -11812,7 +11824,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & ~zmm16 & zmm0) | (zmm6 & ~zmm16 & zmm0) | (zmm6 & zmm16 & ~zmm0) | (zmm6 & zmm16 & zmm0) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm0 @@ -11844,7 +11856,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm16 & zmm0) | (zmm4 & ~zmm16 & zmm0) | (zmm4 & zmm16 & ~zmm0) | (zmm4 & zmm16 & zmm0) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11915,9 +11927,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 
$1, %ymm3, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm2, %zmm21, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm21 & zmm2) | (zmm1 & ~zmm21 & zmm2) | (zmm1 & zmm21 & ~zmm2) | (zmm1 & zmm21 & zmm2) ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm2 & zmm1) | (zmm3 & ~zmm2 & ~zmm1) | (zmm3 & ~zmm2 & zmm1) | (zmm3 & zmm2 & zmm1) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm18 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -11978,8 +11990,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm28 -; AVX512DQ-NEXT: vpternlogq $226, %zmm19, %zmm21, %zmm2 -; AVX512DQ-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm28 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm21 & zmm19) | (zmm2 & ~zmm21 & zmm19) | (zmm2 & zmm21 & ~zmm19) | (zmm2 & zmm21 & zmm19) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm28 = (~zmm28 & zmm18 & zmm2) | (zmm28 & ~zmm18 & ~zmm2) | (zmm28 & ~zmm18 & zmm2) | (zmm28 & zmm18 & zmm2) ; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm0 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm2 @@ -12016,8 +12028,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5] ; 
AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm27 -; AVX512DQ-NEXT: vpternlogq $226, %zmm20, %zmm21, %zmm0 -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm27 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm21 & zmm20) | (zmm0 & ~zmm21 & zmm20) | (zmm0 & zmm21 & ~zmm20) | (zmm0 & zmm21 & zmm20) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & zmm18 & zmm0) | (zmm27 & ~zmm18 & ~zmm0) | (zmm27 & ~zmm18 & zmm0) | (zmm27 & zmm18 & zmm0) ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] @@ -12045,8 +12057,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512DQ-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm2 -; AVX512DQ-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm20 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm21 & zmm0) | (zmm2 & ~zmm21 & zmm0) | (zmm2 & zmm21 & ~zmm0) | (zmm2 & zmm21 & zmm0) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm18 & zmm2) | (zmm20 & ~zmm18 & ~zmm2) | (zmm20 & ~zmm18 & zmm2) | (zmm20 & zmm18 & zmm2) ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] @@ -12084,7 +12096,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm4 ; AVX512DQ-NEXT: vmovdqa64 
%ymm5, %ymm31 -; AVX512DQ-NEXT: vpternlogq $236, %ymm22, %ymm4, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm4 & ~ymm22) | (~ymm2 & ymm4 & ymm22) | (ymm2 & ~ymm4 & ymm22) | (ymm2 & ymm4 & ~ymm22) | (ymm2 & ymm4 & ymm22) ; AVX512DQ-NEXT: movw $31, %ax ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm23 {%k1} @@ -12120,7 +12132,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,2,2,2,4,5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX512DQ-NEXT: vpternlogq $236, %ymm22, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm0 & ~ymm22) | (~ymm2 & ymm0 & ymm22) | (ymm2 & ~ymm0 & ymm22) | (ymm2 & ymm0 & ~ymm22) | (ymm2 & ymm0 & ymm22) ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm6 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] @@ -12178,7 +12190,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm13 -; AVX512DQ-NEXT: vpternlogq $236, %ymm22, %ymm13, %ymm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ymm13 & ~ymm22) | (~ymm9 & ymm13 & ymm22) | (ymm9 & ~ymm13 & ymm22) | (ymm9 & ymm13 & ~ymm22) | (ymm9 & ymm13 & ymm22) ; AVX512DQ-NEXT: vinserti32x8 $0, %ymm9, %zmm0, %zmm26 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm0 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm9 @@ -12197,7 +12209,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufd {{.*#+}} 
xmm12 = xmm12[1,1,2,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6,7] -; AVX512DQ-NEXT: vpternlogq $236, %ymm22, %ymm9, %ymm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ymm9 & ~ymm22) | (~ymm11 & ymm9 & ymm22) | (ymm11 & ~ymm9 & ymm22) | (ymm11 & ymm9 & ~ymm22) | (ymm11 & ymm9 & ymm22) ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,2,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] @@ -12226,10 +12238,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm3, (%rdx) ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512DQ-NEXT: vpternlogq $184, %zmm23, %zmm3, %zmm24 -; AVX512DQ-NEXT: vpternlogq $184, %zmm25, %zmm3, %zmm21 -; AVX512DQ-NEXT: vpternlogq $184, %zmm26, %zmm3, %zmm0 -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm24 = (~zmm24 & zmm3 & zmm23) | (zmm24 & ~zmm3 & ~zmm23) | (zmm24 & ~zmm3 & zmm23) | (zmm24 & zmm3 & zmm23) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & zmm3 & zmm25) | (zmm21 & ~zmm3 & ~zmm25) | (zmm21 & ~zmm3 & zmm25) | (zmm21 & zmm3 & zmm25) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm3 & zmm26) | (zmm0 & ~zmm3 & ~zmm26) | (zmm0 & ~zmm3 & zmm26) | (zmm0 & zmm3 & zmm26) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm3 & zmm1) | (zmm2 & ~zmm3 & ~zmm1) | (zmm2 & ~zmm3 & zmm1) | (zmm2 & zmm3 & zmm1) ; AVX512DQ-NEXT: vmovdqa64 %zmm28, 64(%rcx) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm1, (%rcx) @@ -12320,7 +12332,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & ~zmm16 & zmm0) | (zmm3 & ~zmm16 & zmm0) | (zmm3 & zmm16 & ~zmm0) | (zmm3 & zmm16 & zmm0) ; AVX512DQ-FCP-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} @@ -12385,7 +12397,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & ~zmm16 & zmm17) | (zmm6 & ~zmm16 & zmm17) | (zmm6 & zmm16 & ~zmm17) | (zmm6 & zmm16 & zmm17) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15] @@ -12413,7 +12425,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = 
(~zmm6 & ~zmm16 & zmm0) | (zmm6 & ~zmm16 & zmm0) | (zmm6 & zmm16 & ~zmm0) | (zmm6 & zmm16 & zmm0) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 @@ -12445,7 +12457,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm16 & zmm0) | (zmm4 & ~zmm16 & zmm0) | (zmm4 & zmm16 & ~zmm0) | (zmm4 & zmm16 & zmm0) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12516,9 +12528,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm3, %zmm20, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm20 & zmm3) | (zmm4 & ~zmm20 & zmm3) | (zmm4 & zmm20 & ~zmm3) | (zmm4 & zmm20 & zmm3) ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm28 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm4, %zmm28, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & 
zmm28 & zmm4) | (zmm5 & ~zmm28 & ~zmm4) | (zmm5 & ~zmm28 & zmm4) | (zmm5 & zmm28 & zmm4) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -12575,8 +12587,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm27 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm9, %zmm20, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm27 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm20 & zmm9) | (zmm1 & ~zmm20 & zmm9) | (zmm1 & zmm20 & ~zmm9) | (zmm1 & zmm20 & zmm9) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & zmm28 & zmm1) | (zmm27 & ~zmm28 & ~zmm1) | (zmm27 & ~zmm28 & zmm1) | (zmm27 & zmm28 & zmm1) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm26 @@ -12614,8 +12626,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,4,5] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm19 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm21, %zmm20, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm19 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm20 & zmm21) | (zmm1 & ~zmm20 & zmm21) | (zmm1 & zmm20 & ~zmm21) | (zmm1 & zmm20 & zmm21) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = (~zmm19 & zmm28 & zmm1) | (zmm19 & ~zmm28 & ~zmm1) | (zmm19 & ~zmm28 & zmm1) | (zmm19 & zmm28 & zmm1) ; 
AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 @@ -12641,8 +12653,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm21 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm0, %zmm20, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm21 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm20 & zmm0) | (zmm1 & ~zmm20 & zmm0) | (zmm1 & zmm20 & ~zmm0) | (zmm1 & zmm20 & zmm0) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & zmm28 & zmm1) | (zmm21 & ~zmm28 & ~zmm1) | (zmm21 & ~zmm28 & zmm1) | (zmm21 & zmm28 & zmm1) ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm9 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] @@ -12678,7 +12690,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm14, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm20, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm3 & ~ymm20) | (~ymm0 & ymm3 & ymm20) | (ymm0 & ~ymm3 & ymm20) | (ymm0 & ymm3 & ~ymm20) | (ymm0 & ymm3 & ymm20) ; AVX512DQ-FCP-NEXT: movw $31, %ax ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm22 {%k1} @@ -12715,7 +12727,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm31 -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm20, %ymm2, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm2 & ~ymm20) | (~ymm3 & ymm2 & ymm20) | (ymm3 & ~ymm2 & ymm20) | (ymm3 & ymm2 & ~ymm20) | (ymm3 & ymm2 & ymm20) ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] @@ -12757,7 +12769,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm20, %ymm14, %ymm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ymm14 & ~ymm20) | (~ymm10 & ymm14 & ymm20) | (ymm10 & ~ymm14 & ymm20) | (ymm10 & ymm14 & ~ymm20) | (ymm10 & ymm14 & ymm20) ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm14 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm9 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm9[0],xmm14[1],xmm9[2,3],xmm14[4],xmm9[5,6,7] @@ -12788,7 +12800,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm20, %ymm15, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ymm15 & ~ymm20) | (~ymm5 & ymm15 & ymm20) | (ymm5 & ~ymm15 & ymm20) | (ymm5 & 
ymm15 & ~ymm20) | (ymm5 & ymm15 & ymm20) ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm8 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3],xmm8[4],xmm2[5,6,7] @@ -12813,10 +12825,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%rdx) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rdx) -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm22, %zmm28, %zmm23 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm24, %zmm28, %zmm13 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm28, %zmm10 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm2, %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = (~zmm23 & zmm28 & zmm22) | (zmm23 & ~zmm28 & ~zmm22) | (zmm23 & ~zmm28 & zmm22) | (zmm23 & zmm28 & zmm22) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm28 & zmm24) | (zmm13 & ~zmm28 & ~zmm24) | (zmm13 & ~zmm28 & zmm24) | (zmm13 & zmm28 & zmm24) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm28 & zmm0) | (zmm10 & ~zmm28 & ~zmm0) | (zmm10 & ~zmm28 & zmm0) | (zmm10 & zmm28 & zmm0) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm28 & zmm2) | (zmm1 & ~zmm28 & ~zmm2) | (zmm1 & ~zmm28 & zmm2) | (zmm1 & zmm28 & zmm2) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 64(%rcx) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index af340d15fe8f64..9c2fb7704d1d45 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -6979,7 +6979,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1,2,3],xmm7[4],xmm3[5],xmm7[6],xmm3[7] ; AVX512-NEXT: vmovdqa64 %ymm28, %ymm7 ; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX512-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm0 & ~mem) | (ymm3 & ~ymm0 & ~mem) | (ymm3 & ~ymm0 & mem) | (ymm3 & ymm0 & ~mem) | (ymm3 & ymm0 & mem) ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm8[3],ymm13[4,5],ymm8[6],ymm13[7] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm7 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] @@ -7128,18 +7128,18 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512-NEXT: vpternlogq $184, %zmm21, %zmm4, %zmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm4 & zmm21) | (zmm11 & ~zmm4 & ~zmm21) | (zmm11 & ~zmm4 & zmm21) | (zmm11 & zmm4 & zmm21) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm22 -; AVX512-NEXT: vpternlogq $184, %zmm22, %zmm4, %zmm12 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & zmm5 & ~mem) | (zmm22 & ~zmm5 & mem) | (zmm22 & zmm5 & ~mem) | (zmm22 & zmm5 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm4 & zmm22) | (zmm12 & ~zmm4 & ~zmm22) | (zmm12 & ~zmm4 & zmm22) | (zmm12 & zmm4 & zmm22) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $184, %zmm19, %zmm4, %zmm23 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = (~zmm23 & zmm4 & zmm19) | (zmm23 & ~zmm4 
& ~zmm19) | (zmm23 & ~zmm4 & zmm19) | (zmm23 & zmm4 & zmm19) ; AVX512-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm14, %zmm23 {%k1} -; AVX512-NEXT: vpternlogq $184, %zmm20, %zmm4, %zmm27 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & zmm4 & zmm20) | (zmm27 & ~zmm4 & ~zmm20) | (zmm27 & ~zmm4 & zmm20) | (zmm27 & zmm4 & zmm20) ; AVX512-NEXT: vmovdqa32 %zmm25, %zmm27 {%k1} -; AVX512-NEXT: vpternlogq $226, %zmm18, %zmm4, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm4 & zmm18) | (zmm1 & ~zmm4 & zmm18) | (zmm1 & zmm4 & ~zmm18) | (zmm1 & zmm4 & zmm18) ; AVX512-NEXT: vmovdqa32 %zmm10, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm11, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm12, (%rdx) @@ -7147,11 +7147,11 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm27, (%r8) ; AVX512-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vpternlogq $226, %zmm17, %zmm4, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm4 & zmm17) | (zmm2 & ~zmm4 & zmm17) | (zmm2 & zmm4 & ~zmm17) | (zmm2 & zmm4 & zmm17) ; AVX512-NEXT: vmovdqa32 %zmm26, %zmm2 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vpternlogq $226, %zmm16, %zmm4, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & ~zmm4 & zmm16) | (zmm3 & ~zmm4 & zmm16) | (zmm3 & zmm4 & ~zmm16) | (zmm3 & zmm4 & zmm16) ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512-NEXT: vzeroupper @@ -7344,7 +7344,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, %zmm30, %zmm16 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm30 & zmm0) | (zmm16 & ~zmm30 & ~zmm0) | (zmm16 & ~zmm30 & zmm0) | (zmm16 & zmm30 & zmm0) ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7] @@ -7429,17 +7429,17 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm23, %zmm3, %zmm26 -; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm24 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm24, %zmm3, %zmm27 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm21, %zmm30, %zmm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = (~zmm26 & zmm3 & zmm23) | (zmm26 & ~zmm3 & ~zmm23) | (zmm26 & ~zmm3 & zmm23) | (zmm26 & zmm3 & zmm23) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = (~zmm24 & zmm20 & ~mem) | (zmm24 & ~zmm20 & mem) | (zmm24 & zmm20 & ~mem) | (zmm24 & zmm20 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & zmm3 & zmm24) | (zmm27 & ~zmm3 & ~zmm24) | (zmm27 & ~zmm3 & zmm24) | (zmm27 & zmm3 & zmm24) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & zmm30 & zmm21) | (zmm15 & ~zmm30 & ~zmm21) | (zmm15 & ~zmm30 & zmm21) | (zmm15 & zmm30 & zmm21) ; AVX512-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqa32 
%zmm28, %zmm15 {%k1} -; AVX512-FCP-NEXT: vpternlogq $184, %zmm18, %zmm30, %zmm19 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = (~zmm19 & zmm30 & zmm18) | (zmm19 & ~zmm30 & ~zmm18) | (zmm19 & ~zmm30 & zmm18) | (zmm19 & zmm30 & zmm18) ; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm19 {%k1} ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm16 {%k1} -; AVX512-FCP-NEXT: vpternlogq $226, %zmm25, %zmm30, %zmm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & ~zmm30 & zmm25) | (zmm11 & ~zmm30 & zmm25) | (zmm11 & zmm30 & ~zmm25) | (zmm11 & zmm30 & zmm25) ; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm11 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, (%rdx) @@ -7448,7 +7448,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512-FCP-NEXT: vpternlogq $226, %zmm10, %zmm30, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm30 & zmm10) | (zmm0 & ~zmm30 & zmm10) | (zmm0 & zmm30 & ~zmm10) | (zmm0 & zmm30 & zmm10) ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) @@ -7639,7 +7639,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] ; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm0 & ~mem) | (ymm2 & ~ymm0 & ~mem) | (ymm2 & ~ymm0 & mem) | (ymm2 & ymm0 & ~mem) | (ymm2 & ymm0 & mem) ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] @@ -7649,7 
+7649,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, %zmm19, %zmm25, %zmm22 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & zmm25 & zmm19) | (zmm22 & ~zmm25 & ~zmm19) | (zmm22 & ~zmm25 & zmm19) | (zmm22 & zmm25 & zmm19) ; AVX512DQ-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm22 {%k1} @@ -7682,7 +7682,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $184, %zmm17, %zmm25, %zmm19 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = (~zmm19 & zmm25 & zmm17) | (zmm19 & ~zmm25 & ~zmm17) | (zmm19 & ~zmm25 & zmm17) | (zmm19 & zmm25 & zmm17) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm19 {%k1} ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm10[2],ymm4[3,4,5],ymm10[6],ymm4[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] @@ -7714,7 +7714,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero ; AVX512DQ-NEXT: vpor %ymm3, %ymm12, %ymm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %zmm20, %zmm25, %zmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & ~zmm25 & zmm20) | (zmm11 & ~zmm25 & zmm20) | (zmm11 & zmm25 & 
~zmm20) | (zmm11 & zmm25 & zmm20) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm11 {%k1} ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5,6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 @@ -7768,7 +7768,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,3,4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %zmm24, %zmm25, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm25 & zmm24) | (zmm2 & ~zmm25 & zmm24) | (zmm2 & zmm25 & ~zmm24) | (zmm2 & zmm25 & zmm24) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm17, %zmm0, %zmm2 {%k1} ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm3[1],ymm10[2,3],ymm3[4],ymm10[5,6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] @@ -7799,13 +7799,13 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %zmm27, %zmm25, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & ~zmm25 & zmm27) | (zmm3 & ~zmm25 & zmm27) | (zmm3 & zmm25 & ~zmm27) | (zmm3 & zmm25 & zmm27) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-NEXT: vpternlogq $184, %zmm18, %zmm1, %zmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & zmm1 & zmm18) | (zmm8 & ~zmm1 & ~zmm18) | 
(zmm8 & ~zmm1 & zmm18) | (zmm8 & zmm1 & zmm18) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm21 -; AVX512DQ-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & zmm0 & ~mem) | (zmm21 & ~zmm0 & mem) | (zmm21 & zmm0 & ~mem) | (zmm21 & zmm0 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & zmm1 & zmm21) | (zmm9 & ~zmm1 & ~zmm21) | (zmm9 & ~zmm1 & zmm21) | (zmm9 & zmm1 & zmm21) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%rcx) @@ -7939,7 +7939,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm14, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm9, %zmm19, %zmm16 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm19 & zmm9) | (zmm16 & ~zmm19 & ~zmm9) | (zmm16 & ~zmm19 & zmm9) | (zmm16 & zmm19 & zmm9) ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX512DQ-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 @@ -7977,7 +7977,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero ; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm12, %ymm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm2, %zmm19, %zmm20 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm19 & zmm2) | (zmm20 & ~zmm19 & ~zmm2) | (zmm20 & ~zmm19 & zmm2) | (zmm20 & zmm19 & zmm2) ; 
AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm9, %zmm0, %zmm20 {%k1} ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] @@ -8002,7 +8002,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm15, %xmm11 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm12, %zmm19, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm19 & zmm12) | (zmm10 & ~zmm19 & ~zmm12) | (zmm10 & ~zmm19 & zmm12) | (zmm10 & zmm19 & zmm12) ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] @@ -8065,7 +8065,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero ; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm9, %ymm9 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm12, %zmm19, %zmm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & ~zmm19 & zmm12) | (zmm11 & ~zmm19 & zmm12) | (zmm11 & zmm19 & ~zmm12) | (zmm11 & zmm19 & zmm12) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,4,8,11,15,0,0,0] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm9 @@ -8094,12 +8094,12 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm2, %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm19 & zmm2) | (zmm0 & ~zmm19 & zmm2) | (zmm0 & zmm19 & ~zmm2) | (zmm0 & zmm19 & zmm2) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm23, %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm24 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm24, %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & zmm1 & zmm23) | (zmm9 & ~zmm1 & ~zmm23) | (zmm9 & ~zmm1 & zmm23) | (zmm9 & zmm1 & zmm23) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = (~zmm24 & zmm21 & ~mem) | (zmm24 & ~zmm21 & mem) | (zmm24 & zmm21 & ~mem) | (zmm24 & zmm21 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm1 & zmm24) | (zmm5 & ~zmm1 & ~zmm24) | (zmm5 & ~zmm1 & zmm24) | (zmm5 & zmm1 & zmm24) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rcx) @@ -14527,7 +14527,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm12, %ymm11, %ymm11 ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $242, %ymm0, %ymm20, %ymm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm20 & ymm0) | (ymm11 & ~ymm20 & ~ymm0) | (ymm11 & ~ymm20 & ymm0) | (ymm11 & ymm20 & ~ymm0) | (ymm11 & 
ymm20 & ymm0) ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7] ; AVX512-NEXT: vmovdqa %ymm15, %ymm13 ; AVX512-NEXT: vextracti32x4 $1, %ymm0, %xmm25 @@ -14546,7 +14546,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm11 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5],xmm11[6],xmm1[7] ; AVX512-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512-NEXT: vpternlogq $242, %ymm0, %ymm20, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm20 & ymm0) | (ymm1 & ~ymm20 & ~ymm0) | (ymm1 & ~ymm20 & ymm0) | (ymm1 & ymm20 & ~ymm0) | (ymm1 & ymm20 & ymm0) ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] @@ -14700,7 +14700,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm19, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm19 & zmm0) | (zmm3 & ~zmm19 & ~zmm0) | (zmm3 & ~zmm19 & zmm0) | (zmm3 & zmm19 & zmm0) ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] ; AVX512-NEXT: vmovdqa %ymm4, %ymm14 @@ -14812,7 +14812,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm19, %zmm30 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm30 = (~zmm30 & zmm19 & zmm0) | (zmm30 & ~zmm19 & ~zmm0) | (zmm30 & ~zmm19 & zmm0) | (zmm30 & zmm19 & zmm0) ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5,6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] @@ -14868,7 +14868,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vpternlogq $184, %zmm6, %zmm19, %zmm13 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm19 & zmm6) | (zmm13 & ~zmm19 & ~zmm6) | (zmm13 & ~zmm19 & zmm6) | (zmm13 & zmm19 & zmm6) ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] ; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm8 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7] @@ -14887,13 +14887,17 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm9 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm10 # 64-byte Folded Reload +; AVX512-NEXT: # zmm10 = (~zmm10 & ~zmm9 & mem) | (zmm10 & ~zmm9 & mem) | (zmm10 & zmm9 & ~mem) | (zmm10 & zmm9 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-NEXT: 
vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload +; AVX512-NEXT: # zmm12 = (~zmm12 & ~zmm9 & mem) | (zmm12 & ~zmm9 & mem) | (zmm12 & zmm9 & ~mem) | (zmm12 & zmm9 & mem) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload +; AVX512-NEXT: # zmm1 = (~zmm1 & zmm9 & mem) | (zmm1 & ~zmm9 & ~mem) | (zmm1 & ~zmm9 & mem) | (zmm1 & zmm9 & mem) ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: vpternlogq $184, %zmm10, %zmm9, %zmm2 -; AVX512-NEXT: vpternlogq $184, %zmm12, %zmm9, %zmm4 +; AVX512-NEXT: # zmm3 = (~zmm3 & zmm9 & mem) | (zmm3 & ~zmm9 & ~mem) | (zmm3 & ~zmm9 & mem) | (zmm3 & zmm9 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm9 & zmm10) | (zmm2 & ~zmm9 & ~zmm10) | (zmm2 & ~zmm9 & zmm10) | (zmm2 & zmm9 & zmm10) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm9 & zmm12) | (zmm4 & ~zmm9 & ~zmm12) | (zmm4 & ~zmm9 & zmm12) | (zmm4 & zmm9 & zmm12) ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,3,1] ; AVX512-NEXT: vpshufb %ymm5, %ymm8, %ymm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1,2],ymm5[3,4,5,6,7] @@ -14912,21 +14916,26 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm12 # 64-byte Folded Reload +; AVX512-NEXT: # zmm12 = (~zmm12 & zmm19 & mem) | (zmm12 & ~zmm19 & ~mem) | (zmm12 & ~zmm19 & mem) | (zmm12 & zmm19 & mem) ; AVX512-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqa32 %zmm6, %zmm12 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 
64-byte Reload ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload +; AVX512-NEXT: # zmm6 = (~zmm6 & zmm19 & mem) | (zmm6 & ~zmm19 & ~mem) | (zmm6 & ~zmm19 & mem) | (zmm6 & zmm19 & mem) ; AVX512-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload +; AVX512-NEXT: # zmm6 = (~zmm6 & zmm19 & mem) | (zmm6 & ~zmm19 & ~mem) | (zmm6 & ~zmm19 & mem) | (zmm6 & zmm19 & mem) ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm6 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm25 # 64-byte Folded Reload +; AVX512-NEXT: # zmm25 = (~zmm25 & zmm19 & mem) | (zmm25 & ~zmm19 & ~mem) | (zmm25 & ~zmm19 & mem) | (zmm25 & zmm19 & mem) ; AVX512-NEXT: vmovdqa32 %zmm9, %zmm25 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload +; AVX512-NEXT: # zmm6 = (~zmm6 & ~zmm19 & mem) | (zmm6 & ~zmm19 & mem) | (zmm6 & zmm19 & ~mem) | (zmm6 & zmm19 & mem) ; AVX512-NEXT: vmovdqa32 %zmm10, %zmm6 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm3, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rsi) @@ -14941,11 +14950,13 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa32 %zmm14, %zmm2 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm1 # 64-byte Folded Reload +; AVX512-NEXT: # zmm1 = (~zmm1 & ~zmm19 & mem) | (zmm1 & ~zmm19 & mem) | (zmm1 & zmm19 & ~mem) | (zmm1 & zmm19 & mem) ; AVX512-NEXT: vmovdqa32 %zmm15, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, 64(%r9) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-NEXT: vpternlogq 
$226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm11 # 64-byte Folded Reload +; AVX512-NEXT: # zmm11 = (~zmm11 & ~zmm19 & mem) | (zmm11 & ~zmm19 & mem) | (zmm11 & zmm19 & ~mem) | (zmm11 & zmm19 & mem) ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1 ; AVX512-NEXT: vmovdqa32 %zmm23, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} @@ -15433,7 +15444,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm29, %zmm27 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & zmm29 & zmm1) | (zmm27 & ~zmm29 & ~zmm1) | (zmm27 & ~zmm29 & zmm1) | (zmm27 & zmm29 & zmm1) ; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm7[3],ymm15[4,5],ymm7[6],ymm15[7] ; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm12 @@ -15469,7 +15480,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermd %zmm25, %zmm5, %zmm3 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm29, %zmm26 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = (~zmm26 & zmm29 & zmm1) | (zmm26 & ~zmm29 & ~zmm1) | (zmm26 & ~zmm29 & zmm1) | (zmm26 & zmm29 & zmm1) ; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm5 ; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 @@ -15619,19 +15630,24 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm9 = 
[0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm12 = (~zmm12 & ~zmm9 & mem) | (zmm12 & ~zmm9 & mem) | (zmm12 & zmm9 & ~mem) | (zmm12 & zmm9 & mem) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm13 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm13 = (~zmm13 & ~zmm9 & mem) | (zmm13 & ~zmm9 & mem) | (zmm13 & zmm9 & ~mem) | (zmm13 & zmm9 & mem) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm17 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm17 = (~zmm17 & zmm9 & mem) | (zmm17 & ~zmm9 & ~mem) | (zmm17 & ~zmm9 & mem) | (zmm17 & zmm9 & mem) ; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm21 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $184, %zmm12, %zmm9, %zmm5 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm13, %zmm9, %zmm11 +; AVX512-FCP-NEXT: # zmm21 = (~zmm21 & zmm9 & mem) | (zmm21 & ~zmm9 & ~mem) | (zmm21 & ~zmm9 & mem) | (zmm21 & zmm9 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm9 & zmm12) | (zmm5 & ~zmm9 & ~zmm12) | (zmm5 & ~zmm9 & zmm12) | (zmm5 & zmm9 & zmm12) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm9 & zmm13) | (zmm11 & ~zmm9 & ~zmm13) | (zmm11 & ~zmm9 & zmm13) | (zmm11 & zmm9 & zmm13) ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm10[1,2],ymm8[3,4,5,6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm8[0,1,2],ymm3[3,4,5,6,7],ymm8[8,9,10],ymm3[11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm19 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm19 = (~zmm19 & zmm29 & mem) | (zmm19 & ~zmm29 & ~mem) | (zmm19 & ~zmm29 & mem) | (zmm19 & zmm29 & mem) ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload @@ -15647,18 +15663,22 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm19 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm2 = (~zmm2 & zmm29 & mem) | (zmm2 & ~zmm29 & ~mem) | (zmm2 & ~zmm29 & mem) | (zmm2 & zmm29 & mem) ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm8 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm8 = (~zmm8 & zmm29 & mem) | (zmm8 & ~zmm29 & ~mem) | (zmm8 & ~zmm29 & mem) | (zmm8 & zmm29 & mem) ; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm8 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm6 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm6 = (~zmm6 & zmm29 & mem) | (zmm6 & ~zmm29 & ~mem) | (zmm6 & ~zmm29 & mem) | (zmm6 & zmm29 & mem) ; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm9 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm13, %zmm9 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1} ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm1 = (~zmm1 & ~zmm29 & mem) | (zmm1 & ~zmm29 & mem) | (zmm1 & zmm29 & ~mem) | (zmm1 & zmm29 & mem) ; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm1 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rsi) @@ -15674,13 +15694,16 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm1 = (~zmm1 & ~zmm29 & mem) | (zmm1 & ~zmm29 & mem) | (zmm1 & zmm29 & ~mem) | (zmm1 & zmm29 & mem) ; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm1 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 = (~zmm4 & ~zmm29 & mem) | (zmm4 & ~zmm29 & mem) | (zmm4 & zmm29 & ~mem) | (zmm4 & zmm29 & mem) ; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax) ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = (~zmm0 & ~zmm29 & mem) | (zmm0 & ~zmm29 & mem) | (zmm0 & zmm29 & ~mem) | (zmm0 & zmm29 & mem) ; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512-FCP-NEXT: addq $1800, %rsp # imm = 0x708 @@ -16152,7 +16175,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = 
[0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm10, %ymm10 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $242, %ymm0, %ymm19, %ymm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm19 & ymm0) | (ymm10 & ~ymm19 & ~ymm0) | (ymm10 & ~ymm19 & ymm0) | (ymm10 & ymm19 & ~ymm0) | (ymm10 & ymm19 & ymm0) ; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] ; AVX512DQ-NEXT: vextracti32x4 $1, %ymm0, %xmm28 @@ -16165,6 +16188,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm10 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm10 = (~zmm10 & zmm17 & mem) | (zmm10 & ~zmm17 & ~mem) | (zmm10 & ~zmm17 & mem) | (zmm10 & zmm17 & mem) ; AVX512DQ-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1} @@ -16178,7 +16202,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm15 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm15[4],xmm11[5],xmm15[6],xmm11[7] ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm11, %ymm11 -; AVX512DQ-NEXT: vpternlogq $242, %ymm0, %ymm19, %ymm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm19 & ymm0) | (ymm11 & ~ymm19 & ~ymm0) | (ymm11 & ~ymm19 & ymm0) | (ymm11 & ymm19 & ~ymm0) | (ymm11 & ymm19 & ymm0) ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm12 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] @@ -16189,6 +16213,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm11 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm11 = (~zmm11 & zmm17 & mem) | (zmm11 & ~zmm17 & ~mem) | (zmm11 & ~zmm17 & mem) | (zmm11 & zmm17 & mem) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm0 @@ -16227,6 +16252,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm27 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm27 = (~zmm27 & zmm17 & mem) | (zmm27 & ~zmm17 & ~mem) | (zmm27 & ~zmm17 & mem) | (zmm27 & zmm17 & mem) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm27 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm0 ; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm2 @@ -16266,6 +16292,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm29 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm29 = (~zmm29 & zmm17 & mem) | (zmm29 & ~zmm17 & ~mem) | (zmm29 & ~zmm17 & mem) | (zmm29 & zmm17 & mem) ; AVX512DQ-NEXT: 
vinserti32x8 $1, %ymm0, %zmm0, %zmm29 {%k1} ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -16314,6 +16341,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm14 ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm7 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm7 = (~zmm7 & ~zmm17 & mem) | (zmm7 & ~zmm17 & mem) | (zmm7 & zmm17 & ~mem) | (zmm7 & zmm17 & mem) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1} ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload @@ -16336,7 +16364,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm14, %zmm12 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm14 & zmm0) | (zmm12 & ~zmm14 & ~zmm0) | (zmm12 & ~zmm14 & zmm0) | (zmm12 & zmm14 & zmm0) ; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm9 ; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm7 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] @@ -16407,6 +16435,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm12 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm12 = (~zmm12 & ~zmm24 & mem) | (zmm12 & ~zmm24 & mem) | (zmm12 & zmm24 & ~mem) | (zmm12 & zmm24 & mem) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm12 {%k1} ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm3[3],ymm14[4,5],ymm3[6],ymm14[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] @@ -16445,7 +16474,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm24, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm24 & zmm1) | (zmm7 & ~zmm24 & ~zmm1) | (zmm7 & ~zmm24 & zmm1) | (zmm7 & zmm24 & zmm1) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6,7] @@ -16486,6 +16515,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm11 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm11 = (~zmm11 & ~zmm24 & mem) | (zmm11 & ~zmm24 & mem) | (zmm11 & zmm24 & ~mem) | (zmm11 & zmm24 & mem) ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm11 {%k1} ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] ; 
AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] @@ -16501,7 +16531,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm24, %zmm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & zmm24 & zmm1) | (zmm6 & ~zmm24 & ~zmm1) | (zmm6 & ~zmm24 & zmm1) | (zmm6 & zmm24 & zmm1) ; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm5 @@ -16527,13 +16557,17 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm0 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm1 = (~zmm1 & ~zmm0 & mem) | (zmm1 & ~zmm0 & mem) | (zmm1 & zmm0 & ~mem) | (zmm1 & zmm0 & mem) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm4 = (~zmm4 & ~zmm0 & mem) | (zmm4 & ~zmm0 & mem) | (zmm4 & zmm0 & ~mem) | (zmm4 & zmm0 & mem) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm28 = (~zmm28 & zmm0 & mem) | (zmm28 & 
~zmm0 & ~mem) | (zmm28 & ~zmm0 & mem) | (zmm28 & zmm0 & mem) ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm3 +; AVX512DQ-NEXT: # zmm23 = (~zmm23 & zmm0 & mem) | (zmm23 & ~zmm0 & ~mem) | (zmm23 & ~zmm0 & mem) | (zmm23 & zmm0 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm0 & zmm1) | (zmm2 & ~zmm0 & ~zmm1) | (zmm2 & ~zmm0 & zmm1) | (zmm2 & zmm0 & zmm1) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm0 & zmm4) | (zmm3 & ~zmm0 & ~zmm4) | (zmm3 & ~zmm0 & zmm4) | (zmm3 & zmm0 & zmm4) ; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm28, 64(%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rdx) @@ -16946,6 +16980,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm6 = (~zmm6 & zmm25 & mem) | (zmm6 & ~zmm25 & ~mem) | (zmm6 & ~zmm25 & mem) | (zmm6 & zmm25 & mem) ; AVX512DQ-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1} @@ -16963,6 +16998,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm1 = (~zmm1 & zmm25 & mem) | (zmm1 & ~zmm25 
& ~mem) | (zmm1 & ~zmm25 & mem) | (zmm1 & zmm25 & mem) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm7, %ymm0 @@ -17000,6 +17036,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm23 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm23 = (~zmm23 & zmm25 & mem) | (zmm23 & ~zmm25 & ~mem) | (zmm23 & ~zmm25 & mem) | (zmm23 & zmm25 & mem) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm23 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 864(%rdi), %ymm1 @@ -17022,6 +17059,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm29 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm29 = (~zmm29 & zmm25 & mem) | (zmm29 & ~zmm25 & ~mem) | (zmm29 & ~zmm25 & mem) | (zmm29 & zmm25 & mem) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm29 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -17040,7 +17078,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm28 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = (~zmm28 & zmm25 & zmm0) | (zmm28 & ~zmm25 
& ~zmm0) | (zmm28 & ~zmm25 & zmm0) | (zmm28 & zmm25 & zmm0) ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm19 @@ -17077,7 +17115,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm29 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm26 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = (~zmm26 & zmm25 & zmm0) | (zmm26 & ~zmm25 & ~zmm0) | (zmm26 & ~zmm25 & zmm0) | (zmm26 & zmm25 & zmm0) ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 @@ -17132,6 +17170,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm10, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm0 = (~zmm0 & ~zmm25 & mem) | (zmm0 & ~zmm25 & mem) | (zmm0 & zmm25 & ~mem) | (zmm0 & zmm25 & mem) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7] @@ -17160,6 +17199,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm9 = (~zmm9 & ~zmm25 & mem) | (zmm9 & ~zmm25 & mem) | (zmm9 & zmm25 & 
~mem) | (zmm9 & zmm25 & mem) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm9 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm2 @@ -17196,6 +17236,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm12 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm12 = (~zmm12 & ~zmm25 & mem) | (zmm12 & ~zmm25 & mem) | (zmm12 & zmm25 & ~mem) | (zmm12 & zmm25 & mem) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 @@ -17227,17 +17268,22 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm1 = (~zmm1 & ~zmm25 & mem) | (zmm1 & ~zmm25 & mem) | (zmm1 & zmm25 & ~mem) | (zmm1 & zmm25 & mem) ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1} ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm2 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm3 = (~zmm3 & ~zmm2 & mem) | (zmm3 & ~zmm2 & mem) | (zmm3 & zmm2 & ~mem) | (zmm3 & zmm2 & mem) ; AVX512DQ-FCP-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm4 = (~zmm4 & ~zmm2 & mem) | (zmm4 & ~zmm2 & mem) | (zmm4 & zmm2 & ~mem) | (zmm4 & zmm2 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm5 = (~zmm5 & zmm2 & mem) | (zmm5 & ~zmm2 & ~mem) | (zmm5 & ~zmm2 & mem) | (zmm5 & zmm2 & mem) ; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm3, %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm4, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: # zmm6 = (~zmm6 & zmm2 & mem) | (zmm6 & ~zmm2 & ~mem) | (zmm6 & ~zmm2 & mem) | (zmm6 & zmm2 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm2 & zmm3) | (zmm10 & ~zmm2 & ~zmm3) | (zmm10 & ~zmm2 & zmm3) | (zmm10 & zmm2 & zmm3) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & zmm2 & zmm4) | (zmm8 & ~zmm2 & ~zmm4) | (zmm8 & ~zmm2 & zmm4) | (zmm8 & zmm2 & zmm4) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%rdx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index e4dc257543d20c..44684603e301d0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -1979,7 +1979,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX512-NEXT: vmovdqa %ymm0, %ymm5 -; 
AVX512-NEXT: vpternlogq $202, %ymm4, %ymm3, %ymm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm3 & ymm4) | (~ymm5 & ymm3 & ymm4) | (ymm5 & ymm3 & ~ymm4) | (ymm5 & ymm3 & ymm4) ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] @@ -1990,34 +1990,34 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX512-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] -; AVX512-NEXT: vpternlogq $184, %xmm7, %xmm9, %xmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm8 = (~xmm8 & xmm9 & xmm7) | (xmm8 & ~xmm9 & ~xmm7) | (xmm8 & ~xmm9 & xmm7) | (xmm8 & xmm9 & xmm7) ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512-NEXT: vpternlogq $184, %xmm5, %xmm9, %xmm6 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm6 = (~xmm6 & xmm9 & xmm5) | (xmm6 & ~xmm9 & ~xmm5) | (xmm6 & ~xmm9 & xmm5) | (xmm6 & xmm9 & xmm5) ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12] ; AVX512-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm7 +; AVX512-NEXT: 
vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm4 & ymm3) | (~ymm7 & ymm4 & ymm3) | (ymm7 & ymm4 & ~ymm3) | (ymm7 & ymm4 & ymm3) ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm10 ; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm11 = (~xmm11 & ~xmm9 & xmm5) | (xmm11 & ~xmm9 & xmm5) | (xmm11 & xmm9 & ~xmm5) | (xmm11 & xmm9 & xmm5) ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] ; AVX512-NEXT: vpor %xmm5, %xmm12, %xmm5 ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX512-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm7 = (~xmm7 & ~xmm9 & xmm5) | (xmm7 & ~xmm9 & xmm5) | (xmm7 & xmm9 & ~xmm5) | (xmm7 & xmm9 & xmm5) ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] ; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm9 -; AVX512-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm4 & ymm3) | (~ymm0 & ymm4 & ymm3) | (ymm0 & ymm4 & ~ymm3) | (ymm0 & ymm4 & ymm3) ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm4 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7] @@ -2051,7 +2051,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm5 -; 
AVX512-FCP-NEXT: vpternlogq $202, %ymm4, %ymm3, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm3 & ymm4) | (~ymm5 & ymm3 & ymm4) | (ymm5 & ymm3 & ~ymm4) | (ymm5 & ymm3 & ymm4) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] @@ -2062,34 +2062,34 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] -; AVX512-FCP-NEXT: vpternlogq $184, %xmm7, %xmm9, %xmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm8 = (~xmm8 & xmm9 & xmm7) | (xmm8 & ~xmm9 & ~xmm7) | (xmm8 & ~xmm9 & xmm7) | (xmm8 & xmm9 & xmm7) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512-FCP-NEXT: vpternlogq $184, %xmm5, %xmm9, %xmm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm6 = (~xmm6 & xmm9 & xmm5) | (xmm6 & ~xmm9 & ~xmm5) | (xmm6 & ~xmm9 & xmm5) | (xmm6 & xmm9 & xmm5) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12] ; AVX512-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = 
[65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm4 & ymm3) | (~ymm7 & ymm4 & ymm3) | (ymm7 & ymm4 & ~ymm3) | (ymm7 & ymm4 & ymm3) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm10 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512-FCP-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm11 = (~xmm11 & ~xmm9 & xmm5) | (xmm11 & ~xmm9 & xmm5) | (xmm11 & xmm9 & ~xmm5) | (xmm11 & xmm9 & xmm5) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] ; AVX512-FCP-NEXT: vpor %xmm5, %xmm12, %xmm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX512-FCP-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm7 = (~xmm7 & ~xmm9 & xmm5) | (xmm7 & ~xmm9 & xmm5) | (xmm7 & xmm9 & ~xmm5) | (xmm7 & xmm9 & xmm5) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm9 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm4 & ymm3) | (~ymm0 & ymm4 & ymm3) | (ymm0 & ymm4 & ~ymm3) | (ymm0 & ymm4 & ymm3) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm4 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = 
xmm4[0,1,2,3,4],xmm9[5,6,7] @@ -2123,7 +2123,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512DQ-NEXT: vpternlogq $202, %ymm4, %ymm3, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm3 & ymm4) | (~ymm5 & ymm3 & ymm4) | (ymm5 & ymm3 & ~ymm4) | (ymm5 & ymm3 & ymm4) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] @@ -2134,34 +2134,34 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512DQ-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] -; AVX512DQ-NEXT: vpternlogq $184, %xmm7, %xmm9, %xmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm8 = (~xmm8 & xmm9 & xmm7) | (xmm8 & ~xmm9 & ~xmm7) | (xmm8 & ~xmm9 & xmm7) | (xmm8 & xmm9 & xmm7) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-NEXT: vpternlogq $184, %xmm5, %xmm9, %xmm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm6 = (~xmm6 & xmm9 & xmm5) | (xmm6 & ~xmm9 & ~xmm5) | (xmm6 & ~xmm9 & xmm5) | (xmm6 & xmm9 & xmm5) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX512DQ-NEXT: vpshufb {{.*#+}} 
xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12] ; AVX512DQ-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm4 & ymm3) | (~ymm7 & ymm4 & ymm3) | (ymm7 & ymm4 & ~ymm3) | (ymm7 & ymm4 & ymm3) ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm10 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512DQ-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm11 = (~xmm11 & ~xmm9 & xmm5) | (xmm11 & ~xmm9 & xmm5) | (xmm11 & xmm9 & ~xmm5) | (xmm11 & xmm9 & xmm5) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] ; AVX512DQ-NEXT: vpor %xmm5, %xmm12, %xmm5 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX512DQ-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm7 = (~xmm7 & ~xmm9 & xmm5) | (xmm7 & ~xmm9 & xmm5) | (xmm7 & xmm9 & ~xmm5) | (xmm7 & xmm9 & xmm5) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm9 -; AVX512DQ-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm4 & ymm3) | (~ymm0 & ymm4 & ymm3) | (ymm0 & ymm4 & ~ymm3) | (ymm0 & ymm4 & ymm3) ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQ-NEXT: 
vpshufb %xmm5, %xmm3, %xmm4 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7] @@ -2195,7 +2195,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm4, %ymm3, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm3 & ymm4) | (~ymm5 & ymm3 & ymm4) | (ymm5 & ymm3 & ~ymm4) | (ymm5 & ymm3 & ymm4) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] @@ -2206,34 +2206,34 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %xmm7, %xmm9, %xmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm8 = (~xmm8 & xmm9 & xmm7) | (xmm8 & ~xmm9 & ~xmm7) | (xmm8 & ~xmm9 & xmm7) | (xmm8 & xmm9 & xmm7) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %xmm5, %xmm9, %xmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm6 = (~xmm6 & xmm9 & xmm5) | (xmm6 & ~xmm9 & ~xmm5) | (xmm6 & ~xmm9 
& xmm5) | (xmm6 & xmm9 & xmm5) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12] ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm4 & ymm3) | (~ymm7 & ymm4 & ymm3) | (ymm7 & ymm4 & ~ymm3) | (ymm7 & ymm4 & ymm3) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm10 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm11 = (~xmm11 & ~xmm9 & xmm5) | (xmm11 & ~xmm9 & xmm5) | (xmm11 & xmm9 & ~xmm5) | (xmm11 & xmm9 & xmm5) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm12, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm7 = (~xmm7 & ~xmm9 & xmm5) | (xmm7 & ~xmm9 & xmm5) | (xmm7 & xmm9 & ~xmm5) | (xmm7 & xmm9 & xmm5) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm9 -; 
AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm4 & ymm3) | (~ymm0 & ymm4 & ymm3) | (ymm0 & ymm4 & ~ymm3) | (ymm0 & ymm4 & ymm3) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm4 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7] @@ -3775,7 +3775,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512-NEXT: vmovdqa %ymm0, %ymm7 -; AVX512-NEXT: vpternlogq $202, %ymm3, %ymm17, %ymm7 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm17 & ymm3) | (~ymm7 & ymm17 & ymm3) | (ymm7 & ymm17 & ~ymm3) | (ymm7 & ymm17 & ymm3) ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] @@ -3784,13 +3784,13 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX512-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm1, %ymm10 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm1 & ymm5) | (~ymm10 & ymm1 & ymm5) | (ymm10 & ymm1 & ~ymm5) | (ymm10 & ymm1 & ymm5) ; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpternlogq $248, %ymm16, %ymm4, %ymm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ymm4 & 
ymm16) | (ymm11 & ~ymm4 & ~ymm16) | (ymm11 & ~ymm4 & ymm16) | (ymm11 & ymm4 & ~ymm16) | (ymm11 & ymm4 & ymm16) ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX512-NEXT: vmovdqa %ymm0, %ymm14 -; AVX512-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm14 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ~ymm13 & ymm6) | (~ymm14 & ymm13 & ymm6) | (ymm14 & ymm13 & ~ymm6) | (ymm14 & ymm13 & ymm6) ; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero @@ -3803,7 +3803,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq $248, %ymm16, %ymm7, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ymm7 & ymm16) | (ymm8 & ~ymm7 & ~ymm16) | (ymm8 & ~ymm7 & ymm16) | (ymm8 & ymm7 & ~ymm16) | (ymm8 & ymm7 & ymm16) ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero ; AVX512-NEXT: vpor %xmm7, %xmm10, %xmm7 @@ -3812,50 +3812,50 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-NEXT: vmovdqa %ymm8, %ymm10 -; AVX512-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm10 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm3 & ymm17) | 
(~ymm10 & ymm3 & ymm17) | (ymm10 & ymm3 & ~ymm17) | (ymm10 & ymm3 & ymm17) ; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX512-NEXT: vpternlogq $202, %ymm1, %ymm5, %ymm9 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm5 & ymm1) | (~ymm9 & ymm5 & ymm1) | (ymm9 & ymm5 & ~ymm1) | (ymm9 & ymm5 & ymm1) ; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpternlogq $248, %ymm16, %ymm12, %ymm14 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ymm12 & ymm16) | (ymm14 & ~ymm12 & ~ymm16) | (ymm14 & ~ymm12 & ymm16) | (ymm14 & ymm12 & ~ymm16) | (ymm14 & ymm12 & ymm16) ; AVX512-NEXT: vmovdqa %ymm0, %ymm12 -; AVX512-NEXT: vpternlogq $202, %ymm13, %ymm6, %ymm12 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ~ymm6 & ymm13) | (~ymm12 & ymm6 & ymm13) | (ymm12 & ymm6 & ~ymm13) | (ymm12 & ymm6 & ymm13) ; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero ; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] ; AVX512-NEXT: vpor %xmm4, %xmm15, %xmm4 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] -; AVX512-NEXT: vpternlogq $184, %ymm14, %ymm15, %ymm4 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ymm15 & ymm14) | (ymm4 & ~ymm15 & ~ymm14) | (ymm4 & ~ymm15 & ymm14) | (ymm4 & ymm15 & 
ymm14) ; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpternlogq $248, %ymm16, %ymm10, %ymm9 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ymm10 & ymm16) | (ymm9 & ~ymm10 & ~ymm16) | (ymm9 & ~ymm10 & ymm16) | (ymm9 & ymm10 & ~ymm16) | (ymm9 & ymm10 & ymm16) ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13] ; AVX512-NEXT: vpor %xmm2, %xmm10, %xmm2 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vpternlogq $184, %ymm9, %ymm15, %ymm2 -; AVX512-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm15 & ymm9) | (ymm2 & ~ymm15 & ~ymm9) | (ymm2 & ~ymm15 & ymm9) | (ymm2 & ymm15 & ymm9) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ~ymm13 & ymm6) | (~ymm8 & ymm13 & ymm6) | (ymm8 & ymm13 & ~ymm6) | (ymm8 & ymm13 & ymm6) ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm6 ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero ; AVX512-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm3 & ymm17) | (~ymm0 & ymm3 & ymm17) | (ymm0 & ymm3 & ~ymm17) | (ymm0 & ymm3 & ymm17) ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = 
zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm5 & mem) | (ymm1 & ~ymm5 & ~mem) | (ymm1 & ymm5 & ~mem) | (ymm1 & ymm5 & mem) ; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vpternlogq $226, %ymm9, %ymm15, %ymm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm15 & ymm9) | (ymm5 & ~ymm15 & ymm9) | (ymm5 & ymm15 & ~ymm9) | (ymm5 & ymm15 & ymm9) ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15] ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero ; AVX512-NEXT: vpor %xmm6, %xmm8, %xmm6 @@ -3866,7 +3866,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpternlogq $226, %ymm6, %ymm15, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm15 & ymm6) | (ymm0 & ~ymm15 & ymm6) | (ymm0 & ymm15 & ~ymm6) | (ymm0 & ymm15 & ymm6) ; AVX512-NEXT: vmovdqa64 %ymm18, (%rsi) ; AVX512-NEXT: vmovdqa %ymm7, (%rdx) ; AVX512-NEXT: vmovdqa %ymm4, (%rcx) @@ -3885,7 +3885,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512-FCP-NEXT: 
vmovdqa %ymm0, %ymm7 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm3, %ymm17, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm17 & ymm3) | (~ymm7 & ymm17 & ymm3) | (ymm7 & ymm17 & ~ymm3) | (ymm7 & ymm17 & ymm3) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] @@ -3894,13 +3894,13 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm1, %ymm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm1 & ymm5) | (~ymm10 & ymm1 & ymm5) | (ymm10 & ymm1 & ~ymm5) | (ymm10 & ymm1 & ymm5) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq $248, %ymm16, %ymm4, %ymm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ymm4 & ymm16) | (ymm11 & ~ymm4 & ~ymm16) | (ymm11 & ~ymm4 & ymm16) | (ymm11 & ymm4 & ~ymm16) | (ymm11 & ymm4 & ymm16) ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm14 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ~ymm13 & ymm6) | (~ymm14 & ymm13 & ymm6) | (ymm14 & ymm13 & ~ymm6) | (ymm14 & ymm13 & ymm6) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero @@ -3913,7 +3913,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq $248, %ymm16, %ymm7, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ymm7 & ymm16) | (ymm8 & ~ymm7 & ~ymm16) | (ymm8 & ~ymm7 & ymm16) | (ymm8 & ymm7 & ~ymm16) | (ymm8 & ymm7 & ymm16) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero ; AVX512-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 @@ -3922,50 +3922,50 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm10 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm3 & ymm17) | (~ymm10 & ymm3 & ymm17) | (ymm10 & ymm3 & ~ymm17) | (ymm10 & ymm3 & ymm17) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX512-FCP-NEXT: 
vpternlogq $202, %ymm1, %ymm5, %ymm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm5 & ymm1) | (~ymm9 & ymm5 & ymm1) | (ymm9 & ymm5 & ~ymm1) | (ymm9 & ymm5 & ymm1) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq $248, %ymm16, %ymm12, %ymm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ymm12 & ymm16) | (ymm14 & ~ymm12 & ~ymm16) | (ymm14 & ~ymm12 & ymm16) | (ymm14 & ymm12 & ~ymm16) | (ymm14 & ymm12 & ymm16) ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm12 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm13, %ymm6, %ymm12 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ~ymm6 & ymm13) | (~ymm12 & ymm6 & ymm13) | (ymm12 & ymm6 & ~ymm13) | (ymm12 & ymm6 & ymm13) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero ; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] ; AVX512-FCP-NEXT: vpor %xmm4, %xmm15, %xmm4 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, %ymm14, %ymm15, %ymm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ymm15 & ymm14) | (ymm4 & ~ymm15 & ~ymm14) | (ymm4 & ~ymm15 & ymm14) | (ymm4 & ymm15 & ymm14) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpternlogq $248, %ymm16, %ymm10, %ymm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ymm10 & ymm16) | (ymm9 & ~ymm10 & ~ymm16) | (ymm9 & ~ymm10 & ymm16) | (ymm9 & ymm10 & ~ymm16) | (ymm9 & ymm10 & ymm16) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13] ; AVX512-FCP-NEXT: vpor %xmm2, %xmm10, %xmm2 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpternlogq $184, %ymm9, %ymm15, %ymm2 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm15 & ymm9) | (ymm2 & ~ymm15 & ~ymm9) | (ymm2 & ~ymm15 & ymm9) | (ymm2 & ymm15 & ymm9) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ~ymm13 & ymm6) | (~ymm8 & ymm13 & ymm6) | (ymm8 & ymm13 & ~ymm6) | (ymm8 & ymm13 & ymm6) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero ; AVX512-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm3 & ymm17) | (~ymm0 & ymm3 & ymm17) | (ymm0 & ymm3 & ~ymm17) | (ymm0 & ymm3 & ymm17) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512-FCP-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm5 & mem) | (ymm1 & ~ymm5 & ~mem) | (ymm1 & ymm5 & ~mem) | (ymm1 & ymm5 & mem) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq $226, %ymm9, %ymm15, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm15 & ymm9) | (ymm5 & ~ymm15 & ymm9) | (ymm5 & ymm15 & ~ymm9) | (ymm5 & ymm15 & ymm9) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero ; AVX512-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 @@ -3976,7 +3976,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq $226, %ymm6, %ymm15, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm15 & ymm6) | (ymm0 & ~ymm15 & ymm6) | (ymm0 & ymm15 & ~ymm6) | (ymm0 & ymm15 & ymm6) ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, (%rsi) ; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rdx) ; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rcx) @@ -3995,7 +3995,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm7 -; AVX512DQ-NEXT: vpternlogq $202, %ymm3, %ymm17, %ymm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm17 & ymm3) | (~ymm7 & ymm17 & ymm3) | (ymm7 & ymm17 
& ~ymm3) | (ymm7 & ymm17 & ymm3) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] @@ -4004,13 +4004,13 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm1, %ymm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm1 & ymm5) | (~ymm10 & ymm1 & ymm5) | (ymm10 & ymm1 & ~ymm5) | (ymm10 & ymm1 & ymm5) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq $248, %ymm16, %ymm4, %ymm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ymm4 & ymm16) | (ymm11 & ~ymm4 & ~ymm16) | (ymm11 & ~ymm4 & ymm16) | (ymm11 & ymm4 & ~ymm16) | (ymm11 & ymm4 & ymm16) ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm14 -; AVX512DQ-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ~ymm13 & ymm6) | (~ymm14 & ymm13 & ymm6) | (ymm14 & ymm13 & ~ymm6) | (ymm14 & ymm13 & ymm6) ; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero @@ -4023,7 +4023,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, 
ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq $248, %ymm16, %ymm7, %ymm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ymm7 & ymm16) | (ymm8 & ~ymm7 & ~ymm16) | (ymm8 & ~ymm7 & ymm16) | (ymm8 & ymm7 & ~ymm16) | (ymm8 & ymm7 & ymm16) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero ; AVX512DQ-NEXT: vpor %xmm7, %xmm10, %xmm7 @@ -4032,50 +4032,50 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm10 -; AVX512DQ-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm3 & ymm17) | (~ymm10 & ymm3 & ymm17) | (ymm10 & ymm3 & ~ymm17) | (ymm10 & ymm3 & ymm17) ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX512DQ-NEXT: vpternlogq $202, %ymm1, %ymm5, %ymm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm5 & ymm1) | (~ymm9 & ymm5 & ymm1) | (ymm9 & ymm5 & ~ymm1) | (ymm9 & ymm5 & ymm1) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq $248, %ymm16, %ymm12, %ymm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ymm12 & ymm16) | (ymm14 & ~ymm12 & ~ymm16) | (ymm14 & ~ymm12 & ymm16) | (ymm14 & ymm12 & ~ymm16) | (ymm14 & ymm12 & ymm16) ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm12 -; AVX512DQ-NEXT: vpternlogq $202, %ymm13, %ymm6, %ymm12 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ~ymm6 & ymm13) | (~ymm12 & ymm6 & ymm13) | (ymm12 & ymm6 & ~ymm13) | (ymm12 & ymm6 & ymm13) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero ; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] ; AVX512DQ-NEXT: vpor %xmm4, %xmm15, %xmm4 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] -; AVX512DQ-NEXT: vpternlogq $184, %ymm14, %ymm15, %ymm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ymm15 & ymm14) | (ymm4 & ~ymm15 & ~ymm14) | (ymm4 & ~ymm15 & ymm14) | (ymm4 & ymm15 & ymm14) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpternlogq $248, %ymm16, %ymm10, %ymm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ymm10 & ymm16) | (ymm9 & ~ymm10 & ~ymm16) | (ymm9 & 
~ymm10 & ymm16) | (ymm9 & ymm10 & ~ymm16) | (ymm9 & ymm10 & ymm16) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13] ; AVX512DQ-NEXT: vpor %xmm2, %xmm10, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpternlogq $184, %ymm9, %ymm15, %ymm2 -; AVX512DQ-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm15 & ymm9) | (ymm2 & ~ymm15 & ~ymm9) | (ymm2 & ~ymm15 & ymm9) | (ymm2 & ymm15 & ymm9) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ~ymm13 & ymm6) | (~ymm8 & ymm13 & ymm6) | (ymm8 & ymm13 & ~ymm6) | (ymm8 & ymm13 & ymm6) ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm6 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm3 & ymm17) | (~ymm0 & ymm3 & ymm17) | (ymm0 & ymm3 & ~ymm17) | (ymm0 & ymm3 & ymm17) ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm5 & mem) | (ymm1 & ~ymm5 & ~mem) | (ymm1 & ymm5 & ~mem) | (ymm1 & ymm5 & mem) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw 
{{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %ymm9, %ymm15, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm15 & ymm9) | (ymm5 & ~ymm15 & ymm9) | (ymm5 & ymm15 & ~ymm9) | (ymm5 & ymm15 & ymm9) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm6, %xmm8, %xmm6 @@ -4086,7 +4086,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %ymm6, %ymm15, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm15 & ymm6) | (ymm0 & ~ymm15 & ymm6) | (ymm0 & ymm15 & ~ymm6) | (ymm0 & ymm15 & ymm6) ; AVX512DQ-NEXT: vmovdqa64 %ymm18, (%rsi) ; AVX512DQ-NEXT: vmovdqa %ymm7, (%rdx) ; AVX512DQ-NEXT: vmovdqa %ymm4, (%rcx) @@ -4105,7 +4105,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm3, %ymm17, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm17 & ymm3) | (~ymm7 & ymm17 & ymm3) | (ymm7 & ymm17 & ~ymm3) | (ymm7 & ymm17 & ymm3) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] @@ -4114,13 +4114,13 @@ define 
void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm1, %ymm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm1 & ymm5) | (~ymm10 & ymm1 & ymm5) | (ymm10 & ymm1 & ~ymm5) | (ymm10 & ymm1 & ymm5) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm16, %ymm4, %ymm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ymm4 & ymm16) | (ymm11 & ~ymm4 & ~ymm16) | (ymm11 & ~ymm4 & ymm16) | (ymm11 & ymm4 & ~ymm16) | (ymm11 & ymm4 & ymm16) ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ~ymm13 & ymm6) | (~ymm14 & ymm13 & ymm6) | (ymm14 & ymm13 & ~ymm6) | (ymm14 & ymm13 & ymm6) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero @@ -4133,7 +4133,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm16, %ymm7, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ymm7 & ymm16) | (ymm8 & ~ymm7 & ~ymm16) | (ymm8 & ~ymm7 & ymm16) | (ymm8 & ymm7 & ~ymm16) | (ymm8 & ymm7 & ymm16) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 @@ -4142,50 +4142,50 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm10 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm3 & ymm17) | (~ymm10 & ymm3 & ymm17) | (ymm10 & ymm3 & ~ymm17) | (ymm10 & ymm3 & ymm17) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm1, %ymm5, %ymm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm5 & ymm1) | (~ymm9 & ymm5 & ymm1) | (ymm9 & ymm5 & ~ymm1) | (ymm9 & ymm5 & ymm1) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = 
[255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm16, %ymm12, %ymm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ymm12 & ymm16) | (ymm14 & ~ymm12 & ~ymm16) | (ymm14 & ~ymm12 & ymm16) | (ymm14 & ymm12 & ~ymm16) | (ymm14 & ymm12 & ymm16) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm12 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm13, %ymm6, %ymm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ~ymm6 & ymm13) | (~ymm12 & ymm6 & ymm13) | (ymm12 & ymm6 & ~ymm13) | (ymm12 & ymm6 & ymm13) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] ; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm15, %xmm4 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm14, %ymm15, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ymm15 & ymm14) | (ymm4 & ~ymm15 & ~ymm14) | (ymm4 & ~ymm15 & ymm14) | (ymm4 & ymm15 & ymm14) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm16, %ymm10, %ymm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ymm10 & ymm16) | (ymm9 & ~ymm10 & ~ymm16) | (ymm9 & ~ymm10 & ymm16) | (ymm9 & ymm10 & ~ymm16) | (ymm9 & ymm10 & ymm16) ; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13] ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm10, %xmm2 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm9, %ymm15, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm15 & ymm9) | (ymm2 & ~ymm15 & ~ymm9) | (ymm2 & ~ymm15 & ymm9) | (ymm2 & ymm15 & ymm9) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ~ymm13 & ymm6) | (~ymm8 & ymm13 & ymm6) | (ymm8 & ymm13 & ~ymm6) | (ymm8 & ymm13 & ymm6) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm3 & ymm17) | (~ymm0 & ymm3 & ymm17) | (ymm0 & ymm3 & ~ymm17) | (ymm0 & ymm3 & ymm17) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm5 & mem) | (ymm1 & ~ymm5 & ~mem) | (ymm1 & ymm5 & ~mem) | (ymm1 & ymm5 & mem) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] ; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm9, %ymm15, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm15 & ymm9) | (ymm5 & ~ymm15 & ymm9) | (ymm5 & ymm15 & ~ymm9) | (ymm5 & ymm15 & ymm9) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 @@ -4196,7 +4196,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm6, %ymm15, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm15 & ymm6) | (ymm0 & ~ymm15 & ymm6) | (ymm0 & ymm15 & ~ymm6) | (ymm0 & ymm15 & ymm6) ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rcx) @@ -7385,7 +7385,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm25 ; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm26 ; AVX512-NEXT: vmovdqa %ymm12, %ymm0 -; AVX512-NEXT: vpternlogq $202, %ymm25, %ymm26, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm26 & ymm25) | (~ymm0 & ymm26 & ymm25) | (ymm0 & ymm26 & ~ymm25) | (ymm0 & ymm26 & ymm25) ; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4 @@ -7396,7 +7396,7 @@ define void @load_i8_stride6_vf64(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm24 ; AVX512-NEXT: vmovdqa64 160(%rdi), %ymm18 ; AVX512-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512-NEXT: vpternlogq $202, %ymm24, %ymm18, %ymm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm18 & ymm24) | (~ymm6 & ymm18 & ymm24) | (ymm6 & ymm18 & ~ymm24) | (ymm6 & ymm18 & ymm24) ; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] ; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm10 @@ -7407,7 +7407,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa %ymm12, %ymm9 -; AVX512-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm9 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm30 & ymm31) | (~ymm9 & ymm30 & ymm31) | (ymm9 & ymm30 & ~ymm31) | (ymm9 & ymm30 & ymm31) ; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm1 ; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm13 ; AVX512-NEXT: vpshufb %xmm5, %xmm13, %xmm5 @@ -7415,7 +7415,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm29 ; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm22 ; AVX512-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512-NEXT: vpternlogq $202, %ymm29, %ymm22, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm22 & ymm29) | (~ymm1 & ymm22 & ymm29) | (ymm1 & ymm22 & ~ymm29) | (ymm1 & ymm22 & ymm29) ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm8 @@ -7441,7 +7441,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = 
[65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512-NEXT: vpternlogq $202, %ymm26, %ymm25, %ymm4 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ~ymm25 & ymm26) | (~ymm4 & ymm25 & ymm26) | (ymm4 & ymm25 & ~ymm26) | (ymm4 & ymm25 & ymm26) ; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm15 ; AVX512-NEXT: vpshufb %xmm0, %xmm15, %xmm1 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] @@ -7449,7 +7449,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpor %xmm1, %xmm5, %xmm1 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovdqa %ymm12, %ymm5 -; AVX512-NEXT: vpternlogq $202, %ymm18, %ymm24, %ymm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm24 & ymm18) | (~ymm5 & ymm24 & ymm18) | (ymm5 & ymm24 & ~ymm18) | (ymm5 & ymm24 & ymm18) ; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] ; AVX512-NEXT: vpshufb %xmm8, %xmm5, %xmm7 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm1 @@ -7458,13 +7458,13 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpor %xmm7, %xmm13, %xmm2 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovdqa %ymm9, %ymm13 -; AVX512-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm13 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = (~ymm13 & ~ymm31 & ymm30) | (~ymm13 & ymm31 & ymm30) | (ymm13 & ymm31 & ~ymm30) | (ymm13 & ymm31 & ymm30) ; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX512-NEXT: vpshufb %xmm6, %xmm13, %xmm6 ; AVX512-NEXT: vporq %xmm0, %xmm6, %xmm16 ; AVX512-NEXT: vmovdqa %ymm12, %ymm11 -; AVX512-NEXT: vpternlogq $202, %ymm22, %ymm29, %ymm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm29 & ymm22) | (~ymm11 & ymm29 & ymm22) | (ymm11 & ymm29 & ~ymm22) | (ymm11 & ymm29 & ymm22) ; 
AVX512-NEXT: vpshufb %xmm8, %xmm11, %xmm8 ; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm7 ; AVX512-NEXT: vpshufb %xmm10, %xmm7, %xmm10 @@ -7485,7 +7485,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX512-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm20 & ymm19) | (~ymm1 & ymm20 & ymm19) | (ymm1 & ymm20 & ~ymm19) | (ymm1 & ymm20 & ymm19) ; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] @@ -7494,25 +7494,26 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23 ; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 ; AVX512-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512-NEXT: vpternlogq $202, %ymm6, %ymm23, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm23 & ymm6) | (~ymm2 & ymm23 & ymm6) | (ymm2 & ymm23 & ~ymm6) | (ymm2 & ymm23 & ymm6) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq $248, %ymm4, %ymm17, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm17 & ymm4) | (ymm0 & ~ymm17 & ~ymm4) | (ymm0 & ~ymm17 & ymm4) | (ymm0 & ymm17 & ~ymm4) | (ymm0 & ymm17 & ymm4) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: # zmm0 = (~zmm0 & ~zmm17 & mem) | (zmm0 & ~zmm17 & mem) | (zmm0 & zmm17 & ~mem) | (zmm0 & zmm17 & mem) ; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm15 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & zmm3 & zmm0) | (zmm15 & ~zmm3 & ~zmm0) | (zmm15 & ~zmm3 & zmm0) | (zmm15 & zmm3 & zmm0) ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq $248, %ymm4, %ymm21, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm21 & ymm4) | (ymm1 & ~ymm21 & ~ymm4) | (ymm1 & ~ymm21 & ymm4) | (ymm1 & ymm21 & ~ymm4) | (ymm1 & ymm21 & ymm4) ; AVX512-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 ; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512-NEXT: vpternlogq $226, %zmm2, %zmm17, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm17 & zmm2) | (zmm1 & ~zmm17 & zmm2) | (zmm1 & zmm17 & ~zmm2) | (zmm1 & zmm17 & zmm2) ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 -; AVX512-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm17 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm3 & zmm1) | (zmm17 & ~zmm3 & 
~zmm1) | (zmm17 & ~zmm3 & zmm1) | (zmm17 & zmm3 & zmm1) ; AVX512-NEXT: vpshufb %xmm10, %xmm14, %xmm0 ; AVX512-NEXT: vpshufb %xmm8, %xmm13, %xmm1 ; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm21 @@ -7520,7 +7521,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] ; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm28 ; AVX512-NEXT: vmovdqa64 %ymm25, %ymm11 -; AVX512-NEXT: vpternlogq $226, %ymm26, %ymm12, %ymm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm12 & ymm26) | (ymm11 & ~ymm12 & ymm26) | (ymm11 & ymm12 & ~ymm26) | (ymm11 & ymm12 & ymm26) ; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] @@ -7528,17 +7529,17 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512-NEXT: vporq %xmm1, %xmm2, %xmm26 ; AVX512-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm14 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ~ymm9 & ymm24) | (ymm14 & ~ymm9 & ymm24) | (ymm14 & ymm9 & ~ymm24) | (ymm14 & ymm9 & ymm24) ; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm10 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] ; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm2 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] ; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX512-NEXT: vporq %xmm2, %xmm4, %xmm27 -; AVX512-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm12 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ~ymm31 & ymm30) | (~ymm12 & ymm31 & ymm30) | (ymm12 & ymm31 & ~ymm30) | (ymm12 & ymm31 & ymm30) ; AVX512-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512-NEXT: vpternlogq $202, %ymm23, %ymm6, %ymm4 -; 
AVX512-NEXT: vpternlogq $202, %ymm29, %ymm22, %ymm9 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ~ymm6 & ymm23) | (~ymm4 & ymm6 & ymm23) | (ymm4 & ymm6 & ~ymm23) | (ymm4 & ymm6 & ymm23) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm22 & ymm29) | (~ymm9 & ymm22 & ymm29) | (ymm9 & ymm22 & ~ymm29) | (ymm9 & ymm22 & ymm29) ; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm8 ; AVX512-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm2 @@ -7557,14 +7558,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm11 ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpternlogq $236, %ymm18, %ymm11, %ymm16 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm16 = (~ymm16 & ymm11 & ~ymm18) | (~ymm16 & ymm11 & ymm18) | (ymm16 & ~ymm11 & ymm18) | (ymm16 & ymm11 & ~ymm18) | (ymm16 & ymm11 & ymm18) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512-NEXT: vpternlogq $236, %ymm18, %ymm4, %ymm21 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm21 = (~ymm21 & ymm4 & ~ymm18) | (~ymm21 & ymm4 & ymm18) | (ymm21 & ~ymm4 & ymm18) | (ymm21 & ymm4 & ~ymm18) | (ymm21 & ymm4 & ymm18) ; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm19 & ymm20) | (~ymm5 & ymm19 & ymm20) | (ymm5 & ymm19 & ~ymm20) | (ymm5 & ymm19 & ymm20) ; AVX512-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512-NEXT: vpternlogq $248, %ymm18, %ymm4, %ymm1 +; AVX512-NEXT: 
vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm4 & ymm18) | (ymm1 & ~ymm4 & ~ymm18) | (ymm1 & ~ymm4 & ymm18) | (ymm1 & ymm4 & ~ymm18) | (ymm1 & ymm4 & ymm18) ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 ; AVX512-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm1 @@ -7576,24 +7577,24 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq $226, %ymm23, %ymm2, %ymm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm2 & ymm23) | (ymm6 & ~ymm2 & ymm23) | (ymm6 & ymm2 & ~ymm23) | (ymm6 & ymm2 & ymm23) ; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9 -; AVX512-NEXT: vpternlogq $248, %ymm18, %ymm9, %ymm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ymm9 & ymm18) | (ymm5 & ~ymm9 & ~ymm18) | (ymm5 & ~ymm9 & ymm18) | (ymm5 & ymm9 & ~ymm18) | (ymm5 & ymm9 & ymm18) ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm19 & ymm20) | (~ymm2 & ymm19 & ymm20) | (ymm2 & ymm19 & ~ymm20) | (ymm2 & ymm19 & ymm20) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = 
ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq $242, %ymm7, %ymm9, %ymm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm9 & ymm7) | (ymm11 & ~ymm9 & ~ymm7) | (ymm11 & ~ymm9 & ymm7) | (ymm11 & ymm9 & ~ymm7) | (ymm11 & ymm9 & ymm7) ; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm7 ; AVX512-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 ; AVX512-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 -; AVX512-NEXT: vpternlogq $226, %zmm11, %zmm9, %zmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm9 & zmm11) | (zmm8 & ~zmm9 & zmm11) | (zmm8 & zmm9 & ~zmm11) | (zmm8 & zmm9 & zmm11) ; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512-NEXT: vpternlogq $184, %zmm8, %zmm11, %zmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm11 & zmm8) | (zmm7 & ~zmm11 & ~zmm8) | (zmm7 & ~zmm11 & zmm8) | (zmm7 & zmm11 & zmm8) ; AVX512-NEXT: vmovdqa64 %xmm22, %xmm8 ; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX512-NEXT: vpshufb %xmm13, %xmm12, %xmm8 @@ -7603,22 +7604,22 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpternlogq $242, %ymm0, %ymm9, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm9 & ymm0) | (ymm2 & ~ymm9 & ~ymm0) | (ymm2 & ~ymm9 & ymm0) | (ymm2 & ymm9 & ~ymm0) | (ymm2 & ymm9 & ymm0) ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm2 ; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512-NEXT: vpternlogq $226, %zmm2, %zmm9, %zmm1 -; AVX512-NEXT: vpternlogq $184, 
%zmm1, %zmm11, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm9 & zmm2) | (zmm1 & ~zmm9 & zmm2) | (zmm1 & zmm9 & ~zmm2) | (zmm1 & zmm9 & zmm2) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm11 & zmm1) | (zmm0 & ~zmm11 & ~zmm1) | (zmm0 & ~zmm11 & zmm1) | (zmm0 & zmm11 & zmm1) ; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512-NEXT: vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] -; AVX512-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm16 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm2 & zmm1) | (zmm16 & ~zmm2 & ~zmm1) | (zmm16 & ~zmm2 & zmm1) | (zmm16 & zmm2 & zmm1) ; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm21 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & zmm2 & zmm1) | (zmm21 & ~zmm2 & ~zmm1) | (zmm21 & ~zmm2 & zmm1) | (zmm21 & zmm2 & zmm1) ; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512-NEXT: vpternlogq $184, %zmm16, %zmm1, %zmm4 -; AVX512-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm1 & zmm16) | (zmm4 & ~zmm1 & ~zmm16) | (zmm4 & ~zmm1 & zmm16) | (zmm4 & zmm1 & zmm16) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm1 & zmm21) | (zmm5 & ~zmm1 & ~zmm21) | (zmm5 & ~zmm1 & zmm21) | (zmm5 & zmm1 & zmm21) ; AVX512-NEXT: vmovdqa64 %zmm15, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm17, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm4, (%rcx) @@ -7638,7 +7639,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm25 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm0 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm25, %ymm26, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm26 & ymm25) | (~ymm0 & ymm26 & ymm25) | (ymm0 & ymm26 & ~ymm25) | (ymm0 & ymm26 & ymm25) ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 @@ -7649,7 +7650,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24 ; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm24, %ymm18, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm18 & ymm24) | (~ymm6 & ymm18 & ymm24) | (ymm6 & ymm18 & ~ymm24) | (ymm6 & ymm18 & ymm24) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm10 @@ -7660,7 +7661,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm9 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm30 & ymm31) | (~ymm9 & ymm30 & ymm31) | (ymm9 & ymm30 & ~ymm31) | (ymm9 & ymm30 & ymm31) ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm13 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm13, %xmm5 @@ -7668,7 +7669,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm29 ; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), 
%ymm22 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm29, %ymm22, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm22 & ymm29) | (~ymm1 & ymm22 & ymm29) | (ymm1 & ymm22 & ~ymm29) | (ymm1 & ymm22 & ymm29) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 @@ -7694,7 +7695,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm26, %ymm25, %ymm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ~ymm25 & ymm26) | (~ymm4 & ymm25 & ymm26) | (ymm4 & ymm25 & ~ymm26) | (ymm4 & ymm25 & ymm26) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm15 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm1 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] @@ -7702,7 +7703,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm5 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm18, %ymm24, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm24 & ymm18) | (~ymm5 & ymm24 & ymm18) | (ymm5 & ymm24 & ~ymm18) | (ymm5 & ymm24 & ymm18) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm7 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 @@ -7711,13 +7712,13 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm2 ; AVX512-FCP-NEXT: vmovdqa %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm13 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm13 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = (~ymm13 & ~ymm31 & ymm30) | (~ymm13 & ymm31 & ymm30) | (ymm13 & ymm31 & ~ymm30) | (ymm13 & ymm31 & ymm30) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm6 ; AVX512-FCP-NEXT: vporq %xmm0, %xmm6, %xmm16 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm11 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm22, %ymm29, %ymm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm29 & ymm22) | (~ymm11 & ymm29 & ymm22) | (ymm11 & ymm29 & ~ymm22) | (ymm11 & ymm29 & ymm22) ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm8 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm7 ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm10 @@ -7738,7 +7739,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm20 & ymm19) | (~ymm1 & ymm20 & ymm19) | (ymm1 & ymm20 & ~ymm19) | (ymm1 & ymm20 & ymm19) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] @@ -7747,25 +7748,26 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; 
AVX512-FCP-NEXT: vpternlogq $202, %ymm6, %ymm23, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm23 & ymm6) | (~ymm2 & ymm23 & ymm6) | (ymm2 & ymm23 & ~ymm6) | (ymm2 & ymm23 & ymm6) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq $248, %ymm4, %ymm17, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm17 & ymm4) | (ymm0 & ~ymm17 & ~ymm4) | (ymm0 & ~ymm17 & ymm4) | (ymm0 & ymm17 & ~ymm4) | (ymm0 & ymm17 & ymm4) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = (~zmm0 & ~zmm17 & mem) | (zmm0 & ~zmm17 & mem) | (zmm0 & zmm17 & ~mem) | (zmm0 & zmm17 & mem) ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & zmm3 & zmm0) | (zmm15 & ~zmm3 & ~zmm0) | (zmm15 & ~zmm3 & zmm0) | (zmm15 & zmm3 & zmm0) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq $248, %ymm4, %ymm21, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm21 & ymm4) | (ymm1 & ~ymm21 & ~ymm4) | (ymm1 & ~ymm21 & ymm4) | (ymm1 & ymm21 & ~ymm4) | (ymm1 & ymm21 & ymm4) ; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $226, %zmm2, %zmm17, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm17 & zmm2) | (zmm1 & ~zmm17 & zmm2) | (zmm1 & zmm17 & ~zmm2) | (zmm1 & zmm17 & zmm2) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm17 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm3 & zmm1) | (zmm17 & ~zmm3 & ~zmm1) | (zmm17 & ~zmm3 & zmm1) | (zmm17 & zmm3 & zmm1) ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm0 ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm1 ; AVX512-FCP-NEXT: vporq %xmm0, %xmm1, %xmm21 @@ -7773,7 +7775,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] ; AVX512-FCP-NEXT: vporq %xmm0, %xmm1, %xmm28 ; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm11 -; AVX512-FCP-NEXT: vpternlogq $226, %ymm26, %ymm12, %ymm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm12 & ymm26) | (ymm11 & ~ymm12 & ymm26) | (ymm11 & ymm12 & ~ymm26) | (ymm11 & ymm12 & ymm26) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] @@ -7781,17 +7783,17 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; 
AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512-FCP-NEXT: vporq %xmm1, %xmm2, %xmm26 ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512-FCP-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ~ymm9 & ymm24) | (ymm14 & ~ymm9 & ymm24) | (ymm14 & ymm9 & ~ymm24) | (ymm14 & ymm9 & ymm24) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm10 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm2 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX512-FCP-NEXT: vporq %xmm2, %xmm4, %xmm27 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm12 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ~ymm31 & ymm30) | (~ymm12 & ymm31 & ymm30) | (ymm12 & ymm31 & ~ymm30) | (ymm12 & ymm31 & ymm30) ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm23, %ymm6, %ymm4 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm29, %ymm22, %ymm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ~ymm6 & ymm23) | (~ymm4 & ymm6 & ymm23) | (ymm4 & ymm6 & ~ymm23) | (ymm4 & ymm6 & ymm23) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm22 & ymm29) | (~ymm9 & ymm22 & ymm29) | (ymm9 & ymm22 & ~ymm29) | (ymm9 & ymm22 & ymm29) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm8 ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2 @@ -7810,14 +7812,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm11 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; 
AVX512-FCP-NEXT: vpternlogq $236, %ymm18, %ymm11, %ymm16 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = (~ymm16 & ymm11 & ~ymm18) | (~ymm16 & ymm11 & ymm18) | (ymm16 & ~ymm11 & ymm18) | (ymm16 & ymm11 & ~ymm18) | (ymm16 & ymm11 & ymm18) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpternlogq $236, %ymm18, %ymm4, %ymm21 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm21 = (~ymm21 & ymm4 & ~ymm18) | (~ymm21 & ymm4 & ymm18) | (ymm21 & ~ymm4 & ymm18) | (ymm21 & ymm4 & ~ymm18) | (ymm21 & ymm4 & ymm18) ; AVX512-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm19 & ymm20) | (~ymm5 & ymm19 & ymm20) | (ymm5 & ymm19 & ~ymm20) | (ymm5 & ymm19 & ymm20) ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512-FCP-NEXT: vpternlogq $248, %ymm18, %ymm4, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm4 & ymm18) | (ymm1 & ~ymm4 & ~ymm18) | (ymm1 & ~ymm4 & ymm18) | (ymm1 & ymm4 & ~ymm18) | (ymm1 & ymm4 & ymm18) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm1 @@ -7829,24 +7831,24 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %ymm23, %ymm2, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm2 & ymm23) | (ymm6 & ~ymm2 & ymm23) | (ymm6 & ymm2 & ~ymm23) | (ymm6 & ymm2 & ymm23) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9 -; AVX512-FCP-NEXT: vpternlogq $248, %ymm18, %ymm9, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ymm9 & ymm18) | (ymm5 & ~ymm9 & ~ymm18) | (ymm5 & ~ymm9 & ymm18) | (ymm5 & ymm9 & ~ymm18) | (ymm5 & ymm9 & ymm18) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm19 & ymm20) | (~ymm2 & ymm19 & ymm20) | (ymm2 & ymm19 & ~ymm20) | (ymm2 & ymm19 & ymm20) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq $242, %ymm7, %ymm9, %ymm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm9 & ymm7) | (ymm11 & ~ymm9 & ~ymm7) | (ymm11 & ~ymm9 & ymm7) | (ymm11 & ymm9 & ~ymm7) | (ymm11 & ymm9 & ymm7) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm7 ; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm11, %zmm9, %zmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm9 & zmm11) | (zmm8 & ~zmm9 & zmm11) | (zmm8 & zmm9 & ~zmm11) | (zmm8 & zmm9 & zmm11) ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm11 = 
[18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm8, %zmm11, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm11 & zmm8) | (zmm7 & ~zmm11 & ~zmm8) | (zmm7 & ~zmm11 & zmm8) | (zmm7 & zmm11 & zmm8) ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm8 ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm8 @@ -7856,22 +7858,22 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq $242, %ymm0, %ymm9, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm9 & ymm0) | (ymm2 & ~ymm9 & ~ymm0) | (ymm2 & ~ymm9 & ymm0) | (ymm2 & ymm9 & ~ymm0) | (ymm2 & ymm9 & ymm0) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm2 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm2, %zmm9, %zmm1 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm9 & zmm2) | (zmm1 & ~zmm9 & zmm2) | (zmm1 & zmm9 & ~zmm2) | (zmm1 & zmm9 & zmm2) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm11 & zmm1) | (zmm0 & ~zmm11 & ~zmm1) | (zmm0 & ~zmm11 & zmm1) | (zmm0 & zmm11 & zmm1) ; AVX512-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} zmm2 = 
[0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm16 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm2 & zmm1) | (zmm16 & ~zmm2 & ~zmm1) | (zmm16 & ~zmm2 & zmm1) | (zmm16 & zmm2 & zmm1) ; AVX512-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm21 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & zmm2 & zmm1) | (zmm21 & ~zmm2 & ~zmm1) | (zmm21 & ~zmm2 & zmm1) | (zmm21 & zmm2 & zmm1) ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm16, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm1 & zmm16) | (zmm4 & ~zmm1 & ~zmm16) | (zmm4 & ~zmm1 & zmm16) | (zmm4 & zmm1 & zmm16) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm1 & zmm21) | (zmm5 & ~zmm1 & ~zmm21) | (zmm5 & ~zmm1 & zmm21) | (zmm5 & zmm1 & zmm21) ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rcx) @@ -7891,7 +7893,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm25 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm26 ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm0 -; AVX512DQ-NEXT: vpternlogq $202, %ymm25, %ymm26, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm26 & ymm25) | (~ymm0 & ymm26 & ymm25) | (ymm0 & ymm26 & ~ymm25) | (ymm0 & ymm26 & ymm25) ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = 
[128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm4 @@ -7902,7 +7904,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm24 ; AVX512DQ-NEXT: vmovdqa64 160(%rdi), %ymm18 ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512DQ-NEXT: vpternlogq $202, %ymm24, %ymm18, %ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm18 & ymm24) | (~ymm6 & ymm18 & ymm24) | (ymm6 & ymm18 & ~ymm24) | (ymm6 & ymm18 & ymm24) ; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm7, %xmm10 @@ -7913,7 +7915,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm9 -; AVX512DQ-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm30 & ymm31) | (~ymm9 & ymm30 & ymm31) | (ymm9 & ymm30 & ~ymm31) | (ymm9 & ymm30 & ymm31) ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm1 ; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm13 ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm13, %xmm5 @@ -7921,7 +7923,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm29 ; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm22 ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512DQ-NEXT: vpternlogq $202, %ymm29, %ymm22, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm22 & ymm29) | (~ymm1 & ymm22 & ymm29) | (ymm1 & ymm22 & ~ymm29) | (ymm1 & ymm22 & ymm29) ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm8 @@ -7947,7 +7949,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr 
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512DQ-NEXT: vpternlogq $202, %ymm26, %ymm25, %ymm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ~ymm25 & ymm26) | (~ymm4 & ymm25 & ymm26) | (ymm4 & ymm25 & ~ymm26) | (ymm4 & ymm25 & ymm26) ; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm15 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm15, %xmm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] @@ -7955,7 +7957,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpor %xmm1, %xmm5, %xmm1 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm5 -; AVX512DQ-NEXT: vpternlogq $202, %ymm18, %ymm24, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm24 & ymm18) | (~ymm5 & ymm24 & ymm18) | (ymm5 & ymm24 & ~ymm18) | (ymm5 & ymm24 & ymm18) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm5, %xmm7 ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm1 @@ -7964,13 +7966,13 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpor %xmm7, %xmm13, %xmm2 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm13 -; AVX512DQ-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm13 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = (~ymm13 & ~ymm31 & ymm30) | (~ymm13 & ymm31 & ymm30) | (ymm13 & ymm31 & ~ymm30) | (ymm13 & ymm31 & ymm30) ; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm13, %xmm6 ; AVX512DQ-NEXT: vporq %xmm0, %xmm6, %xmm16 ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm11 -; AVX512DQ-NEXT: vpternlogq 
$202, %ymm22, %ymm29, %ymm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm29 & ymm22) | (~ymm11 & ymm29 & ymm22) | (ymm11 & ymm29 & ~ymm22) | (ymm11 & ymm29 & ymm22) ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm11, %xmm8 ; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm7 ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm7, %xmm10 @@ -7991,7 +7993,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm20 & ymm19) | (~ymm1 & ymm20 & ymm19) | (ymm1 & ymm20 & ~ymm19) | (ymm1 & ymm20 & ymm19) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] @@ -8000,25 +8002,26 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-NEXT: vpternlogq $202, %ymm6, %ymm23, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm23 & ymm6) | (~ymm2 & ymm23 & ymm6) | (ymm2 & ymm23 & ~ymm6) | (ymm2 & ymm23 & ymm6) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq $248, %ymm4, %ymm17, %ymm0 +; 
AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm17 & ymm4) | (ymm0 & ~ymm17 & ~ymm4) | (ymm0 & ~ymm17 & ymm4) | (ymm0 & ymm17 & ~ymm4) | (ymm0 & ymm17 & ymm4) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm0 = (~zmm0 & ~zmm17 & mem) | (zmm0 & ~zmm17 & mem) | (zmm0 & zmm17 & ~mem) | (zmm0 & zmm17 & mem) ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm15 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & zmm3 & zmm0) | (zmm15 & ~zmm3 & ~zmm0) | (zmm15 & ~zmm3 & zmm0) | (zmm15 & zmm3 & zmm0) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq $248, %ymm4, %ymm21, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm21 & ymm4) | (ymm1 & ~ymm21 & ~ymm4) | (ymm1 & ~ymm21 & ymm4) | (ymm1 & ymm21 & ~ymm4) | (ymm1 & ymm21 & ymm4) ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 ; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq $226, %zmm2, %zmm17, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm17 & zmm2) | (zmm1 & ~zmm17 & zmm2) | 
(zmm1 & zmm17 & ~zmm2) | (zmm1 & zmm17 & zmm2) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm17 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm3 & zmm1) | (zmm17 & ~zmm3 & ~zmm1) | (zmm17 & ~zmm3 & zmm1) | (zmm17 & zmm3 & zmm1) ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm14, %xmm0 ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm13, %xmm1 ; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm21 @@ -8026,7 +8029,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] ; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm28 ; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm11 -; AVX512DQ-NEXT: vpternlogq $226, %ymm26, %ymm12, %ymm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm12 & ymm26) | (ymm11 & ~ymm12 & ymm26) | (ymm11 & ymm12 & ~ymm26) | (ymm11 & ymm12 & ymm26) ; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] @@ -8034,17 +8037,17 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512DQ-NEXT: vporq %xmm1, %xmm2, %xmm26 ; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512DQ-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ~ymm9 & ymm24) | (ymm14 & ~ymm9 & ymm24) | (ymm14 & ymm9 & ~ymm24) | (ymm14 & ymm9 & ymm24) ; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm10 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX512DQ-NEXT: vporq %xmm2, %xmm4, %xmm27 -; AVX512DQ-NEXT: 
vpternlogq $202, %ymm30, %ymm31, %ymm12 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ~ymm31 & ymm30) | (~ymm12 & ymm31 & ymm30) | (ymm12 & ymm31 & ~ymm30) | (ymm12 & ymm31 & ymm30) ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512DQ-NEXT: vpternlogq $202, %ymm23, %ymm6, %ymm4 -; AVX512DQ-NEXT: vpternlogq $202, %ymm29, %ymm22, %ymm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ~ymm6 & ymm23) | (~ymm4 & ymm6 & ymm23) | (ymm4 & ymm6 & ~ymm23) | (ymm4 & ymm6 & ymm23) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm22 & ymm29) | (~ymm9 & ymm22 & ymm29) | (ymm9 & ymm22 & ~ymm29) | (ymm9 & ymm22 & ymm29) ; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm8 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm9, %xmm2 @@ -8063,14 +8066,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm11 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq $236, %ymm18, %ymm11, %ymm16 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm16 = (~ymm16 & ymm11 & ~ymm18) | (~ymm16 & ymm11 & ymm18) | (ymm16 & ~ymm11 & ymm18) | (ymm16 & ymm11 & ~ymm18) | (ymm16 & ymm11 & ymm18) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpternlogq $236, %ymm18, %ymm4, %ymm21 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm21 = (~ymm21 & ymm4 & ~ymm18) | (~ymm21 & ymm4 & ymm18) | (ymm21 & ~ymm4 & ymm18) | (ymm21 & ymm4 & ~ymm18) | (ymm21 & ymm4 & ymm18) ; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded 
Reload -; AVX512DQ-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm19 & ymm20) | (~ymm5 & ymm19 & ymm20) | (ymm5 & ymm19 & ~ymm20) | (ymm5 & ymm19 & ymm20) ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512DQ-NEXT: vpternlogq $248, %ymm18, %ymm4, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm4 & ymm18) | (ymm1 & ~ymm4 & ~ymm18) | (ymm1 & ~ymm4 & ymm18) | (ymm1 & ymm4 & ~ymm18) | (ymm1 & ymm4 & ymm18) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm1 @@ -8082,24 +8085,24 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %ymm23, %ymm2, %ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm2 & ymm23) | (ymm6 & ~ymm2 & ymm23) | (ymm6 & ymm2 & ~ymm23) | (ymm6 & ymm2 & ymm23) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9 -; AVX512DQ-NEXT: vpternlogq $248, %ymm18, %ymm9, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ymm9 & ymm18) | (ymm5 & ~ymm9 & ~ymm18) | (ymm5 & ~ymm9 & ymm18) | (ymm5 & ymm9 & ~ymm18) | (ymm5 & ymm9 & ymm18) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512DQ-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm19 & ymm20) | (~ymm2 & ymm19 & ymm20) | (ymm2 & ymm19 & ~ymm20) | (ymm2 & ymm19 & ymm20) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq $242, %ymm7, %ymm9, %ymm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm9 & ymm7) | (ymm11 & ~ymm9 & ~ymm7) | (ymm11 & ~ymm9 & ymm7) | (ymm11 & ymm9 & ~ymm7) | (ymm11 & ymm9 & ymm7) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm7 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpternlogq $226, %zmm11, %zmm9, %zmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm9 & zmm11) | (zmm8 & ~zmm9 & zmm11) | (zmm8 & zmm9 & ~zmm11) | (zmm8 & zmm9 & zmm11) ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512DQ-NEXT: vpternlogq $184, %zmm8, %zmm11, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm11 & zmm8) | (zmm7 & ~zmm11 & ~zmm8) | (zmm7 & ~zmm11 & zmm8) | (zmm7 & zmm11 & zmm8) ; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm8 ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpshufb %xmm13, %xmm12, %xmm8 @@ -8109,22 +8112,22 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpternlogq $242, %ymm0, %ymm9, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & 
~ymm9 & ymm0) | (ymm2 & ~ymm9 & ~ymm0) | (ymm2 & ~ymm9 & ymm0) | (ymm2 & ymm9 & ~ymm0) | (ymm2 & ymm9 & ymm0) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm2 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpternlogq $226, %zmm2, %zmm9, %zmm1 -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm11, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm9 & zmm2) | (zmm1 & ~zmm9 & zmm2) | (zmm1 & zmm9 & ~zmm2) | (zmm1 & zmm9 & zmm2) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm11 & zmm1) | (zmm0 & ~zmm11 & ~zmm1) | (zmm0 & ~zmm11 & zmm1) | (zmm0 & zmm11 & zmm1) ; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm16 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm2 & zmm1) | (zmm16 & ~zmm2 & ~zmm1) | (zmm16 & ~zmm2 & zmm1) | (zmm16 & zmm2 & zmm1) ; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm21 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & zmm2 & zmm1) | (zmm21 & ~zmm2 & ~zmm1) | (zmm21 & ~zmm2 & zmm1) | (zmm21 & zmm2 & zmm1) ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512DQ-NEXT: vpternlogq $184, %zmm16, %zmm1, %zmm4 -; AVX512DQ-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm1 & zmm16) | (zmm4 & ~zmm1 & ~zmm16) | 
(zmm4 & ~zmm1 & zmm16) | (zmm4 & zmm1 & zmm16) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm1 & zmm21) | (zmm5 & ~zmm1 & ~zmm21) | (zmm5 & ~zmm1 & zmm21) | (zmm5 & zmm1 & zmm21) ; AVX512DQ-NEXT: vmovdqa64 %zmm15, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rcx) @@ -8144,7 +8147,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm25, %ymm26, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm26 & ymm25) | (~ymm0 & ymm26 & ymm25) | (ymm0 & ymm26 & ~ymm25) | (ymm0 & ymm26 & ymm25) ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 @@ -8155,7 +8158,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm24, %ymm18, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm18 & ymm24) | (~ymm6 & ymm18 & ymm24) | (ymm6 & ymm18 & ~ymm24) | (ymm6 & ymm18 & ymm24) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm10 @@ -8166,7 +8169,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm9 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} 
ymm9 = (~ymm9 & ~ymm30 & ymm31) | (~ymm9 & ymm30 & ymm31) | (ymm9 & ymm30 & ~ymm31) | (ymm9 & ymm30 & ymm31) ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm13 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm13, %xmm5 @@ -8174,7 +8177,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm29, %ymm22, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm22 & ymm29) | (~ymm1 & ymm22 & ymm29) | (ymm1 & ymm22 & ~ymm29) | (ymm1 & ymm22 & ymm29) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 @@ -8200,7 +8203,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm26, %ymm25, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ~ymm25 & ymm26) | (~ymm4 & ymm25 & ymm26) | (ymm4 & ymm25 & ~ymm26) | (ymm4 & ymm25 & ymm26) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm15 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] @@ -8208,7 +8211,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm5 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm18, %ymm24, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq 
{{.*#+}} ymm5 = (~ymm5 & ~ymm24 & ymm18) | (~ymm5 & ymm24 & ymm18) | (ymm5 & ymm24 & ~ymm18) | (ymm5 & ymm24 & ymm18) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm7 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 @@ -8217,13 +8220,13 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm13, %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm13 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = (~ymm13 & ~ymm31 & ymm30) | (~ymm13 & ymm31 & ymm30) | (ymm13 & ymm31 & ~ymm30) | (ymm13 & ymm31 & ymm30) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm6 ; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm6, %xmm16 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm11 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm22, %ymm29, %ymm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm29 & ymm22) | (~ymm11 & ymm29 & ymm22) | (ymm11 & ymm29 & ~ymm22) | (ymm11 & ymm29 & ymm22) ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm8 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm7 ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm10 @@ -8244,7 +8247,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm20 & ymm19) | (~ymm1 & ymm20 & ymm19) | (ymm1 & ymm20 & ~ymm19) | (ymm1 & ymm20 & ymm19) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} 
ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] @@ -8253,25 +8256,26 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm6, %ymm23, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm23 & ymm6) | (~ymm2 & ymm23 & ymm6) | (ymm2 & ymm23 & ~ymm6) | (ymm2 & ymm23 & ymm6) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm4, %ymm17, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm17 & ymm4) | (ymm0 & ~ymm17 & ~ymm4) | (ymm0 & ~ymm17 & ymm4) | (ymm0 & ymm17 & ~ymm4) | (ymm0 & ymm17 & ymm4) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm0 = (~zmm0 & ~zmm17 & mem) | (zmm0 & ~zmm17 & mem) | (zmm0 & zmm17 & ~mem) | (zmm0 & zmm17 & mem) ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm15 +; AVX512DQ-FCP-NEXT: 
vpternlogq {{.*#+}} zmm15 = (~zmm15 & zmm3 & zmm0) | (zmm15 & ~zmm3 & ~zmm0) | (zmm15 & ~zmm3 & zmm0) | (zmm15 & zmm3 & zmm0) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm4, %ymm21, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm21 & ymm4) | (ymm1 & ~ymm21 & ~ymm4) | (ymm1 & ~ymm21 & ymm4) | (ymm1 & ymm21 & ~ymm4) | (ymm1 & ymm21 & ymm4) ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm2, %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm17 & zmm2) | (zmm1 & ~zmm17 & zmm2) | (zmm1 & zmm17 & ~zmm2) | (zmm1 & zmm17 & zmm2) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm17 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm3 & zmm1) | (zmm17 & ~zmm3 & ~zmm1) | (zmm17 & ~zmm3 & zmm1) | (zmm17 & zmm3 & zmm1) ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm1 ; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm1, %xmm21 @@ -8279,7 +8283,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] ; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm1, %xmm28 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm11 
-; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm26, %ymm12, %ymm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm12 & ymm26) | (ymm11 & ~ymm12 & ymm26) | (ymm11 & ymm12 & ~ymm26) | (ymm11 & ymm12 & ymm26) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] @@ -8287,17 +8291,17 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512DQ-FCP-NEXT: vporq %xmm1, %xmm2, %xmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (~ymm14 & ~ymm9 & ymm24) | (ymm14 & ~ymm9 & ymm24) | (ymm14 & ymm9 & ~ymm24) | (ymm14 & ymm9 & ymm24) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm10 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX512DQ-FCP-NEXT: vporq %xmm2, %xmm4, %xmm27 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ~ymm31 & ymm30) | (~ymm12 & ymm31 & ymm30) | (ymm12 & ymm31 & ~ymm30) | (ymm12 & ymm31 & ymm30) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm23, %ymm6, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm29, %ymm22, %ymm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ~ymm6 & ymm23) | (~ymm4 & ymm6 & ymm23) | (ymm4 & ymm6 & ~ymm23) | (ymm4 & ymm6 & ymm23) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ~ymm22 & ymm29) | (~ymm9 & ymm22 & ymm29) | (ymm9 & ymm22 & ~ymm29) | (ymm9 & ymm22 & ymm29) ; 
AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm8 ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2 @@ -8316,14 +8320,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm18, %ymm11, %ymm16 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = (~ymm16 & ymm11 & ~ymm18) | (~ymm16 & ymm11 & ymm18) | (ymm16 & ~ymm11 & ymm18) | (ymm16 & ymm11 & ~ymm18) | (ymm16 & ymm11 & ymm18) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm18, %ymm4, %ymm21 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm21 = (~ymm21 & ymm4 & ~ymm18) | (~ymm21 & ymm4 & ymm18) | (ymm21 & ~ymm4 & ymm18) | (ymm21 & ymm4 & ~ymm18) | (ymm21 & ymm4 & ymm18) ; AVX512DQ-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ~ymm19 & ymm20) | (~ymm5 & ymm19 & ymm20) | (ymm5 & ymm19 & ~ymm20) | (ymm5 & ymm19 & ymm20) ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm18, %ymm4, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm4 & ymm18) | (ymm1 & ~ymm4 & ~ymm18) | (ymm1 & ~ymm4 & ymm18) | (ymm1 & ymm4 & ~ymm18) | (ymm1 & ymm4 & ymm18) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vpshufb 
%ymm11, %ymm5, %ymm5 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm1 @@ -8335,24 +8339,24 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm23, %ymm2, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm2 & ymm23) | (ymm6 & ~ymm2 & ymm23) | (ymm6 & ymm2 & ~ymm23) | (ymm6 & ymm2 & ymm23) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9 -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm18, %ymm9, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (~ymm5 & ymm9 & ymm18) | (ymm5 & ~ymm9 & ~ymm18) | (ymm5 & ~ymm9 & ymm18) | (ymm5 & ymm9 & ~ymm18) | (ymm5 & ymm9 & ymm18) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm19 & ymm20) | (~ymm2 & ymm19 & ymm20) | (ymm2 & ymm19 & ~ymm20) | (ymm2 & ymm19 & ymm20) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq $242, %ymm7, %ymm9, %ymm11 +; AVX512DQ-FCP-NEXT: 
vpternlogq {{.*#+}} ymm11 = (~ymm11 & ~ymm9 & ymm7) | (ymm11 & ~ymm9 & ~ymm7) | (ymm11 & ~ymm9 & ymm7) | (ymm11 & ymm9 & ~ymm7) | (ymm11 & ymm9 & ymm7) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm7 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm11, %zmm9, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm9 & zmm11) | (zmm8 & ~zmm9 & zmm11) | (zmm8 & zmm9 & ~zmm11) | (zmm8 & zmm9 & zmm11) ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm8, %zmm11, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm11 & zmm8) | (zmm7 & ~zmm11 & ~zmm8) | (zmm7 & ~zmm11 & zmm8) | (zmm7 & zmm11 & zmm8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm8 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm8 @@ -8362,22 +8366,22 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq $242, %ymm0, %ymm9, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ~ymm9 & ymm0) | (ymm2 & ~ymm9 & ~ymm0) | (ymm2 & ~ymm9 & ymm0) | (ymm2 & ymm9 & ~ymm0) | (ymm2 & ymm9 & ymm0) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm2, %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq 
{{.*#+}} zmm1 = (~zmm1 & ~zmm9 & zmm2) | (zmm1 & ~zmm9 & zmm2) | (zmm1 & zmm9 & ~zmm2) | (zmm1 & zmm9 & zmm2) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm11 & zmm1) | (zmm0 & ~zmm11 & ~zmm1) | (zmm0 & ~zmm11 & zmm1) | (zmm0 & zmm11 & zmm1) ; AVX512DQ-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm16 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm2 & zmm1) | (zmm16 & ~zmm2 & ~zmm1) | (zmm16 & ~zmm2 & zmm1) | (zmm16 & zmm2 & zmm1) ; AVX512DQ-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm21 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & zmm2 & zmm1) | (zmm21 & ~zmm2 & ~zmm1) | (zmm21 & ~zmm2 & zmm1) | (zmm21 & zmm2 & zmm1) ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm16, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm1 & zmm16) | (zmm4 & ~zmm1 & ~zmm16) | (zmm4 & ~zmm1 & zmm16) | (zmm4 & zmm1 & zmm16) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm1 & zmm21) | (zmm5 & ~zmm1 & ~zmm21) | (zmm5 & ~zmm1 & zmm21) | (zmm5 & zmm1 & zmm21) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rcx) diff --git 
a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll index 7d2f52d3c5830e..cd481a30cb211e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -444,7 +444,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] -; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm3 & ~mem) | (ymm2 & ~ymm3 & mem) | (ymm2 & ymm3 & ~mem) | (ymm2 & ymm3 & mem) ; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7] @@ -472,7 +472,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] -; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm3 & ~mem) | (ymm2 & ~ymm3 & mem) | (ymm2 & ymm3 & ~mem) | (ymm2 & ymm3 & mem) ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] @@ -499,7 +499,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] -; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm3 & ~mem) | (ymm2 & ~ymm3 & mem) | (ymm2 & ymm3 & ~mem) | (ymm2 & ymm3 & mem) ; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7] @@ -527,7 +527,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] -; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm3 & ~mem) | (ymm2 & ~ymm3 & mem) | (ymm2 & ymm3 & ~mem) | (ymm2 & ymm3 & mem) ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] @@ -936,7 +936,7 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm5 & mem) | (zmm4 & ~zmm5 & ~mem) | (zmm4 & zmm5 & ~mem) 
| (zmm4 & zmm5 & mem) ; AVX512-NEXT: vpsrlq $48, %xmm3, %xmm3 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -975,7 +975,7 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm5 & mem) | (zmm4 & ~zmm5 & ~mem) | (zmm4 & zmm5 & ~mem) | (zmm4 & zmm5 & mem) ; AVX512-FCP-NEXT: vpsrlq $48, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] @@ -1020,7 +1020,7 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm5 & mem) | (zmm4 & ~zmm5 & ~mem) | (zmm4 & zmm5 & ~mem) | (zmm4 & zmm5 & mem) ; AVX512DQ-NEXT: vpsrlq $48, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -1059,7 +1059,7 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = 
(~zmm4 & zmm5 & mem) | (zmm4 & ~zmm5 & ~mem) | (zmm4 & zmm5 & ~mem) | (zmm4 & zmm5 & mem) ; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] @@ -1737,7 +1737,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ymm5 & mem) | (ymm8 & ~ymm5 & ~mem) | (ymm8 & ymm5 & ~mem) | (ymm8 & ymm5 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] @@ -1748,12 +1748,12 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512-NEXT: vpternlogq $226, %ymm7, %ymm8, %ymm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm8 & ymm7) | (ymm6 & ~ymm8 & ymm7) | (ymm6 & ymm8 & ~ymm7) | (ymm6 & ymm8 & ymm7) ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm5 & mem) | (zmm6 & ~zmm5 & 
~mem) | (zmm6 & zmm5 & ~mem) | (zmm6 & zmm5 & mem) ; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] @@ -1761,7 +1761,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] ; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX512-NEXT: vpternlogq $226, %ymm5, %ymm9, %ymm7 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm9 & ymm5) | (ymm7 & ~ymm9 & ymm5) | (ymm7 & ymm9 & ~ymm5) | (ymm7 & ymm9 & ymm5) ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] ; AVX512-NEXT: vprolq $16, %ymm1, %ymm10 ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] @@ -1770,14 +1770,14 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] ; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm10 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm8 & ymm5) | (ymm10 & ~ymm8 & ymm5) | (ymm10 & ymm8 & ~ymm5) | (ymm10 & ymm8 & ymm5) ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm5 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = 
zmm7[0,1,2,3],zmm5[4,5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512-NEXT: vpandnq 16(%r8){1to4}, %ymm7, %ymm7 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm5 & mem) | (zmm4 & ~zmm5 & ~mem) | (zmm4 & ~zmm5 & mem) | (zmm4 & zmm5 & ~mem) | (zmm4 & zmm5 & mem) ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] @@ -1788,9 +1788,9 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX512-NEXT: vpternlogq $226, %ymm2, %ymm9, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm9 & ymm2) | (ymm0 & ~ymm9 & ymm2) | (ymm0 & ymm9 & ~ymm2) | (ymm0 & ymm9 & ymm2) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogq $202, 24(%r8){1to4}, %ymm0, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm0 & mem) | (~ymm1 & ymm0 & mem) | (ymm1 & ymm0 & ~mem) | (ymm1 & ymm0 & mem) ; AVX512-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX512-NEXT: vmovdqa64 %zmm4, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm6, (%r9) @@ -1815,7 +1815,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ymm5 & mem) | (ymm8 & ~ymm5 & ~mem) | (ymm8 & ymm5 & ~mem) | (ymm8 & ymm5 & mem) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] @@ -1825,12 +1825,12 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %ymm7, %ymm8, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm8 & ymm7) | (ymm6 & ~ymm8 & ymm7) | (ymm6 & ymm8 & ~ymm7) | (ymm6 & ymm8 & ymm7) ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm5 & mem) | (zmm6 & ~zmm5 & ~mem) | (zmm6 & zmm5 & ~mem) | (zmm6 & zmm5 & mem) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] @@ -1838,7 +1838,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %ymm5, %ymm9, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm9 & ymm5) | (ymm7 & ~ymm9 & ymm5) | (ymm7 & ymm9 & ~ymm5) | (ymm7 & ymm9 & ymm5) ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] ; AVX512-FCP-NEXT: vprolq $16, %ymm1, %ymm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] @@ -1847,14 +1847,14 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512-FCP-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm8 & ymm5) | (ymm10 & ~ymm8 & ymm5) | (ymm10 & ymm8 & ~ymm5) | (ymm10 & ymm8 & ymm5) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpandnq 16(%r8){1to4}, 
%ymm7, %ymm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm5 & mem) | (zmm4 & ~zmm5 & ~mem) | (zmm4 & ~zmm5 & mem) | (zmm4 & zmm5 & ~mem) | (zmm4 & zmm5 & mem) ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] @@ -1863,9 +1863,9 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX512-FCP-NEXT: vpternlogq $226, %ymm2, %ymm9, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm9 & ymm2) | (ymm0 & ~ymm9 & ymm2) | (ymm0 & ymm9 & ~ymm2) | (ymm0 & ymm9 & ymm2) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512-FCP-NEXT: vpternlogq $202, 24(%r8){1to4}, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm0 & mem) | (~ymm1 & ymm0 & mem) | (ymm1 & ymm0 & ~mem) | (ymm1 & ymm0 & mem) ; AVX512-FCP-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%r9) @@ -1890,7 +1890,7 @@ define void 
@store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ymm5 & mem) | (ymm8 & ~ymm5 & ~mem) | (ymm8 & ymm5 & ~mem) | (ymm8 & ymm5 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] @@ -1901,12 +1901,12 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %ymm7, %ymm8, %ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm8 & ymm7) | (ymm6 & ~ymm8 & ymm7) | (ymm6 & ymm8 & ~ymm7) | (ymm6 & ymm8 & ymm7) ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm5 & mem) | (zmm6 & ~zmm5 & ~mem) | (zmm6 & zmm5 & ~mem) | (zmm6 & zmm5 & mem) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] @@ -1914,7 +1914,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX512DQ-NEXT: vpternlogq $226, %ymm5, %ymm9, %ymm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm9 & ymm5) | (ymm7 & ~ymm9 & ymm5) | (ymm7 & ymm9 & ~ymm5) | (ymm7 & ymm9 & ymm5) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] ; AVX512DQ-NEXT: vprolq $16, %ymm1, %ymm10 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] @@ -1923,14 +1923,14 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512DQ-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm8 & ymm5) | (ymm10 & ~ymm8 & ymm5) | (ymm10 & ymm8 & ~ymm5) | (ymm10 & ymm8 & ymm5) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm5 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512DQ-NEXT: vpandnq 16(%r8){1to4}, %ymm7, %ymm7 ; AVX512DQ-NEXT: vpshufb 
{{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm5 & mem) | (zmm4 & ~zmm5 & ~mem) | (zmm4 & ~zmm5 & mem) | (zmm4 & zmm5 & ~mem) | (zmm4 & zmm5 & mem) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] @@ -1941,9 +1941,9 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX512DQ-NEXT: vpternlogq $226, %ymm2, %ymm9, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm9 & ymm2) | (ymm0 & ~ymm9 & ymm2) | (ymm0 & ymm9 & ~ymm2) | (ymm0 & ymm9 & ymm2) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogq $202, 24(%r8){1to4}, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm0 & mem) | (~ymm1 & ymm0 & mem) | (ymm1 & ymm0 & ~mem) | (ymm1 & ymm0 & mem) ; AVX512DQ-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%r9) @@ -1968,7 +1968,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] 
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = (~ymm8 & ymm5 & mem) | (ymm8 & ~ymm5 & ~mem) | (ymm8 & ymm5 & ~mem) | (ymm8 & ymm5 & mem) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] @@ -1978,12 +1978,12 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm7, %ymm8, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ~ymm8 & ymm7) | (ymm6 & ~ymm8 & ymm7) | (ymm6 & ymm8 & ~ymm7) | (ymm6 & ymm8 & ymm7) ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm5 & mem) | (zmm6 & ~zmm5 & ~mem) | (zmm6 & zmm5 & ~mem) | (zmm6 & zmm5 & mem) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] @@ -1991,7 +1991,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm5, %ymm9, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ~ymm9 & ymm5) | (ymm7 & ~ymm9 & ymm5) | (ymm7 & ymm9 & ~ymm5) | (ymm7 & ymm9 & ymm5) ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] ; AVX512DQ-FCP-NEXT: vprolq $16, %ymm1, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] @@ -2000,14 +2000,14 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (~ymm10 & ~ymm8 & ymm5) | (ymm10 & ~ymm8 & ymm5) | (ymm10 & ymm8 & ~ymm5) | (ymm10 & ymm8 & ymm5) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm7, %ymm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = 
zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm5 & mem) | (zmm4 & ~zmm5 & ~mem) | (zmm4 & ~zmm5 & mem) | (zmm4 & zmm5 & ~mem) | (zmm4 & zmm5 & mem) ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] @@ -2016,9 +2016,9 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm2, %ymm9, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ~ymm9 & ymm2) | (ymm0 & ~ymm9 & ymm2) | (ymm0 & ymm9 & ~ymm2) | (ymm0 & ymm9 & ymm2) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512DQ-FCP-NEXT: vpternlogq $202, 24(%r8){1to4}, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ~ymm0 & mem) | (~ymm1 & ymm0 & mem) | (ymm1 & ymm0 & ~mem) | (ymm1 & ymm0 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%r9) @@ -3383,11 +3383,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm2[0,1,0,1] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & ~zmm16 & zmm0) | (zmm3 & ~zmm16 & zmm0) | (zmm3 & zmm16 & ~zmm0) | (zmm3 & zmm16 & zmm0) ; AVX512-NEXT: vpbroadcastq 24(%r8), %ymm0 ; AVX512-NEXT: vpbroadcastq 32(%r8), %ymm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm19 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm19 = (~zmm19 & zmm3 & mem) | (zmm19 & ~zmm3 & ~mem) | (zmm19 & zmm3 & ~mem) | (zmm19 & zmm3 & mem) ; AVX512-NEXT: vmovdqa 32(%rsi), %ymm3 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm0 @@ -3413,7 +3413,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm9, %zmm2 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm11 & mem) | (zmm2 & ~zmm11 & ~mem) | (zmm2 & zmm11 & ~mem) | (zmm2 & zmm11 & mem) ; AVX512-NEXT: vmovdqa (%r8), %ymm9 ; AVX512-NEXT: vmovdqa 32(%r8), %ymm10 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] @@ -3422,7 +3422,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = 
ymm10[0,1,1,1] ; AVX512-NEXT: vpandnq %ymm10, %ymm21, %ymm10 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm10, %zmm10 -; AVX512-NEXT: vpternlogq $248, %zmm21, %zmm2, %zmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm2 & zmm21) | (zmm10 & ~zmm2 & ~zmm21) | (zmm10 & ~zmm2 & zmm21) | (zmm10 & zmm2 & ~zmm21) | (zmm10 & zmm2 & zmm21) ; AVX512-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] ; AVX512-NEXT: vpshufb %xmm13, %xmm4, %xmm4 @@ -3441,11 +3441,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti32x4 $2, %xmm4, %zmm7, %zmm4 ; AVX512-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,1,0,1,4,5,4,5] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-NEXT: vpternlogq $226, %zmm2, %zmm7, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm7 & zmm2) | (zmm4 & ~zmm7 & zmm2) | (zmm4 & zmm7 & ~zmm2) | (zmm4 & zmm7 & zmm2) ; AVX512-NEXT: vpbroadcastq (%r8), %ymm2 ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,1,1] ; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm4 & mem) | (zmm2 & ~zmm4 & ~mem) | (zmm2 & zmm4 & ~mem) | (zmm2 & zmm4 & mem) ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5] ; AVX512-NEXT: vprolq $16, %ymm3, %ymm8 ; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3],ymm4[4],ymm8[5,6],ymm4[7],ymm8[8,9],ymm4[10],ymm8[11],ymm4[12],ymm8[13,14],ymm4[15] @@ -3468,12 +3468,12 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3,4],ymm0[5,6,7,8],ymm6[9],ymm0[10],ymm6[11,12],ymm0[13,14,15] ; AVX512-NEXT: vpermq 
{{.*#+}} ymm0 = ymm0[2,2,3,2] ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512-NEXT: vpternlogq $226, %zmm4, %zmm7, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm7 & zmm4) | (zmm0 & ~zmm7 & zmm4) | (zmm0 & zmm7 & ~zmm4) | (zmm0 & zmm7 & zmm4) ; AVX512-NEXT: vpbroadcastq 48(%r8), %ymm4 ; AVX512-NEXT: vpbroadcastq 56(%r8), %ymm6 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogd $184, %zmm0, %zmm6, %zmm4 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm6 & zmm0) | (zmm4 & ~zmm6 & ~zmm0) | (zmm4 & ~zmm6 & zmm0) | (zmm4 & zmm6 & zmm0) ; AVX512-NEXT: vmovdqa64 %ymm22, %ymm0 ; AVX512-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm20[1,1,2,2] @@ -3492,11 +3492,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 -; AVX512-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm16 & zmm0) | (zmm1 & ~zmm16 & zmm0) | (zmm1 & zmm16 & ~zmm0) | (zmm1 & zmm16 & zmm0) ; AVX512-NEXT: vpandnq 16(%r8){1to4}, %ymm6, %ymm0 ; AVX512-NEXT: vpshufb %ymm11, %ymm9, %ymm3 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & ~mem) | (zmm0 & ~zmm1 & mem) | (zmm0 & zmm1 & ~mem) | (zmm0 & zmm1 & mem) ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm4, 256(%r9) ; AVX512-NEXT: vmovdqa64 %zmm2, 
(%r9) @@ -3540,7 +3540,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm13[1],xmm2[2],xmm13[3],xmm2[4,5],xmm13[6],xmm2[7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm0 & mem) | (zmm1 & ~zmm0 & ~mem) | (zmm1 & zmm0 & ~mem) | (zmm1 & zmm0 & mem) ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] @@ -3550,7 +3550,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] ; AVX512-FCP-NEXT: vpandnq %ymm2, %ymm16, %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm18 -; AVX512-FCP-NEXT: vpternlogq $248, %zmm16, %zmm1, %zmm18 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = (~zmm18 & zmm1 & zmm16) | (zmm18 & ~zmm1 & ~zmm16) | (zmm18 & ~zmm1 & zmm16) | (zmm18 & zmm1 & ~zmm16) | (zmm18 & zmm1 & zmm16) ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm3 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm15 @@ -3574,11 +3574,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm14, %zmm16, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm16 & zmm14) | (zmm1 & ~zmm16 & zmm14) 
| (zmm1 & zmm16 & ~zmm14) | (zmm1 & zmm16 & zmm14) ; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm3 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,1,1] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm10 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm10 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm10 = (~zmm10 & zmm1 & mem) | (zmm10 & ~zmm1 & ~mem) | (zmm10 & zmm1 & ~mem) | (zmm10 & zmm1 & mem) ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm15 ; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm15, %ymm1 @@ -3602,13 +3602,13 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm3, %zmm19, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm19 & zmm3) | (zmm2 & ~zmm19 & zmm3) | (zmm2 & zmm19 & ~zmm3) | (zmm2 & zmm19 & zmm3) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX512-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm3, %ymm13 ; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm12 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm2 & mem) | (zmm0 & ~zmm2 & ~mem) | (zmm0 & ~zmm2 & mem) | (zmm0 & zmm2 & ~mem) | (zmm0 & zmm2 & mem) ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; 
AVX512-FCP-NEXT: # ymm13 = mem[0,1,0,1] ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm2 @@ -3628,11 +3628,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5],ymm8[6],ymm1[7,8],ymm8[9],ymm1[10,11],ymm8[12],ymm1[13],ymm8[14],ymm1[15] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [2,3,2,3,10,10,11,10] ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm8 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm16 & zmm7) | (zmm8 & ~zmm16 & zmm7) | (zmm8 & zmm16 & ~zmm7) | (zmm8 & zmm16 & zmm7) ; AVX512-FCP-NEXT: vpbroadcastq 48(%r8), %ymm1 ; AVX512-FCP-NEXT: vpbroadcastq 56(%r8), %ymm5 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm8, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm3 & zmm8) | (zmm1 & ~zmm3 & ~zmm8) | (zmm1 & ~zmm3 & zmm8) | (zmm1 & zmm3 & zmm8) ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm5 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 @@ -3651,11 +3651,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,3,2,2,8,9,8,9] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm4, %zmm19, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm19 & zmm4) | (zmm5 & ~zmm19 & zmm4) | (zmm5 & zmm19 & ~zmm4) | (zmm5 & zmm19 & zmm4) ; AVX512-FCP-NEXT: vpbroadcastq 24(%r8), %ymm2 ; AVX512-FCP-NEXT: vpbroadcastq 32(%r8), %ymm3 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpternlogd $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm5 & mem) | (zmm2 & ~zmm5 & ~mem) | (zmm2 & zmm5 & ~mem) | (zmm2 & zmm5 & mem) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 128(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) @@ -3697,11 +3697,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm2[0,1,0,1] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & ~zmm16 & zmm0) | (zmm3 & ~zmm16 & zmm0) | (zmm3 & zmm16 & ~zmm0) | (zmm3 & zmm16 & zmm0) ; AVX512DQ-NEXT: vpbroadcastq 24(%r8), %ymm0 ; AVX512DQ-NEXT: vpbroadcastq 32(%r8), %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm19 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm19 = (~zmm19 & zmm3 & mem) | (zmm19 & ~zmm3 & ~mem) | (zmm19 & zmm3 & ~mem) | (zmm19 & zmm3 & mem) ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm3, %ymm0 @@ -3727,7 +3727,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm9, %zmm2 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm11 & mem) | (zmm2 & ~zmm11 & ~mem) | (zmm2 
& zmm11 & ~mem) | (zmm2 & zmm11 & mem) ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm9 ; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm10 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] @@ -3736,7 +3736,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,1] ; AVX512DQ-NEXT: vpandnq %ymm10, %ymm21, %ymm10 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm10, %zmm10 -; AVX512DQ-NEXT: vpternlogq $248, %zmm21, %zmm2, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm2 & zmm21) | (zmm10 & ~zmm2 & ~zmm21) | (zmm10 & ~zmm2 & zmm21) | (zmm10 & zmm2 & ~zmm21) | (zmm10 & zmm2 & zmm21) ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] ; AVX512DQ-NEXT: vpshufb %xmm13, %xmm4, %xmm4 @@ -3755,11 +3755,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm4, %zmm7, %zmm4 ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,1,0,1,4,5,4,5] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm2, %zmm7, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm7 & zmm2) | (zmm4 & ~zmm7 & zmm2) | (zmm4 & zmm7 & ~zmm2) | (zmm4 & zmm7 & zmm2) ; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm2 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,1,1] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm4 & mem) | (zmm2 & ~zmm4 & ~mem) | (zmm2 & zmm4 & ~mem) | (zmm2 & zmm4 & mem) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = 
ymm18[0,1,2,1,4,5,6,5] ; AVX512DQ-NEXT: vprolq $16, %ymm3, %ymm8 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3],ymm4[4],ymm8[5,6],ymm4[7],ymm8[8,9],ymm4[10],ymm8[11],ymm4[12],ymm8[13,14],ymm4[15] @@ -3782,12 +3782,12 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3,4],ymm0[5,6,7,8],ymm6[9],ymm0[10],ymm6[11,12],ymm0[13,14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512DQ-NEXT: vpternlogq $226, %zmm4, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm7 & zmm4) | (zmm0 & ~zmm7 & zmm4) | (zmm0 & zmm7 & ~zmm4) | (zmm0 & zmm7 & zmm4) ; AVX512DQ-NEXT: vpbroadcastq 48(%r8), %ymm4 ; AVX512DQ-NEXT: vpbroadcastq 56(%r8), %ymm6 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogd $184, %zmm0, %zmm6, %zmm4 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm6 & zmm0) | (zmm4 & ~zmm6 & ~zmm0) | (zmm4 & ~zmm6 & zmm0) | (zmm4 & zmm6 & zmm0) ; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm20[1,1,2,2] @@ -3806,11 +3806,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 -; AVX512DQ-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm16 & zmm0) | (zmm1 & ~zmm16 & zmm0) | (zmm1 & zmm16 & ~zmm0) | (zmm1 
& zmm16 & zmm0) ; AVX512DQ-NEXT: vpandnq 16(%r8){1to4}, %ymm6, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm9, %ymm3 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512DQ-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & ~mem) | (zmm0 & ~zmm1 & mem) | (zmm0 & zmm1 & ~mem) | (zmm0 & zmm1 & mem) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%r9) @@ -3854,7 +3854,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm13[1],xmm2[2],xmm13[3],xmm2[4,5],xmm13[6],xmm2[7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm0 & mem) | (zmm1 & ~zmm0 & ~mem) | (zmm1 & zmm0 & ~mem) | (zmm1 & zmm0 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] @@ -3864,7 +3864,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] ; AVX512DQ-FCP-NEXT: vpandnq %ymm2, %ymm16, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm18 -; AVX512DQ-FCP-NEXT: vpternlogq $248, %zmm16, %zmm1, %zmm18 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = (~zmm18 & zmm1 & zmm16) | (zmm18 & ~zmm1 & ~zmm16) | (zmm18 & ~zmm1 & zmm16) | (zmm18 & zmm1 & ~zmm16) | (zmm18 & zmm1 & zmm16) ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm15 @@ -3888,11 
+3888,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm14, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm16 & zmm14) | (zmm1 & ~zmm16 & zmm14) | (zmm1 & zmm16 & ~zmm14) | (zmm1 & zmm16 & zmm14) ; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm3 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,1,1] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm10 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm10 = (~zmm10 & zmm1 & mem) | (zmm10 & ~zmm1 & ~mem) | (zmm10 & zmm1 & ~mem) | (zmm10 & zmm1 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm15, %ymm1 @@ -3916,13 +3916,13 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm3, %zmm19, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm19 & zmm3) | (zmm2 & ~zmm19 & zmm3) | (zmm2 & zmm19 & ~zmm3) | (zmm2 & zmm19 & zmm3) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; 
AVX512DQ-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm3, %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm12 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm2 & mem) | (zmm0 & ~zmm2 & ~mem) | (zmm0 & ~zmm2 & mem) | (zmm0 & zmm2 & ~mem) | (zmm0 & zmm2 & mem) ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm2 @@ -3942,11 +3942,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5],ymm8[6],ymm1[7,8],ymm8[9],ymm1[10,11],ymm8[12],ymm1[13],ymm8[14],ymm1[15] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [2,3,2,3,10,10,11,10] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm1, %zmm8 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm16 & zmm7) | (zmm8 & ~zmm16 & zmm7) | (zmm8 & zmm16 & ~zmm7) | (zmm8 & zmm16 & zmm7) ; AVX512DQ-FCP-NEXT: vpbroadcastq 48(%r8), %ymm1 ; AVX512DQ-FCP-NEXT: vpbroadcastq 56(%r8), %ymm5 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm8, %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm3 & zmm8) | (zmm1 & ~zmm3 & ~zmm8) | (zmm1 & ~zmm3 & zmm8) | (zmm1 & zmm3 & zmm8) ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 @@ -3965,11 +3965,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw 
{{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,3,2,2,8,9,8,9] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm4, %zmm19, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm19 & zmm4) | (zmm5 & ~zmm19 & zmm4) | (zmm5 & zmm19 & ~zmm4) | (zmm5 & zmm19 & zmm4) ; AVX512DQ-FCP-NEXT: vpbroadcastq 24(%r8), %ymm2 ; AVX512DQ-FCP-NEXT: vpbroadcastq 32(%r8), %ymm3 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm5 & mem) | (zmm2 & ~zmm5 & ~mem) | (zmm2 & zmm5 & ~mem) | (zmm2 & zmm5 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 128(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) @@ -6984,20 +6984,23 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm4 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm31 # 64-byte Folded Reload +; AVX512-NEXT: # zmm31 = (~zmm31 & ~zmm19 & mem) | (zmm31 & ~zmm19 & mem) | (zmm31 & zmm19 & ~mem) | (zmm31 & zmm19 & mem) ; AVX512-NEXT: vpbroadcastq 88(%r8), %ymm1 ; AVX512-NEXT: vpbroadcastq 96(%r8), %ymm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512-NEXT: vpternlogd $184, %zmm31, %zmm18, %zmm1 -; AVX512-NEXT: vpternlogq $226, %zmm25, %zmm19, %zmm27 +; 
AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm18 & zmm31) | (zmm1 & ~zmm18 & ~zmm31) | (zmm1 & ~zmm18 & zmm31) | (zmm1 & zmm18 & zmm31) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & ~zmm19 & zmm25) | (zmm27 & ~zmm19 & zmm25) | (zmm27 & zmm19 & ~zmm25) | (zmm27 & zmm19 & zmm25) ; AVX512-NEXT: vpbroadcastq 24(%r8), %ymm2 ; AVX512-NEXT: vpbroadcastq 32(%r8), %ymm25 ; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm2, %zmm2 -; AVX512-NEXT: vpternlogd $184, %zmm27, %zmm18, %zmm2 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm18 & zmm27) | (zmm2 & ~zmm18 & ~zmm27) | (zmm2 & ~zmm18 & zmm27) | (zmm2 & zmm18 & zmm27) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm31 # 64-byte Folded Reload +; AVX512-NEXT: # zmm31 = (~zmm31 & ~zmm18 & mem) | (zmm31 & ~zmm18 & mem) | (zmm31 & zmm18 & ~mem) | (zmm31 & zmm18 & mem) ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm24 # 64-byte Folded Reload +; AVX512-NEXT: # zmm24 = (~zmm24 & ~zmm18 & mem) | (zmm24 & ~zmm18 & mem) | (zmm24 & zmm18 & ~mem) | (zmm24 & zmm18 & mem) ; AVX512-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload ; AVX512-NEXT: # ymm18 = mem[0,1,0,1] ; AVX512-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload @@ -7025,43 +7028,43 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm4, %zmm4 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512-NEXT: 
vpternlogq $248, %zmm21, %zmm31, %zmm23 -; AVX512-NEXT: vpternlogq $248, %zmm21, %zmm24, %zmm26 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = (~zmm23 & zmm31 & zmm21) | (zmm23 & ~zmm31 & ~zmm21) | (zmm23 & ~zmm31 & zmm21) | (zmm23 & zmm31 & ~zmm21) | (zmm23 & zmm31 & zmm21) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm26 = (~zmm26 & zmm24 & zmm21) | (zmm26 & ~zmm24 & ~zmm21) | (zmm26 & ~zmm24 & zmm21) | (zmm26 & zmm24 & ~zmm21) | (zmm26 & zmm24 & zmm21) ; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm18, %zmm18 ; AVX512-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload ; AVX512-NEXT: # zmm21 = mem[0,1,0,1,4,5,4,5] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-NEXT: vpternlogq $226, %zmm18, %zmm24, %zmm21 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & ~zmm24 & zmm18) | (zmm21 & ~zmm24 & zmm18) | (zmm21 & zmm24 & ~zmm18) | (zmm21 & zmm24 & zmm18) ; AVX512-NEXT: vpbroadcastq 64(%r8), %ymm18 ; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm18, %zmm8 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogd $184, %zmm21, %zmm18, %zmm8 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm8 = (~zmm8 & zmm18 & zmm21) | (zmm8 & ~zmm18 & ~zmm21) | (zmm8 & ~zmm18 & zmm21) | (zmm8 & zmm18 & zmm21) ; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm27, %zmm21 ; AVX512-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Folded Reload ; AVX512-NEXT: # zmm22 = mem[0,1,0,1,4,5,4,5] -; AVX512-NEXT: vpternlogq $226, %zmm21, %zmm24, %zmm22 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & ~zmm24 & zmm21) | (zmm22 & ~zmm24 & zmm21) | (zmm22 & zmm24 & ~zmm21) | (zmm22 & zmm24 & zmm21) ; AVX512-NEXT: vpbroadcastq (%r8), %ymm21 ; AVX512-NEXT: vinserti64x4 
$1, %ymm10, %zmm21, %zmm10 -; AVX512-NEXT: vpternlogd $184, %zmm22, %zmm18, %zmm10 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm10 = (~zmm10 & zmm18 & zmm22) | (zmm10 & ~zmm18 & ~zmm22) | (zmm10 & ~zmm18 & zmm22) | (zmm10 & zmm18 & zmm22) ; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 ; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 -; AVX512-NEXT: vpternlogq $226, %zmm13, %zmm24, %zmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & ~zmm24 & zmm13) | (zmm11 & ~zmm24 & zmm13) | (zmm11 & zmm24 & ~zmm13) | (zmm11 & zmm24 & zmm13) ; AVX512-NEXT: vpbroadcastq 112(%r8), %ymm12 ; AVX512-NEXT: vpbroadcastq 120(%r8), %ymm13 ; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512-NEXT: vpternlogd $184, %zmm11, %zmm16, %zmm12 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm12 = (~zmm12 & zmm16 & zmm11) | (zmm12 & ~zmm16 & ~zmm11) | (zmm12 & ~zmm16 & zmm11) | (zmm12 & zmm16 & zmm11) ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512-NEXT: vpternlogq $226, %zmm7, %zmm24, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm24 & zmm7) | (zmm5 & ~zmm24 & zmm7) | (zmm5 & zmm24 & ~zmm7) | (zmm5 & zmm24 & zmm7) ; AVX512-NEXT: vpbroadcastq 48(%r8), %ymm6 ; AVX512-NEXT: vpbroadcastq 56(%r8), %ymm7 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512-NEXT: vpternlogd $184, %zmm5, %zmm16, %zmm6 -; AVX512-NEXT: vpternlogq $226, %zmm30, %zmm19, %zmm9 -; AVX512-NEXT: vpternlogq $226, %zmm20, %zmm19, %zmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm16 & zmm5) | (zmm6 & ~zmm16 & ~zmm5) | (zmm6 & ~zmm16 & zmm5) | (zmm6 & zmm16 & zmm5) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & ~zmm19 & zmm30) | (zmm9 & ~zmm19 & zmm30) | (zmm9 & zmm19 & ~zmm30) | (zmm9 & zmm19 & zmm30) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm19 & zmm20) | (zmm0 & ~zmm19 & zmm20) | (zmm0 & zmm19 & ~zmm20) | (zmm0 & zmm19 & zmm20) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $248, %zmm5, %zmm9, %zmm17 -; AVX512-NEXT: vpternlogq $248, %zmm5, %zmm0, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm9 & zmm5) | (zmm17 & ~zmm9 & ~zmm5) | (zmm17 & ~zmm9 & zmm5) | (zmm17 & zmm9 & ~zmm5) | (zmm17 & zmm9 & zmm5) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm0 & zmm5) | (zmm4 & ~zmm0 & ~zmm5) | (zmm4 & ~zmm0 & zmm5) | (zmm4 & zmm0 & ~zmm5) | (zmm4 & zmm0 & zmm5) ; AVX512-NEXT: vmovdqa64 %zmm4, 384(%r9) ; AVX512-NEXT: vmovdqa64 %zmm17, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm6, 256(%r9) @@ -7184,7 +7187,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm20 = zmm2[0,1,0,1,4,5,4,5] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm20 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & ~zmm31 & zmm1) | (zmm20 & ~zmm31 & zmm1) | (zmm20 & zmm31 & ~zmm1) | (zmm20 & zmm31 & zmm1) ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 @@ -7194,7 +7197,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm21 = zmm4[0,1,0,1,4,5,4,5] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm21 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & ~zmm31 & zmm1) | (zmm21 & ~zmm31 & zmm1) | (zmm21 & zmm31 & ~zmm1) | (zmm21 & zmm31 & zmm1) ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm2 ; 
AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 @@ -7243,11 +7246,11 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [2,3,2,3,10,10,11,10] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm31 & zmm1) | (zmm2 & ~zmm31 & zmm1) | (zmm2 & zmm31 & ~zmm1) | (zmm2 & zmm31 & zmm1) ; AVX512-FCP-NEXT: vpbroadcastq 112(%r8), %ymm0 ; AVX512-FCP-NEXT: vpbroadcastq 120(%r8), %ymm1 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm14 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm2, %zmm12, %zmm14 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm14 = (~zmm14 & zmm12 & zmm2) | (zmm14 & ~zmm12 & ~zmm2) | (zmm14 & ~zmm12 & zmm2) | (zmm14 & zmm12 & zmm2) ; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm8 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm23 @@ -7288,11 +7291,11 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm29[1,1,1,2,5,5,5,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1],ymm7[2,3],ymm15[4],ymm7[5],ymm15[6],ymm7[7,8],ymm15[9],ymm7[10,11],ymm15[12],ymm7[13],ymm15[14],ymm7[15] ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm15 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm11, %zmm31, %zmm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & ~zmm31 & zmm11) | (zmm15 & ~zmm31 & zmm11) | (zmm15 & zmm31 & ~zmm11) | (zmm15 & zmm31 & zmm11) ; AVX512-FCP-NEXT: vpbroadcastq 48(%r8), %ymm7 ; AVX512-FCP-NEXT: vpbroadcastq 56(%r8), %ymm11 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 -; AVX512-FCP-NEXT: vpternlogd $184, 
%zmm15, %zmm12, %zmm7 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm12 & zmm15) | (zmm7 & ~zmm12 & ~zmm15) | (zmm7 & ~zmm12 & zmm15) | (zmm7 & zmm12 & zmm15) ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] @@ -7315,7 +7318,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [2,3,2,2,8,9,8,9] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm0, %zmm1, %zmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm1 & zmm0) | (zmm8 & ~zmm1 & zmm0) | (zmm8 & zmm1 & ~zmm0) | (zmm8 & zmm1 & zmm0) ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] @@ -7329,7 +7332,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpbroadcastq 96(%r8), %ymm5 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512-FCP-NEXT: vpternlogd $184, %zmm8, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm5 & zmm8) | (zmm0 & ~zmm5 & ~zmm8) | (zmm0 & ~zmm5 & zmm8) | (zmm0 & zmm5 & zmm8) ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm8 ; AVX512-FCP-NEXT: vpunpcklwd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] @@ -7340,31 +7343,34 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm28[2,3,2,3,6,7,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10],ymm11[11],ymm6[12,13],ymm11[14],ymm6[15] ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm4, %zmm1, %zmm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & ~zmm1 & zmm4) | (zmm6 & ~zmm1 & zmm4) | (zmm6 & zmm1 & ~zmm4) | (zmm6 & zmm1 & zmm4) ; AVX512-FCP-NEXT: vpbroadcastq 24(%r8), %ymm4 ; AVX512-FCP-NEXT: vpbroadcastq 32(%r8), %ymm8 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm6, %zmm5, %zmm4 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm5 & zmm6) | (zmm4 & ~zmm5 & ~zmm6) | (zmm4 & ~zmm5 & zmm6) | (zmm4 & zmm5 & zmm6) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm3 = (~zmm3 & ~zmm5 & mem) | (zmm3 & ~zmm5 & mem) | (zmm3 & zmm5 & ~mem) | (zmm3 & zmm5 & mem) ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm16 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm16 = (~zmm16 & ~zmm5 & mem) | (zmm16 & ~zmm5 & mem) | (zmm16 & zmm5 & ~mem) | (zmm16 & zmm5 & mem) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $248, %zmm5, %zmm3, %zmm8 -; AVX512-FCP-NEXT: vpternlogq $248, %zmm5, %zmm16, %zmm17 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & zmm3 & zmm5) | (zmm8 & ~zmm3 & ~zmm5) | (zmm8 & ~zmm3 & zmm5) | (zmm8 & zmm3 & ~zmm5) | (zmm8 & zmm3 & zmm5) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm16 & zmm5) | (zmm17 & ~zmm16 & ~zmm5) | (zmm17 & ~zmm16 & zmm5) | (zmm17 & zmm16 & ~zmm5) | (zmm17 & zmm16 & zmm5) ; AVX512-FCP-NEXT: vpbroadcastq 64(%r8), %ymm5 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogd $184, %zmm20, %zmm5, %zmm2 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm5 & zmm20) | (zmm2 & ~zmm5 & ~zmm20) | (zmm2 & ~zmm5 & zmm20) | (zmm2 & zmm5 & zmm20) ; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm21, %zmm5, %zmm6 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm5 & zmm21) | (zmm6 & ~zmm5 & ~zmm21) | (zmm6 & ~zmm5 & zmm21) | (zmm6 & zmm5 & zmm21) ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm25 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $226, %zmm19, %zmm1, %zmm9 +; AVX512-FCP-NEXT: # zmm25 = (~zmm25 & ~zmm1 & mem) | (zmm25 & ~zmm1 & mem) | (zmm25 & zmm1 & ~mem) | (zmm25 & zmm1 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & ~zmm1 & zmm19) | (zmm9 & ~zmm1 & zmm19) | (zmm9 & zmm1 & ~zmm19) | (zmm9 & zmm1 & zmm19) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $248, %zmm1, %zmm25, %zmm27 -; AVX512-FCP-NEXT: vpternlogq $248, %zmm1, %zmm9, %zmm24 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & zmm25 & zmm1) | (zmm27 & ~zmm25 & ~zmm1) | (zmm27 & ~zmm25 & zmm1) | (zmm27 & zmm25 & ~zmm1) | (zmm27 & zmm25 & zmm1) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = (~zmm24 & zmm9 & zmm1) | (zmm24 & ~zmm9 & ~zmm1) | (zmm24 & ~zmm9 & zmm1) | (zmm24 & zmm9 & ~zmm1) | (zmm24 & zmm9 & zmm1) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 128(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 256(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 448(%r9) @@ -7611,20 +7617,23 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm4 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm31 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm31 = (~zmm31 & ~zmm19 & mem) | (zmm31 & ~zmm19 & mem) | (zmm31 & zmm19 & ~mem) | (zmm31 & zmm19 & mem) ; AVX512DQ-NEXT: vpbroadcastq 88(%r8), %ymm1 ; AVX512DQ-NEXT: vpbroadcastq 96(%r8), %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512DQ-NEXT: vpternlogd $184, %zmm31, %zmm18, %zmm1 -; AVX512DQ-NEXT: vpternlogq $226, %zmm25, %zmm19, %zmm27 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm18 & zmm31) | (zmm1 & ~zmm18 & ~zmm31) | (zmm1 & ~zmm18 & zmm31) | (zmm1 & zmm18 & zmm31) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm27 = 
(~zmm27 & ~zmm19 & zmm25) | (zmm27 & ~zmm19 & zmm25) | (zmm27 & zmm19 & ~zmm25) | (zmm27 & zmm19 & zmm25) ; AVX512DQ-NEXT: vpbroadcastq 24(%r8), %ymm2 ; AVX512DQ-NEXT: vpbroadcastq 32(%r8), %ymm25 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpternlogd $184, %zmm27, %zmm18, %zmm2 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm18 & zmm27) | (zmm2 & ~zmm18 & ~zmm27) | (zmm2 & ~zmm18 & zmm27) | (zmm2 & zmm18 & zmm27) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm31 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm31 = (~zmm31 & ~zmm18 & mem) | (zmm31 & ~zmm18 & mem) | (zmm31 & zmm18 & ~mem) | (zmm31 & zmm18 & mem) ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm24 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm24 = (~zmm24 & ~zmm18 & mem) | (zmm24 & ~zmm18 & mem) | (zmm24 & zmm18 & ~mem) | (zmm24 & zmm18 & mem) ; AVX512DQ-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm18 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload @@ -7652,43 +7661,43 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm29, %zmm4, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq $248, %zmm21, %zmm31, %zmm23 -; AVX512DQ-NEXT: vpternlogq $248, %zmm21, %zmm24, %zmm26 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = 
(~zmm23 & zmm31 & zmm21) | (zmm23 & ~zmm31 & ~zmm21) | (zmm23 & ~zmm31 & zmm21) | (zmm23 & zmm31 & ~zmm21) | (zmm23 & zmm31 & zmm21) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm26 = (~zmm26 & zmm24 & zmm21) | (zmm26 & ~zmm24 & ~zmm21) | (zmm26 & ~zmm24 & zmm21) | (zmm26 & zmm24 & ~zmm21) | (zmm26 & zmm24 & zmm21) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm18, %zmm18 ; AVX512DQ-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm21 = mem[0,1,0,1,4,5,4,5] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm18, %zmm24, %zmm21 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & ~zmm24 & zmm18) | (zmm21 & ~zmm24 & zmm18) | (zmm21 & zmm24 & ~zmm18) | (zmm21 & zmm24 & zmm18) ; AVX512DQ-NEXT: vpbroadcastq 64(%r8), %ymm18 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm18, %zmm8 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogd $184, %zmm21, %zmm18, %zmm8 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm8 = (~zmm8 & zmm18 & zmm21) | (zmm8 & ~zmm18 & ~zmm21) | (zmm8 & ~zmm18 & zmm21) | (zmm8 & zmm18 & zmm21) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm28, %zmm27, %zmm21 ; AVX512DQ-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm22 = mem[0,1,0,1,4,5,4,5] -; AVX512DQ-NEXT: vpternlogq $226, %zmm21, %zmm24, %zmm22 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & ~zmm24 & zmm21) | (zmm22 & ~zmm24 & zmm21) | (zmm22 & zmm24 & ~zmm21) | (zmm22 & zmm24 & zmm21) ; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm21 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm21, %zmm10 -; AVX512DQ-NEXT: vpternlogd $184, %zmm22, %zmm18, %zmm10 +; AVX512DQ-NEXT: 
vpternlogd {{.*#+}} zmm10 = (~zmm10 & zmm18 & zmm22) | (zmm10 & ~zmm18 & ~zmm22) | (zmm10 & ~zmm18 & zmm22) | (zmm10 & zmm18 & zmm22) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 -; AVX512DQ-NEXT: vpternlogq $226, %zmm13, %zmm24, %zmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & ~zmm24 & zmm13) | (zmm11 & ~zmm24 & zmm13) | (zmm11 & zmm24 & ~zmm13) | (zmm11 & zmm24 & zmm13) ; AVX512DQ-NEXT: vpbroadcastq 112(%r8), %ymm12 ; AVX512DQ-NEXT: vpbroadcastq 120(%r8), %ymm13 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512DQ-NEXT: vpternlogd $184, %zmm11, %zmm16, %zmm12 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm12 = (~zmm12 & zmm16 & zmm11) | (zmm12 & ~zmm16 & ~zmm11) | (zmm12 & ~zmm16 & zmm11) | (zmm12 & zmm16 & zmm11) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512DQ-NEXT: vpternlogq $226, %zmm7, %zmm24, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm24 & zmm7) | (zmm5 & ~zmm24 & zmm7) | (zmm5 & zmm24 & ~zmm7) | (zmm5 & zmm24 & zmm7) ; AVX512DQ-NEXT: vpbroadcastq 48(%r8), %ymm6 ; AVX512DQ-NEXT: vpbroadcastq 56(%r8), %ymm7 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512DQ-NEXT: vpternlogd $184, %zmm5, %zmm16, %zmm6 -; AVX512DQ-NEXT: vpternlogq $226, %zmm30, %zmm19, %zmm9 -; AVX512DQ-NEXT: vpternlogq $226, %zmm20, %zmm19, %zmm0 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm16 & zmm5) | (zmm6 & ~zmm16 & ~zmm5) | (zmm6 & ~zmm16 & zmm5) | (zmm6 & zmm16 & zmm5) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & ~zmm19 & zmm30) | (zmm9 & ~zmm19 & zmm30) | (zmm9 & zmm19 & ~zmm30) | (zmm9 & zmm19 & zmm30) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm19 & zmm20) | (zmm0 & ~zmm19 & zmm20) | (zmm0 & zmm19 & ~zmm20) | (zmm0 & zmm19 & zmm20) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $248, %zmm5, %zmm9, %zmm17 -; AVX512DQ-NEXT: vpternlogq $248, %zmm5, %zmm0, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm9 & zmm5) | (zmm17 & ~zmm9 & ~zmm5) | (zmm17 & ~zmm9 & zmm5) | (zmm17 & zmm9 & ~zmm5) | (zmm17 & zmm9 & zmm5) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm0 & zmm5) | (zmm4 & ~zmm0 & ~zmm5) | (zmm4 & ~zmm0 & zmm5) | (zmm4 & zmm0 & ~zmm5) | (zmm4 & zmm0 & zmm5) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 384(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 64(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 256(%r9) @@ -7811,7 +7820,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm20 = zmm2[0,1,0,1,4,5,4,5] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm20 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & ~zmm31 & zmm1) | (zmm20 & ~zmm31 & zmm1) | (zmm20 & zmm31 & ~zmm1) | (zmm20 & zmm31 & zmm1) ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 @@ -7821,7 +7830,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm1 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm21 = zmm4[0,1,0,1,4,5,4,5] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm21 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & ~zmm31 & zmm1) | (zmm21 & ~zmm31 & zmm1) | (zmm21 & zmm31 & ~zmm1) | (zmm21 & zmm31 & zmm1) ; 
AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm28 @@ -7870,11 +7879,11 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [2,3,2,3,10,10,11,10] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm31 & zmm1) | (zmm2 & ~zmm31 & zmm1) | (zmm2 & zmm31 & ~zmm1) | (zmm2 & zmm31 & zmm1) ; AVX512DQ-FCP-NEXT: vpbroadcastq 112(%r8), %ymm0 ; AVX512DQ-FCP-NEXT: vpbroadcastq 120(%r8), %ymm1 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm2, %zmm12, %zmm14 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm14 = (~zmm14 & zmm12 & zmm2) | (zmm14 & ~zmm12 & ~zmm2) | (zmm14 & ~zmm12 & zmm2) | (zmm14 & zmm12 & zmm2) ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm8 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm23 @@ -7915,11 +7924,11 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm29[1,1,1,2,5,5,5,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1],ymm7[2,3],ymm15[4],ymm7[5],ymm15[6],ymm7[7,8],ymm15[9],ymm7[10,11],ymm15[12],ymm7[13],ymm15[14],ymm7[15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm15 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm11, %zmm31, %zmm15 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & ~zmm31 & zmm11) | (zmm15 & ~zmm31 & zmm11) | (zmm15 & zmm31 & ~zmm11) | (zmm15 & zmm31 & zmm11) ; AVX512DQ-FCP-NEXT: vpbroadcastq 48(%r8), %ymm7 ; AVX512DQ-FCP-NEXT: vpbroadcastq 56(%r8), %ymm11 ; 
AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm15, %zmm12, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm12 & zmm15) | (zmm7 & ~zmm12 & ~zmm15) | (zmm7 & ~zmm12 & zmm15) | (zmm7 & zmm12 & zmm15) ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] @@ -7942,7 +7951,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [2,3,2,2,8,9,8,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm0, %zmm1, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm1 & zmm0) | (zmm8 & ~zmm1 & zmm0) | (zmm8 & zmm1 & ~zmm0) | (zmm8 & zmm1 & zmm0) ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] @@ -7956,7 +7965,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpbroadcastq 96(%r8), %ymm5 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm8, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm5 & zmm8) | (zmm0 & ~zmm5 & ~zmm8) | 
(zmm0 & ~zmm5 & zmm8) | (zmm0 & zmm5 & zmm8) ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm8 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] @@ -7967,31 +7976,34 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm28[2,3,2,3,6,7,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10],ymm11[11],ymm6[12,13],ymm11[14],ymm6[15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm12, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm4, %zmm1, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & ~zmm1 & zmm4) | (zmm6 & ~zmm1 & zmm4) | (zmm6 & zmm1 & ~zmm4) | (zmm6 & zmm1 & zmm4) ; AVX512DQ-FCP-NEXT: vpbroadcastq 24(%r8), %ymm4 ; AVX512DQ-FCP-NEXT: vpbroadcastq 32(%r8), %ymm8 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm6, %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm5 & zmm6) | (zmm4 & ~zmm5 & ~zmm6) | (zmm4 & ~zmm5 & zmm6) | (zmm4 & zmm5 & zmm6) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm3 = (~zmm3 & ~zmm5 & mem) | (zmm3 & ~zmm5 & mem) | (zmm3 & zmm5 & ~mem) | (zmm3 & zmm5 & mem) ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm16 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm16 = (~zmm16 & ~zmm5 & mem) | (zmm16 & ~zmm5 & mem) | (zmm16 & zmm5 & ~mem) | (zmm16 & zmm5 & mem) ; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $248, %zmm5, %zmm3, %zmm8 -; AVX512DQ-FCP-NEXT: vpternlogq $248, %zmm5, %zmm16, %zmm17 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & zmm3 & zmm5) | (zmm8 & ~zmm3 & ~zmm5) | (zmm8 & ~zmm3 & zmm5) | (zmm8 & zmm3 & ~zmm5) | (zmm8 & zmm3 & zmm5) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm16 & zmm5) | (zmm17 & ~zmm16 & ~zmm5) | (zmm17 & ~zmm16 & zmm5) | (zmm17 & zmm16 & ~zmm5) | (zmm17 & zmm16 & zmm5) ; AVX512DQ-FCP-NEXT: vpbroadcastq 64(%r8), %ymm5 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm20, %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm5 & zmm20) | (zmm2 & ~zmm5 & ~zmm20) | (zmm2 & ~zmm5 & zmm20) | (zmm2 & zmm5 & zmm20) ; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm21, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm5 & zmm21) | (zmm6 & ~zmm5 & ~zmm21) | (zmm6 & ~zmm5 & zmm21) | (zmm6 & zmm5 & zmm21) ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm25 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm19, %zmm1, %zmm9 +; AVX512DQ-FCP-NEXT: # zmm25 = (~zmm25 & ~zmm1 & mem) | (zmm25 & ~zmm1 & mem) | (zmm25 & zmm1 & ~mem) | (zmm25 & zmm1 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & ~zmm1 & zmm19) | (zmm9 & ~zmm1 & zmm19) | (zmm9 & zmm1 & ~zmm19) | (zmm9 & 
zmm1 & zmm19) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $248, %zmm1, %zmm25, %zmm27 -; AVX512DQ-FCP-NEXT: vpternlogq $248, %zmm1, %zmm9, %zmm24 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & zmm25 & zmm1) | (zmm27 & ~zmm25 & ~zmm1) | (zmm27 & ~zmm25 & zmm1) | (zmm27 & zmm25 & ~zmm1) | (zmm27 & zmm25 & zmm1) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = (~zmm24 & zmm9 & zmm1) | (zmm24 & ~zmm9 & ~zmm1) | (zmm24 & ~zmm9 & zmm1) | (zmm24 & zmm9 & ~zmm1) | (zmm24 & zmm9 & zmm1) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 128(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 256(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 448(%r9) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll index c725dcd972cd59..c93d7b7a720c3e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -4128,19 +4128,19 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogd $184, %zmm16, %zmm9, %zmm8 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm8 = (~zmm8 & zmm9 & zmm16) | (zmm8 & ~zmm9 & ~zmm16) | (zmm8 & ~zmm9 & zmm16) | (zmm8 & zmm9 & zmm16) ; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 -; AVX512-NEXT: vpternlogd $184, %zmm17, %zmm9, %zmm13 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm13 = (~zmm13 & zmm9 & zmm17) | (zmm13 & ~zmm9 & ~zmm17) | (zmm13 & ~zmm9 & 
zmm17) | (zmm13 & zmm9 & zmm17) ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogd $184, %zmm2, %zmm5, %zmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm5 & zmm2) | (zmm0 & ~zmm5 & ~zmm2) | (zmm0 & ~zmm5 & zmm2) | (zmm0 & zmm5 & zmm2) ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm2 -; AVX512-NEXT: vpternlogd $184, %zmm4, %zmm5, %zmm2 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm5 & zmm4) | (zmm2 & ~zmm5 & ~zmm4) | (zmm2 & ~zmm5 & zmm4) | (zmm2 & zmm5 & zmm4) ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm4 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogd $184, %zmm12, %zmm5, %zmm4 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm5 & zmm12) | (zmm4 & ~zmm5 & ~zmm12) | (zmm4 & ~zmm5 & zmm12) | (zmm4 & zmm5 & zmm12) ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3 -; AVX512-NEXT: vpternlogd $184, %zmm1, %zmm5, %zmm3 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm5 & zmm1) | (zmm3 & ~zmm5 & ~zmm1) | (zmm3 & ~zmm5 & zmm1) | (zmm3 & zmm5 & zmm1) ; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rax) ; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rax) @@ -4337,23 +4337,23 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm18, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; 
AVX512-FCP-NEXT: vpternlogd $184, %zmm1, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm2 & zmm1) | (zmm0 & ~zmm2 & ~zmm1) | (zmm0 & ~zmm2 & zmm1) | (zmm0 & zmm2 & zmm1) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 256(%rax) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm22, %zmm0 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm0, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm8 = (~zmm8 & zmm2 & zmm0) | (zmm8 & ~zmm2 & ~zmm0) | (zmm8 & ~zmm2 & zmm0) | (zmm8 & zmm2 & zmm0) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogd $184, %zmm21, %zmm0, %zmm15 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm15 = (~zmm15 & zmm0 & zmm21) | (zmm15 & ~zmm0 & ~zmm21) | (zmm15 & ~zmm0 & zmm21) | (zmm15 & zmm0 & zmm21) ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rax) -; AVX512-FCP-NEXT: vpternlogd $184, %zmm19, %zmm0, %zmm14 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm14 = (~zmm14 & zmm0 & zmm19) | (zmm14 & ~zmm0 & ~zmm19) | (zmm14 & ~zmm0 & zmm19) | (zmm14 & zmm0 & zmm19) ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 192(%rax) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogd $184, %zmm17, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm0 & zmm17) | (zmm1 & ~zmm0 & ~zmm17) | (zmm1 & ~zmm0 & zmm17) | (zmm1 & zmm0 & zmm17) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogd $184, %zmm16, %zmm0, %zmm1 +; 
AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm0 & zmm16) | (zmm1 & ~zmm0 & ~zmm16) | (zmm1 & ~zmm0 & zmm16) | (zmm1 & zmm0 & zmm16) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512-FCP-NEXT: popq %rax ; AVX512-FCP-NEXT: vzeroupper @@ -4568,19 +4568,19 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogd $184, %zmm16, %zmm11, %zmm10 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm10 = (~zmm10 & zmm11 & zmm16) | (zmm10 & ~zmm11 & ~zmm16) | (zmm10 & ~zmm11 & zmm16) | (zmm10 & zmm11 & zmm16) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm23, %zmm21, %zmm12 -; AVX512DQ-NEXT: vpternlogd $184, %zmm17, %zmm11, %zmm12 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm12 = (~zmm12 & zmm11 & zmm17) | (zmm12 & ~zmm11 & ~zmm17) | (zmm12 & ~zmm11 & zmm17) | (zmm12 & zmm11 & zmm17) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogd $184, %zmm22, %zmm4, %zmm0 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm4 & zmm22) | (zmm0 & ~zmm4 & ~zmm22) | (zmm0 & ~zmm4 & zmm22) | (zmm0 & zmm4 & zmm22) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpternlogd $184, %zmm18, %zmm4, %zmm7 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm4 & zmm18) | (zmm7 & ~zmm4 & ~zmm18) | (zmm7 & ~zmm4 & zmm18) | (zmm7 & zmm4 & zmm18) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogd $184, %zmm2, %zmm5, %zmm4 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm5 & zmm2) | (zmm4 & ~zmm5 & ~zmm2) | (zmm4 & ~zmm5 & zmm2) | (zmm4 & zmm5 & zmm2) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 -; AVX512DQ-NEXT: vpternlogd $184, %zmm3, %zmm5, %zmm1 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm5 & zmm3) | (zmm1 & ~zmm5 & ~zmm3) | (zmm1 & ~zmm5 & zmm3) | (zmm1 & zmm5 & zmm3) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 256(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rax) @@ -4787,22 +4787,22 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,0,2,1,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm4, %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm5 & zmm4) | (zmm6 & ~zmm5 & ~zmm4) | (zmm6 & ~zmm5 & zmm4) | (zmm6 & zmm5 & zmm4) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm3, %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm5 & zmm3) | (zmm2 & ~zmm5 & ~zmm3) | (zmm2 & ~zmm5 & zmm3) | (zmm2 & zmm5 & zmm3) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm0, 
%zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm2 & zmm0) | (zmm1 & ~zmm2 & ~zmm0) | (zmm1 & ~zmm2 & zmm0) | (zmm1 & zmm2 & zmm0) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm18, %zmm2, %zmm14 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm14 = (~zmm14 & zmm2 & zmm18) | (zmm14 & ~zmm2 & ~zmm18) | (zmm14 & ~zmm2 & zmm18) | (zmm14 & zmm2 & zmm18) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 320(%rax) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm0, %zmm1, %zmm13 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm13 = (~zmm13 & zmm1 & zmm0) | (zmm13 & ~zmm1 & ~zmm0) | (zmm13 & ~zmm1 & zmm0) | (zmm13 & zmm1 & zmm0) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm0, %zmm1, %zmm15 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm15 = (~zmm15 & zmm1 & zmm0) | (zmm15 & ~zmm1 & ~zmm0) | (zmm15 & ~zmm1 & zmm0) | (zmm15 & zmm1 & zmm0) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 64(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -8710,6 +8710,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm24 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX512-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm24 # 64-byte Folded Reload +; AVX512-NEXT: # zmm24 = (~zmm24 & zmm25 & mem) | (zmm24 & ~zmm25 & ~mem) | (zmm24 & ~zmm25 & mem) | (zmm24 & zmm25 & mem) ; AVX512-NEXT: vinserti64x4 
$1, %ymm27, %zmm26, %zmm26 ; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7] @@ -8722,6 +8723,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,1,4,5,6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] ; AVX512-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm26 # 64-byte Folded Reload +; AVX512-NEXT: # zmm26 = (~zmm26 & zmm25 & mem) | (zmm26 & ~zmm25 & ~mem) | (zmm26 & ~zmm25 & mem) | (zmm26 & zmm25 & mem) ; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,0,2,1,4,5,6,7] @@ -8736,26 +8738,30 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm17 # 64-byte Folded Reload +; AVX512-NEXT: # zmm17 = (~zmm17 & zmm25 & mem) | (zmm17 & ~zmm25 & ~mem) | (zmm17 & ~zmm25 & mem) | (zmm17 & zmm25 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm18, %zmm16 ; AVX512-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm16 # 64-byte Folded Reload +; AVX512-NEXT: # zmm16 = (~zmm16 & zmm25 & mem) | (zmm16 & ~zmm25 & ~mem) | (zmm16 & ~zmm25 & mem) | (zmm16 & zmm25 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm23, %zmm18 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm18 # 64-byte Folded Reload +; AVX512-NEXT: # zmm18 = (~zmm18 & zmm23 & mem) | (zmm18 & ~zmm23 & ~mem) | (zmm18 & ~zmm23 & mem) | (zmm18 & zmm23 & mem) ; 
AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm11, %zmm4 ; AVX512-NEXT: vpternlogd $184, (%rsp), %zmm23, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: # zmm4 = (~zmm4 & zmm23 & mem) | (zmm4 & ~zmm23 & ~mem) | (zmm4 & ~zmm23 & mem) | (zmm4 & zmm23 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm7 -; AVX512-NEXT: vpternlogd $184, %zmm20, %zmm23, %zmm7 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm23 & zmm20) | (zmm7 & ~zmm23 & ~zmm20) | (zmm7 & ~zmm23 & zmm20) | (zmm7 & zmm23 & zmm20) ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 -; AVX512-NEXT: vpternlogd $184, %zmm19, %zmm23, %zmm5 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm23 & zmm19) | (zmm5 & ~zmm23 & ~zmm19) | (zmm5 & ~zmm23 & zmm19) | (zmm5 & zmm23 & zmm19) ; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm8 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogd $184, %zmm22, %zmm10, %zmm8 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm8 = (~zmm8 & zmm10 & zmm22) | (zmm8 & ~zmm10 & ~zmm22) | (zmm8 & ~zmm10 & zmm22) | (zmm8 & zmm10 & zmm22) ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm6 -; AVX512-NEXT: vpternlogd $184, %zmm28, %zmm10, %zmm6 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm10 & zmm28) | (zmm6 & ~zmm10 & ~zmm28) | (zmm6 & ~zmm10 & zmm28) | (zmm6 & zmm10 & zmm28) ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 -; AVX512-NEXT: vpternlogd $184, %zmm15, %zmm10, %zmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm10 & zmm15) | (zmm0 & ~zmm10 & ~zmm15) | (zmm0 & ~zmm10 & zmm15) | (zmm0 & zmm10 & zmm15) ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-NEXT: vpternlogd $184, %zmm1, %zmm10, %zmm2 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm10 & zmm1) | (zmm2 & ~zmm10 & ~zmm1) | (zmm2 & ~zmm10 & zmm1) | (zmm2 & zmm10 & zmm1) ; AVX512-NEXT: movq 
{{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax) @@ -9166,40 +9172,45 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm28, %zmm6 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm27, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogd $184, %zmm8, %zmm9, %zmm6 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm9 & zmm8) | (zmm6 & ~zmm9 & ~zmm8) | (zmm6 & ~zmm9 & zmm8) | (zmm6 & zmm9 & zmm8) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 256(%rax) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm0, %zmm9, %zmm12 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm12 = (~zmm12 & zmm9 & zmm0) | (zmm12 & ~zmm9 & ~zmm0) | (zmm12 & ~zmm9 & zmm0) | (zmm12 & zmm9 & zmm0) ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 448(%rax) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm29, %zmm0 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm0, %zmm9, %zmm11 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm11 = (~zmm11 & zmm9 & zmm0) | (zmm11 & ~zmm9 & ~zmm0) | (zmm11 & ~zmm9 & zmm0) | (zmm11 & zmm9 & zmm0) ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 640(%rax) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm0, %zmm9, %zmm3 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm9 & zmm0) | (zmm3 & ~zmm9 & ~zmm0) | (zmm3 & ~zmm9 & zmm0) | (zmm3 & zmm9 & zmm0) ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogd $184, 
%zmm17, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm0 & zmm17) | (zmm4 & ~zmm0 & ~zmm17) | (zmm4 & ~zmm0 & zmm17) | (zmm4 & zmm0 & zmm17) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512-FCP-NEXT: vpternlogd $184, %zmm16, %zmm0, %zmm5 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm0 & zmm16) | (zmm5 & ~zmm0 & ~zmm16) | (zmm5 & ~zmm0 & zmm16) | (zmm5 & zmm0 & zmm16) ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512-FCP-NEXT: vpternlogd $184, %zmm25, %zmm1, %zmm13 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm13 = (~zmm13 & zmm1 & zmm25) | (zmm13 & ~zmm1 & ~zmm25) | (zmm13 & ~zmm1 & zmm25) | (zmm13 & zmm1 & zmm25) ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 128(%rax) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm2 = (~zmm2 & zmm1 & mem) | (zmm2 & ~zmm1 & ~mem) | (zmm2 & ~zmm1 & mem) | (zmm2 & zmm1 & mem) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm2 = (~zmm2 & zmm0 & mem) | (zmm2 & ~zmm0 & ~mem) | (zmm2 & ~zmm0 & mem) | (zmm2 & zmm0 & mem) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 384(%rax) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm2 = (~zmm2 & zmm0 & mem) | (zmm2 & ~zmm0 & ~mem) | (zmm2 & ~zmm0 & mem) | (zmm2 & zmm0 & mem) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 576(%rax) ; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & ~mem) | (zmm0 & ~zmm1 & mem) | (zmm0 & zmm1 & mem) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 512(%rax) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & ~mem) | (zmm0 & ~zmm1 & mem) | (zmm0 & zmm1 & mem) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 704(%rax) ; AVX512-FCP-NEXT: addq $1240, %rsp # imm = 0x4D8 ; AVX512-FCP-NEXT: vzeroupper @@ -9545,6 +9556,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] ; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm16 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm16 = (~zmm16 & zmm28 & mem) | (zmm16 & ~zmm28 & ~mem) | (zmm16 & ~zmm28 & mem) | (zmm16 & zmm28 & mem) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,2,2,3] ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm2, %zmm3 {%k1} ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm1 @@ -9621,6 +9633,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm1 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm1 = (~zmm1 & zmm28 & mem) | (zmm1 & ~zmm28 & ~mem) | (zmm1 & ~zmm28 & mem) | (zmm1 & zmm28 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, 
%ymm12, %zmm0, %zmm12 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm2, %zmm2 ; AVX512DQ-NEXT: vmovdqa (%r9), %ymm15 @@ -9651,26 +9664,31 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] ; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm12 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm12 = (~zmm12 & zmm28 & mem) | (zmm12 & ~zmm28 & ~mem) | (zmm12 & ~zmm28 & mem) | (zmm12 & zmm28 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm30, %zmm29, %zmm17 ; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm17 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm17 = (~zmm17 & zmm28 & mem) | (zmm17 & ~zmm28 & ~mem) | (zmm17 & ~zmm28 & mem) | (zmm17 & zmm28 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm24, %zmm22 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm22 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm22 = (~zmm22 & zmm24 & mem) | (zmm22 & ~zmm24 & ~mem) | (zmm22 & ~zmm24 & mem) | (zmm22 & zmm24 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7 ; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm7 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm7 = (~zmm7 & zmm24 & mem) | (zmm7 & ~zmm24 & ~mem) | (zmm7 & ~zmm24 & mem) | (zmm7 & zmm24 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 ; AVX512DQ-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm3 = (~zmm3 & zmm24 & mem) | (zmm3 & ~zmm24 & ~mem) | (zmm3 & ~zmm24 & mem) | (zmm3 & zmm24 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpternlogd 
$184, %zmm23, %zmm24, %zmm0 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm24 & zmm23) | (zmm0 & ~zmm24 & ~zmm23) | (zmm0 & ~zmm24 & zmm23) | (zmm0 & zmm24 & zmm23) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogd $184, %zmm25, %zmm13, %zmm5 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm13 & zmm25) | (zmm5 & ~zmm13 & ~zmm25) | (zmm5 & ~zmm13 & zmm25) | (zmm5 & zmm13 & zmm25) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm6 -; AVX512DQ-NEXT: vpternlogd $184, %zmm26, %zmm13, %zmm6 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm13 & zmm26) | (zmm6 & ~zmm13 & ~zmm26) | (zmm6 & ~zmm13 & zmm26) | (zmm6 & zmm13 & zmm26) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 -; AVX512DQ-NEXT: vpternlogd $184, %zmm27, %zmm13, %zmm4 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm13 & zmm27) | (zmm4 & ~zmm13 & ~zmm27) | (zmm4 & ~zmm13 & zmm27) | (zmm4 & zmm13 & zmm27) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512DQ-NEXT: vpternlogd $184, %zmm2, %zmm13, %zmm8 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm8 = (~zmm8 & zmm13 & zmm2) | (zmm8 & ~zmm13 & ~zmm2) | (zmm8 & ~zmm13 & zmm2) | (zmm8 & zmm13 & zmm2) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 256(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 448(%rax) @@ -10094,42 +10112,45 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,0,2,1,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm16, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = 
[65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm0, %zmm8, %zmm11 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm11 = (~zmm11 & zmm8 & zmm0) | (zmm11 & ~zmm8 & ~zmm0) | (zmm11 & ~zmm8 & zmm0) | (zmm11 & zmm8 & zmm0) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm10, %zmm8, %zmm12 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm12 = (~zmm12 & zmm8 & zmm10) | (zmm12 & ~zmm8 & ~zmm10) | (zmm12 & ~zmm8 & zmm10) | (zmm12 & zmm8 & zmm10) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm2, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm0 & zmm2) | (zmm7 & ~zmm0 & ~zmm2) | (zmm7 & ~zmm0 & zmm2) | (zmm7 & zmm0 & zmm2) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm27, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm0 & zmm27) | (zmm6 & ~zmm0 & ~zmm27) | (zmm6 & ~zmm0 & zmm27) | (zmm6 & zmm0 & zmm27) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 320(%rax) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm24, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm2, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm6 & zmm2) | (zmm7 & ~zmm6 & ~zmm2) | (zmm7 & ~zmm6 & 
zmm2) | (zmm7 & zmm6 & zmm2) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 256(%rax) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm2, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm6 & zmm2) | (zmm7 & ~zmm6 & ~zmm2) | (zmm7 & ~zmm6 & zmm2) | (zmm7 & zmm6 & zmm2) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 448(%rax) -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm23, %zmm8, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm8 & zmm23) | (zmm5 & ~zmm8 & ~zmm23) | (zmm5 & ~zmm8 & zmm23) | (zmm5 & zmm8 & zmm23) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 384(%rax) ; AVX512DQ-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm4 = (~zmm4 & zmm8 & mem) | (zmm4 & ~zmm8 & ~mem) | (zmm4 & ~zmm8 & mem) | (zmm4 & zmm8 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 576(%rax) ; AVX512DQ-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm3 = (~zmm3 & zmm0 & mem) | (zmm3 & ~zmm0 & ~mem) | (zmm3 & ~zmm0 & mem) | (zmm3 & zmm0 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 512(%rax) ; AVX512DQ-FCP-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm1 = (~zmm1 & zmm0 & mem) | (zmm1 & ~zmm0 & ~mem) | (zmm1 & ~zmm0 & mem) | (zmm1 & zmm0 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 704(%rax) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm0, %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm6 & zmm0) | (zmm1 & ~zmm6 & ~zmm0) | (zmm1 & ~zmm6 & zmm0) | (zmm1 & zmm6 & zmm0) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 
640(%rax) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm0, %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm6 & zmm0) | (zmm1 & ~zmm6 & ~zmm0) | (zmm1 & ~zmm6 & zmm0) | (zmm1 & zmm6 & zmm0) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512DQ-FCP-NEXT: addq $1176, %rsp # imm = 0x498 ; AVX512DQ-FCP-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index dc362d729fcd31..51f173bc1a9bc4 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -647,7 +647,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,12,13,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[6,7],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm0 & ~mem) | (zmm1 & ~zmm0 & mem) | (zmm1 & zmm0 & ~mem) | (zmm1 & zmm0 & mem) ; AVX512-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm0 ; AVX512-NEXT: vmovq %xmm0, 48(%rax) @@ -684,7 +684,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = 
zero,zero,zero,zero,ymm3[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm3[26,27],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,20,21,28,29] -; AVX512-FCP-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm1 & mem) | (ymm3 & ~ymm1 & mem) | (ymm3 & ymm1 & mem) ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,7,1,3,7,0,0,0] ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u] @@ -730,7 +730,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,12,13,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[6,7],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512DQ-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm0 & ~mem) | (zmm1 & ~zmm0 & mem) | (zmm1 & zmm0 & ~mem) | (zmm1 & zmm0 & mem) ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512DQ-NEXT: vextracti32x4 $3, %zmm1, %xmm0 ; AVX512DQ-NEXT: vmovq %xmm0, 48(%rax) @@ -767,7 +767,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm3[26,27],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,20,21,28,29] -; AVX512DQ-FCP-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm1 & mem) | (ymm3 & ~ymm1 & mem) | (ymm3 & ymm1 & mem) ; AVX512DQ-FCP-NEXT: vpmovsxbd 
{{.*#+}} ymm1 = [5,7,1,3,7,0,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u] @@ -1381,8 +1381,8 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpbroadcastd (%r10), %ymm11 ; AVX512-NEXT: vpbroadcastd 4(%r10), %ymm12 ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm10 & ~mem) | (~zmm11 & zmm10 & mem) | (zmm11 & ~zmm10 & mem) | (zmm11 & zmm10 & ~mem) | (zmm11 & zmm10 & mem) +; AVX512-NEXT: vpternlogd {{.*#+}} zmm11 = (~zmm11 & zmm9 & mem) | (zmm11 & ~zmm9 & ~mem) | (zmm11 & zmm9 & ~mem) | (zmm11 & zmm9 & mem) ; AVX512-NEXT: vpsrlq $48, %xmm4, %xmm4 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -1401,8 +1401,8 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1] ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogd $206, 8(%r10){1to8}, %ymm2, %ymm3 -; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512-NEXT: vpternlogd {{.*#+}} ymm3 = (~ymm3 & ~ymm2 & mem) | (~ymm3 & ymm2 & ~mem) | (~ymm3 & ymm2 & mem) | (ymm3 & ymm2 & ~mem) | (ymm3 & ymm2 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm1 & ~mem) | (ymm3 & ~ymm1 & mem) | (ymm3 & ymm1 & ~mem) | 
(ymm3 & ymm1 & mem) ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512-NEXT: vmovdqa %ymm1, 64(%rax) @@ -1441,8 +1441,8 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogd $206, 8(%r10){1to8}, %ymm2, %ymm3 -; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm3 = (~ymm3 & ~ymm2 & mem) | (~ymm3 & ymm2 & ~mem) | (~ymm3 & ymm2 & mem) | (ymm3 & ymm2 & ~mem) | (ymm3 & ymm2 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm1 & ~mem) | (ymm3 & ~ymm1 & mem) | (ymm3 & ymm1 & ~mem) | (ymm3 & ymm1 & mem) ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,2,0,2] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero @@ -1463,8 +1463,8 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpbroadcastd (%r10), %ymm4 ; AVX512-FCP-NEXT: vpbroadcastd 4(%r10), %ymm5 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm3 & ~mem) | (~zmm4 & zmm3 & mem) | (zmm4 & ~zmm3 & mem) | (zmm4 & zmm3 & ~mem) | (zmm4 & zmm3 & mem) +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm2 & mem) | (zmm4 & ~zmm2 & ~mem) | (zmm4 & zmm2 & ~mem) | (zmm4 
& zmm2 & mem) ; AVX512-FCP-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512-FCP-NEXT: vmovdqa %ymm1, 64(%rax) @@ -1506,8 +1506,8 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpbroadcastd (%r10), %ymm11 ; AVX512DQ-NEXT: vpbroadcastd 4(%r10), %ymm12 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm10 & ~mem) | (~zmm11 & zmm10 & mem) | (zmm11 & ~zmm10 & mem) | (zmm11 & zmm10 & ~mem) | (zmm11 & zmm10 & mem) +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm11 = (~zmm11 & zmm9 & mem) | (zmm11 & ~zmm9 & ~mem) | (zmm11 & zmm9 & ~mem) | (zmm11 & zmm9 & mem) ; AVX512DQ-NEXT: vpsrlq $48, %xmm4, %xmm4 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -1526,8 +1526,8 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogd $206, 8(%r10){1to8}, %ymm2, %ymm3 -; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm3 = (~ymm3 & ~ymm2 & mem) | (~ymm3 & ymm2 & ~mem) | (~ymm3 & ymm2 & mem) | (ymm3 & ymm2 & ~mem) | (ymm3 & ymm2 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm1 & ~mem) | (ymm3 & ~ymm1 & mem) | (ymm3 & ymm1 & ~mem) | (ymm3 & ymm1 & mem) ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 ; 
AVX512DQ-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%rax) @@ -1566,8 +1566,8 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $206, 8(%r10){1to8}, %ymm2, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm3 = (~ymm3 & ~ymm2 & mem) | (~ymm3 & ymm2 & ~mem) | (~ymm3 & ymm2 & mem) | (ymm3 & ymm2 & ~mem) | (ymm3 & ymm2 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm1 & ~mem) | (ymm3 & ~ymm1 & mem) | (ymm3 & ymm1 & ~mem) | (ymm3 & ymm1 & mem) ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,2,0,2] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero @@ -1588,8 +1588,8 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpbroadcastd (%r10), %ymm4 ; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%r10), %ymm5 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm3 & ~mem) | (~zmm4 & zmm3 & mem) | (zmm4 & ~zmm3 & mem) | (zmm4 & zmm3 & ~mem) | (zmm4 & zmm3 & mem) +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm2 & mem) | (zmm4 & ~zmm2 & ~mem) | (zmm4 & zmm2 & ~mem) | (zmm4 & zmm2 & mem) ; AVX512DQ-FCP-NEXT: 
vmovdqa %xmm0, 96(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 64(%rax) @@ -2937,27 +2937,27 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm9 -; AVX512-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm9 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm9 = (~zmm9 & zmm7 & ~mem) | (zmm9 & ~zmm7 & mem) | (zmm9 & zmm7 & ~mem) | (zmm9 & zmm7 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm19, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm4 & mem) | (zmm5 & ~zmm4 & ~mem) | (zmm5 & ~zmm4 & mem) | (zmm5 & zmm4 & ~mem) | (zmm5 & zmm4 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm9 & mem) | (zmm5 & ~zmm9 & ~mem) | (zmm5 & zmm9 & ~mem) | (zmm5 & zmm9 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm4 ; AVX512-NEXT: vpermq {{.*#+}} zmm7 = zmm20[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm7 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm4 & ~mem) | (zmm7 & ~zmm4 & mem) | (zmm7 & zmm4 & ~mem) | (zmm7 & zmm4 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm3 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm0 & mem) | (zmm3 & ~zmm0 & ~mem) | (zmm3 & zmm0 & ~mem) | (zmm3 & zmm0 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm7 & mem) | (zmm3 & ~zmm7 & ~mem) | (zmm3 & zmm7 & ~mem) | (zmm3 & zmm7 & mem) ; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm1[0,0,1,1,4,4,5,5] -; AVX512-NEXT: vpternlogd $228, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm18 & ~mem) | (zmm0 & ~zmm18 & mem) | (zmm0 & zmm18 & ~mem) | (zmm0 & zmm18 & mem) ; AVX512-NEXT: vpbroadcastd (%rax), %ymm1 ; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm4 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm1 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm6 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm15 & mem) | (zmm1 & ~zmm15 & ~mem) | (zmm1 & zmm15 & ~mem) | (zmm1 & zmm15 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm0 & mem) | (zmm1 & ~zmm0 & ~mem) | (zmm1 & zmm0 & ~mem) | (zmm1 & zmm0 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm8 & mem) | (ymm6 & ~ymm8 & ~mem) | (ymm6 & ymm8 & ~mem) | (ymm6 & ymm8 & mem) ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,1,3,2] -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm2 & mem) | (ymm0 & ~ymm2 & ~mem) | (ymm0 & ymm2 & ~mem) | (ymm0 & ymm2 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm6 & mem) | (ymm0 & ~ymm6 & ~mem) | (ymm0 & ymm6 & ~mem) | (ymm0 & ymm6 & mem) ; AVX512-NEXT: vmovdqa %ymm0, 192(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm3, 128(%rcx) @@ -3065,24 +3065,24 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm2 & mem) | (zmm0 & ~zmm2 & ~mem) | 
(zmm0 & zmm2 & ~mem) | (zmm0 & zmm2 & mem) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm2 -; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm2 & mem) | (zmm10 & ~zmm2 & ~mem) | (zmm10 & ~zmm2 & mem) | (zmm10 & zmm2 & ~mem) | (zmm10 & zmm2 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm0 & mem) | (zmm10 & ~zmm0 & ~mem) | (zmm10 & zmm0 & ~mem) | (zmm10 & zmm0 & mem) ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm19[2,2,2,3,6,6,6,7] -; AVX512-FCP-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm6 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm9 & ~mem) | (zmm0 & ~zmm9 & mem) | (zmm0 & zmm9 & ~mem) | (zmm0 & zmm9 & mem) +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm13 & mem) | (zmm6 & ~zmm13 & ~mem) | (zmm6 & zmm13 & ~mem) | (zmm6 & zmm13 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & zmm0 & mem) | (zmm6 & ~zmm0 & ~mem) | (zmm6 & zmm0 & ~mem) | (zmm6 & zmm0 & mem) ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm11[0,0,1,1,4,4,5,5] -; AVX512-FCP-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm0 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm15 & ~mem) | (zmm0 & ~zmm15 & mem) | (zmm0 & zmm15 & ~mem) | (zmm0 & zmm15 & mem) ; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm2 ; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm3 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm2 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm4 -; 
AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm1 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm1 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm20 & mem) | (zmm2 & ~zmm20 & ~mem) | (zmm2 & zmm20 & ~mem) | (zmm2 & zmm20 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm0 & mem) | (zmm2 & ~zmm0 & ~mem) | (zmm2 & zmm0 & ~mem) | (zmm2 & zmm0 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ymm5 & mem) | (ymm4 & ~ymm5 & ~mem) | (ymm4 & ymm5 & ~mem) | (ymm4 & ymm5 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm7 & ~mem) | (ymm1 & ~ymm7 & mem) | (ymm1 & ymm7 & ~mem) | (ymm1 & ymm7 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm4 & mem) | (ymm1 & ~ymm4 & ~mem) | (ymm1 & ymm4 & ~mem) | (ymm1 & ymm4 & mem) ; AVX512-FCP-NEXT: vmovdqa %ymm1, 192(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rcx) @@ -3199,27 +3199,27 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm9 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm9 = (~zmm9 & zmm7 & ~mem) | (zmm9 & ~zmm7 & mem) | (zmm9 & zmm7 & ~mem) | (zmm9 & zmm7 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm19, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm4 & mem) | (zmm5 & ~zmm4 & ~mem) | (zmm5 & ~zmm4 & mem) | (zmm5 & zmm4 & ~mem) | (zmm5 & zmm4 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm9 & mem) | (zmm5 & ~zmm9 & ~mem) | (zmm5 & zmm9 & ~mem) | (zmm5 & zmm9 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, 
%ymm11, %zmm10, %zmm4 ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm7 = zmm20[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm7 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm4 & ~mem) | (zmm7 & ~zmm4 & mem) | (zmm7 & zmm4 & ~mem) | (zmm7 & zmm4 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm3 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm0 & mem) | (zmm3 & ~zmm0 & ~mem) | (zmm3 & zmm0 & ~mem) | (zmm3 & zmm0 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm7 & mem) | (zmm3 & ~zmm7 & ~mem) | (zmm3 & zmm7 & ~mem) | (zmm3 & zmm7 & mem) ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm1[0,0,1,1,4,4,5,5] -; AVX512DQ-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm0 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm18 & ~mem) | (zmm0 & ~zmm18 & mem) | (zmm0 & zmm18 & ~mem) | (zmm0 & zmm18 & mem) ; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm1 ; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm4 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm1 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm6 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm15 & mem) | (zmm1 & ~zmm15 & ~mem) | (zmm1 & zmm15 & ~mem) | (zmm1 & zmm15 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm0 & mem) | (zmm1 & ~zmm0 & ~mem) | (zmm1 & zmm0 & ~mem) | (zmm1 & zmm0 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm8 & mem) | (ymm6 & ~ymm8 & ~mem) | (ymm6 & ymm8 & ~mem) | (ymm6 & ymm8 & mem) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,1,3,2] -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 -; 
AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm2 & mem) | (ymm0 & ~ymm2 & ~mem) | (ymm0 & ymm2 & ~mem) | (ymm0 & ymm2 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (~ymm0 & ymm6 & mem) | (ymm0 & ~ymm6 & ~mem) | (ymm0 & ymm6 & ~mem) | (ymm0 & ymm6 & mem) ; AVX512DQ-NEXT: vmovdqa %ymm0, 192(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rcx) @@ -3327,24 +3327,24 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm2 & mem) | (zmm0 & ~zmm2 & ~mem) | (zmm0 & zmm2 & ~mem) | (zmm0 & zmm2 & mem) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm2 & mem) | (zmm10 & ~zmm2 & ~mem) | (zmm10 & ~zmm2 & mem) | (zmm10 & zmm2 & ~mem) | (zmm10 & zmm2 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm0 & mem) | (zmm10 & ~zmm0 & ~mem) | (zmm10 & zmm0 & ~mem) | (zmm10 & zmm0 & mem) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm19[2,2,2,3,6,6,6,7] -; AVX512DQ-FCP-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm9 & ~mem) | (zmm0 & ~zmm9 & mem) | (zmm0 & zmm9 & ~mem) | (zmm0 & zmm9 & mem) +; AVX512DQ-FCP-NEXT: 
vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm13 & mem) | (zmm6 & ~zmm13 & ~mem) | (zmm6 & zmm13 & ~mem) | (zmm6 & zmm13 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & zmm0 & mem) | (zmm6 & ~zmm0 & ~mem) | (zmm6 & zmm0 & ~mem) | (zmm6 & zmm0 & mem) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm11[0,0,1,1,4,4,5,5] -; AVX512DQ-FCP-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm15 & ~mem) | (zmm0 & ~zmm15 & mem) | (zmm0 & zmm15 & ~mem) | (zmm0 & zmm15 & mem) ; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm2 ; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm3 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm20 & mem) | (zmm2 & ~zmm20 & ~mem) | (zmm2 & zmm20 & ~mem) | (zmm2 & zmm20 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm0 & mem) | (zmm2 & ~zmm0 & ~mem) | (zmm2 & zmm0 & ~mem) | (zmm2 & zmm0 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ymm5 & mem) | (ymm4 & ~ymm5 & ~mem) | (ymm4 & ymm5 & ~mem) | (ymm4 & ymm5 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm7 & ~mem) | (ymm1 & ~ymm7 & mem) | (ymm1 & ymm7 & ~mem) | (ymm1 & ymm7 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (~ymm1 & ymm4 & mem) | (ymm1 & ~ymm4 & ~mem) | (ymm1 & ymm4 & ~mem) | (ymm1 & ymm4 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 192(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rcx) @@ -6136,19 +6136,19 @@ define 
void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm28 ; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm29 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $184, %zmm28, %zmm30, %zmm29 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm29 = (~zmm29 & zmm30 & zmm28) | (zmm29 & ~zmm30 & ~zmm28) | (zmm29 & ~zmm30 & zmm28) | (zmm29 & zmm30 & zmm28) ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm30, %zmm9 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & ~zmm30 & zmm1) | (zmm9 & ~zmm30 & zmm1) | (zmm9 & zmm30 & ~zmm1) | (zmm9 & zmm30 & zmm1) ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 32-byte Folded Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm2 & zmm0) | (zmm1 & ~zmm2 & zmm0) | (zmm1 & zmm2 & ~zmm0) | (zmm1 & zmm2 & zmm0) ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm3 & ~mem) | (~zmm0 & zmm3 & mem) | (zmm0 & ~zmm3 & mem) | (zmm0 & zmm3 & ~mem) | (zmm0 
& zmm3 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & ~mem) | (zmm0 & zmm1 & ~mem) | (zmm0 & zmm1 & mem) ; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512-NEXT: # ymm1 = mem[2,3,3,3,6,7,7,7] ; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload @@ -6177,48 +6177,48 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm5 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm27 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm29 & mem) | (zmm5 & ~zmm29 & ~mem) | (zmm5 & zmm29 & ~mem) | (zmm5 & zmm29 & mem) +; AVX512-NEXT: vpternlogd {{.*#+}} zmm27 = (~zmm27 & zmm5 & mem) | (zmm27 & ~zmm5 & ~mem) | (zmm27 & zmm5 & ~mem) | (zmm27 & zmm5 & mem) ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm15[0,0,0,1] ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX512-NEXT: vpbroadcastd 32(%rax), %ymm5 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm4 & mem) | (zmm1 & ~zmm4 & ~mem) | (zmm1 & zmm4 & ~mem) | (zmm1 & zmm4 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm9 & mem) | (zmm1 & ~zmm9 & ~mem) | (zmm1 & zmm9 & ~mem) | (zmm1 & zmm9 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm3 ; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm4 -; AVX512-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm2 & zmm3) | (zmm4 & ~zmm2 & zmm3) | (zmm4 & zmm2 & 
~zmm3) | (zmm4 & zmm2 & zmm3) ; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm2 ; AVX512-NEXT: vpbroadcastd 40(%rax), %ymm3 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm3 & mem) | (zmm2 & ~zmm3 & ~mem) | (zmm2 & zmm3 & ~mem) | (zmm2 & zmm3 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm4 & mem) | (zmm2 & ~zmm4 & ~mem) | (zmm2 & zmm4 & ~mem) | (zmm2 & zmm4 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm3, %zmm3 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm3 & mem) | (zmm4 & ~zmm3 & ~mem) | (zmm4 & zmm3 & ~mem) | (zmm4 & zmm3 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[0,1,2,3] -; AVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm20 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm20 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm3 & mem) | (zmm20 & ~zmm3 & ~mem) | (zmm20 & ~zmm3 & mem) | (zmm20 & zmm3 & ~mem) | (zmm20 & zmm3 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm4 & mem) | (zmm20 & ~zmm4 & ~mem) | (zmm20 & zmm4 & ~mem) | (zmm20 & zmm4 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm21, %zmm3 -; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm26 & ~mem) | (zmm3 & ~zmm26 & mem) | (zmm3 & zmm26 & ~mem) | (zmm3 & zmm26 & 
mem) ; AVX512-NEXT: vpbroadcastd (%rax), %ymm4 ; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm5 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm4 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm25 & mem) | (zmm4 & ~zmm25 & ~mem) | (zmm4 & zmm25 & ~mem) | (zmm4 & zmm25 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm3 & mem) | (zmm4 & ~zmm3 & ~mem) | (zmm4 & zmm3 & ~mem) | (zmm4 & zmm3 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm3 ; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm5 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm3 & mem) | (zmm5 & ~zmm3 & ~mem) | (zmm5 & zmm3 & ~mem) | (zmm5 & zmm3 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm3 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] ; AVX512-NEXT: vpermd (%rax), %zmm6, %zmm6 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm3 & mem) | (zmm6 & ~zmm3 & ~mem) | (zmm6 & zmm3 & ~mem) | (zmm6 & zmm3 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & zmm5 & mem) | (zmm6 & ~zmm5 & ~mem) | (zmm6 & zmm5 & ~mem) | (zmm6 & zmm5 & mem) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm4, (%rax) @@ -6447,20 +6447,20 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = 
[65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm24, %zmm16, %zmm12 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & ~zmm16 & zmm24) | (zmm12 & ~zmm16 & zmm24) | (zmm12 & zmm16 & ~zmm24) | (zmm12 & zmm16 & zmm24) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & zmm12 & mem) | (zmm15 & ~zmm12 & ~mem) | (zmm15 & zmm12 & ~mem) | (zmm15 & zmm12 & mem) ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm5 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,5,2,2,6,6,6,6,30,31,27,27,31,31,30,31] ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm7 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm15 & mem) | (zmm7 & ~zmm15 & ~mem) | (zmm7 & zmm15 & ~mem) | (zmm7 & zmm15 & mem) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm26, %zmm2, %zmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm2 & zmm26) | (zmm8 & ~zmm2 & zmm26) | (zmm8 & zmm2 & ~zmm26) | (zmm8 & zmm2 & zmm26) ; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm12 ; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm13 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm12 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm12 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm27 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm12 = (~zmm12 & zmm29 & mem) | (zmm12 & ~zmm29 & ~mem) | (zmm12 & zmm29 & ~mem) | (zmm12 & zmm29 & mem) +; AVX512-FCP-NEXT: 
vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm8 & mem) | (zmm12 & ~zmm8 & ~mem) | (zmm12 & zmm8 & ~mem) | (zmm12 & zmm8 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & zmm3 & mem) | (zmm27 & ~zmm3 & ~mem) | (zmm27 & zmm3 & ~mem) | (zmm27 & zmm3 & mem) ; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload @@ -6477,36 +6477,36 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm24[1,1,1,1,5,5,5,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1],ymm4[2],ymm15[3,4],ymm4[5],ymm15[6,7,8,9],ymm4[10],ymm15[11,12],ymm4[13],ymm15[14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm4 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm6 & mem) | (zmm4 & ~zmm6 & ~mem) | (zmm4 & zmm6 & ~mem) | (zmm4 & zmm6 & mem) ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,0,0,0,7,0,0,7] ; AVX512-FCP-NEXT: vpermd %ymm10, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 ; AVX512-FCP-NEXT: vpermd %zmm5, %zmm22, %zmm5 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm31, %zmm5 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm31 & mem) | (zmm5 & ~zmm31 & ~mem) | (zmm5 & zmm31 & ~mem) | (zmm5 & zmm31 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm4 & mem) | (zmm5 & ~zmm4 & ~mem) | (zmm5 & zmm4 & ~mem) | (zmm5 & zmm4 & mem) ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm6 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $226, 
%zmm4, %zmm2, %zmm6 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm25, %zmm16, %zmm21 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & ~zmm2 & zmm4) | (zmm6 & ~zmm2 & zmm4) | (zmm6 & zmm2 & ~zmm4) | (zmm6 & zmm2 & zmm4) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & ~zmm16 & zmm25) | (zmm21 & ~zmm16 & zmm25) | (zmm21 & zmm16 & ~zmm25) | (zmm21 & zmm16 & zmm25) ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm30, %zmm0 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm30 & ~mem) | (~zmm0 & zmm30 & mem) | (zmm0 & ~zmm30 & mem) | (zmm0 & zmm30 & ~mem) | (zmm0 & zmm30 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm6 & mem) | (zmm0 & ~zmm6 & ~mem) | (zmm0 & zmm6 & ~mem) | (zmm0 & zmm6 & mem) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm2, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 -; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm2 & mem) | (zmm4 & ~zmm2 & ~mem) | (zmm4 & zmm2 & ~mem) | (zmm4 & zmm2 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm9 & mem) | (zmm1 & ~zmm9 & ~mem) | (zmm1 & ~zmm9 & mem) | (zmm1 & zmm9 & ~mem) | (zmm1 & zmm9 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm4 & mem) | (zmm1 & ~zmm4 & ~mem) | (zmm1 & zmm4 & ~mem) | (zmm1 & zmm4 & mem) ; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; 
AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm23, %zmm2 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm2 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm23 & mem) | (zmm2 & ~zmm23 & ~mem) | (zmm2 & zmm23 & ~mem) | (zmm2 & zmm23 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm21 & mem) | (zmm2 & ~zmm21 & ~mem) | (zmm2 & zmm21 & ~mem) | (zmm2 & zmm21 & mem) ; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm3 ; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm4 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm3 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm28 & mem) | (zmm3 & ~zmm28 & ~mem) | (zmm3 & zmm28 & ~mem) | (zmm3 & zmm28 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm27 & mem) | (zmm3 & ~zmm27 & ~mem) | (zmm3 & zmm27 & ~mem) | (zmm3 & zmm27 & mem) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) @@ -6764,19 +6764,19 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm28 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm29 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, %zmm28, %zmm30, %zmm29 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm29 = (~zmm29 & zmm30 & zmm28) | (zmm29 & ~zmm30 & ~zmm28) | (zmm29 & ~zmm30 & zmm28) | (zmm29 & zmm30 & zmm28) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512DQ-NEXT: 
vpternlogq $226, %zmm1, %zmm30, %zmm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & ~zmm30 & zmm1) | (zmm9 & ~zmm30 & zmm1) | (zmm9 & zmm30 & ~zmm1) | (zmm9 & zmm30 & zmm1) ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm2 & zmm0) | (zmm1 & ~zmm2 & zmm0) | (zmm1 & zmm2 & ~zmm0) | (zmm1 & zmm2 & zmm0) ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm3 & ~mem) | (~zmm0 & zmm3 & mem) | (zmm0 & ~zmm3 & mem) | (zmm0 & zmm3 & ~mem) | (zmm0 & zmm3 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & ~mem) | (zmm0 & zmm1 & ~mem) | (zmm0 & zmm1 & mem) ; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm1 = mem[2,3,3,3,6,7,7,7] ; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload @@ -6805,48 +6805,48 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm5 -; AVX512DQ-NEXT: vpternlogd 
$216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm27 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm29 & mem) | (zmm5 & ~zmm29 & ~mem) | (zmm5 & zmm29 & ~mem) | (zmm5 & zmm29 & mem) +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm27 = (~zmm27 & zmm5 & mem) | (zmm27 & ~zmm5 & ~mem) | (zmm27 & zmm5 & ~mem) | (zmm27 & zmm5 & mem) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm15[0,0,0,1] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX512DQ-NEXT: vpbroadcastd 32(%rax), %ymm5 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm1 = (~zmm1 & zmm4 & mem) | (zmm1 & ~zmm4 & ~mem) | (zmm1 & zmm4 & ~mem) | (zmm1 & zmm4 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm9 & mem) | (zmm1 & ~zmm9 & ~mem) | (zmm1 & zmm9 & ~mem) | (zmm1 & zmm9 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm3 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm4 -; AVX512DQ-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm2 & zmm3) | (zmm4 & ~zmm2 & zmm3) | (zmm4 & zmm2 & ~zmm3) | (zmm4 & zmm2 & zmm3) ; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm2 ; AVX512DQ-NEXT: vpbroadcastd 40(%rax), %ymm3 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm3 & mem) | (zmm2 & ~zmm3 & ~mem) | (zmm2 & zmm3 & ~mem) | (zmm2 & zmm3 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm4 & mem) | (zmm2 & ~zmm4 & ~mem) | (zmm2 & zmm4 & ~mem) | (zmm2 & zmm4 & mem) ; AVX512DQ-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm3, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm3 & mem) | (zmm4 & ~zmm3 & ~mem) | (zmm4 & zmm3 & ~mem) | (zmm4 & zmm3 & mem) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[0,1,2,3] -; AVX512DQ-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm20 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm20 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm3 & mem) | (zmm20 & ~zmm3 & ~mem) | (zmm20 & ~zmm3 & mem) | (zmm20 & zmm3 & ~mem) | (zmm20 & zmm3 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm4 & mem) | (zmm20 & ~zmm4 & ~mem) | (zmm20 & zmm4 & ~mem) | (zmm20 & zmm4 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm28, %zmm21, %zmm3 -; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm26 & ~mem) | (zmm3 & ~zmm26 & mem) | (zmm3 & zmm26 & ~mem) | (zmm3 & zmm26 & mem) ; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm4 ; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm5 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm4 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm4 = (~zmm4 & zmm25 & mem) | (zmm4 & ~zmm25 & ~mem) | (zmm4 & zmm25 & ~mem) | (zmm4 & zmm25 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm3 & mem) | (zmm4 & ~zmm3 & ~mem) | (zmm4 & zmm3 & ~mem) | (zmm4 & zmm3 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm3 ; 
AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm5 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm3 & mem) | (zmm5 & ~zmm3 & ~mem) | (zmm5 & zmm3 & ~mem) | (zmm5 & zmm3 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm3 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] ; AVX512DQ-NEXT: vpermd (%rax), %zmm6, %zmm6 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm6 = (~zmm6 & zmm3 & mem) | (zmm6 & ~zmm3 & ~mem) | (zmm6 & zmm3 & ~mem) | (zmm6 & zmm3 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & zmm5 & mem) | (zmm6 & ~zmm5 & ~mem) | (zmm6 & zmm5 & ~mem) | (zmm6 & zmm5 & mem) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax) @@ -7075,20 +7075,20 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm24, %zmm16, %zmm12 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm15 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & ~zmm16 & zmm24) | (zmm12 & ~zmm16 & zmm24) | (zmm12 & zmm16 & ~zmm24) | (zmm12 & zmm16 & zmm24) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (~zmm15 & zmm12 & mem) | (zmm15 & ~zmm12 & ~mem) | (zmm15 & zmm12 & ~mem) | (zmm15 & zmm12 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), 
%zmm5 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,5,2,2,6,6,6,6,30,31,27,27,31,31,30,31] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm7 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm15 & mem) | (zmm7 & ~zmm15 & ~mem) | (zmm7 & zmm15 & ~mem) | (zmm7 & zmm15 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm26, %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm2 & zmm26) | (zmm8 & ~zmm2 & zmm26) | (zmm8 & zmm2 & ~zmm26) | (zmm8 & zmm2 & zmm26) ; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm12 ; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm13 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm12 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm12 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm27 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm12 = (~zmm12 & zmm29 & mem) | (zmm12 & ~zmm29 & ~mem) | (zmm12 & zmm29 & ~mem) | (zmm12 & zmm29 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm8 & mem) | (zmm12 & ~zmm8 & ~mem) | (zmm12 & zmm8 & ~mem) | (zmm12 & zmm8 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = (~zmm27 & zmm3 & mem) | (zmm27 & ~zmm3 & ~mem) | (zmm27 & zmm3 & ~mem) | (zmm27 & zmm3 & mem) ; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload @@ -7105,36 +7105,36 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm24[1,1,1,1,5,5,5,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1],ymm4[2],ymm15[3,4],ymm4[5],ymm15[6,7,8,9],ymm4[10],ymm15[11,12],ymm4[13],ymm15[14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm6 & mem) | (zmm4 & ~zmm6 & ~mem) | (zmm4 & zmm6 & ~mem) | (zmm4 & zmm6 & mem) ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,0,0,0,7,0,0,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 ; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm22, %zmm5 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm31, %zmm5 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm31 & mem) | (zmm5 & ~zmm31 & ~mem) | (zmm5 & zmm31 & ~mem) | (zmm5 & zmm31 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm4 & mem) | (zmm5 & ~zmm4 & ~mem) | (zmm5 & zmm4 & ~mem) | (zmm5 & zmm4 & mem) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm6 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm4, %zmm2, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm25, %zmm16, %zmm21 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & ~zmm2 & zmm4) | (zmm6 & ~zmm2 & zmm4) | (zmm6 & zmm2 & ~zmm4) | (zmm6 & zmm2 & zmm4) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = (~zmm21 & ~zmm16 & zmm25) | (zmm21 & ~zmm16 & zmm25) | (zmm21 & zmm16 & ~zmm25) | (zmm21 & zmm16 & zmm25) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm30, %zmm0 -; 
AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm30 & ~mem) | (~zmm0 & zmm30 & mem) | (zmm0 & ~zmm30 & mem) | (zmm0 & zmm30 & ~mem) | (zmm0 & zmm30 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm6 & mem) | (zmm0 & ~zmm6 & ~mem) | (zmm0 & zmm6 & ~mem) | (zmm0 & zmm6 & mem) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm2, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm2 & mem) | (zmm4 & ~zmm2 & ~mem) | (zmm4 & zmm2 & ~mem) | (zmm4 & zmm2 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm9 & mem) | (zmm1 & ~zmm9 & ~mem) | (zmm1 & ~zmm9 & mem) | (zmm1 & zmm9 & ~mem) | (zmm1 & zmm9 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm4 & mem) | (zmm1 & ~zmm4 & ~mem) | (zmm1 & zmm4 & ~mem) | (zmm1 & zmm4 & mem) ; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm23, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = (~zmm2 & zmm23 & mem) | (zmm2 & ~zmm23 & ~mem) | (zmm2 & zmm23 & ~mem) | (zmm2 & zmm23 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm21 & mem) | (zmm2 & ~zmm21 & ~mem) | (zmm2 & zmm21 & ~mem) | (zmm2 & zmm21 & mem) ; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm3 ; AVX512DQ-FCP-NEXT: 
vpbroadcastd 4(%rax), %ymm4 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm28 & mem) | (zmm3 & ~zmm28 & ~mem) | (zmm3 & zmm28 & ~mem) | (zmm3 & zmm28 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm27 & mem) | (zmm3 & ~zmm27 & ~mem) | (zmm3 & zmm27 & ~mem) | (zmm3 & zmm27 & mem) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) @@ -12769,24 +12769,24 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufb %ymm12, %ymm9, %ymm9 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm17, %zmm6 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7 -; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm6 & ~mem) | (zmm7 & ~zmm6 & mem) | (zmm7 & zmm6 & ~mem) | (zmm7 & zmm6 & mem) ; AVX512-NEXT: vmovdqa 96(%r8), %ymm6 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[1,2,2,3,5,6,6,7] ; AVX512-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $248, %ymm11, %ymm7, %ymm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm7 & ymm11) | (ymm6 & ~ymm7 & ~ymm11) | (ymm6 & ~ymm7 & ymm11) | (ymm6 & ymm7 & ~ymm11) | (ymm6 & ymm7 & 
ymm11) ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $248, %ymm11, %ymm6, %ymm9 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ymm6 & ymm11) | (ymm9 & ~ymm6 & ~ymm11) | (ymm9 & ~ymm6 & ymm11) | (ymm9 & ymm6 & ~ymm11) | (ymm9 & ymm6 & ymm11) ; AVX512-NEXT: vextracti64x4 $1, %zmm7, %ymm6 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm10[2,1,3,3] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogq $184, %ymm6, %ymm10, %ymm7 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ymm10 & ymm6) | (ymm7 & ~ymm10 & ~ymm6) | (ymm7 & ~ymm10 & ymm6) | (ymm7 & ymm10 & ymm6) ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm8[2,2,2,2] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $184, %ymm7, %ymm8, %ymm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm8 & ymm7) | (ymm6 & ~ymm8 & ~ymm7) | (ymm6 & ~ymm8 & ymm7) | (ymm6 & ymm8 & ymm7) ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13024,7 +13024,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpbroadcastd 100(%rax), %ymm1 ; AVX512-NEXT: vpbroadcastd 104(%rax), %ymm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm31 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm31 +; AVX512-NEXT: 
vpternlogd {{.*#+}} zmm31 = (~zmm31 & zmm0 & mem) | (zmm31 & ~zmm0 & ~mem) | (zmm31 & zmm0 & ~mem) | (zmm31 & zmm0 & mem) ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; AVX512-NEXT: vpshufb %xmm4, %xmm9, %xmm1 ; AVX512-NEXT: vmovdqa %xmm4, %xmm6 @@ -13042,7 +13042,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm0, %zmm14 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & ~zmm0 & zmm1) | (zmm14 & ~zmm0 & zmm1) | (zmm14 & zmm0 & ~zmm1) | (zmm14 & zmm0 & zmm1) ; AVX512-NEXT: vmovdqa64 %xmm17, %xmm1 ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm3 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] @@ -13054,7 +13054,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpbroadcastd 68(%rax), %ymm5 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm25 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogd $184, %zmm1, %zmm19, %zmm25 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm25 = (~zmm25 & zmm19 & zmm1) | (zmm25 & ~zmm19 & ~zmm1) | (zmm25 & ~zmm19 & zmm1) | (zmm25 & zmm19 & zmm1) ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] ; AVX512-NEXT: vpshufb %xmm6, %xmm8, %xmm3 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,2] @@ -13071,7 +13071,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512-NEXT: vpternlogq $226, %zmm3, %zmm0, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm0 & zmm3) | (zmm5 & ~zmm0 & zmm3) | (zmm5 & zmm0 & ~zmm3) | (zmm5 & zmm0 & zmm3) ; AVX512-NEXT: vmovdqa64 %xmm29, %xmm0 ; AVX512-NEXT: vmovdqa64 %xmm24, %xmm1 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -13081,7 +13081,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpbroadcastd (%rax), %ymm1 ; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm20 -; AVX512-NEXT: vpternlogd $184, %zmm0, %zmm19, %zmm20 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm20 = (~zmm20 & zmm19 & zmm0) | (zmm20 & ~zmm19 & ~zmm0) | (zmm20 & ~zmm19 & zmm0) | (zmm20 & zmm19 & zmm0) ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] @@ -13156,7 +13156,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm2 ; AVX512-NEXT: vpbroadcastd 40(%rax), %ymm4 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm13 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm13 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm13 = (~zmm13 & zmm1 & mem) | (zmm13 & ~zmm1 & ~mem) | (zmm13 & zmm1 & ~mem) | (zmm13 & zmm1 & mem) ; AVX512-NEXT: vmovdqa 32(%rcx), %xmm7 ; AVX512-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] @@ -13169,99 +13169,103 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, 
ptr %in.ve ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $184, %zmm3, %zmm4, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm4 & zmm3) | (zmm1 & ~zmm4 & ~zmm3) | (zmm1 & ~zmm4 & zmm3) | (zmm1 & zmm4 & zmm3) ; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512-NEXT: # ymm3 = mem[2,1,3,2] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq $184, %ymm1, %ymm29, %ymm3 -; AVX512-NEXT: vpternlogq $184, %ymm3, %ymm28, %ymm30 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm29 & ymm1) | (ymm3 & ~ymm29 & ~ymm1) | (ymm3 & ~ymm29 & ymm1) | (ymm3 & ymm29 & ymm1) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm30 = (~ymm30 & ymm28 & ymm3) | (ymm30 & ~ymm28 & ~ymm3) | (ymm30 & ~ymm28 & ymm3) | (ymm30 & ymm28 & ymm3) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload ; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: # zmm3 = (~zmm3 & ~zmm28 & mem) | (~zmm3 & zmm28 & mem) | (zmm3 & ~zmm28 & mem) | (zmm3 & zmm28 & ~mem) | (zmm3 & zmm28 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload ; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: # 
zmm0 = (~zmm0 & ~zmm28 & mem) | (~zmm0 & zmm28 & mem) | (zmm0 & ~zmm28 & mem) | (zmm0 & zmm28 & ~mem) | (zmm0 & zmm28 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $226, %zmm23, %zmm28, %zmm18 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = (~zmm18 & ~zmm28 & zmm23) | (zmm18 & ~zmm28 & zmm23) | (zmm18 & zmm28 & ~zmm23) | (zmm18 & zmm28 & zmm23) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm18 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = (~zmm18 & ~zmm2 & zmm3) | (zmm18 & ~zmm2 & zmm3) | (zmm18 & zmm2 & ~zmm3) | (zmm18 & zmm2 & zmm3) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm23 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm23 -; AVX512-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm23 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = (~zmm23 & ~zmm28 & zmm3) | (zmm23 & ~zmm28 & zmm3) | (zmm23 & zmm28 & ~zmm3) | (zmm23 & zmm28 & zmm3) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = (~zmm23 & ~zmm2 & zmm0) | (zmm23 & ~zmm2 & zmm0) | (zmm23 & zmm2 & ~zmm0) | (zmm23 & zmm2 & zmm0) ; AVX512-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm17 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm0 & ~mem) | (zmm17 & ~zmm0 & mem) | (zmm17 & zmm0 & ~mem) | (zmm17 & zmm0 & mem) ; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm19, %ymm0 +; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = (~ymm0 & ~ymm19 & mem) | (ymm0 & ~ymm19 & mem) | (ymm0 & ymm19 & ~mem) | (ymm0 & ymm19 & mem) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-NEXT: vpternlogq $184, %ymm0, %ymm1, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm1 & ymm0) | (ymm2 & ~ymm1 & ~ymm0) | (ymm2 & ~ymm1 & ymm0) | (ymm2 & ymm1 & ymm0) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm19 # 64-byte Folded Reload +; AVX512-NEXT: # zmm19 = (~zmm19 & zmm1 & mem) | (zmm19 & ~zmm1 & ~mem) | (zmm19 & ~zmm1 & mem) | (zmm19 & zmm1 & ~mem) | (zmm19 & zmm1 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[0,1,2,3] ; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: # zmm0 = (~zmm0 & ~zmm1 & mem) | (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & mem) | (zmm0 & 
zmm1 & ~mem) | (zmm0 & zmm1 & mem) ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm1[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogd $184, %zmm1, %zmm2, %zmm30 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm30 = (~zmm30 & zmm2 & zmm1) | (zmm30 & ~zmm2 & ~zmm1) | (zmm30 & ~zmm2 & zmm1) | (zmm30 & zmm2 & zmm1) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vpternlogd $184, %zmm1, %zmm2, %zmm11 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm11 = (~zmm11 & zmm2 & zmm1) | (zmm11 & ~zmm2 & ~zmm1) | (zmm11 & ~zmm2 & zmm1) | (zmm11 & zmm2 & zmm1) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq $184, %zmm14, %zmm1, %zmm25 -; AVX512-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm20 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = (~zmm25 & zmm1 & zmm14) | (zmm25 & ~zmm1 & ~zmm14) | (zmm25 & ~zmm1 & zmm14) | (zmm25 & zmm1 & zmm14) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm1 & zmm5) | (zmm20 & ~zmm1 & ~zmm5) | (zmm20 & ~zmm1 & zmm5) | (zmm20 & zmm1 & zmm5) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded 
Reload ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm5 & zmm1) | (zmm2 & ~zmm5 & zmm1) | (zmm2 & zmm5 & ~zmm1) | (zmm2 & zmm5 & zmm1) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, %ymm22, %zmm3, %zmm22 -; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm22 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & ~zmm5 & zmm1) | (zmm22 & ~zmm5 & zmm1) | (zmm22 & zmm5 & ~zmm1) | (zmm22 & zmm5 & zmm1) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] ; AVX512-NEXT: vpermd 64(%rax), %zmm14, %zmm5 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogd $184, %zmm1, %zmm3, %zmm5 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm3 & zmm1) | (zmm5 & ~zmm3 & ~zmm1) | (zmm5 & ~zmm3 & zmm1) | (zmm5 & zmm3 & zmm1) ; AVX512-NEXT: vinserti64x4 $1, %ymm24, %zmm26, %zmm1 ; AVX512-NEXT: vpermd (%rax), %zmm14, %zmm14 -; AVX512-NEXT: vpternlogd $184, %zmm1, %zmm3, %zmm14 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm14 = (~zmm14 & zmm3 & zmm1) | (zmm14 & ~zmm3 & ~zmm1) | (zmm14 & ~zmm3 & zmm1) | (zmm14 & zmm3 & zmm1) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm5 -; AVX512-NEXT: vpternlogq $184, %zmm22, %zmm1, %zmm14 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm1 & zmm2) | (zmm5 & ~zmm1 & ~zmm2) | (zmm5 & ~zmm1 & zmm2) | (zmm5 & zmm1 & zmm2) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & zmm1 & zmm22) | (zmm14 & ~zmm1 & ~zmm22) | (zmm14 & ~zmm1 & zmm22) | (zmm14 & zmm1 & zmm22) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq $184, %zmm1, %zmm4, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm4 & zmm1) | (zmm2 & ~zmm4 & ~zmm1) | (zmm2 & ~zmm4 & zmm1) | (zmm2 & zmm4 & zmm1) ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512-NEXT: # ymm1 = mem[0,1,1,3] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm22 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & ~zmm4 & zmm1) | (zmm22 & ~zmm4 & zmm1) | (zmm22 & zmm4 & ~zmm1) | (zmm22 & zmm4 & zmm1) ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm12[0,1,1,3] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm8 -; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm4 & zmm1) | (zmm8 & ~zmm4 & zmm1) | (zmm8 & zmm4 & ~zmm1) 
| (zmm8 & zmm4 & zmm1) ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2] @@ -13305,27 +13309,27 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm10[2,1,3,2] ; AVX512-NEXT: vpbroadcastd 96(%rax), %ymm10 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512-NEXT: vpternlogd $184, %zmm7, %zmm29, %zmm9 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm9 = (~zmm9 & zmm29 & zmm7) | (zmm9 & ~zmm29 & ~zmm7) | (zmm9 & ~zmm29 & zmm7) | (zmm9 & zmm29 & zmm7) ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm24, %zmm3 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm26[2,1,3,2] ; AVX512-NEXT: vpbroadcastd 32(%rax), %ymm10 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 -; AVX512-NEXT: vpternlogd $184, %zmm3, %zmm29, %zmm7 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm29 & zmm3) | (zmm7 & ~zmm29 & ~zmm3) | (zmm7 & ~zmm29 & zmm3) | (zmm7 & zmm29 & zmm3) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512-NEXT: vpternlogq $184, %zmm22, %zmm3, %zmm9 -; AVX512-NEXT: vpternlogq $184, %zmm8, %zmm3, %zmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & zmm3 & zmm22) | (zmm9 & ~zmm3 & ~zmm22) | (zmm9 & ~zmm3 & zmm22) | (zmm9 & zmm3 & zmm22) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm3 & zmm8) | (zmm7 & ~zmm3 & ~zmm8) | (zmm7 & ~zmm3 & zmm8) | (zmm7 & zmm3 & zmm8) ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm16, %zmm3 ; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm8 -; AVX512-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm28 & zmm3) | (zmm8 & ~zmm28 & zmm3) | (zmm8 & 
zmm28 & ~zmm3) | (zmm8 & zmm28 & zmm3) ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm27, %zmm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm3 -; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm28, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & ~zmm28 & zmm1) | (zmm3 & ~zmm28 & zmm1) | (zmm3 & zmm28 & ~zmm1) | (zmm3 & zmm28 & zmm1) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512-NEXT: vpternlogq $184, %zmm8, %zmm1, %zmm31 -; AVX512-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm13 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm0 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm31 = (~zmm31 & zmm1 & zmm8) | (zmm31 & ~zmm1 & ~zmm8) | (zmm31 & ~zmm1 & zmm8) | (zmm31 & zmm1 & zmm8) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm1 & zmm3) | (zmm13 & ~zmm1 & ~zmm3) | (zmm13 & ~zmm1 & zmm3) | (zmm13 & zmm1 & zmm3) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm17 & mem) | (zmm0 & ~zmm17 & ~mem) | (zmm0 & zmm17 & ~mem) | (zmm0 & zmm17 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm2 & mem) | (zmm11 & ~zmm2 & ~mem) | (zmm11 & zmm2 & ~mem) | (zmm11 & zmm2 & mem) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512-NEXT: vmovdqa64 %zmm13, 256(%rax) @@ -13442,26 +13446,26 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm16, %zmm11 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm12 -; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm12 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm11 & ~mem) | (zmm12 & ~zmm11 & mem) | (zmm12 & zmm11 & ~mem) | (zmm12 & zmm11 & mem) ; 
AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero -; AVX512-FCP-NEXT: vpternlogq $248, %ymm13, %ymm12, %ymm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ymm12 & ymm13) | (ymm11 & ~ymm12 & ~ymm13) | (ymm11 & ~ymm12 & ymm13) | (ymm11 & ymm12 & ~ymm13) | (ymm11 & ymm12 & ymm13) ; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm10 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm6 ; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $248, %ymm16, %ymm11, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm11 & ymm16) | (ymm6 & ~ymm11 & ~ymm16) | (ymm6 & ~ymm11 & ymm16) | (ymm6 & ymm11 & ~ymm16) | (ymm6 & ymm11 & ymm16) ; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm11 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0] ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm12 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512-FCP-NEXT: vpternlogq $184, %ymm11, %ymm16, %ymm12 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ymm16 & ymm11) | (ymm12 & ~ymm16 & ~ymm11) | (ymm12 & ~ymm16 & ymm11) | (ymm12 & ymm16 & ymm11) ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 ; AVX512-FCP-NEXT: vprold $16, %ymm10, 
%ymm11 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %ymm12, %ymm10, %ymm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ymm10 & ymm12) | (ymm11 & ~ymm10 & ~ymm12) | (ymm11 & ~ymm10 & ymm12) | (ymm11 & ymm10 & ymm12) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm11[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13499,7 +13503,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11] ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm4, %zmm27, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm27 & zmm4) | (zmm1 & ~zmm27 & zmm4) | (zmm1 & zmm27 & ~zmm4) | (zmm1 & zmm27 & zmm4) ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6] ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1 @@ -13551,7 +13555,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm0 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm4, %zmm27, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 
& ~zmm27 & zmm4) | (zmm0 & ~zmm27 & zmm4) | (zmm0 & zmm27 & ~zmm4) | (zmm0 & zmm27 & zmm4) ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7] @@ -13566,7 +13570,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm16, %zmm3 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm4, %zmm19, %zmm3 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm19 & zmm4) | (zmm3 & ~zmm19 & ~zmm4) | (zmm3 & ~zmm19 & zmm4) | (zmm3 & zmm19 & zmm4) ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm10 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm3 @@ -13587,7 +13591,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7,8,9],ymm4[10],ymm6[11,12],ymm4[13],ymm6[14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm5, %zmm27, %zmm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & ~zmm27 & zmm5) | (zmm6 & ~zmm27 & zmm5) | (zmm6 & zmm27 & ~zmm5) | (zmm6 & zmm27 & zmm5) ; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm12 ; AVX512-FCP-NEXT: vprold $16, %ymm26, %ymm3 ; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm22 @@ -13603,8 +13607,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm25, %zmm1, %zmm1 
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0] ; AVX512-FCP-NEXT: vpermd %zmm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm5, %zmm18, %zmm0 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm18 & zmm5) | (zmm0 & ~zmm18 & ~zmm5) | (zmm0 & ~zmm18 & zmm5) | (zmm0 & zmm18 & zmm5) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm6 & mem) | (zmm0 & ~zmm6 & ~mem) | (zmm0 & zmm6 & ~mem) | (zmm0 & zmm6 & mem) ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm0 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[3,3,3,3,7,7,7,7] @@ -13637,7 +13641,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,1,1,8,8,10,9] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm28, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm3 & zmm2) | (zmm5 & ~zmm3 & zmm2) | (zmm5 & zmm3 & ~zmm2) | (zmm5 & zmm3 & zmm2) ; AVX512-FCP-NEXT: vmovdqa 64(%r9), %xmm2 ; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm6 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] @@ -13653,9 +13657,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpbroadcastd 68(%rax), %ymm8 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm31 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; 
AVX512-FCP-NEXT: vpternlogd $184, %zmm2, %zmm21, %zmm31 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm31 = (~zmm31 & zmm21 & zmm2) | (zmm31 & ~zmm21 & ~zmm2) | (zmm31 & ~zmm21 & zmm2) | (zmm31 & zmm21 & zmm2) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm5, %zmm2, %zmm31 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = (~zmm31 & zmm2 & zmm5) | (zmm31 & ~zmm2 & ~zmm5) | (zmm31 & ~zmm2 & zmm5) | (zmm31 & zmm2 & zmm5) ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm8 @@ -13677,7 +13681,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm6, %zmm3, %zmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm3 & zmm6) | (zmm8 & ~zmm3 & zmm6) | (zmm8 & zmm3 & ~zmm6) | (zmm8 & zmm3 & zmm6) ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm3 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm5 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] @@ -13690,8 +13694,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm5 ; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm26 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm3, %zmm21, %zmm26 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm8, %zmm2, %zmm26 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm26 = (~zmm26 & zmm21 & zmm3) | (zmm26 & ~zmm21 & ~zmm3) | (zmm26 & ~zmm21 & zmm3) | (zmm26 & zmm21 & zmm3) +; 
AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = (~zmm26 & zmm2 & zmm8) | (zmm26 & ~zmm2 & ~zmm8) | (zmm26 & ~zmm2 & zmm8) | (zmm26 & zmm2 & zmm8) ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -13722,7 +13726,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm2 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm19[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8,9,10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm3, %zmm27, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm27 & zmm3) | (zmm5 & ~zmm27 & zmm3) | (zmm5 & zmm27 & ~zmm3) | (zmm5 & zmm27 & zmm3) ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512-FCP-NEXT: vprold $16, %ymm4, %ymm2 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload @@ -13740,7 +13744,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9] ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm1, %zmm21, %zmm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & ~zmm21 & zmm1) | (zmm10 & ~zmm21 & zmm1) | (zmm10 & zmm21 & ~zmm1) | (zmm10 & zmm21 & zmm1) ; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; 
AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7] @@ -13749,8 +13753,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm16, %zmm1 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0] ; AVX512-FCP-NEXT: vpermd %zmm1, %zmm16, %zmm22 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm22 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm22 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm22 = (~zmm22 & zmm8 & mem) | (zmm22 & ~zmm8 & ~mem) | (zmm22 & zmm8 & ~mem) | (zmm22 & zmm8 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & zmm5 & mem) | (zmm22 & ~zmm5 & ~mem) | (zmm22 & zmm5 & ~mem) | (zmm22 & zmm5 & mem) ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm5 ; AVX512-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm1 @@ -13781,14 +13785,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpbroadcastd 96(%rax), %ymm20 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm16, %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogd $184, %zmm2, %zmm18, %zmm25 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm25 = (~zmm25 & zmm18 & zmm2) | (zmm25 & ~zmm18 & ~zmm2) | (zmm25 & ~zmm18 & zmm2) | (zmm25 & zmm18 & zmm2) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm10, %zmm27, %zmm25 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = (~zmm25 & zmm27 & zmm10) | (zmm25 & ~zmm27 & ~zmm10) | (zmm25 & ~zmm27 & zmm10) | (zmm25 & 
zmm27 & zmm10) ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512-FCP-NEXT: vprold $16, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm21 & zmm0) | (zmm1 & ~zmm21 & zmm0) | (zmm1 & zmm21 & ~zmm0) | (zmm1 & zmm21 & zmm0) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,2,1,8,8,9,11] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm3 @@ -13800,11 +13804,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm28, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & ~zmm28 & zmm3) | (zmm7 & ~zmm28 & zmm3) | (zmm7 & zmm28 & ~zmm3) | (zmm7 & zmm28 & zmm3) ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $184, %ymm0, %ymm18, %ymm16 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = (~ymm16 & ymm18 & ymm0) | (ymm16 & ~ymm18 & ~ymm0) | (ymm16 & ~ymm18 & ymm0) | (ymm16 & ymm18 & ymm0) ; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm13 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = 
xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] @@ -13823,7 +13827,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpbroadcastd 104(%rax), %ymm19 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm5, %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogd $184, %zmm6, %zmm24, %zmm5 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm24 & zmm6) | (zmm5 & ~zmm24 & ~zmm6) | (zmm5 & ~zmm24 & zmm6) | (zmm5 & zmm24 & zmm6) ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[3,3,3,3] @@ -13839,10 +13843,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermd %ymm23, %ymm17, %ymm17 ; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm19 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm17, %zmm20 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm8, %zmm18, %zmm20 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm27, %zmm20 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm20 = (~zmm20 & zmm18 & zmm8) | (zmm20 & ~zmm18 & ~zmm8) | (zmm20 & ~zmm18 & zmm8) | (zmm20 & zmm18 & zmm8) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm27 & zmm1) | (zmm20 & ~zmm27 & ~zmm1) | (zmm20 & ~zmm27 & zmm1) | (zmm20 & zmm27 & zmm1) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512-FCP-NEXT: vpternlogq $184, %zmm7, %zmm1, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm1 & zmm7) | (zmm5 & ~zmm1 & ~zmm7) | (zmm5 & ~zmm1 & zmm7) | (zmm5 & zmm1 & zmm7) ; AVX512-FCP-NEXT: vpunpckhwd 
{{.*#+}} xmm7 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] ; AVX512-FCP-NEXT: vprold $16, %xmm9, %xmm8 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[1,1,2,3] @@ -13860,7 +13864,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,1,8,8,10,9] ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm7 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm8, %zmm28, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & ~zmm28 & zmm8) | (zmm7 & ~zmm28 & zmm8) | (zmm7 & zmm28 & ~zmm8) | (zmm7 & zmm28 & zmm8) ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] ; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm0 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 @@ -13870,23 +13874,25 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm3 ; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm8 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpternlogd $184, %zmm2, %zmm24, %zmm3 -; AVX512-FCP-NEXT: vpternlogq $184, %zmm7, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm24 & zmm2) | (zmm3 & ~zmm24 & ~zmm2) | (zmm3 & ~zmm24 & zmm2) | (zmm3 & zmm24 & zmm2) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm1 & zmm7) | (zmm3 & ~zmm1 & ~zmm7) | (zmm3 & ~zmm1 & zmm7) | (zmm3 & zmm1 & zmm7) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm0, %ymm13 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm13 = (~ymm13 & ~ymm0 & mem) | (ymm13 & ~ymm0 & mem) | (ymm13 & ymm0 & ~mem) | (ymm13 & ymm0 & mem) ; AVX512-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %ymm13, %ymm1, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm1 & ymm13) | (ymm6 & ~ymm1 & ~ymm13) | (ymm6 & ~ymm1 & ymm13) | (ymm6 & ymm1 & ymm13) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512-FCP-NEXT: vpternlogq $184, %ymm16, %ymm2, %ymm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ymm2 & ymm16) | (ymm4 & ~ymm2 & ~ymm16) | (ymm4 & ~ymm2 & ymm16) | (ymm4 & ymm2 & ymm16) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm13 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm13 = (~zmm13 & zmm0 & mem) | (zmm13 & ~zmm0 & ~mem) | (zmm13 & zmm0 & ~mem) | (zmm13 & zmm0 & mem) ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = (~zmm0 & ~zmm2 & mem) | (~zmm0 & zmm2 & mem) | (zmm0 & ~zmm2 & mem) | (zmm0 & zmm2 & ~mem) | (zmm0 & zmm2 & mem) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm2 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm2 = (~zmm2 & ~zmm4 & mem) | (~zmm2 & zmm4 & mem) | (zmm2 & ~zmm4 & mem) | (zmm2 & zmm4 & ~mem) | (zmm2 & zmm4 & mem) ; AVX512-FCP-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512-FCP-NEXT: vpunpckhwd (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] @@ -13926,25 +13932,27 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & zmm12 & mem) | (zmm14 & ~zmm12 & ~mem) | (zmm14 & zmm12 & ~mem) | (zmm14 & zmm12 & mem) ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $226, %zmm11, %zmm28, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm28 & zmm11) | (zmm4 & ~zmm28 & zmm11) | (zmm4 & zmm28 & ~zmm11) | (zmm4 & zmm28 & zmm11) ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $226, %zmm6, %zmm28, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & ~zmm28 & zmm6) | (zmm7 & ~zmm28 & zmm6) | (zmm7 & zmm28 & ~zmm6) | (zmm7 & zmm28 & zmm6) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512-FCP-NEXT: vpternlogq $226, %zmm0, %zmm6, %zmm4 -; AVX512-FCP-NEXT: vpternlogq $226, %zmm2, %zmm6, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm6 & zmm0) | (zmm4 & ~zmm6 & zmm0) | (zmm4 & zmm6 & ~zmm0) | (zmm4 & zmm6 & 
zmm0) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & ~zmm6 & zmm2) | (zmm7 & ~zmm6 & zmm2) | (zmm7 & zmm6 & ~zmm2) | (zmm7 & zmm6 & zmm2) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm0 & ~mem) | (zmm2 & ~zmm0 & mem) | (zmm2 & zmm0 & ~mem) | (zmm2 & zmm0 & mem) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & ~mem) | (zmm0 & ~zmm1 & mem) | (zmm0 & zmm1 & ~mem) | (zmm0 & zmm1 & mem) ; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm10 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10 +; AVX512-FCP-NEXT: # zmm10 = (~zmm10 & zmm1 & mem) | (zmm10 & ~zmm1 & ~mem) | (zmm10 & ~zmm1 & mem) | (zmm10 & zmm1 & ~mem) | (zmm10 & zmm1 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm2 & mem) | (zmm10 & ~zmm2 & ~mem) | (zmm10 & zmm2 & ~mem) | (zmm10 & zmm2 & mem) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 320(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 256(%rax) @@ -14097,24 +14105,24 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm9, %ymm9 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm17, %zmm6 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7 -; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm6 & ~mem) | (zmm7 & ~zmm6 & mem) 
| (zmm7 & zmm6 & ~mem) | (zmm7 & zmm6 & mem) ; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm6 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[1,2,2,3,5,6,6,7] ; AVX512DQ-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $248, %ymm11, %ymm7, %ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm7 & ymm11) | (ymm6 & ~ymm7 & ~ymm11) | (ymm6 & ~ymm7 & ymm11) | (ymm6 & ymm7 & ~ymm11) | (ymm6 & ymm7 & ymm11) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm12 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $248, %ymm11, %ymm6, %ymm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = (~ymm9 & ymm6 & ymm11) | (ymm9 & ~ymm6 & ~ymm11) | (ymm9 & ~ymm6 & ymm11) | (ymm9 & ymm6 & ~ymm11) | (ymm9 & ymm6 & ymm11) ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm7, %ymm6 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm10[2,1,3,3] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogq $184, %ymm6, %ymm10, %ymm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = (~ymm7 & ymm10 & ymm6) | (ymm7 & ~ymm10 & ~ymm6) | (ymm7 & ~ymm10 & ymm6) | (ymm7 & ymm10 & ymm6) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm8[2,2,2,2] ; AVX512DQ-NEXT: 
vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, %ymm7, %ymm8, %ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm8 & ymm7) | (ymm6 & ~ymm8 & ~ymm7) | (ymm6 & ~ymm8 & ymm7) | (ymm6 & ymm8 & ymm7) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14352,7 +14360,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpbroadcastd 100(%rax), %ymm1 ; AVX512DQ-NEXT: vpbroadcastd 104(%rax), %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm31 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm31 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm31 = (~zmm31 & zmm0 & mem) | (zmm31 & ~zmm0 & ~mem) | (zmm31 & zmm0 & ~mem) | (zmm31 & zmm0 & mem) ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm9, %xmm1 ; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm6 @@ -14370,7 +14378,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512DQ-NEXT: vpternlogq $226, %zmm1, %zmm0, %zmm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & ~zmm0 & zmm1) | (zmm14 & ~zmm0 & zmm1) | (zmm14 & zmm0 & ~zmm1) | (zmm14 & zmm0 & zmm1) ; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm3 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 
= xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] @@ -14382,7 +14390,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpbroadcastd 68(%rax), %ymm5 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm25 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogd $184, %zmm1, %zmm19, %zmm25 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm25 = (~zmm25 & zmm19 & zmm1) | (zmm25 & ~zmm19 & ~zmm1) | (zmm25 & ~zmm19 & zmm1) | (zmm25 & zmm19 & zmm1) ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm8, %xmm3 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,2] @@ -14399,7 +14407,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512DQ-NEXT: vpternlogq $226, %zmm3, %zmm0, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm0 & zmm3) | (zmm5 & ~zmm0 & zmm3) | (zmm5 & zmm0 & ~zmm3) | (zmm5 & zmm0 & zmm3) ; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm0 ; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm1 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -14409,7 +14417,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm1 ; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm20 -; AVX512DQ-NEXT: vpternlogd $184, %zmm0, %zmm19, %zmm20 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm20 = (~zmm20 & zmm19 & zmm0) | (zmm20 & ~zmm19 & ~zmm0) 
| (zmm20 & ~zmm19 & zmm0) | (zmm20 & zmm19 & zmm0) ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] @@ -14484,7 +14492,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm2 ; AVX512DQ-NEXT: vpbroadcastd 40(%rax), %ymm4 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm13 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm13 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm13 = (~zmm13 & zmm1 & mem) | (zmm13 & ~zmm1 & ~mem) | (zmm13 & zmm1 & ~mem) | (zmm13 & zmm1 & mem) ; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm7 ; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] @@ -14497,99 +14505,103 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, %zmm3, %zmm4, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & zmm4 & zmm3) | (zmm1 & ~zmm4 & ~zmm3) | (zmm1 & ~zmm4 & zmm3) | (zmm1 & zmm4 & zmm3) ; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm3 = mem[2,1,3,2] ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, 
%ymm1, %ymm29, %ymm3 -; AVX512DQ-NEXT: vpternlogq $184, %ymm3, %ymm28, %ymm30 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (~ymm3 & ymm29 & ymm1) | (ymm3 & ~ymm29 & ~ymm1) | (ymm3 & ~ymm29 & ymm1) | (ymm3 & ymm29 & ymm1) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm30 = (~ymm30 & ymm28 & ymm3) | (ymm30 & ~ymm28 & ~ymm3) | (ymm30 & ~ymm28 & ymm3) | (ymm30 & ymm28 & ymm3) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload ; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm3 = (~zmm3 & ~zmm28 & mem) | (~zmm3 & zmm28 & mem) | (zmm3 & ~zmm28 & mem) | (zmm3 & zmm28 & ~mem) | (zmm3 & zmm28 & mem) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm0 = (~zmm0 & ~zmm28 & mem) | (~zmm0 & zmm28 & mem) | (zmm0 & ~zmm28 & mem) | (zmm0 & zmm28 & ~mem) | (zmm0 & zmm28 & mem) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm23, %zmm28, %zmm18 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = (~zmm18 & ~zmm28 & zmm23) | (zmm18 & ~zmm28 & zmm23) | (zmm18 & zmm28 & ~zmm23) | (zmm18 & zmm28 & zmm23) ; AVX512DQ-NEXT: vmovdqa64 
{{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm18 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = (~zmm18 & ~zmm2 & zmm3) | (zmm18 & ~zmm2 & zmm3) | (zmm18 & zmm2 & ~zmm3) | (zmm18 & zmm2 & zmm3) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm23 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm23 -; AVX512DQ-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm23 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = (~zmm23 & ~zmm28 & zmm3) | (zmm23 & ~zmm28 & zmm3) | (zmm23 & zmm28 & ~zmm3) | (zmm23 & zmm28 & zmm3) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = (~zmm23 & ~zmm2 & zmm0) | (zmm23 & ~zmm2 & zmm0) | (zmm23 & zmm2 & ~zmm0) | (zmm23 & zmm2 & zmm0) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm17 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = (~zmm17 & zmm0 & ~mem) | (zmm17 & ~zmm0 & mem) | (zmm17 & zmm0 & ~mem) | (zmm17 & zmm0 & mem) ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512DQ-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm19, %ymm0 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm0 = (~ymm0 & ~ymm19 & mem) | (ymm0 & ~ymm19 & mem) | (ymm0 & ymm19 & ~mem) | (ymm0 & ymm19 & mem) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-NEXT: vpternlogq $184, %ymm0, %ymm1, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (~ymm2 & ymm1 & ymm0) | (ymm2 & ~ymm1 & ~ymm0) | (ymm2 & ~ymm1 & ymm0) | (ymm2 & ymm1 & ymm0) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512DQ-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm19 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm19 = (~zmm19 & zmm1 & mem) | (zmm19 & ~zmm1 & ~mem) | (zmm19 & ~zmm1 & mem) | (zmm19 & zmm1 & ~mem) | (zmm19 & zmm1 & mem) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm0 = zmm0[0,1,2,3],mem[0,1,2,3] ; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm0 = (~zmm0 & ~zmm1 & mem) | (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & mem) | (zmm0 & zmm1 & ~mem) | (zmm0 & zmm1 & mem) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogd $184, %zmm1, %zmm2, %zmm30 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm30 = (~zmm30 & zmm2 & zmm1) | (zmm30 & ~zmm2 & ~zmm1) | (zmm30 & ~zmm2 & zmm1) | (zmm30 & zmm2 & zmm1) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 
32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogd $184, %zmm1, %zmm2, %zmm11 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm11 = (~zmm11 & zmm2 & zmm1) | (zmm11 & ~zmm2 & ~zmm1) | (zmm11 & ~zmm2 & zmm1) | (zmm11 & zmm2 & zmm1) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, %zmm14, %zmm1, %zmm25 -; AVX512DQ-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm20 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = (~zmm25 & zmm1 & zmm14) | (zmm25 & ~zmm1 & ~zmm14) | (zmm25 & ~zmm1 & zmm14) | (zmm25 & zmm1 & zmm14) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm1 & zmm5) | (zmm20 & ~zmm1 & ~zmm5) | (zmm20 & ~zmm1 & zmm5) | (zmm20 & zmm1 & zmm5) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & ~zmm5 & zmm1) | (zmm2 & ~zmm5 & zmm1) | (zmm2 & zmm5 & ~zmm1) | (zmm2 & zmm5 & zmm1) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm22, %zmm3, %zmm22 -; AVX512DQ-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm22 +; 
AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & ~zmm5 & zmm1) | (zmm22 & ~zmm5 & zmm1) | (zmm22 & zmm5 & ~zmm1) | (zmm22 & zmm5 & zmm1) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] ; AVX512DQ-NEXT: vpermd 64(%rax), %zmm14, %zmm5 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogd $184, %zmm1, %zmm3, %zmm5 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm3 & zmm1) | (zmm5 & ~zmm3 & ~zmm1) | (zmm5 & ~zmm3 & zmm1) | (zmm5 & zmm3 & zmm1) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm24, %zmm26, %zmm1 ; AVX512DQ-NEXT: vpermd (%rax), %zmm14, %zmm14 -; AVX512DQ-NEXT: vpternlogd $184, %zmm1, %zmm3, %zmm14 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm14 = (~zmm14 & zmm3 & zmm1) | (zmm14 & ~zmm3 & ~zmm1) | (zmm14 & ~zmm3 & zmm1) | (zmm14 & zmm3 & zmm1) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm5 -; AVX512DQ-NEXT: vpternlogq $184, %zmm22, %zmm1, %zmm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm1 & zmm2) | (zmm5 & ~zmm1 & ~zmm2) | (zmm5 & ~zmm1 & zmm2) | (zmm5 & zmm1 & zmm2) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & zmm1 & zmm22) | (zmm14 & ~zmm1 & ~zmm22) | (zmm14 & ~zmm1 & zmm22) | (zmm14 & zmm1 & zmm22) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; 
AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm4, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm4 & zmm1) | (zmm2 & ~zmm4 & ~zmm1) | (zmm2 & ~zmm4 & zmm1) | (zmm2 & zmm4 & zmm1) ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm1 = mem[0,1,1,3] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm22 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & ~zmm4 & zmm1) | (zmm22 & ~zmm4 & zmm1) | (zmm22 & zmm4 & ~zmm1) | (zmm22 & zmm4 & zmm1) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm12[0,1,1,3] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm8 -; AVX512DQ-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm4 & zmm1) | (zmm8 & ~zmm4 & zmm1) | (zmm8 & zmm4 & ~zmm1) | (zmm8 & zmm4 & zmm1) ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2] @@ -14633,27 +14645,27 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm10[2,1,3,2] ; AVX512DQ-NEXT: vpbroadcastd 96(%rax), %ymm10 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpternlogd $184, %zmm7, %zmm29, %zmm9 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm9 = (~zmm9 & zmm29 & zmm7) | (zmm9 & ~zmm29 & ~zmm7) | (zmm9 & ~zmm29 & zmm7) | (zmm9 & zmm29 & zmm7) ; 
AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm24, %zmm3 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm26[2,1,3,2] ; AVX512DQ-NEXT: vpbroadcastd 32(%rax), %ymm10 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpternlogd $184, %zmm3, %zmm29, %zmm7 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm7 = (~zmm7 & zmm29 & zmm3) | (zmm7 & ~zmm29 & ~zmm3) | (zmm7 & ~zmm29 & zmm3) | (zmm7 & zmm29 & zmm3) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, %zmm22, %zmm3, %zmm9 -; AVX512DQ-NEXT: vpternlogq $184, %zmm8, %zmm3, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & zmm3 & zmm22) | (zmm9 & ~zmm3 & ~zmm22) | (zmm9 & ~zmm3 & zmm22) | (zmm9 & zmm3 & zmm22) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & zmm3 & zmm8) | (zmm7 & ~zmm3 & ~zmm8) | (zmm7 & ~zmm3 & zmm8) | (zmm7 & zmm3 & zmm8) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm16, %zmm3 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm8 -; AVX512DQ-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm28 & zmm3) | (zmm8 & ~zmm28 & zmm3) | (zmm8 & zmm28 & ~zmm3) | (zmm8 & zmm28 & zmm3) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm27, %zmm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm3 -; AVX512DQ-NEXT: vpternlogq $226, %zmm1, %zmm28, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & ~zmm28 & zmm1) | (zmm3 & ~zmm28 & zmm1) | (zmm3 & zmm28 & ~zmm1) | (zmm3 & zmm28 & zmm1) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512DQ-NEXT: vpternlogq $184, %zmm8, %zmm1, %zmm31 -; AVX512DQ-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm13 -; AVX512DQ-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm0 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm31 = (~zmm31 & zmm1 & zmm8) | (zmm31 & ~zmm1 & ~zmm8) | (zmm31 & ~zmm1 & zmm8) | (zmm31 & zmm1 & zmm8) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = (~zmm13 & zmm1 & zmm3) | (zmm13 & ~zmm1 & ~zmm3) | (zmm13 & ~zmm1 & zmm3) | (zmm13 & zmm1 & zmm3) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm17 & mem) | (zmm0 & ~zmm17 & ~mem) | (zmm0 & zmm17 & ~mem) | (zmm0 & zmm17 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm2 & mem) | (zmm11 & ~zmm2 & ~mem) | (zmm11 & zmm2 & ~mem) | (zmm11 & zmm2 & mem) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 256(%rax) @@ -14770,26 +14782,26 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm16, %zmm11 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm11 & ~mem) | (zmm12 & ~zmm11 & mem) | (zmm12 & zmm11 & ~mem) | (zmm12 & zmm11 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm13, %ymm12, %ymm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ymm12 & ymm13) | (ymm11 & ~ymm12 & ~ymm13) | (ymm11 & ~ymm12 & ymm13) | 
(ymm11 & ymm12 & ~ymm13) | (ymm11 & ymm12 & ymm13) ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm16, %ymm11, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm11 & ymm16) | (ymm6 & ~ymm11 & ~ymm16) | (ymm6 & ~ymm11 & ymm16) | (ymm6 & ymm11 & ~ymm16) | (ymm6 & ymm11 & ymm16) ; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm11 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0] ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm11, %ymm16, %ymm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (~ymm12 & ymm16 & ymm11) | (ymm12 & ~ymm16 & ~ymm11) | (ymm12 & ~ymm16 & ymm11) | (ymm12 & ymm16 & ymm11) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm19 ; AVX512DQ-FCP-NEXT: vprold $16, %ymm10, %ymm11 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm12, %ymm10, %ymm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (~ymm11 & ymm10 & ymm12) | (ymm11 & ~ymm10 & ~ymm12) | (ymm11 & ~ymm10 & ymm12) | (ymm11 & ymm10 & ymm12) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} 
zmm6 = zmm6[0,1,2,3],zmm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14827,7 +14839,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm4, %zmm27, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm27 & zmm4) | (zmm1 & ~zmm27 & zmm4) | (zmm1 & zmm27 & ~zmm4) | (zmm1 & zmm27 & zmm4) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6] ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1 @@ -14879,7 +14891,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm4, %zmm27, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm27 & zmm4) | (zmm0 & ~zmm27 & zmm4) | (zmm0 & zmm27 & ~zmm4) | (zmm0 & zmm27 & zmm4) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7] @@ -14894,7 +14906,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm16, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm4, %zmm19, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm19 & zmm4) | (zmm3 & ~zmm19 & ~zmm4) | (zmm3 & ~zmm19 & zmm4) | (zmm3 & zmm19 & zmm4) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm3 @@ -14915,7 +14927,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7,8,9],ymm4[10],ymm6[11,12],ymm4[13],ymm6[14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm5, %zmm27, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (~zmm6 & ~zmm27 & zmm5) | (zmm6 & ~zmm27 & zmm5) | (zmm6 & zmm27 & ~zmm5) | (zmm6 & zmm27 & zmm5) ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm12 ; AVX512DQ-FCP-NEXT: vprold $16, %ymm26, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm22 @@ -14931,8 +14943,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm25, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm5, %zmm18, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = (~zmm0 & zmm18 & zmm5) | (zmm0 & ~zmm18 & ~zmm5) | (zmm0 & ~zmm18 & zmm5) | (zmm0 & zmm18 & zmm5) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & zmm6 & mem) | (zmm0 & ~zmm6 & ~mem) 
| (zmm0 & zmm6 & ~mem) | (zmm0 & zmm6 & mem) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[3,3,3,3,7,7,7,7] @@ -14965,7 +14977,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,1,1,8,8,10,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm28, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm3 & zmm2) | (zmm5 & ~zmm3 & zmm2) | (zmm5 & zmm3 & ~zmm2) | (zmm5 & zmm3 & zmm2) ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm6 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] @@ -14981,9 +14993,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpbroadcastd 68(%rax), %ymm8 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm2, %zmm21, %zmm31 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm31 = (~zmm31 & zmm21 & zmm2) | (zmm31 & ~zmm21 & ~zmm2) | (zmm31 & ~zmm21 & zmm2) | (zmm31 & zmm21 & zmm2) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm5, %zmm2, %zmm31 +; AVX512DQ-FCP-NEXT: 
vpternlogq {{.*#+}} zmm31 = (~zmm31 & zmm2 & zmm5) | (zmm31 & ~zmm2 & ~zmm5) | (zmm31 & ~zmm2 & zmm5) | (zmm31 & zmm2 & zmm5) ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm8 @@ -15005,7 +15017,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm6, %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (~zmm8 & ~zmm3 & zmm6) | (zmm8 & ~zmm3 & zmm6) | (zmm8 & zmm3 & ~zmm6) | (zmm8 & zmm3 & zmm6) ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm5 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] @@ -15018,8 +15030,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm5 ; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm26 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm3, %zmm21, %zmm26 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm8, %zmm2, %zmm26 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm26 = (~zmm26 & zmm21 & zmm3) | (zmm26 & ~zmm21 & ~zmm3) | (zmm26 & ~zmm21 & zmm3) | (zmm26 & zmm21 & zmm3) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = (~zmm26 & zmm2 & zmm8) | (zmm26 & ~zmm2 & ~zmm8) | (zmm26 & ~zmm2 & zmm8) | (zmm26 & zmm2 & zmm8) ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -15050,7 +15062,7 @@ 
define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm19[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8,9,10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm3, %zmm27, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & ~zmm27 & zmm3) | (zmm5 & ~zmm27 & zmm3) | (zmm5 & zmm27 & ~zmm3) | (zmm5 & zmm27 & zmm3) ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vprold $16, %ymm4, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload @@ -15068,7 +15080,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm1, %zmm21, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & ~zmm21 & zmm1) | (zmm10 & ~zmm21 & zmm1) | (zmm10 & zmm21 & ~zmm1) | (zmm10 & zmm21 & zmm1) ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7] @@ -15077,8 +15089,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm16, %zmm1 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0] ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm16, %zmm22 -; AVX512DQ-FCP-NEXT: vpternlogd $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm22 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm22 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm22 = (~zmm22 & zmm8 & mem) | (zmm22 & ~zmm8 & ~mem) | (zmm22 & zmm8 & ~mem) | (zmm22 & zmm8 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = (~zmm22 & zmm5 & mem) | (zmm22 & ~zmm5 & ~mem) | (zmm22 & zmm5 & ~mem) | (zmm22 & zmm5 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm1 @@ -15109,14 +15121,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpbroadcastd 96(%rax), %ymm20 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm16, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm2, %zmm18, %zmm25 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm25 = (~zmm25 & zmm18 & zmm2) | (zmm25 & ~zmm18 & ~zmm2) | (zmm25 & ~zmm18 & zmm2) | (zmm25 & zmm18 & zmm2) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm10, %zmm27, %zmm25 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = (~zmm25 & zmm27 & zmm10) | (zmm25 & ~zmm27 & ~zmm10) | (zmm25 & ~zmm27 & zmm10) | (zmm25 & zmm27 & zmm10) ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512DQ-FCP-NEXT: vprold $16, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm0, 
%zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm21 & zmm0) | (zmm1 & ~zmm21 & zmm0) | (zmm1 & zmm21 & ~zmm0) | (zmm1 & zmm21 & zmm0) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,2,1,8,8,9,11] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm3 @@ -15128,11 +15140,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm28, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm3, %zmm28, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & ~zmm28 & zmm3) | (zmm7 & ~zmm28 & zmm3) | (zmm7 & zmm28 & ~zmm3) | (zmm7 & zmm28 & zmm3) ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm0, %ymm18, %ymm16 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = (~ymm16 & ymm18 & ymm0) | (ymm16 & ~ymm18 & ~ymm0) | (ymm16 & ~ymm18 & ymm0) | (ymm16 & ymm18 & ymm0) ; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm13 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] @@ -15151,7 +15163,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpbroadcastd 104(%rax), %ymm19 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm5, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = 
[65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm6, %zmm24, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = (~zmm5 & zmm24 & zmm6) | (zmm5 & ~zmm24 & ~zmm6) | (zmm5 & ~zmm24 & zmm6) | (zmm5 & zmm24 & zmm6) ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[3,3,3,3] @@ -15167,10 +15179,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm17, %ymm17 ; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm19 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm17, %zmm20 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm8, %zmm18, %zmm20 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm27, %zmm20 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm20 = (~zmm20 & zmm18 & zmm8) | (zmm20 & ~zmm18 & ~zmm8) | (zmm20 & ~zmm18 & zmm8) | (zmm20 & zmm18 & zmm8) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = (~zmm20 & zmm27 & zmm1) | (zmm20 & ~zmm27 & ~zmm1) | (zmm20 & ~zmm27 & zmm1) | (zmm20 & zmm27 & zmm1) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm7, %zmm1, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm1 & zmm7) | (zmm5 & ~zmm1 & ~zmm7) | (zmm5 & ~zmm1 & zmm7) | (zmm5 & zmm1 & zmm7) ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] ; AVX512DQ-FCP-NEXT: vprold $16, %xmm9, %xmm8 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[1,1,2,3] @@ -15188,7 +15200,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,1,8,8,10,9] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm7 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm8, %zmm28, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & ~zmm28 & zmm8) | (zmm7 & ~zmm28 & zmm8) | (zmm7 & zmm28 & ~zmm8) | (zmm7 & zmm28 & zmm8) ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 @@ -15198,23 +15210,25 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm3 ; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm8 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm2, %zmm24, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm7, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = (~zmm3 & zmm24 & zmm2) | (zmm3 & ~zmm24 & ~zmm2) | (zmm3 & ~zmm24 & zmm2) | (zmm3 & zmm24 & zmm2) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = (~zmm3 & zmm1 & zmm7) | (zmm3 & ~zmm1 & ~zmm7) | (zmm3 & ~zmm1 & zmm7) | (zmm3 & zmm1 & zmm7) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm0, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm13 = (~ymm13 & ~ymm0 & mem) | (ymm13 & ~ymm0 & mem) | (ymm13 & ymm0 & ~mem) | (ymm13 & ymm0 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $184, 
%ymm13, %ymm1, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (~ymm6 & ymm1 & ymm13) | (ymm6 & ~ymm1 & ~ymm13) | (ymm6 & ~ymm1 & ymm13) | (ymm6 & ymm1 & ymm13) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm16, %ymm2, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (~ymm4 & ymm2 & ymm16) | (ymm4 & ~ymm2 & ~ymm16) | (ymm4 & ~ymm2 & ymm16) | (ymm4 & ymm2 & ymm16) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm13 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm13 = (~zmm13 & zmm0 & mem) | (zmm13 & ~zmm0 & ~mem) | (zmm13 & zmm0 & ~mem) | (zmm13 & zmm0 & mem) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm0 = (~zmm0 & ~zmm2 & mem) | (~zmm0 & zmm2 & mem) | (zmm0 & ~zmm2 & mem) | (zmm0 & zmm2 & ~mem) | (zmm0 & zmm2 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm2 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm2 = (~zmm2 & ~zmm4 & mem) | (~zmm2 & zmm4 & mem) | (zmm2 & ~zmm4 & mem) | (zmm2 & zmm4 & ~mem) | (zmm2 & zmm4 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpunpckhwd (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm4 = 
xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] @@ -15254,25 +15268,27 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & zmm12 & mem) | (zmm14 & ~zmm12 & ~mem) | (zmm14 & zmm12 & ~mem) | (zmm14 & zmm12 & mem) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm11, %zmm28, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm28 & zmm11) | (zmm4 & ~zmm28 & zmm11) | (zmm4 & zmm28 & ~zmm11) | (zmm4 & zmm28 & zmm11) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm6, %zmm28, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & ~zmm28 & zmm6) | (zmm7 & ~zmm28 & zmm6) | (zmm7 & zmm28 & ~zmm6) | (zmm7 & zmm28 & zmm6) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm0, %zmm6, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm2, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm6 & zmm0) | (zmm4 & ~zmm6 & zmm0) | (zmm4 & zmm6 & ~zmm0) | (zmm4 & zmm6 & zmm0) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (~zmm7 & ~zmm6 & zmm2) | (zmm7 & ~zmm6 & zmm2) | (zmm7 & zmm6 
& ~zmm2) | (zmm7 & zmm6 & zmm2) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm0 & ~mem) | (zmm2 & ~zmm0 & mem) | (zmm2 & zmm0 & ~mem) | (zmm2 & zmm0 & mem) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm0 = (~zmm0 & zmm1 & mem) | (zmm0 & ~zmm1 & ~mem) | (zmm0 & ~zmm1 & mem) | (zmm0 & zmm1 & ~mem) | (zmm0 & zmm1 & mem) ; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10 +; AVX512DQ-FCP-NEXT: # zmm10 = (~zmm10 & zmm1 & mem) | (zmm10 & ~zmm1 & ~mem) | (zmm10 & ~zmm1 & mem) | (zmm10 & zmm1 & ~mem) | (zmm10 & zmm1 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm2 & mem) | (zmm10 & ~zmm2 & ~mem) | (zmm10 & zmm2 & ~mem) | (zmm10 & zmm2 & mem) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 320(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 256(%rax) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index 311166ef60dda0..47690f3c60edfa 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -7226,7 +7226,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: # ymm5 = mem[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, 
%zmm10 ; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm13 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq $184, %zmm3, %zmm13, %zmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm13 & zmm3) | (zmm10 & ~zmm13 & ~zmm3) | (zmm10 & ~zmm13 & zmm3) | (zmm10 & zmm13 & zmm3) ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload @@ -7253,7 +7253,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm9 -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm9 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & zmm13 & zmm0) | (zmm9 & ~zmm13 & ~zmm0) | (zmm9 & ~zmm13 & zmm0) | (zmm9 & zmm13 & zmm0) ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload @@ -7280,7 +7280,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm11 -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm13 & zmm0) | (zmm11 & ~zmm13 & ~zmm0) | (zmm11 & ~zmm13 & zmm0) | (zmm11 & zmm13 & zmm0) ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -7314,7 +7314,7 @@ define void 
@store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm13 & zmm0) | (zmm4 & ~zmm13 & ~zmm0) | (zmm4 & ~zmm13 & zmm0) | (zmm4 & zmm13 & zmm0) ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm24[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm22[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 @@ -7391,7 +7391,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload ; AVX512-NEXT: # ymm23 = mem[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm5 -; AVX512-NEXT: vpternlogq $184, %zmm12, %zmm13, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm13 & zmm12) | (zmm5 & ~zmm13 & ~zmm12) | (zmm5 & ~zmm13 & zmm12) | (zmm5 & zmm13 & zmm12) ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX512-NEXT: # ymm12 = mem[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload @@ -7427,7 +7427,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload ; AVX512-NEXT: # ymm30 = mem[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm30, %zmm12, %zmm12 -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm12 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm13 & zmm0) | (zmm12 & ~zmm13 & ~zmm0) | (zmm12 & ~zmm13 & zmm0) | (zmm12 & zmm13 & zmm0) ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm30 = xmm2[3,3,3,3] ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm31 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero @@ -7454,7 +7454,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd {{.*#+}} ymm16 = ymm28[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 -; AVX512-NEXT: vpternlogq $184, %zmm7, %zmm13, %zmm16 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm13 & zmm7) | (zmm16 & ~zmm13 & ~zmm7) | (zmm16 & ~zmm13 & zmm7) | (zmm16 & zmm13 & zmm7) ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm18 = ymm22[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm7, %zmm7 @@ -7473,7 +7473,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 -; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm13 & zmm0) | (zmm2 & ~zmm13 & ~zmm0) | (zmm2 & ~zmm13 & zmm0) | (zmm2 & zmm13 & zmm0) ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 @@ -7664,6 +7664,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} zmm7 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm3 = (~zmm3 & ~zmm7 & mem) | (zmm3 & ~zmm7 & mem) | (zmm3 & zmm7 & ~mem) | (zmm3 & zmm7 & mem) ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} zmm2 = 
[65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] ; AVX512-FCP-NEXT: vpandnq %zmm19, %zmm2, %zmm19 ; AVX512-FCP-NEXT: vpandq %zmm2, %zmm21, %zmm21 @@ -7747,30 +7748,32 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm6 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm6 = (~zmm6 & ~zmm7 & mem) | (zmm6 & ~zmm7 & mem) | (zmm6 & zmm7 & ~mem) | (zmm6 & zmm7 & mem) ; AVX512-FCP-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm6 {%k1} ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm15 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm15 = (~zmm15 & ~zmm7 & mem) | (zmm15 & ~zmm7 & mem) | (zmm15 & zmm7 & ~mem) | (zmm15 & zmm7 & mem) ; AVX512-FCP-NEXT: vpandnq (%rsp), %zmm2, %zmm5 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm15 {%k1} -; AVX512-FCP-NEXT: vpternlogq $226, %zmm20, %zmm7, %zmm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & ~zmm7 & zmm20) | (zmm11 & ~zmm7 & zmm20) | (zmm11 & zmm7 & ~zmm20) | (zmm11 & zmm7 & zmm20) ; AVX512-FCP-NEXT: vpandnq %zmm22, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vpandq %zmm2, %zmm23, %zmm8 ; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm11 {%k1} -; AVX512-FCP-NEXT: vpternlogq $226, %zmm24, %zmm7, %zmm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & ~zmm7 & zmm24) | (zmm14 & ~zmm7 & zmm24) | (zmm14 & zmm7 & ~zmm24) | (zmm14 & zmm7 & zmm24) ; AVX512-FCP-NEXT: vpandnq %zmm25, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vpandq 
%zmm2, %zmm26, %zmm8 ; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm14 {%k1} -; AVX512-FCP-NEXT: vpternlogq $226, %zmm27, %zmm7, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm7 & zmm27) | (zmm0 & ~zmm7 & zmm27) | (zmm0 & zmm7 & ~zmm27) | (zmm0 & zmm7 & zmm27) ; AVX512-FCP-NEXT: vpandnq %zmm28, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vpandq %zmm2, %zmm29, %zmm8 ; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm0 {%k1} -; AVX512-FCP-NEXT: vpternlogq $226, %zmm30, %zmm7, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (~zmm1 & ~zmm7 & zmm30) | (zmm1 & ~zmm7 & zmm30) | (zmm1 & zmm7 & ~zmm30) | (zmm1 & zmm7 & zmm30) ; AVX512-FCP-NEXT: vpandnq %zmm31, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vpandq %zmm2, %zmm16, %zmm8 ; AVX512-FCP-NEXT: vpord %zmm5, %zmm8, %zmm1 {%k1} -; AVX512-FCP-NEXT: vpternlogq $226, %zmm19, %zmm7, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm7 & zmm19) | (zmm4 & ~zmm7 & zmm19) | (zmm4 & zmm7 & ~zmm19) | (zmm4 & zmm7 & zmm19) ; AVX512-FCP-NEXT: vpandnq %zmm13, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vpandq %zmm2, %zmm21, %zmm2 ; AVX512-FCP-NEXT: vpord %zmm5, %zmm2, %zmm4 {%k1} @@ -7999,7 +8002,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: # ymm5 = mem[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm10 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm13 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, %zmm3, %zmm13, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = (~zmm10 & zmm13 & zmm3) | (zmm10 & ~zmm13 & ~zmm3) | (zmm10 & ~zmm13 & zmm3) | (zmm10 & zmm13 & zmm3) ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload @@ -8026,7 +8029,7 @@ define void 
@store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm9 -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = (~zmm9 & zmm13 & zmm0) | (zmm9 & ~zmm13 & ~zmm0) | (zmm9 & ~zmm13 & zmm0) | (zmm9 & zmm13 & zmm0) ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload @@ -8053,7 +8056,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm11 -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & zmm13 & zmm0) | (zmm11 & ~zmm13 & ~zmm0) | (zmm11 & ~zmm13 & zmm0) | (zmm11 & zmm13 & zmm0) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -8087,7 +8090,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & zmm13 & zmm0) | (zmm4 & ~zmm13 & ~zmm0) | (zmm4 & ~zmm13 & zmm0) | (zmm4 & zmm13 & zmm0) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm24[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm22[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 @@ 
-8164,7 +8167,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm23 = mem[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpternlogq $184, %zmm12, %zmm13, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (~zmm5 & zmm13 & zmm12) | (zmm5 & ~zmm13 & ~zmm12) | (zmm5 & ~zmm13 & zmm12) | (zmm5 & zmm13 & zmm12) ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm12 = mem[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload @@ -8200,7 +8203,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm30 = mem[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm30, %zmm12, %zmm12 -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm12 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = (~zmm12 & zmm13 & zmm0) | (zmm12 & ~zmm13 & ~zmm0) | (zmm12 & ~zmm13 & zmm0) | (zmm12 & zmm13 & zmm0) ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm30 = xmm2[3,3,3,3] ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero @@ -8227,7 +8230,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm16 = ymm28[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 -; AVX512DQ-NEXT: vpternlogq $184, %zmm7, %zmm13, %zmm16 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = (~zmm16 & zmm13 & zmm7) | (zmm16 & ~zmm13 & ~zmm7) | (zmm16 & ~zmm13 & zmm7) | (zmm16 & zmm13 & zmm7) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: 
vpshufd {{.*#+}} ymm18 = ymm22[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm7, %zmm7 @@ -8246,7 +8249,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (~zmm2 & zmm13 & zmm0) | (zmm2 & ~zmm13 & ~zmm0) | (zmm2 & ~zmm13 & zmm0) | (zmm2 & zmm13 & zmm0) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 @@ -8437,6 +8440,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} zmm7 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm3 = (~zmm3 & ~zmm7 & mem) | (zmm3 & ~zmm7 & mem) | (zmm3 & zmm7 & ~mem) | (zmm3 & zmm7 & mem) ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] ; AVX512DQ-FCP-NEXT: vpandnq %zmm19, %zmm2, %zmm19 ; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm21, %zmm21 @@ -8520,30 +8524,32 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm6 # 64-byte Folded Reload 
+; AVX512DQ-FCP-NEXT: # zmm6 = (~zmm6 & ~zmm7 & mem) | (zmm6 & ~zmm7 & mem) | (zmm6 & zmm7 & ~mem) | (zmm6 & zmm7 & mem) ; AVX512DQ-FCP-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm15 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm15 = (~zmm15 & ~zmm7 & mem) | (zmm15 & ~zmm7 & mem) | (zmm15 & zmm7 & ~mem) | (zmm15 & zmm7 & mem) ; AVX512DQ-FCP-NEXT: vpandnq (%rsp), %zmm2, %zmm5 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm15 {%k1} -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm20, %zmm7, %zmm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (~zmm11 & ~zmm7 & zmm20) | (zmm11 & ~zmm7 & zmm20) | (zmm11 & zmm7 & ~zmm20) | (zmm11 & zmm7 & zmm20) ; AVX512DQ-FCP-NEXT: vpandnq %zmm22, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm23, %zmm8 ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm24, %zmm7, %zmm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (~zmm14 & ~zmm7 & zmm24) | (zmm14 & ~zmm7 & zmm24) | (zmm14 & zmm7 & ~zmm24) | (zmm14 & zmm7 & zmm24) ; AVX512DQ-FCP-NEXT: vpandnq %zmm25, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm26, %zmm8 ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm14 {%k1} -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm27, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (~zmm0 & ~zmm7 & zmm27) | (zmm0 & ~zmm7 & zmm27) | (zmm0 & zmm7 & ~zmm27) | (zmm0 & zmm7 & zmm27) ; AVX512DQ-FCP-NEXT: vpandnq %zmm28, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm29, %zmm8 ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm30, %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} 
zmm1 = (~zmm1 & ~zmm7 & zmm30) | (zmm1 & ~zmm7 & zmm30) | (zmm1 & zmm7 & ~zmm30) | (zmm1 & zmm7 & zmm30) ; AVX512DQ-FCP-NEXT: vpandnq %zmm31, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm16, %zmm8 ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm8, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm19, %zmm7, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (~zmm4 & ~zmm7 & zmm19) | (zmm4 & ~zmm7 & zmm19) | (zmm4 & zmm7 & ~zmm19) | (zmm4 & zmm7 & zmm19) ; AVX512DQ-FCP-NEXT: vpandnq %zmm13, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm21, %zmm2 ; AVX512DQ-FCP-NEXT: vpord %zmm5, %zmm2, %zmm4 {%k1}