diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 839006cbaed4c6..7b9e6c0a002739 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17846,6 +17846,22 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
     return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
   }
 
+  // v16i16/v32i8 selects without AVX2, if the condition and another operand
+  // are free to split, then better to split before expanding the
+  // select. Don't bother with XOP as it has the fast VPCMOV instruction.
+  // TODO: This is very similar to narrowVectorSelect.
+  // TODO: Add Load splitting to isFreeToSplitVector ?
+  if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
+      !Subtarget.hasXOP()) {
+    bool FreeCond = isFreeToSplitVector(Cond.getNode(), DAG);
+    bool FreeLHS = isFreeToSplitVector(LHS.getNode(), DAG) ||
+                   (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
+    bool FreeRHS = isFreeToSplitVector(RHS.getNode(), DAG) ||
+                   (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
+    if (FreeCond && (FreeLHS || FreeRHS))
+      return splitVectorOp(Op, DAG, dl);
+  }
+
   // Only some types will be legal on some subtargets. If we can emit a legal
   // VSELECT-matching blend, return Op, and but if we need to expand, return
   // a null value.
diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll
index f976222ac3b378..84317ad34fb29d 100644
--- a/llvm/test/CodeGen/X86/vselect-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll
@@ -1509,16 +1509,16 @@ define void @store_blend_load_v16i16(ptr %a0, ptr %a1, ptr %a2) {
 ; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8]
-; AVX1-NEXT:    vpmaxuw %xmm2, %xmm1, %xmm3
-; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpmaxuw %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vandnps (%rsi), %ymm0, %ymm1
-; AVX1-NEXT:    vandps (%rdi), %ymm0, %ymm0
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
-; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    vpmaxuw %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpcmpeqw %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpmaxuw %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vmovdqa (%rsi), %xmm4
+; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm5
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpblendvb %xmm3, %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT:    vmovdqa %xmm1, 16(%rdx)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: store_blend_load_v16i16:
@@ -1578,16 +1578,16 @@ define void @store_blend_load_v32i8(ptr %a0, ptr %a1, ptr %a2) {
 ; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX1-NEXT:    vpmaxub %xmm2, %xmm1, %xmm3
-; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpmaxub %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vandnps (%rsi), %ymm0, %ymm1
-; AVX1-NEXT:    vandps (%rdi), %ymm0, %ymm0
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vmovaps %ymm0, (%rdx)
-; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    vpmaxub %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpmaxub %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vmovdqa (%rsi), %xmm4
+; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm5
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpblendvb %xmm3, %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT:    vmovdqa %xmm1, 16(%rdx)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: store_blend_load_v32i8:
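
For reference, below is a minimal sketch of the IR shape this heuristic targets: a 256-bit v16i16 select on an AVX1-only (no AVX2/XOP) target where the condition is a compare and both value operands are one-use loads, with the result stored back to memory. This is an assumed illustration written for this note, not copied from vselect-pcmp.ll; the function name, compare constant, and llc flags are hypothetical.

; Hypothetical example (assumed, not the actual test): compiled with something like
; llc -mtriple=x86_64-- -mattr=+avx, the updated lowering should emit two 128-bit
; vpblendvb blends per half instead of building a 256-bit mask with vinsertf128 and
; expanding the select to vandps/vandnps/vorps on ymm registers.
define void @split_blend_store_v16i16(ptr %a0, ptr %a1, ptr %a2) {
  %x = load <16 x i16>, ptr %a0   ; one-use load: cheap to re-load as two 128-bit halves
  %y = load <16 x i16>, ptr %a1   ; one-use load: cheap to re-load as two 128-bit halves
  ; the compare condition can also be produced per 128-bit half
  %c = icmp ugt <16 x i16> %x, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %r = select <16 x i1> %c, <16 x i16> %x, <16 x i16> %y
  store <16 x i16> %r, ptr %a2
  ret void
}

The motivation is visible in the updated AVX1 check lines above: AVX1 has no 256-bit integer blend, so the unsplit select had to expand into a ymm and/andn/or sequence, whereas splitting first lets each half use a single vpblendvb and drops the vinsertf128/vzeroupper entirely. Subtargets with AVX2 or XOP keep their existing lowerings, since the new condition explicitly excludes them.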