[DAG][X86] expandABD - add branchless abds/abdu expansion for 0/-1 comparison result cases #92780
Conversation
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

If the comparison results are allbits masks, we can expand as abd(lhs, rhs) -> sub(cmpgt(lhs, rhs), xor(sub(lhs, rhs), cmpgt(lhs, rhs))), replacing a sub+sub+select pattern with the simpler sub+xor+sub pattern.

This allows us to remove a lot of X86-specific legalization code, and will be useful in future generic expansion for the legalization work in #92576.

Alive2: https://alive2.llvm.org/ce/z/sj863C

Patch is 106.57 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/92780.diff

8 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 3ec6b9b795079..268e3f5f505a5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9194,11 +9194,20 @@ SDValue TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const {
DAG.getNode(ISD::USUBSAT, dl, VT, LHS, RHS),
DAG.getNode(ISD::USUBSAT, dl, VT, RHS, LHS));
- // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
- // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
SDValue Cmp = DAG.getSetCC(dl, CCVT, LHS, RHS, CC);
+
+ // Branchless expansion iif cmp result is allbits:
+ // abd(lhs, rhs) -> sub(cmpgt(lhs, rhs), xor(sub(lhs, rhs), cmpgt(lhs, rhs)))
+ if (CCVT == VT && getBooleanContents(VT) == ZeroOrNegativeOneBooleanContent) {
+ SDValue Diff = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, Diff, Cmp);
+ return DAG.getNode(ISD::SUB, dl, VT, Cmp, Xor);
+ }
+
+ // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
+ // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
return DAG.getSelect(dl, VT, Cmp, DAG.getNode(ISD::SUB, dl, VT, LHS, RHS),
DAG.getNode(ISD::SUB, dl, VT, RHS, LHS));
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5d0846453685f..c1c1ebbb7a70f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1108,13 +1108,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
}
- setOperationAction(ISD::ABDU, MVT::v16i8, Custom);
- setOperationAction(ISD::ABDS, MVT::v16i8, Custom);
- setOperationAction(ISD::ABDU, MVT::v8i16, Custom);
- setOperationAction(ISD::ABDS, MVT::v8i16, Custom);
- setOperationAction(ISD::ABDU, MVT::v4i32, Custom);
- setOperationAction(ISD::ABDS, MVT::v4i32, Custom);
-
setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
@@ -1135,6 +1128,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ABS, VT, Custom);
+ setOperationAction(ISD::ABDS, VT, Custom);
+ setOperationAction(ISD::ABDU, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -1336,11 +1331,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
- for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
- setOperationAction(ISD::ABDS, VT, Custom);
- setOperationAction(ISD::ABDU, VT, Custom);
- }
-
setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
@@ -28421,18 +28411,6 @@ static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
}
}
- // TODO: Move to TargetLowering expandABD().
- if (!Subtarget.hasSSE41() &&
- ((IsSigned && VT == MVT::v16i8) || VT == MVT::v4i32)) {
- SDValue LHS = DAG.getFreeze(Op.getOperand(0));
- SDValue RHS = DAG.getFreeze(Op.getOperand(1));
- ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
- SDValue Cmp = DAG.getSetCC(dl, VT, LHS, RHS, CC);
- SDValue Diff0 = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
- SDValue Diff1 = DAG.getNode(ISD::SUB, dl, VT, RHS, LHS);
- return getBitSelect(dl, VT, Diff0, Diff1, Cmp, DAG);
- }
-
// Default to expand.
return SDValue();
}
diff --git a/llvm/test/CodeGen/X86/abds-vector-128.ll b/llvm/test/CodeGen/X86/abds-vector-128.ll
index 3143bf6190657..bcb42002fb08e 100644
--- a/llvm/test/CodeGen/X86/abds-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abds-vector-128.ll
@@ -12,14 +12,12 @@
define <16 x i8> @abd_ext_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: abd_ext_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubb %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: psubb %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v16i8:
@@ -47,14 +45,12 @@ define <16 x i8> @abd_ext_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <16 x i8> @abd_ext_v16i8_undef(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: abd_ext_v16i8_undef:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubb %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: psubb %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v16i8_undef:
@@ -128,14 +124,12 @@ define <8 x i16> @abd_ext_v8i16_undef(<8 x i16> %a, <8 x i16> %b) nounwind {
define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_ext_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v4i32:
@@ -163,14 +157,12 @@ define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_ext_v4i32_undef:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v4i32_undef:
@@ -198,61 +190,48 @@ define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind {
define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_ext_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: movq %rdx, %rsi
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rdi
-; SSE2-NEXT: movq %rdi, %r8
-; SSE2-NEXT: sarq $63, %r8
-; SSE2-NEXT: movq %xmm1, %r9
-; SSE2-NEXT: movq %r9, %r10
-; SSE2-NEXT: sarq $63, %r10
-; SSE2-NEXT: subq %r9, %rdx
-; SSE2-NEXT: sbbq %r10, %rsi
-; SSE2-NEXT: subq %rdi, %rax
-; SSE2-NEXT: sbbq %r8, %rcx
-; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: xorq %rcx, %rax
-; SSE2-NEXT: subq %rcx, %rax
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: xorq %rsi, %rdx
-; SSE2-NEXT: subq %rsi, %rdx
-; SSE2-NEXT: movq %rdx, %xmm0
-; SSE2-NEXT: movq %rax, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_ext_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v2i64:
@@ -272,61 +251,48 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_ext_v2i64_undef:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: movq %rdx, %rsi
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rdi
-; SSE2-NEXT: movq %rdi, %r8
-; SSE2-NEXT: sarq $63, %r8
-; SSE2-NEXT: movq %xmm1, %r9
-; SSE2-NEXT: movq %r9, %r10
-; SSE2-NEXT: sarq $63, %r10
-; SSE2-NEXT: subq %r9, %rdx
-; SSE2-NEXT: sbbq %r10, %rsi
-; SSE2-NEXT: subq %rdi, %rax
-; SSE2-NEXT: sbbq %r8, %rcx
-; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: xorq %rcx, %rax
-; SSE2-NEXT: subq %rcx, %rax
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: xorq %rsi, %rdx
-; SSE2-NEXT: subq %rsi, %rdx
-; SSE2-NEXT: movq %rdx, %xmm0
-; SSE2-NEXT: movq %rax, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v2i64_undef:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_ext_v2i64_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v2i64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v2i64_undef:
@@ -350,14 +316,12 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
define <16 x i8> @abd_minmax_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: abd_minmax_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubb %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: psubb %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_minmax_v16i8:
@@ -404,14 +368,12 @@ define <8 x i16> @abd_minmax_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <4 x i32> @abd_minmax_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_minmax_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_minmax_v4i32:
@@ -445,47 +407,40 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_minmax_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_minmax_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_minmax_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_minmax_v2i64:
@@ -507,14 +462,12 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <16 x i8> @abd_cmp_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: abd_cmp_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubb %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: psubb %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_cmp_v16i8:
@@ -563,14 +516,12 @@ define <8 x i16> @abd_cmp_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <4 x i32> @abd_cmp_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_cmp_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_cmp_v4i32:
@@ -598,9 +549,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_cmp_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
@@ -609,12 +560,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psubq %xmm1, %xmm3
-; SSE2-NEXT: psubq %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT...
[truncated]
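As a scalar illustration of the expansion described above, here is a minimal standalone C++ sketch (not part of the patch); the cmp variable models the allbits setcc result:

#include <cstdint>
#include <cstdio>

// Scalar model of the branchless expansion, assuming the comparison
// produces an all-bits mask (0 or -1) as the vector setcc does here.
static int8_t abds_branchless(int8_t lhs, int8_t rhs) {
  int8_t cmp = lhs > rhs ? -1 : 0;    // cmpgt(lhs, rhs) as a 0/-1 mask
  int8_t diff = (int8_t)(lhs - rhs);  // sub(lhs, rhs), wrapping
  // xor with -1 gives ~diff; xor with 0 is a no-op. Either way,
  // sub(cmp, diff ^ cmp) evaluates to |lhs - rhs| (modulo wrap).
  return (int8_t)(cmp - (int8_t)(diff ^ cmp));
}

int main() {
  printf("%d\n", abds_branchless(7, 3)); // 4
  printf("%d\n", abds_branchless(3, 7)); // 4
  return 0;
}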
@llvm/pr-subscribers-llvm-selectiondag
✅ With the latest revision this PR passed the C/C++ code formatter.
EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
SDValue Cmp = DAG.getSetCC(dl, CCVT, LHS, RHS, CC);

// Branchless expansion iif cmp result is allbits:
// abd(lhs, rhs) -> sub(cmpgt(lhs, rhs), xor(sub(lhs, rhs), cmpgt(lhs, rhs)))
That's clever, took me a second.
FWIW, for avx512 targets, I think vblendvpd -> vpternlogd is probably the best codegen (the new logic has less ILP).
LGTM. Wait a day or so to give others time to review before pushing.
// abds(lhs, rhs) -> sub(sgt(lhs, rhs), xor(sgt(lhs, rhs), sub(lhs, rhs)))
// abdu(lhs, rhs) -> sub(ugt(lhs, rhs), xor(ugt(lhs, rhs), sub(lhs, rhs)))
I think you need to change either the comparison or the sub to make this work, e.g.:

Suggested change:
-// abds(lhs, rhs) -> sub(sgt(lhs, rhs), xor(sgt(lhs, rhs), sub(lhs, rhs)))
-// abdu(lhs, rhs) -> sub(ugt(lhs, rhs), xor(ugt(lhs, rhs), sub(lhs, rhs)))
+// abds(lhs, rhs) -> sub(sgt(lhs, rhs), xor(sgt(lhs, rhs), sub(rhs, lhs)))
+// abdu(lhs, rhs) -> sub(ugt(lhs, rhs), xor(ugt(lhs, rhs), sub(rhs, lhs)))
Alive says no? https://alive2.llvm.org/ce/z/sj863C
I've double-checked with a second abd (max/min) implementation as well: https://alive2.llvm.org/ce/z/wyAMWa
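For readers without Alive2 handy, the identity can also be checked exhaustively at i8 width; a minimal standalone sketch (illustrative only, not part of the PR):

#include <cstdint>
#include <cstdio>

// Exhaustively compare the select-based ABDS expansion against the
// branchless one over every pair of i8 values.
int main() {
  for (int a = -128; a <= 127; ++a) {
    for (int b = -128; b <= 127; ++b) {
      int8_t lhs = (int8_t)a, rhs = (int8_t)b;
      // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
      int8_t ref = lhs > rhs ? (int8_t)(lhs - rhs) : (int8_t)(rhs - lhs);
      // abds(lhs, rhs) -> sub(cmpgt, xor(sub(lhs,rhs), cmpgt))
      int8_t cmp = lhs > rhs ? -1 : 0;
      int8_t alt = (int8_t)(cmp - (int8_t)((int8_t)(lhs - rhs) ^ cmp));
      if (ref != alt) {
        printf("mismatch at lhs=%d rhs=%d\n", a, b);
        return 1;
      }
    }
  }
  printf("branchless abds matches the select form for all i8 pairs\n");
  return 0;
}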
Yeah, good point, sorry. For some reason I thought the operands of the final sub were the other way round.
Thanks for checking! OK to commit?
…mparison result cases

If the comparison results are allbits masks, we can expand as "abd(lhs, rhs) -> sub(cmpgt(lhs, rhs), xor(sub(lhs, rhs), cmpgt(lhs, rhs)))", replacing a sub+sub+select pattern with the simpler sub+xor+sub pattern.

This allows us to remove a lot of X86 specific legalization code, and will be useful in future generic expansion for the legalization work in llvm#92576

Alive2: https://alive2.llvm.org/ce/z/sj863C
LGTM
If the comparison results are allbits masks, we can expand as
abd(lhs, rhs) -> sub(cmpgt(lhs, rhs), xor(sub(lhs, rhs), cmpgt(lhs, rhs))),
replacing a sub+sub+select pattern with the simpler sub+xor+sub pattern.

This allows us to remove a lot of X86-specific legalization code, and will be useful in future generic expansion for the legalization work in #92576.

Alive2: https://alive2.llvm.org/ce/z/sj863C
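A hand-worked example of the formula covering both comparison outcomes (illustrative):

lhs = 7, rhs = 3: cmpgt = -1 (allbits), sub(7,3) = 4, xor(4,-1) = -5, sub(-1,-5) = 4
lhs = 3, rhs = 7: cmpgt = 0, sub(3,7) = -4, xor(-4,0) = -4, sub(0,-4) = 4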