Skip to content

Commit

Permalink
[AArch64] Split v8f32 fptosi_sat into two v4f32.
Browse files Browse the repository at this point in the history
If we produce illegal v8f32 types (for example, when an f16 or bf16 source
vector is promoted to f32), the VectorLegalizer will unroll them, scalarizing
the operations. This patch instead pre-splits them into two v4f32 halves
during custom legalization, which produces better results.
  • Loading branch information
davemgreen authored and banach-space committed Aug 7, 2024
1 parent cb185fc commit 7401ef0
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 842 deletions.
42 changes: 34 additions & 8 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4508,21 +4508,28 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
EVT SrcElementVT = SrcVT.getVectorElementType();

// In the absence of FP16 support, promote f16 to f32 and saturate the result.
SDLoc DL(Op);
SDValue SrcVal2;
if ((SrcElementVT == MVT::f16 &&
(!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
SrcElementVT == MVT::bf16) {
MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
// If we are extending to a v8f32, split into two v4f32 to produce legal
// types.
if (F32VT.getSizeInBits() > 128) {
std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
F32VT = F32VT.getHalfNumVectorElementsVT();
}
SrcVT = F32VT;
SrcElementVT = MVT::f32;
SrcElementWidth = 32;
} else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
return SDValue();

SDLoc DL(Op);
// Expand to f64 if we are saturating to i64, to help produce keep the lanes
// the same width and produce a fcvtzu.
// Expand to f64 if we are saturating to i64, to help keep the lanes the same
// width and produce a fcvtzu.
if (SatWidth == 64 && SrcElementWidth < 64) {
MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
Expand All @@ -4531,9 +4538,16 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
SrcElementWidth = 64;
}
// Cases that we can emit directly.
if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
DAG.getValueType(DstVT.getScalarType()));
if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
DAG.getValueType(DstVT.getScalarType()));
if (SrcVal2) {
SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
DAG.getValueType(DstVT.getScalarType()));
return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
}
return Res;
}

// Otherwise we emit a cvt that saturates to a higher BW, and saturate the
// result. This is only valid if the legal cvt is larger than the saturate
Expand All @@ -4545,20 +4559,32 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
DAG.getValueType(IntVT.getScalarType()));
SDValue Sat;
SDValue NativeCvt2 =
SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
DAG.getValueType(IntVT.getScalarType()))
: SDValue();
SDValue Sat, Sat2;
if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
SDValue MinC = DAG.getConstant(
APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
SDValue MaxC = DAG.getConstant(
APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
} else {
SDValue MinC = DAG.getConstant(
APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
}

if (SrcVal2)
Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
IntVT.getDoubleNumVectorElementsVT(*DAG.getContext()),
Sat, Sat2);

return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
}

Expand Down
65 changes: 6 additions & 59 deletions llvm/test/CodeGen/AArch64/fcvt_combine.ll
Original file line number Diff line number Diff line change
Expand Up @@ -466,72 +466,19 @@ define <8 x i16> @test_v8f16_sat(<8 x half> %in) {
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: movi v1.8h, #68, lsl #8
; CHECK-NO16-NEXT: fcvtl v2.4s, v0.4h
; CHECK-NO16-NEXT: mov w8, #32767 // =0x7fff
; CHECK-NO16-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-NO16-NEXT: mov w11, #-32768 // =0xffff8000
; CHECK-NO16-NEXT: fcvtl v3.4s, v1.4h
; CHECK-NO16-NEXT: fcvtl2 v1.4s, v1.8h
; CHECK-NO16-NEXT: fmul v2.4s, v2.4s, v3.4s
; CHECK-NO16-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-NO16-NEXT: fcvtn v1.4h, v2.4s
; CHECK-NO16-NEXT: fcvtn2 v1.8h, v0.4s
; CHECK-NO16-NEXT: fcvtl2 v0.4s, v1.8h
; CHECK-NO16-NEXT: fcvtl v1.4s, v1.4h
; CHECK-NO16-NEXT: mov s2, v0.s[1]
; CHECK-NO16-NEXT: fcvtzs w10, s0
; CHECK-NO16-NEXT: fcvtzs w15, s1
; CHECK-NO16-NEXT: fcvtzs w9, s2
; CHECK-NO16-NEXT: mov s2, v0.s[2]
; CHECK-NO16-NEXT: mov s0, v0.s[3]
; CHECK-NO16-NEXT: cmp w9, w8
; CHECK-NO16-NEXT: fcvtzs w12, s2
; CHECK-NO16-NEXT: mov s2, v1.s[1]
; CHECK-NO16-NEXT: csel w9, w9, w8, lt
; CHECK-NO16-NEXT: fcvtzs w13, s0
; CHECK-NO16-NEXT: mov s0, v1.s[2]
; CHECK-NO16-NEXT: cmn w9, #8, lsl #12 // =32768
; CHECK-NO16-NEXT: csel w9, w9, w11, gt
; CHECK-NO16-NEXT: cmp w10, w8
; CHECK-NO16-NEXT: csel w10, w10, w8, lt
; CHECK-NO16-NEXT: fcvtzs w14, s2
; CHECK-NO16-NEXT: cmn w10, #8, lsl #12 // =32768
; CHECK-NO16-NEXT: fcvtzs w16, s0
; CHECK-NO16-NEXT: mov s0, v1.s[3]
; CHECK-NO16-NEXT: csel w10, w10, w11, gt
; CHECK-NO16-NEXT: cmp w12, w8
; CHECK-NO16-NEXT: csel w12, w12, w8, lt
; CHECK-NO16-NEXT: fmov s1, w10
; CHECK-NO16-NEXT: cmn w12, #8, lsl #12 // =32768
; CHECK-NO16-NEXT: csel w12, w12, w11, gt
; CHECK-NO16-NEXT: cmp w13, w8
; CHECK-NO16-NEXT: csel w13, w13, w8, lt
; CHECK-NO16-NEXT: mov v1.s[1], w9
; CHECK-NO16-NEXT: fcvtzs w9, s0
; CHECK-NO16-NEXT: cmn w13, #8, lsl #12 // =32768
; CHECK-NO16-NEXT: csel w13, w13, w11, gt
; CHECK-NO16-NEXT: cmp w14, w8
; CHECK-NO16-NEXT: csel w14, w14, w8, lt
; CHECK-NO16-NEXT: cmn w14, #8, lsl #12 // =32768
; CHECK-NO16-NEXT: mov v1.s[2], w12
; CHECK-NO16-NEXT: csel w14, w14, w11, gt
; CHECK-NO16-NEXT: cmp w15, w8
; CHECK-NO16-NEXT: csel w15, w15, w8, lt
; CHECK-NO16-NEXT: cmn w15, #8, lsl #12 // =32768
; CHECK-NO16-NEXT: csel w10, w15, w11, gt
; CHECK-NO16-NEXT: cmp w16, w8
; CHECK-NO16-NEXT: mov v1.s[3], w13
; CHECK-NO16-NEXT: fmov s2, w10
; CHECK-NO16-NEXT: csel w10, w16, w8, lt
; CHECK-NO16-NEXT: cmn w10, #8, lsl #12 // =32768
; CHECK-NO16-NEXT: csel w10, w10, w11, gt
; CHECK-NO16-NEXT: cmp w9, w8
; CHECK-NO16-NEXT: mov v2.s[1], w14
; CHECK-NO16-NEXT: csel w8, w9, w8, lt
; CHECK-NO16-NEXT: cmn w8, #8, lsl #12 // =32768
; CHECK-NO16-NEXT: csel w8, w8, w11, gt
; CHECK-NO16-NEXT: mov v2.s[2], w10
; CHECK-NO16-NEXT: mov v2.s[3], w8
; CHECK-NO16-NEXT: uzp1 v0.8h, v2.8h, v1.8h
; CHECK-NO16-NEXT: fcvtl v0.4s, v1.4h
; CHECK-NO16-NEXT: fcvtl2 v1.4s, v1.8h
; CHECK-NO16-NEXT: fcvtzs v0.4s, v0.4s
; CHECK-NO16-NEXT: fcvtzs v1.4s, v1.4s
; CHECK-NO16-NEXT: sqxtn v0.4h, v0.4s
; CHECK-NO16-NEXT: sqxtn2 v0.8h, v1.4s
; CHECK-NO16-NEXT: ret
;
; CHECK-FP16-LABEL: test_v8f16_sat:
Expand Down
Loading

0 comments on commit 7401ef0

Please sign in to comment.