Skip to content

Commit

Permalink
[AMDGPU] Fix sign confusion in performMulLoHiCombine (#105831)
Browse files Browse the repository at this point in the history
SMUL_LOHI and UMUL_LOHI are different operations because the high part
of the result is different, so it is not OK to optimize the signed
version to MUL_U24/MULHI_U24 or the unsigned version to
MUL_I24/MULHI_I24.
  • Loading branch information
jayfoad authored and tru committed Sep 13, 2024
1 parent 373180b commit 93998af
Show file tree
Hide file tree
Showing 2 changed files with 116 additions and 12 deletions.
30 changes: 18 additions & 12 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4349,6 +4349,7 @@ AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);

bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);

Expand All @@ -4363,20 +4364,25 @@ AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,

// Try to use two fast 24-bit multiplies (one for each half of the result)
// instead of one slow extending multiply.
unsigned LoOpcode, HiOpcode;
if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
LoOpcode = AMDGPUISD::MUL_U24;
HiOpcode = AMDGPUISD::MULHI_U24;
} else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
LoOpcode = AMDGPUISD::MUL_I24;
HiOpcode = AMDGPUISD::MULHI_I24;
unsigned LoOpcode = 0;
unsigned HiOpcode = 0;
if (Signed) {
if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
LoOpcode = AMDGPUISD::MUL_I24;
HiOpcode = AMDGPUISD::MULHI_I24;
}
} else {
return SDValue();
if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
LoOpcode = AMDGPUISD::MUL_U24;
HiOpcode = AMDGPUISD::MULHI_U24;
}
}
if (!LoOpcode)
return SDValue();

SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
Expand Down
98 changes: 98 additions & 0 deletions llvm/test/CodeGen/AMDGPU/mul_int24.ll
Original file line number Diff line number Diff line change
Expand Up @@ -813,4 +813,102 @@ bb7:
ret void

}

define amdgpu_kernel void @test_umul_i24(ptr addrspace(1) %out, i32 %arg) {
; SI-LABEL: test_umul_i24:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s1, s[2:3], 0xb
; SI-NEXT: v_mov_b32_e32 v0, 0xff803fe1
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s1, s1, 9
; SI-NEXT: v_mul_hi_u32 v0, s1, v0
; SI-NEXT: s_mul_i32 s1, s1, 0xff803fe1
; SI-NEXT: v_alignbit_b32 v0, v0, s1, 1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_umul_i24:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[2:3], 0x2c
; VI-NEXT: v_mov_b32_e32 v0, 0xff803fe1
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s0, s0, 9
; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s0, v0, 0
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_mov_b32 s1, s0
; VI-NEXT: v_alignbit_b32 v0, v1, v0, 1
; VI-NEXT: s_nop 1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul_i24:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s1, s[2:3], 0x2c
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s1, s1, 9
; GFX9-NEXT: s_mul_hi_u32 s4, s1, 0xff803fe1
; GFX9-NEXT: s_mul_i32 s1, s1, 0xff803fe1
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1
; GFX9-NEXT: s_mov_b32 s1, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_umul_i24:
; EG: ; %bb.0:
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
; EG-NEXT: MULHI * T0.X, PV.W, literal.x,
; EG-NEXT: -8372255(nan), 0(0.000000e+00)
; EG-NEXT: MULLO_INT * T0.Y, T0.W, literal.x,
; EG-NEXT: -8372255(nan), 0(0.000000e+00)
; EG-NEXT: BIT_ALIGN_INT T0.X, T0.X, PS, 1,
; EG-NEXT: MOV * T1.X, literal.x,
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
;
; CM-LABEL: test_umul_i24:
; CM: ; %bb.0:
; CM-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.W, KC0[2].Z, literal.x,
; CM-NEXT: 9(1.261169e-44), 0(0.000000e+00)
; CM-NEXT: MULHI T0.X, T0.W, literal.x,
; CM-NEXT: MULHI T0.Y (MASKED), T0.W, literal.x,
; CM-NEXT: MULHI T0.Z (MASKED), T0.W, literal.x,
; CM-NEXT: MULHI * T0.W (MASKED), T0.W, literal.x,
; CM-NEXT: -8372255(nan), 0(0.000000e+00)
; CM-NEXT: MULLO_INT T0.X (MASKED), T0.W, literal.x,
; CM-NEXT: MULLO_INT T0.Y, T0.W, literal.x,
; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, literal.x,
; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, literal.x,
; CM-NEXT: -8372255(nan), 0(0.000000e+00)
; CM-NEXT: BIT_ALIGN_INT * T0.X, T0.X, PV.Y, 1,
; CM-NEXT: MOV * T1.X, literal.x,
; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
%i = lshr i32 %arg, 9
%i1 = zext i32 %i to i64
%i2 = mul i64 %i1, 4286595041
%i3 = lshr i64 %i2, 1
%i4 = trunc i64 %i3 to i32
store i32 %i4, ptr addrspace(1) null, align 4
ret void
}

attributes #0 = { nounwind }

0 comments on commit 93998af

Please sign in to comment.