From 93998aff7662d9b3f94d9627179dffe342e2b399 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 27 Aug 2024 17:09:40 +0100 Subject: [PATCH] [AMDGPU] Fix sign confusion in performMulLoHiCombine (#105831) SMUL_LOHI and UMUL_LOHI are different operations because the high part of the result is different, so it is not OK to optimize the signed version to MUL_U24/MULHI_U24 or the unsigned version to MUL_I24/MULHI_I24. --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 30 +++--- llvm/test/CodeGen/AMDGPU/mul_int24.ll | 98 +++++++++++++++++++ 2 files changed, 116 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 39ae7c96cf7729..a71c9453d968dd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4349,6 +4349,7 @@ AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); + bool Signed = N->getOpcode() == ISD::SMUL_LOHI; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -4363,20 +4364,25 @@ AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N, // Try to use two fast 24-bit multiplies (one for each half of the result) // instead of one slow extending multiply. - unsigned LoOpcode, HiOpcode; - if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { - N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); - N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); - LoOpcode = AMDGPUISD::MUL_U24; - HiOpcode = AMDGPUISD::MULHI_U24; - } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { - N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); - N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); - LoOpcode = AMDGPUISD::MUL_I24; - HiOpcode = AMDGPUISD::MULHI_I24; + unsigned LoOpcode = 0; + unsigned HiOpcode = 0; + if (Signed) { + if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { + N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); + LoOpcode = AMDGPUISD::MUL_I24; + HiOpcode = AMDGPUISD::MULHI_I24; + } } else { - return SDValue(); + if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { + N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); + LoOpcode = AMDGPUISD::MUL_U24; + HiOpcode = AMDGPUISD::MULHI_U24; + } } + if (!LoOpcode) + return SDValue(); SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1); SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1); diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index be77a10380c49b..8f4c48fae6fb31 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -813,4 +813,102 @@ bb7: ret void } + +define amdgpu_kernel void @test_umul_i24(ptr addrspace(1) %out, i32 %arg) { +; SI-LABEL: test_umul_i24: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s1, s[2:3], 0xb +; SI-NEXT: v_mov_b32_e32 v0, 0xff803fe1 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s1, s1, 9 +; SI-NEXT: v_mul_hi_u32 v0, s1, v0 +; SI-NEXT: s_mul_i32 s1, s1, 0xff803fe1 +; SI-NEXT: v_alignbit_b32 v0, v0, s1, 1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s1, s0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: test_umul_i24: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[2:3], 0x2c +; VI-NEXT: v_mov_b32_e32 v0, 0xff803fe1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s0, s0, 9 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s0, v0, 0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_mov_b32 s1, s0 +; VI-NEXT: v_alignbit_b32 v0, v1, v0, 1 +; VI-NEXT: s_nop 1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_umul_i24: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x2c +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s1, s1, 9 +; GFX9-NEXT: s_mul_hi_u32 s4, s1, 0xff803fe1 +; GFX9-NEXT: s_mul_i32 s1, s1, 0xff803fe1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 +; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; EG-LABEL: test_umul_i24: +; EG: ; %bb.0: +; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00) +; EG-NEXT: MULHI * T0.X, PV.W, literal.x, +; EG-NEXT: -8372255(nan), 0(0.000000e+00) +; EG-NEXT: MULLO_INT * T0.Y, T0.W, literal.x, +; EG-NEXT: -8372255(nan), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T0.X, T0.X, PS, 1, +; EG-NEXT: MOV * T1.X, literal.x, +; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) +; +; CM-LABEL: test_umul_i24: +; CM: ; %bb.0: +; CM-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: LSHR * T0.W, KC0[2].Z, literal.x, +; CM-NEXT: 9(1.261169e-44), 0(0.000000e+00) +; CM-NEXT: MULHI T0.X, T0.W, literal.x, +; CM-NEXT: MULHI T0.Y (MASKED), T0.W, literal.x, +; CM-NEXT: MULHI T0.Z (MASKED), T0.W, literal.x, +; CM-NEXT: MULHI * T0.W (MASKED), T0.W, literal.x, +; CM-NEXT: -8372255(nan), 0(0.000000e+00) +; CM-NEXT: MULLO_INT T0.X (MASKED), T0.W, literal.x, +; CM-NEXT: MULLO_INT T0.Y, T0.W, literal.x, +; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, literal.x, +; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, literal.x, +; CM-NEXT: -8372255(nan), 0(0.000000e+00) +; CM-NEXT: BIT_ALIGN_INT * T0.X, T0.X, PV.Y, 1, +; CM-NEXT: MOV * T1.X, literal.x, +; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00) + %i = lshr i32 %arg, 9 + %i1 = zext i32 %i to i64 + %i2 = mul i64 %i1, 4286595041 + %i3 = lshr i64 %i2, 1 + %i4 = trunc i64 %i3 to i32 + store i32 %i4, ptr addrspace(1) null, align 4 + ret void +} + attributes #0 = { nounwind }