From 269cefbc02986d460eee44a4e95024261be87656 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 7 Aug 2024 12:13:39 +0200 Subject: [PATCH] AMDGPU/GlobalISel: Fix isExtractHiElt when selecting fma_mix (#102130) isExtractHiElt should return new source register instead of returning instruction that defines it. Src = MI.getOperand(0).getReg() is not correct when MI(for example G_UNMERGE_VALUES) defines multiple registers. Refactor existing code to work with source registers only. --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 164 ++++++++---------- .../Target/AMDGPU/AMDGPUInstructionSelector.h | 2 +- .../GlobalISel/combine-fma-add-ext-fma.ll | 8 +- 3 files changed, 74 insertions(+), 100 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 73f3921b2ff4c4..f78699f88de56c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1372,8 +1372,8 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const { MachineInstrBuilder SelectedMI; MachineOperand &LHS = I.getOperand(2); MachineOperand &RHS = I.getOperand(3); - auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS); - auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS); + auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg()); + auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg()); Register Src0Reg = copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true); Register Src1Reg = @@ -2467,14 +2467,48 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { return false; } +static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) { + return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg; +} + +static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) { + Register BitcastSrc; + if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc)))) + Reg = BitcastSrc; + return Reg; +} + static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out) { + Register Trunc; + if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc)))) + return false; + Register LShlSrc; - if (mi_match(In, MRI, - m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) { - Out = LShlSrc; + Register Cst; + if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) { + Cst = stripCopy(Cst, MRI); + if (mi_match(Cst, MRI, m_SpecificICst(16))) { + Out = stripBitCast(LShlSrc, MRI); + return true; + } + } + + MachineInstr *Shuffle = MRI.getVRegDef(Trunc); + if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR) + return false; + + assert(MRI.getType(Shuffle->getOperand(0).getReg()) == + LLT::fixed_vector(2, 16)); + + ArrayRef Mask = Shuffle->getOperand(3).getShuffleMask(); + assert(Mask.size() == 2); + + if (Mask[0] == 1 && Mask[1] <= 1) { + Out = Shuffle->getOperand(0).getReg(); return true; } + return false; } @@ -3550,11 +3584,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { } -std::pair -AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root, - bool IsCanonicalizing, - bool AllowAbs, bool OpSel) const { - Register Src = Root.getReg(); +std::pair AMDGPUInstructionSelector::selectVOP3ModsImpl( + Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const { unsigned Mods = 0; MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); @@ -3617,7 +3648,7 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); return {{ [=](MachineInstrBuilder &MIB) { @@ -3633,7 +3664,7 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root, + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true, /*AllowAbs=*/false); @@ -3660,7 +3691,7 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); return {{ [=](MachineInstrBuilder &MIB) { @@ -3675,7 +3706,8 @@ AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing( MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false); + std::tie(Src, Mods) = + selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false); return {{ [=](MachineInstrBuilder &MIB) { @@ -3689,8 +3721,9 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true, - /*AllowAbs=*/false); + std::tie(Src, Mods) = + selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true, + /*AllowAbs=*/false); return {{ [=](MachineInstrBuilder &MIB) { @@ -4016,7 +4049,7 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); // FIXME: Handle op_sel return {{ @@ -4029,7 +4062,7 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root, + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true, /*AllowAbs=*/false, /*OpSel=*/false); @@ -4047,7 +4080,7 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root, + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true, /*AllowAbs=*/false, /*OpSel=*/true); @@ -5229,59 +5262,6 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const { [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}}; } -// Variant of stripBitCast that returns the instruction instead of a -// MachineOperand. -static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) { - if (MI->getOpcode() == AMDGPU::G_BITCAST) - return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI); - return MI; -} - -// Figure out if this is really an extract of the high 16-bits of a dword, -// returns nullptr if it isn't. -static MachineInstr *isExtractHiElt(MachineInstr *Inst, - MachineRegisterInfo &MRI) { - Inst = stripBitCast(Inst, MRI); - - if (Inst->getOpcode() != AMDGPU::G_TRUNC) - return nullptr; - - MachineInstr *TruncOp = - getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI); - TruncOp = stripBitCast(TruncOp, MRI); - - // G_LSHR x, (G_CONSTANT i32 16) - if (TruncOp->getOpcode() == AMDGPU::G_LSHR) { - auto SrlAmount = getIConstantVRegValWithLookThrough( - TruncOp->getOperand(2).getReg(), MRI); - if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) { - MachineInstr *SrlOp = - getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI); - return stripBitCast(SrlOp, MRI); - } - } - - // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0) - // 1, 0 swaps the low/high 16 bits. - // 1, 1 sets the high 16 bits to be the same as the low 16. - // in any case, it selects the high elts. - if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) { - assert(MRI.getType(TruncOp->getOperand(0).getReg()) == - LLT::fixed_vector(2, 16)); - - ArrayRef Mask = TruncOp->getOperand(3).getShuffleMask(); - assert(Mask.size() == 2); - - if (Mask[0] == 1 && Mask[1] <= 1) { - MachineInstr *LHS = - getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI); - return stripBitCast(LHS, MRI); - } - } - - return nullptr; -} - std::pair AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root, bool &Matched) const { @@ -5289,37 +5269,34 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root, Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root); - - MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); - if (MI->getOpcode() == AMDGPU::G_FPEXT) { - MachineOperand *MO = &MI->getOperand(1); - Src = MO->getReg(); - MI = getDefIgnoringCopies(Src, *MRI); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); + if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) { assert(MRI->getType(Src) == LLT::scalar(16)); - // See through bitcasts. - // FIXME: Would be nice to use stripBitCast here. - if (MI->getOpcode() == AMDGPU::G_BITCAST) { - MO = &MI->getOperand(1); - Src = MO->getReg(); - MI = getDefIgnoringCopies(Src, *MRI); - } + // Only change Src if src modifier could be gained. In such cases new Src + // could be sgpr but this does not violate constant bus restriction for + // instruction that is being selected. + // Note: Src is not changed when there is only a simple sgpr to vgpr copy + // since this could violate constant bus restriction. + Register PeekSrc = stripCopy(Src, *MRI); const auto CheckAbsNeg = [&]() { // Be careful about folding modifiers if we already have an abs. fneg is // applied last, so we don't want to apply an earlier fneg. if ((Mods & SISrcMods::ABS) == 0) { unsigned ModsTmp; - std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO); - MI = getDefIgnoringCopies(Src, *MRI); + std::tie(PeekSrc, ModsTmp) = selectVOP3ModsImpl(PeekSrc); - if ((ModsTmp & SISrcMods::NEG) != 0) + if ((ModsTmp & SISrcMods::NEG) != 0) { Mods ^= SISrcMods::NEG; + Src = PeekSrc; + } - if ((ModsTmp & SISrcMods::ABS) != 0) + if ((ModsTmp & SISrcMods::ABS) != 0) { Mods |= SISrcMods::ABS; + Src = PeekSrc; + } } }; @@ -5332,12 +5309,9 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root, Mods |= SISrcMods::OP_SEL_1; - if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) { + if (isExtractHiElt(*MRI, PeekSrc, PeekSrc)) { + Src = PeekSrc; Mods |= SISrcMods::OP_SEL_0; - MI = ExtractHiEltMI; - MO = &MI->getOperand(0); - Src = MO->getReg(); - CheckAbsNeg(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 7fff7d2685e7f2..69806b240cf2bc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -150,7 +150,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector { bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const; bool selectSBarrierLeave(MachineInstr &I) const; - std::pair selectVOP3ModsImpl(MachineOperand &Root, + std::pair selectVOP3ModsImpl(Register Src, bool IsCanonicalizing = true, bool AllowAbs = true, bool OpSel = false) const; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll index e910c2eca2ced3..b2b433167fe4d7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll @@ -446,28 +446,28 @@ define amdgpu_ps float @test_matching_source_from_unmerge(ptr addrspace(3) %aptr ; GFX9-DENORM: ; %bb.0: ; %.entry ; GFX9-DENORM-NEXT: ds_read_b64 v[2:3], v0 ; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: test_matching_source_from_unmerge: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: ds_read_b64 v[2:3], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10-CONTRACT-LABEL: test_matching_source_from_unmerge: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: ds_read_b64 v[2:3], v0 ; GFX10-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GFX10-CONTRACT-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_matching_source_from_unmerge: ; GFX10-DENORM: ; %bb.0: ; %.entry ; GFX10-DENORM-NEXT: ds_read_b64 v[2:3], v0 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog .entry: %a = load <4 x half>, ptr addrspace(3) %aptr, align 16