Skip to content

Commit

Permalink
AMDGPU/GlobalISel: Fix isExtractHiElt when selecting fma_mix (llvm#10…
Browse files Browse the repository at this point in the history
…2130)

isExtractHiElt should return the new source register instead of the
instruction that defines it. Src = MI.getOperand(0).getReg() is not
correct when MI (for example, G_UNMERGE_VALUES) defines multiple registers.
Refactor the existing code to work with source registers only.
  • Loading branch information
petar-avramovic authored Aug 7, 2024
1 parent e842998 commit 269cefb
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 100 deletions.
164 changes: 69 additions & 95 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1372,8 +1372,8 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
MachineInstrBuilder SelectedMI;
MachineOperand &LHS = I.getOperand(2);
MachineOperand &RHS = I.getOperand(3);
auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
Register Src0Reg =
copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
Register Src1Reg =
Expand Down Expand Up @@ -2467,14 +2467,48 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
return false;
}

// Returns the source register that getDefSrcRegIgnoringCopies reports for
// Reg. If that lookup produces no result, Reg itself is returned instead of
// dereferencing an empty result.
static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
  if (auto DefSrc = getDefSrcRegIgnoringCopies(Reg, MRI))
    return DefSrc->Reg;
  return Reg;
}

// If Reg is defined by a G_BITCAST, returns the bitcast's source register;
// otherwise returns Reg unchanged.
static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
  Register Inner;
  const bool IsBitcast = mi_match(Reg, MRI, m_GBitcast(m_Reg(Inner)));
  return IsBitcast ? Inner : Reg;
}

// Checks whether In is a G_TRUNC of one of the following:
//   * G_LSHR x, 16, where the shift amount may sit behind copies
//   * G_SHUFFLE_VECTOR whose mask is (1, <=1)
// On a match, stores the register to use as the new source in Out and
// returns true. Otherwise returns false and leaves Out untouched.
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
                           Register &Out) {
  Register Trunc;
  if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
    return false;

  // Match the shift through Trunc and bind the shift amount as a plain
  // register, so a constant hidden behind copies is still recognized once
  // the copies are stripped.
  Register LShlSrc;
  Register Cst;
  if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
    Cst = stripCopy(Cst, MRI);
    if (mi_match(Cst, MRI, m_SpecificICst(16))) {
      Out = stripBitCast(LShlSrc, MRI);
      return true;
    }
  }

  MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
  if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
    return false;

  assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
         LLT::fixed_vector(2, 16));

  ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
  assert(Mask.size() == 2);

  // The first mask element must pick element 1; the second may be 1, 0, or
  // negative.
  if (Mask[0] == 1 && Mask[1] <= 1) {
    Out = Shuffle->getOperand(0).getReg();
    return true;
  }

  return false;
}

Expand Down Expand Up @@ -3550,11 +3584,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {

}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
bool IsCanonicalizing,
bool AllowAbs, bool OpSel) const {
Register Src = Root.getReg();
std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
unsigned Mods = 0;
MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);

Expand Down Expand Up @@ -3617,7 +3648,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

return {{
[=](MachineInstrBuilder &MIB) {
Expand All @@ -3633,7 +3664,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
/*IsCanonicalizing=*/true,
/*AllowAbs=*/false);

Expand All @@ -3660,7 +3691,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

return {{
[=](MachineInstrBuilder &MIB) {
Expand All @@ -3675,7 +3706,8 @@ AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
std::tie(Src, Mods) =
selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);

return {{
[=](MachineInstrBuilder &MIB) {
Expand All @@ -3689,8 +3721,9 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
/*AllowAbs=*/false);
std::tie(Src, Mods) =
selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
/*AllowAbs=*/false);

return {{
[=](MachineInstrBuilder &MIB) {
Expand Down Expand Up @@ -4016,7 +4049,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

// FIXME: Handle op_sel
return {{
Expand All @@ -4029,7 +4062,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
/*IsCanonicalizing=*/true,
/*AllowAbs=*/false,
/*OpSel=*/false);
Expand All @@ -4047,7 +4080,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
/*IsCanonicalizing=*/true,
/*AllowAbs=*/false,
/*OpSel=*/true);
Expand Down Expand Up @@ -5229,97 +5262,41 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
}

// MachineInstr flavour of stripBitCast: when MI is a G_BITCAST, returns the
// instruction defining its source operand (looking through copies);
// otherwise returns MI itself.
static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
  const bool IsBitcast = MI->getOpcode() == AMDGPU::G_BITCAST;
  return IsBitcast ? getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI)
                   : MI;
}

// Recognizes an extract of the high 16 bits of a dword. On a match, returns
// the instruction that defines the value being extracted from (copies and a
// single bitcast are looked through); returns nullptr otherwise.
static MachineInstr *isExtractHiElt(MachineInstr *Inst,
                                    MachineRegisterInfo &MRI) {
  MachineInstr *Trunc = stripBitCast(Inst, MRI);
  if (Trunc->getOpcode() != AMDGPU::G_TRUNC)
    return nullptr;

  MachineInstr *Wide = stripBitCast(
      getDefIgnoringCopies(Trunc->getOperand(1).getReg(), MRI), MRI);

  switch (Wide->getOpcode()) {
  case AMDGPU::G_LSHR: {
    // G_LSHR x, (G_CONSTANT i32 16)
    auto ShiftAmt =
        getIConstantVRegValWithLookThrough(Wide->getOperand(2).getReg(), MRI);
    if (!ShiftAmt || ShiftAmt->Value.getZExtValue() != 16)
      return nullptr;
    return stripBitCast(
        getDefIgnoringCopies(Wide->getOperand(1).getReg(), MRI), MRI);
  }
  case AMDGPU::G_SHUFFLE_VECTOR: {
    // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
    // A (1, 0) mask swaps the low/high 16 bits; a (1, 1) mask sets the high
    // 16 bits to be the same as the low 16. Either way the high element is
    // what gets selected.
    assert(MRI.getType(Wide->getOperand(0).getReg()) ==
           LLT::fixed_vector(2, 16));

    ArrayRef<int> Mask = Wide->getOperand(3).getShuffleMask();
    assert(Mask.size() == 2);

    if (Mask[0] != 1 || Mask[1] > 1)
      return nullptr;
    return stripBitCast(
        getDefIgnoringCopies(Wide->getOperand(1).getReg(), MRI), MRI);
  }
  default:
    return nullptr;
  }
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
bool &Matched) const {
Matched = false;

Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root);

MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
if (MI->getOpcode() == AMDGPU::G_FPEXT) {
MachineOperand *MO = &MI->getOperand(1);
Src = MO->getReg();
MI = getDefIgnoringCopies(Src, *MRI);
std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) {
assert(MRI->getType(Src) == LLT::scalar(16));

// See through bitcasts.
// FIXME: Would be nice to use stripBitCast here.
if (MI->getOpcode() == AMDGPU::G_BITCAST) {
MO = &MI->getOperand(1);
Src = MO->getReg();
MI = getDefIgnoringCopies(Src, *MRI);
}
// Only change Src if src modifier could be gained. In such cases new Src
// could be sgpr but this does not violate constant bus restriction for
// instruction that is being selected.
// Note: Src is not changed when there is only a simple sgpr to vgpr copy
// since this could violate constant bus restriction.
Register PeekSrc = stripCopy(Src, *MRI);

const auto CheckAbsNeg = [&]() {
// Be careful about folding modifiers if we already have an abs. fneg is
// applied last, so we don't want to apply an earlier fneg.
if ((Mods & SISrcMods::ABS) == 0) {
unsigned ModsTmp;
std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
MI = getDefIgnoringCopies(Src, *MRI);
std::tie(PeekSrc, ModsTmp) = selectVOP3ModsImpl(PeekSrc);

if ((ModsTmp & SISrcMods::NEG) != 0)
if ((ModsTmp & SISrcMods::NEG) != 0) {
Mods ^= SISrcMods::NEG;
Src = PeekSrc;
}

if ((ModsTmp & SISrcMods::ABS) != 0)
if ((ModsTmp & SISrcMods::ABS) != 0) {
Mods |= SISrcMods::ABS;
Src = PeekSrc;
}
}
};

Expand All @@ -5332,12 +5309,9 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,

Mods |= SISrcMods::OP_SEL_1;

if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
if (isExtractHiElt(*MRI, PeekSrc, PeekSrc)) {
Src = PeekSrc;
Mods |= SISrcMods::OP_SEL_0;
MI = ExtractHiEltMI;
MO = &MI->getOperand(0);
Src = MO->getReg();

CheckAbsNeg();
}

Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
bool selectSBarrierLeave(MachineInstr &I) const;

std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
bool IsCanonicalizing = true,
bool AllowAbs = true,
bool OpSel = false) const;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -446,28 +446,28 @@ define amdgpu_ps float @test_matching_source_from_unmerge(ptr addrspace(3) %aptr
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: ds_read_b64 v[2:3], v0
; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: test_matching_source_from_unmerge:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: ds_read_b64 v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_matching_source_from_unmerge:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: ds_read_b64 v[2:3], v0
; GFX10-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_matching_source_from_unmerge:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: ds_read_b64 v[2:3], v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v2, v2, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v2, v3, v1 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = load <4 x half>, ptr addrspace(3) %aptr, align 16
Expand Down

0 comments on commit 269cefb

Please sign in to comment.