Skip to content

Commit

Permalink
AMDGPU/GlobalISel: RBSelect
Browse files Browse the repository at this point in the history
Assign register banks to virtual registers.
Defs and uses of G_ instructions have register banks exclusively; if they
previously had a register class, reassign the appropriate register bank.

Assign register banks using machine uniformity analysis:
SGPR - uniform values and some lane masks
VGPR - divergent, non S1, values
VCC  - divergent S1 values(lane masks)

RBSelect does not consider available instructions and, in some cases, G_
instructions with some register bank assignment can't be inst-selected.
This is solved in RBLegalize.

Exceptions when uniformity analysis does not work:
S32/S64 lane masks:
- need to end up with SGPR register class after instruction selection
- In most cases Uniformity analysis declares them as uniform
  (forced by tablegen) resulting in sgpr S32/S64 reg bank
- When Uniformity analysis declares them as divergent (some phis),
  use intrinsic lane mask analyzer to still assign sgpr register bank
temporal divergence copy:
- COPY to vgpr with implicit use of $exec inside of the cycle
- this copy is declared as uniform by uniformity analysis
- make sure that assigned bank is vgpr
Note: uniformity analysis does not consider that registers with vgpr def
are divergent (you can have uniform value in vgpr).
- TODO: implicit use of $exec could be implemented as indicator
  that instruction is divergent
  • Loading branch information
petar-avramovic committed Oct 18, 2024
1 parent ff34aa1 commit 2124eb3
Show file tree
Hide file tree
Showing 5 changed files with 971 additions and 686 deletions.
38 changes: 38 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,16 @@
//===----------------------------------------------------------------------===//

#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

using namespace llvm;
using namespace AMDGPU;
using namespace MIPatternMatch;

std::pair<Register, unsigned>
Expand Down Expand Up @@ -69,3 +72,38 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,

return std::pair(Reg, 0);
}

// Scan the whole function once at construction time so that later
// isS32S64LaneMask() queries are simple set lookups.
IntrinsicLaneMaskAnalyzer::IntrinsicLaneMaskAnalyzer(MachineFunction &MF)
    : MRI(MF.getRegInfo()) {
  initLaneMaskIntrinsics(MF);
}

// Returns true if Reg was recorded as an S32/S64 lane mask during the
// construction-time scan of the function.
bool IntrinsicLaneMaskAnalyzer::isS32S64LaneMask(Register Reg) {
  return S32S64LaneMask.count(Reg) != 0;
}

// Record registers produced by lane-mask intrinsics/pseudos:
// - amdgcn_if_break: both its result and its saved-mask argument
// - SI_IF / SI_ELSE: their result
// plus any LCSSA phis fed by those results.
void IntrinsicLaneMaskAnalyzer::initLaneMaskIntrinsics(MachineFunction &MF) {
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      unsigned Opc = MI.getOpcode();

      if (Opc == AMDGPU::SI_IF || Opc == AMDGPU::SI_ELSE) {
        findLCSSAPhi(MI.getOperand(0).getReg());
        continue;
      }

      if (Opc != AMDGPU::G_INTRINSIC)
        continue;

      // The intrinsic ID operand immediately follows the explicit defs.
      if (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID() !=
          Intrinsic::amdgcn_if_break)
        continue;

      // Operand 3 is the saved-mask argument of amdgcn_if_break.
      S32S64LaneMask.insert(MI.getOperand(3).getReg());
      findLCSSAPhi(MI.getOperand(0).getReg());
    }
  }
}

// Record Reg as a lane mask, and also record the result of every phi that
// uses Reg (LCSSA phis simply forward the lane mask across block edges).
void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
  S32S64LaneMask.insert(Reg);
  for (MachineInstr &UseInst : MRI.use_instructions(Reg))
    if (UseInst.isPHI())
      S32S64LaneMask.insert(UseInst.getOperand(0).getReg());
}
22 changes: 22 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H

#include "llvm/ADT/DenseSet.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/Register.h"
#include <utility>

Expand All @@ -26,6 +28,26 @@ std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
GISelKnownBits *KnownBits = nullptr,
bool CheckNUW = false);

// Currently finds S32/S64 lane masks that can be declared as divergent by
// uniformity analysis (all are phis at the moment).
// These are defined as i32/i64 in some IR intrinsics (not as i1).
// Tablegen marks lane-mask IR intrinsics as uniform, which forces most
// S32/S64 lane masks to be uniform and thus end up with an sgpr register
// class after instruction selection, so we do not search for all of them
// here — only for the divergent exceptions.
class IntrinsicLaneMaskAnalyzer {
  // Registers known to hold S32/S64 lane masks (including their LCSSA phis).
  DenseSet<Register> S32S64LaneMask;
  MachineRegisterInfo &MRI;

public:
  explicit IntrinsicLaneMaskAnalyzer(MachineFunction &MF);
  // Returns true if Reg was recorded as an S32/S64 lane mask.
  bool isS32S64LaneMask(Register Reg);

private:
  // Scan all instructions and record lane-mask-producing intrinsics/pseudos.
  void initLaneMaskIntrinsics(MachineFunction &MF);
  // This will not be needed when we turn off LCSSA for global-isel.
  void findLCSSAPhi(Register Reg);
};
}
}

Expand Down
194 changes: 193 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURBSelect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,12 @@
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPURegisterBankInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "rb-select"
Expand All @@ -39,6 +44,7 @@ class AMDGPURBSelect : public MachineFunctionPass {
StringRef getPassName() const override { return "AMDGPU RB select"; }

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineUniformityAnalysisPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}

Expand All @@ -54,6 +60,7 @@ class AMDGPURBSelect : public MachineFunctionPass {

INITIALIZE_PASS_BEGIN(AMDGPURBSelect, DEBUG_TYPE, "AMDGPU RB select", false,
false)
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
INITIALIZE_PASS_END(AMDGPURBSelect, DEBUG_TYPE, "AMDGPU RB select", false,
false)

Expand All @@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID;

FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); }

bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; }
// Returns true if MI is a generic (G_) instruction whose defs and uses this
// pass should assign register banks to. Target-specific (already selected)
// instructions, PHI, IMPLICIT_DEF and inline asm are skipped.
// File-local helper: give it internal linkage.
static bool shouldRBSelect(MachineInstr &MI) {
  if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode())
    return false;

  if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
    return false;

  if (MI.isInlineAsm())
    return false;

  return true;
}

void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B,
MachineRegisterInfo &MRI, const RegisterBank &RB) {
Register Reg = DefOP.getReg();
// Register that already has Register class got it during pre-inst selection
// of another instruction. Maybe cross bank copy was required so we insert a
// copy trat can be removed later. This simplifies post-rb-legalize artifact
// combiner and avoids need to special case some patterns.
if (MRI.getRegClassOrNull(Reg)) {
LLT Ty = MRI.getType(Reg);
Register NewReg = MRI.createVirtualRegister({&RB, Ty});
DefOP.setReg(NewReg);

auto &MBB = *MI.getParent();
B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI()
: std::next(MI.getIterator()));
B.buildCopy(Reg, NewReg);

// The problem was discoverd for uniform S1 that was used as both
// lane mask(vcc) and regular sgpr S1.
// - lane-mask(vcc) use was by si_if, this use is divergent and requires
// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets
// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask.
// - the regular regular sgpr S1(uniform) instruction is now broken since
// it uses sreg_64_xexec(S1) which is divergent.

// "Clear" reg classes from uses on generic instructions and but register
// banks instead.
for (auto &UseMI : MRI.use_instructions(Reg)) {
if (shouldRBSelect(UseMI)) {
for (MachineOperand &Op : UseMI.operands()) {
if (Op.isReg() && Op.isUse() && Op.getReg() == Reg)
Op.setReg(NewReg);
}
}
}

} else {
MRI.setRegBank(Reg, RB);
}
}

void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B,
MachineRegisterInfo &MRI, const RegisterBank &RB) {
Register Reg = UseOP.getReg();

LLT Ty = MRI.getType(Reg);
Register NewReg = MRI.createVirtualRegister({&RB, Ty});
UseOP.setReg(NewReg);

if (MI.isPHI()) {
auto DefMI = MRI.getVRegDef(Reg)->getIterator();
MachineBasicBlock *DefMBB = DefMI->getParent();
B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
} else {
B.setInstr(MI);
}

B.buildCopy(NewReg, Reg);
}

// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of
// the cycle
// Note: uniformity analysis does not consider that registers with vgpr def are
// divergent (you can have uniform value in vgpr).
// - TODO: implicit use of $exec could be implemented as indicator that
//   instruction is divergent
// Fixed here: the inner loop no longer shadows the Reg parameter and no
// longer copies each MachineOperand by value.
static bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) {
  MachineInstr *MI = MRI.getVRegDef(Reg);
  if (MI->getOpcode() != AMDGPU::COPY)
    return false;

  for (const MachineOperand &Op : MI->implicit_operands()) {
    if (Op.isReg() && Op.getReg() == AMDGPU::EXEC)
      return true;
  }

  return false;
}

// Returns the virtual register of Op, or an invalid Register() when Op is
// not a register operand or holds a physical register. Using Register()
// instead of the magic literal 0 makes the "no vreg" result explicit;
// callers test the result with Register's boolean conversion.
static Register getVReg(MachineOperand &Op) {
  if (!Op.isReg())
    return Register();

  Register Reg = Op.getReg();
  if (!Reg.isVirtual())
    return Register();

  return Reg;
}

// Assign register banks to all virtual registers in two passes:
// pass 1 puts banks on every def of a G_ instruction (uniform -> sgpr,
// divergent S1 -> vcc, other divergent -> vgpr); pass 2 rewrites any G_
// uses that still carry a register class so they use banked registers.
bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) {
  MachineUniformityInfo &MUI =
      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
  AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(MF);
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo();

  MachineIRBuilder B(MF);

  // Assign register banks to ALL def registers on G_ instructions.
  // Same for copies if they have no register bank or class on def.
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      if (!shouldRBSelect(MI))
        continue;

      for (MachineOperand &DefOP : MI.defs()) {
        Register DefReg = getVReg(DefOP);
        if (!DefReg)
          continue;

        // Copies can have register class on def registers.
        if (MI.isCopy() && MRI.getRegClassOrNull(DefReg)) {
          continue;
        }

        // ILMA catches S32/S64 lane masks that uniformity analysis declares
        // divergent (phis) but that must still live in sgpr.
        if (MUI.isUniform(DefReg) || ILMA.isS32S64LaneMask(DefReg)) {
          setRB(MI, DefOP, B, MRI, RBI.getRegBank(AMDGPU::SGPRRegBankID));
        } else {
          // Divergent S1 values are lane masks and go to vcc; all other
          // divergent values go to vgpr.
          if (MRI.getType(DefReg) == LLT::scalar(1))
            setRB(MI, DefOP, B, MRI, RBI.getRegBank(AMDGPU::VCCRegBankID));
          else
            setRB(MI, DefOP, B, MRI, RBI.getRegBank(AMDGPU::VGPRRegBankID));
        }
      }
    }
  }

  // At this point all virtual registers have register class or bank
  // - Defs of G_ instructions have register banks.
  // - Defs and uses of inst-selected instructions have register class.
  // - Defs and uses of copies can have either register class or bank
  // and most notably
  // - Uses of G_ instructions can have either register class or bank

  // Reassign uses of G_ instructions to only have register banks.
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      if (!shouldRBSelect(MI))
        continue;

      // Copies can have register class on use registers.
      if (MI.isCopy())
        continue;

      for (MachineOperand &UseOP : MI.uses()) {
        Register UseReg = getVReg(UseOP);
        if (!UseReg)
          continue;

        // Only uses that still carry a register class need rewriting.
        if (!MRI.getRegClassOrNull(UseReg))
          continue;

        // Temporal divergence copies are declared uniform by uniformity
        // analysis but must stay in vgpr, so they are excluded from the
        // sgpr path here.
        if (!isTemporalDivergenceCopy(UseReg, MRI) &&
            (MUI.isUniform(UseReg) || ILMA.isS32S64LaneMask(UseReg))) {
          setRBUse(MI, UseOP, B, MRI, RBI.getRegBank(AMDGPU::SGPRRegBankID));
        } else {
          if (MRI.getType(UseReg) == LLT::scalar(1))
            setRBUse(MI, UseOP, B, MRI, RBI.getRegBank(AMDGPU::VCCRegBankID));
          else
            setRBUse(MI, UseOP, B, MRI, RBI.getRegBank(AMDGPU::VGPRRegBankID));
        }
      }
    }
  }

  // Defs and uses of G_ instructions have register banks exclusively.

  return true;
}
Loading

0 comments on commit 2124eb3

Please sign in to comment.