AMDGPU/GlobalISel: RegBankLegalize
Lower G_ instructions that cannot be instruction-selected with the register
bank assignment from StandaloneRegBankSelect, which is based on uniformity
analysis. Lowering takes one of three forms:
- lower the instruction so it can be performed on the assigned register bank
- put a uniform value in vgpr when no SALU instruction is available
- execute a divergent instruction on SALU via a "waterfall loop"

Given the LLTs on all operands after the legalizer, some register bank
assignments require lowering while others do not.
Note: cases where every register bank assignment would require lowering
are handled in the legalizer instead.

RegBankLegalize goals:
- Define Rules: when and how to perform lowering.
- The point of defining Rules is to provide a high-level, table-like
  overview of how to lower generic instructions based on the available
  target features and uniformity info (uniform vs. divergent).
- Fast lookup of Rules; lookup speed depends on how complicated
  Rule.Predicate is.
- For some opcodes there would be too many Rules that are essentially
  the same, differing only in the combination of types and banks; those
  are handled by a custom function that covers all cases.
- Rules are built from enum IDs that correspond to each operand. The ID
  names are meant to briefly describe what the lowering does for that
  operand or for the whole instruction (see the sketch after this list).
- RegBankLegalizeHelper implements the lowering algorithms and handles
  all IDs.
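
To make the table-like intent concrete, here is a minimal, purely
illustrative sketch of such a rule table; the enum names and the shape of
the table are assumptions made for this summary, not the exact interface
added by the follow-up patches.

// Illustrative only: IDs name what the lowering does for each operand.
enum RuleApplyID {
  Sgpr32,       // keep/put the operand in a 32-bit sgpr
  Vgpr32,       // keep/put the operand in a 32-bit vgpr
  UniInVgprS32, // uniform result that must be computed in vgpr and then
                // copied back to sgpr with readanylane
};

// Conceptually, one row per (opcode, uniformity, types) combination:
//   G_AND, uniform,   s32 -> {Sgpr32, Sgpr32, Sgpr32}
//   G_AND, divergent, s32 -> {Vgpr32, Vgpr32, Vgpr32}
// RegBankLegalizeHelper reads the ID assigned to each operand and performs
// the matching lowering (insert copies, readanylane, waterfall loop, ...).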

Since this is the first patch that actually enables -new-reg-bank-select,
here is a summary of the regression tests that were added earlier:
- if an instruction is uniform, always select a SALU instruction when
  available
- eliminate back-to-back vgpr-to-sgpr-to-vgpr copies of uniform values
- fast rules: small differences between standard and vector instructions
- enabling a Rule based on a target feature - salu_float
- how to specify a lowering algorithm - vgpr S64 AND to S32
- for G_TRUNC in reg, it is up to the user to deal with the truncated
  bits; G_TRUNC in reg is treated as a no-op
- dealing with truncated high bits - ABS S16 to S32
- sgpr S1 phi lowering
- new opcodes for vcc-to-scc and scc-to-vcc copies
- lowering for a vgpr S1-to-vcc copy (formally this is a vgpr-to-vcc
  G_TRUNC)
- S1 zext and sext lowering to select (see the sketch after this list)
- uniform and divergent S1 AND (also OR and XOR) lowering -
  instruction-selected into a SALU instruction
- divergent phi with uniform inputs
- divergent instruction with a temporal-divergent use whose source
  instruction is defined as uniform (by StandaloneRegBankSelect) -
  missing temporal divergence lowering
- a uniform phi that is assigned to vgpr because of an undef incoming
  value; will be fixed in StandaloneRegBankSelect via another fix in
  machine uniformity analysis
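
As a concrete illustration of the S1 zext/sext test above, here is a
minimal sketch of the kind of rewrite involved, written directly against
MachineIRBuilder; B, Dst, Cond and IsSext are assumed to be in scope, and
the exact helper RegBankLegalize uses may differ.

// An sgpr S1 zext/sext cannot be selected directly; the S1 condition is
// consumed by a select instead: zext -> select Cond, 1, 0
//                               sext -> select Cond, -1, 0
LLT S32 = LLT::scalar(32);
auto True = B.buildConstant(S32, IsSext ? -1 : 1);
auto False = B.buildConstant(S32, 0);
B.buildSelect(Dst, Cond, True, False); // Dst: s32, Cond: s1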
petar-avramovic committed Oct 22, 2024
1 parent df50c85 commit db1cdae
Showing 17 changed files with 2,077 additions and 258 deletions.
8 changes: 8 additions & 0 deletions llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -674,6 +674,14 @@ class MachineRegisterInfo {
return dyn_cast_if_present<const TargetRegisterClass *>(Val);
}

/// Return the register bank of \p Reg.
/// This shouldn't be used directly unless \p Reg has a register bank.
const RegisterBank *getRegBank(Register Reg) const {
assert(isa<const RegisterBank *>(VRegInfo[Reg.id()].first) &&
"Register bank not set, wrong accessor");
return cast<const RegisterBank *>(VRegInfo[Reg.id()].first);
}

/// Return the register bank of \p Reg, or null if Reg has not been assigned
/// a register bank or has been assigned a register class.
/// \note It is possible to get the register bank from the register class via
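As an aside on the new accessor: getRegBank asserts that a bank has
already been assigned, whereas the existing getRegBankOrNull tolerates an
unassigned register. A minimal usage sketch, with illustrative variable
names, assuming MRI and Reg are in scope:

const RegisterBank *Bank = MRI.getRegBank(Reg);            // asserts if unset
const RegisterBank *MaybeBank = MRI.getRegBankOrNull(Reg); // null if a reg
                                                           // class or nothing
                                                           // is assigned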
117 changes: 117 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -107,3 +107,120 @@ void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg());
}
}

MachineInstrBuilder AMDGPU::buildReadAnyLaneB32(MachineIRBuilder &B,
const DstOp &SgprDst,
const SrcOp &VgprSrc,
const RegisterBankInfo &RBI) {
auto RFL = B.buildInstr(AMDGPU::G_READANYLANE, {SgprDst}, {VgprSrc});
Register Dst = RFL->getOperand(0).getReg();
Register Src = RFL->getOperand(1).getReg();
MachineRegisterInfo &MRI = *B.getMRI();
if (!MRI.getRegBankOrNull(Dst))
MRI.setRegBank(Dst, RBI.getRegBank(SGPRRegBankID));
if (!MRI.getRegBankOrNull(Src))
MRI.setRegBank(Src, RBI.getRegBank(VGPRRegBankID));
return RFL;
}

MachineInstrBuilder
AMDGPU::buildReadAnyLaneSequenceOfB32(MachineIRBuilder &B, const DstOp &SgprDst,
const SrcOp &VgprSrc, LLT B32Ty,
const RegisterBankInfo &RBI) {
MachineRegisterInfo &MRI = *B.getMRI();
SmallVector<Register, 8> SgprDstParts;
auto Unmerge = B.buildUnmerge(B32Ty, VgprSrc);
for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) {
SgprDstParts.push_back(
buildReadAnyLaneB32(B, B32Ty, Unmerge.getReg(i), RBI).getReg(0));
}

auto Merge = B.buildMergeLikeInstr(SgprDst, SgprDstParts);
MRI.setRegBank(Merge.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID));
return Merge;
}

MachineInstrBuilder
AMDGPU::buildReadAnyLaneSequenceOfS64(MachineIRBuilder &B, const DstOp &SgprDst,
const SrcOp &VgprSrc,
const RegisterBankInfo &RBI) {
LLT S32 = LLT::scalar(32);
LLT S64 = LLT::scalar(64);
MachineRegisterInfo &MRI = *B.getMRI();
SmallVector<Register, 8> SgprDstParts;
auto Unmerge = B.buildUnmerge(S64, VgprSrc);

for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) {
MRI.setRegBank(Unmerge.getReg(i), RBI.getRegBank(AMDGPU::VGPRRegBankID));
auto Unmerge64 = B.buildUnmerge(S32, Unmerge.getReg(i));
SmallVector<Register, 2> Unmerge64Parts;
Unmerge64Parts.push_back(
buildReadAnyLaneB32(B, S32, Unmerge64.getReg(0), RBI).getReg(0));
Unmerge64Parts.push_back(
buildReadAnyLaneB32(B, S32, Unmerge64.getReg(1), RBI).getReg(0));
Register MergeReg = B.buildMergeLikeInstr(S64, Unmerge64Parts).getReg(0);
MRI.setRegBank(MergeReg, RBI.getRegBank(AMDGPU::SGPRRegBankID));
SgprDstParts.push_back(MergeReg);
}

auto Merge = B.buildMergeLikeInstr(SgprDst, SgprDstParts);
MRI.setRegBank(Merge.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID));
return Merge;
}

MachineInstrBuilder AMDGPU::buildReadAnyLane(MachineIRBuilder &B,
const DstOp &SgprDst,
const SrcOp &VgprSrc,
const RegisterBankInfo &RBI) {
MachineRegisterInfo &MRI = *B.getMRI();
LLT S16 = LLT::scalar(16);
LLT S32 = LLT::scalar(32);
LLT S64 = LLT::scalar(64);
LLT S256 = LLT::scalar(256);
LLT V2S16 = LLT::fixed_vector(2, 16);
LLT Ty = SgprDst.getLLTTy(MRI);

if (Ty == S16) {
return B.buildTrunc(
SgprDst, buildReadAnyLaneB32(B, S32, B.buildAnyExt(S32, VgprSrc), RBI));
}

if (Ty == S32 || Ty == V2S16 ||
(Ty.isPointer() && Ty.getSizeInBits() == 32)) {
return buildReadAnyLaneB32(B, SgprDst, VgprSrc, RBI);
}

if (Ty == S64 || Ty == S256 || (Ty.isPointer() && Ty.getSizeInBits() == 64) ||
(Ty.isVector() && Ty.getElementType() == S32)) {
return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, S32, RBI);
}

if (Ty.isVector() && Ty.getElementType() == S16) {
return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, V2S16, RBI);
}

if (Ty.isVector() && Ty.getElementType() == S64) {
return buildReadAnyLaneSequenceOfS64(B, SgprDst, VgprSrc, RBI);
}

llvm_unreachable("Type not supported");
}

void AMDGPU::buildReadAnyLaneDst(MachineIRBuilder &B, MachineInstr &MI,
const RegisterBankInfo &RBI) {
MachineRegisterInfo &MRI = *B.getMRI();
Register Dst = MI.getOperand(0).getReg();
const RegisterBank *DstBank = MRI.getRegBankOrNull(Dst);
if (DstBank != &RBI.getRegBank(AMDGPU::SGPRRegBankID))
return;

Register VgprDst = MRI.createGenericVirtualRegister(MRI.getType(Dst));
MRI.setRegBank(VgprDst, RBI.getRegBank(AMDGPU::VGPRRegBankID));

MI.getOperand(0).setReg(VgprDst);
MachineBasicBlock *MBB = MI.getParent();
B.setInsertPt(*MBB, std::next(MI.getIterator()));
// readAnyLane VgprDst into Dst after MI.
buildReadAnyLane(B, Dst, VgprDst, RBI);
return;
}
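
A minimal sketch of how these helpers could be used from a lowering; the
surrounding pass code and the names MI, VgprSrc and RBI are assumptions
for illustration, not the actual call sites in RegBankLegalize.

// A uniform value that had to be produced in vgpr (no SALU form exists)
// is moved back to sgpr with one or more G_READANYLANE instructions.
MachineIRBuilder B(MI);
MachineRegisterInfo &MRI = *B.getMRI();

// Explicit form: read an existing vgpr value into a fresh sgpr register.
Register SgprDst = MRI.createGenericVirtualRegister(MRI.getType(VgprSrc));
MRI.setRegBank(SgprDst, RBI.getRegBank(AMDGPU::SGPRRegBankID));
AMDGPU::buildReadAnyLane(B, SgprDst, VgprSrc, RBI);

// In-place form: rewrite MI's sgpr def to a temporary vgpr and read it
// back after MI.
AMDGPU::buildReadAnyLaneDst(B, MI, RBI); // no-op unless MI defines an sgpr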
37 changes: 35 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -9,7 +9,11 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H

#include "AMDGPURegisterBankInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/Register.h"
#include <utility>
@@ -48,7 +52,36 @@ class IntrinsicLaneMaskAnalyzer {
// This will not be needed when we turn off LCSSA for global-isel.
void findLCSSAPhi(Register Reg);
};
}
}

void buildReadAnyLaneS1(MachineIRBuilder &B, MachineInstr &MI,
const RegisterBankInfo &RBI);

MachineInstrBuilder buildReadAnyLaneB32(MachineIRBuilder &B,
const DstOp &SgprDst,
const SrcOp &VgprSrc,
const RegisterBankInfo &RBI);

MachineInstrBuilder buildReadAnyLaneSequenceOfB32(MachineIRBuilder &B,
const DstOp &SgprDst,
const SrcOp &VgprSrc,
LLT B32Ty,
const RegisterBankInfo &RBI);

MachineInstrBuilder buildReadAnyLaneSequenceOfS64(MachineIRBuilder &B,
const DstOp &SgprDst,
const SrcOp &VgprSrc,
const RegisterBankInfo &RBI);

MachineInstrBuilder buildReadAnyLane(MachineIRBuilder &B, const DstOp &SgprDst,
const SrcOp &VgprSrc,
const RegisterBankInfo &RBI);

// Create new vgpr destination register for MI then move it to current
// MI's sgpr destination using one or more G_READANYLANE instructions.
void buildReadAnyLaneDst(MachineIRBuilder &B, MachineInstr &MI,
const RegisterBankInfo &RBI);

} // namespace AMDGPU
} // namespace llvm

#endif
90 changes: 89 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -217,6 +217,74 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
return true;
}

bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
const DebugLoc &DL = I.getDebugLoc();
MachineBasicBlock *BB = I.getParent();

unsigned CmpOpc =
STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
.addReg(I.getOperand(1).getReg())
.addImm(0);
if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
return false;

Register DstReg = I.getOperand(0).getReg();
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);

I.eraseFromParent();
return RBI.constrainGenericRegister(DstReg, AMDGPU::SGPR_32RegClass, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
const DebugLoc &DL = I.getDebugLoc();
MachineBasicBlock *BB = I.getParent();

Register DstReg = I.getOperand(0).getReg();
Register SrcReg = I.getOperand(1).getReg();
std::optional<ValueAndVReg> Arg =
getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);

if (Arg) {
const int64_t Value = Arg->Value.getZExtValue();
if (Value == 0) {
unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
} else {
assert(Value == 1);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
}
I.eraseFromParent();
return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
}

// RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);

unsigned SelectOpcode =
STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
.addReg(TRI.getExec())
.addImm(0);

I.eraseFromParent();
return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
Register SrcReg = I.getOperand(1).getReg();

const DebugLoc &DL = I.getDebugLoc();
MachineBasicBlock *BB = I.getParent();

auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
.addReg(SrcReg);

I.eraseFromParent();
return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
const Register DefReg = I.getOperand(0).getReg();
const LLT DefTy = MRI->getType(DefReg);
@@ -249,7 +317,21 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
}
}

// TODO: Verify that all registers have the same bank
// If inputs have register bank, assign corresponding reg class.
// Note: registers don't need to have the same reg bank.
for (unsigned i = 1; i < I.getNumOperands(); i += 2) {
const Register SrcReg = I.getOperand(i).getReg();

const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
if (RB) {
const LLT SrcTy = MRI->getType(SrcReg);
const TargetRegisterClass *SrcRC =
TRI.getRegClassForTypeOnBank(SrcTy, *RB);
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
return false;
}
}

I.setDesc(TII.get(TargetOpcode::PHI));
return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}
@@ -3656,6 +3738,12 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return selectStackRestore(I);
case AMDGPU::G_PHI:
return selectPHI(I);
case AMDGPU::G_COPY_SCC_VCC:
return selectCOPY_SCC_VCC(I);
case AMDGPU::G_COPY_VCC_SCC:
return selectCOPY_VCC_SCC(I);
case AMDGPU::G_READANYLANE:
return selectReadAnyLane(I);
case TargetOpcode::G_CONSTANT:
case TargetOpcode::G_FCONSTANT:
default:
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -87,6 +87,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {

bool constrainCopyLikeIntrin(MachineInstr &MI, unsigned NewOpc) const;
bool selectCOPY(MachineInstr &I) const;
bool selectCOPY_SCC_VCC(MachineInstr &I) const;
bool selectCOPY_VCC_SCC(MachineInstr &I) const;
bool selectReadAnyLane(MachineInstr &I) const;
bool selectPHI(MachineInstr &I) const;
bool selectG_TRUNC(MachineInstr &I) const;
bool selectG_SZA_EXT(MachineInstr &I) const;