Skip to content

Commit

Permalink
[AMDGPU][SILowerSGPRSpills] Spill SGPRs to virtual VGPRs
Browse files Browse the repository at this point in the history
Currently, the custom SGPR spill lowering pass spills
SGPRs into physical VGPR lanes, and the remaining VGPRs
are used by regalloc for vector regclass allocation.
This imposes many restrictions: SGPR spilling can fail
when there are not enough VGPRs, and we are then forced
to spill the leftover SGPRs into memory during PEI. The
custom spill handling during PEI has many edge cases and
breaks the compiler from time to time.

This patch implements spilling SGPRs into virtual VGPR
lanes. Since we now split the register allocation for
SGPRs and VGPRs, the virtual registers introduced for
the spill lanes would get allocated automatically in
the subsequent regalloc invocation for VGPRs.

Spilling to virtual registers always succeeds, even in
high-pressure situations, and hence avoids most of the
edge cases during PEI. The only custom SGPR spills left
during PEI are those for special registers like the
frame pointer, which is an unproblematic case.

Differential Revision: https://reviews.llvm.org/D124196
  • Loading branch information
cdevadas authored and Yashwant Singh committed Jul 7, 2023
1 parent 691dc2d commit 7a98f08
Show file tree
Hide file tree
Showing 83 changed files with 7,413 additions and 5,688 deletions.
16 changes: 8 additions & 8 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,10 @@ static void getVGPRSpillLaneOrTempRegister(
SGPR, PrologEpilogSGPRSaveRestoreInfo(
SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));

LLVM_DEBUG(
auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front();
dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
<< printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
<< printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
<< '\n';);
} else {
// Remove dead <FI> index
MF.getFrameInfo().RemoveStackObject(FI);
Expand Down Expand Up @@ -264,7 +264,7 @@ class PrologEpilogSGPRSpillBuilder {

assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
ArrayRef<SIRegisterInfo::SpilledReg> Spill =
FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
assert(Spill.size() == NumSubRegs);

for (unsigned I = 0; I < NumSubRegs; ++I) {
Expand Down Expand Up @@ -309,7 +309,7 @@ class PrologEpilogSGPRSpillBuilder {
void restoreFromVGPRLane(const int FI) {
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
ArrayRef<SIRegisterInfo::SpilledReg> Spill =
FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
assert(Spill.size() == NumSubRegs);

for (unsigned I = 0; I < NumSubRegs; ++I) {
Expand Down Expand Up @@ -1353,8 +1353,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
TRI->isAGPR(MRI, VReg))) {
assert(RS != nullptr);
// FIXME: change to enterBasicBlockEnd()
RS->enterBasicBlock(MBB);
RS->enterBasicBlockEnd(MBB);
RS->backward(MI);
TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
SpillFIs.set(FI);
continue;
Expand Down
136 changes: 114 additions & 22 deletions llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,23 @@ class SILowerSGPRSpills : public MachineFunctionPass {
SILowerSGPRSpills() : MachineFunctionPass(ID) {}

void calculateSaveRestoreBlocks(MachineFunction &MF);
bool spillCalleeSavedRegs(MachineFunction &MF);
bool spillCalleeSavedRegs(MachineFunction &MF,
SmallVectorImpl<int> &CalleeSavedFIs);
void extendWWMVirtRegLiveness(MachineFunction &MF, LiveIntervals *LIS);

bool runOnMachineFunction(MachineFunction &MF) override;

// Declare that this pass preserves all analyses, then let the base
// MachineFunctionPass add its own analysis requirements to AU.
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}

// Report the machine-function properties this pass clears: it creates new
// virtual VGPRs to hold spilled SGPRs, so IsSSA and NoVRegs are both listed.
MachineFunctionProperties getClearedProperties() const override {
  MachineFunctionProperties Cleared;
  Cleared.set(MachineFunctionProperties::Property::IsSSA);
  Cleared.set(MachineFunctionProperties::Property::NoVRegs);
  return Cleared;
}
};

} // end anonymous namespace
Expand Down Expand Up @@ -197,7 +206,8 @@ static void updateLiveness(MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI) {
EntryBB.sortUniqueLiveIns();
}

bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
bool SILowerSGPRSpills::spillCalleeSavedRegs(
MachineFunction &MF, SmallVectorImpl<int> &CalleeSavedFIs) {
MachineRegisterInfo &MRI = MF.getRegInfo();
const Function &F = MF.getFunction();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
Expand Down Expand Up @@ -228,6 +238,7 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
TRI->getSpillAlign(*RC), true);

CSI.push_back(CalleeSavedInfo(Reg, JunkFI));
CalleeSavedFIs.push_back(JunkFI);
}
}

Expand All @@ -248,6 +259,50 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
return false;
}

/// Conservatively extend the liveness of the virtual VGPRs used for SGPR
/// spills across the function.
///
/// For every register returned by getSGPRSpillVGPRs(), an IMPLICIT_DEF is
/// inserted at the top of each block in SaveBlocks and a KILL is inserted
/// before the first terminator of each block in RestoreBlocks. Each register
/// is also flagged as WWM_REG.
///
/// \param MF  The machine function being processed.
/// \param LIS Optional LiveIntervals; when non-null every newly created
///            instruction is inserted into its maps.
void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
                                                 LiveIntervals *LIS) {
  // TODO: This is a workaround to avoid the unmodelled liveness computed with
  // whole-wave virtual registers when allocated together with the regular VGPR
  // virtual registers. Presently, the liveness computed during the regalloc is
  // only uniform (or single lane aware) and it doesn't take account of the
  // divergent control flow that exists for our GPUs. Since the WWM registers
  // can modify inactive lanes, the wave-aware liveness should be computed for
  // the virtual registers to accurately plot their interferences. Without
  // having the divergent CFG for the function, it is difficult to implement the
  // wave-aware liveness info. Until then, we conservatively extend the liveness
  // of the wwm registers into the entire function so that they won't be reused
  // without first spilling/splitting their liveranges.
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // Insert the IMPLICIT_DEF for the wwm-registers in the entry blocks.
  for (auto Reg : MFI->getSGPRSpillVGPRs()) {
    for (MachineBasicBlock *SaveBlock : SaveBlocks) {
      MachineBasicBlock::iterator InsertBefore = SaveBlock->begin();
      // Do not dereference InsertBefore: it is end() for an empty block.
      // findDebugLoc() accepts end() and returns an unknown location then.
      DebugLoc DL = SaveBlock->findDebugLoc(InsertBefore);
      auto MIB = BuildMI(*SaveBlock, InsertBefore, DL,
                         TII->get(AMDGPU::IMPLICIT_DEF), Reg);
      MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
      if (LIS)
        LIS->InsertMachineInstrInMaps(*MIB);
    }
  }

  // Insert the KILL in the return blocks to extend their liveness until the
  // end of function. Insert a separate KILL for each VGPR.
  for (MachineBasicBlock *RestoreBlock : RestoreBlocks) {
    MachineBasicBlock::iterator InsertBefore =
        RestoreBlock->getFirstTerminator();
    // getFirstTerminator() returns end() when the block has no terminator,
    // so take the location via findDebugLoc() instead of dereferencing.
    DebugLoc DL = RestoreBlock->findDebugLoc(InsertBefore);
    for (auto Reg : MFI->getSGPRSpillVGPRs()) {
      auto MIB = BuildMI(*RestoreBlock, InsertBefore, DL,
                         TII->get(TargetOpcode::KILL));
      MIB.addReg(Reg);
      if (LIS)
        LIS->InsertMachineInstrInMaps(*MIB);
    }
  }
}

bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
Expand All @@ -261,7 +316,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
// First, expose any CSR SGPR spills. This is mostly the same as what PEI
// does, but somewhat simpler.
calculateSaveRestoreBlocks(MF);
bool HasCSRs = spillCalleeSavedRegs(MF);
SmallVector<int> CalleeSavedFIs;
bool HasCSRs = spillCalleeSavedRegs(MF, CalleeSavedFIs);

MachineFrameInfo &MFI = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
Expand All @@ -275,6 +331,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {

bool MadeChange = false;
bool NewReservedRegs = false;
bool SpilledToVirtVGPRLanes = false;

// TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
// handled as SpilledToReg in regular PrologEpilogInserter.
Expand All @@ -297,23 +354,53 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {

int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
NewReservedRegs = true;
bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
MI, FI, nullptr, Indexes, LIS);
(void)Spilled;
assert(Spilled && "failed to spill SGPR to VGPR when allocated");
SpillFIs.set(FI);

bool IsCalleeSaveSGPRSpill =
std::find(CalleeSavedFIs.begin(), CalleeSavedFIs.end(), FI) !=
CalleeSavedFIs.end();
if (IsCalleeSaveSGPRSpill) {
// Spill callee-saved SGPRs into physical VGPR lanes.

// TODO: This is to ensure the CFIs are static for efficient frame
// unwinding in the debugger. Spilling them into virtual VGPR lanes
// involve regalloc to allocate the physical VGPRs and that might
// cause intermediate spill/split of such liveranges for successful
// allocation. This would result in broken CFI encoding unless the
// regalloc aware CFI generation to insert new CFIs along with the
// intermediate spills is implemented. There is no such support
// currently exist in the LLVM compiler.
if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) {
NewReservedRegs = true;
bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
MI, FI, nullptr, Indexes, LIS, true);
if (!Spilled)
llvm_unreachable(
"failed to spill SGPR to physical VGPR lane when allocated");
}
} else {
if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
MI, FI, nullptr, Indexes, LIS);
if (!Spilled)
llvm_unreachable(
"failed to spill SGPR to virtual VGPR lane when allocated");
SpillFIs.set(FI);
SpilledToVirtVGPRLanes = true;
}
}
}
}

// FIXME: Adding to live-ins redundant with reserving registers.
for (MachineBasicBlock &MBB : MF) {
for (auto Reg : FuncInfo->getSGPRSpillVGPRs())
MBB.addLiveIn(Reg);
MBB.sortUniqueLiveIns();
if (SpilledToVirtVGPRLanes) {
extendWWMVirtRegLiveness(MF, LIS);
if (LIS) {
// Compute the LiveInterval for the newly created virtual registers.
for (auto Reg : FuncInfo->getSGPRSpillVGPRs())
LIS->createAndComputeVirtRegInterval(Reg);
}
}

for (MachineBasicBlock &MBB : MF) {
// FIXME: The dead frame indices are replaced with a null register from
// the debug value instructions. We should instead, update it with the
// correct register value. But not sure the register value alone is
Expand All @@ -334,6 +421,10 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
// lane".
FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false);

MadeChange = true;
}

if (SpilledToVirtVGPRLanes) {
const TargetRegisterClass *RC = TRI->getWaveMaskRegClass();
// Shift back the reserved SGPR for EXEC copy into the lowest range.
// This SGPR is reserved to handle the whole-wave spill/copy operations
Expand All @@ -342,20 +433,21 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
if (UnusedLowSGPR && TRI->getHWRegIndex(UnusedLowSGPR) <
TRI->getHWRegIndex(FuncInfo->getSGPRForEXECCopy()))
FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR);

MadeChange = true;
} else {
// No SGPR spills and hence there won't be any WWM spills/copies. Reset the
// SGPR reserved for EXEC copy.
// No SGPR spills to virtual VGPR lanes and hence there won't be any WWM
// spills/copies. Reset the SGPR reserved for EXEC copy.
FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister);
}

SaveBlocks.clear();
RestoreBlocks.clear();

// Updated the reserved registers with any VGPRs added for SGPR spills.
if (NewReservedRegs)
MRI.freezeReservedRegs(MF);
// Updated the reserved registers with any physical VGPRs added for SGPR
// spills.
if (NewReservedRegs) {
for (Register Reg : FuncInfo->getWWMReservedRegs())
MRI.reserveReg(Reg, TRI);
}

return MadeChange;
}
69 changes: 34 additions & 35 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -314,37 +314,23 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
return false;
}

bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF,
int FI,
unsigned LaneIndex) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
MachineFunction &MF, int FI, unsigned LaneIndex) {
MachineRegisterInfo &MRI = MF.getRegInfo();
Register LaneVGPR;
if (!LaneIndex) {
LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
if (LaneVGPR == AMDGPU::NoRegister) {
// We have no VGPRs left for spilling SGPRs. Reset because we will not
// partially spill the SGPR to VGPRs.
SGPRSpillToVGPRLanes.erase(FI);
return false;
}

LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
SpillVGPRs.push_back(LaneVGPR);
// Add this register as live-in to all blocks to avoid machine verifier
// complaining about use of an undefined physical register.
for (MachineBasicBlock &BB : MF)
BB.addLiveIn(LaneVGPR);
} else {
LaneVGPR = SpillVGPRs.back();
}

SGPRSpillToVGPRLanes[FI].push_back(
SGPRSpillsToVirtualVGPRLanes[FI].push_back(
SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
return true;
}

bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills(
bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
MachineFunction &MF, int FI, unsigned LaneIndex) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
Expand All @@ -355,16 +341,21 @@ bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills(
if (LaneVGPR == AMDGPU::NoRegister) {
// We have no VGPRs left for spilling SGPRs. Reset because we will not
// partially spill the SGPR to VGPRs.
PrologEpilogSGPRSpillToVGPRLanes.erase(FI);
SGPRSpillsToPhysicalVGPRLanes.erase(FI);
return false;
}

allocateWWMSpill(MF, LaneVGPR);
reserveWWMRegister(LaneVGPR);
for (MachineBasicBlock &MBB : MF) {
MBB.addLiveIn(LaneVGPR);
MBB.sortUniqueLiveIns();
}
} else {
LaneVGPR = WWMSpills.back().first;
LaneVGPR = WWMReservedRegs.back();
}

PrologEpilogSGPRSpillToVGPRLanes[FI].push_back(
SGPRSpillsToPhysicalVGPRLanes[FI].push_back(
SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
return true;
}
Expand All @@ -373,8 +364,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
int FI,
bool IsPrologEpilog) {
std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
IsPrologEpilog ? PrologEpilogSGPRSpillToVGPRLanes[FI]
: SGPRSpillToVGPRLanes[FI];
IsPrologEpilog ? SGPRSpillsToPhysicalVGPRLanes[FI]
: SGPRSpillsToVirtualVGPRLanes[FI];

// This has already been allocated.
if (!SpillLanes.empty())
Expand All @@ -395,15 +386,14 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
"not spilling SGPRs to VGPRs");

unsigned &NumSpillLanes =
IsPrologEpilog ? NumVGPRPrologEpilogSpillLanes : NumVGPRSpillLanes;
IsPrologEpilog ? NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes;

for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
unsigned LaneIndex = (NumSpillLanes % WaveSize);

bool Allocated =
IsPrologEpilog
? allocateVGPRForPrologEpilogSGPRSpills(MF, FI, LaneIndex)
: allocateVGPRForSGPRSpills(MF, FI, LaneIndex);
bool Allocated = IsPrologEpilog
? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex)
: allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
if (!Allocated) {
NumSpillLanes -= I;
return false;
Expand Down Expand Up @@ -484,16 +474,25 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,

bool SIMachineFunctionInfo::removeDeadFrameIndices(
MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
// Remove dead frame indices from function frame. And also make sure to remove
// the frame indices from `SGPRSpillToVGPRLanes` data structure, otherwise, it
// could result in an unexpected side effect and bug, in case of any
// re-mapping of freed frame indices by later pass(es) like "stack slot
// Remove dead frame indices from function frame, however keep FP & BP since
// spills for them haven't been inserted yet. And also make sure to remove the
// frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure,
// otherwise, it could result in an unexpected side effect and bug, in case of
// any re-mapping of freed frame indices by later pass(es) like "stack slot
// coloring".
for (auto &R : make_early_inc_range(SGPRSpillToVGPRLanes)) {
for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) {
MFI.RemoveStackObject(R.first);
SGPRSpillToVGPRLanes.erase(R.first);
SGPRSpillsToVirtualVGPRLanes.erase(R.first);
}

// Remove the dead frame indices of CSR SGPRs which are spilled to physical
// VGPR lanes during SILowerSGPRSpills pass.
if (!ResetSGPRSpillStackIDs) {
for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) {
MFI.RemoveStackObject(R.first);
SGPRSpillsToPhysicalVGPRLanes.erase(R.first);
}
}
bool HaveSGPRToMemory = false;

if (ResetSGPRSpillStackIDs) {
Expand Down
Loading

0 comments on commit 7a98f08

Please sign in to comment.