[AMDGPU][SILowerSGPRSpills] Spill SGPRs to virtual VGPRs

Currently, the custom SGPR spill lowering pass spills SGPRs into physical VGPR lanes, and the remaining VGPRs are used by regalloc for vector regclass allocation. This imposes many restrictions: SGPR spilling fails when there are not enough VGPRs, and we are then forced to spill the leftover into memory during PEI. The custom spill handling during PEI has many edge cases and breaks the compiler from time to time. This patch implements spilling SGPRs into virtual VGPR lanes. Since we now split the register allocation for SGPRs and VGPRs, the virtual registers introduced for the spill lanes get allocated automatically in the subsequent regalloc invocation for VGPRs. Spilling to virtual registers will always succeed, even in high-pressure situations, and hence it avoids most of the edge cases during PEI. We are now left with only the custom SGPR spills during PEI for special registers like the frame pointer, which is an unproblematic case. Differential Revision: https://reviews.llvm.org/D124196
llvm · Jul 7, 2023 · 7a98f08 · 7a98f08
1 parent 691dc2d
commit 7a98f08
Show file tree

Hide file tree

Showing 83 changed files with 7,413 additions and 5,688 deletions.
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -99,10 +99,10 @@ static void getVGPRSpillLaneOrTempRegister(
           SGPR, PrologEpilogSGPRSaveRestoreInfo(
                     SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));
 
-      LLVM_DEBUG(
-          auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front();
-          dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
-                 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
+      LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
+                 dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
+                        << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
+                        << '\n';);
     } else {
       // Remove dead <FI> index
       MF.getFrameInfo().RemoveStackObject(FI);
@@ -264,7 +264,7 @@ class PrologEpilogSGPRSpillBuilder {
 
     assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
     ArrayRef<SIRegisterInfo::SpilledReg> Spill =
-        FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
+        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
     assert(Spill.size() == NumSubRegs);
 
     for (unsigned I = 0; I < NumSubRegs; ++I) {
@@ -309,7 +309,7 @@ class PrologEpilogSGPRSpillBuilder {
   void restoreFromVGPRLane(const int FI) {
     assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
     ArrayRef<SIRegisterInfo::SpilledReg> Spill =
-        FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
+        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
     assert(Spill.size() == NumSubRegs);
 
     for (unsigned I = 0; I < NumSubRegs; ++I) {
@@ -1353,8 +1353,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
           if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
                                                 TRI->isAGPR(MRI, VReg))) {
             assert(RS != nullptr);
-            // FIXME: change to enterBasicBlockEnd()
-            RS->enterBasicBlock(MBB);
+            RS->enterBasicBlockEnd(MBB);
+            RS->backward(MI);
             TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
             SpillFIs.set(FI);
             continue;

diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -50,14 +50,23 @@ class SILowerSGPRSpills : public MachineFunctionPass {
   SILowerSGPRSpills() : MachineFunctionPass(ID) {}
 
   void calculateSaveRestoreBlocks(MachineFunction &MF);
-  bool spillCalleeSavedRegs(MachineFunction &MF);
+  bool spillCalleeSavedRegs(MachineFunction &MF,
+                            SmallVectorImpl<int> &CalleeSavedFIs);
+  void extendWWMVirtRegLiveness(MachineFunction &MF, LiveIntervals *LIS);
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
+
+  MachineFunctionProperties getClearedProperties() const override {
+    // New virtual VGPRs are created for SGPR spills: clear IsSSA and NoVRegs.
+    return MachineFunctionProperties()
+        .set(MachineFunctionProperties::Property::IsSSA)
+        .set(MachineFunctionProperties::Property::NoVRegs);
+  }
 };
 
 } // end anonymous namespace
@@ -197,7 +206,8 @@ static void updateLiveness(MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI) {
   EntryBB.sortUniqueLiveIns();
 }
 
-bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
+bool SILowerSGPRSpills::spillCalleeSavedRegs(
+    MachineFunction &MF, SmallVectorImpl<int> &CalleeSavedFIs) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const Function &F = MF.getFunction();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -228,6 +238,7 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
                                            TRI->getSpillAlign(*RC), true);
 
         CSI.push_back(CalleeSavedInfo(Reg, JunkFI));
+        CalleeSavedFIs.push_back(JunkFI);
       }
     }
 
@@ -248,6 +259,50 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
   return false;
 }
 
+void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
+                                                 LiveIntervals *LIS) {
+  // Keeps every SGPR-spill virtual VGPR live across the whole function: an
+  // IMPLICIT_DEF is placed in each save block and a KILL before the first
+  // terminator of each restore block; both are added to LIS when it is given.
+  // TODO: This is a workaround for the unmodelled liveness of whole-wave
+  // virtual registers allocated together with regular VGPR virtual registers.
+  // Regalloc liveness is only uniform (single-lane aware) and ignores the
+  // divergent control flow of our GPUs. Since WWM registers can modify
+  // inactive lanes, wave-aware liveness is needed to plot their interferences
+  // accurately, which is hard without the divergent CFG. Until then, extend
+  // the WWM live ranges over the entire function so the registers are not
+  // reused without first spilling/splitting their live ranges.
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  // IMPLICIT_DEF each wwm-register at the top of the entry (save) blocks.
+  for (auto Reg : MFI->getSGPRSpillVGPRs()) {
+    for (MachineBasicBlock *SaveBlock : SaveBlocks) {
+      MachineBasicBlock::iterator InsertBefore = SaveBlock->begin();
+      DebugLoc DL = SaveBlock->findDebugLoc(InsertBefore);
+      auto MIB = BuildMI(*SaveBlock, InsertBefore, DL,
+                         TII->get(AMDGPU::IMPLICIT_DEF), Reg);
+      MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
+      if (LIS)
+        LIS->InsertMachineInstrInMaps(*MIB);
+    }
+  }
+
+  // Insert a separate KILL per VGPR in the return blocks to extend liveness to
+  // the function end. The insert point may be end(), so never dereference it.
+  for (MachineBasicBlock *RestoreBlock : RestoreBlocks) {
+    MachineBasicBlock::iterator InsertBefore =
+        RestoreBlock->getFirstTerminator();
+    DebugLoc DL = RestoreBlock->findDebugLoc(InsertBefore);
+    for (auto Reg : MFI->getSGPRSpillVGPRs()) {
+      auto MIB = BuildMI(*RestoreBlock, InsertBefore, DL,
+                         TII->get(TargetOpcode::KILL));
+      MIB.addReg(Reg);
+      if (LIS)
+        LIS->InsertMachineInstrInMaps(*MIB);
+    }
+  }
+}
+
 bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
@@ -261,7 +316,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
   // First, expose any CSR SGPR spills. This is mostly the same as what PEI
   // does, but somewhat simpler.
   calculateSaveRestoreBlocks(MF);
-  bool HasCSRs = spillCalleeSavedRegs(MF);
+  SmallVector<int> CalleeSavedFIs;
+  bool HasCSRs = spillCalleeSavedRegs(MF, CalleeSavedFIs);
 
   MachineFrameInfo &MFI = MF.getFrameInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -275,6 +331,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
 
   bool MadeChange = false;
   bool NewReservedRegs = false;
+  bool SpilledToVirtVGPRLanes = false;
 
   // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
   // handled as SpilledToReg in regular PrologEpilogInserter.
@@ -297,23 +354,53 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
 
         int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
         assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
-        if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
-          NewReservedRegs = true;
-          bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
-              MI, FI, nullptr, Indexes, LIS);
-          (void)Spilled;
-          assert(Spilled && "failed to spill SGPR to VGPR when allocated");
-          SpillFIs.set(FI);
+
+        bool IsCalleeSaveSGPRSpill =
+            std::find(CalleeSavedFIs.begin(), CalleeSavedFIs.end(), FI) !=
+            CalleeSavedFIs.end();
+        if (IsCalleeSaveSGPRSpill) {
+          // Spill callee-saved SGPRs into physical VGPR lanes.
+
+          // TODO: This is to ensure the CFIs are static for efficient frame
+          // unwinding in the debugger. Spilling them into virtual VGPR lanes
+          // involve regalloc to allocate the physical VGPRs and that might
+          // cause intermediate spill/split of such liveranges for successful
+          // allocation. This would result in broken CFI encoding unless the
+          // regalloc aware CFI generation to insert new CFIs along with the
+          // intermediate spills is implemented. There is no such support
+          // currently exist in the LLVM compiler.
+          if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) {
+            NewReservedRegs = true;
+            bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
+                MI, FI, nullptr, Indexes, LIS, true);
+            if (!Spilled)
+              llvm_unreachable(
+                  "failed to spill SGPR to physical VGPR lane when allocated");
+          }
+        } else {
+          if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
+            bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
+                MI, FI, nullptr, Indexes, LIS);
+            if (!Spilled)
+              llvm_unreachable(
+                  "failed to spill SGPR to virtual VGPR lane when allocated");
+            SpillFIs.set(FI);
+            SpilledToVirtVGPRLanes = true;
+          }
         }
       }
     }
 
-    // FIXME: Adding to live-ins redundant with reserving registers.
-    for (MachineBasicBlock &MBB : MF) {
-      for (auto Reg : FuncInfo->getSGPRSpillVGPRs())
-        MBB.addLiveIn(Reg);
-      MBB.sortUniqueLiveIns();
+    if (SpilledToVirtVGPRLanes) {
+      extendWWMVirtRegLiveness(MF, LIS);
+      if (LIS) {
+        // Compute the LiveInterval for the newly created virtual registers.
+        for (auto Reg : FuncInfo->getSGPRSpillVGPRs())
+          LIS->createAndComputeVirtRegInterval(Reg);
+      }
+    }
 
+    for (MachineBasicBlock &MBB : MF) {
       // FIXME: The dead frame indices are replaced with a null register from
       // the debug value instructions. We should instead, update it with the
       // correct register value. But not sure the register value alone is
@@ -334,6 +421,10 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
     // lane".
     FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false);
 
+    MadeChange = true;
+  }
+
+  if (SpilledToVirtVGPRLanes) {
     const TargetRegisterClass *RC = TRI->getWaveMaskRegClass();
     // Shift back the reserved SGPR for EXEC copy into the lowest range.
     // This SGPR is reserved to handle the whole-wave spill/copy operations
@@ -342,20 +433,21 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
     if (UnusedLowSGPR && TRI->getHWRegIndex(UnusedLowSGPR) <
                              TRI->getHWRegIndex(FuncInfo->getSGPRForEXECCopy()))
       FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR);
-
-    MadeChange = true;
   } else {
-    // No SGPR spills and hence there won't be any WWM spills/copies. Reset the
-    // SGPR reserved for EXEC copy.
+    // No SGPR spills to virtual VGPR lanes and hence there won't be any WWM
+    // spills/copies. Reset the SGPR reserved for EXEC copy.
     FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister);
   }
 
   SaveBlocks.clear();
   RestoreBlocks.clear();
 
-  // Updated the reserved registers with any VGPRs added for SGPR spills.
-  if (NewReservedRegs)
-    MRI.freezeReservedRegs(MF);
+  // Updated the reserved registers with any physical VGPRs added for SGPR
+  // spills.
+  if (NewReservedRegs) {
+    for (Register Reg : FuncInfo->getWWMReservedRegs())
+      MRI.reserveReg(Reg, TRI);
+  }
 
   return MadeChange;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -314,37 +314,23 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
   return false;
 }
 
-bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF,
-                                                      int FI,
-                                                      unsigned LaneIndex) {
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
+    MachineFunction &MF, int FI, unsigned LaneIndex) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
   Register LaneVGPR;
   if (!LaneIndex) {
-    LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
-    if (LaneVGPR == AMDGPU::NoRegister) {
-      // We have no VGPRs left for spilling SGPRs. Reset because we will not
-      // partially spill the SGPR to VGPRs.
-      SGPRSpillToVGPRLanes.erase(FI);
-      return false;
-    }
-
+    LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     SpillVGPRs.push_back(LaneVGPR);
-    // Add this register as live-in to all blocks to avoid machine verifier
-    // complaining about use of an undefined physical register.
-    for (MachineBasicBlock &BB : MF)
-      BB.addLiveIn(LaneVGPR);
   } else {
     LaneVGPR = SpillVGPRs.back();
   }
 
-  SGPRSpillToVGPRLanes[FI].push_back(
+  SGPRSpillsToVirtualVGPRLanes[FI].push_back(
       SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
   return true;
 }
 
-bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills(
+bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
     MachineFunction &MF, int FI, unsigned LaneIndex) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -355,16 +341,21 @@ bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills(
     if (LaneVGPR == AMDGPU::NoRegister) {
       // We have no VGPRs left for spilling SGPRs. Reset because we will not
       // partially spill the SGPR to VGPRs.
-      PrologEpilogSGPRSpillToVGPRLanes.erase(FI);
+      SGPRSpillsToPhysicalVGPRLanes.erase(FI);
       return false;
     }
 
     allocateWWMSpill(MF, LaneVGPR);
+    reserveWWMRegister(LaneVGPR);
+    for (MachineBasicBlock &MBB : MF) {
+      MBB.addLiveIn(LaneVGPR);
+      MBB.sortUniqueLiveIns();
+    }
   } else {
-    LaneVGPR = WWMSpills.back().first;
+    LaneVGPR = WWMReservedRegs.back();
   }
 
-  PrologEpilogSGPRSpillToVGPRLanes[FI].push_back(
+  SGPRSpillsToPhysicalVGPRLanes[FI].push_back(
       SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
   return true;
 }
@@ -373,8 +364,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
                                                         int FI,
                                                         bool IsPrologEpilog) {
   std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
-      IsPrologEpilog ? PrologEpilogSGPRSpillToVGPRLanes[FI]
-                     : SGPRSpillToVGPRLanes[FI];
+      IsPrologEpilog ? SGPRSpillsToPhysicalVGPRLanes[FI]
+                     : SGPRSpillsToVirtualVGPRLanes[FI];
 
   // This has already been allocated.
   if (!SpillLanes.empty())
@@ -395,15 +386,14 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
          "not spilling SGPRs to VGPRs");
 
   unsigned &NumSpillLanes =
-      IsPrologEpilog ? NumVGPRPrologEpilogSpillLanes : NumVGPRSpillLanes;
+      IsPrologEpilog ? NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes;
 
   for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
     unsigned LaneIndex = (NumSpillLanes % WaveSize);
 
-    bool Allocated =
-        IsPrologEpilog
-            ? allocateVGPRForPrologEpilogSGPRSpills(MF, FI, LaneIndex)
-            : allocateVGPRForSGPRSpills(MF, FI, LaneIndex);
+    bool Allocated = IsPrologEpilog
+                         ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex)
+                         : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
     if (!Allocated) {
       NumSpillLanes -= I;
       return false;
@@ -484,16 +474,25 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
 
 bool SIMachineFunctionInfo::removeDeadFrameIndices(
     MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
-  // Remove dead frame indices from function frame. And also make sure to remove
-  // the frame indices from `SGPRSpillToVGPRLanes` data structure, otherwise, it
-  // could result in an unexpected side effect and bug, in case of any
-  // re-mapping of freed frame indices by later pass(es) like "stack slot
+  // Remove dead frame indices from function frame, however keep FP & BP since
+  // spills for them haven't been inserted yet. And also make sure to remove the
+  // frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure,
+  // otherwise, it could result in an unexpected side effect and bug, in case of
+  // any re-mapping of freed frame indices by later pass(es) like "stack slot
   // coloring".
-  for (auto &R : make_early_inc_range(SGPRSpillToVGPRLanes)) {
+  for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) {
     MFI.RemoveStackObject(R.first);
-    SGPRSpillToVGPRLanes.erase(R.first);
+    SGPRSpillsToVirtualVGPRLanes.erase(R.first);
   }
 
+  // Remove the dead frame indices of CSR SGPRs which are spilled to physical
+  // VGPR lanes during SILowerSGPRSpills pass.
+  if (!ResetSGPRSpillStackIDs) {
+    for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) {
+      MFI.RemoveStackObject(R.first);
+      SGPRSpillsToPhysicalVGPRLanes.erase(R.first);
+    }
+  }
   bool HaveSGPRToMemory = false;
 
   if (ResetSGPRSpillStackIDs) {