Skip to content

Commit

Permalink
[AMDGPU] Split VGPR regalloc pipeline
Browse files Browse the repository at this point in the history
Allocating wwm-registers and regular VGPR operands
together imposes many challenges in the way the
registers are reused during allocation. There are
times when regalloc reuses the registers of regular
VGPR operations for wwm-operations in a small range,
unintentionally clobbering their inactive lanes and
causing correctness issues which are hard to trace.

This patch splits the VGPR allocation pipeline further
to allocate wwm-registers first and the regular VGPR
operands in a separate pipeline. The splitting would
ensure that the physical registers used for wwm
allocations won't take part in the next allocation
pipeline to avoid any such clobbering.
  • Loading branch information
cdevadas committed Jun 27, 2024
1 parent 29e59f4 commit 906586e
Show file tree
Hide file tree
Showing 83 changed files with 9,085 additions and 9,497 deletions.
2 changes: 2 additions & 0 deletions llvm/include/llvm/CodeGen/MachineRegisterInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ class MachineRegisterInfo {
TheDelegate->MRI_NoteCloneVirtualRegister(NewReg, SrcReg);
}

/// Returns a const reference to the MachineFunction that the MF member
/// points to.
const MachineFunction &getMF() const { return *MF; }

//===--------------------------------------------------------------------===//
// Function State
//===--------------------------------------------------------------------===//
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *);
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPULateCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
FunctionPass *createAMDGPUReserveWWMRegsPass();
FunctionPass *createAMDGPURewriteOutArgumentsPass();
ModulePass *
createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
Expand Down Expand Up @@ -149,6 +150,9 @@ struct AMDGPULowerBufferFatPointersPass
const TargetMachine &TM;
};

// Pass-registry initializer and pass identifier for the AMDGPUReserveWWMRegs
// machine function pass.
void initializeAMDGPUReserveWWMRegsPass(PassRegistry &);
extern char &AMDGPUReserveWWMRegsID;

void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
extern char &AMDGPURewriteOutArgumentsID;

Expand Down
96 changes: 96 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
//===-- AMDGPUReserveWWMRegs.cpp - Add WWM Regs to reserved regs list -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass should be invoked at the end of the wwm-regalloc pipeline.
/// It identifies the WWM regs allocated during this pipeline and adds
/// them to the list of reserved registers so that they won't be available for
/// per-thread VGPR allocation in the subsequent regalloc pipeline.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-reserve-wwm-regs"

namespace {

class AMDGPUReserveWWMRegs : public MachineFunctionPass {
public:
static char ID;

AMDGPUReserveWWMRegs() : MachineFunctionPass(ID) {
initializeAMDGPUReserveWWMRegsPass(*PassRegistry::getPassRegistry());
}

bool runOnMachineFunction(MachineFunction &MF) override;

StringRef getPassName() const override {
return "AMDGPU Reserve WWM Registers";
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
};

} // End anonymous namespace.

// Register the pass using DEBUG_TYPE ("amdgpu-reserve-wwm-regs") as its
// argument name. NOTE(review): the trailing `false, false` are presumably the
// CFG-only and is-analysis flags of INITIALIZE_PASS — confirm against the
// macro definition.
INITIALIZE_PASS(AMDGPUReserveWWMRegs, DEBUG_TYPE,
"AMDGPU Reserve WWM Registers", false, false)

// Storage for the pass identifier declared inside the class.
char AMDGPUReserveWWMRegs::ID = 0;

// Externally visible reference to the pass identifier (declared in AMDGPU.h).
char &llvm::AMDGPUReserveWWMRegsID = AMDGPUReserveWWMRegs::ID;

/// Walks every instruction of \p MF and reserves, as a WWM register, the
/// register found in operand 0 of each SI_SPILL_S32_TO_VGPR and in operand 1
/// of each SI_RESTORE_S32_FROM_VGPR. Afterwards it clears the renamable flag
/// on all operands of the reserved WWM registers and resets the non-WWM
/// regalloc mask stored in the function info.
///
/// \returns true if at least one spill/restore pseudo was found (and hence a
/// register was reserved), false otherwise.
bool AMDGPUReserveWWMRegs::runOnMachineFunction(MachineFunction &MF) {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  bool Changed = false;

  for (MachineBasicBlock &Block : MF) {
    for (MachineInstr &Inst : Block) {
      // Pick the operand that carries the WWM register; any other opcode is
      // of no interest to this pass.
      unsigned WWMOpIdx;
      switch (Inst.getOpcode()) {
      case AMDGPU::SI_SPILL_S32_TO_VGPR:
        WWMOpIdx = 0;
        break;
      case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
        WWMOpIdx = 1;
        break;
      default:
        continue;
      }

      Register WWMReg = Inst.getOperand(WWMOpIdx).getReg();
      assert(WWMReg.isPhysical() &&
             "All WWM registers should have been allocated by now.");

      FuncInfo->reserveWWMRegister(WWMReg);
      Changed = true;
    }
  }

  // Reserved registers must not carry the renamable flag. The WWM registers
  // become reserved for the vgpr-regalloc pipeline, so drop the flag from
  // every operand that refers to one of them.
  const MachineRegisterInfo &RegInfo = MF.getRegInfo();
  for (Register Reserved : FuncInfo->getWWMReservedRegs())
    for (MachineOperand &Operand : RegInfo.reg_operands(Reserved))
      Operand.setIsRenamable(false);

  // The NonWWMRegMask set up for wwm-regalloc is no longer needed.
  FuncInfo->clearNonWWMRegAllocMask();

  return Changed;
}
95 changes: 90 additions & 5 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,12 @@ class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
: RegisterRegAllocBase(N, D, C) {}
};

/// Registry entry type for the register allocators that can be selected with
/// the -wwm-regalloc option.
class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
public:
  /// Forwards the entry's name, description and pass constructor to the
  /// registry base class.
  WWMRegisterRegAlloc(const char *Name, const char *Description,
                      FunctionPassCtor Ctor)
      : RegisterRegAllocBase(Name, Description, Ctor) {}
};

static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
const MachineRegisterInfo &MRI,
const Register Reg) {
Expand All @@ -98,13 +104,24 @@ static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}

/// -{sgpr|vgpr}-regalloc=... command line option.
static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
const MachineRegisterInfo &MRI,
const Register Reg) {
const SIMachineFunctionInfo *MFI =
MRI.getMF().getInfo<SIMachineFunctionInfo>();
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
MFI->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
}

/// -{sgpr|wwm|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
defaultSGPRRegAlloc("default",
Expand All @@ -121,6 +138,11 @@ static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
cl::desc("Register allocator to use for VGPRs"));

// Hidden -wwm-regalloc option. Its initial value is
// useDefaultRegisterAllocator, which createWWMRegAllocPass compares against
// to detect that no allocator was chosen explicitly.
static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
RegisterPassParser<WWMRegisterRegAlloc>>
WWMRegAlloc("wwm-regalloc", cl::Hidden,
cl::init(&useDefaultRegisterAllocator),
cl::desc("Register allocator to use for WWM registers"));

static void initializeDefaultSGPRRegisterAllocatorOnce() {
RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
Expand All @@ -140,6 +162,15 @@ static void initializeDefaultVGPRRegisterAllocatorOnce() {
}
}

/// One-time initializer, run through llvm::call_once by
/// createWWMRegAllocPass, that seeds the default of the WWM allocator
/// registry.
///
/// When no default constructor has been recorded yet, the current value of
/// the -wwm-regalloc option becomes the default; an already recorded default
/// is left untouched.
static void initializeDefaultWWMRegisterAllocatorOnce() {
  if (!WWMRegisterRegAlloc::getDefault())
    WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
}

static FunctionPass *createBasicSGPRRegisterAllocator() {
return createBasicRegisterAllocator(onlyAllocateSGPRs);
}
Expand All @@ -164,6 +195,18 @@ static FunctionPass *createFastVGPRRegisterAllocator() {
return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}

/// Creates the basic register allocator with onlyAllocateWWMRegs as its
/// register filter.
static FunctionPass *createBasicWWMRegisterAllocator() {
return createBasicRegisterAllocator(onlyAllocateWWMRegs);
}

/// Creates the greedy register allocator with onlyAllocateWWMRegs as its
/// register filter.
static FunctionPass *createGreedyWWMRegisterAllocator() {
return createGreedyRegisterAllocator(onlyAllocateWWMRegs);
}

/// Creates the fast register allocator with onlyAllocateWWMRegs as its
/// register filter.
/// NOTE(review): the second argument is `false` here, whereas the VGPR variant
/// passes `true`; presumably it controls whether virtual registers are
/// cleared after allocation — confirm against createFastRegisterAllocator.
static FunctionPass *createFastWWMRegisterAllocator() {
return createFastRegisterAllocator(onlyAllocateWWMRegs, false);
}

static SGPRRegisterRegAlloc basicRegAllocSGPR(
"basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
Expand All @@ -180,7 +223,15 @@ static VGPRRegisterRegAlloc greedyRegAllocVGPR(

static VGPRRegisterRegAlloc fastRegAllocVGPR(
"fast", "fast register allocator", createFastVGPRRegisterAllocator);
}
// Registry entries for the WWM allocation stage: the names "basic", "greedy"
// and "fast" are bound to the corresponding WWM allocator factories.
static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
"basic register allocator",
createBasicWWMRegisterAllocator);
static WWMRegisterRegAlloc
greedyRegAllocWWMReg("greedy", "greedy register allocator",
createGreedyWWMRegisterAllocator);
static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
createFastWWMRegisterAllocator);
} // namespace

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
Expand Down Expand Up @@ -429,6 +480,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
initializeAMDGPULowerModuleLDSLegacyPass(*PR);
initializeAMDGPULowerBufferFatPointersPass(*PR);
initializeAMDGPUReserveWWMRegsPass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
Expand Down Expand Up @@ -952,6 +1004,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {

FunctionPass *createSGPRAllocPass(bool Optimized);
FunctionPass *createVGPRAllocPass(bool Optimized);
FunctionPass *createWWMRegAllocPass(bool Optimized);
FunctionPass *createRegAllocPass(bool Optimized) override;

bool addRegAssignAndRewriteFast() override;
Expand Down Expand Up @@ -1364,7 +1417,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
}

bool GCNPassConfig::addPreRewrite() {
addPass(&SILowerWWMCopiesID);
if (EnableRegReassign)
addPass(&GCNNSAReassignID);
return true;
Expand Down Expand Up @@ -1400,12 +1452,28 @@ FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
return createFastVGPRRegisterAllocator();
}

/// Creates the register allocator pass for the WWM allocation stage.
///
/// A registry default other than useDefaultRegisterAllocator is honoured as
/// is. Otherwise the greedy allocator is used when \p Optimized is true and
/// the fast allocator when it is false.
FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
  // Make sure the registry default has been seeded exactly once.
  llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag,
                  initializeDefaultWWMRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor SelectedCtor =
      WWMRegisterRegAlloc::getDefault();

  // No explicit selection: fall back according to the optimization mode.
  if (SelectedCtor == useDefaultRegisterAllocator)
    return Optimized ? createGreedyWWMRegisterAllocator()
                     : createFastWWMRegisterAllocator();

  return SelectedCtor();
}

/// Not expected to be called: the body is an unconditional llvm_unreachable.
/// NOTE(review): presumably the dedicated createSGPRAllocPass,
/// createWWMRegAllocPass and createVGPRAllocPass factories are used instead —
/// confirm against the callers.
FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
"-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
"-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
"and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
if (!usingDefaultRegAlloc())
Expand All @@ -1417,11 +1485,19 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() {

// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsID);

// To Allocate wwm registers used in whole quad mode operations (for shaders).
addPass(&SIPreAllocateWWMRegsID);

addPass(createVGPRAllocPass(false));
// For allocating other wwm register operands.
addPass(createWWMRegAllocPass(false));

addPass(&SILowerWWMCopiesID);
addPass(&AMDGPUReserveWWMRegsID);

// For allocating per-thread VGPRs.
addPass(createVGPRAllocPass(false));

return true;
}

Expand All @@ -1441,8 +1517,17 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {

// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsID);

// To Allocate wwm registers used in whole quad mode operations (for shaders).
addPass(&SIPreAllocateWWMRegsID);

// For allocating other whole wave mode registers.
addPass(createWWMRegAllocPass(true));
addPass(&SILowerWWMCopiesID);
addPass(createVirtRegRewriter(false));
addPass(&AMDGPUReserveWWMRegsID);

// For allocating per-thread VGPRs.
addPass(createVGPRAllocPass(true));

addPreRewrite();
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPURegBankSelect.cpp
AMDGPURegisterBankInfo.cpp
AMDGPURemoveIncompatibleFunctions.cpp
AMDGPUReserveWWMRegs.cpp
AMDGPUResourceUsageAnalysis.cpp
AMDGPURewriteOutArguments.cpp
AMDGPURewriteUndefForPHI.cpp
Expand Down
Loading

0 comments on commit 906586e

Please sign in to comment.