Skip to content

Commit

Permalink
[AMDGPU] Split vgpr regalloc pipeline (llvm#93526)
Browse files Browse the repository at this point in the history
Allocating wwm-registers and per-thread VGPR operands
together imposes many challenges in the way the
registers are reused during allocation. There are
times when regalloc reuses the registers of regular
VGPRs operations for wwm-operations in a small range
leading to unwantedly clobbering their inactive lanes
causing correctness issues that are hard to trace.

This patch splits the VGPR allocation pipeline further
to allocate wwm-registers first and the regular VGPR
operands in a separate pipeline. The splitting would
ensure that the physical registers used for wwm
allocations won't take part in the next allocation
pipeline to avoid any such clobbering.
  • Loading branch information
cdevadas authored and xgupta committed Oct 4, 2024
1 parent 5edef89 commit fd8aa0a
Show file tree
Hide file tree
Showing 90 changed files with 10,690 additions and 11,159 deletions.
2 changes: 2 additions & 0 deletions llvm/include/llvm/CodeGen/MachineRegisterInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ class MachineRegisterInfo {
TheDelegate->MRI_NoteCloneVirtualRegister(NewReg, SrcReg);
}

const MachineFunction &getMF() const { return *MF; }

//===--------------------------------------------------------------------===//
// Function State
//===--------------------------------------------------------------------===//
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ FunctionPass *createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *);
ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *);
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPULateCodeGenPrepareLegacyPass();
FunctionPass *createAMDGPUReserveWWMRegsPass();
FunctionPass *createAMDGPURewriteOutArgumentsPass();
ModulePass *
createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
Expand Down Expand Up @@ -154,6 +155,9 @@ struct AMDGPULowerBufferFatPointersPass
const TargetMachine &TM;
};

void initializeAMDGPUReserveWWMRegsPass(PassRegistry &);
extern char &AMDGPUReserveWWMRegsID;

void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
extern char &AMDGPURewriteOutArgumentsID;

Expand Down
96 changes: 96 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
//===-- AMDGPUReserveWWMRegs.cpp - Add WWM Regs to reserved regs list -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass should be invoked at the end of wwm-regalloc pipeline.
/// It identifies the WWM regs allocated during this pipeline and add
/// them to the list of reserved registers so that they won't be available for
/// per-thread VGPR allocation in the subsequent regalloc pipeline.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-reserve-wwm-regs"

namespace {

class AMDGPUReserveWWMRegs : public MachineFunctionPass {
public:
static char ID;

AMDGPUReserveWWMRegs() : MachineFunctionPass(ID) {
initializeAMDGPUReserveWWMRegsPass(*PassRegistry::getPassRegistry());
}

bool runOnMachineFunction(MachineFunction &MF) override;

StringRef getPassName() const override {
return "AMDGPU Reserve WWM Registers";
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
};

} // End anonymous namespace.

INITIALIZE_PASS(AMDGPUReserveWWMRegs, DEBUG_TYPE,
"AMDGPU Reserve WWM Registers", false, false)

char AMDGPUReserveWWMRegs::ID = 0;

char &llvm::AMDGPUReserveWWMRegsID = AMDGPUReserveWWMRegs::ID;

bool AMDGPUReserveWWMRegs::runOnMachineFunction(MachineFunction &MF) {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
unsigned Opc = MI.getOpcode();
if (Opc != AMDGPU::SI_SPILL_S32_TO_VGPR &&
Opc != AMDGPU::SI_RESTORE_S32_FROM_VGPR)
continue;

Register Reg = Opc == AMDGPU::SI_SPILL_S32_TO_VGPR
? MI.getOperand(0).getReg()
: MI.getOperand(1).getReg();

assert(Reg.isPhysical() &&
"All WWM registers should have been allocated by now.");

MFI->reserveWWMRegister(Reg);
Changed |= true;
}
}

// The renamable flag can't be set for reserved registers. Reset the flag for
// MOs involving wwm-regs as they will be reserved during vgpr-regalloc
// pipeline.
const MachineRegisterInfo &MRI = MF.getRegInfo();
for (Register Reg : MFI->getWWMReservedRegs()) {
for (MachineOperand &MO : MRI.reg_operands(Reg))
MO.setIsRenamable(false);
}

// Now clear the NonWWMRegMask earlier set during wwm-regalloc.
MFI->clearNonWWMRegAllocMask();

return Changed;
}
93 changes: 89 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,12 @@ class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
: RegisterRegAllocBase(N, D, C) {}
};

class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
public:
WWMRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
: RegisterRegAllocBase(N, D, C) {}
};

static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
const MachineRegisterInfo &MRI,
const Register Reg) {
Expand All @@ -122,13 +128,24 @@ static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}

/// -{sgpr|vgpr}-regalloc=... command line option.
static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
const MachineRegisterInfo &MRI,
const Register Reg) {
const SIMachineFunctionInfo *MFI =
MRI.getMF().getInfo<SIMachineFunctionInfo>();
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
MFI->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
}

/// -{sgpr|wwm|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
defaultSGPRRegAlloc("default",
Expand All @@ -145,6 +162,11 @@ static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
cl::desc("Register allocator to use for VGPRs"));

static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
RegisterPassParser<WWMRegisterRegAlloc>>
WWMRegAlloc("wwm-regalloc", cl::Hidden,
cl::init(&useDefaultRegisterAllocator),
cl::desc("Register allocator to use for WWM registers"));

static void initializeDefaultSGPRRegisterAllocatorOnce() {
RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
Expand All @@ -164,6 +186,15 @@ static void initializeDefaultVGPRRegisterAllocatorOnce() {
}
}

static void initializeDefaultWWMRegisterAllocatorOnce() {
RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();

if (!Ctor) {
Ctor = WWMRegAlloc;
WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
}
}

static FunctionPass *createBasicSGPRRegisterAllocator() {
return createBasicRegisterAllocator(onlyAllocateSGPRs);
}
Expand All @@ -188,6 +219,18 @@ static FunctionPass *createFastVGPRRegisterAllocator() {
return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}

static FunctionPass *createBasicWWMRegisterAllocator() {
return createBasicRegisterAllocator(onlyAllocateWWMRegs);
}

static FunctionPass *createGreedyWWMRegisterAllocator() {
return createGreedyRegisterAllocator(onlyAllocateWWMRegs);
}

static FunctionPass *createFastWWMRegisterAllocator() {
return createFastRegisterAllocator(onlyAllocateWWMRegs, false);
}

static SGPRRegisterRegAlloc basicRegAllocSGPR(
"basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
Expand All @@ -204,6 +247,14 @@ static VGPRRegisterRegAlloc greedyRegAllocVGPR(

static VGPRRegisterRegAlloc fastRegAllocVGPR(
"fast", "fast register allocator", createFastVGPRRegisterAllocator);
static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
"basic register allocator",
createBasicWWMRegisterAllocator);
static WWMRegisterRegAlloc
greedyRegAllocWWMReg("greedy", "greedy register allocator",
createGreedyWWMRegisterAllocator);
static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
createFastWWMRegisterAllocator);
} // anonymous namespace

static cl::opt<bool>
Expand Down Expand Up @@ -440,6 +491,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
initializeAMDGPULowerModuleLDSLegacyPass(*PR);
initializeAMDGPULowerBufferFatPointersPass(*PR);
initializeAMDGPUReserveWWMRegsPass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
Expand Down Expand Up @@ -989,6 +1041,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {

FunctionPass *createSGPRAllocPass(bool Optimized);
FunctionPass *createVGPRAllocPass(bool Optimized);
FunctionPass *createWWMRegAllocPass(bool Optimized);
FunctionPass *createRegAllocPass(bool Optimized) override;

bool addRegAssignAndRewriteFast() override;
Expand Down Expand Up @@ -1382,7 +1435,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
}

bool GCNPassConfig::addPreRewrite() {
addPass(&SILowerWWMCopiesID);
if (EnableRegReassign)
addPass(&GCNNSAReassignID);
return true;
Expand Down Expand Up @@ -1418,12 +1470,28 @@ FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
// Initialize the global default.
llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag,
initializeDefaultWWMRegisterAllocatorOnce);

RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
if (Ctor != useDefaultRegisterAllocator)
return Ctor();

if (Optimized)
return createGreedyWWMRegisterAllocator();

return createFastWWMRegisterAllocator();
}

FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
"-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
"-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
"and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
if (!usingDefaultRegAlloc())
Expand All @@ -1435,11 +1503,19 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() {

// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsLegacyID);

// To Allocate wwm registers used in whole quad mode operations (for shaders).
addPass(&SIPreAllocateWWMRegsID);

addPass(createVGPRAllocPass(false));
// For allocating other wwm register operands.
addPass(createWWMRegAllocPass(false));

addPass(&SILowerWWMCopiesID);
addPass(&AMDGPUReserveWWMRegsID);

// For allocating per-thread VGPRs.
addPass(createVGPRAllocPass(false));

return true;
}

Expand All @@ -1459,8 +1535,17 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {

// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsLegacyID);

// To Allocate wwm registers used in whole quad mode operations (for shaders).
addPass(&SIPreAllocateWWMRegsID);

// For allocating other whole wave mode registers.
addPass(createWWMRegAllocPass(true));
addPass(&SILowerWWMCopiesID);
addPass(createVirtRegRewriter(false));
addPass(&AMDGPUReserveWWMRegsID);

// For allocating per-thread VGPRs.
addPass(createVGPRAllocPass(true));

addPreRewrite();
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPURegBankSelect.cpp
AMDGPURegisterBankInfo.cpp
AMDGPURemoveIncompatibleFunctions.cpp
AMDGPUReserveWWMRegs.cpp
AMDGPUResourceUsageAnalysis.cpp
AMDGPURewriteOutArguments.cpp
AMDGPURewriteUndefForPHI.cpp
Expand Down
Loading

0 comments on commit fd8aa0a

Please sign in to comment.