Skip to content

Commit

Permalink
[AMDGPU] Control Flow lowering: add S_CMOV_b32/64_term and S_CSELECT_…
Browse files Browse the repository at this point in the history
…B32/64_term pseudo instructions
  • Loading branch information
alex-t committed May 20, 2024
1 parent 1805a17 commit a96acb5
Show file tree
Hide file tree
Showing 128 changed files with 14,041 additions and 15,175 deletions.
5 changes: 3 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1553,7 +1553,8 @@ bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
return true;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
bool AMDGPUInstructionSelector::selectWaveReconvergeIntrinsic(
MachineInstr &MI) const {
// FIXME: Manually selecting to avoid dealing with the SReg_1 trick
// SelectionDAG uses for wave32 vs wave64.
MachineBasicBlock *BB = MI.getParent();
Expand Down Expand Up @@ -2084,7 +2085,7 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
switch (IntrinsicID) {
case Intrinsic::amdgcn_wave_reconverge:
return selectEndCfIntrinsic(I);
return selectWaveReconvergeIntrinsic(I);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
return selectDSOrderedIntrinsic(I, IntrinsicID);
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectReturnAddress(MachineInstr &I) const;
bool selectG_INTRINSIC(MachineInstr &I) const;

bool selectEndCfIntrinsic(MachineInstr &MI) const;
bool selectWaveReconvergeIntrinsic(MachineInstr &MI) const;
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15754,7 +15754,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
Next++;
}

assert((Next == MBB.end() || !Next->readsRegister(AMDGPU::SCC)) &&
assert((Next == MBB.end() || !Next->readsRegister(AMDGPU::SCC, TRI)) &&
"Malformed CFG detected!\n");

if (NeedToMove) {
Expand Down
28 changes: 28 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2103,12 +2103,36 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(AMDGPU::S_MOV_B64));
break;

case AMDGPU::S_CMOV_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_CMOV_B64));
break;

case AMDGPU::S_MOV_B32_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_MOV_B32));
break;

case AMDGPU::S_CMOV_B32_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_CMOV_B32));
break;

case AMDGPU::S_CSELECT_B32_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_CSELECT_B32));
break;

case AMDGPU::S_CSELECT_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_CSELECT_B64));
break;

case AMDGPU::S_XOR_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
Expand Down Expand Up @@ -3088,17 +3112,21 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
while (I != E && !I->isBranch() && !I->isReturn()) {
switch (I->getOpcode()) {
case AMDGPU::S_MOV_B64_term:
case AMDGPU::S_CMOV_B64_term:
case AMDGPU::S_XOR_B64_term:
case AMDGPU::S_OR_B64_term:
case AMDGPU::S_ANDN2_B64_term:
case AMDGPU::S_AND_B64_term:
case AMDGPU::S_AND_SAVEEXEC_B64_term:
case AMDGPU::S_CSELECT_B64_term:
case AMDGPU::S_MOV_B32_term:
case AMDGPU::S_CMOV_B32_term:
case AMDGPU::S_XOR_B32_term:
case AMDGPU::S_OR_B32_term:
case AMDGPU::S_ANDN2_B32_term:
case AMDGPU::S_AND_B32_term:
case AMDGPU::S_AND_SAVEEXEC_B32_term:
case AMDGPU::S_CSELECT_B32_term:
break;
case AMDGPU::SI_IF:
case AMDGPU::SI_ELSE:
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,8 @@ class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<

let WaveSizePredicate = isWave64 in {
def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
def S_CMOV_B64_term : WrapTerminatorInst<S_CMOV_B64>;
def S_CSELECT_B64_term : WrapTerminatorInst<S_CSELECT_B64>;
def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
Expand All @@ -359,6 +361,8 @@ def S_AND_SAVEEXEC_B64_term : WrapTerminatorInst<S_AND_SAVEEXEC_B64>;

let WaveSizePredicate = isWave32 in {
def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
def S_CMOV_B32_term : WrapTerminatorInst<S_CMOV_B32>;
def S_CSELECT_B32_term : WrapTerminatorInst<S_CSELECT_B32>;
def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
Expand Down
111 changes: 84 additions & 27 deletions llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ class SILowerControlFlow : public MachineFunctionPass {
SmallSet<Register, 8> RecomputeRegs;

const TargetRegisterClass *BoolRC = nullptr;
long unsigned TestMask;
uint64_t TestMask;
unsigned Select;
unsigned CmovOpc;
unsigned AndOpc;
Expand All @@ -96,12 +96,14 @@ class SILowerControlFlow : public MachineFunctionPass {
unsigned OrSaveExecOpc;
unsigned Exec;

bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End);

void emitIf(MachineInstr &MI);
void emitElse(MachineInstr &MI);
void emitIfBreak(MachineInstr &MI);
void emitLoop(MachineInstr &MI);
void emitWaveDiverge(MachineInstr &MI, Register EnabledLanesMask,
Register DisableLanesMask);
Register DisableLanesMask, bool IsIf);

void emitWaveReconverge(MachineInstr &MI);

Expand Down Expand Up @@ -165,6 +167,37 @@ INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,

char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;

bool SILowerControlFlow::hasKill(const MachineBasicBlock *Begin,
const MachineBasicBlock *End) {
DenseSet<const MachineBasicBlock*> Visited;
SmallVector<MachineBasicBlock *, 4> Worklist(Begin->successors());

while (!Worklist.empty()) {
MachineBasicBlock *MBB = Worklist.pop_back_val();

if (MBB == End || !Visited.insert(MBB).second)
continue;
if (KillBlocks.contains(MBB))
return true;

Worklist.append(MBB->succ_begin(), MBB->succ_end());
}

return false;
}

static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
Register SaveExecReg = MI.getOperand(0).getReg();
auto U = MRI->use_instr_nodbg_begin(SaveExecReg);

if (U == MRI->use_instr_nodbg_end() ||
std::next(U) != MRI->use_instr_nodbg_end() ||
U->getOpcode() != AMDGPU::SI_WAVE_RECONVERGE)
return false;

return true;
}

void SILowerControlFlow::emitIf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
Expand All @@ -173,6 +206,9 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
MachineOperand &Cond = MI.getOperand(1);
assert(Cond.getSubReg() == AMDGPU::NoSubRegister);
Register CondReg = Cond.getReg();
MachineInstr *CondRegDef = MRI->getVRegDef(CondReg);
if (CondRegDef && CondRegDef->getParent() == &MBB && TII->isVALU(*CondRegDef))
return emitWaveDiverge(MI, CondReg, MaskElse, true);

Register MaskThen = MRI->createVirtualRegister(BoolRC);
// Get rid of the garbage bits in the Cond register which might be coming from
Expand All @@ -184,7 +220,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
if (LV)
LV->replaceKillInstruction(CondReg, MI, *CondFiltered);

emitWaveDiverge(MI, MaskThen, MaskElse);
emitWaveDiverge(MI, MaskThen, MaskElse, true);

if (LIS) {
LIS->InsertMachineInstrInMaps(*CondFiltered);
Expand All @@ -195,7 +231,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
void SILowerControlFlow::emitElse(MachineInstr &MI) {
Register InvCondReg = MI.getOperand(0).getReg();
Register CondReg = MI.getOperand(1).getReg();
emitWaveDiverge(MI, CondReg, InvCondReg);
emitWaveDiverge(MI, CondReg, InvCondReg, false);
}

void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
Expand Down Expand Up @@ -258,24 +294,19 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {

Register Cond = MI.getOperand(0).getReg();
Register MaskLoop = MRI->createVirtualRegister(BoolRC);
Register MaskExit = MRI->createVirtualRegister(BoolRC);
Register AndZero = MRI->createVirtualRegister(BoolRC);

MachineInstr *CondLoop = BuildMI(MBB, &MI, DL, TII->get(Andn2Opc), MaskLoop)
.addReg(Exec)
.addReg(Cond);

MachineInstr *ExitExec = BuildMI(MBB, &MI, DL, TII->get(OrOpc), MaskExit)
.addReg(Cond)
.addReg(Exec);

MachineInstr *IfZeroMask = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndZero)
.addReg(MaskLoop)
.addImm(TestMask);

MachineInstr *SetExec= BuildMI(MBB, &MI, DL, TII->get(Select), Exec)
.addReg(MaskLoop)
.addReg(MaskExit);
.addReg(Cond);

if (LV)
LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *SetExec);
Expand All @@ -290,10 +321,8 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
LIS->ReplaceMachineInstrInMaps(MI, *SetExec);
LIS->InsertMachineInstrInMaps(*CondLoop);
LIS->InsertMachineInstrInMaps(*IfZeroMask);
LIS->InsertMachineInstrInMaps(*ExitExec);
LIS->InsertMachineInstrInMaps(*Branch);
LIS->createAndComputeVirtRegInterval(MaskLoop);
LIS->createAndComputeVirtRegInterval(MaskExit);
LIS->createAndComputeVirtRegInterval(AndZero);
}

Expand All @@ -302,20 +331,49 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {

void SILowerControlFlow::emitWaveDiverge(MachineInstr &MI,
Register EnabledLanesMask,
Register DisableLanesMask) {
Register DisableLanesMask, bool IsIf) {

MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock::iterator I(MI);

MachineInstr *CondInverted =
BuildMI(MBB, I, DL, TII->get(XorOpc), DisableLanesMask)
.addReg(EnabledLanesMask)
.addReg(Exec);

if (LV) {
LV->replaceKillInstruction(DisableLanesMask, MI, *CondInverted);
bool NeedXor = true;
if (IsIf) {
// If the only use of the saved exec register is SI_WAVE_RECONVERGE, we can
// optimize SI_IF by returning the full saved exec mask instead of just the
// cleared bits.
bool SimpleIf = isSimpleIf(MI, MRI);

if (SimpleIf) {
// Check for SI_KILL_*_TERMINATOR on the path from if to endif.
// If there is any such terminator, the simplification is not safe.
auto UseMI = MRI->use_instr_nodbg_begin(DisableLanesMask);
SimpleIf = !hasKill(MI.getParent(), UseMI->getParent());
}
NeedXor = !SimpleIf;
}

if (NeedXor) {

MachineInstr *CondInverted =
BuildMI(MBB, I, DL, TII->get(XorOpc), DisableLanesMask)
.addReg(EnabledLanesMask)
.addReg(Exec);

if (LV) {
LV->replaceKillInstruction(DisableLanesMask, MI, *CondInverted);
}

if (LIS) {
LIS->InsertMachineInstrInMaps(*CondInverted);
}
} else {
MachineInstr *CopyExec =
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DisableLanesMask)
.addReg(Exec);
if(LIS)
LIS->InsertMachineInstrInMaps(*CopyExec);
}
Register TestResultReg = MRI->createVirtualRegister(BoolRC);
MachineInstr *IfZeroMask =
BuildMI(MBB, I, DL, TII->get(AndOpc), TestResultReg)
Expand All @@ -327,7 +385,7 @@ void SILowerControlFlow::emitWaveDiverge(MachineInstr &MI,

MachineBasicBlock *FlowBB = MI.getOperand(2).getMBB();
MachineBasicBlock *TargetBB = nullptr;
// determine target BBs
// determine target BBs
I = skipToUncondBrOrEnd(MBB, I);
if (I != MBB.end()) {
// skipToUncondBrOrEnd returns either unconditional branch or end()
Expand Down Expand Up @@ -358,8 +416,7 @@ void SILowerControlFlow::emitWaveDiverge(MachineInstr &MI,
return;
}

LIS->InsertMachineInstrInMaps(*CondInverted);
LIS->InsertMachineInstrInMaps(*IfZeroMask);
LIS->InsertMachineInstrInMaps(*IfZeroMask);
LIS->ReplaceMachineInstrInMaps(MI, *SetExecForSucc);

RecomputeRegs.insert(MI.getOperand(0).getReg());
Expand Down Expand Up @@ -607,8 +664,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {

if (ST.isWave32()) {
TestMask = 0xffffffff;
Select = AMDGPU::S_CSELECT_B32;
CmovOpc = AMDGPU::S_CMOV_B32;
Select = AMDGPU::S_CSELECT_B32_term;
CmovOpc = AMDGPU::S_CMOV_B32_term;
AndOpc = AMDGPU::S_AND_B32;
Andn2Opc = AMDGPU::S_ANDN2_B32;
OrOpc = AMDGPU::S_OR_B32;
Expand All @@ -621,8 +678,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
Exec = AMDGPU::EXEC_LO;
} else {
TestMask = 0xffffffffffffffff;
Select = AMDGPU::S_CSELECT_B64;
CmovOpc = AMDGPU::S_CMOV_B64;
Select = AMDGPU::S_CSELECT_B64_term;
CmovOpc = AMDGPU::S_CMOV_B64_term;
AndOpc = AMDGPU::S_AND_B64;
Andn2Opc = AMDGPU::S_ANDN2_B64;
OrOpc = AMDGPU::S_OR_B64;
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,9 @@ Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case AMDGPU::COPY:
case AMDGPU::S_MOV_B64:
case AMDGPU::S_MOV_B32: {
case AMDGPU::S_MOV_B32:
case AMDGPU::S_CMOV_B64:
case AMDGPU::S_CMOV_B32: {
const MachineOperand &Dst = MI.getOperand(0);
if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg())
return MI.getOperand(1).getReg();
Expand Down
14 changes: 14 additions & 0 deletions llvm/test/%t
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
warning: <unknown>:0:0: in function func_use_lds_global void (): local memory global used by non-kernel function

warning: <unknown>:0:0: in function func_use_lds_global_constexpr_cast void (): local memory global used by non-kernel function

warning: <unknown>:0:0: in function func_uses_lds_multi void (i1): local memory global used by non-kernel function

warning: <unknown>:0:0: in function func_uses_lds_multi void (i1): local memory global used by non-kernel function

warning: <unknown>:0:0: in function func_uses_lds_multi void (i1): local memory global used by non-kernel function

warning: <unknown>:0:0: in function func_uses_lds_code_after void (ptr addrspace(1)): local memory global used by non-kernel function

warning: <unknown>:0:0: in function func_uses_lds_phi_after i32 (i1, ptr addrspace(1)): local memory global used by non-kernel function

Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,8 @@ define void @divergent_i1_phi_used_inside_loop(float %val, ptr %addr) {
; GFX10-NEXT: s_and_b32 s4, exec_lo, s4
; GFX10-NEXT: s_or_b32 s6, s6, s4
; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5
; GFX10-NEXT: s_or_b32 s7, s5, exec_lo
; GFX10-NEXT: s_and_b32 s8, s4, -1
; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s7
; GFX10-NEXT: s_and_b32 s7, s4, -1
; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5
; GFX10-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10-NEXT: ; %bb.2: ; %exit
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
Expand Down Expand Up @@ -166,9 +165,8 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
; GFX10-NEXT: s_and_b32 s7, exec_lo, s4
; GFX10-NEXT: s_or_b32 s6, s6, s7
; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s5
; GFX10-NEXT: s_or_b32 s8, s5, exec_lo
; GFX10-NEXT: s_and_b32 s9, s7, -1
; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s8
; GFX10-NEXT: s_and_b32 s8, s7, -1
; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s5
; GFX10-NEXT: s_cbranch_scc0 .LBB3_6
; GFX10-NEXT: .LBB3_2: ; %loop_start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
Expand Down
Loading

0 comments on commit a96acb5

Please sign in to comment.