-
Notifications
You must be signed in to change notification settings - Fork 12k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SME] Stop RA from coalescing COPY instructions that transcend beyond smstart/smstop. #78294
Conversation
@llvm/pr-subscribers-backend-aarch64 Author: Sander de Smalen (sdesmalen-arm) ChangesThis patch introduces a 'COALESCER_BARRIER' which is a pseudo node that expands to For example:
If the COPY would be coalesced, that would lead to:
being replaced by:
which means the whole ZPR reg would be live upto the call, causing the
which would be incorrect for two reasons:
By disabling the coalescing, we get the desired results:
Patch is 89.61 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/78294.diff 11 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index bb7f4d907ffd7f..08a24badda266c 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1528,6 +1528,12 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated.
return true;
}
+ case AArch64::COALESCER_BARRIER_FPR16:
+ case AArch64::COALESCER_BARRIER_FPR32:
+ case AArch64::COALESCER_BARRIER_FPR64:
+ case AArch64::COALESCER_BARRIER_FPR128:
+ MI.eraseFromParent();
+ return true;
case AArch64::LD1B_2Z_IMM_PSEUDO:
return expandMultiVecPseudo(
MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 620872790ed8db..3d7138855045c8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2338,6 +2338,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((AArch64ISD::NodeType)Opcode) {
case AArch64ISD::FIRST_NUMBER:
break;
+ MAKE_CASE(AArch64ISD::COALESCER_BARRIER)
MAKE_CASE(AArch64ISD::SMSTART)
MAKE_CASE(AArch64ISD::SMSTOP)
MAKE_CASE(AArch64ISD::RESTORE_ZA)
@@ -7090,13 +7091,18 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
}
}
+static bool isPassedInFPR(EVT VT) {
+ return VT.isFixedLengthVector() ||
+ (VT.isFloatingPoint() && !VT.isScalableVector());
+}
+
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue AArch64TargetLowering::LowerCallResult(
SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
- SDValue ThisVal) const {
+ SDValue ThisVal, bool RequiresSMChange) const {
DenseMap<unsigned, SDValue> CopiedRegs;
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
@@ -7141,6 +7147,10 @@ SDValue AArch64TargetLowering::LowerCallResult(
break;
}
+ if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
+ Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, Val.getValueType(),
+ Val);
+
InVals.push_back(Val);
}
@@ -7829,6 +7839,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
return ArgReg.Reg == VA.getLocReg();
});
} else {
+ // Add an extra level of indirection for streaming mode changes by
+ // using a pseudo copy node that cannot be rematerialised between a
+ // smstart/smstop and the call by the simple register coalescer.
+ if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
+ Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
+ Arg.getValueType(), Arg);
RegsToPass.emplace_back(VA.getLocReg(), Arg);
RegsUsed.insert(VA.getLocReg());
const TargetOptions &Options = DAG.getTarget().Options;
@@ -8063,7 +8079,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// return.
SDValue Result = LowerCallResult(Chain, InGlue, CallConv, IsVarArg, RVLocs,
DL, DAG, InVals, IsThisReturn,
- IsThisReturn ? OutVals[0] : SDValue());
+ IsThisReturn ? OutVals[0] : SDValue(),
+ RequiresSMChange.has_value());
if (!Ins.empty())
InGlue = Result.getValue(Result->getNumValues() - 1);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 1fd639b4f7ee8f..2cb22752152fe5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -58,6 +58,8 @@ enum NodeType : unsigned {
CALL_BTI, // Function call followed by a BTI instruction.
+ COALESCER_BARRIER,
+
SMSTART,
SMSTOP,
RESTORE_ZA,
@@ -1021,7 +1023,7 @@ class AArch64TargetLowering : public TargetLowering {
const SmallVectorImpl<CCValAssign> &RVLocs,
const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
- SDValue ThisVal) const;
+ SDValue ThisVal, bool RequiresSMChange) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 24ba9dd95004c6..fbe4ded2eaabcf 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1012,6 +1012,8 @@ bool AArch64RegisterInfo::shouldCoalesce(
MachineInstr *MI, const TargetRegisterClass *SrcRC, unsigned SubReg,
const TargetRegisterClass *DstRC, unsigned DstSubReg,
const TargetRegisterClass *NewRC, LiveIntervals &LIS) const {
+ MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
+
if (MI->isCopy() &&
((DstRC->getID() == AArch64::GPR64RegClassID) ||
(DstRC->getID() == AArch64::GPR64commonRegClassID)) &&
@@ -1020,5 +1022,38 @@ bool AArch64RegisterInfo::shouldCoalesce(
// which implements a 32 to 64 bit zero extension
// which relies on the upper 32 bits being zeroed.
return false;
+
+ auto IsCoalescerBarrier = [](const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AArch64::COALESCER_BARRIER_FPR16:
+ case AArch64::COALESCER_BARRIER_FPR32:
+ case AArch64::COALESCER_BARRIER_FPR64:
+ case AArch64::COALESCER_BARRIER_FPR128:
+ return true;
+ default:
+ return false;
+ }
+ };
+
+ // For calls that temporarily have to toggle streaming mode as part of the
+ // call-sequence, we need to be more careful when coalescing copy instructions
+ // so that we don't end up coalescing the NEON/FP result or argument register
+ // with a whole Z-register, such that after coalescing the register allocator
+ // will try to spill/reload the entire Z register.
+ //
+ // We do this by checking if the node has any defs/uses that are COALESCER_BARRIER
+ // pseudos. These are 'nops' in practice, but they exist to instruct the
+ // coalescer to avoid coalescing the copy.
+ if (MI->isCopy() && SubReg != DstSubReg &&
+ (AArch64::ZPRRegClass.hasSubClassEq(DstRC) ||
+ AArch64::ZPRRegClass.hasSubClassEq(SrcRC))) {
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ if (any_of(MRI.def_instructions(SrcReg), IsCoalescerBarrier))
+ return false;
+ unsigned DstReg = MI->getOperand(0).getReg();
+ if (any_of(MRI.use_nodbg_instructions(DstReg), IsCoalescerBarrier))
+ return false;
+ }
+
return true;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 380f6e1fcfdaef..5f075a1be6325d 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -22,6 +22,8 @@ def AArch64_restore_za : SDNode<"AArch64ISD::RESTORE_ZA", SDTypeProfile<0, 3,
[SDTCisInt<0>, SDTCisPtrTy<1>]>,
[SDNPHasChain, SDNPSideEffect, SDNPVariadic,
SDNPOptInGlue]>;
+def AArch64CoalescerBarrier
+ : SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, []>;
//===----------------------------------------------------------------------===//
// Instruction naming conventions.
@@ -183,6 +185,26 @@ def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val),
(MSR 0xde85, GPR64:$val)>;
def : Pat<(i64 (int_aarch64_sme_get_tpidr2)),
(MRS 0xde85)>;
+
+multiclass CoalescerBarrierPseudo<RegisterClass rc, list<ValueType> vts> {
+ def NAME : Pseudo<(outs rc:$dst), (ins rc:$idx), []>, Sched<[]> {
+ let Constraints = "$dst = $idx";
+ }
+ foreach vt = vts in {
+ def : Pat<(vt (AArch64CoalescerBarrier (vt rc:$idx))),
+ (!cast<Instruction>(NAME) rc:$idx)>;
+ }
+}
+
+multiclass CoalescerBarriers {
+ defm _FPR16 : CoalescerBarrierPseudo<FPR16, [f16]>;
+ defm _FPR32 : CoalescerBarrierPseudo<FPR32, [f32]>;
+ defm _FPR64 : CoalescerBarrierPseudo<FPR64, [f64, v8i8, v4i16, v2i32, v1i64, v4f16, v2f32, v1f64, v4bf16]>;
+ defm _FPR128 : CoalescerBarrierPseudo<FPR128, [f128, v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64, v8bf16]>;
+}
+
+defm COALESCER_BARRIER : CoalescerBarriers;
+
} // End let Predicates = [HasSME]
// Pseudo to match to smstart/smstop. This expands:
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index e18e18a1cfad18..381091b4539433 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -23,9 +23,9 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline
; CHECK-FISEL-NEXT: bl streaming_callee
; CHECK-FISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
; CHECK-FISEL-NEXT: smstop sm
+; CHECK-FISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload
; CHECK-FISEL-NEXT: adrp x8, .LCPI0_0
; CHECK-FISEL-NEXT: ldr d0, [x8, :lo12:.LCPI0_0]
-; CHECK-FISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload
; CHECK-FISEL-NEXT: fadd d0, d1, d0
; CHECK-FISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-FISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
@@ -49,9 +49,9 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline
; CHECK-GISEL-NEXT: bl streaming_callee
; CHECK-GISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
; CHECK-GISEL-NEXT: smstop sm
+; CHECK-GISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload
; CHECK-GISEL-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000
; CHECK-GISEL-NEXT: fmov d0, x8
-; CHECK-GISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload
; CHECK-GISEL-NEXT: fadd d0, d1, d0
; CHECK-GISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-GISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
@@ -82,9 +82,9 @@ define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline
; CHECK-COMMON-NEXT: bl normal_callee
; CHECK-COMMON-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
+; CHECK-COMMON-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000
; CHECK-COMMON-NEXT: fmov d0, x8
-; CHECK-COMMON-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: fadd d0, d1, d0
; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
@@ -110,14 +110,16 @@ define double @locally_streaming_caller_normal_callee(double %x) nounwind noinli
; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: str d0, [sp, #24] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
+; CHECK-COMMON-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: str d0, [sp, #24] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
; CHECK-COMMON-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: bl normal_callee
; CHECK-COMMON-NEXT: str d0, [sp, #16] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
+; CHECK-COMMON-NEXT: ldr d1, [sp, #16] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000
; CHECK-COMMON-NEXT: fmov d0, x8
-; CHECK-COMMON-NEXT: ldr d1, [sp, #16] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: fadd d0, d1, d0
; CHECK-COMMON-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
@@ -329,9 +331,9 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
-; CHECK-COMMON-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK-COMMON-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
-; CHECK-COMMON-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
; CHECK-COMMON-NEXT: bl __addtf3
; CHECK-COMMON-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
@@ -390,9 +392,9 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
-; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: stp s1, s0, [sp, #8] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
-; CHECK-COMMON-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp s1, s0, [sp, #8] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: bl fmodf
; CHECK-COMMON-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
@@ -420,7 +422,9 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati
; CHECK-COMMON-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: bl __arm_sme_state
+; CHECK-COMMON-NEXT: ldp s2, s0, [sp, #8] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: and x19, x0, #0x1
+; CHECK-COMMON-NEXT: stp s2, s0, [sp, #8] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: tbz w19, #0, .LBB12_2
; CHECK-COMMON-NEXT: // %bb.1:
; CHECK-COMMON-NEXT: smstop sm
diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
new file mode 100644
index 00000000000000..60c676274ec5aa
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
@@ -0,0 +1,1551 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-unknown-eabi-elf"
+
+; This test verifies that call arguments and results are not coalesced
+; with SVE vector registers by the coalescer, such that no 'mul vl'
+; ldr/str pairs are generated in the streaming-mode-changing call
+; sequence.
+
+;
+; Scalar arguments
+;
+
+define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl use_i8
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: st1b { z0.b }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 16 x i8> poison, i8 %arg, i32 0
+ call void @use_i8(i8 %arg)
+ store <vscale x 16 x i8> %vec, ptr %ptr
+ ret void
+}
+
+define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl use_i16
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: st1h { z0.h }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i16> poison, i16 %arg, i32 0
+ call void @use_i16(i16 %arg)
+ store <vscale x 8 x i16> %vec, ptr %ptr
+ ret void
+}
+
+define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl use_i32
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: st1w { z0.s }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 4 x i32> poison, i32 %arg, i32 0
+ call void @use_i32(i32 %arg)
+ store <vscale x 4 x i32> %vec, ptr %ptr
+ ret void
+}
+
+define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl use_i64
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: st1d { z0.d }, ...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
@@ -8063,7 +8079,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, | |||
// return. | |||
SDValue Result = LowerCallResult(Chain, InGlue, CallConv, IsVarArg, RVLocs, | |||
DL, DAG, InVals, IsThisReturn, | |||
IsThisReturn ? OutVals[0] : SDValue()); | |||
IsThisReturn ? OutVals[0] : SDValue(), | |||
RequiresSMChange.has_value()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
RequiresSMChange being an optional<bool>
is unclear to me what it means, and why having any value at all is what you want for the value of LowerCallResult()'s bool RequiresSMChange
parameter.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That interface is indeed awfully confusing! I've refactored it here: #78703
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
… smstart/smstop. This patch introduces a 'COALESCER_BARRIER' which is a pseudo node that expands to a 'nop', but which stops the register allocator from coalescing a COPY node when its use/def crosses a SMSTART or SMSTOP instruction. For example: %0:fpr64 = COPY killed $d0 undef %2.dsub:zpr = COPY %0 // <- Do not coalesce this COPY ADJCALLSTACKDOWN 0, 0 MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $d0 $d0 = COPY killed %0 BL @use_f64, csr_aarch64_aapcs If the COPY would be coalesced, that would lead to: $d0 = COPY killed %0 being replaced by: $d0 = COPY killed %2.dsub which means the whole ZPR reg would be live upto the call, causing the MSRpstatesvcrImm1 (smstop) to spill/reload the ZPR register: str q0, [sp] // 16-byte Folded Spill smstop sm ldr z0, [sp] // 16-byte Folded Reload bl use_f64 which would be incorrect for two reasons: 1. The program may load more data than it has allocated. 2. If there are other SVE objects on the stack, the compiler might use the 'mul vl' addressing modes to access the spill location. By disabling the coalescing, we get the desired results: str d0, [sp, llvm#8] // 8-byte Folded Spill smstop sm ldr d0, [sp, llvm#8] // 8-byte Folded Reload bl use_f64
089b389
to
29a0b5f
Compare
@@ -189,6 +191,26 @@ def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val), | |||
(MSR 0xde85, GPR64:$val)>; | |||
def : Pat<(i64 (int_aarch64_sme_get_tpidr2)), | |||
(MRS 0xde85)>; | |||
|
|||
multiclass CoalescerBarrierPseudo<RegisterClass rc, list<ValueType> vts> { | |||
def NAME : Pseudo<(outs rc:$dst), (ins rc:$idx), []>, Sched<[]> { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Is $src
a better name instead of $idx
, especially since you have the constraint $dst = $idx
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree that's a better name.
defm _FPR16 : CoalescerBarrierPseudo<FPR16, [f16]>; | ||
defm _FPR32 : CoalescerBarrierPseudo<FPR32, [f32]>; | ||
defm _FPR64 : CoalescerBarrierPseudo<FPR64, [f64, v8i8, v4i16, v2i32, v1i64, v4f16, v2f32, v1f64, v4bf16]>; | ||
defm _FPR128 : CoalescerBarrierPseudo<FPR128, [f128, v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64, v8bf16]>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need to worry about v1i128 - or is that not even legal?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Adding v1i128 leads to TableGen issues that say Type set is empty for each HW mode
, so i don't think we need to worry about it.
} | ||
|
||
multiclass CoalescerBarriers { | ||
defm _FPR16 : CoalescerBarrierPseudo<FPR16, [f16]>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What about bf16?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good catch!
@@ -249,11 +249,15 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta | |||
; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill | |||
; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill | |||
; CHECK-NEXT: smstart sm | |||
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess we might be able to improve these over time, but for now it's important to have functional correctness.
multiclass CoalescerBarriers { | ||
defm _FPR16 : CoalescerBarrierPseudo<FPR16, [f16]>; | ||
defm _FPR32 : CoalescerBarrierPseudo<FPR32, [f32]>; | ||
defm _FPR64 : CoalescerBarrierPseudo<FPR64, [f64, v8i8, v4i16, v2i32, v1i64, v4f16, v2f32, v1f64, v4bf16]>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need to worry about unusual types such as v8i7? I assume these get promoted to v8i8 anyway, but I wasn't sure if we should perhaps have at least one test for them. Similarly, I wonder what happens with v8i1?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As you say, we should only have legal types at this point. I've added a test for v8i1, but I'm not sure how to add a test for v8i7, as there will be no legalisation for <vscale x 8 x i7>
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM! What a marvellous patch. :)
… smstart/smstop. (llvm#78294) This patch introduces a 'COALESCER_BARRIER' which is a pseudo node that expands to a 'nop', but which stops the register allocator from coalescing a COPY node when its use/def crosses a SMSTART or SMSTOP instruction. For example: %0:fpr64 = COPY killed $d0 undef %2.dsub:zpr = COPY %0 // <- Do not coalesce this COPY ADJCALLSTACKDOWN 0, 0 MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $d0 $d0 = COPY killed %0 BL @use_f64, csr_aarch64_aapcs If the COPY would be coalesced, that would lead to: $d0 = COPY killed %0 being replaced by: $d0 = COPY killed %2.dsub which means the whole ZPR reg would be live upto the call, causing the MSRpstatesvcrImm1 (smstop) to spill/reload the ZPR register: str q0, [sp] // 16-byte Folded Spill smstop sm ldr z0, [sp] // 16-byte Folded Reload bl use_f64 which would be incorrect for two reasons: 1. The program may load more data than it has allocated. 2. If there are other SVE objects on the stack, the compiler might use the 'mul vl' addressing modes to access the spill location. By disabling the coalescing, we get the desired results: str d0, [sp, #8] // 8-byte Folded Spill smstop sm ldr d0, [sp, #8] // 8-byte Folded Reload bl use_f64 (cherry picked from commit dd73666)
… smstart/smstop. (llvm#78294) This patch introduces a 'COALESCER_BARRIER' which is a pseudo node that expands to a 'nop', but which stops the register allocator from coalescing a COPY node when its use/def crosses a SMSTART or SMSTOP instruction. For example: %0:fpr64 = COPY killed $d0 undef %2.dsub:zpr = COPY %0 // <- Do not coalesce this COPY ADJCALLSTACKDOWN 0, 0 MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $d0 $d0 = COPY killed %0 BL @use_f64, csr_aarch64_aapcs If the COPY would be coalesced, that would lead to: $d0 = COPY killed %0 being replaced by: $d0 = COPY killed %2.dsub which means the whole ZPR reg would be live upto the call, causing the MSRpstatesvcrImm1 (smstop) to spill/reload the ZPR register: str q0, [sp] // 16-byte Folded Spill smstop sm ldr z0, [sp] // 16-byte Folded Reload bl use_f64 which would be incorrect for two reasons: 1. The program may load more data than it has allocated. 2. If there are other SVE objects on the stack, the compiler might use the 'mul vl' addressing modes to access the spill location. By disabling the coalescing, we get the desired results: str d0, [sp, #8] // 8-byte Folded Spill smstop sm ldr d0, [sp, #8] // 8-byte Folded Reload bl use_f64 (cherry picked from commit dd73666)
… smstart/smstop. (llvm#78294) This patch introduces a 'COALESCER_BARRIER' which is a pseudo node that expands to a 'nop', but which stops the register allocator from coalescing a COPY node when its use/def crosses a SMSTART or SMSTOP instruction. For example: %0:fpr64 = COPY killed $d0 undef %2.dsub:zpr = COPY %0 // <- Do not coalesce this COPY ADJCALLSTACKDOWN 0, 0 MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $d0 $d0 = COPY killed %0 BL @use_f64, csr_aarch64_aapcs If the COPY would be coalesced, that would lead to: $d0 = COPY killed %0 being replaced by: $d0 = COPY killed %2.dsub which means the whole ZPR reg would be live upto the call, causing the MSRpstatesvcrImm1 (smstop) to spill/reload the ZPR register: str q0, [sp] // 16-byte Folded Spill smstop sm ldr z0, [sp] // 16-byte Folded Reload bl use_f64 which would be incorrect for two reasons: 1. The program may load more data than it has allocated. 2. If there are other SVE objects on the stack, the compiler might use the 'mul vl' addressing modes to access the spill location. By disabling the coalescing, we get the desired results: str d0, [sp, #8] // 8-byte Folded Spill smstop sm ldr d0, [sp, #8] // 8-byte Folded Reload bl use_f64 (cherry picked from commit dd73666)
… smstart/smstop. (llvm#78294) This patch introduces a 'COALESCER_BARRIER' which is a pseudo node that expands to a 'nop', but which stops the register allocator from coalescing a COPY node when its use/def crosses a SMSTART or SMSTOP instruction. For example: %0:fpr64 = COPY killed $d0 undef %2.dsub:zpr = COPY %0 // <- Do not coalesce this COPY ADJCALLSTACKDOWN 0, 0 MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $d0 $d0 = COPY killed %0 BL @use_f64, csr_aarch64_aapcs If the COPY would be coalesced, that would lead to: $d0 = COPY killed %0 being replaced by: $d0 = COPY killed %2.dsub which means the whole ZPR reg would be live upto the call, causing the MSRpstatesvcrImm1 (smstop) to spill/reload the ZPR register: str q0, [sp] // 16-byte Folded Spill smstop sm ldr z0, [sp] // 16-byte Folded Reload bl use_f64 which would be incorrect for two reasons: 1. The program may load more data than it has allocated. 2. If there are other SVE objects on the stack, the compiler might use the 'mul vl' addressing modes to access the spill location. By disabling the coalescing, we get the desired results: str d0, [sp, #8] // 8-byte Folded Spill smstop sm ldr d0, [sp, #8] // 8-byte Folded Reload bl use_f64 (cherry picked from commit dd73666)
… smstart/smstop. (llvm#78294) This patch introduces a 'COALESCER_BARRIER' which is a pseudo node that expands to a 'nop', but which stops the register allocator from coalescing a COPY node when its use/def crosses a SMSTART or SMSTOP instruction. For example: %0:fpr64 = COPY killed $d0 undef %2.dsub:zpr = COPY %0 // <- Do not coalesce this COPY ADJCALLSTACKDOWN 0, 0 MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $d0 $d0 = COPY killed %0 BL @use_f64, csr_aarch64_aapcs If the COPY would be coalesced, that would lead to: $d0 = COPY killed %0 being replaced by: $d0 = COPY killed %2.dsub which means the whole ZPR reg would be live upto the call, causing the MSRpstatesvcrImm1 (smstop) to spill/reload the ZPR register: str q0, [sp] // 16-byte Folded Spill smstop sm ldr z0, [sp] // 16-byte Folded Reload bl use_f64 which would be incorrect for two reasons: 1. The program may load more data than it has allocated. 2. If there are other SVE objects on the stack, the compiler might use the 'mul vl' addressing modes to access the spill location. By disabling the coalescing, we get the desired results: str d0, [sp, #8] // 8-byte Folded Spill smstop sm ldr d0, [sp, #8] // 8-byte Folded Reload bl use_f64 (cherry picked from commit dd73666)
… smstart/smstop. (llvm#78294) This patch introduces a 'COALESCER_BARRIER' which is a pseudo node that expands to a 'nop', but which stops the register allocator from coalescing a COPY node when its use/def crosses a SMSTART or SMSTOP instruction. For example: %0:fpr64 = COPY killed $d0 undef %2.dsub:zpr = COPY %0 // <- Do not coalesce this COPY ADJCALLSTACKDOWN 0, 0 MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $d0 $d0 = COPY killed %0 BL @use_f64, csr_aarch64_aapcs If the COPY would be coalesced, that would lead to: $d0 = COPY killed %0 being replaced by: $d0 = COPY killed %2.dsub which means the whole ZPR reg would be live upto the call, causing the MSRpstatesvcrImm1 (smstop) to spill/reload the ZPR register: str q0, [sp] // 16-byte Folded Spill smstop sm ldr z0, [sp] // 16-byte Folded Reload bl use_f64 which would be incorrect for two reasons: 1. The program may load more data than it has allocated. 2. If there are other SVE objects on the stack, the compiler might use the 'mul vl' addressing modes to access the spill location. By disabling the coalescing, we get the desired results: str d0, [sp, #8] // 8-byte Folded Spill smstop sm ldr d0, [sp, #8] // 8-byte Folded Reload bl use_f64 (cherry picked from commit dd73666)
This patch introduces a 'COALESCER_BARRIER' which is a pseudo node that expands to
a 'nop', but which stops the register allocator from coalescing a COPY node when
its use/def crosses a SMSTART or SMSTOP instruction.
For example:
If the COPY would be coalesced, that would lead to:
being replaced by:
which means the whole ZPR reg would be live upto the call, causing the
MSRpstatesvcrImm1 (smstop) to spill/reload the ZPR register:
which would be incorrect for two reasons:
'mul vl' addressing modes to access the spill location.
By disabling the coalescing, we get the desired results: