Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 274
1 file changed, 153 insertions(+), 121 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 903e726c667d..0f89df144486 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -11,7 +11,7 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetMachine.h"
@@ -26,13 +26,17 @@ static cl::opt<bool> EnableSpillVGPRToAGPR(
cl::ReallyHidden,
cl::init(true));
-// Find a register matching \p RC from \p LiveRegs which is unused and available
-// throughout the function. On failure, returns AMDGPU::NoRegister.
+// Find a register matching \p RC from \p LiveUnits which is unused and
+// available throughout the function. On failure, returns AMDGPU::NoRegister.
+// TODO: Rewrite the loop here to iterate over MCRegUnits instead of
+// MCRegisters. This should reduce the number of iterations and avoid redundant
+// checking.
static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
- const LivePhysRegs &LiveRegs,
+ const LiveRegUnits &LiveUnits,
const TargetRegisterClass &RC) {
for (MCRegister Reg : RC) {
- if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
+ if (!MRI.isPhysRegUsed(Reg) && LiveUnits.available(Reg) &&
+ !MRI.isReserved(Reg))
return Reg;
}
return MCRegister();
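Editor's note (not part of the diff): the extra isReserved() check exists because LivePhysRegs::available(MRI, Reg) rejected reserved registers itself, while LiveRegUnits::available(Reg) only tests register units. A minimal sketch of the rebuilt test; the helper name isUsableScratch is hypothetical:

#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;

static bool isUsableScratch(const MachineRegisterInfo &MRI,
                            const LiveRegUnits &LiveUnits, MCRegister Reg) {
  return !MRI.isPhysRegUsed(Reg) &&  // never touched in the function
         LiveUnits.available(Reg) && // no register unit of Reg is live
         !MRI.isReserved(Reg); // the old LivePhysRegs API checked this itself
}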
@@ -42,22 +46,21 @@ static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
// when this is called from emitPrologue.
-static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
- LivePhysRegs &LiveRegs,
- const TargetRegisterClass &RC,
- bool Unused = false) {
+static MCRegister findScratchNonCalleeSaveRegister(
+ MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
+ const TargetRegisterClass &RC, bool Unused = false) {
// Mark callee saved registers as used so we will not choose them.
const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
for (unsigned i = 0; CSRegs[i]; ++i)
- LiveRegs.addReg(CSRegs[i]);
+ LiveUnits.addReg(CSRegs[i]);
// We are looking for a register that can be used throughout the entire
// function, so any use is unacceptable.
if (Unused)
- return findUnusedRegister(MRI, LiveRegs, RC);
+ return findUnusedRegister(MRI, LiveUnits, RC);
for (MCRegister Reg : RC) {
- if (LiveRegs.available(MRI, Reg))
+ if (LiveUnits.available(Reg) && !MRI.isReserved(Reg))
return Reg;
}
@@ -65,9 +68,9 @@ static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
}
/// Query target location for spilling SGPRs
-/// \p IncludeScratchCopy : Also look for free scratch SGPRs
+/// \p IncludeScratchCopy : Also look for free scratch SGPRs
static void getVGPRSpillLaneOrTempRegister(
- MachineFunction &MF, LivePhysRegs &LiveRegs, Register SGPR,
+ MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
bool IncludeScratchCopy = true) {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -81,11 +84,11 @@ static void getVGPRSpillLaneOrTempRegister(
// We need to save and restore the given SGPR.
Register ScratchSGPR;
- // 1: Try to save the given register into an unused scratch SGPR. The LiveRegs
- // should have all the callee saved registers marked as used. For certain
- // cases we skip copy to scratch SGPR.
+ // 1: Try to save the given register into an unused scratch SGPR. The
+ // LiveUnits should have all the callee saved registers marked as used. For
+ // certain cases we skip copy to scratch SGPR.
if (IncludeScratchCopy)
- ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC);
+ ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC);
if (!ScratchSGPR) {
int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
@@ -99,10 +102,10 @@ static void getVGPRSpillLaneOrTempRegister(
SGPR, PrologEpilogSGPRSaveRestoreInfo(
SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));
- LLVM_DEBUG(
- auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front();
- dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
- << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
+ LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
+ dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
+ << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
+ << '\n';);
} else {
// Remove dead <FI> index
MF.getFrameInfo().RemoveStackObject(FI);
@@ -118,7 +121,7 @@ static void getVGPRSpillLaneOrTempRegister(
MFI->addToPrologEpilogSGPRSpills(
SGPR, PrologEpilogSGPRSaveRestoreInfo(
SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
- LiveRegs.addReg(ScratchSGPR);
+ LiveUnits.addReg(ScratchSGPR);
LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
<< printReg(ScratchSGPR, TRI) << '\n');
}
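Editor's note (not part of the diff): getVGPRSpillLaneOrTempRegister tries save locations in decreasing order of preference. A condensed sketch, assuming the SGPRSaveKind enum from SIMachineFunctionInfo.h; the helper chooseSaveKind is hypothetical:

#include "SIMachineFunctionInfo.h"
using namespace llvm;

static SGPRSaveKind chooseSaveKind(bool HasFreeScratchSGPR,
                                   bool HasFreeVGPRLane) {
  if (HasFreeScratchSGPR)
    return SGPRSaveKind::COPY_TO_SCRATCH_SGPR; // cheapest: a plain SGPR copy
  if (HasFreeVGPRLane)
    return SGPRSaveKind::SPILL_TO_VGPR_LANE; // one 32-bit lane of a VGPR
  return SGPRSaveKind::SPILL_TO_FRAME_INDEX; // last resort: a real stack slot
}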
@@ -129,7 +132,7 @@ static void getVGPRSpillLaneOrTempRegister(
// use.
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
const SIMachineFunctionInfo &FuncInfo,
- LivePhysRegs &LiveRegs, MachineFunction &MF,
+ LiveRegUnits &LiveUnits, MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
Register SpillReg, int FI, Register FrameReg,
@@ -142,18 +145,18 @@ static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
MachineMemOperand *MMO = MF.getMachineMemOperand(
PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
FrameInfo.getObjectAlign(FI));
- LiveRegs.addReg(SpillReg);
+ LiveUnits.addReg(SpillReg);
bool IsKill = !MBB.isLiveIn(SpillReg);
TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
- DwordOff, MMO, nullptr, &LiveRegs);
+ DwordOff, MMO, nullptr, &LiveUnits);
if (IsKill)
- LiveRegs.removeReg(SpillReg);
+ LiveUnits.removeReg(SpillReg);
}
static void buildEpilogRestore(const GCNSubtarget &ST,
const SIRegisterInfo &TRI,
const SIMachineFunctionInfo &FuncInfo,
- LivePhysRegs &LiveRegs, MachineFunction &MF,
+ LiveRegUnits &LiveUnits, MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, Register SpillReg, int FI,
@@ -167,7 +170,7 @@ static void buildEpilogRestore(const GCNSubtarget &ST,
PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
FrameInfo.getObjectAlign(FI));
TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
- DwordOff, MMO, nullptr, &LiveRegs);
+ DwordOff, MMO, nullptr, &LiveUnits);
}
static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
@@ -195,18 +198,18 @@ static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
.addReg(GitPtrLo);
}
-static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI,
- const SIMachineFunctionInfo *FuncInfo,
- MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, bool IsProlog) {
- if (LiveRegs.empty()) {
- LiveRegs.init(TRI);
+static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
+ const SIMachineFunctionInfo *FuncInfo,
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, bool IsProlog) {
+ if (LiveUnits.empty()) {
+ LiveUnits.init(TRI);
if (IsProlog) {
- LiveRegs.addLiveIns(MBB);
+ LiveUnits.addLiveIns(MBB);
} else {
// In epilog.
- LiveRegs.addLiveOuts(MBB);
- LiveRegs.stepBackward(*MBBI);
+ LiveUnits.addLiveOuts(MBB);
+ LiveUnits.stepBackward(*MBBI);
}
}
}
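Editor's note (not part of the diff): initLiveUnits seeds liveness at a block boundary and, in the epilog case, steps backward across the insertion point. The general pattern behind it, as a sketch that keeps stepping instruction by instruction when the point of interest sits deeper in the block:

#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
using namespace llvm;

// Compute the register units live immediately before instruction I in MBB.
static LiveRegUnits liveUnitsBefore(const TargetRegisterInfo &TRI,
                                    MachineBasicBlock &MBB,
                                    const MachineInstr &I) {
  LiveRegUnits Units(TRI);
  Units.addLiveOuts(MBB); // seed with everything live at the block end
  for (MachineInstr &MI : llvm::reverse(MBB)) {
    Units.stepBackward(MI); // state now describes liveness just before MI
    if (&MI == &I)
      break;
  }
  return Units;
}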
@@ -228,7 +231,7 @@ class PrologEpilogSGPRSpillBuilder {
const SIRegisterInfo &TRI;
Register SuperReg;
const PrologEpilogSGPRSaveRestoreInfo SI;
- LivePhysRegs &LiveRegs;
+ LiveRegUnits &LiveUnits;
const DebugLoc &DL;
Register FrameReg;
ArrayRef<int16_t> SplitParts;
@@ -239,10 +242,10 @@ class PrologEpilogSGPRSpillBuilder {
MachineRegisterInfo &MRI = MF.getRegInfo();
assert(!MFI.isDeadObjectIndex(FI));
- initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);
+ initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+ MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
if (!TmpVGPR)
report_fatal_error("failed to find free scratch register");
@@ -253,7 +256,7 @@ class PrologEpilogSGPRSpillBuilder {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
.addReg(SubReg);
- buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR,
+ buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR,
FI, FrameReg, DwordOff);
DwordOff += 4;
}
@@ -264,14 +267,15 @@ class PrologEpilogSGPRSpillBuilder {
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
ArrayRef<SIRegisterInfo::SpilledReg> Spill =
- FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
+ FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
assert(Spill.size() == NumSubRegs);
for (unsigned I = 0; I < NumSubRegs; ++I) {
Register SubReg = NumSubRegs == 1
? SuperReg
: Register(TRI.getSubReg(SuperReg, SplitParts[I]));
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[I].VGPR)
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR),
+ Spill[I].VGPR)
.addReg(SubReg)
.addImm(Spill[I].Lane)
.addReg(Spill[I].VGPR, RegState::Undef);
@@ -287,9 +291,9 @@ class PrologEpilogSGPRSpillBuilder {
void restoreFromMemory(const int FI) {
MachineRegisterInfo &MRI = MF.getRegInfo();
- initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
+ initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+ MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
if (!TmpVGPR)
report_fatal_error("failed to find free scratch register");
@@ -298,8 +302,8 @@ class PrologEpilogSGPRSpillBuilder {
? SuperReg
: Register(TRI.getSubReg(SuperReg, SplitParts[I]));
- buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR,
- FI, FrameReg, DwordOff);
+ buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
+ TmpVGPR, FI, FrameReg, DwordOff);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
.addReg(TmpVGPR, RegState::Kill);
DwordOff += 4;
@@ -309,14 +313,14 @@ class PrologEpilogSGPRSpillBuilder {
void restoreFromVGPRLane(const int FI) {
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
ArrayRef<SIRegisterInfo::SpilledReg> Spill =
- FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
+ FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
assert(Spill.size() == NumSubRegs);
for (unsigned I = 0; I < NumSubRegs; ++I) {
Register SubReg = NumSubRegs == 1
? SuperReg
: Register(TRI.getSubReg(SuperReg, SplitParts[I]));
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
.addReg(Spill[I].VGPR)
.addImm(Spill[I].Lane);
}
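Editor's note (not part of the diff): SI_SPILL_S32_TO_VGPR and SI_RESTORE_S32_FROM_VGPR are spill-only pseudos; to the best of my reading they simply take over the lane instructions' encodings after register allocation, which lets determineCalleeSaves (later in this diff) recognize prolog/epilog SGPR spills without matching ordinary user writelane/readlane code. Sketch of the assumed expansion, operands in the same order as the builders above:

//   SI_SPILL_S32_TO_VGPR     $vgpr, $sgpr, lane, $vgpr(tied, undef)
//     -> V_WRITELANE_B32     $vgpr, $sgpr, lane
//   SI_RESTORE_S32_FROM_VGPR $sgpr, $vgpr, lane
//     -> V_READLANE_B32      $sgpr, $vgpr, lane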
@@ -335,11 +339,12 @@ public:
MachineBasicBlock::iterator MI,
const DebugLoc &DL, const SIInstrInfo *TII,
const SIRegisterInfo &TRI,
- LivePhysRegs &LiveRegs, Register FrameReg)
+ LiveRegUnits &LiveUnits, Register FrameReg)
: MI(MI), MBB(MBB), MF(*MBB.getParent()),
ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
- SuperReg(Reg), SI(SI), LiveRegs(LiveRegs), DL(DL), FrameReg(FrameReg) {
+ SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
+ FrameReg(FrameReg) {
const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
SplitParts = TRI.getRegSplitParts(RC, EltSize);
NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
@@ -396,9 +401,9 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
if (ST.isAmdPalOS()) {
// Extract the scratch offset from the descriptor in the GIT
- LivePhysRegs LiveRegs;
- LiveRegs.init(*TRI);
- LiveRegs.addLiveIns(MBB);
+ LiveRegUnits LiveUnits;
+ LiveUnits.init(*TRI);
+ LiveUnits.addLiveIns(MBB);
// Find unused reg to load flat scratch init into
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -409,8 +414,8 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
for (MCPhysReg Reg : AllSGPR64s) {
- if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) &&
- !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
+ if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
+ MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
FlatScrInit = Reg;
break;
}
@@ -692,7 +697,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
}
bool NeedsFlatScratchInit =
- MFI->hasFlatScratchInit() &&
+ MFI->getUserSGPRInfo().hasFlatScratchInit() &&
(MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
(!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
@@ -775,7 +780,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
// Use relocations to get the pointer, and setup the other bits manually.
uint64_t Rsrc23 = TII->getScratchRsrcWords23();
- if (MFI->hasImplicitBufferPtr()) {
+ if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) {
Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
@@ -814,7 +819,6 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
BuildMI(MBB, I, DL, SMovB32, Rsrc1)
.addExternalSymbol("SCRATCH_RSRC_DWORD1")
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
}
BuildMI(MBB, I, DL, SMovB32, Rsrc2)
@@ -873,7 +877,7 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
// Activate only the inactive lanes when \p EnableInactiveLanes is true.
// Otherwise, activate all lanes. It returns the saved exec.
-static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
+static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -886,14 +890,14 @@ static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
const SIRegisterInfo &TRI = TII->getRegisterInfo();
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
+ initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
ScratchExecCopy = findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, *TRI.getWaveMaskRegClass());
+ MRI, LiveUnits, *TRI.getWaveMaskRegClass());
if (!ScratchExecCopy)
report_fatal_error("failed to find free scratch register");
- LiveRegs.addReg(ScratchExecCopy);
+ LiveUnits.addReg(ScratchExecCopy);
const unsigned SaveExecOpc =
ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
@@ -909,7 +913,7 @@ static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
void SIFrameLowering::emitCSRSpillStores(
MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs,
+ MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
Register FrameReg, Register FramePtrRegScratchCopy) const {
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -924,7 +928,7 @@ void SIFrameLowering::emitCSRSpillStores(
FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
if (!WWMScratchRegs.empty())
ScratchExecCopy =
- buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
+ buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
/*IsProlog*/ true, /*EnableInactiveLanes*/ true);
auto StoreWWMRegisters =
@@ -932,7 +936,7 @@ void SIFrameLowering::emitCSRSpillStores(
for (const auto &Reg : WWMRegs) {
Register VGPR = Reg.first;
int FI = Reg.second;
- buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
+ buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
VGPR, FI, FrameReg);
}
};
@@ -943,7 +947,7 @@ void SIFrameLowering::emitCSRSpillStores(
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
} else {
- ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
+ ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
/*IsProlog*/ true,
/*EnableInactiveLanes*/ false);
}
@@ -955,7 +959,7 @@ void SIFrameLowering::emitCSRSpillStores(
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
.addReg(ScratchExecCopy, RegState::Kill);
- LiveRegs.addReg(ScratchExecCopy);
+ LiveUnits.addReg(ScratchExecCopy);
}
Register FramePtrReg = FuncInfo->getFrameOffsetReg();
@@ -971,7 +975,7 @@ void SIFrameLowering::emitCSRSpillStores(
continue;
PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
- LiveRegs, FrameReg);
+ LiveUnits, FrameReg);
SB.save();
}
@@ -986,16 +990,16 @@ void SIFrameLowering::emitCSRSpillStores(
MBB.sortUniqueLiveIns();
}
- if (!LiveRegs.empty()) {
+ if (!LiveUnits.empty()) {
for (MCPhysReg Reg : ScratchSGPRs)
- LiveRegs.addReg(Reg);
+ LiveUnits.addReg(Reg);
}
}
}
void SIFrameLowering::emitCSRSpillRestores(
MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs,
+ MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
Register FrameReg, Register FramePtrRegScratchCopy) const {
const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -1015,7 +1019,7 @@ void SIFrameLowering::emitCSRSpillRestores(
continue;
PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
- LiveRegs, FrameReg);
+ LiveUnits, FrameReg);
SB.restore();
}
@@ -1027,7 +1031,7 @@ void SIFrameLowering::emitCSRSpillRestores(
FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
if (!WWMScratchRegs.empty())
ScratchExecCopy =
- buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
+ buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
/*IsProlog*/ false, /*EnableInactiveLanes*/ true);
auto RestoreWWMRegisters =
@@ -1035,7 +1039,7 @@ void SIFrameLowering::emitCSRSpillRestores(
for (const auto &Reg : WWMRegs) {
Register VGPR = Reg.first;
int FI = Reg.second;
- buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
+ buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
VGPR, FI, FrameReg);
}
};
@@ -1046,7 +1050,7 @@ void SIFrameLowering::emitCSRSpillRestores(
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
} else {
- ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
+ ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
/*IsProlog*/ false,
/*EnableInactiveLanes*/ false);
}
@@ -1079,13 +1083,25 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
Register FramePtrReg = FuncInfo->getFrameOffsetReg();
Register BasePtrReg =
TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
- LivePhysRegs LiveRegs;
+ LiveRegUnits LiveUnits;
MachineBasicBlock::iterator MBBI = MBB.begin();
// DebugLoc must be unknown since the first instruction with DebugLoc is used
// to determine the end of the prologue.
DebugLoc DL;
+ if (FuncInfo->isChainFunction()) {
+ // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
+ // are free to set one up if they need it.
+ bool UseSP = requiresStackPointerReference(MF);
+ if (UseSP) {
+ assert(StackPtrReg != AMDGPU::SP_REG);
+
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg)
+ .addImm(MFI.getStackSize() * getScratchScaleFactor(ST));
+ }
+ }
+
bool HasFP = false;
bool HasBP = false;
uint32_t NumBytes = MFI.getStackSize();
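Editor's note (not part of the diff): the multiply scales the frame's byte size into the stack pointer's unit. A sketch of the scaling rule, assuming it matches the static getScratchScaleFactor helper defined earlier in this file:

static unsigned getScratchScaleFactorSketch(const GCNSubtarget &ST) {
  // Flat scratch uses plain byte offsets; MUBUF scratch addressing is
  // swizzled per lane, so sizes are scaled by the wavefront size.
  return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
}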
@@ -1097,14 +1113,15 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
Register FramePtrRegScratchCopy;
if (!HasFP && !hasFP(MF)) {
// Emit the CSR spill stores with SP base register.
- emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg,
+ emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
+ FuncInfo->isChainFunction() ? Register() : StackPtrReg,
FramePtrRegScratchCopy);
} else {
// CSR spill stores will use FP as base register.
Register SGPRForFPSaveRestoreCopy =
FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
- initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
+ initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
if (SGPRForFPSaveRestoreCopy) {
// Copy FP to the scratch register now and emit the CFI entry. It avoids
// the extra FP copy needed in the other two cases when FP is spilled to
@@ -1112,18 +1129,18 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
PrologEpilogSGPRSpillBuilder SB(
FramePtrReg,
FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
- DL, TII, TRI, LiveRegs, FramePtrReg);
+ DL, TII, TRI, LiveUnits, FramePtrReg);
SB.save();
- LiveRegs.addReg(SGPRForFPSaveRestoreCopy);
+ LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
} else {
// Copy FP into a new scratch register so that its previous value can be
// spilled after setting up the new frame.
FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass);
+ MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
if (!FramePtrRegScratchCopy)
report_fatal_error("failed to find free scratch register");
- LiveRegs.addReg(FramePtrRegScratchCopy);
+ LiveUnits.addReg(FramePtrRegScratchCopy);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
.addReg(FramePtrReg);
}
@@ -1133,9 +1150,9 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
const unsigned Alignment = MFI.getMaxAlign().value();
RoundedSize += Alignment;
- if (LiveRegs.empty()) {
- LiveRegs.init(TRI);
- LiveRegs.addLiveIns(MBB);
+ if (LiveUnits.empty()) {
+ LiveUnits.init(TRI);
+ LiveUnits.addLiveIns(MBB);
}
// s_add_i32 s33, s32, NumBytes
@@ -1158,10 +1175,10 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// If FP is used, emit the CSR spills with FP base register.
if (HasFP) {
- emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg,
+ emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
FramePtrRegScratchCopy);
if (FramePtrRegScratchCopy)
- LiveRegs.removeReg(FramePtrRegScratchCopy);
+ LiveUnits.removeReg(FramePtrRegScratchCopy);
}
// If we need a base pointer, set it up here. It's whatever the value of
@@ -1210,7 +1227,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
- LivePhysRegs LiveRegs;
+ LiveRegUnits LiveUnits;
// Get the insert location for the epilogue. If there were no terminators in
// the block, get the last instruction.
MachineBasicBlock::iterator MBBI = MBB.end();
@@ -1240,19 +1257,19 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
// SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP
// into a new scratch register and copy to FP later when other registers are
// restored from the current stack frame.
- initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
+ initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
if (SGPRForFPSaveRestoreCopy) {
- LiveRegs.addReg(SGPRForFPSaveRestoreCopy);
+ LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
} else {
FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass);
+ MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
if (!FramePtrRegScratchCopy)
report_fatal_error("failed to find free scratch register");
- LiveRegs.addReg(FramePtrRegScratchCopy);
+ LiveUnits.addReg(FramePtrRegScratchCopy);
}
- emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg,
+ emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
FramePtrRegScratchCopy);
}
@@ -1275,7 +1292,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
MIB.setMIFlag(MachineInstr::FrameDestroy);
} else {
// Insert the CSR spill restores with SP as the base register.
- emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg,
+ emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg,
FramePtrRegScratchCopy);
}
}
@@ -1318,7 +1335,11 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
// Allocate spill slots for WWM reserved VGPRs.
- if (!FuncInfo->isEntryFunction()) {
+ // For chain functions, we only need to do this if we have calls to
+ // llvm.amdgcn.cs.chain.
+ bool IsChainWithoutCalls =
+ FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
+ if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
for (Register Reg : FuncInfo->getWWMReservedRegs()) {
const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
@@ -1353,8 +1374,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
TRI->isAGPR(MRI, VReg))) {
assert(RS != nullptr);
- // FIXME: change to enterBasicBlockEnd()
- RS->enterBasicBlock(MBB);
+ RS->enterBasicBlockEnd(MBB);
+ RS->backward(std::next(MI.getIterator()));
TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
SpillFIs.set(FI);
continue;
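Editor's note (not part of the diff): RegScavenger::backward(I) rewinds the scavenger until its iterator reaches I, so passing std::next(MI.getIterator()) leaves the tracked state positioned at MI itself, which is what eliminateFrameIndex needs when scavenging with backward liveness. The pattern as a hypothetical helper:

#include "llvm/CodeGen/RegisterScavenging.h"
#include <iterator>
using namespace llvm;

static void positionScavengerAt(RegScavenger &RS, MachineBasicBlock &MBB,
                                MachineInstr &MI) {
  RS.enterBasicBlockEnd(MBB);               // start tracking at the block end
  RS.backward(std::next(MI.getIterator())); // rewind; state is valid at MI
}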
@@ -1472,30 +1493,30 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves(
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
- LivePhysRegs LiveRegs;
- LiveRegs.init(*TRI);
+ LiveRegUnits LiveUnits;
+ LiveUnits.init(*TRI);
// Initially mark callee saved registers as used so we will not choose them
// while looking for scratch SGPRs.
const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
for (unsigned I = 0; CSRegs[I]; ++I)
- LiveRegs.addReg(CSRegs[I]);
+ LiveUnits.addReg(CSRegs[I]);
const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();
if (NeedExecCopyReservedReg) {
Register ReservedReg = MFI->getSGPRForEXECCopy();
assert(ReservedReg && "Should have reserved an SGPR for EXEC copy.");
- Register UnusedScratchReg = findUnusedRegister(MRI, LiveRegs, RC);
+ Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
if (UnusedScratchReg) {
// If found any unused scratch SGPR, reserve the register itself for Exec
// copy and there is no need for any spills in that case.
MFI->setSGPRForEXECCopy(UnusedScratchReg);
- LiveRegs.addReg(UnusedScratchReg);
+ LiveUnits.addReg(UnusedScratchReg);
} else {
// Needs spill.
assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedReg) &&
"Re-reserving spill slot for EXEC copy register");
- getVGPRSpillLaneOrTempRegister(MF, LiveRegs, ReservedReg, RC,
+ getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedReg, RC,
/*IncludeScratchCopy=*/false);
}
}
@@ -1516,14 +1537,14 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves(
Register FramePtrReg = MFI->getFrameOffsetReg();
assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
"Re-reserving spill slot for FP");
- getVGPRSpillLaneOrTempRegister(MF, LiveRegs, FramePtrReg);
+ getVGPRSpillLaneOrTempRegister(MF, LiveUnits, FramePtrReg);
}
if (TRI->hasBasePointer(MF)) {
Register BasePtrReg = TRI->getBaseRegister();
assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
"Re-reserving spill slot for BP");
- getVGPRSpillLaneOrTempRegister(MF, LiveRegs, BasePtrReg);
+ getVGPRSpillLaneOrTempRegister(MF, LiveUnits, BasePtrReg);
}
}
@@ -1531,8 +1552,15 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves(
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedVGPRs,
RegScavenger *RS) const {
- TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ // If this is a function with the amdgpu_cs_chain[_preserve] calling
+ // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
+ // we don't need to save and restore anything.
+ if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
+ return;
+
+ TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
if (MFI->isEntryFunction())
return;
@@ -1551,17 +1579,17 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
// TODO: Handle this elsewhere at an early point. Walking through all MBBs
// here would be a bad heuristic. A better way should be by calling
// allocateWWMSpill during the regalloc pipeline whenever a physical
- // register is allocated for the intended virtual registers. That will
- // also help excluding the general use of WRITELANE/READLANE intrinsics
- // that won't really need any such special handling.
- if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32)
+ // register is allocated for the intended virtual registers.
+ if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR)
MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
- else if (MI.getOpcode() == AMDGPU::V_READLANE_B32)
+ else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
else if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
NeedExecCopyReservedReg = true;
else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
- MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
+ MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
+ (MFI->isChainFunction() &&
+ TII->isChainCallOpcode(MI.getOpcode()))) {
// We expect all return to be the same size.
assert(!ReturnMI ||
(count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
@@ -1695,6 +1723,7 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
uint64_t EstStackSize = MFI.estimateStackSize(MF);
uint64_t MaxOffset = EstStackSize - 1;
@@ -1706,12 +1735,11 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
// rather than allocating as close to possible. This could save a lot of space
// on frames with alignment requirements.
if (ST.enableFlatScratch()) {
- const SIInstrInfo *TII = ST.getInstrInfo();
if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
SIInstrFlags::FlatScratch))
return false;
} else {
- if (SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset))
+ if (TII->isLegalMUBUFImmOffset(MaxOffset))
return false;
}
@@ -1770,10 +1798,11 @@ static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
- // For entry functions we can use an immediate offset in most cases, so the
- // presence of calls doesn't imply we need a distinct frame pointer.
+ // For entry & chain functions we can use an immediate offset in most cases,
+ // so the presence of calls doesn't imply we need a distinct frame pointer.
if (MFI.hasCalls() &&
- !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
+ !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
+ !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
// All offsets are unsigned, so need to be addressed in the same direction
// as stack growth.
@@ -1793,11 +1822,14 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
// register. We may need to initialize the stack pointer depending on the frame
// properties, which logically overlaps many of the cases where an ordinary
// function would require an FP.
+// Also used for chain functions. While not technically entry functions, chain
+// functions may need to set up a stack pointer in some situations.
bool SIFrameLowering::requiresStackPointerReference(
const MachineFunction &MF) const {
// Callable functions always require a stack pointer reference.
- assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
- "only expected to call this for entry points");
+ assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() ||
+ MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) &&
+ "only expected to call this for entry points and chain functions");
const MachineFrameInfo &MFI = MF.getFrameInfo();