diff options
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 274 |
1 files changed, 153 insertions, 121 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 903e726c667d..0f89df144486 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -11,7 +11,7 @@ #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/Target/TargetMachine.h" @@ -26,13 +26,17 @@ static cl::opt<bool> EnableSpillVGPRToAGPR( cl::ReallyHidden, cl::init(true)); -// Find a register matching \p RC from \p LiveRegs which is unused and available -// throughout the function. On failure, returns AMDGPU::NoRegister. +// Find a register matching \p RC from \p LiveUnits which is unused and +// available throughout the function. On failure, returns AMDGPU::NoRegister. +// TODO: Rewrite the loop here to iterate over MCRegUnits instead of +// MCRegisters. This should reduce the number of iterations and avoid redundant +// checking. static MCRegister findUnusedRegister(MachineRegisterInfo &MRI, - const LivePhysRegs &LiveRegs, + const LiveRegUnits &LiveUnits, const TargetRegisterClass &RC) { for (MCRegister Reg : RC) { - if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg)) + if (!MRI.isPhysRegUsed(Reg) && LiveUnits.available(Reg) && + !MRI.isReserved(Reg)) return Reg; } return MCRegister(); @@ -42,22 +46,21 @@ static MCRegister findUnusedRegister(MachineRegisterInfo &MRI, // callee-save registers since they may appear to be free when this is called // from canUseAsPrologue (during shrink wrapping), but then no longer be free // when this is called from emitPrologue. -static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, - LivePhysRegs &LiveRegs, - const TargetRegisterClass &RC, - bool Unused = false) { +static MCRegister findScratchNonCalleeSaveRegister( + MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits, + const TargetRegisterClass &RC, bool Unused = false) { // Mark callee saved registers as used so we will not choose them. const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); for (unsigned i = 0; CSRegs[i]; ++i) - LiveRegs.addReg(CSRegs[i]); + LiveUnits.addReg(CSRegs[i]); // We are looking for a register that can be used throughout the entire // function, so any use is unacceptable. if (Unused) - return findUnusedRegister(MRI, LiveRegs, RC); + return findUnusedRegister(MRI, LiveUnits, RC); for (MCRegister Reg : RC) { - if (LiveRegs.available(MRI, Reg)) + if (LiveUnits.available(Reg) && !MRI.isReserved(Reg)) return Reg; } @@ -65,9 +68,9 @@ static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, } /// Query target location for spilling SGPRs -/// \p IncludeScratchCopy : Also look for free scratch SGPRs +/// \p IncludeScratchCopy : Also look for free scratch SGPRs static void getVGPRSpillLaneOrTempRegister( - MachineFunction &MF, LivePhysRegs &LiveRegs, Register SGPR, + MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR, const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass, bool IncludeScratchCopy = true) { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -81,11 +84,11 @@ static void getVGPRSpillLaneOrTempRegister( // We need to save and restore the given SGPR. Register ScratchSGPR; - // 1: Try to save the given register into an unused scratch SGPR. The LiveRegs - // should have all the callee saved registers marked as used. For certain - // cases we skip copy to scratch SGPR. + // 1: Try to save the given register into an unused scratch SGPR. The + // LiveUnits should have all the callee saved registers marked as used. For + // certain cases we skip copy to scratch SGPR. if (IncludeScratchCopy) - ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC); + ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC); if (!ScratchSGPR) { int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr, @@ -99,10 +102,10 @@ static void getVGPRSpillLaneOrTempRegister( SGPR, PrologEpilogSGPRSaveRestoreInfo( SGPRSaveKind::SPILL_TO_VGPR_LANE, FI)); - LLVM_DEBUG( - auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front(); - dbgs() << printReg(SGPR, TRI) << " requires fallback spill to " - << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';); + LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front(); + dbgs() << printReg(SGPR, TRI) << " requires fallback spill to " + << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane + << '\n';); } else { // Remove dead <FI> index MF.getFrameInfo().RemoveStackObject(FI); @@ -118,7 +121,7 @@ static void getVGPRSpillLaneOrTempRegister( MFI->addToPrologEpilogSGPRSpills( SGPR, PrologEpilogSGPRSaveRestoreInfo( SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR)); - LiveRegs.addReg(ScratchSGPR); + LiveUnits.addReg(ScratchSGPR); LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to " << printReg(ScratchSGPR, TRI) << '\n'); } @@ -129,7 +132,7 @@ static void getVGPRSpillLaneOrTempRegister( // use. static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &FuncInfo, - LivePhysRegs &LiveRegs, MachineFunction &MF, + LiveRegUnits &LiveUnits, MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SpillReg, int FI, Register FrameReg, @@ -142,18 +145,18 @@ static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI, MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI), FrameInfo.getObjectAlign(FI)); - LiveRegs.addReg(SpillReg); + LiveUnits.addReg(SpillReg); bool IsKill = !MBB.isLiveIn(SpillReg); TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg, - DwordOff, MMO, nullptr, &LiveRegs); + DwordOff, MMO, nullptr, &LiveUnits); if (IsKill) - LiveRegs.removeReg(SpillReg); + LiveUnits.removeReg(SpillReg); } static void buildEpilogRestore(const GCNSubtarget &ST, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &FuncInfo, - LivePhysRegs &LiveRegs, MachineFunction &MF, + LiveRegUnits &LiveUnits, MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SpillReg, int FI, @@ -167,7 +170,7 @@ static void buildEpilogRestore(const GCNSubtarget &ST, PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI), FrameInfo.getObjectAlign(FI)); TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg, - DwordOff, MMO, nullptr, &LiveRegs); + DwordOff, MMO, nullptr, &LiveUnits); } static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, @@ -195,18 +198,18 @@ static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addReg(GitPtrLo); } -static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI, - const SIMachineFunctionInfo *FuncInfo, - MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, bool IsProlog) { - if (LiveRegs.empty()) { - LiveRegs.init(TRI); +static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI, + const SIMachineFunctionInfo *FuncInfo, + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, bool IsProlog) { + if (LiveUnits.empty()) { + LiveUnits.init(TRI); if (IsProlog) { - LiveRegs.addLiveIns(MBB); + LiveUnits.addLiveIns(MBB); } else { // In epilog. - LiveRegs.addLiveOuts(MBB); - LiveRegs.stepBackward(*MBBI); + LiveUnits.addLiveOuts(MBB); + LiveUnits.stepBackward(*MBBI); } } } @@ -228,7 +231,7 @@ class PrologEpilogSGPRSpillBuilder { const SIRegisterInfo &TRI; Register SuperReg; const PrologEpilogSGPRSaveRestoreInfo SI; - LivePhysRegs &LiveRegs; + LiveRegUnits &LiveUnits; const DebugLoc &DL; Register FrameReg; ArrayRef<int16_t> SplitParts; @@ -239,10 +242,10 @@ class PrologEpilogSGPRSpillBuilder { MachineRegisterInfo &MRI = MF.getRegInfo(); assert(!MFI.isDeadObjectIndex(FI)); - initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true); + initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true); MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + MRI, LiveUnits, AMDGPU::VGPR_32RegClass); if (!TmpVGPR) report_fatal_error("failed to find free scratch register"); @@ -253,7 +256,7 @@ class PrologEpilogSGPRSpillBuilder { BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) .addReg(SubReg); - buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR, + buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR, FI, FrameReg, DwordOff); DwordOff += 4; } @@ -264,14 +267,15 @@ class PrologEpilogSGPRSpillBuilder { assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); ArrayRef<SIRegisterInfo::SpilledReg> Spill = - FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); + FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI); assert(Spill.size() == NumSubRegs); for (unsigned I = 0; I < NumSubRegs; ++I) { Register SubReg = NumSubRegs == 1 ? SuperReg : Register(TRI.getSubReg(SuperReg, SplitParts[I])); - BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[I].VGPR) + BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR), + Spill[I].VGPR) .addReg(SubReg) .addImm(Spill[I].Lane) .addReg(Spill[I].VGPR, RegState::Undef); @@ -287,9 +291,9 @@ class PrologEpilogSGPRSpillBuilder { void restoreFromMemory(const int FI) { MachineRegisterInfo &MRI = MF.getRegInfo(); - initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false); + initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false); MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + MRI, LiveUnits, AMDGPU::VGPR_32RegClass); if (!TmpVGPR) report_fatal_error("failed to find free scratch register"); @@ -298,8 +302,8 @@ class PrologEpilogSGPRSpillBuilder { ? SuperReg : Register(TRI.getSubReg(SuperReg, SplitParts[I])); - buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR, - FI, FrameReg, DwordOff); + buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, + TmpVGPR, FI, FrameReg, DwordOff); BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) .addReg(TmpVGPR, RegState::Kill); DwordOff += 4; @@ -309,14 +313,14 @@ class PrologEpilogSGPRSpillBuilder { void restoreFromVGPRLane(const int FI) { assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); ArrayRef<SIRegisterInfo::SpilledReg> Spill = - FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); + FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI); assert(Spill.size() == NumSubRegs); for (unsigned I = 0; I < NumSubRegs; ++I) { Register SubReg = NumSubRegs == 1 ? SuperReg : Register(TRI.getSubReg(SuperReg, SplitParts[I])); - BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg) + BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg) .addReg(Spill[I].VGPR) .addImm(Spill[I].Lane); } @@ -335,11 +339,12 @@ public: MachineBasicBlock::iterator MI, const DebugLoc &DL, const SIInstrInfo *TII, const SIRegisterInfo &TRI, - LivePhysRegs &LiveRegs, Register FrameReg) + LiveRegUnits &LiveUnits, Register FrameReg) : MI(MI), MBB(MBB), MF(*MBB.getParent()), ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()), FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI), - SuperReg(Reg), SI(SI), LiveRegs(LiveRegs), DL(DL), FrameReg(FrameReg) { + SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL), + FrameReg(FrameReg) { const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg); SplitParts = TRI.getRegSplitParts(RC, EltSize); NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size(); @@ -396,9 +401,9 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( if (ST.isAmdPalOS()) { // Extract the scratch offset from the descriptor in the GIT - LivePhysRegs LiveRegs; - LiveRegs.init(*TRI); - LiveRegs.addLiveIns(MBB); + LiveRegUnits LiveUnits; + LiveUnits.init(*TRI); + LiveUnits.addLiveIns(MBB); // Find unused reg to load flat scratch init into MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -409,8 +414,8 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded)); Register GITPtrLoReg = MFI->getGITPtrLoReg(MF); for (MCPhysReg Reg : AllSGPR64s) { - if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) && - !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { + if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) && + MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { FlatScrInit = Reg; break; } @@ -692,7 +697,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, } bool NeedsFlatScratchInit = - MFI->hasFlatScratchInit() && + MFI->getUserSGPRInfo().hasFlatScratchInit() && (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); @@ -775,7 +780,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( // Use relocations to get the pointer, and setup the other bits manually. uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - if (MFI->hasImplicitBufferPtr()) { + if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) { Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { @@ -814,7 +819,6 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( BuildMI(MBB, I, DL, SMovB32, Rsrc1) .addExternalSymbol("SCRATCH_RSRC_DWORD1") .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - } BuildMI(MBB, I, DL, SMovB32, Rsrc2) @@ -873,7 +877,7 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { // Activate only the inactive lanes when \p EnableInactiveLanes is true. // Otherwise, activate all lanes. It returns the saved exec. -static Register buildScratchExecCopy(LivePhysRegs &LiveRegs, +static Register buildScratchExecCopy(LiveRegUnits &LiveUnits, MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -886,14 +890,14 @@ static Register buildScratchExecCopy(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI = TII->getRegisterInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); - initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog); + initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog); ScratchExecCopy = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, *TRI.getWaveMaskRegClass()); + MRI, LiveUnits, *TRI.getWaveMaskRegClass()); if (!ScratchExecCopy) report_fatal_error("failed to find free scratch register"); - LiveRegs.addReg(ScratchExecCopy); + LiveUnits.addReg(ScratchExecCopy); const unsigned SaveExecOpc = ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32 @@ -909,7 +913,7 @@ static Register buildScratchExecCopy(LivePhysRegs &LiveRegs, void SIFrameLowering::emitCSRSpillStores( MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits, Register FrameReg, Register FramePtrRegScratchCopy) const { SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -924,7 +928,7 @@ void SIFrameLowering::emitCSRSpillStores( FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs); if (!WWMScratchRegs.empty()) ScratchExecCopy = - buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, + buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true, /*EnableInactiveLanes*/ true); auto StoreWWMRegisters = @@ -932,7 +936,7 @@ void SIFrameLowering::emitCSRSpillStores( for (const auto &Reg : WWMRegs) { Register VGPR = Reg.first; int FI = Reg.second; - buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, + buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL, VGPR, FI, FrameReg); } }; @@ -943,7 +947,7 @@ void SIFrameLowering::emitCSRSpillStores( unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); } else { - ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, + ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true, /*EnableInactiveLanes*/ false); } @@ -955,7 +959,7 @@ void SIFrameLowering::emitCSRSpillStores( unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec()) .addReg(ScratchExecCopy, RegState::Kill); - LiveRegs.addReg(ScratchExecCopy); + LiveUnits.addReg(ScratchExecCopy); } Register FramePtrReg = FuncInfo->getFrameOffsetReg(); @@ -971,7 +975,7 @@ void SIFrameLowering::emitCSRSpillStores( continue; PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI, - LiveRegs, FrameReg); + LiveUnits, FrameReg); SB.save(); } @@ -986,16 +990,16 @@ void SIFrameLowering::emitCSRSpillStores( MBB.sortUniqueLiveIns(); } - if (!LiveRegs.empty()) { + if (!LiveUnits.empty()) { for (MCPhysReg Reg : ScratchSGPRs) - LiveRegs.addReg(Reg); + LiveUnits.addReg(Reg); } } } void SIFrameLowering::emitCSRSpillRestores( MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits, Register FrameReg, Register FramePtrRegScratchCopy) const { const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -1015,7 +1019,7 @@ void SIFrameLowering::emitCSRSpillRestores( continue; PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI, - LiveRegs, FrameReg); + LiveUnits, FrameReg); SB.restore(); } @@ -1027,7 +1031,7 @@ void SIFrameLowering::emitCSRSpillRestores( FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs); if (!WWMScratchRegs.empty()) ScratchExecCopy = - buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, + buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ false, /*EnableInactiveLanes*/ true); auto RestoreWWMRegisters = @@ -1035,7 +1039,7 @@ void SIFrameLowering::emitCSRSpillRestores( for (const auto &Reg : WWMRegs) { Register VGPR = Reg.first; int FI = Reg.second; - buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL, + buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL, VGPR, FI, FrameReg); } }; @@ -1046,7 +1050,7 @@ void SIFrameLowering::emitCSRSpillRestores( unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); } else { - ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, + ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ false, /*EnableInactiveLanes*/ false); } @@ -1079,13 +1083,25 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, Register FramePtrReg = FuncInfo->getFrameOffsetReg(); Register BasePtrReg = TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); - LivePhysRegs LiveRegs; + LiveRegUnits LiveUnits; MachineBasicBlock::iterator MBBI = MBB.begin(); // DebugLoc must be unknown since the first instruction with DebugLoc is used // to determine the end of the prologue. DebugLoc DL; + if (FuncInfo->isChainFunction()) { + // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but + // are free to set one up if they need it. + bool UseSP = requiresStackPointerReference(MF); + if (UseSP) { + assert(StackPtrReg != AMDGPU::SP_REG); + + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg) + .addImm(MFI.getStackSize() * getScratchScaleFactor(ST)); + } + } + bool HasFP = false; bool HasBP = false; uint32_t NumBytes = MFI.getStackSize(); @@ -1097,14 +1113,15 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, Register FramePtrRegScratchCopy; if (!HasFP && !hasFP(MF)) { // Emit the CSR spill stores with SP base register. - emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg, + emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, + FuncInfo->isChainFunction() ? Register() : StackPtrReg, FramePtrRegScratchCopy); } else { // CSR spill stores will use FP as base register. Register SGPRForFPSaveRestoreCopy = FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg); - initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); + initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true); if (SGPRForFPSaveRestoreCopy) { // Copy FP to the scratch register now and emit the CFI entry. It avoids // the extra FP copy needed in the other two cases when FP is spilled to @@ -1112,18 +1129,18 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, PrologEpilogSGPRSpillBuilder SB( FramePtrReg, FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI, - DL, TII, TRI, LiveRegs, FramePtrReg); + DL, TII, TRI, LiveUnits, FramePtrReg); SB.save(); - LiveRegs.addReg(SGPRForFPSaveRestoreCopy); + LiveUnits.addReg(SGPRForFPSaveRestoreCopy); } else { // Copy FP into a new scratch register so that its previous value can be // spilled after setting up the new frame. FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass); + MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass); if (!FramePtrRegScratchCopy) report_fatal_error("failed to find free scratch register"); - LiveRegs.addReg(FramePtrRegScratchCopy); + LiveUnits.addReg(FramePtrRegScratchCopy); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy) .addReg(FramePtrReg); } @@ -1133,9 +1150,9 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, const unsigned Alignment = MFI.getMaxAlign().value(); RoundedSize += Alignment; - if (LiveRegs.empty()) { - LiveRegs.init(TRI); - LiveRegs.addLiveIns(MBB); + if (LiveUnits.empty()) { + LiveUnits.init(TRI); + LiveUnits.addLiveIns(MBB); } // s_add_i32 s33, s32, NumBytes @@ -1158,10 +1175,10 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, // If FP is used, emit the CSR spills with FP base register. if (HasFP) { - emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg, + emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg, FramePtrRegScratchCopy); if (FramePtrRegScratchCopy) - LiveRegs.removeReg(FramePtrRegScratchCopy); + LiveUnits.removeReg(FramePtrRegScratchCopy); } // If we need a base pointer, set it up here. It's whatever the value of @@ -1210,7 +1227,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); - LivePhysRegs LiveRegs; + LiveRegUnits LiveUnits; // Get the insert location for the epilogue. If there were no terminators in // the block, get the last instruction. MachineBasicBlock::iterator MBBI = MBB.end(); @@ -1240,19 +1257,19 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, // SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP // into a new scratch register and copy to FP later when other registers are // restored from the current stack frame. - initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); + initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false); if (SGPRForFPSaveRestoreCopy) { - LiveRegs.addReg(SGPRForFPSaveRestoreCopy); + LiveUnits.addReg(SGPRForFPSaveRestoreCopy); } else { FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass); + MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass); if (!FramePtrRegScratchCopy) report_fatal_error("failed to find free scratch register"); - LiveRegs.addReg(FramePtrRegScratchCopy); + LiveUnits.addReg(FramePtrRegScratchCopy); } - emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg, + emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg, FramePtrRegScratchCopy); } @@ -1275,7 +1292,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, MIB.setMIFlag(MachineInstr::FrameDestroy); } else { // Insert the CSR spill restores with SP as the base register. - emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg, + emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg, FramePtrRegScratchCopy); } } @@ -1318,7 +1335,11 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); // Allocate spill slots for WWM reserved VGPRs. - if (!FuncInfo->isEntryFunction()) { + // For chain functions, we only need to do this if we have calls to + // llvm.amdgcn.cs.chain. + bool IsChainWithoutCalls = + FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall(); + if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) { for (Register Reg : FuncInfo->getWWMReservedRegs()) { const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg); FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC), @@ -1353,8 +1374,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, TRI->isAGPR(MRI, VReg))) { assert(RS != nullptr); - // FIXME: change to enterBasicBlockEnd() - RS->enterBasicBlock(MBB); + RS->enterBasicBlockEnd(MBB); + RS->backward(std::next(MI.getIterator())); TRI->eliminateFrameIndex(MI, 0, FIOp, RS); SpillFIs.set(FI); continue; @@ -1472,30 +1493,30 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves( SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); - LivePhysRegs LiveRegs; - LiveRegs.init(*TRI); + LiveRegUnits LiveUnits; + LiveUnits.init(*TRI); // Initially mark callee saved registers as used so we will not choose them // while looking for scratch SGPRs. const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs(); for (unsigned I = 0; CSRegs[I]; ++I) - LiveRegs.addReg(CSRegs[I]); + LiveUnits.addReg(CSRegs[I]); const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass(); if (NeedExecCopyReservedReg) { Register ReservedReg = MFI->getSGPRForEXECCopy(); assert(ReservedReg && "Should have reserved an SGPR for EXEC copy."); - Register UnusedScratchReg = findUnusedRegister(MRI, LiveRegs, RC); + Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC); if (UnusedScratchReg) { // If found any unused scratch SGPR, reserve the register itself for Exec // copy and there is no need for any spills in that case. MFI->setSGPRForEXECCopy(UnusedScratchReg); - LiveRegs.addReg(UnusedScratchReg); + LiveUnits.addReg(UnusedScratchReg); } else { // Needs spill. assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedReg) && "Re-reserving spill slot for EXEC copy register"); - getVGPRSpillLaneOrTempRegister(MF, LiveRegs, ReservedReg, RC, + getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedReg, RC, /*IncludeScratchCopy=*/false); } } @@ -1516,14 +1537,14 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves( Register FramePtrReg = MFI->getFrameOffsetReg(); assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) && "Re-reserving spill slot for FP"); - getVGPRSpillLaneOrTempRegister(MF, LiveRegs, FramePtrReg); + getVGPRSpillLaneOrTempRegister(MF, LiveUnits, FramePtrReg); } if (TRI->hasBasePointer(MF)) { Register BasePtrReg = TRI->getBaseRegister(); assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) && "Re-reserving spill slot for BP"); - getVGPRSpillLaneOrTempRegister(MF, LiveRegs, BasePtrReg); + getVGPRSpillLaneOrTempRegister(MF, LiveUnits, BasePtrReg); } } @@ -1531,8 +1552,15 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves( void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedVGPRs, RegScavenger *RS) const { - TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + // If this is a function with the amdgpu_cs_chain[_preserve] calling + // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then + // we don't need to save and restore anything. + if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall()) + return; + + TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS); if (MFI->isEntryFunction()) return; @@ -1551,17 +1579,17 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, // TODO: Handle this elsewhere at an early point. Walking through all MBBs // here would be a bad heuristic. A better way should be by calling // allocateWWMSpill during the regalloc pipeline whenever a physical - // register is allocated for the intended virtual registers. That will - // also help excluding the general use of WRITELANE/READLANE intrinsics - // that won't really need any such special handling. - if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32) + // register is allocated for the intended virtual registers. + if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR) MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg()); - else if (MI.getOpcode() == AMDGPU::V_READLANE_B32) + else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR) MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg()); else if (TII->isWWMRegSpillOpcode(MI.getOpcode())) NeedExecCopyReservedReg = true; else if (MI.getOpcode() == AMDGPU::SI_RETURN || - MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { + MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || + (MFI->isChainFunction() && + TII->isChainCallOpcode(MI.getOpcode()))) { // We expect all return to be the same size. assert(!ReturnMI || (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) == @@ -1695,6 +1723,7 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP( const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const MachineFrameInfo &MFI = MF.getFrameInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); uint64_t EstStackSize = MFI.estimateStackSize(MF); uint64_t MaxOffset = EstStackSize - 1; @@ -1706,12 +1735,11 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP( // rather than allocating as close to possible. This could save a lot of space // on frames with alignment requirements. if (ST.enableFlatScratch()) { - const SIInstrInfo *TII = ST.getInstrInfo(); if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch)) return false; } else { - if (SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset)) + if (TII->isLegalMUBUFImmOffset(MaxOffset)) return false; } @@ -1770,10 +1798,11 @@ static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) { bool SIFrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); - // For entry functions we can use an immediate offset in most cases, so the - // presence of calls doesn't imply we need a distinct frame pointer. + // For entry & chain functions we can use an immediate offset in most cases, + // so the presence of calls doesn't imply we need a distinct frame pointer. if (MFI.hasCalls() && - !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) { + !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() && + !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) { // All offsets are unsigned, so need to be addressed in the same direction // as stack growth. @@ -1793,11 +1822,14 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const { // register. We may need to initialize the stack pointer depending on the frame // properties, which logically overlaps many of the cases where an ordinary // function would require an FP. +// Also used for chain functions. While not technically entry functions, chain +// functions may need to set up a stack pointer in some situations. bool SIFrameLowering::requiresStackPointerReference( const MachineFunction &MF) const { // Callable functions always require a stack pointer reference. - assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() && - "only expected to call this for entry points"); + assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() || + MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) && + "only expected to call this for entry points and chain functions"); const MachineFrameInfo &MFI = MF.getFrameInfo(); |