| author | Dimitry Andric <dim@FreeBSD.org> | 2019-08-21 18:13:02 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2019-08-21 18:13:02 +0000 |
| commit | 54db30ce18663e6c2991958f3b5d18362e8e93c4 (patch) | |
| tree | 4aa6442802570767398cc83ba484e97b1309bdc2 /contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | |
| parent | 35284c22e9c8348159b7ce032ea45f2cdeb65298 (diff) | |
| parent | e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (diff) | |
Merge llvm trunk r366426, resolve conflicts, and update FREEBSD-Xlist.
Notes:
svn path=/projects/clang900-import/; revision=351344
Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 804
1 file changed, 760 insertions, 44 deletions
```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 69ddbfb53958..885239e2faed 100644
--- a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1,9 +1,8 @@
 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -21,6 +20,7 @@
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -38,6 +38,7 @@ using namespace llvm;
 //===----------------------------------------------------------------------===//
 
 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
+  IsHazardRecognizerMode(false),
   CurrCycleInstr(nullptr),
   MF(MF),
   ST(MF.getSubtarget<GCNSubtarget>()),
@@ -45,7 +46,8 @@ GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
   TRI(TII.getRegisterInfo()),
   ClauseUses(TRI.getNumRegUnits()),
   ClauseDefs(TRI.getNumRegUnits()) {
-  MaxLookAhead = 5;
+  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
+  TSchedModel.init(&ST);
 }
 
 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
@@ -114,6 +116,12 @@ static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
   }
 }
 
+static bool isPermlane(const MachineInstr &MI) {
+  unsigned Opcode = MI.getOpcode();
+  return Opcode == AMDGPU::V_PERMLANE16_B32 ||
+         Opcode == AMDGPU::V_PERMLANEX16_B32;
+}
+
 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                      AMDGPU::OpName::simm16);
@@ -123,6 +131,8 @@ static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
 ScheduleHazardRecognizer::HazardType
 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
   MachineInstr *MI = SU->getInstr();
+  if (MI->isBundle())
+    return NoHazard;
 
   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
     return NoopHazard;
@@ -133,6 +143,15 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
       && checkVMEMHazards(MI) > 0)
     return NoopHazard;
 
+  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
+    return NoopHazard;
+
+  if (checkFPAtomicToDenormModeHazard(MI) > 0)
+    return NoopHazard;
+
+  if (ST.hasNoDataDepHazard())
+    return NoHazard;
+
   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
     return NoopHazard;
 
@@ -163,6 +182,12 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
       checkReadM0Hazards(MI) > 0)
     return NoopHazard;
 
+  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
+    return NoopHazard;
+
+  if ((MI->mayLoad() || MI->mayStore()) && checkMAILdStHazards(MI) > 0)
+    return NoopHazard;
+
   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
     return NoopHazard;
 
@@ -172,22 +197,74 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
   return NoHazard;
 }
 
+static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) {
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
+      .addImm(0);
+}
+
+void GCNHazardRecognizer::processBundle() {
+  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
+  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
+  // Check bundled MachineInstr's for hazards.
+  for (; MI != E && MI->isInsideBundle(); ++MI) {
+    CurrCycleInstr = &*MI;
+    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
+
+    if (IsHazardRecognizerMode)
+      fixHazards(CurrCycleInstr);
+
+    for (unsigned i = 0; i < WaitStates; ++i)
+      insertNoopInBundle(CurrCycleInstr, TII);
+
+    // It's unnecessary to track more than MaxLookAhead instructions. Since we
+    // include the bundled MI directly after, only add a maximum of
+    // (MaxLookAhead - 1) noops to EmittedInstrs.
+    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
+      EmittedInstrs.push_front(nullptr);
+
+    EmittedInstrs.push_front(CurrCycleInstr);
+    EmittedInstrs.resize(MaxLookAhead);
+  }
+  CurrCycleInstr = nullptr;
+}
+
 unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
-  return PreEmitNoops(SU->getInstr());
+  IsHazardRecognizerMode = false;
+  return PreEmitNoopsCommon(SU->getInstr());
 }
 
 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
+  IsHazardRecognizerMode = true;
+  CurrCycleInstr = MI;
+  unsigned W = PreEmitNoopsCommon(MI);
+  fixHazards(MI);
+  CurrCycleInstr = nullptr;
+  return W;
+}
+
+unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
+  if (MI->isBundle())
+    return 0;
+
   int WaitStates = std::max(0, checkAnyInstHazards(MI));
 
   if (SIInstrInfo::isSMRD(*MI))
     return std::max(WaitStates, checkSMRDHazards(MI));
 
-  if (SIInstrInfo::isVALU(*MI))
-    WaitStates = std::max(WaitStates, checkVALUHazards(MI));
-
   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
 
+  if (ST.hasNSAtoVMEMBug())
+    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
+
+  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
+
+  if (ST.hasNoDataDepHazard())
+    return WaitStates;
+
+  if (SIInstrInfo::isVALU(*MI))
+    WaitStates = std::max(WaitStates, checkVALUHazards(MI));
+
   if (SIInstrInfo::isDPP(*MI))
     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
 
@@ -216,6 +293,12 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
     return std::max(WaitStates, checkReadM0Hazards(MI));
 
+  if (SIInstrInfo::isMAI(*MI))
+    return std::max(WaitStates, checkMAIHazards(MI));
+
+  if (MI->mayLoad() || MI->mayStore())
+    return std::max(WaitStates, checkMAILdStHazards(MI));
+
   return WaitStates;
 }
 
@@ -232,10 +315,14 @@ void GCNHazardRecognizer::AdvanceCycle() {
   // Do not track non-instructions which do not affect the wait states.
   // If included, these instructions can lead to buffer overflow such that
   // detectable hazards are missed.
-  if (CurrCycleInstr->getOpcode() == AMDGPU::IMPLICIT_DEF)
+  if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
+      CurrCycleInstr->isKill())
     return;
-  else if (CurrCycleInstr->isDebugInstr())
+
+  if (CurrCycleInstr->isBundle()) {
+    processBundle();
     return;
+  }
 
   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
 
@@ -266,41 +353,112 @@ void GCNHazardRecognizer::RecedeCycle() {
 // Helper Functions
 //===----------------------------------------------------------------------===//
 
-int GCNHazardRecognizer::getWaitStatesSince(
-    function_ref<bool(MachineInstr *)> IsHazard) {
+typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
+
+// Returns a minimum wait states since \p I walking all predecessors.
+// Only scans until \p IsExpired does not return true.
+// Can only be run in a hazard recognizer mode.
+static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
+                              MachineBasicBlock *MBB,
+                              MachineBasicBlock::reverse_instr_iterator I,
+                              int WaitStates,
+                              IsExpiredFn IsExpired,
+                              DenseSet<const MachineBasicBlock *> &Visited) {
+  for (auto E = MBB->instr_rend(); I != E; ++I) {
+    // Don't add WaitStates for parent BUNDLE instructions.
+    if (I->isBundle())
+      continue;
+
+    if (IsHazard(&*I))
+      return WaitStates;
+
+    if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr())
+      continue;
+
+    WaitStates += SIInstrInfo::getNumWaitStates(*I);
+
+    if (IsExpired(&*I, WaitStates))
+      return std::numeric_limits<int>::max();
+  }
+
+  int MinWaitStates = WaitStates;
+  bool Found = false;
+  for (MachineBasicBlock *Pred : MBB->predecessors()) {
+    if (!Visited.insert(Pred).second)
+      continue;
+
+    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
+                               WaitStates, IsExpired, Visited);
+
+    if (W == std::numeric_limits<int>::max())
+      continue;
+
+    MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
+    if (IsExpired(nullptr, MinWaitStates))
+      return MinWaitStates;
+
+    Found = true;
+  }
+
+  if (Found)
+    return MinWaitStates;
+
+  return std::numeric_limits<int>::max();
+}
+
+static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
+                              MachineInstr *MI,
+                              IsExpiredFn IsExpired) {
+  DenseSet<const MachineBasicBlock *> Visited;
+  return getWaitStatesSince(IsHazard, MI->getParent(),
+                            std::next(MI->getReverseIterator()),
+                            0, IsExpired, Visited);
+}
+
+int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
+  if (IsHazardRecognizerMode) {
+    auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
+      return WaitStates >= Limit;
+    };
+    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
+  }
+
   int WaitStates = 0;
   for (MachineInstr *MI : EmittedInstrs) {
     if (MI) {
       if (IsHazard(MI))
         return WaitStates;
 
-      unsigned Opcode = MI->getOpcode();
-      if (Opcode == AMDGPU::INLINEASM)
+      if (MI->isInlineAsm())
         continue;
     }
     ++WaitStates;
+
+    if (WaitStates >= Limit)
+      break;
   }
   return std::numeric_limits<int>::max();
 }
 
-int GCNHazardRecognizer::getWaitStatesSinceDef(
-    unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
+int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
+                                               IsHazardFn IsHazardDef,
+                                               int Limit) {
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
 
   auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
     return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
   };
 
-  return getWaitStatesSince(IsHazardFn);
+  return getWaitStatesSince(IsHazardFn, Limit);
 }
 
-int GCNHazardRecognizer::getWaitStatesSinceSetReg(
-    function_ref<bool(MachineInstr *)> IsHazard) {
+int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
+                                                  int Limit) {
   auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
     return isSSetReg(MI->getOpcode()) && IsHazard(MI);
   };
 
-  return getWaitStatesSince(IsHazardFn);
+  return getWaitStatesSince(IsHazardFn, Limit);
 }
 
//===----------------------------------------------------------------------===//
@@ -342,9 +500,9 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
   // instructions in this group may return out of order and/or may be
   // replayed (i.e. the same instruction issued more than once).
   //
-  // In order to handle these situations correctly we need to make sure
-  // that when a clause has more than one instruction, no instruction in the
-  // clause writes to a register that is read another instruction in the clause
+  // In order to handle these situations correctly we need to make sure that
+  // when a clause has more than one instruction, no instruction in the clause
+  // writes to a register that is read by another instruction in the clause
   // (including itself). If we encounter this situaion, we need to break the
   // clause by inserting a non SMEM instruction.
 
@@ -377,13 +535,12 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
 }
 
 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   int WaitStatesNeeded = 0;
 
   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
 
   // This SMRD hazard only affects SI.
-  if (ST.getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS)
+  if (!ST.hasSMRDReadVALUDefHazard())
     return WaitStatesNeeded;
 
   // A read of an SGPR by SMRD instruction requires 4 wait states when the
@@ -398,7 +555,8 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
     if (!Use.isReg())
       continue;
     int WaitStatesNeededForUse =
-        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
+                                                   SmrdSgprWaitStates);
     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
 
     // This fixes what appears to be undocumented hardware behavior in SI where
@@ -411,7 +569,8 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
     if (IsBufferSMRD) {
       int WaitStatesNeededForUse =
         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
-                                                   IsBufferHazardDefFn);
+                                                   IsBufferHazardDefFn,
+                                                   SmrdSgprWaitStates);
       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
     }
   }
@@ -420,7 +579,7 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
 }
 
 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
-  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+  if (!ST.hasVMEMReadSGPRVALUDefHazard())
     return 0;
 
   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
@@ -429,13 +588,13 @@ int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
   // SGPR was written by a VALU Instruction.
   const int VmemSgprWaitStates = 5;
   auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
-
   for (const MachineOperand &Use : VMEM->uses()) {
     if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
       continue;
 
     int WaitStatesNeededForUse =
-        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
+                                                   VmemSgprWaitStates);
     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   }
   return WaitStatesNeeded;
@@ -455,13 +614,16 @@ int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
       continue;
     int WaitStatesNeededForUse =
-        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg());
+        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
+                              [](MachineInstr *) { return true; },
+                              DppVgprWaitStates);
     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   }
 
   WaitStatesNeeded = std::max(
       WaitStatesNeeded,
-      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn));
+      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
+                                                DppExecWaitStates));
 
   return WaitStatesNeeded;
 }
@@ -473,7 +635,8 @@ int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
   // instruction.
   const int DivFMasWaitStates = 4;
   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
-  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn);
+  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
+                                               DivFMasWaitStates);
 
   return DivFMasWaitStates - WaitStatesNeeded;
 }
@@ -486,7 +649,7 @@ int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
   auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
     return GetRegHWReg == getHWReg(TII, *MI);
   };
-  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
 
   return GetRegWaitStates - WaitStatesNeeded;
 }
@@ -495,12 +658,11 @@ int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
   const SIInstrInfo *TII = ST.getInstrInfo();
   unsigned HWReg = getHWReg(TII, *SetRegInstr);
 
-  const int SetRegWaitStates =
-      ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ? 1 : 2;
+  const int SetRegWaitStates = ST.getSetRegWaitStates();
   auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
     return HWReg == getHWReg(TII, *MI);
   };
-  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
 
   return SetRegWaitStates - WaitStatesNeeded;
 }
@@ -571,7 +733,7 @@ int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
            TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
   };
   int WaitStatesNeededForDef =
-    VALUWaitStates - getWaitStatesSince(IsHazardFn);
+    VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
 
   return WaitStatesNeeded;
@@ -636,12 +798,13 @@ int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
   };
 
   const int RWLaneWaitStates = 4;
-  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn);
+  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
+                                              RWLaneWaitStates);
 
   return RWLaneWaitStates - WaitStatesSince;
 }
 
 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
-  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+  if (!ST.hasRFEHazards())
     return 0;
 
   const SIInstrInfo *TII = ST.getInstrInfo();
@@ -651,7 +814,7 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
   auto IsHazardFn = [TII] (MachineInstr *MI) {
     return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
   };
-  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
   return RFEWaitStates - WaitStatesNeeded;
 }
 
@@ -675,7 +838,8 @@ int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
       return MI->getOpcode() == AMDGPU::S_MOV_FED_B32;
     };
     int WaitStatesNeededForUse =
-        MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn);
+        MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn,
+                                                 MovFedWaitStates);
     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   }
 
@@ -688,5 +852,557 @@ int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
   auto IsHazardFn = [TII] (MachineInstr *MI) {
     return TII->isSALU(*MI);
   };
-  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn);
+  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
+                                                   SMovRelWaitStates);
+}
+
+void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
+  fixVMEMtoScalarWriteHazards(MI);
+  fixVcmpxPermlaneHazards(MI);
+  fixSMEMtoVectorWriteHazards(MI);
+  fixVcmpxExecWARHazard(MI);
+  fixLdsBranchVmemWARHazard(MI);
+}
+
+bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
+  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
+    return false;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  auto IsHazardFn = [TII] (MachineInstr *MI) {
+    return TII->isVOPC(*MI);
+  };
+
+  auto IsExpiredFn = [] (MachineInstr *MI, int) {
+    if (!MI)
+      return false;
+    unsigned Opc = MI->getOpcode();
+    return SIInstrInfo::isVALU(*MI) &&
+           Opc != AMDGPU::V_NOP_e32 &&
+           Opc != AMDGPU::V_NOP_e64 &&
+           Opc != AMDGPU::V_NOP_sdwa;
+  };
+
+  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+      std::numeric_limits<int>::max())
+    return false;
+
+  // V_NOP will be discarded by SQ.
+  // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
+  // which is always a VGPR and available.
+  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
+  unsigned Reg = Src0->getReg();
+  bool IsUndef = Src0->isUndef();
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII->get(AMDGPU::V_MOV_B32_e32))
+    .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
+    .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
+
+  return true;
+}
+
+bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
+  if (!ST.hasVMEMtoScalarWriteHazard())
+    return false;
+
+  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
+    return false;
+
+  if (MI->getNumDefs() == 0)
+    return false;
+
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
+    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
+        !SIInstrInfo::isFLAT(*I))
+      return false;
+
+    for (const MachineOperand &Def : MI->defs()) {
+      MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
+      if (!Op)
+        continue;
+      return true;
+    }
+    return false;
+  };
+
+  auto IsExpiredFn = [] (MachineInstr *MI, int) {
+    return MI && (SIInstrInfo::isVALU(*MI) ||
+                  (MI->getOpcode() == AMDGPU::S_WAITCNT &&
+                   !MI->getOperand(0).getImm()));
+  };
+
+  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+      std::numeric_limits<int>::max())
+    return false;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
+  return true;
+}
+
+bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
+  if (!ST.hasSMEMtoVectorWriteHazard())
+    return false;
+
+  if (!SIInstrInfo::isVALU(*MI))
+    return false;
+
+  unsigned SDSTName;
+  switch (MI->getOpcode()) {
+  case AMDGPU::V_READLANE_B32:
+  case AMDGPU::V_READFIRSTLANE_B32:
+    SDSTName = AMDGPU::OpName::vdst;
+    break;
+  default:
+    SDSTName = AMDGPU::OpName::sdst;
+    break;
+  }
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
+  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
+  if (!SDST) {
+    for (const auto &MO : MI->implicit_operands()) {
+      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
+        SDST = &MO;
+        break;
+      }
+    }
+  }
+
+  if (!SDST)
+    return false;
+
+  const unsigned SDSTReg = SDST->getReg();
+  auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
+    return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
+  };
+
+  auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
+    if (MI) {
+      if (TII->isSALU(*MI)) {
+        switch (MI->getOpcode()) {
+        case AMDGPU::S_SETVSKIP:
+        case AMDGPU::S_VERSION:
+        case AMDGPU::S_WAITCNT_VSCNT:
+        case AMDGPU::S_WAITCNT_VMCNT:
+        case AMDGPU::S_WAITCNT_EXPCNT:
+          // These instructions cannot not mitigate the hazard.
+          return false;
+        case AMDGPU::S_WAITCNT_LGKMCNT:
+          // Reducing lgkmcnt count to 0 always mitigates the hazard.
+          return (MI->getOperand(1).getImm() == 0) &&
+                 (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+        case AMDGPU::S_WAITCNT: {
+          const int64_t Imm = MI->getOperand(0).getImm();
+          AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
+          return (Decoded.LgkmCnt == 0);
+        }
+        default:
+          // SOPP instructions cannot mitigate the hazard.
+          if (TII->isSOPP(*MI))
+            return false;
+          // At this point the SALU can be assumed to mitigate the hazard
+          // because either:
+          // (a) it is independent of the at risk SMEM (breaking chain),
+          // or
+          // (b) it is dependent on the SMEM, in which case an appropriate
+          //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
+          //     SMEM instruction.
+          return true;
+        }
+      }
+    }
+    return false;
+  };
+
+  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+      std::numeric_limits<int>::max())
+    return false;
+
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
+    .addImm(0);
+  return true;
+}
+
+bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
+  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
+    return false;
+
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
+    return false;
+
+  auto IsHazardFn = [TRI] (MachineInstr *I) {
+    if (SIInstrInfo::isVALU(*I))
+      return false;
+    return I->readsRegister(AMDGPU::EXEC, TRI);
+  };
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
+    if (!MI)
+      return false;
+    if (SIInstrInfo::isVALU(*MI)) {
+      if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
+        return true;
+      for (auto MO : MI->implicit_operands())
+        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
+          return true;
+    }
+    if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+        (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
+      return true;
+    return false;
+  };
+
+  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+      std::numeric_limits<int>::max())
+    return false;
+
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+    .addImm(0xfffe);
+  return true;
+}
+
+bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
+  if (!ST.hasLdsBranchVmemWARHazard())
+    return false;
+
+  auto IsHazardInst = [] (const MachineInstr *MI) {
+    if (SIInstrInfo::isDS(*MI))
+      return 1;
+    if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
+      return 2;
+    return 0;
+  };
+
+  auto InstType = IsHazardInst(MI);
+  if (!InstType)
+    return false;
+
+  auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
+    return I && (IsHazardInst(I) ||
+                 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
+                  I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
+                  !I->getOperand(1).getImm()));
+  };
+
+  auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
+    if (!I->isBranch())
+      return false;
+
+    auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
+      auto InstType2 = IsHazardInst(I);
+      return InstType2 && InstType != InstType2;
+    };
+
+    auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
+      if (!I)
+        return false;
+
+      auto InstType2 = IsHazardInst(I);
+      if (InstType == InstType2)
+        return true;
+
+      return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
+             I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
+             !I->getOperand(1).getImm();
+    };
+
+    return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
+           std::numeric_limits<int>::max();
+  };
+
+  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+      std::numeric_limits<int>::max())
+    return false;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII->get(AMDGPU::S_WAITCNT_VSCNT))
+    .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+    .addImm(0);
+
+  return true;
+}
+
+int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
+  int NSAtoVMEMWaitStates = 1;
+
+  if (!ST.hasNSAtoVMEMBug())
+    return 0;
+
+  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
+    return 0;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
+  if (!Offset || (Offset->getImm() & 6) == 0)
+    return 0;
+
+  auto IsHazardFn = [TII] (MachineInstr *I) {
+    if (!SIInstrInfo::isMIMG(*I))
+      return false;
+    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
+    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
+           TII->getInstSizeInBytes(*I) >= 16;
+  };
+
+  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
+}
+
+int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
+  int FPAtomicToDenormModeWaitStates = 3;
+
+  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
+    return 0;
+
+  auto IsHazardFn = [] (MachineInstr *I) {
+    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
+      return false;
+    return SIInstrInfo::isFPAtomic(*I);
+  };
+
+  auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
+    if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
+      return true;
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::S_WAITCNT:
+    case AMDGPU::S_WAITCNT_VSCNT:
+    case AMDGPU::S_WAITCNT_VMCNT:
+    case AMDGPU::S_WAITCNT_EXPCNT:
+    case AMDGPU::S_WAITCNT_LGKMCNT:
+    case AMDGPU::S_WAITCNT_IDLE:
+      return true;
+    default:
+      break;
+    }
+
+    return false;
+  };
+
+
+  return FPAtomicToDenormModeWaitStates -
+         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
+}
+
+int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
+  assert(SIInstrInfo::isMAI(*MI));
+
+  int WaitStatesNeeded = 0;
+  unsigned Opc = MI->getOpcode();
+
+  auto IsVALUFn = [] (MachineInstr *MI) {
+    return SIInstrInfo::isVALU(*MI);
+  };
+
+  if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
+    const int LegacyVALUWritesVGPRWaitStates = 2;
+    const int VALUWritesExecWaitStates = 4;
+    const int MaxWaitStates = 4;
+
+    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
+      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+    if (WaitStatesNeeded < MaxWaitStates) {
+      for (const MachineOperand &Use : MI->explicit_uses()) {
+        const int MaxWaitStates = 2;
+
+        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
+          continue;
+
+        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
+          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
+        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+        if (WaitStatesNeeded == MaxWaitStates)
+          break;
+      }
+    }
+  }
+
+  auto IsMFMAFn = [] (MachineInstr *MI) {
+    return SIInstrInfo::isMAI(*MI) &&
+           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
+           MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
+  };
+
+  for (const MachineOperand &Op : MI->explicit_operands()) {
+    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
+      continue;
+
+    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
+      continue;
+
+    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
+    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
+    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
+    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
+    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
+    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
+    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
+    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
+    const int MaxWaitStates = 18;
+    unsigned Reg = Op.getReg();
+    unsigned HazardDefLatency = 0;
+
+    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
+                              (MachineInstr *MI) {
+      if (!IsMFMAFn(MI))
+        return false;
+      unsigned DstReg = MI->getOperand(0).getReg();
+      if (DstReg == Reg)
+        return false;
+      HazardDefLatency = std::max(HazardDefLatency,
+                                  TSchedModel.computeInstrLatency(MI));
+      return TRI.regsOverlap(DstReg, Reg);
+    };
+
+    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
+                                                   MaxWaitStates);
+    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
+    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+    int OpNo = MI->getOperandNo(&Op);
+    if (OpNo == SrcCIdx) {
+      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
+    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
+      switch (HazardDefLatency) {
+      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
+               break;
+      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
+               break;
+      case 16: LLVM_FALLTHROUGH;
+      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
+               break;
+      }
+    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
+      switch (HazardDefLatency) {
+      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
+               break;
+      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
+               break;
+      case 16: LLVM_FALLTHROUGH;
+      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
+               break;
+      }
+    }
+
+    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+    if (WaitStatesNeeded == MaxWaitStates)
+      return WaitStatesNeeded; // Early exit.
+
+    auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
+      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
+        return false;
+      unsigned DstReg = MI->getOperand(0).getReg();
+      return TRI.regsOverlap(Reg, DstReg);
+    };
+
+    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
+    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
+    const int AccVGPRWriteAccVgprReadWaitStates = 3;
+    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
+    if (OpNo == SrcCIdx)
+      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
+    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
+      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
+
+    WaitStatesNeededForUse = NeedWaitStates -
+      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+    if (WaitStatesNeeded == MaxWaitStates)
+      return WaitStatesNeeded; // Early exit.
+  }
+
+  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
+    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
+    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
+    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
+    const int MaxWaitStates = 13;
+    unsigned DstReg = MI->getOperand(0).getReg();
+    unsigned HazardDefLatency = 0;
+
+    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
+                        (MachineInstr *MI) {
+      if (!IsMFMAFn(MI))
+        return false;
+      unsigned Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
+      HazardDefLatency = std::max(HazardDefLatency,
+                                  TSchedModel.computeInstrLatency(MI));
+      return TRI.regsOverlap(Reg, DstReg);
+    };
+
+    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
+    int NeedWaitStates;
+    switch (HazardDefLatency) {
+    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
+             break;
+    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
+             break;
+    case 16: LLVM_FALLTHROUGH;
+    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
+             break;
+    }
+
+    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+  }
+
+  return WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
+  if (!ST.hasMAIInsts())
+    return 0;
+
+  int WaitStatesNeeded = 0;
+
+  auto IsAccVgprReadFn = [] (MachineInstr *MI) {
+    return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
+  };
+
+  for (const MachineOperand &Op : MI->explicit_uses()) {
+    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
+      continue;
+
+    unsigned Reg = Op.getReg();
+
+    const int AccVgprReadLdStWaitStates = 2;
+    const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
+    const int MaxWaitStates = 2;
+
+    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
+      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+    if (WaitStatesNeeded == MaxWaitStates)
+      return WaitStatesNeeded; // Early exit.
+
+    auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) {
+      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
+        return false;
+      auto IsVALUFn = [] (MachineInstr *MI) {
+        return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
+      };
+      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
+             std::numeric_limits<int>::max();
+    };
+
+    WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates -
+      getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+  }
+
+  return WaitStatesNeeded;
+}
```
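The recurring pattern in this change is that every hazard check now passes a Limit equal to the wait states it needs, so getWaitStatesSince can stop scanning once the hazard could no longer matter. A minimal standalone sketch of that bounded scan, independent of LLVM's MachineInstr machinery (the `Instr` struct and names below are illustrative, not the real API):

```cpp
#include <deque>
#include <functional>
#include <limits>

// Stand-in for a machine instruction; the real code walks llvm::MachineInstr.
struct Instr {
  int Opcode = 0;
};

// Walk previously emitted instructions (most recent first) and count wait
// states until one satisfies IsHazard. A nullptr entry models an inserted
// s_nop. Once Limit wait states have elapsed the hazard can no longer
// apply, so the scan gives up and reports "no hazard" as INT_MAX.
int waitStatesSince(const std::deque<const Instr *> &Emitted,
                    const std::function<bool(const Instr &)> &IsHazard,
                    int Limit) {
  int WaitStates = 0;
  for (const Instr *I : Emitted) {
    if (I && IsHazard(*I))
      return WaitStates;
    ++WaitStates;
    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}
```

A caller that needs N wait states then inserts max(0, N - waitStatesSince(...)) noops; because "no hazard" is INT_MAX, the subtraction clamps to zero for hazards that are too far back, which is what lets the new code bound every scan.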
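In hazard-recognizer mode (the post-RA path added here), the search is no longer a flat scan of EmittedInstrs: the static ::getWaitStatesSince walks backwards through the basic block and then recurses into every unvisited predecessor, taking the minimum wait states over paths and treating paths that expire as "no hazard". A simplified standalone version of that traversal, again with illustrative types rather than LLVM's:

```cpp
#include <algorithm>
#include <functional>
#include <limits>
#include <set>
#include <vector>

struct Block {
  std::vector<int> Instrs;    // opcodes in program order
  std::vector<Block *> Preds; // predecessor blocks in the CFG
};

// Scan backwards from Instrs[FromIdx - 1]; if no hazard is found in this
// block, recurse into every unvisited predecessor and keep the minimum.
// Paths that reach Limit report INT_MAX, i.e. "expired, no hazard".
int waitStatesSince(const Block *B, std::size_t FromIdx, int WaitStates,
                    int Limit, const std::function<bool(int)> &IsHazard,
                    std::set<const Block *> &Visited) {
  for (std::size_t I = FromIdx; I-- > 0;) {
    if (IsHazard(B->Instrs[I]))
      return WaitStates;
    if (++WaitStates >= Limit)
      return std::numeric_limits<int>::max();
  }
  int Min = std::numeric_limits<int>::max();
  for (const Block *P : B->Preds) {
    if (!Visited.insert(P).second)
      continue; // each block is searched at most once
    Min = std::min(Min, waitStatesSince(P, P->Instrs.size(), WaitStates,
                                        Limit, IsHazard, Visited));
  }
  return Min;
}
```

The real function additionally skips BUNDLE markers, inline asm, and debug instructions, and lets IsExpired consult the running wait-state count, but the overall shape, a depth-first walk over predecessors with early expiry, is the same.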
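The reworded comment in checkSoftClauseHazards states the clause rule: when a clause has more than one instruction, no instruction may write a register that any instruction in the clause reads (including itself), otherwise the clause must be broken with a non-SMEM instruction. With registers tracked as bits over register units, as the recognizer's ClauseUses/ClauseDefs fields do, the test is a set intersection; a toy version with an assumed, illustrative register-unit count:

```cpp
#include <bitset>
#include <cstddef>

constexpr std::size_t NumRegUnits = 256; // illustrative, not the real count

// A multi-instruction clause must be broken if any register written inside
// the clause is also read inside the clause (an instruction reading its own
// result counts too).
bool clauseNeedsBreak(const std::bitset<NumRegUnits> &ClauseDefs,
                      const std::bitset<NumRegUnits> &ClauseUses) {
  return (ClauseDefs & ClauseUses).any();
}
```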
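Finally, checkMAIHazards keys the required wait states off the producing MFMA's latency (2, 8, or 16 cycles for the 4x4, 16x16, and 32x32 variants in the switches above) and folds max(0, needed - elapsed) into WaitStatesNeeded. That arithmetic, extracted as a standalone sketch using the constants from the diff:

```cpp
#include <algorithm>

// Wait states required between an MFMA that writes an AGPR and a later
// v_accvgpr_read of an overlapping register, selected by the MFMA's
// latency exactly as in the corresponding switch in checkMAIHazards.
int mfmaWriteThenAccVgprReadWaitStates(unsigned HazardDefLatency) {
  switch (HazardDefLatency) {
  case 2:  return 4;  // MFMA4x4WritesAGPRAccVgprReadWaitStates
  case 8:  return 10; // MFMA16x16WritesAGPRAccVgprReadWaitStates
  case 16:            // treated like the default case
  default: return 18; // MFMA32x32WritesAGPRAccVgprReadWaitStates
  }
}

// Wait states still outstanding once WaitStatesSinceDef have already
// elapsed; this is the value folded into WaitStatesNeeded via std::max.
int waitStatesStillNeeded(unsigned HazardDefLatency, int WaitStatesSinceDef) {
  return std::max(0, mfmaWriteThenAccVgprReadWaitStates(HazardDefLatency) -
                         WaitStatesSinceDef);
}
```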