| author | Dimitry Andric <dim@FreeBSD.org> | 2019-08-21 18:13:02 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2019-08-21 18:13:02 +0000 |
| commit | 54db30ce18663e6c2991958f3b5d18362e8e93c4 (patch) | |
| tree | 4aa6442802570767398cc83ba484e97b1309bdc2 /contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | |
| parent | 35284c22e9c8348159b7ce032ea45f2cdeb65298 (diff) | |
| parent | e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (diff) | |
Merge llvm trunk r366426, resolve conflicts, and update FREEBSD-Xlist.
Notes:
svn path=/projects/clang900-import/; revision=351344
Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 804
1 file changed, 760 insertions, 44 deletions
```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 69ddbfb53958..885239e2faed 100644
--- a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1,9 +1,8 @@
 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -21,6 +20,7 @@
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -38,6 +38,7 @@ using namespace llvm;
 //===----------------------------------------------------------------------===//
 
 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
+  IsHazardRecognizerMode(false),
   CurrCycleInstr(nullptr),
   MF(MF),
   ST(MF.getSubtarget<GCNSubtarget>()),
@@ -45,7 +46,8 @@ GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
   TRI(TII.getRegisterInfo()),
   ClauseUses(TRI.getNumRegUnits()),
   ClauseDefs(TRI.getNumRegUnits()) {
-  MaxLookAhead = 5;
+  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
+  TSchedModel.init(&ST);
 }
 
 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
@@ -114,6 +116,12 @@ static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
   }
 }
 
+static bool isPermlane(const MachineInstr &MI) {
+  unsigned Opcode = MI.getOpcode();
+  return Opcode == AMDGPU::V_PERMLANE16_B32 ||
+         Opcode == AMDGPU::V_PERMLANEX16_B32;
+}
+
 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                      AMDGPU::OpName::simm16);
@@ -123,6 +131,8 @@ static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
 ScheduleHazardRecognizer::HazardType
 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
   MachineInstr *MI = SU->getInstr();
+  if (MI->isBundle())
+    return NoHazard;
 
   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
     return NoopHazard;
@@ -133,6 +143,15 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
       && checkVMEMHazards(MI) > 0)
     return NoopHazard;
 
+  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
+    return NoopHazard;
+
+  if (checkFPAtomicToDenormModeHazard(MI) > 0)
+    return NoopHazard;
+
+  if (ST.hasNoDataDepHazard())
+    return NoHazard;
+
   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
     return NoopHazard;
 
@@ -163,6 +182,12 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
       checkReadM0Hazards(MI) > 0)
     return NoopHazard;
 
+  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
+    return NoopHazard;
+
+  if ((MI->mayLoad() || MI->mayStore()) && checkMAILdStHazards(MI) > 0)
+    return NoopHazard;
+
   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
     return NoopHazard;
 
@@ -172,22 +197,74 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
   return NoHazard;
 }
 
+static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) {
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
+      .addImm(0);
+}
+
+void GCNHazardRecognizer::processBundle() {
+  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
+  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
+  // Check bundled MachineInstr's for hazards.
+  for (; MI != E && MI->isInsideBundle(); ++MI) {
+    CurrCycleInstr = &*MI;
+    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
+
+    if (IsHazardRecognizerMode)
+      fixHazards(CurrCycleInstr);
+
+    for (unsigned i = 0; i < WaitStates; ++i)
+      insertNoopInBundle(CurrCycleInstr, TII);
+
+    // It's unnecessary to track more than MaxLookAhead instructions. Since we
+    // include the bundled MI directly after, only add a maximum of
+    // (MaxLookAhead - 1) noops to EmittedInstrs.
+    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
+      EmittedInstrs.push_front(nullptr);
+
+    EmittedInstrs.push_front(CurrCycleInstr);
+    EmittedInstrs.resize(MaxLookAhead);
+  }
+  CurrCycleInstr = nullptr;
+}
+
 unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
-  return PreEmitNoops(SU->getInstr());
+  IsHazardRecognizerMode = false;
+  return PreEmitNoopsCommon(SU->getInstr());
 }
 
 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
+  IsHazardRecognizerMode = true;
+  CurrCycleInstr = MI;
+  unsigned W = PreEmitNoopsCommon(MI);
+  fixHazards(MI);
+  CurrCycleInstr = nullptr;
+  return W;
+}
+
+unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
+  if (MI->isBundle())
+    return 0;
+
   int WaitStates = std::max(0, checkAnyInstHazards(MI));
 
   if (SIInstrInfo::isSMRD(*MI))
     return std::max(WaitStates, checkSMRDHazards(MI));
 
-  if (SIInstrInfo::isVALU(*MI))
-    WaitStates = std::max(WaitStates, checkVALUHazards(MI));
-
   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
 
+  if (ST.hasNSAtoVMEMBug())
+    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
+
+  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
+
+  if (ST.hasNoDataDepHazard())
+    return WaitStates;
+
+  if (SIInstrInfo::isVALU(*MI))
+    WaitStates = std::max(WaitStates, checkVALUHazards(MI));
+
   if (SIInstrInfo::isDPP(*MI))
     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
 
@@ -216,6 +293,12 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
   if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
     return std::max(WaitStates, checkReadM0Hazards(MI));
 
+  if (SIInstrInfo::isMAI(*MI))
+    return std::max(WaitStates, checkMAIHazards(MI));
+
+  if (MI->mayLoad() || MI->mayStore())
+    return std::max(WaitStates, checkMAILdStHazards(MI));
+
   return WaitStates;
 }
 
@@ -232,10 +315,14 @@ void GCNHazardRecognizer::AdvanceCycle() {
   // Do not track non-instructions which do not affect the wait states.
   // If included, these instructions can lead to buffer overflow such that
   // detectable hazards are missed.
-  if (CurrCycleInstr->getOpcode() == AMDGPU::IMPLICIT_DEF)
+  if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
+      CurrCycleInstr->isKill())
     return;
-  else if (CurrCycleInstr->isDebugInstr())
+
+  if (CurrCycleInstr->isBundle()) {
+    processBundle();
     return;
+  }
 
   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
 
@@ -266,41 +353,112 @@ void GCNHazardRecognizer::RecedeCycle() {
 // Helper Functions
 //===----------------------------------------------------------------------===//
 
-int GCNHazardRecognizer::getWaitStatesSince(
-    function_ref<bool(MachineInstr *)> IsHazard) {
+typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
+
+// Returns a minimum wait states since \p I walking all predecessors.
+// Only scans until \p IsExpired does not return true.
+// Can only be run in a hazard recognizer mode.
+static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
+                              MachineBasicBlock *MBB,
+                              MachineBasicBlock::reverse_instr_iterator I,
+                              int WaitStates,
+                              IsExpiredFn IsExpired,
+                              DenseSet<const MachineBasicBlock *> &Visited) {
+  for (auto E = MBB->instr_rend(); I != E; ++I) {
+    // Don't add WaitStates for parent BUNDLE instructions.
+    if (I->isBundle())
+      continue;
+
+    if (IsHazard(&*I))
+      return WaitStates;
+
+    if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr())
+      continue;
+
+    WaitStates += SIInstrInfo::getNumWaitStates(*I);
+
+    if (IsExpired(&*I, WaitStates))
+      return std::numeric_limits<int>::max();
+  }
+
+  int MinWaitStates = WaitStates;
+  bool Found = false;
+  for (MachineBasicBlock *Pred : MBB->predecessors()) {
+    if (!Visited.insert(Pred).second)
+      continue;
+
+    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
+                               WaitStates, IsExpired, Visited);
+
+    if (W == std::numeric_limits<int>::max())
+      continue;
+
+    MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
+    if (IsExpired(nullptr, MinWaitStates))
+      return MinWaitStates;
+
+    Found = true;
+  }
+
+  if (Found)
+    return MinWaitStates;
+
+  return std::numeric_limits<int>::max();
+}
+
+static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
+                              MachineInstr *MI,
+                              IsExpiredFn IsExpired) {
+  DenseSet<const MachineBasicBlock *> Visited;
+  return getWaitStatesSince(IsHazard, MI->getParent(),
+                            std::next(MI->getReverseIterator()),
+                            0, IsExpired, Visited);
+}
+
+int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
+  if (IsHazardRecognizerMode) {
+    auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
+      return WaitStates >= Limit;
+    };
+    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
+  }
+
   int WaitStates = 0;
   for (MachineInstr *MI : EmittedInstrs) {
     if (MI) {
       if (IsHazard(MI))
         return WaitStates;
 
-      unsigned Opcode = MI->getOpcode();
-      if (Opcode == AMDGPU::INLINEASM)
+      if (MI->isInlineAsm())
         continue;
     }
     ++WaitStates;
+
+    if (WaitStates >= Limit)
+      break;
   }
   return std::numeric_limits<int>::max();
 }
 
-int GCNHazardRecognizer::getWaitStatesSinceDef(
-    unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
+int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
+                                               IsHazardFn IsHazardDef,
+                                               int Limit) {
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
 
   auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
     return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
   };
 
-  return getWaitStatesSince(IsHazardFn);
+  return getWaitStatesSince(IsHazardFn, Limit);
 }
 
-int GCNHazardRecognizer::getWaitStatesSinceSetReg(
-    function_ref<bool(MachineInstr *)> IsHazard) {
+int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
+                                                  int Limit) {
   auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
     return isSSetReg(MI->getOpcode()) && IsHazard(MI);
   };
 
-  return getWaitStatesSince(IsHazardFn);
+  return getWaitStatesSince(IsHazardFn, Limit);
 }
 
//===----------------------------------------------------------------------===//
@@ -342,9 +500,9 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
   // instructions in this group may return out of order and/or may be
   // replayed (i.e. the same instruction issued more than once).
   //
-  // In order to handle these situations correctly we need to make sure
-  // that when a clause has more than one instruction, no instruction in the
-  // clause writes to a register that is read another instruction in the clause
+  // In order to handle these situations correctly we need to make sure that
+  // when a clause has more than one instruction, no instruction in the clause
+  // writes to a register that is read by another instruction in the clause
   // (including itself). If we encounter this situaion, we need to break the
   // clause by inserting a non SMEM instruction.
 
@@ -377,13 +535,12 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
 }
 
 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   int WaitStatesNeeded = 0;
 
   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
 
   // This SMRD hazard only affects SI.
-  if (ST.getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS)
+  if (!ST.hasSMRDReadVALUDefHazard())
     return WaitStatesNeeded;
 
   // A read of an SGPR by SMRD instruction requires 4 wait states when the
@@ -398,7 +555,8 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
     if (!Use.isReg())
       continue;
     int WaitStatesNeededForUse =
-        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
+                                                   SmrdSgprWaitStates);
     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
 
     // This fixes what appears to be undocumented hardware behavior in SI where
@@ -411,7 +569,8 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
     if (IsBufferSMRD) {
       int WaitStatesNeededForUse =
         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
-                                                   IsBufferHazardDefFn);
+                                                   IsBufferHazardDefFn,
+                                                   SmrdSgprWaitStates);
       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
     }
   }
@@ -420,7 +579,7 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
 }
 
 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
-  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+  if (!ST.hasVMEMReadSGPRVALUDefHazard())
     return 0;
 
   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
@@ -429,13 +588,13 @@ int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
   // SGPR was written by a VALU Instruction.
   const int VmemSgprWaitStates = 5;
   auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
-
   for (const MachineOperand &Use : VMEM->uses()) {
     if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
       continue;
 
     int WaitStatesNeededForUse =
-        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
+                                                   VmemSgprWaitStates);
     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   }
   return WaitStatesNeeded;
@@ -455,13 +614,16 @@ int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
       continue;
     int WaitStatesNeededForUse =
-        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg());
+        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
+                              [](MachineInstr *) { return true; },
+                              DppVgprWaitStates);
     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   }
 
   WaitStatesNeeded = std::max(
       WaitStatesNeeded,
-      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn));
+      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
+                                                DppExecWaitStates));
 
   return WaitStatesNeeded;
 }
@@ -473,7 +635,8 @@ int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
   // instruction.
   const int DivFMasWaitStates = 4;
   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
-  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn);
+  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
+                                               DivFMasWaitStates);
 
   return DivFMasWaitStates - WaitStatesNeeded;
 }
@@ -486,7 +649,7 @@ int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
   auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
     return GetRegHWReg == getHWReg(TII, *MI);
   };
-  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
 
   return GetRegWaitStates - WaitStatesNeeded;
 }
@@ -495,12 +658,11 @@ int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
   const SIInstrInfo *TII = ST.getInstrInfo();
   unsigned HWReg = getHWReg(TII, *SetRegInstr);
 
-  const int SetRegWaitStates =
-      ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ? 1 : 2;
+  const int SetRegWaitStates = ST.getSetRegWaitStates();
   auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
     return HWReg == getHWReg(TII, *MI);
   };
-  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
 
   return SetRegWaitStates - WaitStatesNeeded;
 }
@@ -571,7 +733,7 @@ int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
            TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
   };
   int WaitStatesNeededForDef =
-    VALUWaitStates - getWaitStatesSince(IsHazardFn);
+    VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
 
   return WaitStatesNeeded;
@@ -636,12 +798,13 @@ int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
   };
 
   const int RWLaneWaitStates = 4;
-  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn);
+  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
+                                              RWLaneWaitStates);
 
   return RWLaneWaitStates - WaitStatesSince;
 }
 
 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
-  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+  if (!ST.hasRFEHazards())
     return 0;
 
   const SIInstrInfo *TII = ST.getInstrInfo();
@@ -651,7 +814,7 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
   auto IsHazardFn = [TII] (MachineInstr *MI) {
     return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
   };
-  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
   return RFEWaitStates - WaitStatesNeeded;
 }
 
@@ -675,7 +838,8 @@ int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
       return MI->getOpcode() == AMDGPU::S_MOV_FED_B32;
     };
     int WaitStatesNeededForUse =
-        MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn);
+        MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn,
+                                                 MovFedWaitStates);
     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   }
 
@@ -688,5 +852,557 @@ int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
   auto IsHazardFn = [TII] (MachineInstr *MI) {
     return TII->isSALU(*MI);
   };
-  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn);
+  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
+                                                   SMovRelWaitStates);
+}
+
+void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
+  fixVMEMtoScalarWriteHazards(MI);
+  fixVcmpxPermlaneHazards(MI);
+  fixSMEMtoVectorWriteHazards(MI);
+  fixVcmpxExecWARHazard(MI);
+  fixLdsBranchVmemWARHazard(MI);
+}
+
+bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
+  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
+    return false;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  auto IsHazardFn = [TII] (MachineInstr *MI) {
+    return TII->isVOPC(*MI);
+  };
+
+  auto IsExpiredFn = [] (MachineInstr *MI, int) {
+    if (!MI)
+      return false;
+    unsigned Opc = MI->getOpcode();
+    return SIInstrInfo::isVALU(*MI) &&
+           Opc != AMDGPU::V_NOP_e32 &&
+           Opc != AMDGPU::V_NOP_e64 &&
+           Opc != AMDGPU::V_NOP_sdwa;
+  };
+
+  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+      std::numeric_limits<int>::max())
+    return false;
+
+  // V_NOP will be discarded by SQ.
+  // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
+  // which is always a VGPR and available.
+  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
+  unsigned Reg = Src0->getReg();
+  bool IsUndef = Src0->isUndef();
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII->get(AMDGPU::V_MOV_B32_e32))
+    .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
+    .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
+
+  return true;
+}
+
+bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
+  if (!ST.hasVMEMtoScalarWriteHazard())
+    return false;
+
+  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
+    return false;
+
+  if (MI->getNumDefs() == 0)
+    return false;
+
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
+    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
+        !SIInstrInfo::isFLAT(*I))
+      return false;
+
+    for (const MachineOperand &Def : MI->defs()) {
+      MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
+      if (!Op)
+        continue;
+      return true;
+    }
+    return false;
+  };
+
+  auto IsExpiredFn = [] (MachineInstr *MI, int) {
+    return MI && (SIInstrInfo::isVALU(*MI) ||
+                  (MI->getOpcode() == AMDGPU::S_WAITCNT &&
+                   !MI->getOperand(0).getImm()));
+  };
+
+  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+      std::numeric_limits<int>::max())
+    return false;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
+  return true;
+}
+
+bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
+  if (!ST.hasSMEMtoVectorWriteHazard())
+    return false;
+
+  if (!SIInstrInfo::isVALU(*MI))
+    return false;
+
+  unsigned SDSTName;
+  switch (MI->getOpcode()) {
+  case AMDGPU::V_READLANE_B32:
+  case AMDGPU::V_READFIRSTLANE_B32:
+    SDSTName = AMDGPU::OpName::vdst;
+    break;
+  default:
+    SDSTName = AMDGPU::OpName::sdst;
+    break;
+  }
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
+  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
+  if (!SDST) {
+    for (const auto &MO : MI->implicit_operands()) {
+      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
+        SDST = &MO;
+        break;
+      }
+    }
+  }
+
+  if (!SDST)
+    return false;
+
+  const unsigned SDSTReg = SDST->getReg();
+  auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
+    return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
+  };
+
+  auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
+    if (MI) {
+      if (TII->isSALU(*MI)) {
+        switch (MI->getOpcode()) {
+        case AMDGPU::S_SETVSKIP:
+        case AMDGPU::S_VERSION:
+        case AMDGPU::S_WAITCNT_VSCNT:
+        case AMDGPU::S_WAITCNT_VMCNT:
+        case AMDGPU::S_WAITCNT_EXPCNT:
+          // These instructions cannot not mitigate the hazard.
+          return false;
+        case AMDGPU::S_WAITCNT_LGKMCNT:
+          // Reducing lgkmcnt count to 0 always mitigates the hazard.
+          return (MI->getOperand(1).getImm() == 0) &&
+                 (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+        case AMDGPU::S_WAITCNT: {
+          const int64_t Imm = MI->getOperand(0).getImm();
+          AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
+          return (Decoded.LgkmCnt == 0);
+        }
+        default:
+          // SOPP instructions cannot mitigate the hazard.
+          if (TII->isSOPP(*MI))
+            return false;
+          // At this point the SALU can be assumed to mitigate the hazard
+          // because either:
+          // (a) it is independent of the at risk SMEM (breaking chain),
+          // or
+          // (b) it is dependent on the SMEM, in which case an appropriate
+          //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
+          //     SMEM instruction.
+          return true;
+        }
+      }
+    }
+    return false;
+  };
+
+  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+      std::numeric_limits<int>::max())
+    return false;
+
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
+    .addImm(0);
+  return true;
+}
+
+bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
+  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
+    return false;
+
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
+    return false;
+
+  auto IsHazardFn = [TRI] (MachineInstr *I) {
+    if (SIInstrInfo::isVALU(*I))
+      return false;
+    return I->readsRegister(AMDGPU::EXEC, TRI);
+  };
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
+    if (!MI)
+      return false;
+    if (SIInstrInfo::isVALU(*MI)) {
+      if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
+        return true;
+      for (auto MO : MI->implicit_operands())
+        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
+          return true;
+    }
+    if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
+        (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
+      return true;
+    return false;
+  };
+
+  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+      std::numeric_limits<int>::max())
+    return false;
+
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+    .addImm(0xfffe);
+  return true;
+}
+
+bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
+  if (!ST.hasLdsBranchVmemWARHazard())
+    return false;
+
+  auto IsHazardInst = [] (const MachineInstr *MI) {
+    if (SIInstrInfo::isDS(*MI))
+      return 1;
+    if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
+      return 2;
+    return 0;
+  };
+
+  auto InstType = IsHazardInst(MI);
+  if (!InstType)
+    return false;
+
+  auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
+    return I && (IsHazardInst(I) ||
+                 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
+                  I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
+                  !I->getOperand(1).getImm()));
+  };
+
+  auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
+    if (!I->isBranch())
+      return false;
+
+    auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
+      auto InstType2 = IsHazardInst(I);
+      return InstType2 && InstType != InstType2;
+    };
+
+    auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
+      if (!I)
+        return false;
+
+      auto InstType2 = IsHazardInst(I);
+      if (InstType == InstType2)
+        return true;
+
+      return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
+             I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
+             !I->getOperand(1).getImm();
+    };
+
+    return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
+           std::numeric_limits<int>::max();
+  };
+
+  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+      std::numeric_limits<int>::max())
+    return false;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+          TII->get(AMDGPU::S_WAITCNT_VSCNT))
+    .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+    .addImm(0);
+
+  return true;
+}
+
+int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
+  int NSAtoVMEMWaitStates = 1;
+
+  if (!ST.hasNSAtoVMEMBug())
+    return 0;
+
+  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
+    return 0;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
+  if (!Offset || (Offset->getImm() & 6) == 0)
+    return 0;
+
+  auto IsHazardFn = [TII] (MachineInstr *I) {
+    if (!SIInstrInfo::isMIMG(*I))
+      return false;
+    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
+    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
+           TII->getInstSizeInBytes(*I) >= 16;
+  };
+
+  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
+}
+
+int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
+  int FPAtomicToDenormModeWaitStates = 3;
+
+  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
+    return 0;
+
+  auto IsHazardFn = [] (MachineInstr *I) {
+    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
+      return false;
+    return SIInstrInfo::isFPAtomic(*I);
+  };
+
+  auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
+    if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
+      return true;
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::S_WAITCNT:
+    case AMDGPU::S_WAITCNT_VSCNT:
+    case AMDGPU::S_WAITCNT_VMCNT:
+    case AMDGPU::S_WAITCNT_EXPCNT:
+    case AMDGPU::S_WAITCNT_LGKMCNT:
+    case AMDGPU::S_WAITCNT_IDLE:
+      return true;
+    default:
+      break;
+    }
+
+    return false;
+  };
+
+
+  return FPAtomicToDenormModeWaitStates -
+         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
+}
+
+int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
+  assert(SIInstrInfo::isMAI(*MI));
+
+  int WaitStatesNeeded = 0;
+  unsigned Opc = MI->getOpcode();
+
+  auto IsVALUFn = [] (MachineInstr *MI) {
+    return SIInstrInfo::isVALU(*MI);
+  };
+
+  if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
+    const int LegacyVALUWritesVGPRWaitStates = 2;
+    const int VALUWritesExecWaitStates = 4;
+    const int MaxWaitStates = 4;
+
+    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
+      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+    if (WaitStatesNeeded < MaxWaitStates) {
+      for (const MachineOperand &Use : MI->explicit_uses()) {
+        const int MaxWaitStates = 2;
+
+        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
+          continue;
+
+        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
+          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
+        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+        if (WaitStatesNeeded == MaxWaitStates)
+          break;
+      }
+    }
+  }
+
+  auto IsMFMAFn = [] (MachineInstr *MI) {
+    return SIInstrInfo::isMAI(*MI) &&
+           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
+           MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
+  };
+
+  for (const MachineOperand &Op : MI->explicit_operands()) {
+    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
+      continue;
+
+    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
+      continue;
+
+    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
+    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
+    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
+    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
+    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
+    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
+    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
+    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
+    const int MaxWaitStates = 18;
+    unsigned Reg = Op.getReg();
+    unsigned HazardDefLatency = 0;
+
+    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
+                              (MachineInstr *MI) {
+      if (!IsMFMAFn(MI))
+        return false;
+      unsigned DstReg = MI->getOperand(0).getReg();
+      if (DstReg == Reg)
+        return false;
+      HazardDefLatency = std::max(HazardDefLatency,
+                                  TSchedModel.computeInstrLatency(MI));
+      return TRI.regsOverlap(DstReg, Reg);
+    };
+
+    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
+                                                   MaxWaitStates);
+    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
+    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+    int OpNo = MI->getOperandNo(&Op);
+    if (OpNo == SrcCIdx) {
+      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
+    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
+      switch (HazardDefLatency) {
+      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
+               break;
+      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
+               break;
+      case 16: LLVM_FALLTHROUGH;
+      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
+               break;
+      }
+    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
+      switch (HazardDefLatency) {
+      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
+               break;
+      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
+               break;
+      case 16: LLVM_FALLTHROUGH;
+      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
+               break;
+      }
+    }
+
+    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+    if (WaitStatesNeeded == MaxWaitStates)
+      return WaitStatesNeeded; // Early exit.
+
+    auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
+      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
+        return false;
+      unsigned DstReg = MI->getOperand(0).getReg();
+      return TRI.regsOverlap(Reg, DstReg);
+    };
+
+    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
+    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
+    const int AccVGPRWriteAccVgprReadWaitStates = 3;
+    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
+    if (OpNo == SrcCIdx)
+      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
+    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
+      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
+
+    WaitStatesNeededForUse = NeedWaitStates -
+      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+    if (WaitStatesNeeded == MaxWaitStates)
+      return WaitStatesNeeded; // Early exit.
+  }
+
+  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
+    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
+    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
+    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
+    const int MaxWaitStates = 13;
+    unsigned DstReg = MI->getOperand(0).getReg();
+    unsigned HazardDefLatency = 0;
+
+    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
+                        (MachineInstr *MI) {
+      if (!IsMFMAFn(MI))
+        return false;
+      unsigned Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
+      HazardDefLatency = std::max(HazardDefLatency,
+                                  TSchedModel.computeInstrLatency(MI));
+      return TRI.regsOverlap(Reg, DstReg);
+    };
+
+    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
+    int NeedWaitStates;
+    switch (HazardDefLatency) {
+    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
+             break;
+    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
+             break;
+    case 16: LLVM_FALLTHROUGH;
+    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
+             break;
+    }
+
+    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+  }
+
+  return WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
+  if (!ST.hasMAIInsts())
+    return 0;
+
+  int WaitStatesNeeded = 0;
+
+  auto IsAccVgprReadFn = [] (MachineInstr *MI) {
+    return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
+  };
+
+  for (const MachineOperand &Op : MI->explicit_uses()) {
+    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
+      continue;
+
+    unsigned Reg = Op.getReg();
+
+    const int AccVgprReadLdStWaitStates = 2;
+    const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
+    const int MaxWaitStates = 2;
+
+    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
+      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+
+    if (WaitStatesNeeded == MaxWaitStates)
+      return WaitStatesNeeded; // Early exit.
+
+    auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) {
+      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
+        return false;
+      auto IsVALUFn = [] (MachineInstr *MI) {
+        return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
+      };
+      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
+             std::numeric_limits<int>::max();
+    };
+
+    WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates -
+      getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+  }
+
+  return WaitStatesNeeded;
+}
```
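The recurring pattern in this change is that every hazard check now passes a Limit equal to the wait states it needs, so getWaitStatesSince can stop scanning once the hazard could no longer matter. A minimal standalone sketch of that bounded scan, independent of LLVM's MachineInstr machinery (the `Instr` struct and names below are illustrative, not the real API):

```cpp
#include <deque>
#include <functional>
#include <limits>

// Stand-in for a machine instruction; the real code walks llvm::MachineInstr.
struct Instr {
  int Opcode = 0;
};

// Walk previously emitted instructions (most recent first) and count wait
// states until one satisfies IsHazard. A nullptr entry models an inserted
// s_nop. Once Limit wait states have elapsed the hazard can no longer
// apply, so the scan gives up and reports "no hazard" as INT_MAX.
int waitStatesSince(const std::deque<const Instr *> &Emitted,
                    const std::function<bool(const Instr &)> &IsHazard,
                    int Limit) {
  int WaitStates = 0;
  for (const Instr *I : Emitted) {
    if (I && IsHazard(*I))
      return WaitStates;
    ++WaitStates;
    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}
```

A caller that needs N wait states then inserts max(0, N - waitStatesSince(...)) noops; because "no hazard" is INT_MAX, the subtraction clamps to zero for hazards that are too far back, which is what lets the new code bound every scan.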
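In hazard-recognizer mode (the post-RA path added here), the search is no longer a flat scan of EmittedInstrs: the static ::getWaitStatesSince walks backwards through the basic block and then recurses into every unvisited predecessor, taking the minimum wait states over paths and treating paths that expire as "no hazard". A simplified standalone version of that traversal, again with illustrative types rather than LLVM's:

```cpp
#include <algorithm>
#include <functional>
#include <limits>
#include <set>
#include <vector>

struct Block {
  std::vector<int> Instrs;    // opcodes in program order
  std::vector<Block *> Preds; // predecessor blocks in the CFG
};

// Scan backwards from Instrs[FromIdx - 1]; if no hazard is found in this
// block, recurse into every unvisited predecessor and keep the minimum.
// Paths that reach Limit report INT_MAX, i.e. "expired, no hazard".
int waitStatesSince(const Block *B, std::size_t FromIdx, int WaitStates,
                    int Limit, const std::function<bool(int)> &IsHazard,
                    std::set<const Block *> &Visited) {
  for (std::size_t I = FromIdx; I-- > 0;) {
    if (IsHazard(B->Instrs[I]))
      return WaitStates;
    if (++WaitStates >= Limit)
      return std::numeric_limits<int>::max();
  }
  int Min = std::numeric_limits<int>::max();
  for (const Block *P : B->Preds) {
    if (!Visited.insert(P).second)
      continue; // each block is searched at most once
    Min = std::min(Min, waitStatesSince(P, P->Instrs.size(), WaitStates,
                                        Limit, IsHazard, Visited));
  }
  return Min;
}
```

The real function additionally skips BUNDLE markers, inline asm, and debug instructions, and lets IsExpired consult the running wait-state count, but the overall shape, a depth-first walk over predecessors with early expiry, is the same.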
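The reworded comment in checkSoftClauseHazards states the clause rule: when a clause has more than one instruction, no instruction may write a register that any instruction in the clause reads (including itself), otherwise the clause must be broken with a non-SMEM instruction. With registers tracked as bits over register units, as the recognizer's ClauseUses/ClauseDefs fields do, the test is a set intersection; a toy version with an assumed, illustrative register-unit count:

```cpp
#include <bitset>
#include <cstddef>

constexpr std::size_t NumRegUnits = 256; // illustrative, not the real count

// A multi-instruction clause must be broken if any register written inside
// the clause is also read inside the clause (an instruction reading its own
// result counts too).
bool clauseNeedsBreak(const std::bitset<NumRegUnits> &ClauseDefs,
                      const std::bitset<NumRegUnits> &ClauseUses) {
  return (ClauseDefs & ClauseUses).any();
}
```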
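Finally, checkMAIHazards keys the required wait states off the producing MFMA's latency (2, 8, or 16 cycles for the 4x4, 16x16, and 32x32 variants in the switches above) and folds max(0, needed - elapsed) into WaitStatesNeeded. That arithmetic, extracted as a standalone sketch using the constants from the diff:

```cpp
#include <algorithm>

// Wait states required between an MFMA that writes an AGPR and a later
// v_accvgpr_read of an overlapping register, selected by the MFMA's
// latency exactly as in the corresponding switch in checkMAIHazards.
int mfmaWriteThenAccVgprReadWaitStates(unsigned HazardDefLatency) {
  switch (HazardDefLatency) {
  case 2:  return 4;  // MFMA4x4WritesAGPRAccVgprReadWaitStates
  case 8:  return 10; // MFMA16x16WritesAGPRAccVgprReadWaitStates
  case 16:            // treated like the default case
  default: return 18; // MFMA32x32WritesAGPRAccVgprReadWaitStates
  }
}

// Wait states still outstanding once WaitStatesSinceDef have already
// elapsed; this is the value folded into WaitStatesNeeded via std::max.
int waitStatesStillNeeded(unsigned HazardDefLatency, int WaitStatesSinceDef) {
  return std::max(0, mfmaWriteThenAccVgprReadWaitStates(HazardDefLatency) -
                         WaitStatesSinceDef);
}
```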