aboutsummaryrefslogtreecommitdiff
path: root/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp')
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp2058
1 files changed, 2058 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
new file mode 100644
index 000000000000..6b67b76652ed
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -0,0 +1,2058 @@
+//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIMachineScheduler.h"
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-scheduler"
+
+// This scheduler implements a different scheduling algorithm than
+// GenericScheduler.
+//
+// There are several specific architecture behaviours that can't be modelled
+// for GenericScheduler:
+// . When accessing the result of an SGPR load instruction, you have to wait
+// for all the SGPR load instructions before your current instruction to
+// have finished.
+// . When accessing the result of an VGPR load instruction, you have to wait
+// for all the VGPR load instructions previous to the VGPR load instruction
+// you are interested in to finish.
+// . The less the register pressure, the best load latencies are hidden
+//
+// Moreover some specifities (like the fact a lot of instructions in the shader
+// have few dependencies) makes the generic scheduler have some unpredictable
+// behaviours. For example when register pressure becomes high, it can either
+// manage to prevent register pressure from going too high, or it can
+// increase register pressure even more than if it hadn't taken register
+// pressure into account.
+//
+// Also some other bad behaviours are generated, like loading at the beginning
+// of the shader a constant in VGPR you won't need until the end of the shader.
+//
+// The scheduling problem for SI can distinguish three main parts:
+// . Hiding high latencies (texture sampling, etc)
+// . Hiding low latencies (SGPR constant loading, etc)
+// . Keeping register usage low for better latency hiding and general
+// performance
+//
+// Some other things can also affect performance, but are hard to predict
+// (cache usage, the fact the HW can issue several instructions from different
+// wavefronts if different types, etc)
+//
+// This scheduler tries to solve the scheduling problem by dividing it into
+// simpler sub-problems. It divides the instructions into blocks, schedules
+// locally inside the blocks where it takes care of low latencies, and then
+// chooses the order of the blocks by taking care of high latencies.
+// Dividing the instructions into blocks helps control keeping register
+// usage low.
+//
+// First the instructions are put into blocks.
+// We want the blocks help control register usage and hide high latencies
+// later. To help control register usage, we typically want all local
+// computations, when for example you create a result that can be comsummed
+// right away, to be contained in a block. Block inputs and outputs would
+// typically be important results that are needed in several locations of
+// the shader. Since we do want blocks to help hide high latencies, we want
+// the instructions inside the block to have a minimal set of dependencies
+// on high latencies. It will make it easy to pick blocks to hide specific
+// high latencies.
+// The block creation algorithm is divided into several steps, and several
+// variants can be tried during the scheduling process.
+//
+// Second the order of the instructions inside the blocks is chosen.
+// At that step we do take into account only register usage and hiding
+// low latency instructions
+//
+// Third the block order is chosen, there we try to hide high latencies
+// and keep register usage low.
+//
+// After the third step, a pass is done to improve the hiding of low
+// latencies.
+//
+// Actually when talking about 'low latency' or 'high latency' it includes
+// both the latency to get the cache (or global mem) data go to the register,
+// and the bandwidth limitations.
+// Increasing the number of active wavefronts helps hide the former, but it
+// doesn't solve the latter, thus why even if wavefront count is high, we have
+// to try have as many instructions hiding high latencies as possible.
+// The OpenCL doc says for example latency of 400 cycles for a global mem access,
+// which is hidden by 10 instructions if the wavefront count is 10.
+
+// Some figures taken from AMD docs:
+// Both texture and constant L1 caches are 4-way associative with 64 bytes
+// lines.
+// Constant cache is shared with 4 CUs.
+// For texture sampling, the address generation unit receives 4 texture
+// addresses per cycle, thus we could expect texture sampling latency to be
+// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
+// instructions in a wavefront group are executed every 4 cycles),
+// or 16 instructions if the other wavefronts associated to the 3 other VALUs
+// of the CU do texture sampling too. (Don't take these figures too seriously,
+// as I'm not 100% sure of the computation)
+// Data exports should get similar latency.
+// For constant loading, the cache is shader with 4 CUs.
+// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
+// I guess if the other CU don't read the cache, it can go up to 64B/cycle.
+// It means a simple s_buffer_load should take one instruction to hide, as
+// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
+// cache line.
+//
+// As of today the driver doesn't preload the constants in cache, thus the
+// first loads get extra latency. The doc says global memory access can be
+// 300-600 cycles. We do not specially take that into account when scheduling
+// As we expect the driver to be able to preload the constants soon.
+
+// common code //
+
+#ifndef NDEBUG
+
+static const char *getReasonStr(SIScheduleCandReason Reason) {
+ switch (Reason) {
+ case NoCand: return "NOCAND";
+ case RegUsage: return "REGUSAGE";
+ case Latency: return "LATENCY";
+ case Successor: return "SUCCESSOR";
+ case Depth: return "DEPTH";
+ case NodeOrder: return "ORDER";
+ }
+ llvm_unreachable("Unknown reason!");
+}
+
+#endif
+
+static bool tryLess(int TryVal, int CandVal,
+ SISchedulerCandidate &TryCand,
+ SISchedulerCandidate &Cand,
+ SIScheduleCandReason Reason) {
+ if (TryVal < CandVal) {
+ TryCand.Reason = Reason;
+ return true;
+ }
+ if (TryVal > CandVal) {
+ if (Cand.Reason > Reason)
+ Cand.Reason = Reason;
+ return true;
+ }
+ Cand.setRepeat(Reason);
+ return false;
+}
+
+static bool tryGreater(int TryVal, int CandVal,
+ SISchedulerCandidate &TryCand,
+ SISchedulerCandidate &Cand,
+ SIScheduleCandReason Reason) {
+ if (TryVal > CandVal) {
+ TryCand.Reason = Reason;
+ return true;
+ }
+ if (TryVal < CandVal) {
+ if (Cand.Reason > Reason)
+ Cand.Reason = Reason;
+ return true;
+ }
+ Cand.setRepeat(Reason);
+ return false;
+}
+
+// SIScheduleBlock //
+
+void SIScheduleBlock::addUnit(SUnit *SU) {
+ NodeNum2Index[SU->NodeNum] = SUnits.size();
+ SUnits.push_back(SU);
+}
+
+#ifndef NDEBUG
+void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
+
+ dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
+ dbgs() << '\n';
+}
+#endif
+
+void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
+ SISchedCandidate &TryCand) {
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return;
+ }
+
+ if (Cand.SGPRUsage > 60 &&
+ tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
+ return;
+
+ // Schedule low latency instructions as top as possible.
+ // Order of priority is:
+ // . Low latency instructions which do not depend on other low latency
+ // instructions we haven't waited for
+ // . Other instructions which do not depend on low latency instructions
+ // we haven't waited for
+ // . Low latencies
+ // . All other instructions
+ // Goal is to get: low latency instructions - independent instructions
+ // - (eventually some more low latency instructions)
+ // - instructions that depend on the first low latency instructions.
+ // If in the block there is a lot of constant loads, the SGPR usage
+ // could go quite high, thus above the arbitrary limit of 60 will encourage
+ // use the already loaded constants (in order to release some SGPRs) before
+ // loading more.
+ if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
+ Cand.HasLowLatencyNonWaitedParent,
+ TryCand, Cand, SIScheduleCandReason::Depth))
+ return;
+
+ if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
+ TryCand, Cand, SIScheduleCandReason::Depth))
+ return;
+
+ if (TryCand.IsLowLatency &&
+ tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
+ TryCand, Cand, SIScheduleCandReason::Depth))
+ return;
+
+ if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
+ return;
+
+ // Fall through to original instruction order.
+ if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
+ TryCand.Reason = NodeOrder;
+ }
+}
+
+SUnit* SIScheduleBlock::pickNode() {
+ SISchedCandidate TopCand;
+
+ for (SUnit* SU : TopReadySUs) {
+ SISchedCandidate TryCand;
+ std::vector<unsigned> pressure;
+ std::vector<unsigned> MaxPressure;
+ // Predict register usage after this instruction.
+ TryCand.SU = SU;
+ TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
+ TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
+ TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
+ TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
+ TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
+ TryCand.HasLowLatencyNonWaitedParent =
+ HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
+ tryCandidateTopDown(TopCand, TryCand);
+ if (TryCand.Reason != NoCand)
+ TopCand.setBest(TryCand);
+ }
+
+ return TopCand.SU;
+}
+
+
+// Schedule something valid.
+void SIScheduleBlock::fastSchedule() {
+ TopReadySUs.clear();
+ if (Scheduled)
+ undoSchedule();
+
+ for (SUnit* SU : SUnits) {
+ if (!SU->NumPredsLeft)
+ TopReadySUs.push_back(SU);
+ }
+
+ while (!TopReadySUs.empty()) {
+ SUnit *SU = TopReadySUs[0];
+ ScheduledSUnits.push_back(SU);
+ nodeScheduled(SU);
+ }
+
+ Scheduled = true;
+}
+
+// Returns if the register was set between first and last.
+static bool isDefBetween(unsigned Reg,
+ SlotIndex First, SlotIndex Last,
+ const MachineRegisterInfo *MRI,
+ const LiveIntervals *LIS) {
+ for (MachineRegisterInfo::def_instr_iterator
+ UI = MRI->def_instr_begin(Reg),
+ UE = MRI->def_instr_end(); UI != UE; ++UI) {
+ const MachineInstr* MI = &*UI;
+ if (MI->isDebugValue())
+ continue;
+ SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
+ if (InstSlot >= First && InstSlot <= Last)
+ return true;
+ }
+ return false;
+}
+
+void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
+ MachineBasicBlock::iterator EndBlock) {
+ IntervalPressure Pressure, BotPressure;
+ RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
+ LiveIntervals *LIS = DAG->getLIS();
+ MachineRegisterInfo *MRI = DAG->getMRI();
+ DAG->initRPTracker(TopRPTracker);
+ DAG->initRPTracker(BotRPTracker);
+ DAG->initRPTracker(RPTracker);
+
+ // Goes though all SU. RPTracker captures what had to be alive for the SUs
+ // to execute, and what is still alive at the end.
+ for (SUnit* SU : ScheduledSUnits) {
+ RPTracker.setPos(SU->getInstr());
+ RPTracker.advance();
+ }
+
+ // Close the RPTracker to finalize live ins/outs.
+ RPTracker.closeRegion();
+
+ // Initialize the live ins and live outs.
+ TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
+ BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
+
+ // Do not Track Physical Registers, because it messes up.
+ for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
+ if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
+ LiveInRegs.insert(RegMaskPair.RegUnit);
+ }
+ LiveOutRegs.clear();
+ // There is several possibilities to distinguish:
+ // 1) Reg is not input to any instruction in the block, but is output of one
+ // 2) 1) + read in the block and not needed after it
+ // 3) 1) + read in the block but needed in another block
+ // 4) Reg is input of an instruction but another block will read it too
+ // 5) Reg is input of an instruction and then rewritten in the block.
+ // result is not read in the block (implies used in another block)
+ // 6) Reg is input of an instruction and then rewritten in the block.
+ // result is read in the block and not needed in another block
+ // 7) Reg is input of an instruction and then rewritten in the block.
+ // result is read in the block but also needed in another block
+ // LiveInRegs will contains all the regs in situation 4, 5, 6, 7
+ // We want LiveOutRegs to contain only Regs whose content will be read after
+ // in another block, and whose content was written in the current block,
+ // that is we want it to get 1, 3, 5, 7
+ // Since we made the MIs of a block to be packed all together before
+ // scheduling, then the LiveIntervals were correct, and the RPTracker was
+ // able to correctly handle 5 vs 6, 2 vs 3.
+ // (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
+ // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
+ // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7
+ // The use of findDefBetween removes the case 4.
+ for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
+ unsigned Reg = RegMaskPair.RegUnit;
+ if (TargetRegisterInfo::isVirtualRegister(Reg) &&
+ isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
+ LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
+ LIS)) {
+ LiveOutRegs.insert(Reg);
+ }
+ }
+
+ // Pressure = sum_alive_registers register size
+ // Internally llvm will represent some registers as big 128 bits registers
+ // for example, but they actually correspond to 4 actual 32 bits registers.
+ // Thus Pressure is not equal to num_alive_registers * constant.
+ LiveInPressure = TopPressure.MaxSetPressure;
+ LiveOutPressure = BotPressure.MaxSetPressure;
+
+ // Prepares TopRPTracker for top down scheduling.
+ TopRPTracker.closeTop();
+}
+
+void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
+ MachineBasicBlock::iterator EndBlock) {
+ if (!Scheduled)
+ fastSchedule();
+
+ // PreScheduling phase to set LiveIn and LiveOut.
+ initRegPressure(BeginBlock, EndBlock);
+ undoSchedule();
+
+ // Schedule for real now.
+
+ TopReadySUs.clear();
+
+ for (SUnit* SU : SUnits) {
+ if (!SU->NumPredsLeft)
+ TopReadySUs.push_back(SU);
+ }
+
+ while (!TopReadySUs.empty()) {
+ SUnit *SU = pickNode();
+ ScheduledSUnits.push_back(SU);
+ TopRPTracker.setPos(SU->getInstr());
+ TopRPTracker.advance();
+ nodeScheduled(SU);
+ }
+
+ // TODO: compute InternalAdditionnalPressure.
+ InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
+
+ // Check everything is right.
+#ifndef NDEBUG
+ assert(SUnits.size() == ScheduledSUnits.size() &&
+ TopReadySUs.empty());
+ for (SUnit* SU : SUnits) {
+ assert(SU->isScheduled &&
+ SU->NumPredsLeft == 0);
+ }
+#endif
+
+ Scheduled = true;
+}
+
+void SIScheduleBlock::undoSchedule() {
+ for (SUnit* SU : SUnits) {
+ SU->isScheduled = false;
+ for (SDep& Succ : SU->Succs) {
+ if (BC->isSUInBlock(Succ.getSUnit(), ID))
+ undoReleaseSucc(SU, &Succ);
+ }
+ }
+ HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
+ ScheduledSUnits.clear();
+ Scheduled = false;
+}
+
+void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
+ SUnit *SuccSU = SuccEdge->getSUnit();
+
+ if (SuccEdge->isWeak()) {
+ ++SuccSU->WeakPredsLeft;
+ return;
+ }
+ ++SuccSU->NumPredsLeft;
+}
+
+void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
+ SUnit *SuccSU = SuccEdge->getSUnit();
+
+ if (SuccEdge->isWeak()) {
+ --SuccSU->WeakPredsLeft;
+ return;
+ }
+#ifndef NDEBUG
+ if (SuccSU->NumPredsLeft == 0) {
+ dbgs() << "*** Scheduling failed! ***\n";
+ SuccSU->dump(DAG);
+ dbgs() << " has been released too many times!\n";
+ llvm_unreachable(nullptr);
+ }
+#endif
+
+ --SuccSU->NumPredsLeft;
+}
+
+/// Release Successors of the SU that are in the block or not.
+void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
+ for (SDep& Succ : SU->Succs) {
+ SUnit *SuccSU = Succ.getSUnit();
+
+ if (SuccSU->NodeNum >= DAG->SUnits.size())
+ continue;
+
+ if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
+ continue;
+
+ releaseSucc(SU, &Succ);
+ if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
+ TopReadySUs.push_back(SuccSU);
+ }
+}
+
+void SIScheduleBlock::nodeScheduled(SUnit *SU) {
+ // Is in TopReadySUs
+ assert (!SU->NumPredsLeft);
+ std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
+ if (I == TopReadySUs.end()) {
+ dbgs() << "Data Structure Bug in SI Scheduler\n";
+ llvm_unreachable(nullptr);
+ }
+ TopReadySUs.erase(I);
+
+ releaseSuccessors(SU, true);
+ // Scheduling this node will trigger a wait,
+ // thus propagate to other instructions that they do not need to wait either.
+ if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
+ HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
+
+ if (DAG->IsLowLatencySU[SU->NodeNum]) {
+ for (SDep& Succ : SU->Succs) {
+ std::map<unsigned, unsigned>::iterator I =
+ NodeNum2Index.find(Succ.getSUnit()->NodeNum);
+ if (I != NodeNum2Index.end())
+ HasLowLatencyNonWaitedParent[I->second] = 1;
+ }
+ }
+ SU->isScheduled = true;
+}
+
+void SIScheduleBlock::finalizeUnits() {
+ // We remove links from outside blocks to enable scheduling inside the block.
+ for (SUnit* SU : SUnits) {
+ releaseSuccessors(SU, false);
+ if (DAG->IsHighLatencySU[SU->NodeNum])
+ HighLatencyBlock = true;
+ }
+ HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
+}
+
+// we maintain ascending order of IDs
+void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
+ unsigned PredID = Pred->getID();
+
+ // Check if not already predecessor.
+ for (SIScheduleBlock* P : Preds) {
+ if (PredID == P->getID())
+ return;
+ }
+ Preds.push_back(Pred);
+
+ assert(none_of(Succs,
+ [=](std::pair<SIScheduleBlock*,
+ SIScheduleBlockLinkKind> S) {
+ return PredID == S.first->getID();
+ }) &&
+ "Loop in the Block Graph!");
+}
+
+void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
+ SIScheduleBlockLinkKind Kind) {
+ unsigned SuccID = Succ->getID();
+
+ // Check if not already predecessor.
+ for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
+ if (SuccID == S.first->getID()) {
+ if (S.second == SIScheduleBlockLinkKind::NoData &&
+ Kind == SIScheduleBlockLinkKind::Data)
+ S.second = Kind;
+ return;
+ }
+ }
+ if (Succ->isHighLatencyBlock())
+ ++NumHighLatencySuccessors;
+ Succs.push_back(std::make_pair(Succ, Kind));
+
+ assert(none_of(Preds,
+ [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
+ "Loop in the Block Graph!");
+}
+
+#ifndef NDEBUG
+void SIScheduleBlock::printDebug(bool full) {
+ dbgs() << "Block (" << ID << ")\n";
+ if (!full)
+ return;
+
+ dbgs() << "\nContains High Latency Instruction: "
+ << HighLatencyBlock << '\n';
+ dbgs() << "\nDepends On:\n";
+ for (SIScheduleBlock* P : Preds) {
+ P->printDebug(false);
+ }
+
+ dbgs() << "\nSuccessors:\n";
+ for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
+ if (S.second == SIScheduleBlockLinkKind::Data)
+ dbgs() << "(Data Dep) ";
+ S.first->printDebug(false);
+ }
+
+ if (Scheduled) {
+ dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
+ << LiveInPressure[DAG->getVGPRSetID()] << '\n';
+ dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
+ << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
+ dbgs() << "LiveIns:\n";
+ for (unsigned Reg : LiveInRegs)
+ dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+
+ dbgs() << "\nLiveOuts:\n";
+ for (unsigned Reg : LiveOutRegs)
+ dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+ }
+
+ dbgs() << "\nInstructions:\n";
+ if (!Scheduled) {
+ for (SUnit* SU : SUnits) {
+ SU->dump(DAG);
+ }
+ } else {
+ for (SUnit* SU : SUnits) {
+ SU->dump(DAG);
+ }
+ }
+
+ dbgs() << "///////////////////////\n";
+}
+#endif
+
+// SIScheduleBlockCreator //
+
+SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
+DAG(DAG) {
+}
+
+SIScheduleBlockCreator::~SIScheduleBlockCreator() = default;
+
+SIScheduleBlocks
+SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
+ std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
+ Blocks.find(BlockVariant);
+ if (B == Blocks.end()) {
+ SIScheduleBlocks Res;
+ createBlocksForVariant(BlockVariant);
+ topologicalSort();
+ scheduleInsideBlocks();
+ fillStats();
+ Res.Blocks = CurrentBlocks;
+ Res.TopDownIndex2Block = TopDownIndex2Block;
+ Res.TopDownBlock2Index = TopDownBlock2Index;
+ Blocks[BlockVariant] = Res;
+ return Res;
+ } else {
+ return B->second;
+ }
+}
+
+bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
+ if (SU->NodeNum >= DAG->SUnits.size())
+ return false;
+ return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
+}
+
+void SIScheduleBlockCreator::colorHighLatenciesAlone() {
+ unsigned DAGSize = DAG->SUnits.size();
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[i];
+ if (DAG->IsHighLatencySU[SU->NodeNum]) {
+ CurrentColoring[SU->NodeNum] = NextReservedID++;
+ }
+ }
+}
+
+static bool
+hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
+ for (const auto &PredDep : SU.Preds) {
+ if (PredDep.getSUnit() == &FromSU &&
+ PredDep.getKind() == llvm::SDep::Data)
+ return true;
+ }
+ return false;
+}
+
+void SIScheduleBlockCreator::colorHighLatenciesGroups() {
+ unsigned DAGSize = DAG->SUnits.size();
+ unsigned NumHighLatencies = 0;
+ unsigned GroupSize;
+ int Color = NextReservedID;
+ unsigned Count = 0;
+ std::set<unsigned> FormingGroup;
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[i];
+ if (DAG->IsHighLatencySU[SU->NodeNum])
+ ++NumHighLatencies;
+ }
+
+ if (NumHighLatencies == 0)
+ return;
+
+ if (NumHighLatencies <= 6)
+ GroupSize = 2;
+ else if (NumHighLatencies <= 12)
+ GroupSize = 3;
+ else
+ GroupSize = 4;
+
+ for (unsigned SUNum : DAG->TopDownIndex2SU) {
+ const SUnit &SU = DAG->SUnits[SUNum];
+ if (DAG->IsHighLatencySU[SU.NodeNum]) {
+ unsigned CompatibleGroup = true;
+ int ProposedColor = Color;
+ std::vector<int> AdditionalElements;
+
+ // We don't want to put in the same block
+ // two high latency instructions that depend
+ // on each other.
+ // One way would be to check canAddEdge
+ // in both directions, but that currently is not
+ // enough because there the high latency order is
+ // enforced (via links).
+ // Instead, look at the dependencies between the
+ // high latency instructions and deduce if it is
+ // a data dependency or not.
+ for (unsigned j : FormingGroup) {
+ bool HasSubGraph;
+ std::vector<int> SubGraph;
+ // By construction (topological order), if SU and
+ // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
+ // in the parent graph of SU.
+#ifndef NDEBUG
+ SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
+ HasSubGraph);
+ assert(!HasSubGraph);
+#endif
+ SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
+ HasSubGraph);
+ if (!HasSubGraph)
+ continue; // No dependencies between each other
+ else if (SubGraph.size() > 5) {
+ // Too many elements would be required to be added to the block.
+ CompatibleGroup = false;
+ break;
+ }
+ else {
+ // Check the type of dependency
+ for (unsigned k : SubGraph) {
+ // If in the path to join the two instructions,
+ // there is another high latency instruction,
+ // or instructions colored for another block
+ // abort the merge.
+ if (DAG->IsHighLatencySU[k] ||
+ (CurrentColoring[k] != ProposedColor &&
+ CurrentColoring[k] != 0)) {
+ CompatibleGroup = false;
+ break;
+ }
+ // If one of the SU in the subgraph depends on the result of SU j,
+ // there'll be a data dependency.
+ if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
+ CompatibleGroup = false;
+ break;
+ }
+ }
+ if (!CompatibleGroup)
+ break;
+ // Same check for the SU
+ if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
+ CompatibleGroup = false;
+ break;
+ }
+ // Add all the required instructions to the block
+ // These cannot live in another block (because they
+ // depend (order dependency) on one of the
+ // instruction in the block, and are required for the
+ // high latency instruction we add.
+ AdditionalElements.insert(AdditionalElements.end(),
+ SubGraph.begin(), SubGraph.end());
+ }
+ }
+ if (CompatibleGroup) {
+ FormingGroup.insert(SU.NodeNum);
+ for (unsigned j : AdditionalElements)
+ CurrentColoring[j] = ProposedColor;
+ CurrentColoring[SU.NodeNum] = ProposedColor;
+ ++Count;
+ }
+ // Found one incompatible instruction,
+ // or has filled a big enough group.
+ // -> start a new one.
+ if (!CompatibleGroup) {
+ FormingGroup.clear();
+ Color = ++NextReservedID;
+ ProposedColor = Color;
+ FormingGroup.insert(SU.NodeNum);
+ CurrentColoring[SU.NodeNum] = ProposedColor;
+ Count = 0;
+ } else if (Count == GroupSize) {
+ FormingGroup.clear();
+ Color = ++NextReservedID;
+ ProposedColor = Color;
+ Count = 0;
+ }
+ }
+ }
+}
+
+void SIScheduleBlockCreator::colorComputeReservedDependencies() {
+ unsigned DAGSize = DAG->SUnits.size();
+ std::map<std::set<unsigned>, unsigned> ColorCombinations;
+
+ CurrentTopDownReservedDependencyColoring.clear();
+ CurrentBottomUpReservedDependencyColoring.clear();
+
+ CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
+ CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);
+
+ // Traverse TopDown, and give different colors to SUs depending
+ // on which combination of High Latencies they depend on.
+
+ for (unsigned SUNum : DAG->TopDownIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
+ std::set<unsigned> SUColors;
+
+ // Already given.
+ if (CurrentColoring[SU->NodeNum]) {
+ CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+ CurrentColoring[SU->NodeNum];
+ continue;
+ }
+
+ for (SDep& PredDep : SU->Preds) {
+ SUnit *Pred = PredDep.getSUnit();
+ if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
+ continue;
+ if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
+ SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
+ }
+ // Color 0 by default.
+ if (SUColors.empty())
+ continue;
+ // Same color than parents.
+ if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
+ CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+ *SUColors.begin();
+ else {
+ std::map<std::set<unsigned>, unsigned>::iterator Pos =
+ ColorCombinations.find(SUColors);
+ if (Pos != ColorCombinations.end()) {
+ CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
+ } else {
+ CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+ NextNonReservedID;
+ ColorCombinations[SUColors] = NextNonReservedID++;
+ }
+ }
+ }
+
+ ColorCombinations.clear();
+
+ // Same as before, but BottomUp.
+
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
+ std::set<unsigned> SUColors;
+
+ // Already given.
+ if (CurrentColoring[SU->NodeNum]) {
+ CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+ CurrentColoring[SU->NodeNum];
+ continue;
+ }
+
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
+ if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
+ SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
+ }
+ // Keep color 0.
+ if (SUColors.empty())
+ continue;
+ // Same color than parents.
+ if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
+ CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+ *SUColors.begin();
+ else {
+ std::map<std::set<unsigned>, unsigned>::iterator Pos =
+ ColorCombinations.find(SUColors);
+ if (Pos != ColorCombinations.end()) {
+ CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
+ } else {
+ CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+ NextNonReservedID;
+ ColorCombinations[SUColors] = NextNonReservedID++;
+ }
+ }
+ }
+}
+
+void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
+ unsigned DAGSize = DAG->SUnits.size();
+ std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
+
+ // Every combination of colors given by the top down
+ // and bottom up Reserved node dependency
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[i];
+ std::pair<unsigned, unsigned> SUColors;
+
+ // High latency instructions: already given.
+ if (CurrentColoring[SU->NodeNum])
+ continue;
+
+ SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
+ SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
+
+ std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
+ ColorCombinations.find(SUColors);
+ if (Pos != ColorCombinations.end()) {
+ CurrentColoring[SU->NodeNum] = Pos->second;
+ } else {
+ CurrentColoring[SU->NodeNum] = NextNonReservedID;
+ ColorCombinations[SUColors] = NextNonReservedID++;
+ }
+ }
+}
+
+void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
+ unsigned DAGSize = DAG->SUnits.size();
+ std::vector<int> PendingColoring = CurrentColoring;
+
+ assert(DAGSize >= 1 &&
+ CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
+ CurrentTopDownReservedDependencyColoring.size() == DAGSize);
+ // If there is no reserved block at all, do nothing. We don't want
+ // everything in one block.
+ if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
+ CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
+ *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
+ CurrentTopDownReservedDependencyColoring.end()) == 0)
+ return;
+
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
+ std::set<unsigned> SUColors;
+ std::set<unsigned> SUColorsPending;
+
+ if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+ continue;
+
+ if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
+ CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
+ continue;
+
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
+ if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
+ CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
+ SUColors.insert(CurrentColoring[Succ->NodeNum]);
+ SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
+ }
+ // If there is only one child/parent block, and that block
+ // is not among the ones we are removing in this path, then
+ // merge the instruction to that block
+ if (SUColors.size() == 1 && SUColorsPending.size() == 1)
+ PendingColoring[SU->NodeNum] = *SUColors.begin();
+ else // TODO: Attribute new colors depending on color
+ // combination of children.
+ PendingColoring[SU->NodeNum] = NextNonReservedID++;
+ }
+ CurrentColoring = PendingColoring;
+}
+
+
+void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
+ unsigned DAGSize = DAG->SUnits.size();
+ unsigned PreviousColor;
+ std::set<unsigned> SeenColors;
+
+ if (DAGSize <= 1)
+ return;
+
+ PreviousColor = CurrentColoring[0];
+
+ for (unsigned i = 1, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[i];
+ unsigned CurrentColor = CurrentColoring[i];
+ unsigned PreviousColorSave = PreviousColor;
+ assert(i == SU->NodeNum);
+
+ if (CurrentColor != PreviousColor)
+ SeenColors.insert(PreviousColor);
+ PreviousColor = CurrentColor;
+
+ if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+ continue;
+
+ if (SeenColors.find(CurrentColor) == SeenColors.end())
+ continue;
+
+ if (PreviousColorSave != CurrentColor)
+ CurrentColoring[i] = NextNonReservedID++;
+ else
+ CurrentColoring[i] = CurrentColoring[i-1];
+ }
+}
+
+void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
+ unsigned DAGSize = DAG->SUnits.size();
+
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
+ std::set<unsigned> SUColors;
+
+ if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+ continue;
+
+ // No predecessor: Vgpr constant loading.
+ // Low latency instructions usually have a predecessor (the address)
+ if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
+ continue;
+
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
+ SUColors.insert(CurrentColoring[Succ->NodeNum]);
+ }
+ if (SUColors.size() == 1)
+ CurrentColoring[SU->NodeNum] = *SUColors.begin();
+ }
+}
+
+void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
+ unsigned DAGSize = DAG->SUnits.size();
+
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
+ std::set<unsigned> SUColors;
+
+ if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+ continue;
+
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
+ SUColors.insert(CurrentColoring[Succ->NodeNum]);
+ }
+ if (SUColors.size() == 1)
+ CurrentColoring[SU->NodeNum] = *SUColors.begin();
+ }
+}
+
+void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
+ unsigned DAGSize = DAG->SUnits.size();
+
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
+ std::set<unsigned> SUColors;
+
+ if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+ continue;
+
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
+ SUColors.insert(CurrentColoring[Succ->NodeNum]);
+ }
+ if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
+ CurrentColoring[SU->NodeNum] = *SUColors.begin();
+ }
+}
+
+void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
+ unsigned DAGSize = DAG->SUnits.size();
+ std::map<unsigned, unsigned> ColorCount;
+
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
+ unsigned color = CurrentColoring[SU->NodeNum];
+ ++ColorCount[color];
+ }
+
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
+ unsigned color = CurrentColoring[SU->NodeNum];
+ std::set<unsigned> SUColors;
+
+ if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+ continue;
+
+ if (ColorCount[color] > 1)
+ continue;
+
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
+ SUColors.insert(CurrentColoring[Succ->NodeNum]);
+ }
+ if (SUColors.size() == 1 && *SUColors.begin() != color) {
+ --ColorCount[color];
+ CurrentColoring[SU->NodeNum] = *SUColors.begin();
+ ++ColorCount[*SUColors.begin()];
+ }
+ }
+}
+
+void SIScheduleBlockCreator::cutHugeBlocks() {
+ // TODO
+}
+
+void SIScheduleBlockCreator::regroupNoUserInstructions() {
+ unsigned DAGSize = DAG->SUnits.size();
+ int GroupID = NextNonReservedID++;
+
+ for (unsigned SUNum : DAG->BottomUpIndex2SU) {
+ SUnit *SU = &DAG->SUnits[SUNum];
+ bool hasSuccessor = false;
+
+ if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+ continue;
+
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
+ hasSuccessor = true;
+ }
+ if (!hasSuccessor)
+ CurrentColoring[SU->NodeNum] = GroupID;
+ }
+}
+
+void SIScheduleBlockCreator::colorExports() {
+ unsigned ExportColor = NextNonReservedID++;
+ SmallVector<unsigned, 8> ExpGroup;
+
+ // Put all exports together in a block.
+ // The block will naturally end up being scheduled last,
+ // thus putting exports at the end of the schedule, which
+ // is better for performance.
+ // However we must ensure, for safety, the exports can be put
+ // together in the same block without any other instruction.
+ // This could happen, for example, when scheduling after regalloc
+ // if reloading a spilled register from memory using the same
+ // register than used in a previous export.
+ // If that happens, do not regroup the exports.
+ for (unsigned SUNum : DAG->TopDownIndex2SU) {
+ const SUnit &SU = DAG->SUnits[SUNum];
+ if (SIInstrInfo::isEXP(*SU.getInstr())) {
+ // Check the EXP can be added to the group safely,
+ // ie without needing any other instruction.
+ // The EXP is allowed to depend on other EXP
+ // (they will be in the same group).
+ for (unsigned j : ExpGroup) {
+ bool HasSubGraph;
+ std::vector<int> SubGraph;
+ // By construction (topological order), if SU and
+ // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
+ // in the parent graph of SU.
+#ifndef NDEBUG
+ SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
+ HasSubGraph);
+ assert(!HasSubGraph);
+#endif
+ SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
+ HasSubGraph);
+ if (!HasSubGraph)
+ continue; // No dependencies between each other
+
+ // SubGraph contains all the instructions required
+ // between EXP SUnits[j] and EXP SU.
+ for (unsigned k : SubGraph) {
+ if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
+ // Other instructions than EXP would be required in the group.
+ // Abort the groupping.
+ return;
+ }
+ }
+
+ ExpGroup.push_back(SUNum);
+ }
+ }
+
+ // The group can be formed. Give the color.
+ for (unsigned j : ExpGroup)
+ CurrentColoring[j] = ExportColor;
+}
+
+void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
+ unsigned DAGSize = DAG->SUnits.size();
+ std::map<unsigned,unsigned> RealID;
+
+ CurrentBlocks.clear();
+ CurrentColoring.clear();
+ CurrentColoring.resize(DAGSize, 0);
+ Node2CurrentBlock.clear();
+
+ // Restore links previous scheduling variant has overridden.
+ DAG->restoreSULinksLeft();
+
+ NextReservedID = 1;
+ NextNonReservedID = DAGSize + 1;
+
+ DEBUG(dbgs() << "Coloring the graph\n");
+
+ if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
+ colorHighLatenciesGroups();
+ else
+ colorHighLatenciesAlone();
+ colorComputeReservedDependencies();
+ colorAccordingToReservedDependencies();
+ colorEndsAccordingToDependencies();
+ if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
+ colorForceConsecutiveOrderInGroup();
+ regroupNoUserInstructions();
+ colorMergeConstantLoadsNextGroup();
+ colorMergeIfPossibleNextGroupOnlyForReserved();
+ colorExports();
+
+ // Put SUs of same color into same block
+ Node2CurrentBlock.resize(DAGSize, -1);
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[i];
+ unsigned Color = CurrentColoring[SU->NodeNum];
+ if (RealID.find(Color) == RealID.end()) {
+ int ID = CurrentBlocks.size();
+ BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID));
+ CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
+ RealID[Color] = ID;
+ }
+ CurrentBlocks[RealID[Color]]->addUnit(SU);
+ Node2CurrentBlock[SU->NodeNum] = RealID[Color];
+ }
+
+ // Build dependencies between blocks.
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[i];
+ int SUID = Node2CurrentBlock[i];
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
+ if (Node2CurrentBlock[Succ->NodeNum] != SUID)
+ CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
+ SuccDep.isCtrl() ? NoData : Data);
+ }
+ for (SDep& PredDep : SU->Preds) {
+ SUnit *Pred = PredDep.getSUnit();
+ if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
+ continue;
+ if (Node2CurrentBlock[Pred->NodeNum] != SUID)
+ CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
+ }
+ }
+
+ // Free root and leafs of all blocks to enable scheduling inside them.
+ for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ Block->finalizeUnits();
+ }
+ DEBUG(
+ dbgs() << "Blocks created:\n\n";
+ for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ Block->printDebug(true);
+ }
+ );
+}
+
+// Two functions taken from Codegen/MachineScheduler.cpp
+
+/// Non-const version.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I,
+ MachineBasicBlock::const_iterator End) {
+ for (; I != End; ++I) {
+ if (!I->isDebugValue())
+ break;
+ }
+ return I;
+}
+
+void SIScheduleBlockCreator::topologicalSort() {
+ unsigned DAGSize = CurrentBlocks.size();
+ std::vector<int> WorkList;
+
+ DEBUG(dbgs() << "Topological Sort\n");
+
+ WorkList.reserve(DAGSize);
+ TopDownIndex2Block.resize(DAGSize);
+ TopDownBlock2Index.resize(DAGSize);
+ BottomUpIndex2Block.resize(DAGSize);
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ unsigned Degree = Block->getSuccs().size();
+ TopDownBlock2Index[i] = Degree;
+ if (Degree == 0) {
+ WorkList.push_back(i);
+ }
+ }
+
+ int Id = DAGSize;
+ while (!WorkList.empty()) {
+ int i = WorkList.back();
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ WorkList.pop_back();
+ TopDownBlock2Index[i] = --Id;
+ TopDownIndex2Block[Id] = i;
+ for (SIScheduleBlock* Pred : Block->getPreds()) {
+ if (!--TopDownBlock2Index[Pred->getID()])
+ WorkList.push_back(Pred->getID());
+ }
+ }
+
+#ifndef NDEBUG
+ // Check correctness of the ordering.
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ for (SIScheduleBlock* Pred : Block->getPreds()) {
+ assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
+ "Wrong Top Down topological sorting");
+ }
+ }
+#endif
+
+ BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
+ TopDownIndex2Block.rend());
+}
+
+void SIScheduleBlockCreator::scheduleInsideBlocks() {
+ unsigned DAGSize = CurrentBlocks.size();
+
+ DEBUG(dbgs() << "\nScheduling Blocks\n\n");
+
+ // We do schedule a valid scheduling such that a Block corresponds
+ // to a range of instructions.
+ DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ Block->fastSchedule();
+ }
+
+ // Note: the following code, and the part restoring previous position
+ // is by far the most expensive operation of the Scheduler.
+
+ // Do not update CurrentTop.
+ MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
+ std::vector<MachineBasicBlock::iterator> PosOld;
+ std::vector<MachineBasicBlock::iterator> PosNew;
+ PosOld.reserve(DAG->SUnits.size());
+ PosNew.reserve(DAG->SUnits.size());
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ int BlockIndice = TopDownIndex2Block[i];
+ SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
+ std::vector<SUnit*> SUs = Block->getScheduledUnits();
+
+ for (SUnit* SU : SUs) {
+ MachineInstr *MI = SU->getInstr();
+ MachineBasicBlock::iterator Pos = MI;
+ PosOld.push_back(Pos);
+ if (&*CurrentTopFastSched == MI) {
+ PosNew.push_back(Pos);
+ CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
+ DAG->getCurrentBottom());
+ } else {
+ // Update the instruction stream.
+ DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
+
+ // Update LiveIntervals.
+ // Note: Moving all instructions and calling handleMove every time
+ // is the most cpu intensive operation of the scheduler.
+ // It would gain a lot if there was a way to recompute the
+ // LiveIntervals for the entire scheduling region.
+ DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
+ PosNew.push_back(CurrentTopFastSched);
+ }
+ }
+ }
+
+ // Now we have Block of SUs == Block of MI.
+ // We do the final schedule for the instructions inside the block.
+ // The property that all the SUs of the Block are grouped together as MI
+ // is used for correct reg usage tracking.
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ std::vector<SUnit*> SUs = Block->getScheduledUnits();
+ Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
+ }
+
+ DEBUG(dbgs() << "Restoring MI Pos\n");
+ // Restore old ordering (which prevents a LIS->handleMove bug).
+ for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
+ MachineBasicBlock::iterator POld = PosOld[i-1];
+ MachineBasicBlock::iterator PNew = PosNew[i-1];
+ if (PNew != POld) {
+ // Update the instruction stream.
+ DAG->getBB()->splice(POld, DAG->getBB(), PNew);
+
+ // Update LiveIntervals.
+ DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
+ }
+ }
+
+ DEBUG(
+ for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ Block->printDebug(true);
+ }
+ );
+}
+
+void SIScheduleBlockCreator::fillStats() {
+ unsigned DAGSize = CurrentBlocks.size();
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ int BlockIndice = TopDownIndex2Block[i];
+ SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
+ if (Block->getPreds().empty())
+ Block->Depth = 0;
+ else {
+ unsigned Depth = 0;
+ for (SIScheduleBlock *Pred : Block->getPreds()) {
+ if (Depth < Pred->Depth + Pred->getCost())
+ Depth = Pred->Depth + Pred->getCost();
+ }
+ Block->Depth = Depth;
+ }
+ }
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ int BlockIndice = BottomUpIndex2Block[i];
+ SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
+ if (Block->getSuccs().empty())
+ Block->Height = 0;
+ else {
+ unsigned Height = 0;
+ for (const auto &Succ : Block->getSuccs())
+ Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
+ Block->Height = Height;
+ }
+ }
+}
+
+// SIScheduleBlockScheduler //
+
+SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
+ SISchedulerBlockSchedulerVariant Variant,
+ SIScheduleBlocks BlocksStruct) :
+ DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
+ LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
+ SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
+
+ // Fill the usage of every output
+ // Warning: while by construction we always have a link between two blocks
+ // when one needs a result from the other, the number of users of an output
+ // is not the sum of child blocks having as input the same virtual register.
+ // Here is an example. A produces x and y. B eats x and produces x'.
+ // C eats x' and y. The register coalescer may have attributed the same
+ // virtual register to x and x'.
+ // To count accurately, we do a topological sort. In case the register is
+ // found for several parents, we increment the usage of the one with the
+ // highest topological index.
+ LiveOutRegsNumUsages.resize(Blocks.size());
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = Blocks[i];
+ for (unsigned Reg : Block->getInRegs()) {
+ bool Found = false;
+ int topoInd = -1;
+ for (SIScheduleBlock* Pred: Block->getPreds()) {
+ std::set<unsigned> PredOutRegs = Pred->getOutRegs();
+ std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
+
+ if (RegPos != PredOutRegs.end()) {
+ Found = true;
+ if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
+ topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
+ }
+ }
+ }
+
+ if (!Found)
+ continue;
+
+ int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
+ ++LiveOutRegsNumUsages[PredID][Reg];
+ }
+ }
+
+ LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
+ BlockNumPredsLeft.resize(Blocks.size());
+ BlockNumSuccsLeft.resize(Blocks.size());
+
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = Blocks[i];
+ BlockNumPredsLeft[i] = Block->getPreds().size();
+ BlockNumSuccsLeft[i] = Block->getSuccs().size();
+ }
+
+#ifndef NDEBUG
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = Blocks[i];
+ assert(Block->getID() == i);
+ }
+#endif
+
+ std::set<unsigned> InRegs = DAG->getInRegs();
+ addLiveRegs(InRegs);
+
+ // Increase LiveOutRegsNumUsages for blocks
+ // producing registers consumed in another
+ // scheduling region.
+ for (unsigned Reg : DAG->getOutRegs()) {
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+ // Do reverse traversal
+ int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
+ SIScheduleBlock *Block = Blocks[ID];
+ const std::set<unsigned> &OutRegs = Block->getOutRegs();
+
+ if (OutRegs.find(Reg) == OutRegs.end())
+ continue;
+
+ ++LiveOutRegsNumUsages[ID][Reg];
+ break;
+ }
+ }
+
+ // Fill LiveRegsConsumers for regs that were already
+ // defined before scheduling.
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = Blocks[i];
+ for (unsigned Reg : Block->getInRegs()) {
+ bool Found = false;
+ for (SIScheduleBlock* Pred: Block->getPreds()) {
+ std::set<unsigned> PredOutRegs = Pred->getOutRegs();
+ std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
+
+ if (RegPos != PredOutRegs.end()) {
+ Found = true;
+ break;
+ }
+ }
+
+ if (!Found)
+ ++LiveRegsConsumers[Reg];
+ }
+ }
+
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = Blocks[i];
+ if (BlockNumPredsLeft[i] == 0) {
+ ReadyBlocks.push_back(Block);
+ }
+ }
+
+ while (SIScheduleBlock *Block = pickBlock()) {
+ BlocksScheduled.push_back(Block);
+ blockScheduled(Block);
+ }
+
+ DEBUG(
+ dbgs() << "Block Order:";
+ for (SIScheduleBlock* Block : BlocksScheduled) {
+ dbgs() << ' ' << Block->getID();
+ }
+ dbgs() << '\n';
+ );
+}
+
+bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
+ SIBlockSchedCandidate &TryCand) {
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+
+ // Try to hide high latencies.
+ if (tryLess(TryCand.LastPosHighLatParentScheduled,
+ Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
+ return true;
+ // Schedule high latencies early so you can hide them better.
+ if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
+ TryCand, Cand, Latency))
+ return true;
+ if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
+ TryCand, Cand, Depth))
+ return true;
+ if (tryGreater(TryCand.NumHighLatencySuccessors,
+ Cand.NumHighLatencySuccessors,
+ TryCand, Cand, Successor))
+ return true;
+ return false;
+}
+
+bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
+ SIBlockSchedCandidate &TryCand) {
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+
+ if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
+ TryCand, Cand, RegUsage))
+ return true;
+ if (tryGreater(TryCand.NumSuccessors > 0,
+ Cand.NumSuccessors > 0,
+ TryCand, Cand, Successor))
+ return true;
+ if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
+ return true;
+ if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
+ TryCand, Cand, RegUsage))
+ return true;
+ return false;
+}
+
+SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
+ SIBlockSchedCandidate Cand;
+ std::vector<SIScheduleBlock*>::iterator Best;
+ SIScheduleBlock *Block;
+ if (ReadyBlocks.empty())
+ return nullptr;
+
+ DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
+ VregCurrentUsage, SregCurrentUsage);
+ if (VregCurrentUsage > maxVregUsage)
+ maxVregUsage = VregCurrentUsage;
+ if (SregCurrentUsage > maxSregUsage)
+ maxSregUsage = SregCurrentUsage;
+ DEBUG(
+ dbgs() << "Picking New Blocks\n";
+ dbgs() << "Available: ";
+ for (SIScheduleBlock* Block : ReadyBlocks)
+ dbgs() << Block->getID() << ' ';
+ dbgs() << "\nCurrent Live:\n";
+ for (unsigned Reg : LiveRegs)
+ dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+ dbgs() << '\n';
+ dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
+ dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
+ );
+
+ Cand.Block = nullptr;
+ for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
+ E = ReadyBlocks.end(); I != E; ++I) {
+ SIBlockSchedCandidate TryCand;
+ TryCand.Block = *I;
+ TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
+ TryCand.VGPRUsageDiff =
+ checkRegUsageImpact(TryCand.Block->getInRegs(),
+ TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
+ TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
+ TryCand.NumHighLatencySuccessors =
+ TryCand.Block->getNumHighLatencySuccessors();
+ TryCand.LastPosHighLatParentScheduled =
+ (unsigned int) std::max<int> (0,
+ LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
+ LastPosWaitedHighLatency);
+ TryCand.Height = TryCand.Block->Height;
+ // Try not to increase VGPR usage too much, else we may spill.
+ if (VregCurrentUsage > 120 ||
+ Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
+ if (!tryCandidateRegUsage(Cand, TryCand) &&
+ Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
+ tryCandidateLatency(Cand, TryCand);
+ } else {
+ if (!tryCandidateLatency(Cand, TryCand))
+ tryCandidateRegUsage(Cand, TryCand);
+ }
+ if (TryCand.Reason != NoCand) {
+ Cand.setBest(TryCand);
+ Best = I;
+ DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
+ << getReasonStr(Cand.Reason) << '\n');
+ }
+ }
+
+ DEBUG(
+ dbgs() << "Picking: " << Cand.Block->getID() << '\n';
+ dbgs() << "Is a block with high latency instruction: "
+ << (Cand.IsHighLatency ? "yes\n" : "no\n");
+ dbgs() << "Position of last high latency dependency: "
+ << Cand.LastPosHighLatParentScheduled << '\n';
+ dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
+ dbgs() << '\n';
+ );
+
+ Block = Cand.Block;
+ ReadyBlocks.erase(Best);
+ return Block;
+}
+
+// Tracking of currently alive registers to determine VGPR Usage.
+
+void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
+ for (unsigned Reg : Regs) {
+ // For now only track virtual registers.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ // If not already in the live set, then add it.
+ (void) LiveRegs.insert(Reg);
+ }
+}
+
+void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
+ std::set<unsigned> &Regs) {
+ for (unsigned Reg : Regs) {
+ // For now only track virtual registers.
+ std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
+ assert (Pos != LiveRegs.end() && // Reg must be live.
+ LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
+ LiveRegsConsumers[Reg] >= 1);
+ --LiveRegsConsumers[Reg];
+ if (LiveRegsConsumers[Reg] == 0)
+ LiveRegs.erase(Pos);
+ }
+}
+
+void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
+ for (const auto &Block : Parent->getSuccs()) {
+ if (--BlockNumPredsLeft[Block.first->getID()] == 0)
+ ReadyBlocks.push_back(Block.first);
+
+ if (Parent->isHighLatencyBlock() &&
+ Block.second == SIScheduleBlockLinkKind::Data)
+ LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
+ }
+}
+
+void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
+ decreaseLiveRegs(Block, Block->getInRegs());
+ addLiveRegs(Block->getOutRegs());
+ releaseBlockSuccs(Block);
+ for (std::map<unsigned, unsigned>::iterator RegI =
+ LiveOutRegsNumUsages[Block->getID()].begin(),
+ E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
+ std::pair<unsigned, unsigned> RegP = *RegI;
+ // We produce this register, thus it must not be previously alive.
+ assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
+ LiveRegsConsumers[RegP.first] == 0);
+ LiveRegsConsumers[RegP.first] += RegP.second;
+ }
+ if (LastPosHighLatencyParentScheduled[Block->getID()] >
+ (unsigned)LastPosWaitedHighLatency)
+ LastPosWaitedHighLatency =
+ LastPosHighLatencyParentScheduled[Block->getID()];
+ ++NumBlockScheduled;
+}
+
+std::vector<int>
+SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
+ std::set<unsigned> &OutRegs) {
+ std::vector<int> DiffSetPressure;
+ DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
+
+ for (unsigned Reg : InRegs) {
+ // For now only track virtual registers.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ if (LiveRegsConsumers[Reg] > 1)
+ continue;
+ PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
+ for (; PSetI.isValid(); ++PSetI) {
+ DiffSetPressure[*PSetI] -= PSetI.getWeight();
+ }
+ }
+
+ for (unsigned Reg : OutRegs) {
+ // For now only track virtual registers.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
+ for (; PSetI.isValid(); ++PSetI) {
+ DiffSetPressure[*PSetI] += PSetI.getWeight();
+ }
+ }
+
+ return DiffSetPressure;
+}
+
+// SIScheduler //
+
+struct SIScheduleBlockResult
+SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
+ SISchedulerBlockSchedulerVariant ScheduleVariant) {
+ SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
+ SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
+ std::vector<SIScheduleBlock*> ScheduledBlocks;
+ struct SIScheduleBlockResult Res;
+
+ ScheduledBlocks = Scheduler.getBlocks();
+
+ for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
+ SIScheduleBlock *Block = ScheduledBlocks[b];
+ std::vector<SUnit*> SUs = Block->getScheduledUnits();
+
+ for (SUnit* SU : SUs)
+ Res.SUs.push_back(SU->NodeNum);
+ }
+
+ Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
+ Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
+ return Res;
+}
+
+// SIScheduleDAGMI //
+
+SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
+ ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) {
+ SITII = static_cast<const SIInstrInfo*>(TII);
+ SITRI = static_cast<const SIRegisterInfo*>(TRI);
+
+ VGPRSetID = SITRI->getVGPRPressureSet();
+ SGPRSetID = SITRI->getSGPRPressureSet();
+}
+
+SIScheduleDAGMI::~SIScheduleDAGMI() = default;
+
+// Code adapted from scheduleDAG.cpp
+// Does a topological sort over the SUs.
+// Both TopDown and BottomUp
+void SIScheduleDAGMI::topologicalSort() {
+ Topo.InitDAGTopologicalSorting();
+
+ TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
+ BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
+}
+
+// Move low latencies further from their user without
+// increasing SGPR usage (in general)
+// This is to be replaced by a better pass that would
+// take into account SGPR usage (based on VGPR Usage
+// and the corresponding wavefront count), that would
+// try to merge groups of loads if it make sense, etc
+void SIScheduleDAGMI::moveLowLatencies() {
+ unsigned DAGSize = SUnits.size();
+ int LastLowLatencyUser = -1;
+ int LastLowLatencyPos = -1;
+
+ for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
+ SUnit *SU = &SUnits[ScheduledSUnits[i]];
+ bool IsLowLatencyUser = false;
+ unsigned MinPos = 0;
+
+ for (SDep& PredDep : SU->Preds) {
+ SUnit *Pred = PredDep.getSUnit();
+ if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
+ IsLowLatencyUser = true;
+ }
+ if (Pred->NodeNum >= DAGSize)
+ continue;
+ unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
+ if (PredPos >= MinPos)
+ MinPos = PredPos + 1;
+ }
+
+ if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
+ unsigned BestPos = LastLowLatencyUser + 1;
+ if ((int)BestPos <= LastLowLatencyPos)
+ BestPos = LastLowLatencyPos + 1;
+ if (BestPos < MinPos)
+ BestPos = MinPos;
+ if (BestPos < i) {
+ for (unsigned u = i; u > BestPos; --u) {
+ ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
+ ScheduledSUnits[u] = ScheduledSUnits[u-1];
+ }
+ ScheduledSUnits[BestPos] = SU->NodeNum;
+ ScheduledSUnitsInv[SU->NodeNum] = BestPos;
+ }
+ LastLowLatencyPos = BestPos;
+ if (IsLowLatencyUser)
+ LastLowLatencyUser = BestPos;
+ } else if (IsLowLatencyUser) {
+ LastLowLatencyUser = i;
+ // Moves COPY instructions on which depends
+ // the low latency instructions too.
+ } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
+ bool CopyForLowLat = false;
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
+ CopyForLowLat = true;
+ }
+ }
+ if (!CopyForLowLat)
+ continue;
+ if (MinPos < i) {
+ for (unsigned u = i; u > MinPos; --u) {
+ ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
+ ScheduledSUnits[u] = ScheduledSUnits[u-1];
+ }
+ ScheduledSUnits[MinPos] = SU->NodeNum;
+ ScheduledSUnitsInv[SU->NodeNum] = MinPos;
+ }
+ }
+ }
+}
+
+void SIScheduleDAGMI::restoreSULinksLeft() {
+ for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+ SUnits[i].isScheduled = false;
+ SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
+ SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
+ SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
+ SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
+ }
+}
+
+// Return the Vgpr and Sgpr usage corresponding to some virtual registers.
+template<typename _Iterator> void
+SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
+ unsigned &VgprUsage, unsigned &SgprUsage) {
+ VgprUsage = 0;
+ SgprUsage = 0;
+ for (_Iterator RegI = First; RegI != End; ++RegI) {
+ unsigned Reg = *RegI;
+ // For now only track virtual registers
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ PSetIterator PSetI = MRI.getPressureSets(Reg);
+ for (; PSetI.isValid(); ++PSetI) {
+ if (*PSetI == VGPRSetID)
+ VgprUsage += PSetI.getWeight();
+ else if (*PSetI == SGPRSetID)
+ SgprUsage += PSetI.getWeight();
+ }
+ }
+}
+
+void SIScheduleDAGMI::schedule()
+{
+ SmallVector<SUnit*, 8> TopRoots, BotRoots;
+ SIScheduleBlockResult Best, Temp;
+ DEBUG(dbgs() << "Preparing Scheduling\n");
+
+ buildDAGWithRegPressure();
+ DEBUG(
+ for(SUnit& SU : SUnits)
+ SU.dumpAll(this)
+ );
+
+ topologicalSort();
+ findRootsAndBiasEdges(TopRoots, BotRoots);
+ // We reuse several ScheduleDAGMI and ScheduleDAGMILive
+ // functions, but to make them happy we must initialize
+ // the default Scheduler implementation (even if we do not
+ // run it)
+ SchedImpl->initialize(this);
+ initQueues(TopRoots, BotRoots);
+
+ // Fill some stats to help scheduling.
+
+ SUnitsLinksBackup = SUnits;
+ IsLowLatencySU.clear();
+ LowLatencyOffset.clear();
+ IsHighLatencySU.clear();
+
+ IsLowLatencySU.resize(SUnits.size(), 0);
+ LowLatencyOffset.resize(SUnits.size(), 0);
+ IsHighLatencySU.resize(SUnits.size(), 0);
+
+ for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
+ SUnit *SU = &SUnits[i];
+ unsigned BaseLatReg;
+ int64_t OffLatReg;
+ if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
+ IsLowLatencySU[i] = 1;
+ if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
+ TRI))
+ LowLatencyOffset[i] = OffLatReg;
+ } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
+ IsHighLatencySU[i] = 1;
+ }
+
+ SIScheduler Scheduler(this);
+ Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
+ SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
+
+ // if VGPR usage is extremely high, try other good performing variants
+ // which could lead to lower VGPR usage
+ if (Best.MaxVGPRUsage > 180) {
+ static const std::pair<SISchedulerBlockCreatorVariant,
+ SISchedulerBlockSchedulerVariant>
+ Variants[] = {
+ { LatenciesAlone, BlockRegUsageLatency },
+// { LatenciesAlone, BlockRegUsage },
+ { LatenciesGrouped, BlockLatencyRegUsage },
+// { LatenciesGrouped, BlockRegUsageLatency },
+// { LatenciesGrouped, BlockRegUsage },
+ { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
+// { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
+// { LatenciesAlonePlusConsecutive, BlockRegUsage }
+ };
+ for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
+ Temp = Scheduler.scheduleVariant(v.first, v.second);
+ if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
+ Best = Temp;
+ }
+ }
+ // if VGPR usage is still extremely high, we may spill. Try other variants
+ // which are less performing, but that could lead to lower VGPR usage.
+ if (Best.MaxVGPRUsage > 200) {
+ static const std::pair<SISchedulerBlockCreatorVariant,
+ SISchedulerBlockSchedulerVariant>
+ Variants[] = {
+// { LatenciesAlone, BlockRegUsageLatency },
+ { LatenciesAlone, BlockRegUsage },
+// { LatenciesGrouped, BlockLatencyRegUsage },
+ { LatenciesGrouped, BlockRegUsageLatency },
+ { LatenciesGrouped, BlockRegUsage },
+// { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
+ { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
+ { LatenciesAlonePlusConsecutive, BlockRegUsage }
+ };
+ for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
+ Temp = Scheduler.scheduleVariant(v.first, v.second);
+ if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
+ Best = Temp;
+ }
+ }
+
+ ScheduledSUnits = Best.SUs;
+ ScheduledSUnitsInv.resize(SUnits.size());
+
+ for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
+ ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
+ }
+
+ moveLowLatencies();
+
+ // Tell the outside world about the result of the scheduling.
+
+ assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
+ TopRPTracker.setPos(CurrentTop);
+
+ for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
+ E = ScheduledSUnits.end(); I != E; ++I) {
+ SUnit *SU = &SUnits[*I];
+
+ scheduleMI(SU, true);
+
+ DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
+ << *SU->getInstr());
+ }
+
+ assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+ placeDebugValues();
+
+ DEBUG({
+ dbgs() << "*** Final schedule for "
+ << printMBBReference(*begin()->getParent()) << " ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
+}