//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP  ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file This file defines a set of schedule DAG mutations that can be used to
// override default scheduler behavior to enforce specific scheduling patterns.
// They should be used in cases where runtime performance considerations, such
// as inter-wavefront interactions, mean that compile-time heuristics cannot
// predict the optimal instruction ordering, or in kernels where optimal
// instruction scheduling is important enough to warrant manual intervention.
//
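// The mutations are created via createIGroupLPDAGMutation() and
// createSchedBarrierDAGMutation() (declared in AMDGPUIGroupLP.h) and are
// attached to the scheduling DAG by the target's scheduler setup, typically:
//
//   DAG->addMutation(createIGroupLPDAGMutation());
//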
//===----------------------------------------------------------------------===//

#include "AMDGPUIGroupLP.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

namespace {

static cl::opt<bool>
    EnableIGroupLP("amdgpu-igrouplp",
                   cl::desc("Enable construction of Instruction Groups and "
                            "their ordering for scheduling"),
                   cl::init(false));

static cl::opt<Optional<unsigned>>
    VMEMGroupMaxSize("amdgpu-igrouplp-vmem-group-size", cl::init(None),
                     cl::Hidden,
                     cl::desc("The maximum number of instructions to include "
                              "in VMEM group."));

static cl::opt<Optional<unsigned>>
    MFMAGroupMaxSize("amdgpu-igrouplp-mfma-group-size", cl::init(None),
                     cl::Hidden,
                     cl::desc("The maximum number of instructions to include "
                              "in MFMA group."));

static cl::opt<Optional<unsigned>>
    LDRGroupMaxSize("amdgpu-igrouplp-ldr-group-size", cl::init(None),
                    cl::Hidden,
                    cl::desc("The maximum number of instructions to include "
                             "in lds/gds read group."));

static cl::opt<Optional<unsigned>>
    LDWGroupMaxSize("amdgpu-igrouplp-ldw-group-size", cl::init(None),
                    cl::Hidden,
                    cl::desc("The maximum number of instructions to include "
                             "in lds/gds write group."));

typedef function_ref<bool(const MachineInstr &, const SIInstrInfo *)>
    CanAddMIFn;

// Classify instructions into groups to enable fine-tuned control over the
// scheduler. These groups may be more specific than current SchedModel
// instruction classes.
class SchedGroup {
private:
  // Function that returns true if a non-bundle MI may be inserted into this
  // group.
  const CanAddMIFn canAddMI;

  // Maximum number of SUnits that can be added to this group.
  Optional<unsigned> MaxSize;

  // Collection of SUnits that are classified as members of this group.
  SmallVector<SUnit *, 32> Collection;

  ScheduleDAGInstrs *DAG;

  void tryAddEdge(SUnit *A, SUnit *B) {
    if (A != B && DAG->canAddEdge(B, A)) {
      DAG->addEdge(B, SDep(A, SDep::Artificial));
      LLVM_DEBUG(dbgs() << "Adding edge...\n"
                        << "from: SU(" << A->NodeNum << ") " << *A->getInstr()
                        << "to: SU(" << B->NodeNum << ") " << *B->getInstr());
    }
  }

public:
  // Add DAG dependencies between all SUnits in this SchedGroup and this SU.
  // If MakePred is true, SU will be a predecessor of the SUnits in this
  // SchedGroup, otherwise SU will be a successor.
  void link(SUnit &SU, bool MakePred = false) {
    for (auto A : Collection) {
      SUnit *B = &SU;
      if (MakePred)
        std::swap(A, B);

      tryAddEdge(A, B);
    }
  }

  // Add DAG dependencies between all SUnits in this SchedGroup and this SU.
  // Use the predicate to determine whether SU should be a predecessor (P =
  // true) or a successor (P = false) of this SchedGroup.
  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P) {
    for (auto A : Collection) {
      SUnit *B = &SU;
      if (P(A, B))
        std::swap(A, B);

      tryAddEdge(A, B);
    }
  }

  // Add DAG dependencies such that SUnits in this group shall be ordered
  // before SUnits in OtherGroup.
  void link(SchedGroup &OtherGroup) {
    for (auto B : OtherGroup.Collection)
      link(*B);
  }

  // Returns true if no more instructions may be added to this group.
  bool isFull() { return MaxSize && Collection.size() >= *MaxSize; }

  // Returns true if SU can be added to this SchedGroup.
  bool canAddSU(SUnit &SU, const SIInstrInfo *TII) {
    if (isFull())
      return false;

    MachineInstr &MI = *SU.getInstr();
    if (MI.getOpcode() != TargetOpcode::BUNDLE)
      return canAddMI(MI, TII);

    // Special case for bundled MIs.
    const MachineBasicBlock *MBB = MI.getParent();
    MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
    while (E != MBB->end() && E->isBundledWithPred())
      ++E;

    // Return true if all of the bundled MIs can be added to this group.
    return std::all_of(
        B, E, [this, TII](MachineInstr &MI) { return canAddMI(MI, TII); });
  }

  void add(SUnit &SU) { Collection.push_back(&SU); }

  SchedGroup(CanAddMIFn canAddMI, Optional<unsigned> MaxSize,
             ScheduleDAGInstrs *DAG)
      : canAddMI(canAddMI), MaxSize(MaxSize), DAG(DAG) {}
};

bool isMFMASGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isMFMA(MI);
}

bool isVALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isVALU(MI) && !TII->isMFMA(MI);
}

bool isSALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isSALU(MI);
}

bool isVMEMSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
}

bool isVMEMReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayLoad() &&
         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
}

bool isVMEMWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayStore() &&
         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
}

bool isDSWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayStore() && TII->isDS(MI);
}

bool isDSReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayLoad() && TII->isDS(MI);
}

class IGroupLPDAGMutation : public ScheduleDAGMutation {
public:
  const SIInstrInfo *TII;
  ScheduleDAGMI *DAG;

  IGroupLPDAGMutation() = default;
  void apply(ScheduleDAGInstrs *DAGInstrs) override;
};

// DAG mutation that coordinates with the SCHED_BARRIER instruction and
// corresponding builtin. The mutation adds edges from specific instruction
// classes determined by the SCHED_BARRIER mask so that they cannot be
// scheduled around the SCHED_BARRIER.
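//
// Illustrative source-level use (mask values are documented in
// IntrinsicsAMDGPU.td; a mask of 0 means no instructions may be scheduled
// across the barrier):
//
//   __builtin_amdgcn_sched_barrier(0);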
class SchedBarrierDAGMutation : public ScheduleDAGMutation {
private:
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  // Components of the mask that determines which instructions may not be
  // scheduled across the SCHED_BARRIER.
  enum class SchedBarrierMasks {
    NONE = 0u,
    ALU = 1u << 0,
    VALU = 1u << 1,
    SALU = 1u << 2,
    MFMA = 1u << 3,
    VMEM = 1u << 4,
    VMEM_READ = 1u << 5,
    VMEM_WRITE = 1u << 6,
    DS = 1u << 7,
    DS_READ = 1u << 8,
    DS_WRITE = 1u << 9,
    LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ DS_WRITE)
  };

  // Cache SchedGroups of each type if we have multiple SCHED_BARRIERs in a
  // region.
  std::unique_ptr<SchedGroup> MFMASchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VALUSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> SALUSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VMEMReadSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VMEMWriteSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> DSWriteSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> DSReadSchedGroup = nullptr;

  // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
  // not be reordered across the SCHED_BARRIER.
  void getSchedGroupsFromMask(int32_t Mask,
                              SmallVectorImpl<SchedGroup *> &SchedGroups);

  // Add DAG edges that enforce SCHED_BARRIER ordering.
  void addSchedBarrierEdges(SUnit &SU);

  // Classify instructions and add them to the SchedGroup.
  void initSchedGroup(SchedGroup *SG);

  // Remove all existing edges from a SCHED_BARRIER.
  void resetSchedBarrierEdges(SUnit &SU);

public:
  void apply(ScheduleDAGInstrs *DAGInstrs) override;

  SchedBarrierDAGMutation() = default;
};

void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAG->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");

  // The order of InstructionGroups in this vector defines the
  // order in which edges will be added. In other words, given the
  // present ordering, we will try to make each VMEM instruction
  // a predecessor of each DSRead instruction, and so on.
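  //
  // With the default (unbounded) group sizes this yields the per-region
  // pipeline VMEM -> DS read -> MFMA -> DS write; the *GroupMaxSize options
  // above cap how many instructions each stage may contain.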
  SmallVector<SchedGroup, 4> PipelineOrderGroups = {
      SchedGroup(isVMEMSGMember, VMEMGroupMaxSize, DAG),
      SchedGroup(isDSReadSGMember, LDRGroupMaxSize, DAG),
      SchedGroup(isMFMASGMember, MFMAGroupMaxSize, DAG),
      SchedGroup(isDSWriteSGMember, LDWGroupMaxSize, DAG)};

  for (SUnit &SU : DAG->SUnits) {
    LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU));
    for (auto &SG : PipelineOrderGroups)
      if (SG.canAddSU(SU, TII))
        SG.add(SU);
  }

  for (unsigned i = 0; i < PipelineOrderGroups.size() - 1; i++) {
    auto &GroupA = PipelineOrderGroups[i];
    for (unsigned j = i + 1; j < PipelineOrderGroups.size(); j++) {
      auto &GroupB = PipelineOrderGroups[j];
      GroupA.link(GroupB);
    }
  }
}

void SchedBarrierDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAGInstrs->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying SchedBarrierDAGMutation...\n");

  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  for (auto &SU : DAG->SUnits)
    if (SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER)
      addSchedBarrierEdges(SU);
}

void SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
  MachineInstr &MI = *SchedBarrier.getInstr();
  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
  // Remove all existing edges from the SCHED_BARRIER that were added due to
  // the instruction having side effects.
  resetSchedBarrierEdges(SchedBarrier);
  SmallVector<SchedGroup *, 4> SchedGroups;
  int32_t Mask = MI.getOperand(0).getImm();
  getSchedGroupsFromMask(Mask, SchedGroups);
  for (auto SG : SchedGroups)
    SG->link(
        SchedBarrier, (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
                          const SUnit *A, const SUnit *B) {
          return A->NodeNum > B->NodeNum;
        });
}

void SchedBarrierDAGMutation::getSchedGroupsFromMask(
    int32_t Mask, SmallVectorImpl<SchedGroup *> &SchedGroups) {
  SchedBarrierMasks SBMask = (SchedBarrierMasks)Mask;
  // See IntrinsicsAMDGPU.td for an explanation of these masks and their
  // mappings.
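  //
  // Note that a set bit means the corresponding instruction class MAY be
  // scheduled across the SCHED_BARRIER; a SchedGroup (and the edges that pin
  // it to the barrier) is only built for classes whose bits are clear.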
  if ((SBMask & SchedBarrierMasks::VALU) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!VALUSchedGroup) {
      VALUSchedGroup = std::make_unique<SchedGroup>(isVALUSGMember, None, DAG);
      initSchedGroup(VALUSchedGroup.get());
    }

    SchedGroups.push_back(VALUSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::SALU) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!SALUSchedGroup) {
      SALUSchedGroup = std::make_unique<SchedGroup>(isSALUSGMember, None, DAG);
      initSchedGroup(SALUSchedGroup.get());
    }

    SchedGroups.push_back(SALUSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::MFMA) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!MFMASchedGroup) {
      MFMASchedGroup = std::make_unique<SchedGroup>(isMFMASGMember, None, DAG);
      initSchedGroup(MFMASchedGroup.get());
    }

    SchedGroups.push_back(MFMASchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::VMEM_READ) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
    if (!VMEMReadSchedGroup) {
      VMEMReadSchedGroup =
          std::make_unique<SchedGroup>(isVMEMReadSGMember, None, DAG);
      initSchedGroup(VMEMReadSchedGroup.get());
    }

    SchedGroups.push_back(VMEMReadSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::VMEM_WRITE) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
    if (!VMEMWriteSchedGroup) {
      VMEMWriteSchedGroup =
          std::make_unique<SchedGroup>(isVMEMWriteSGMember, None, DAG);
      initSchedGroup(VMEMWriteSchedGroup.get());
    }

    SchedGroups.push_back(VMEMWriteSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::DS_READ) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
    if (!DSReadSchedGroup) {
      DSReadSchedGroup =
          std::make_unique<SchedGroup>(isDSReadSGMember, None, DAG);
      initSchedGroup(DSReadSchedGroup.get());
    }

    SchedGroups.push_back(DSReadSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::DS_WRITE) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
    if (!DSWriteSchedGroup) {
      DSWriteSchedGroup =
          std::make_unique<SchedGroup>(isDSWriteSGMember, None, DAG);
      initSchedGroup(DSWriteSchedGroup.get());
    }

    SchedGroups.push_back(DSWriteSchedGroup.get());
  }
}

void SchedBarrierDAGMutation::initSchedGroup(SchedGroup *SG) {
  assert(SG);
  for (auto &SU : DAG->SUnits)
    if (SG->canAddSU(SU, TII))
      SG->add(SU);
}

void SchedBarrierDAGMutation::resetSchedBarrierEdges(SUnit &SU) {
  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER);
  for (auto &P : SU.Preds)
    SU.removePred(P);

  for (auto &S : SU.Succs) {
    for (auto &SP : S.getSUnit()->Preds) {
      if (SP.getSUnit() == &SU) {
        S.getSUnit()->removePred(SP);
      }
    }
  }
}

} // namespace

namespace llvm {

std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
  return EnableIGroupLP ? std::make_unique<IGroupLPDAGMutation>() : nullptr;
}

std::unique_ptr<ScheduleDAGMutation> createSchedBarrierDAGMutation() {
  return std::make_unique<SchedBarrierDAGMutation>();
}

} // end namespace llvm