//===------------------ AMDGPUCustomBehaviour.cpp ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements methods from the AMDGPUCustomBehaviour class.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCustomBehaviour.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/WithColor.h"

namespace llvm {
namespace mca {

void AMDGPUInstrPostProcess::postProcessInstruction(
    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
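  // Only the s_waitcnt family needs post-processing here; every other opcode
  // is left to the default MCInst -> mca::Instruction lowering.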
  switch (MCI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    return processWaitCnt(Inst, MCI);
  }
}

// s_waitcnt instructions encode important information as immediate operands
// which are lost during the MCInst -> mca::Instruction lowering.
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                            const MCInst &MCI) {
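  // Copy the register/immediate operands from the MCInst onto the
  // mca::Instruction so that computeWaitCnt() can later read the encoded
  // counter values from the simulated instruction.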
  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
    MCAOperand Op;
    const MCOperand &MCOp = MCI.getOperand(Idx);
    if (MCOp.isReg()) {
      Op = MCAOperand::createReg(MCOp.getReg());
    } else if (MCOp.isImm()) {
      Op = MCAOperand::createImm(MCOp.getImm());
    }
    Op.setIndex(Idx);
    Inst->addOperand(Op);
  }
}

AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                             const mca::SourceMgr &SrcMgr,
                                             const MCInstrInfo &MCII)
    : CustomBehaviour(STI, SrcMgr, MCII) {
  generateWaitCntInfo();
}

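// Returns the number of cycles that the instruction referenced by IR needs to
// stall for due to a custom hazard, or 0 if it can proceed this cycle.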
unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                  const InstRef &IR) {
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  // llvm-mca is generally run on fully compiled assembly, so we wouldn't see
  // any pseudo instructions here. However, there are plans to make it
  // possible to use mca within backend passes in the future. As such, I have
  // left the pseudo versions of s_waitcnt within this switch statement.
  switch (Opcode) {
  default:
    return 0;
  case AMDGPU::S_WAITCNT: // This instruction
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo.
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    // s_endpgm also behaves as if there is an implicit
    // s_waitcnt 0, but I'm not sure if it would be appropriate
    // to model this in llvm-mca based on how the iterations work
    // while simulating the pipeline over and over.
    return handleWaitCnt(IssuedInst, IR);
  }

  return 0;
}

unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                              const InstRef &IR) {
  // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
  // I do not know how that instruction works, so I did not attempt to model
  // it.
  // Start each counter at its maximum encodable value, i.e. "no wait" for
  // that counter.
  unsigned Vmcnt = 63;
  unsigned Expcnt = 7;
  unsigned Lgkmcnt = 31;
  unsigned Vscnt = 63;
  unsigned CurrVmcnt = 0;
  unsigned CurrExpcnt = 0;
  unsigned CurrLgkmcnt = 0;
  unsigned CurrVscnt = 0;
  unsigned CyclesToWaitVm = ~0U;
  unsigned CyclesToWaitExp = ~0U;
  unsigned CyclesToWaitLgkm = ~0U;
  unsigned CyclesToWaitVs = ~0U;

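  // Decode the counter thresholds requested by this s_waitcnt from its
  // operands. Counters that the instruction does not name keep the maximum
  // ("no wait") values set above.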
  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

  // We will now look at each of the currently executing instructions
  // to find out if this wait instruction still needs to wait.
  for (auto I = IssuedInst.begin(), E = IssuedInst.end(); I != E; I++) {
    const InstRef &PrevIR = *I;
    const Instruction &PrevInst = *PrevIR.getInstruction();
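    // Source indices keep growing across llvm-mca iterations of the input,
    // so wrap them back into the per-iteration range used to index
    // InstrWaitCntInfo.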
    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
    const int CyclesLeft = PrevInst.getCyclesLeft();
    assert(CyclesLeft != UNKNOWN_CYCLES &&
           "We should know how many cycles are left for this instruction");
    if (PrevInstWaitInfo.VmCnt) {
      CurrVmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVm)
        CyclesToWaitVm = CyclesLeft;
    }
    if (PrevInstWaitInfo.ExpCnt) {
      CurrExpcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitExp)
        CyclesToWaitExp = CyclesLeft;
    }
    if (PrevInstWaitInfo.LgkmCnt) {
      CurrLgkmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
        CyclesToWaitLgkm = CyclesLeft;
    }
    if (PrevInstWaitInfo.VsCnt) {
      CurrVscnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVs)
        CyclesToWaitVs = CyclesLeft;
    }
  }

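  // If any counter is currently above the threshold requested by this
  // s_waitcnt, stall until the soonest-finishing contributing instruction
  // completes; the hazard is re-evaluated after that, so underestimating the
  // wait is safe (see the comment below).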
  unsigned CyclesToWait = ~0U;
  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
    CyclesToWait = CyclesToWaitVm;
  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
    CyclesToWait = CyclesToWaitExp;
  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
    CyclesToWait = CyclesToWaitLgkm;
  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
    CyclesToWait = CyclesToWaitVs;

  // We may underestimate how many cycles we need to wait, but this
  // isn't a big deal. Our return value is just how many cycles until
  // this function gets run again. So as long as we don't overestimate
  // the wait time, we'll still end up stalling at this instruction
  // for the correct number of cycles.

  if (CyclesToWait == ~0U)
    return 0;
  return CyclesToWait;
}

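// Decodes the counter thresholds encoded by the s_waitcnt instruction IR into
// the Vmcnt/Expcnt/Lgkmcnt/Vscnt out-parameters. Counters that the particular
// opcode does not encode are left unchanged.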
void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                           unsigned &Expcnt, unsigned &Lgkmcnt,
                                           unsigned &Vscnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  switch (Opcode) {
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
    // Should probably be checking for nullptr
    // here, but I'm not sure how I should handle the case
    // where we see a nullptr.
    const MCAOperand *OpReg = Inst.getOperand(0);
    const MCAOperand *OpImm = Inst.getOperand(1);
    assert(OpReg && OpReg->isReg() && "First operand should be a register.");
    assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
      // Instruction is using a real register.
      // Since we can't know what value this register will have,
      // we can't compute what the value of this wait should be.
      WithColor::warning() << "The register component of "
                           << MCII.getName(Opcode) << " will be completely "
                           << "ignored. So the wait may not be accurate.\n";
    }
    switch (Opcode) {
    // Redundant switch so that the operand-extraction code above does not
    // have to be repeated for each case. There are cleverer ways to avoid
    // this extra switch; feel free to implement one of them.
    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
      Expcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
      Lgkmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
      Vmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
      Vscnt = OpImm->getImm();
      break;
    }
    return;
  }
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
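    // These forms pack vmcnt/expcnt/lgkmcnt into a single immediate operand;
    // decodeWaitcnt() unpacks it according to the ISA version. vscnt is not
    // part of this encoding, so it keeps the value passed in.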
    unsigned WaitCnt = Inst.getOperand(0)->getImm();
    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
    return;
  }
}

void AMDGPUCustomBehaviour::generateWaitCntInfo() {
  // The core logic of this function is taken from
  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass, the
  // instructions being looked at are in the MachineInstr format, whereas here
  // we only have access to the MCInst format. The side effect of this is that
  // we can't use the mayAccessVMEMThroughFlat(Inst) or
  // mayAccessLDSThroughFlat(Inst) functions. Therefore, we conservatively
  // assume that these functions will return true. This may cause a few
  // instructions to be incorrectly tagged with an extra CNT. However, these
  // are instructions that do interact with at least one CNT, so giving them
  // an extra CNT shouldn't cause issues in most scenarios.
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  InstrWaitCntInfo.resize(SrcMgr.size());

  int Index = 0;
  for (auto I = SrcMgr.begin(), E = SrcMgr.end(); I != E; ++I, ++Index) {
    const std::unique_ptr<Instruction> &Inst = *I;
    unsigned Opcode = Inst->getOpcode();
    const MCInstrDesc &MCID = MCII.get(Opcode);
    if ((MCID.TSFlags & SIInstrFlags::DS) &&
        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
      // and mayAccessLDSThroughFlat(Inst) would both return true for this
      // instruction. We have to do this because those functions use
      // information about the memory operands that we don't have access to.
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
        InstrWaitCntInfo[Index].VmCnt = true;
      else
        InstrWaitCntInfo[Index].VsCnt = true;
    } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
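      // With a separate VSCNT counter available, VMEM loads (other than
      // no-return atomics) and MIMG instructions that neither load nor store
      // count against VMCNT, while stores count against VSCNT; without it,
      // everything counts against VMCNT.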
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if ((MCID.mayLoad() &&
                !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
               ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                !MCID.mayStore()))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayStore())
        InstrWaitCntInfo[Index].VsCnt = true;

      // (IV.Major < 7) is meant to represent
      // GCNSubtarget::vmemWriteNeedsExpWaitcnt(),
      // which is defined as
      // { return getGeneration() < SEA_ISLANDS; }
      if (IV.Major < 7 &&
          (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
      InstrWaitCntInfo[Index].ExpCnt = true;
    } else {
      switch (Opcode) {
      case AMDGPU::S_SENDMSG:
      case AMDGPU::S_SENDMSGHALT:
      case AMDGPU::S_MEMTIME:
      case AMDGPU::S_MEMREALTIME:
        InstrWaitCntInfo[Index].LgkmCnt = true;
        break;
      }
    }
  }
}

// taken from SIInstrInfo::isVMEM()
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
  return MCID.TSFlags & SIInstrFlags::MUBUF ||
         MCID.TSFlags & SIInstrFlags::MTBUF ||
         MCID.TSFlags & SIInstrFlags::MIMG;
}

// taken from SIInstrInfo::hasModifiersSet()
bool AMDGPUCustomBehaviour::hasModifiersSet(
    const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
  if (Idx == -1)
    return false;

  const MCAOperand *Op = Inst->getOperand(Idx);
  if (Op == nullptr || !Op->isImm() || !Op->getImm())
    return false;

  return true;
}

// taken from SIInstrInfo::isAlwaysGDS()
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_GWS_INIT ||
         Opcode == AMDGPU::DS_GWS_SEMA_V || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
         Opcode == AMDGPU::DS_GWS_SEMA_P ||
         Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
         Opcode == AMDGPU::DS_GWS_BARRIER;
}

} // namespace mca
} // namespace llvm

using namespace llvm;
using namespace mca;

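// Factory callbacks registered with the TargetRegistry below; llvm-mca calls
// them to construct the AMDGPU-specific CustomBehaviour and InstrPostProcess
// objects for the AMDGPU and GCN targets.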
static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                            const mca::SourceMgr &SrcMgr,
                            const MCInstrInfo &MCII) {
  return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
}

static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                             const MCInstrInfo &MCII) {
  return new AMDGPUInstrPostProcess(STI, MCII);
}

/// Extern entry point that registers the AMDGPU-specific MCA components
/// (custom behaviour and instruction post-processing) for the AMDGPU targets.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
  TargetRegistry::RegisterCustomBehaviour(getTheAMDGPUTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheAMDGPUTarget(),
                                           createAMDGPUInstrPostProcess);

  TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
                                           createAMDGPUInstrPostProcess);
}