lib/Target/AArch64/AArch64SchedM1.td


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359

//=- AArch64SchedM1.td - Samsung Exynos-M1 Scheduling Defs ---*- tablegen -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for Samsung Exynos-M1 to support
// instruction scheduling and other instruction cost heuristics.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// The Exynos-M1 is a traditional superscalar microprocessor with a
// 4-wide in-order stage for decode and dispatch and a wider issue stage.
// The execution units and loads and stores are out-of-order.

def ExynosM1Model : SchedMachineModel {
  // Up to 4 uops are issued per cycle.
  let IssueWidth = 4;
  // Out-of-order core.
  let MinLatency = 0;
  // Reorder buffer (ROB) size.
  let MicroOpBufferSize = 96;
  // Instruction queue size.
  let LoopMicroOpBufferSize = 32;
  // Load latency in the optimistic cases.
  let LoadLatency = 4;
  // Minimum branch misprediction penalty.
  let MispredictPenalty = 14;
  // The model is not complete; use the default model otherwise.
  let CompleteModel = 0;
}

//===----------------------------------------------------------------------===//
// Define each kind of processor resource and number available on the Exynos-M1,
// which has 9 pipelines, each with its own queue with out-of-order dispatch.

// Integer, branch and memory resources. The template argument is the number
// of units of each kind.

// Simple integer.
def M1UnitA : ProcResource<2>;
// Simple and complex integer.
def M1UnitC : ProcResource<1>;
// Branch.
def M1UnitB : ProcResource<2>;
// Load.
def M1UnitL : ProcResource<1>;
// Store.
def M1UnitS : ProcResource<1>;

// The two FP pipelines; the FP/vector units defined after them name one of
// these as their Super resource.

// FP #0.
def M1PipeF0 : ProcResource<1>;
// FP #1.
def M1PipeF1 : ProcResource<1>;

// Units hosted by FP pipeline #0: each one names M1PipeF0 as its Super.
def M1UnitFMAC   : ProcResource<1> { let Super = M1PipeF0; } // FP multiplication
def M1UnitFCVT   : ProcResource<1> { let Super = M1PipeF0; } // FP conversion
def M1UnitNAL0   : ProcResource<1> { let Super = M1PipeF0; } // Simple vector.
def M1UnitNMISC  : ProcResource<1> { let Super = M1PipeF0; } // Miscellanea
def M1UnitNCRYPT : ProcResource<1> { let Super = M1PipeF0; } // Cryptographic

// Units hosted by FP pipeline #1: each one names M1PipeF1 as its Super.
def M1UnitFADD : ProcResource<1> { let Super = M1PipeF1; } // Simple FP
// FP division & square root (serialized): the only unit here that also
// overrides BufferSize.
def M1UnitFVAR : ProcResource<1> {
  let Super      = M1PipeF1;
  let BufferSize = 1;
}
def M1UnitNAL1 : ProcResource<1> { let Super = M1PipeF1; } // Simple vector.
def M1UnitFST  : ProcResource<1> { let Super = M1PipeF1; } // FP store

// Resource groups, attached to the Exynos-M1 model.
let SchedModel = ExynosM1Model in {
  // All simple integer.
  def M1UnitALU : ProcResGroup<[M1UnitA, M1UnitC]>;
  // All simple vector.
  def M1UnitNALU : ProcResGroup<[M1UnitNAL0, M1UnitNAL1]>;
}

let SchedModel = ExynosM1Model in {

//===----------------------------------------------------------------------===//
// Coarse scheduling model for the Exynos-M1.

// Branch instructions, all with a latency of 1.
// TODO: Non-conditional direct branches take zero cycles and units.
// TODO: Branch and link is much different.
let Latency = 1 in {
  def : WriteRes<WriteBr,    [M1UnitB]>;
  def : WriteRes<WriteBrReg, [M1UnitC]>;
}

// Arithmetic and logical integer instructions: any simple integer unit,
// latency of 1.
// TODO: Shift over 3 and some extensions take 2 cycles (WriteISReg and
//       following).
let Latency = 1 in {
  def : WriteRes<WriteI,     [M1UnitALU]>;
  def : WriteRes<WriteISReg, [M1UnitALU]>;
  def : WriteRes<WriteIEReg, [M1UnitALU]>;
  def : WriteRes<WriteIS,    [M1UnitALU]>;
}

// Move instructions.
let Latency = 1 in
def : WriteRes<WriteImm, [M1UnitALU]>;

// Divide and multiply instructions, all on the C unit.
// TODO: Division blocks the divider inside C.
let Latency = 13 in def : WriteRes<WriteID32, [M1UnitC]>;
let Latency = 21 in def : WriteRes<WriteID64, [M1UnitC]>;
// TODO: Long multiplication takes 5 cycles and also the ALU.
// TODO: Multiplication with accumulation can be advanced.
let Latency =  3 in def : WriteRes<WriteIM32, [M1UnitC]>;
// TODO: 64-bit multiplication has a throughput of 1/2.
let Latency =  4 in def : WriteRes<WriteIM64, [M1UnitC]>;

// Miscellaneous instructions. WriteExtr lists the ALU group twice.
let Latency = 2 in
def : WriteRes<WriteExtr, [M1UnitALU, M1UnitALU]>;

// Address write-back consumes no resources here.
// TODO: The latency for the post or pre register is 1 cycle.
let Latency = 0 in
def : WriteRes<WriteAdr, []>;

// Load instructions.
let Latency = 4 in
def : WriteRes<WriteLD, [M1UnitL]>;
// TODO: Extended address also requires the ALU.
let Latency = 5 in
def : WriteRes<WriteLDIdx, [M1UnitL]>;
let Latency = 4 in
def : WriteRes<WriteLDHi, [M1UnitALU]>;

// Store instructions: store unit, latency of 1.
// TODO: Extended address also requires the ALU (WriteSTIdx).
let Latency = 1 in {
  def : WriteRes<WriteST,    [M1UnitS]>;
  def : WriteRes<WriteSTIdx, [M1UnitS]>;
  def : WriteRes<WriteSTP,   [M1UnitS]>;
  def : WriteRes<WriteSTX,   [M1UnitS]>;
}

// FP data instructions.
let Latency = 3 in
def : WriteRes<WriteF, [M1UnitFADD]>;
// TODO: FCCMP is much different.
let Latency = 4 in
def : WriteRes<WriteFCmp, [M1UnitNMISC]>;
// TODO: DP takes longer.
let Latency = 15 in
def : WriteRes<WriteFDiv, [M1UnitFVAR]>;
// TODO: MACC takes longer.
let Latency = 4 in
def : WriteRes<WriteFMul, [M1UnitFMAC]>;

// FP miscellaneous instructions.
// TODO: Conversion between register files is much different.
let Latency = 3 in
def : WriteRes<WriteFCvt, [M1UnitFCVT]>;
let Latency = 1 in
def : WriteRes<WriteFImm, [M1UnitNALU]>;
// TODO: Copy from FPR to GPR is much different.
let Latency = 4 in
def : WriteRes<WriteFCopy, [M1UnitS]>;

// FP load instructions.
// TODO: ASIMD loads are much different.
let Latency = 5 in
def : WriteRes<WriteVLD, [M1UnitL]>;

// FP store instructions: use both the store unit and the FP store unit.
// TODO: ASIMD stores are much different.
let Latency = 1 in
def : WriteRes<WriteVST, [M1UnitS, M1UnitFST]>;

// ASIMD FP instructions.
// TODO: Other operations are much different.
let Latency = 3 in
def : WriteRes<WriteV, [M1UnitFADD]>;

// Other miscellaneous instructions: no resources, latency of 1.
let Latency = 1 in {
  def : WriteRes<WriteSys,     []>;
  def : WriteRes<WriteBarrier, []>;
  def : WriteRes<WriteHint,    []>;
}

//===----------------------------------------------------------------------===//
// Fast forwarding.

// TODO: Add FP register forwarding rules.

// Reads with an advance of 0.
def : ReadAdvance<ReadI, 0>;
def : ReadAdvance<ReadISReg, 0>;
def : ReadAdvance<ReadIEReg, 0>;
def : ReadAdvance<ReadIM, 0>;

// Integer multiply-accumulate: ReadIMA gets an advance of 2, restricted to
// the writes WriteIM32 and WriteIM64.
// TODO: The forwarding for WriteIM64 actually saves 3 cycles.
def : ReadAdvance<ReadIMA, 2, [WriteIM32, WriteIM64]>;

// Remaining reads, also with an advance of 0.
def : ReadAdvance<ReadID, 0>;
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
def : ReadAdvance<ReadVLD, 0>;

//===----------------------------------------------------------------------===//
// Finer scheduling model for the Exynos-M1.
//
// Each definition names the resources it occupies; the latency is supplied by
// the `let Latency = N in` that prefixes it.

// Writes that combine several FP/vector (and load) resources.
let Latency = 9 in
def M1WriteNEONA : SchedWriteRes<[M1UnitNALU, M1UnitNALU, M1UnitFADD]>;
let Latency = 5 in
def M1WriteNEONB : SchedWriteRes<[M1UnitNALU, M1UnitFST]>;
let Latency = 6 in
def M1WriteNEONC : SchedWriteRes<[M1UnitNALU, M1UnitFST]>;
let Latency = 10 in
def M1WriteNEOND : SchedWriteRes<[M1UnitNALU, M1UnitFST, M1UnitL]>;
let Latency = 8 in
def M1WriteNEONE : SchedWriteRes<[M1UnitFCVT, M1UnitFST]>;
let Latency = 13 in
def M1WriteNEONF : SchedWriteRes<[M1UnitFCVT, M1UnitFST, M1UnitL]>;
let Latency = 6 in
def M1WriteNEONG : SchedWriteRes<[M1UnitNMISC, M1UnitFST]>;
let Latency = 3 in
def M1WriteNEONH : SchedWriteRes<[M1UnitNALU, M1UnitFST]>;
let Latency = 9 in
def M1WriteNEONI : SchedWriteRes<[M1UnitFST, M1UnitL]>;

// Integer and branch writes.
let Latency = 1 in def M1WriteALU1 : SchedWriteRes<[M1UnitALU]>;
let Latency = 1 in def M1WriteB    : SchedWriteRes<[M1UnitB]>;
// FIXME: This is the worst case, conditional branch and link.
let Latency = 1 in
def M1WriteBL : SchedWriteRes<[M1UnitB, M1UnitALU]>;
// FIXME: This is the worst case, when using LR.
let Latency = 2 in
def M1WriteBLR : SchedWriteRes<[M1UnitB, M1UnitALU, M1UnitALU]>;
let Latency = 1 in def M1WriteC1 : SchedWriteRes<[M1UnitC]>;
let Latency = 2 in def M1WriteC2 : SchedWriteRes<[M1UnitC]>;

// Single-unit FP writes; the numeric suffix is the latency.
let Latency =  3 in def M1WriteFADD3  : SchedWriteRes<[M1UnitFADD]>;
let Latency =  3 in def M1WriteFCVT3  : SchedWriteRes<[M1UnitFCVT]>;
let Latency =  4 in def M1WriteFCVT4  : SchedWriteRes<[M1UnitFCVT]>;
let Latency =  4 in def M1WriteFMAC4  : SchedWriteRes<[M1UnitFMAC]>;
let Latency =  5 in def M1WriteFMAC5  : SchedWriteRes<[M1UnitFMAC]>;
let Latency = 15 in def M1WriteFVAR15 : SchedWriteRes<[M1UnitFVAR]>;
let Latency = 23 in def M1WriteFVAR23 : SchedWriteRes<[M1UnitFVAR]>;

// Single-unit vector writes; the last digit of the name is the latency.
let Latency = 1 in def M1WriteNALU1   : SchedWriteRes<[M1UnitNALU]>;
let Latency = 2 in def M1WriteNALU2   : SchedWriteRes<[M1UnitNALU]>;
let Latency = 1 in def M1WriteNAL11   : SchedWriteRes<[M1UnitNAL1]>;
let Latency = 2 in def M1WriteNAL12   : SchedWriteRes<[M1UnitNAL1]>;
let Latency = 3 in def M1WriteNAL13   : SchedWriteRes<[M1UnitNAL1]>;
let Latency = 1 in def M1WriteNCRYPT1 : SchedWriteRes<[M1UnitNCRYPT]>;
let Latency = 5 in def M1WriteNCRYPT5 : SchedWriteRes<[M1UnitNCRYPT]>;
let Latency = 1 in def M1WriteNMISC1  : SchedWriteRes<[M1UnitNMISC]>;
let Latency = 2 in def M1WriteNMISC2  : SchedWriteRes<[M1UnitNMISC]>;
let Latency = 3 in def M1WriteNMISC3  : SchedWriteRes<[M1UnitNMISC]>;
let Latency = 4 in def M1WriteNMISC4  : SchedWriteRes<[M1UnitNMISC]>;

// Store-unit write and the test-and-branch write.
let Latency = 4 in def M1WriteS4 : SchedWriteRes<[M1UnitS]>;
let Latency = 2 in
def M1WriteTB : SchedWriteRes<[M1UnitC, M1UnitALU]>;

// Branch instructions.
// Per-opcode assignment of the M1Write* resources defined in the finer model:
//   M1WriteB   - B unit, latency 1
//   M1WriteBL  - B unit + ALU group, latency 1
//   M1WriteBLR - B unit + ALU group twice, latency 2
//   M1WriteC1  - C unit, latency 1
//   M1WriteTB  - C unit + ALU group, latency 2
def : InstRW<[M1WriteB ],  (instrs Bcc)>;
def : InstRW<[M1WriteBL],  (instrs BL)>;
def : InstRW<[M1WriteBLR], (instrs BLR)>;
// Opcode names starting with CBZ/CBNZ followed by W or X.
def : InstRW<[M1WriteC1],  (instregex "^CBN?Z[WX]")>;
// Opcode names starting with TBZ/TBNZ followed by W or X.
def : InstRW<[M1WriteTB],  (instregex "^TBN?Z[WX]")>;

// Arithmetic and logical integer instructions.
// COPY is given the ALU group with a latency of 1.
def : InstRW<[M1WriteALU1], (instrs COPY)>;

// Divide and multiply instructions.

// Miscellaneous instructions.

// Load instructions.

// Store instructions.

// FP data instructions.
// Per-opcode resources selected by opcode name. The trailing digits of each
// M1Write* name give its latency (see the finer model definitions); the
// M1WriteNEON* writes combine several resources.
def : InstRW<[M1WriteNALU1],  (instregex "^F(ABS|NEG)[DS]r")>;
def : InstRW<[M1WriteFADD3],  (instregex "^F(ADD|SUB)[DS]rr")>;
// M1WriteNEONG: NMISC + FST units, latency 6.
def : InstRW<[M1WriteNEONG],  (instregex "^FCCMPE?[DS]rr")>;
def : InstRW<[M1WriteNMISC4], (instregex "^FCMPE?[DS]r")>;
// Division: same FVAR unit, latency 15 for FDIVSrr and 23 for FDIVDrr.
def : InstRW<[M1WriteFVAR15], (instrs FDIVSrr)>;
def : InstRW<[M1WriteFVAR23], (instrs FDIVDrr)>;
def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN).+rr")>;
def : InstRW<[M1WriteFMAC4],  (instregex "^FN?MUL[DS]rr")>;
def : InstRW<[M1WriteFMAC5],  (instregex "^FN?M(ADD|SUB)[DS]rrr")>;
def : InstRW<[M1WriteFCVT3],  (instregex "^FRINT.+r")>;
// M1WriteNEONH: NALU group + FST unit, latency 3.
def : InstRW<[M1WriteNEONH],  (instregex "^FCSEL[DS]rrr")>;
// Square root: same FVAR unit, latency 15 for FSQRTSr and 23 for FSQRTDr.
def : InstRW<[M1WriteFVAR15], (instrs FSQRTSr)>;
def : InstRW<[M1WriteFVAR23], (instrs FSQRTDr)>;

// FP miscellaneous instructions.
def : InstRW<[M1WriteFCVT3], (instregex "^FCVT[DS][DS]r")>;
// M1WriteNEONF: FCVT + FST + L units, latency 13.
def : InstRW<[M1WriteNEONF], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>;
// M1WriteNEONE: FCVT + FST units, latency 8.
def : InstRW<[M1WriteNEONE], (instregex "^[SU]CVTF[SU]")>;
def : InstRW<[M1WriteNALU1], (instregex "^FMOV[DS][ir]")>;
// FMOV opcodes whose name has W/X before D/S use the store unit, latency 4.
def : InstRW<[M1WriteS4],    (instregex "^FMOV[WX][DS](High)?r")>;
// FMOV opcodes whose name has D/S before W/X use M1WriteNEONI: FST + L units,
// latency 9.
def : InstRW<[M1WriteNEONI], (instregex "^FMOV[DS][WX](High)?r")>;

// FP load instructions.

// FP store instructions.

// ASIMD instructions.
// Per-opcode resources, selected by opcode name. Writes used in this section
// (see the finer model definitions):
//   M1WriteNALU1        - M1UnitNALU group, latency 1
//   M1WriteNAL13        - M1UnitNAL1 only, latency 3
//   M1WriteNMISC1/2/3/4 - M1UnitNMISC, latency 1, 2, 3 or 4
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]ABAL?v")>;
def : InstRW<[M1WriteNMISC1], (instregex "^[SU]ABDL?v")>;
def : InstRW<[M1WriteNMISC1], (instregex "^(SQ)?ABSv")>;
def : InstRW<[M1WriteNMISC1], (instregex "^SQNEGv")>;
def : InstRW<[M1WriteNALU1],  (instregex "^(ADD|NEG|SUB)v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?H(ADD|SUB)v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?AD[AD](L|LP|P|W)V?2?v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?SUB[LW]2?v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^R?(ADD|SUB)HN?2?v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]+Q(ADD|SUB)v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]RHADDv")>;
def : InstRW<[M1WriteNMISC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>;
def : InstRW<[M1WriteNALU1],  (instregex "^CMTSTv")>;
def : InstRW<[M1WriteNALU1],  (instregex "^(AND|BIC|EOR|MVNI|NOT|ORN|ORR)v")>;
// MIN/MAX: latency grows from 1 (plain) to 2 ("P" suffix) to 3 ("V" suffix).
def : InstRW<[M1WriteNMISC1], (instregex "^[SU](MIN|MAX)v")>;
def : InstRW<[M1WriteNMISC2], (instregex "^[SU](MIN|MAX)Pv")>;
def : InstRW<[M1WriteNMISC3], (instregex "^[SU](MIN|MAX)Vv")>;
// Multiplies and multiply-accumulates: NMISC unit, latency 4.
def : InstRW<[M1WriteNMISC4], (instregex "^(MUL|SQR?DMULH)v")>;
def : InstRW<[M1WriteNMISC4], (instregex "^ML[AS]v")>;
def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD|SQRD)ML[AS][HL]v")>;
def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD)MULLv")>;
// Shifts: latency 1 on the NALU group or latency 3 on NAL1, depending on the
// opcode family.
def : InstRW<[M1WriteNAL13],  (instregex "^(S|SR|U|UR)SRAv")>;
def : InstRW<[M1WriteNALU1],  (instregex "^[SU]?SH(L|LL|R)2?v")>;
def : InstRW<[M1WriteNALU1],  (instregex "^S[LR]Iv")>;
def : InstRW<[M1WriteNAL13],  (instregex "^[SU]?(Q|QR|R)?SHR(N|U|UN)?2?v")>;
def : InstRW<[M1WriteNAL13],  (instregex "^[SU](Q|QR|R)SHLU?v")>;

// ASIMD FP instructions.
// Per-opcode resources for vector FP operations, selected by opcode name.
// Every pattern is explicitly anchored with '^', as in the rest of this file,
// so none of them relies on how unanchored patterns are matched.
def : InstRW<[M1WriteNALU1],  (instregex "^F(ABS|NEG)v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^F(ABD|ADD|SUB)v")>;
// M1WriteNEONA: NALU group twice + FADD unit, latency 9.
def : InstRW<[M1WriteNEONA],  (instregex "^FADDP")>;
def : InstRW<[M1WriteNMISC1], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v[^1]")>;
def : InstRW<[M1WriteFCVT3],  (instregex "^[FVSU]CVTX?[AFLMNPZ][SU]?(_Int)?v")>;
// Division and square root share the FVAR unit: latency 15 for the f32
// arrangements and 23 for v2f64.
def : InstRW<[M1WriteFVAR15], (instregex "^FDIVv.f32")>;
def : InstRW<[M1WriteFVAR23], (instregex "^FDIVv2f64")>;
def : InstRW<[M1WriteFVAR15], (instregex "^FSQRTv.f32")>;
def : InstRW<[M1WriteFVAR23], (instregex "^FSQRTv2f64")>;
def : InstRW<[M1WriteNMISC1], (instregex "^F(MAX|MIN)(NM)?V?v")>;
def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN)(NM)?Pv")>;
def : InstRW<[M1WriteFMAC4],  (instregex "^FMULX?v")>;
def : InstRW<[M1WriteFMAC5],  (instregex "^FML[AS]v")>;
def : InstRW<[M1WriteFCVT3],  (instregex "^FRINT[AIMNPXZ]v")>;

// ASIMD miscellaneous instructions.
// Per-opcode resources, selected by opcode name. Multi-resource writes used
// in this section (see the finer model definitions):
//   M1WriteNEONB - NALU group + FST unit, latency 5
//   M1WriteNEONC - NALU group + FST unit, latency 6
//   M1WriteNEOND - NALU group + FST + L units, latency 10
def : InstRW<[M1WriteNALU1],  (instregex "^RBITv")>;
def : InstRW<[M1WriteNAL11],  (instregex "^(BIF|BIT|BSL)v")>;
def : InstRW<[M1WriteNALU1],  (instregex "^CPY")>;
// DUP is split by operand kind in the opcode name: "gpr" vs. "lane".
def : InstRW<[M1WriteNEONB],  (instregex "^DUPv.+gpr")>;
def : InstRW<[M1WriteNALU1],  (instregex "^DUPv.+lane")>;
def : InstRW<[M1WriteNAL13],  (instregex "^[SU]?Q?XTU?Nv")>;
def : InstRW<[M1WriteNEONC],  (instregex "^INSv.+gpr")>;
def : InstRW<[M1WriteFCVT4],  (instregex "^[FU](RECP|RSQRT)Ev")>;
def : InstRW<[M1WriteNMISC1], (instregex "^[FU](RECP|RSQRT)Xv")>;
def : InstRW<[M1WriteFMAC5],  (instregex "^F(RECP|RSQRT)Sv")>;
def : InstRW<[M1WriteNALU1],  (instregex "^REV(16|32|64)v")>;
// Table lookups: the v8i8 forms build on M1WriteNAL11 and the v16i8 forms on
// M1WriteNAL12. The WriteSequence count tracks the One/Two/Three/Four suffix
// of the opcode name.
// NOTE(review): presumably WriteSequence<[W], N> repeats W N times - confirm
// against TargetSchedule.td.
def : InstRW<[M1WriteNAL11],  (instregex "^TB[LX]v8i8One")>;
def : InstRW<[WriteSequence<[M1WriteNAL11], 2>],
                              (instregex "^TB[LX]v8i8Two")>;
def : InstRW<[WriteSequence<[M1WriteNAL11], 3>],
                              (instregex "^TB[LX]v8i8Three")>;
def : InstRW<[WriteSequence<[M1WriteNAL11], 4>],
                              (instregex "^TB[LX]v8i8Four")>;
def : InstRW<[M1WriteNAL12],  (instregex "^TB[LX]v16i8One")>;
def : InstRW<[WriteSequence<[M1WriteNAL12], 2>],
                              (instregex "^TB[LX]v16i8Two")>;
def : InstRW<[WriteSequence<[M1WriteNAL12], 3>],
                              (instregex "^TB[LX]v16i8Three")>;
def : InstRW<[WriteSequence<[M1WriteNAL12], 4>],
                              (instregex "^TB[LX]v16i8Four")>;
def : InstRW<[M1WriteNEOND],  (instregex "^[SU]MOVv")>;
def : InstRW<[M1WriteNALU1],  (instregex "^INSv.+lane")>;
// TRN/UZP: latency 1 for the v8i8/v4i16/v2i32 arrangements, latency 2 for
// the v16i8/v8i16/v4i32/v2i64 ones.
def : InstRW<[M1WriteNALU1],  (instregex "^(TRN|UZP)(1|2)(v8i8|v4i16|v2i32)")>;
def : InstRW<[M1WriteNALU2],  (instregex "^(TRN|UZP)(1|2)(v16i8|v8i16|v4i32|v2i64)")>;
def : InstRW<[M1WriteNALU1],  (instregex "^ZIP(1|2)v")>;

// ASIMD load instructions.

// ASIMD store instructions.

// Cryptography instructions.
// All of these use the NCRYPT unit, with a latency of either 1
// (M1WriteNCRYPT1) or 5 (M1WriteNCRYPT5).
def : InstRW<[M1WriteNCRYPT1], (instregex "^AES")>;
def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>;
def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>;
def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA1[CMP]")>;
def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA256SU0")>;
def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA256(H|SU1)")>;

// CRC instructions.
// M1WriteC2: C unit, latency 2.
def : InstRW<[M1WriteC2], (instregex "^CRC32")>;

} // SchedModel = ExynosM1Model