LLVM 23.0.0git
ARMLatencyMutations.cpp
Go to the documentation of this file.
1//===- ARMLatencyMutations.cpp - ARM Latency Mutations --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This file contains the ARM definition DAG scheduling mutations which
10/// change inter-instruction latencies
11//
12//===----------------------------------------------------------------------===//
13
#include "ARMLatencyMutations.h"
#include "ARMSubtarget.h"
#include "Thumb2InstrInfo.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include <algorithm>
#include <array>
#include <initializer_list>
#include <memory>
25
26namespace llvm {
27
28namespace {
29
30// Precompute information about opcodes to speed up pass
31
32class InstructionInformation {
33protected:
34 struct IInfo {
35 bool HasBRegAddr : 1; // B-side of addr gen is a register
36 bool HasBRegAddrShift : 1; // B-side of addr gen has a shift
37 bool IsDivide : 1; // Some form of integer divide
38 bool IsInlineShiftALU : 1; // Inline shift+ALU
39 bool IsMultiply : 1; // Some form of integer multiply
40 bool IsMVEIntMAC : 1; // MVE 8/16/32-bit integer MAC operation
41 bool IsNonSubwordLoad : 1; // Load which is a word or larger
42 bool IsShift : 1; // Shift operation
43 bool IsRev : 1; // REV operation
44 bool ProducesQP : 1; // Produces a vector register result
45 bool ProducesDP : 1; // Produces a double-precision register result
46 bool ProducesSP : 1; // Produces a single-precision register result
47 bool ConsumesQP : 1; // Consumes a vector register result
48 bool ConsumesDP : 1; // Consumes a double-precision register result
49 bool ConsumesSP : 1; // Consumes a single-precision register result
50 unsigned MVEIntMACMatched; // Matched operand type (for MVE)
51 unsigned AddressOpMask; // Mask indicating which operands go into AGU
52 IInfo()
53 : HasBRegAddr(false), HasBRegAddrShift(false), IsDivide(false),
54 IsInlineShiftALU(false), IsMultiply(false), IsMVEIntMAC(false),
55 IsNonSubwordLoad(false), IsShift(false), IsRev(false),
56 ProducesQP(false), ProducesDP(false), ProducesSP(false),
57 ConsumesQP(false), ConsumesDP(false), ConsumesSP(false),
58 MVEIntMACMatched(0), AddressOpMask(0) {}
59 };
60 typedef std::array<IInfo, ARM::INSTRUCTION_LIST_END> IInfoArray;
61 IInfoArray Info;
62
63public:
64 // Always available information
65 unsigned getAddressOpMask(unsigned Op) { return Info[Op].AddressOpMask; }
66 bool hasBRegAddr(unsigned Op) { return Info[Op].HasBRegAddr; }
67 bool hasBRegAddrShift(unsigned Op) { return Info[Op].HasBRegAddrShift; }
68 bool isDivide(unsigned Op) { return Info[Op].IsDivide; }
69 bool isInlineShiftALU(unsigned Op) { return Info[Op].IsInlineShiftALU; }
70 bool isMultiply(unsigned Op) { return Info[Op].IsMultiply; }
71 bool isMVEIntMAC(unsigned Op) { return Info[Op].IsMVEIntMAC; }
72 bool isNonSubwordLoad(unsigned Op) { return Info[Op].IsNonSubwordLoad; }
73 bool isRev(unsigned Op) { return Info[Op].IsRev; }
74 bool isShift(unsigned Op) { return Info[Op].IsShift; }
75
76 // information available if markDPConsumers is called.
77 bool producesQP(unsigned Op) { return Info[Op].ProducesQP; }
78 bool producesDP(unsigned Op) { return Info[Op].ProducesDP; }
79 bool producesSP(unsigned Op) { return Info[Op].ProducesSP; }
80 bool consumesQP(unsigned Op) { return Info[Op].ConsumesQP; }
81 bool consumesDP(unsigned Op) { return Info[Op].ConsumesDP; }
82 bool consumesSP(unsigned Op) { return Info[Op].ConsumesSP; }
83
84 bool isMVEIntMACMatched(unsigned SrcOp, unsigned DstOp) {
85 return SrcOp == DstOp || Info[DstOp].MVEIntMACMatched == SrcOp;
86 }
87
88 InstructionInformation(const ARMBaseInstrInfo *TII);
89
90protected:
91 void markDPProducersConsumers(const ARMBaseInstrInfo *TII);
92};
93
94InstructionInformation::InstructionInformation(const ARMBaseInstrInfo *TII) {
95 using namespace ARM;
96
97 std::initializer_list<unsigned> hasBRegAddrList = {
98 t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
99 tLDRr, tLDRBr, tLDRHr, tSTRr, tSTRBr, tSTRHr,
100 };
101 for (auto op : hasBRegAddrList) {
102 Info[op].HasBRegAddr = true;
103 }
104
105 std::initializer_list<unsigned> hasBRegAddrShiftList = {
106 t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
107 };
108 for (auto op : hasBRegAddrShiftList) {
109 Info[op].HasBRegAddrShift = true;
110 }
111
112 Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;
113
114 std::initializer_list<unsigned> isInlineShiftALUList = {
115 t2ADCrs, t2ADDSrs, t2ADDrs, t2BICrs, t2EORrs, t2ORNrs, t2RSBSrs, t2RSBrs,
116 t2SBCrs, t2SUBrs, t2SUBSrs, t2CMPrs, t2CMNrs, t2TEQrs, t2TSTrs,
117 };
118 for (auto op : isInlineShiftALUList) {
119 Info[op].IsInlineShiftALU = true;
120 }
121
122 Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;
123
124 std::initializer_list<unsigned> isMultiplyList = {
125 t2MUL, t2MLA, t2MLS, t2SMLABB, t2SMLABT, t2SMLAD, t2SMLADX,
126 t2SMLAL, t2SMLALBB, t2SMLALBT, t2SMLALD, t2SMLALDX, t2SMLALTB, t2SMLALTT,
127 t2SMLATB, t2SMLATT, t2SMLAWT, t2SMLSD, t2SMLSDX, t2SMLSLD, t2SMLSLDX,
128 t2SMMLA, t2SMMLAR, t2SMMLS, t2SMMLSR, t2SMMUL, t2SMMULR, t2SMUAD,
129 t2SMUADX, t2SMULBB, t2SMULBT, t2SMULL, t2SMULTB, t2SMULTT, t2SMULWT,
130 t2SMUSD, t2SMUSDX, t2UMAAL, t2UMLAL, t2UMULL, tMUL,
131 };
132 for (auto op : isMultiplyList) {
133 Info[op].IsMultiply = true;
134 }
135
136 std::initializer_list<unsigned> isMVEIntMACList = {
137 MVE_VMLAS_qr_i16, MVE_VMLAS_qr_i32, MVE_VMLAS_qr_i8,
138 MVE_VMLA_qr_i16, MVE_VMLA_qr_i32, MVE_VMLA_qr_i8,
139 MVE_VQDMLAH_qrs16, MVE_VQDMLAH_qrs32, MVE_VQDMLAH_qrs8,
140 MVE_VQDMLASH_qrs16, MVE_VQDMLASH_qrs32, MVE_VQDMLASH_qrs8,
141 MVE_VQRDMLAH_qrs16, MVE_VQRDMLAH_qrs32, MVE_VQRDMLAH_qrs8,
142 MVE_VQRDMLASH_qrs16, MVE_VQRDMLASH_qrs32, MVE_VQRDMLASH_qrs8,
143 MVE_VQDMLADHXs16, MVE_VQDMLADHXs32, MVE_VQDMLADHXs8,
144 MVE_VQDMLADHs16, MVE_VQDMLADHs32, MVE_VQDMLADHs8,
145 MVE_VQDMLSDHXs16, MVE_VQDMLSDHXs32, MVE_VQDMLSDHXs8,
146 MVE_VQDMLSDHs16, MVE_VQDMLSDHs32, MVE_VQDMLSDHs8,
147 MVE_VQRDMLADHXs16, MVE_VQRDMLADHXs32, MVE_VQRDMLADHXs8,
148 MVE_VQRDMLADHs16, MVE_VQRDMLADHs32, MVE_VQRDMLADHs8,
149 MVE_VQRDMLSDHXs16, MVE_VQRDMLSDHXs32, MVE_VQRDMLSDHXs8,
150 MVE_VQRDMLSDHs16, MVE_VQRDMLSDHs32, MVE_VQRDMLSDHs8,
151 };
152 for (auto op : isMVEIntMACList) {
153 Info[op].IsMVEIntMAC = true;
154 }
155
156 std::initializer_list<unsigned> isNonSubwordLoadList = {
157 t2LDRi12, t2LDRi8, t2LDR_POST, t2LDR_PRE, t2LDRpci,
158 t2LDRs, t2LDRDi8, t2LDRD_POST, t2LDRD_PRE, tLDRi,
159 tLDRpci, tLDRr, tLDRspi,
160 };
161 for (auto op : isNonSubwordLoadList) {
162 Info[op].IsNonSubwordLoad = true;
163 }
164
165 std::initializer_list<unsigned> isRevList = {
166 t2REV, t2REV16, t2REVSH, t2RBIT, tREV, tREV16, tREVSH,
167 };
168 for (auto op : isRevList) {
169 Info[op].IsRev = true;
170 }
171
172 std::initializer_list<unsigned> isShiftList = {
173 t2ASRri, t2ASRrr, t2LSLri, t2LSLrr, t2LSRri, t2LSRrr, t2RORri, t2RORrr,
174 tASRri, tASRrr, tLSLSri, tLSLri, tLSLrr, tLSRri, tLSRrr, tROR,
175 };
176 for (auto op : isShiftList) {
177 Info[op].IsShift = true;
178 }
179
180 std::initializer_list<unsigned> Address1List = {
181 t2LDRBi12,
182 t2LDRBi8,
183 t2LDRBpci,
184 t2LDRBs,
185 t2LDRHi12,
186 t2LDRHi8,
187 t2LDRHpci,
188 t2LDRHs,
189 t2LDRSBi12,
190 t2LDRSBi8,
191 t2LDRSBpci,
192 t2LDRSBs,
193 t2LDRSHi12,
194 t2LDRSHi8,
195 t2LDRSHpci,
196 t2LDRSHs,
197 t2LDRi12,
198 t2LDRi8,
199 t2LDRpci,
200 t2LDRs,
201 tLDRBi,
202 tLDRBr,
203 tLDRHi,
204 tLDRHr,
205 tLDRSB,
206 tLDRSH,
207 tLDRi,
208 tLDRpci,
209 tLDRr,
210 tLDRspi,
211 t2STRBi12,
212 t2STRBi8,
213 t2STRBs,
214 t2STRHi12,
215 t2STRHi8,
216 t2STRHs,
217 t2STRi12,
218 t2STRi8,
219 t2STRs,
220 tSTRBi,
221 tSTRBr,
222 tSTRHi,
223 tSTRHr,
224 tSTRi,
225 tSTRr,
226 tSTRspi,
227 VLDRD,
228 VLDRH,
229 VLDRS,
230 VSTRD,
231 VSTRH,
232 VSTRS,
233 MVE_VLD20_16,
234 MVE_VLD20_32,
235 MVE_VLD20_8,
236 MVE_VLD21_16,
237 MVE_VLD21_32,
238 MVE_VLD21_8,
239 MVE_VLD40_16,
240 MVE_VLD40_32,
241 MVE_VLD40_8,
242 MVE_VLD41_16,
243 MVE_VLD41_32,
244 MVE_VLD41_8,
245 MVE_VLD42_16,
246 MVE_VLD42_32,
247 MVE_VLD42_8,
248 MVE_VLD43_16,
249 MVE_VLD43_32,
250 MVE_VLD43_8,
251 MVE_VLDRBS16,
252 MVE_VLDRBS16_rq,
253 MVE_VLDRBS32,
254 MVE_VLDRBS32_rq,
255 MVE_VLDRBU16,
256 MVE_VLDRBU16_rq,
257 MVE_VLDRBU32,
258 MVE_VLDRBU32_rq,
259 MVE_VLDRBU8,
260 MVE_VLDRBU8_rq,
261 MVE_VLDRDU64_qi,
262 MVE_VLDRDU64_rq,
263 MVE_VLDRDU64_rq_u,
264 MVE_VLDRHS32,
265 MVE_VLDRHS32_rq,
266 MVE_VLDRHS32_rq_u,
267 MVE_VLDRHU16,
268 MVE_VLDRHU16_rq,
269 MVE_VLDRHU16_rq_u,
270 MVE_VLDRHU32,
271 MVE_VLDRHU32_rq,
272 MVE_VLDRHU32_rq_u,
273 MVE_VLDRWU32,
274 MVE_VLDRWU32_qi,
275 MVE_VLDRWU32_rq,
276 MVE_VLDRWU32_rq_u,
277 MVE_VST20_16,
278 MVE_VST20_32,
279 MVE_VST20_8,
280 MVE_VST21_16,
281 MVE_VST21_32,
282 MVE_VST21_8,
283 MVE_VST40_16,
284 MVE_VST40_32,
285 MVE_VST40_8,
286 MVE_VST41_16,
287 MVE_VST41_32,
288 MVE_VST41_8,
289 MVE_VST42_16,
290 MVE_VST42_32,
291 MVE_VST42_8,
292 MVE_VST43_16,
293 MVE_VST43_32,
294 MVE_VST43_8,
295 MVE_VSTRB16,
296 MVE_VSTRB16_rq,
297 MVE_VSTRB32,
298 MVE_VSTRB32_rq,
299 MVE_VSTRBU8,
300 MVE_VSTRB8_rq,
301 MVE_VSTRD64_qi,
302 MVE_VSTRD64_rq,
303 MVE_VSTRD64_rq_u,
304 MVE_VSTRH32,
305 MVE_VSTRH32_rq,
306 MVE_VSTRH32_rq_u,
307 MVE_VSTRHU16,
308 MVE_VSTRH16_rq,
309 MVE_VSTRH16_rq_u,
310 MVE_VSTRWU32,
311 MVE_VSTRW32_qi,
312 MVE_VSTRW32_rq,
313 MVE_VSTRW32_rq_u,
314 };
315 std::initializer_list<unsigned> Address2List = {
316 t2LDRB_POST,
317 t2LDRB_PRE,
318 t2LDRDi8,
319 t2LDRH_POST,
320 t2LDRH_PRE,
321 t2LDRSB_POST,
322 t2LDRSB_PRE,
323 t2LDRSH_POST,
324 t2LDRSH_PRE,
325 t2LDR_POST,
326 t2LDR_PRE,
327 t2STRB_POST,
328 t2STRB_PRE,
329 t2STRDi8,
330 t2STRH_POST,
331 t2STRH_PRE,
332 t2STR_POST,
333 t2STR_PRE,
334 MVE_VLD20_16_wb,
335 MVE_VLD20_32_wb,
336 MVE_VLD20_8_wb,
337 MVE_VLD21_16_wb,
338 MVE_VLD21_32_wb,
339 MVE_VLD21_8_wb,
340 MVE_VLD40_16_wb,
341 MVE_VLD40_32_wb,
342 MVE_VLD40_8_wb,
343 MVE_VLD41_16_wb,
344 MVE_VLD41_32_wb,
345 MVE_VLD41_8_wb,
346 MVE_VLD42_16_wb,
347 MVE_VLD42_32_wb,
348 MVE_VLD42_8_wb,
349 MVE_VLD43_16_wb,
350 MVE_VLD43_32_wb,
351 MVE_VLD43_8_wb,
352 MVE_VLDRBS16_post,
353 MVE_VLDRBS16_pre,
354 MVE_VLDRBS32_post,
355 MVE_VLDRBS32_pre,
356 MVE_VLDRBU16_post,
357 MVE_VLDRBU16_pre,
358 MVE_VLDRBU32_post,
359 MVE_VLDRBU32_pre,
360 MVE_VLDRBU8_post,
361 MVE_VLDRBU8_pre,
362 MVE_VLDRDU64_qi_pre,
363 MVE_VLDRHS32_post,
364 MVE_VLDRHS32_pre,
365 MVE_VLDRHU16_post,
366 MVE_VLDRHU16_pre,
367 MVE_VLDRHU32_post,
368 MVE_VLDRHU32_pre,
369 MVE_VLDRWU32_post,
370 MVE_VLDRWU32_pre,
371 MVE_VLDRWU32_qi_pre,
372 MVE_VST20_16_wb,
373 MVE_VST20_32_wb,
374 MVE_VST20_8_wb,
375 MVE_VST21_16_wb,
376 MVE_VST21_32_wb,
377 MVE_VST21_8_wb,
378 MVE_VST40_16_wb,
379 MVE_VST40_32_wb,
380 MVE_VST40_8_wb,
381 MVE_VST41_16_wb,
382 MVE_VST41_32_wb,
383 MVE_VST41_8_wb,
384 MVE_VST42_16_wb,
385 MVE_VST42_32_wb,
386 MVE_VST42_8_wb,
387 MVE_VST43_16_wb,
388 MVE_VST43_32_wb,
389 MVE_VST43_8_wb,
390 MVE_VSTRB16_post,
391 MVE_VSTRB16_pre,
392 MVE_VSTRB32_post,
393 MVE_VSTRB32_pre,
394 MVE_VSTRBU8_post,
395 MVE_VSTRBU8_pre,
396 MVE_VSTRD64_qi_pre,
397 MVE_VSTRH32_post,
398 MVE_VSTRH32_pre,
399 MVE_VSTRHU16_post,
400 MVE_VSTRHU16_pre,
401 MVE_VSTRWU32_post,
402 MVE_VSTRWU32_pre,
403 MVE_VSTRW32_qi_pre,
404 };
405 std::initializer_list<unsigned> Address3List = {
406 t2LDRD_POST,
407 t2LDRD_PRE,
408 t2STRD_POST,
409 t2STRD_PRE,
410 };
411 // Compute a mask of which operands are involved in address computation
412 for (auto &op : Address1List) {
413 Info[op].AddressOpMask = 0x6;
414 }
415 for (auto &op : Address2List) {
416 Info[op].AddressOpMask = 0xc;
417 }
418 for (auto &op : Address3List) {
419 Info[op].AddressOpMask = 0x18;
420 }
421 for (auto &op : hasBRegAddrShiftList) {
422 Info[op].AddressOpMask |= 0x8;
423 }
424}
425
426void InstructionInformation::markDPProducersConsumers(
427 const ARMBaseInstrInfo *TII) {
428 // Learn about all instructions which have FP source/dest registers
429 for (unsigned MI = 0; MI < ARM::INSTRUCTION_LIST_END; ++MI) {
430 const MCInstrDesc &MID = TII->get(MI);
431 auto Operands = MID.operands();
432 for (unsigned OI = 0, OIE = MID.getNumOperands(); OI != OIE; ++OI) {
433 bool MarkQP = false, MarkDP = false, MarkSP = false;
434 switch (Operands[OI].RegClass) {
435 case ARM::MQPRRegClassID:
436 case ARM::DPRRegClassID:
437 case ARM::DPR_8RegClassID:
438 case ARM::DPR_VFP2RegClassID:
439 case ARM::DPairRegClassID:
440 case ARM::DPairSpcRegClassID:
441 case ARM::DQuadRegClassID:
442 case ARM::DQuadSpcRegClassID:
443 case ARM::DTripleRegClassID:
444 case ARM::DTripleSpcRegClassID:
445 MarkDP = true;
446 break;
447 case ARM::QPRRegClassID:
448 case ARM::QPR_8RegClassID:
449 case ARM::QPR_VFP2RegClassID:
450 case ARM::QQPRRegClassID:
451 case ARM::QQQQPRRegClassID:
452 MarkQP = true;
453 break;
454 case ARM::SPRRegClassID:
455 case ARM::SPR_8RegClassID:
456 case ARM::FPWithVPRRegClassID:
457 MarkSP = true;
458 break;
459 default:
460 break;
461 }
462 if (MarkQP) {
463 if (OI < MID.getNumDefs())
464 Info[MI].ProducesQP = true;
465 else
466 Info[MI].ConsumesQP = true;
467 }
468 if (MarkDP) {
469 if (OI < MID.getNumDefs())
470 Info[MI].ProducesDP = true;
471 else
472 Info[MI].ConsumesDP = true;
473 }
474 if (MarkSP) {
475 if (OI < MID.getNumDefs())
476 Info[MI].ProducesSP = true;
477 else
478 Info[MI].ConsumesSP = true;
479 }
480 }
481 }
482}
483
484} // anonymous namespace
485
486static bool hasImplicitCPSRUse(const MachineInstr *MI) {
487 return MI->getDesc().hasImplicitUseOfPhysReg(ARM::CPSR);
488}
489
491 unsigned latency) {
492 SDep Reverse = SrcDep;
493 Reverse.setSUnit(&SrcSU);
494 for (SDep &PDep : SrcDep.getSUnit()->Preds) {
495 if (PDep == Reverse) {
496 PDep.setLatency(latency);
497 SrcDep.getSUnit()->setDepthDirty();
498 break;
499 }
500 }
501 SrcDep.setLatency(latency);
502 SrcSU.setHeightDirty();
503}
504
506 return (a & 0xe) != (b & 0xe);
507}
508
509// Set output dependences to zero latency for processors which can
510// simultaneously issue to the same register. Returns true if a change
511// was made.
513 if (Dep.getKind() == SDep::Output) {
514 setBidirLatencies(ISU, Dep, 0);
515 return true;
516 }
517 return false;
518}
519
520// The graph doesn't look inside of bundles to determine their
521// scheduling boundaries and reports zero latency into and out of them
522// (except for CPSR into the bundle, which has latency 1).
523// Make some better scheduling assumptions:
524// 1) CPSR uses have zero latency; other uses have incoming latency 1
525// 2) CPSR defs retain a latency of zero; others have a latency of 1.
526//
527// Returns 1 if a use change was made; 2 if a def change was made; 0 otherwise
529
530 SUnit &DepSU = *Dep.getSUnit();
531 const MachineInstr *SrcMI = ISU.getInstr();
532 unsigned SrcOpcode = SrcMI->getOpcode();
533 const MachineInstr *DstMI = DepSU.getInstr();
534 unsigned DstOpcode = DstMI->getOpcode();
535
536 if (DstOpcode == ARM::BUNDLE && TII->isPredicated(*DstMI)) {
538 ISU, Dep,
539 (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR) ? 0 : 1);
540 return 1;
541 }
542 if (SrcOpcode == ARM::BUNDLE && TII->isPredicated(*SrcMI) &&
543 Dep.isAssignedRegDep() && Dep.getReg() != ARM::CPSR) {
544 setBidirLatencies(ISU, Dep, 1);
545 return 2;
546 }
547 return 0;
548}
549
550// Determine whether there is a memory RAW hazard here and set up latency
551// accordingly
553 unsigned latency) {
554 if (!Dep.isNormalMemory())
555 return false;
556 auto &SrcInst = *ISU.getInstr();
557 auto &DstInst = *Dep.getSUnit()->getInstr();
558 if (!SrcInst.mayStore() || !DstInst.mayLoad())
559 return false;
560
561 auto SrcMO = *SrcInst.memoperands().begin();
562 auto DstMO = *DstInst.memoperands().begin();
563 auto SrcVal = SrcMO->getValue();
564 auto DstVal = DstMO->getValue();
565 auto SrcPseudoVal = SrcMO->getPseudoValue();
566 auto DstPseudoVal = DstMO->getPseudoValue();
567 if (SrcVal && DstVal && AA->alias(SrcVal, DstVal) == AliasResult::MustAlias &&
568 SrcMO->getOffset() == DstMO->getOffset()) {
569 setBidirLatencies(ISU, Dep, latency);
570 return true;
571 } else if (SrcPseudoVal && DstPseudoVal &&
572 SrcPseudoVal->kind() == DstPseudoVal->kind() &&
573 SrcPseudoVal->kind() == PseudoSourceValue::FixedStack) {
574 // Spills/fills
575 auto FS0 = cast<FixedStackPseudoSourceValue>(SrcPseudoVal);
576 auto FS1 = cast<FixedStackPseudoSourceValue>(DstPseudoVal);
577 if (FS0 == FS1) {
578 setBidirLatencies(ISU, Dep, latency);
579 return true;
580 }
581 }
582 return false;
583}
584
585namespace {
586
587std::unique_ptr<InstructionInformation> II;
588
589class CortexM7InstructionInformation : public InstructionInformation {
590public:
591 CortexM7InstructionInformation(const ARMBaseInstrInfo *TII)
592 : InstructionInformation(TII) {}
593};
594
595class CortexM7Overrides : public ARMOverrideBypasses {
596public:
597 CortexM7Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
598 : ARMOverrideBypasses(TII, AA) {
599 if (!II)
600 II.reset(new CortexM7InstructionInformation(TII));
601 }
602
603 void modifyBypasses(SUnit &) override;
604};
605
606void CortexM7Overrides::modifyBypasses(SUnit &ISU) {
607 const MachineInstr *SrcMI = ISU.getInstr();
608 unsigned SrcOpcode = SrcMI->getOpcode();
609 bool isNSWload = II->isNonSubwordLoad(SrcOpcode);
610
611 // Walk the successors looking for latency overrides that are needed
612 for (SDep &Dep : ISU.Succs) {
613
614 // Output dependences should have 0 latency, as M7 is able to
615 // schedule writers to the same register for simultaneous issue.
616 if (zeroOutputDependences(ISU, Dep))
617 continue;
618
619 if (memoryRAWHazard(ISU, Dep, 4))
620 continue;
621
622 // Ignore dependencies other than data
623 if (Dep.getKind() != SDep::Data)
624 continue;
625
626 SUnit &DepSU = *Dep.getSUnit();
627 if (DepSU.isBoundaryNode())
628 continue;
629
630 if (makeBundleAssumptions(ISU, Dep) == 1)
631 continue;
632
633 const MachineInstr *DstMI = DepSU.getInstr();
634 unsigned DstOpcode = DstMI->getOpcode();
635
636 // Word loads into any multiply or divide instruction are considered
637 // cannot bypass their scheduling stage. Didn't do this in the .td file
638 // because we cannot easily create a read advance that is 0 from certain
639 // writer classes and 1 from all the rest.
640 // (The other way around would have been easy.)
641 if (isNSWload && (II->isMultiply(DstOpcode) || II->isDivide(DstOpcode)))
642 setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
643
644 // Word loads into B operand of a load/store are considered cannot bypass
645 // their scheduling stage. Cannot do in the .td file because
646 // need to decide between -1 and -2 for ReadAdvance
647 if (isNSWload && II->hasBRegAddr(DstOpcode) &&
648 DstMI->getOperand(2).getReg() == Dep.getReg())
649 setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
650
651 // Multiplies into any address generation cannot bypass from EX3. Cannot do
652 // in the .td file because need to decide between -1 and -2 for ReadAdvance
653 if (II->isMultiply(SrcOpcode)) {
654 unsigned OpMask = II->getAddressOpMask(DstOpcode) >> 1;
655 for (unsigned i = 1; OpMask; ++i, OpMask >>= 1) {
656 if ((OpMask & 1) && DstMI->getOperand(i).isReg() &&
657 DstMI->getOperand(i).getReg() == Dep.getReg()) {
658 setBidirLatencies(ISU, Dep, 4); // first legal bypass is EX4->EX1
659 break;
660 }
661 }
662 }
663
664 // Mismatched conditional producers take longer on M7; they end up looking
665 // like they were produced at EX3 and read at IS.
666 if (TII->isPredicated(*SrcMI) && Dep.isAssignedRegDep() &&
667 (SrcOpcode == ARM::BUNDLE ||
668 mismatchedPred(TII->getPredicate(*SrcMI),
669 TII->getPredicate(*DstMI)))) {
670 unsigned Lat = 1;
671 // Operand A of shift+ALU is treated as an EX1 read instead of EX2.
672 if (II->isInlineShiftALU(DstOpcode) && DstMI->getOperand(3).getImm() &&
673 DstMI->getOperand(1).getReg() == Dep.getReg())
674 Lat = 2;
675 Lat = std::min(3u, Dep.getLatency() + Lat);
676 setBidirLatencies(ISU, Dep, std::max(Dep.getLatency(), Lat));
677 }
678
679 // CC setter into conditional producer shouldn't have a latency of more
680 // than 1 unless it's due to an implicit read. (All the "true" readers
681 // of the condition code use an implicit read, and predicates use an
682 // explicit.)
683 if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
684 TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
685 setBidirLatencies(ISU, Dep, 1);
686
687 // REV instructions cannot bypass directly into the EX1 shifter. The
688 // code is slightly inexact as it doesn't attempt to ensure that the bypass
689 // is to the shifter operands.
690 if (II->isRev(SrcOpcode)) {
691 if (II->isInlineShiftALU(DstOpcode))
692 setBidirLatencies(ISU, Dep, 2);
693 else if (II->isShift(DstOpcode))
694 setBidirLatencies(ISU, Dep, 1);
695 }
696 }
697}
698
699class M85InstructionInformation : public InstructionInformation {
700public:
701 M85InstructionInformation(const ARMBaseInstrInfo *t)
702 : InstructionInformation(t) {
703 markDPProducersConsumers(t);
704 }
705};
706
707class M85Overrides : public ARMOverrideBypasses {
708public:
709 M85Overrides(const ARMBaseInstrInfo *t, AAResults *a)
710 : ARMOverrideBypasses(t, a) {
711 if (!II)
712 II.reset(new M85InstructionInformation(t));
713 }
714
715 void modifyBypasses(SUnit &) override;
716
717private:
718 unsigned computeBypassStage(const MCSchedClassDesc *SCD);
719 signed modifyMixedWidthFP(const MachineInstr *SrcMI,
720 const MachineInstr *DstMI, unsigned RegID,
721 const MCSchedClassDesc *SCD);
722};
723
724unsigned M85Overrides::computeBypassStage(const MCSchedClassDesc *SCDesc) {
725 auto SM = DAG->getSchedModel();
726 unsigned DefIdx = 0; // just look for the first output's timing
727 if (DefIdx < SCDesc->NumWriteLatencyEntries) {
728 // Lookup the definition's write latency in SubtargetInfo.
729 const MCWriteLatencyEntry *WLEntry =
730 SM->getSubtargetInfo()->getWriteLatencyEntry(SCDesc, DefIdx);
731 unsigned Latency = WLEntry->Cycles >= 0 ? WLEntry->Cycles : 1000;
732 if (Latency == 4)
733 return 2;
734 else if (Latency == 5)
735 return 3;
736 else if (Latency > 3)
737 return 3;
738 else
739 return Latency;
740 }
741 return 2;
742}
743
744// Latency changes for bypassing between FP registers of different sizes:
745//
746// Note that mixed DP/SP are unlikely because of the semantics
747// of C. Mixed MVE/SP are quite common when MVE intrinsics are used.
748signed M85Overrides::modifyMixedWidthFP(const MachineInstr *SrcMI,
749 const MachineInstr *DstMI,
750 unsigned RegID,
751 const MCSchedClassDesc *SCD) {
752
753 if (!II->producesSP(SrcMI->getOpcode()) &&
754 !II->producesDP(SrcMI->getOpcode()) &&
755 !II->producesQP(SrcMI->getOpcode()))
756 return 0;
757
758 if (Register::isVirtualRegister(RegID)) {
759 if (II->producesSP(SrcMI->getOpcode()) &&
760 II->consumesDP(DstMI->getOpcode())) {
761 for (auto &OP : SrcMI->operands())
762 if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
763 OP.getSubReg() == ARM::ssub_1)
764 return 5 - computeBypassStage(SCD);
765 } else if (II->producesSP(SrcMI->getOpcode()) &&
766 II->consumesQP(DstMI->getOpcode())) {
767 for (auto &OP : SrcMI->operands())
768 if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
769 (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
770 return 5 - computeBypassStage(SCD) -
771 ((OP.getSubReg() == ARM::ssub_2 ||
772 OP.getSubReg() == ARM::ssub_3)
773 ? 1
774 : 0);
775 } else if (II->producesDP(SrcMI->getOpcode()) &&
776 II->consumesQP(DstMI->getOpcode())) {
777 for (auto &OP : SrcMI->operands())
778 if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
779 OP.getSubReg() == ARM::ssub_1)
780 return -1;
781 } else if (II->producesDP(SrcMI->getOpcode()) &&
782 II->consumesSP(DstMI->getOpcode())) {
783 for (auto &OP : DstMI->operands())
784 if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
785 OP.getSubReg() == ARM::ssub_1)
786 return 5 - computeBypassStage(SCD);
787 } else if (II->producesQP(SrcMI->getOpcode()) &&
788 II->consumesSP(DstMI->getOpcode())) {
789 for (auto &OP : DstMI->operands())
790 if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
791 (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
792 return 5 - computeBypassStage(SCD) +
793 ((OP.getSubReg() == ARM::ssub_2 ||
794 OP.getSubReg() == ARM::ssub_3)
795 ? 1
796 : 0);
797 } else if (II->producesQP(SrcMI->getOpcode()) &&
798 II->consumesDP(DstMI->getOpcode())) {
799 for (auto &OP : DstMI->operands())
800 if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
801 OP.getSubReg() == ARM::ssub_1)
802 return 1;
803 }
804 } else if (Register::isPhysicalRegister(RegID)) {
805 // Note that when the producer is narrower, not all of the producers
806 // may be present in the scheduling graph; somewhere earlier in the
807 // compiler, an implicit def/use of the aliased full register gets
808 // added to the producer, and so only that producer is seen as *the*
809 // single producer. This behavior also has the unfortunate effect of
810 // serializing the producers in the compiler's view of things.
811 if (II->producesSP(SrcMI->getOpcode()) &&
812 II->consumesDP(DstMI->getOpcode())) {
813 for (auto &OP : SrcMI->operands())
814 if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
815 OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
816 (OP.getReg() == RegID ||
817 (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
818 (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
819 return 5 - computeBypassStage(SCD);
820 } else if (II->producesSP(SrcMI->getOpcode()) &&
821 II->consumesQP(DstMI->getOpcode())) {
822 for (auto &OP : SrcMI->operands())
823 if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
824 OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
825 (OP.getReg() == RegID ||
826 (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
827 (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
828 return 5 - computeBypassStage(SCD) -
829 (((OP.getReg() - ARM::S0) / 2) % 2 ? 1 : 0);
830 } else if (II->producesDP(SrcMI->getOpcode()) &&
831 II->consumesQP(DstMI->getOpcode())) {
832 for (auto &OP : SrcMI->operands())
833 if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::D0 &&
834 OP.getReg() <= ARM::D15 && (OP.getReg() - ARM::D0) % 2 &&
835 (OP.getReg() == RegID ||
836 (OP.getReg() - ARM::D0) / 2 + ARM::Q0 == RegID))
837 return -1;
838 } else if (II->producesDP(SrcMI->getOpcode()) &&
839 II->consumesSP(DstMI->getOpcode())) {
840 if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
841 return 5 - computeBypassStage(SCD);
842 } else if (II->producesQP(SrcMI->getOpcode()) &&
843 II->consumesSP(DstMI->getOpcode())) {
844 if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
845 return 5 - computeBypassStage(SCD) +
846 (((RegID - ARM::S0) / 2) % 2 ? 1 : 0);
847 } else if (II->producesQP(SrcMI->getOpcode()) &&
848 II->consumesDP(DstMI->getOpcode())) {
849 if (RegID >= ARM::D1 && RegID <= ARM::D15 && (RegID - ARM::D0) % 2)
850 return 1;
851 }
852 }
853 return 0;
854}
855
856void M85Overrides::modifyBypasses(SUnit &ISU) {
857 const MachineInstr *SrcMI = ISU.getInstr();
858 unsigned SrcOpcode = SrcMI->getOpcode();
859 bool isNSWload = II->isNonSubwordLoad(SrcOpcode);
860
861 // Walk the successors looking for latency overrides that are needed
862 for (SDep &Dep : ISU.Succs) {
863
864 // Output dependences should have 0 latency, as CortexM85 is able to
865 // schedule writers to the same register for simultaneous issue.
866 if (zeroOutputDependences(ISU, Dep))
867 continue;
868
869 if (memoryRAWHazard(ISU, Dep, 3))
870 continue;
871
872 // Ignore dependencies other than data or strong ordering.
873 if (Dep.getKind() != SDep::Data)
874 continue;
875
876 SUnit &DepSU = *Dep.getSUnit();
877 if (DepSU.isBoundaryNode())
878 continue;
879
880 if (makeBundleAssumptions(ISU, Dep) == 1)
881 continue;
882
883 const MachineInstr *DstMI = DepSU.getInstr();
884 unsigned DstOpcode = DstMI->getOpcode();
885
886 // Word loads into B operand of a load/store with cannot bypass their
887 // scheduling stage. Cannot do in the .td file because need to decide
888 // between -1 and -2 for ReadAdvance
889
890 if (isNSWload && II->hasBRegAddrShift(DstOpcode) &&
891 DstMI->getOperand(3).getImm() != 0 && // shift operand
892 DstMI->getOperand(2).getReg() == Dep.getReg())
893 setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
894
895 if (isNSWload && isMVEVectorInstruction(DstMI)) {
896 setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
897 }
898
899 if (II->isMVEIntMAC(DstOpcode) &&
900 II->isMVEIntMACMatched(SrcOpcode, DstOpcode) &&
901 DstMI->getOperand(0).isReg() &&
902 DstMI->getOperand(0).getReg() == Dep.getReg())
903 setBidirLatencies(ISU, Dep, Dep.getLatency() - 1);
904
905 // CC setter into conditional producer shouldn't have a latency of more
906 // than 0 unless it's due to an implicit read.
907 if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
908 TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
909 setBidirLatencies(ISU, Dep, 0);
910
911 if (signed ALat = modifyMixedWidthFP(SrcMI, DstMI, Dep.getReg(),
912 DAG->getSchedClass(&ISU)))
913 setBidirLatencies(ISU, Dep, std::max(0, signed(Dep.getLatency()) + ALat));
914
915 if (II->isRev(SrcOpcode)) {
916 if (II->isInlineShiftALU(DstOpcode))
917 setBidirLatencies(ISU, Dep, 1);
918 else if (II->isShift(DstOpcode))
919 setBidirLatencies(ISU, Dep, 1);
920 }
921 }
922}
923
924// Add M55 specific overrides for latencies between instructions. Currently it:
925// - Adds an extra cycle latency between MVE VMLAV and scalar instructions.
926class CortexM55Overrides : public ARMOverrideBypasses {
927public:
928 CortexM55Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
929 : ARMOverrideBypasses(TII, AA) {}
930
931 void modifyBypasses(SUnit &SU) override {
932 MachineInstr *SrcMI = SU.getInstr();
933 if (!(SrcMI->getDesc().TSFlags & ARMII::HorizontalReduction))
934 return;
935
936 for (SDep &Dep : SU.Succs) {
937 if (Dep.getKind() != SDep::Data)
938 continue;
939 SUnit &DepSU = *Dep.getSUnit();
940 if (DepSU.isBoundaryNode())
941 continue;
942 MachineInstr *DstMI = DepSU.getInstr();
943
944 if (!isMVEVectorInstruction(DstMI) && !DstMI->mayStore())
945 setBidirLatencies(SU, Dep, 3);
946 }
947 }
948};
949
950} // end anonymous namespace
951
953 DAG = DAGInstrs;
954 for (SUnit &ISU : DAGInstrs->SUnits) {
955 if (ISU.isBoundaryNode())
956 continue;
957 modifyBypasses(ISU);
958 }
959 if (DAGInstrs->ExitSU.getInstr())
960 modifyBypasses(DAGInstrs->ExitSU);
961}
962
963std::unique_ptr<ScheduleDAGMutation>
965 if (ST.isCortexM85())
966 return std::make_unique<M85Overrides>(ST.getInstrInfo(), AA);
967 else if (ST.isCortexM7())
968 return std::make_unique<CortexM7Overrides>(ST.getInstrInfo(), AA);
969 else if (ST.isCortexM55())
970 return std::make_unique<CortexM55Overrides>(ST.getInstrInfo(), AA);
971
972 return nullptr;
973}
974
975} // end namespace llvm
Function Alias Analysis false
#define op(i)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
static constexpr unsigned SM(unsigned Version)
uint64_t IntrinsicInst * II
#define OP(OPC)
Definition Instruction.h:46
bool memoryRAWHazard(SUnit &ISU, SDep &Dep, unsigned latency)
static void setBidirLatencies(SUnit &SrcSU, SDep &SrcDep, unsigned latency)
static bool zeroOutputDependences(SUnit &ISU, SDep &Dep)
void apply(ScheduleDAGInstrs *DAGInstrs) override
unsigned makeBundleAssumptions(SUnit &ISU, SDep &Dep)
const ARMBaseInstrInfo * TII
@ MustAlias
The two locations precisely alias each other.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Scheduling dependency.
Definition ScheduleDAG.h:51
SUnit * getSUnit() const
Kind getKind() const
Returns an enum value representing the kind of the dependence.
@ Output
A register output-dependence (aka WAW).
Definition ScheduleDAG.h:57
void setLatency(unsigned Lat)
Sets the latency for this edge.
bool isAssignedRegDep() const
Tests if this is a Data dependence that is associated with a register.
bool isNormalMemory() const
Tests if this is an Order dependence between two memory accesses where both sides of the dependence a...
Register getReg() const
Returns the register associated with this edge.
Scheduling unit. This is a node in the scheduling DAG.
LLVM_ABI void setHeightDirty()
Sets a flag in this node to indicate that its stored Height value will require recomputation the next...
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
LLVM_ABI void setDepthDirty()
Sets a flag in this node to indicate that its stored Depth value will require recomputation the next ...
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
A ScheduleDAG for scheduling lists of MachineInstr.
std::vector< SUnit > SUnits
The scheduling units.
SUnit ExitSU
Special node for the region exit.
Abstract Attribute helper functions.
Definition Attributor.h:165
@ ARM
Windows AXP64.
Definition MCAsmInfo.h:47
This is an optimization pass for GlobalISel generic memory operations.
std::unique_ptr< ScheduleDAGMutation > createARMLatencyMutations(const ARMSubtarget &ST, AAResults *AA)
bool isMVEVectorInstruction(const MachineInstr *MI)
static bool hasImplicitCPSRUse(const MachineInstr *MI)
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
static bool mismatchedPred(ARMCC::CondCodes a, ARMCC::CondCodes b)