LLVM 17.0.0git
AArch64FalkorHWPFFix.cpp
Go to the documentation of this file.
//===- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file For Falkor, we want to avoid HW prefetcher instruction tag collisions
/// that may inhibit the HW prefetching.  This is done in two steps.  Before
/// ISel, we mark strided loads (i.e. those that will likely benefit from
/// prefetching) with metadata.  Then, after opcodes have been finalized, we
/// insert MOVs and re-write loads to prevent unintentional tag collisions.
// ===---------------------------------------------------------------------===//

14
#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <iterator>
#include <optional>
#include <utility>
52
53using namespace llvm;
54
55#define DEBUG_TYPE "aarch64-falkor-hwpf-fix"
56
57STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked");
58STATISTIC(NumCollisionsAvoided,
59 "Number of HW prefetch tag collisions avoided");
60STATISTIC(NumCollisionsNotAvoided,
61 "Number of HW prefetch tag collisions not avoided due to lack of registers");
62DEBUG_COUNTER(FixCounter, "falkor-hwpf",
63 "Controls which tag collisions are avoided");
64
65namespace {
66
67class FalkorMarkStridedAccesses {
68public:
69 FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE)
70 : LI(LI), SE(SE) {}
71
72 bool run();
73
74private:
75 bool runOnLoop(Loop &L);
76
77 LoopInfo &LI;
79};
80
81class FalkorMarkStridedAccessesLegacy : public FunctionPass {
82public:
83 static char ID; // Pass ID, replacement for typeid
84
85 FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) {
88 }
89
90 void getAnalysisUsage(AnalysisUsage &AU) const override {
97 }
98
99 bool runOnFunction(Function &F) override;
100};
101
102} // end anonymous namespace
103
104char FalkorMarkStridedAccessesLegacy::ID = 0;
105
106INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
107 "Falkor HW Prefetch Fix", false, false)
111INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
112 "Falkor HW Prefetch Fix", false, false)
113
115 return new FalkorMarkStridedAccessesLegacy();
116}
117
118bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) {
119 TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
120 const AArch64Subtarget *ST =
121 TPC.getTM<AArch64TargetMachine>().getSubtargetImpl(F);
122 if (ST->getProcFamily() != AArch64Subtarget::Falkor)
123 return false;
124
125 if (skipFunction(F))
126 return false;
127
128 LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
129 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
130
131 FalkorMarkStridedAccesses LDP(LI, SE);
132 return LDP.run();
133}
134
135bool FalkorMarkStridedAccesses::run() {
136 bool MadeChange = false;
137
138 for (Loop *L : LI)
139 for (Loop *LIt : depth_first(L))
140 MadeChange |= runOnLoop(*LIt);
141
142 return MadeChange;
143}
144
145bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) {
146 // Only mark strided loads in the inner-most loop
147 if (!L.isInnermost())
148 return false;
149
150 bool MadeChange = false;
151
152 for (BasicBlock *BB : L.blocks()) {
153 for (Instruction &I : *BB) {
154 LoadInst *LoadI = dyn_cast<LoadInst>(&I);
155 if (!LoadI)
156 continue;
157
158 Value *PtrValue = LoadI->getPointerOperand();
159 if (L.isLoopInvariant(PtrValue))
160 continue;
161
162 const SCEV *LSCEV = SE.getSCEV(PtrValue);
163 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
164 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
165 continue;
166
168 MDNode::get(LoadI->getContext(), {}));
169 ++NumStridedLoadsMarked;
170 LLVM_DEBUG(dbgs() << "Load: " << I << " marked as strided\n");
171 MadeChange = true;
172 }
173 }
174
175 return MadeChange;
176}
177
178namespace {
179
180class FalkorHWPFFix : public MachineFunctionPass {
181public:
182 static char ID;
183
184 FalkorHWPFFix() : MachineFunctionPass(ID) {
186 }
187
188 bool runOnMachineFunction(MachineFunction &Fn) override;
189
190 void getAnalysisUsage(AnalysisUsage &AU) const override {
191 AU.setPreservesCFG();
194 }
195
198 MachineFunctionProperties::Property::NoVRegs);
199 }
200
201private:
202 void runOnLoop(MachineLoop &L, MachineFunction &Fn);
203
204 const AArch64InstrInfo *TII;
205 const TargetRegisterInfo *TRI;
207 bool Modified;
208};
209
210/// Bits from load opcodes used to compute HW prefetcher instruction tags.
211struct LoadInfo {
212 LoadInfo() = default;
213
214 Register DestReg;
215 Register BaseReg;
216 int BaseRegIdx = -1;
217 const MachineOperand *OffsetOpnd = nullptr;
218 bool IsPrePost = false;
219};
220
221} // end anonymous namespace
222
223char FalkorHWPFFix::ID = 0;
224
225INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late",
226 "Falkor HW Prefetch Fix Late Phase", false, false)
228INITIALIZE_PASS_END(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late",
229 "Falkor HW Prefetch Fix Late Phase", false, false)
230
/// Pack the tag fields into one integer:
///   bits [3:0]  = low 4 bits of \p Dest
///   bits [7:4]  = low 4 bits of \p Base
///   bits [13:8] = low 6 bits of \p Offset
/// Higher bits of each argument are discarded.
static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) {
  return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8);
}
234
235static std::optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
236 int DestRegIdx;
237 int BaseRegIdx;
238 int OffsetIdx;
239 bool IsPrePost;
240
241 switch (MI.getOpcode()) {
242 default:
243 return std::nullopt;
244
245 case AArch64::LD1i64:
246 case AArch64::LD2i64:
247 DestRegIdx = 0;
248 BaseRegIdx = 3;
249 OffsetIdx = -1;
250 IsPrePost = false;
251 break;
252
253 case AArch64::LD1i8:
254 case AArch64::LD1i16:
255 case AArch64::LD1i32:
256 case AArch64::LD2i8:
257 case AArch64::LD2i16:
258 case AArch64::LD2i32:
259 case AArch64::LD3i8:
260 case AArch64::LD3i16:
261 case AArch64::LD3i32:
262 case AArch64::LD3i64:
263 case AArch64::LD4i8:
264 case AArch64::LD4i16:
265 case AArch64::LD4i32:
266 case AArch64::LD4i64:
267 DestRegIdx = -1;
268 BaseRegIdx = 3;
269 OffsetIdx = -1;
270 IsPrePost = false;
271 break;
272
273 case AArch64::LD1Onev1d:
274 case AArch64::LD1Onev2s:
275 case AArch64::LD1Onev4h:
276 case AArch64::LD1Onev8b:
277 case AArch64::LD1Onev2d:
278 case AArch64::LD1Onev4s:
279 case AArch64::LD1Onev8h:
280 case AArch64::LD1Onev16b:
281 case AArch64::LD1Rv1d:
282 case AArch64::LD1Rv2s:
283 case AArch64::LD1Rv4h:
284 case AArch64::LD1Rv8b:
285 case AArch64::LD1Rv2d:
286 case AArch64::LD1Rv4s:
287 case AArch64::LD1Rv8h:
288 case AArch64::LD1Rv16b:
289 DestRegIdx = 0;
290 BaseRegIdx = 1;
291 OffsetIdx = -1;
292 IsPrePost = false;
293 break;
294
295 case AArch64::LD1Twov1d:
296 case AArch64::LD1Twov2s:
297 case AArch64::LD1Twov4h:
298 case AArch64::LD1Twov8b:
299 case AArch64::LD1Twov2d:
300 case AArch64::LD1Twov4s:
301 case AArch64::LD1Twov8h:
302 case AArch64::LD1Twov16b:
303 case AArch64::LD1Threev1d:
304 case AArch64::LD1Threev2s:
305 case AArch64::LD1Threev4h:
306 case AArch64::LD1Threev8b:
307 case AArch64::LD1Threev2d:
308 case AArch64::LD1Threev4s:
309 case AArch64::LD1Threev8h:
310 case AArch64::LD1Threev16b:
311 case AArch64::LD1Fourv1d:
312 case AArch64::LD1Fourv2s:
313 case AArch64::LD1Fourv4h:
314 case AArch64::LD1Fourv8b:
315 case AArch64::LD1Fourv2d:
316 case AArch64::LD1Fourv4s:
317 case AArch64::LD1Fourv8h:
318 case AArch64::LD1Fourv16b:
319 case AArch64::LD2Twov2s:
320 case AArch64::LD2Twov4s:
321 case AArch64::LD2Twov8b:
322 case AArch64::LD2Twov2d:
323 case AArch64::LD2Twov4h:
324 case AArch64::LD2Twov8h:
325 case AArch64::LD2Twov16b:
326 case AArch64::LD2Rv1d:
327 case AArch64::LD2Rv2s:
328 case AArch64::LD2Rv4s:
329 case AArch64::LD2Rv8b:
330 case AArch64::LD2Rv2d:
331 case AArch64::LD2Rv4h:
332 case AArch64::LD2Rv8h:
333 case AArch64::LD2Rv16b:
334 case AArch64::LD3Threev2s:
335 case AArch64::LD3Threev4h:
336 case AArch64::LD3Threev8b:
337 case AArch64::LD3Threev2d:
338 case AArch64::LD3Threev4s:
339 case AArch64::LD3Threev8h:
340 case AArch64::LD3Threev16b:
341 case AArch64::LD3Rv1d:
342 case AArch64::LD3Rv2s:
343 case AArch64::LD3Rv4h:
344 case AArch64::LD3Rv8b:
345 case AArch64::LD3Rv2d:
346 case AArch64::LD3Rv4s:
347 case AArch64::LD3Rv8h:
348 case AArch64::LD3Rv16b:
349 case AArch64::LD4Fourv2s:
350 case AArch64::LD4Fourv4h:
351 case AArch64::LD4Fourv8b:
352 case AArch64::LD4Fourv2d:
353 case AArch64::LD4Fourv4s:
354 case AArch64::LD4Fourv8h:
355 case AArch64::LD4Fourv16b:
356 case AArch64::LD4Rv1d:
357 case AArch64::LD4Rv2s:
358 case AArch64::LD4Rv4h:
359 case AArch64::LD4Rv8b:
360 case AArch64::LD4Rv2d:
361 case AArch64::LD4Rv4s:
362 case AArch64::LD4Rv8h:
363 case AArch64::LD4Rv16b:
364 DestRegIdx = -1;
365 BaseRegIdx = 1;
366 OffsetIdx = -1;
367 IsPrePost = false;
368 break;
369
370 case AArch64::LD1i64_POST:
371 case AArch64::LD2i64_POST:
372 DestRegIdx = 1;
373 BaseRegIdx = 4;
374 OffsetIdx = 5;
375 IsPrePost = true;
376 break;
377
378 case AArch64::LD1i8_POST:
379 case AArch64::LD1i16_POST:
380 case AArch64::LD1i32_POST:
381 case AArch64::LD2i8_POST:
382 case AArch64::LD2i16_POST:
383 case AArch64::LD2i32_POST:
384 case AArch64::LD3i8_POST:
385 case AArch64::LD3i16_POST:
386 case AArch64::LD3i32_POST:
387 case AArch64::LD3i64_POST:
388 case AArch64::LD4i8_POST:
389 case AArch64::LD4i16_POST:
390 case AArch64::LD4i32_POST:
391 case AArch64::LD4i64_POST:
392 DestRegIdx = -1;
393 BaseRegIdx = 4;
394 OffsetIdx = 5;
395 IsPrePost = true;
396 break;
397
398 case AArch64::LD1Onev1d_POST:
399 case AArch64::LD1Onev2s_POST:
400 case AArch64::LD1Onev4h_POST:
401 case AArch64::LD1Onev8b_POST:
402 case AArch64::LD1Onev2d_POST:
403 case AArch64::LD1Onev4s_POST:
404 case AArch64::LD1Onev8h_POST:
405 case AArch64::LD1Onev16b_POST:
406 case AArch64::LD1Rv1d_POST:
407 case AArch64::LD1Rv2s_POST:
408 case AArch64::LD1Rv4h_POST:
409 case AArch64::LD1Rv8b_POST:
410 case AArch64::LD1Rv2d_POST:
411 case AArch64::LD1Rv4s_POST:
412 case AArch64::LD1Rv8h_POST:
413 case AArch64::LD1Rv16b_POST:
414 DestRegIdx = 1;
415 BaseRegIdx = 2;
416 OffsetIdx = 3;
417 IsPrePost = true;
418 break;
419
420 case AArch64::LD1Twov1d_POST:
421 case AArch64::LD1Twov2s_POST:
422 case AArch64::LD1Twov4h_POST:
423 case AArch64::LD1Twov8b_POST:
424 case AArch64::LD1Twov2d_POST:
425 case AArch64::LD1Twov4s_POST:
426 case AArch64::LD1Twov8h_POST:
427 case AArch64::LD1Twov16b_POST:
428 case AArch64::LD1Threev1d_POST:
429 case AArch64::LD1Threev2s_POST:
430 case AArch64::LD1Threev4h_POST:
431 case AArch64::LD1Threev8b_POST:
432 case AArch64::LD1Threev2d_POST:
433 case AArch64::LD1Threev4s_POST:
434 case AArch64::LD1Threev8h_POST:
435 case AArch64::LD1Threev16b_POST:
436 case AArch64::LD1Fourv1d_POST:
437 case AArch64::LD1Fourv2s_POST:
438 case AArch64::LD1Fourv4h_POST:
439 case AArch64::LD1Fourv8b_POST:
440 case AArch64::LD1Fourv2d_POST:
441 case AArch64::LD1Fourv4s_POST:
442 case AArch64::LD1Fourv8h_POST:
443 case AArch64::LD1Fourv16b_POST:
444 case AArch64::LD2Twov2s_POST:
445 case AArch64::LD2Twov4s_POST:
446 case AArch64::LD2Twov8b_POST:
447 case AArch64::LD2Twov2d_POST:
448 case AArch64::LD2Twov4h_POST:
449 case AArch64::LD2Twov8h_POST:
450 case AArch64::LD2Twov16b_POST:
451 case AArch64::LD2Rv1d_POST:
452 case AArch64::LD2Rv2s_POST:
453 case AArch64::LD2Rv4s_POST:
454 case AArch64::LD2Rv8b_POST:
455 case AArch64::LD2Rv2d_POST:
456 case AArch64::LD2Rv4h_POST:
457 case AArch64::LD2Rv8h_POST:
458 case AArch64::LD2Rv16b_POST:
459 case AArch64::LD3Threev2s_POST:
460 case AArch64::LD3Threev4h_POST:
461 case AArch64::LD3Threev8b_POST:
462 case AArch64::LD3Threev2d_POST:
463 case AArch64::LD3Threev4s_POST:
464 case AArch64::LD3Threev8h_POST:
465 case AArch64::LD3Threev16b_POST:
466 case AArch64::LD3Rv1d_POST:
467 case AArch64::LD3Rv2s_POST:
468 case AArch64::LD3Rv4h_POST:
469 case AArch64::LD3Rv8b_POST:
470 case AArch64::LD3Rv2d_POST:
471 case AArch64::LD3Rv4s_POST:
472 case AArch64::LD3Rv8h_POST:
473 case AArch64::LD3Rv16b_POST:
474 case AArch64::LD4Fourv2s_POST:
475 case AArch64::LD4Fourv4h_POST:
476 case AArch64::LD4Fourv8b_POST:
477 case AArch64::LD4Fourv2d_POST:
478 case AArch64::LD4Fourv4s_POST:
479 case AArch64::LD4Fourv8h_POST:
480 case AArch64::LD4Fourv16b_POST:
481 case AArch64::LD4Rv1d_POST:
482 case AArch64::LD4Rv2s_POST:
483 case AArch64::LD4Rv4h_POST:
484 case AArch64::LD4Rv8b_POST:
485 case AArch64::LD4Rv2d_POST:
486 case AArch64::LD4Rv4s_POST:
487 case AArch64::LD4Rv8h_POST:
488 case AArch64::LD4Rv16b_POST:
489 DestRegIdx = -1;
490 BaseRegIdx = 2;
491 OffsetIdx = 3;
492 IsPrePost = true;
493 break;
494
495 case AArch64::LDRBBroW:
496 case AArch64::LDRBBroX:
497 case AArch64::LDRBBui:
498 case AArch64::LDRBroW:
499 case AArch64::LDRBroX:
500 case AArch64::LDRBui:
501 case AArch64::LDRDl:
502 case AArch64::LDRDroW:
503 case AArch64::LDRDroX:
504 case AArch64::LDRDui:
505 case AArch64::LDRHHroW:
506 case AArch64::LDRHHroX:
507 case AArch64::LDRHHui:
508 case AArch64::LDRHroW:
509 case AArch64::LDRHroX:
510 case AArch64::LDRHui:
511 case AArch64::LDRQl:
512 case AArch64::LDRQroW:
513 case AArch64::LDRQroX:
514 case AArch64::LDRQui:
515 case AArch64::LDRSBWroW:
516 case AArch64::LDRSBWroX:
517 case AArch64::LDRSBWui:
518 case AArch64::LDRSBXroW:
519 case AArch64::LDRSBXroX:
520 case AArch64::LDRSBXui:
521 case AArch64::LDRSHWroW:
522 case AArch64::LDRSHWroX:
523 case AArch64::LDRSHWui:
524 case AArch64::LDRSHXroW:
525 case AArch64::LDRSHXroX:
526 case AArch64::LDRSHXui:
527 case AArch64::LDRSWl:
528 case AArch64::LDRSWroW:
529 case AArch64::LDRSWroX:
530 case AArch64::LDRSWui:
531 case AArch64::LDRSl:
532 case AArch64::LDRSroW:
533 case AArch64::LDRSroX:
534 case AArch64::LDRSui:
535 case AArch64::LDRWl:
536 case AArch64::LDRWroW:
537 case AArch64::LDRWroX:
538 case AArch64::LDRWui:
539 case AArch64::LDRXl:
540 case AArch64::LDRXroW:
541 case AArch64::LDRXroX:
542 case AArch64::LDRXui:
543 case AArch64::LDURBBi:
544 case AArch64::LDURBi:
545 case AArch64::LDURDi:
546 case AArch64::LDURHHi:
547 case AArch64::LDURHi:
548 case AArch64::LDURQi:
549 case AArch64::LDURSBWi:
550 case AArch64::LDURSBXi:
551 case AArch64::LDURSHWi:
552 case AArch64::LDURSHXi:
553 case AArch64::LDURSWi:
554 case AArch64::LDURSi:
555 case AArch64::LDURWi:
556 case AArch64::LDURXi:
557 DestRegIdx = 0;
558 BaseRegIdx = 1;
559 OffsetIdx = 2;
560 IsPrePost = false;
561 break;
562
563 case AArch64::LDRBBpost:
564 case AArch64::LDRBBpre:
565 case AArch64::LDRBpost:
566 case AArch64::LDRBpre:
567 case AArch64::LDRDpost:
568 case AArch64::LDRDpre:
569 case AArch64::LDRHHpost:
570 case AArch64::LDRHHpre:
571 case AArch64::LDRHpost:
572 case AArch64::LDRHpre:
573 case AArch64::LDRQpost:
574 case AArch64::LDRQpre:
575 case AArch64::LDRSBWpost:
576 case AArch64::LDRSBWpre:
577 case AArch64::LDRSBXpost:
578 case AArch64::LDRSBXpre:
579 case AArch64::LDRSHWpost:
580 case AArch64::LDRSHWpre:
581 case AArch64::LDRSHXpost:
582 case AArch64::LDRSHXpre:
583 case AArch64::LDRSWpost:
584 case AArch64::LDRSWpre:
585 case AArch64::LDRSpost:
586 case AArch64::LDRSpre:
587 case AArch64::LDRWpost:
588 case AArch64::LDRWpre:
589 case AArch64::LDRXpost:
590 case AArch64::LDRXpre:
591 DestRegIdx = 1;
592 BaseRegIdx = 2;
593 OffsetIdx = 3;
594 IsPrePost = true;
595 break;
596
597 case AArch64::LDNPDi:
598 case AArch64::LDNPQi:
599 case AArch64::LDNPSi:
600 case AArch64::LDPQi:
601 case AArch64::LDPDi:
602 case AArch64::LDPSi:
603 DestRegIdx = -1;
604 BaseRegIdx = 2;
605 OffsetIdx = 3;
606 IsPrePost = false;
607 break;
608
609 case AArch64::LDPSWi:
610 case AArch64::LDPWi:
611 case AArch64::LDPXi:
612 DestRegIdx = 0;
613 BaseRegIdx = 2;
614 OffsetIdx = 3;
615 IsPrePost = false;
616 break;
617
618 case AArch64::LDPQpost:
619 case AArch64::LDPQpre:
620 case AArch64::LDPDpost:
621 case AArch64::LDPDpre:
622 case AArch64::LDPSpost:
623 case AArch64::LDPSpre:
624 DestRegIdx = -1;
625 BaseRegIdx = 3;
626 OffsetIdx = 4;
627 IsPrePost = true;
628 break;
629
630 case AArch64::LDPSWpost:
631 case AArch64::LDPSWpre:
632 case AArch64::LDPWpost:
633 case AArch64::LDPWpre:
634 case AArch64::LDPXpost:
635 case AArch64::LDPXpre:
636 DestRegIdx = 1;
637 BaseRegIdx = 3;
638 OffsetIdx = 4;
639 IsPrePost = true;
640 break;
641 }
642
643 // Loads from the stack pointer don't get prefetched.
644 Register BaseReg = MI.getOperand(BaseRegIdx).getReg();
645 if (BaseReg == AArch64::SP || BaseReg == AArch64::WSP)
646 return std::nullopt;
647
648 LoadInfo LI;
649 LI.DestReg = DestRegIdx == -1 ? Register() : MI.getOperand(DestRegIdx).getReg();
650 LI.BaseReg = BaseReg;
651 LI.BaseRegIdx = BaseRegIdx;
652 LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &MI.getOperand(OffsetIdx);
653 LI.IsPrePost = IsPrePost;
654 return LI;
655}
656
657static std::optional<unsigned> getTag(const TargetRegisterInfo *TRI,
658 const MachineInstr &MI,
659 const LoadInfo &LI) {
660 unsigned Dest = LI.DestReg ? TRI->getEncodingValue(LI.DestReg) : 0;
661 unsigned Base = TRI->getEncodingValue(LI.BaseReg);
662 unsigned Off;
663 if (LI.OffsetOpnd == nullptr)
664 Off = 0;
665 else if (LI.OffsetOpnd->isGlobal() || LI.OffsetOpnd->isSymbol() ||
666 LI.OffsetOpnd->isCPI())
667 return std::nullopt;
668 else if (LI.OffsetOpnd->isReg())
669 Off = (1 << 5) | TRI->getEncodingValue(LI.OffsetOpnd->getReg());
670 else
671 Off = LI.OffsetOpnd->getImm() >> 2;
672
673 return makeTag(Dest, Base, Off);
674}
675
676void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) {
677 // Build the initial tag map for the whole loop.
678 TagMap.clear();
679 for (MachineBasicBlock *MBB : L.getBlocks())
680 for (MachineInstr &MI : *MBB) {
681 std::optional<LoadInfo> LInfo = getLoadInfo(MI);
682 if (!LInfo)
683 continue;
684 std::optional<unsigned> Tag = getTag(TRI, MI, *LInfo);
685 if (!Tag)
686 continue;
687 TagMap[*Tag].push_back(&MI);
688 }
689
690 bool AnyCollisions = false;
691 for (auto &P : TagMap) {
692 auto Size = P.second.size();
693 if (Size > 1) {
694 for (auto *MI : P.second) {
695 if (TII->isStridedAccess(*MI)) {
696 AnyCollisions = true;
697 break;
698 }
699 }
700 }
701 if (AnyCollisions)
702 break;
703 }
704 // Nothing to fix.
705 if (!AnyCollisions)
706 return;
707
709
710 // Go through all the basic blocks in the current loop and fix any streaming
711 // loads to avoid collisions with any other loads.
712 LiveRegUnits LR(*TRI);
713 for (MachineBasicBlock *MBB : L.getBlocks()) {
714 LR.clear();
715 LR.addLiveOuts(*MBB);
716 for (auto I = MBB->rbegin(); I != MBB->rend(); LR.stepBackward(*I), ++I) {
717 MachineInstr &MI = *I;
718 if (!TII->isStridedAccess(MI))
719 continue;
720
721 std::optional<LoadInfo> OptLdI = getLoadInfo(MI);
722 if (!OptLdI)
723 continue;
724 LoadInfo LdI = *OptLdI;
725 std::optional<unsigned> OptOldTag = getTag(TRI, MI, LdI);
726 if (!OptOldTag)
727 continue;
728 auto &OldCollisions = TagMap[*OptOldTag];
729 if (OldCollisions.size() <= 1)
730 continue;
731
732 bool Fixed = false;
733 LLVM_DEBUG(dbgs() << "Attempting to fix tag collision: " << MI);
734
735 if (!DebugCounter::shouldExecute(FixCounter)) {
736 LLVM_DEBUG(dbgs() << "Skipping fix due to debug counter:\n " << MI);
737 continue;
738 }
739
740 // Add the non-base registers of MI as live so we don't use them as
741 // scratch registers.
742 for (unsigned OpI = 0, OpE = MI.getNumOperands(); OpI < OpE; ++OpI) {
743 if (OpI == static_cast<unsigned>(LdI.BaseRegIdx))
744 continue;
745 MachineOperand &MO = MI.getOperand(OpI);
746 if (MO.isReg() && MO.readsReg())
747 LR.addReg(MO.getReg());
748 }
749
750 for (unsigned ScratchReg : AArch64::GPR64RegClass) {
751 if (!LR.available(ScratchReg) || MRI.isReserved(ScratchReg))
752 continue;
753
754 LoadInfo NewLdI(LdI);
755 NewLdI.BaseReg = ScratchReg;
756 unsigned NewTag = *getTag(TRI, MI, NewLdI);
757 // Scratch reg tag would collide too, so don't use it.
758 if (TagMap.count(NewTag))
759 continue;
760
761 LLVM_DEBUG(dbgs() << "Changing base reg to: "
762 << printReg(ScratchReg, TRI) << '\n');
763
764 // Rewrite:
765 // Xd = LOAD Xb, off
766 // to:
767 // Xc = MOV Xb
768 // Xd = LOAD Xc, off
769 DebugLoc DL = MI.getDebugLoc();
770 BuildMI(*MBB, &MI, DL, TII->get(AArch64::ORRXrs), ScratchReg)
771 .addReg(AArch64::XZR)
772 .addReg(LdI.BaseReg)
773 .addImm(0);
774 MachineOperand &BaseOpnd = MI.getOperand(LdI.BaseRegIdx);
775 BaseOpnd.setReg(ScratchReg);
776
777 // If the load does a pre/post increment, then insert a MOV after as
778 // well to update the real base register.
779 if (LdI.IsPrePost) {
780 LLVM_DEBUG(dbgs() << "Doing post MOV of incremented reg: "
781 << printReg(ScratchReg, TRI) << '\n');
782 MI.getOperand(0).setReg(
783 ScratchReg); // Change tied operand pre/post update dest.
785 TII->get(AArch64::ORRXrs), LdI.BaseReg)
786 .addReg(AArch64::XZR)
787 .addReg(ScratchReg)
788 .addImm(0);
789 }
790
791 for (int I = 0, E = OldCollisions.size(); I != E; ++I)
792 if (OldCollisions[I] == &MI) {
793 std::swap(OldCollisions[I], OldCollisions[E - 1]);
794 OldCollisions.pop_back();
795 break;
796 }
797
798 // Update TagMap to reflect instruction changes to reduce the number
799 // of later MOVs to be inserted. This needs to be done after
800 // OldCollisions is updated since it may be relocated by this
801 // insertion.
802 TagMap[NewTag].push_back(&MI);
803 ++NumCollisionsAvoided;
804 Fixed = true;
805 Modified = true;
806 break;
807 }
808 if (!Fixed)
809 ++NumCollisionsNotAvoided;
810 }
811 }
812}
813
814bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) {
815 auto &ST = Fn.getSubtarget<AArch64Subtarget>();
816 if (ST.getProcFamily() != AArch64Subtarget::Falkor)
817 return false;
818
819 if (skipFunction(Fn.getFunction()))
820 return false;
821
822 TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
823 TRI = ST.getRegisterInfo();
824
825 MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();
826
827 Modified = false;
828
829 for (MachineLoop *I : LI)
830 for (MachineLoop *L : depth_first(I))
831 // Only process inner-loops
832 if (L->isInnermost())
833 runOnLoop(*L, Fn);
834
835 return Modified;
836}
837
838FunctionPass *llvm::createFalkorHWPFFixPass() { return new FalkorHWPFFix(); }
unsigned const MachineRegisterInfo * MRI
aarch64 falkor hwpf fix Falkor HW Prefetch Fix Late static false unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset)
Falkor HW Prefetch Fix
static std::optional< LoadInfo > getLoadInfo(const MachineInstr &MI)
aarch64 falkor hwpf fix Falkor HW Prefetch Fix Late Phase
static std::optional< unsigned > getTag(const TargetRegisterInfo *TRI, const MachineInstr &MI, const LoadInfo &LI)
#define DEBUG_TYPE
aarch64 falkor hwpf fix late
#define FALKOR_STRIDED_ACCESS_MD
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
arm execution domain fix
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition: DebugCounter.h:182
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.
uint64_t Size
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
A set of register units.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
This file contains the declarations for metadata subclasses.
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
Target-Independent Code Generator Pass Configuration Options pass.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:265
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:72
A debug info location.
Definition: DebugLoc.h:33
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:314
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:308
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1455
A set of register units used to track register liveness.
Definition: LiveRegUnits.h:30
An instruction for reading from memory.
Definition: Instructions.h:177
Value * getPointerOperand()
Definition: Instructions.h:264
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
Definition: LoopInfo.h:182
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:195
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:188
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:1293
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:60
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1399
reverse_iterator rend()
reverse_iterator rbegin()
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
virtual MachineFunctionProperties getRequiredProperties() const
Properties which a MachineFunction may have at a given point in time.
MachineFunctionProperties & set(Property P)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:68
MachineOperand class - Representation of each machine instruction operand.
bool readsReg() const
readsReg - Returns true if this operand reads the previous value of its register.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:98
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Target-Independent Code Generator Pass Configuration Options.
TMC & getTM() const
Get the right type of TargetMachine for this target.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
LLVM Value Representation.
Definition: Value.h:74
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:994
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:406
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
FunctionPass * createFalkorMarkStridedAccessesPass()
FunctionPass * createFalkorHWPFFixPass()
void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry &)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void initializeFalkorHWPFFixPass(PassRegistry &)
iterator_range< df_iterator< T > > depth_first(const T &G)
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
A record for a potential prefetch made during the initial scan of the loop.