Line data Source code
1 : //===- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor --===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : /// \file For Falkor, we want to avoid HW prefetcher instruction tag collisions
10 : /// that may inhibit the HW prefetching. This is done in two steps. Before
11 : /// ISel, we mark strided loads (i.e. those that will likely benefit from
12 : /// prefetching) with metadata. Then, after opcodes have been finalized, we
13 : /// insert MOVs and rewrite loads to prevent unintentional tag collisions.
14 : //===----------------------------------------------------------------------===//
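//
// As a rough sketch of the two steps (simplified, for illustration only):
// step one tags an IR load whose pointer is an affine SCEV add-recurrence,
// e.g.
//   %v = load i32, i32* %p        ; %p == {%base,+,4}<%loop>
// with FALKOR_STRIDED_ACCESS_MD metadata. Step two runs on MachineInstrs
// and, when two loads would share a prefetcher tag, reroutes one of them
// through a scratch base register:
//   Xd = LDR Xb, #off    -->    Xc = MOV Xb
//                               Xd = LDR Xc, #off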
15 :
16 : #include "AArch64.h"
17 : #include "AArch64InstrInfo.h"
18 : #include "AArch64Subtarget.h"
19 : #include "AArch64TargetMachine.h"
20 : #include "llvm/ADT/DenseMap.h"
21 : #include "llvm/ADT/DepthFirstIterator.h"
22 : #include "llvm/ADT/None.h"
23 : #include "llvm/ADT/Optional.h"
24 : #include "llvm/ADT/SmallVector.h"
25 : #include "llvm/ADT/Statistic.h"
26 : #include "llvm/Analysis/LoopInfo.h"
27 : #include "llvm/Analysis/ScalarEvolution.h"
28 : #include "llvm/Analysis/ScalarEvolutionExpressions.h"
29 : #include "llvm/CodeGen/LiveRegUnits.h"
30 : #include "llvm/CodeGen/MachineBasicBlock.h"
31 : #include "llvm/CodeGen/MachineFunction.h"
32 : #include "llvm/CodeGen/MachineFunctionPass.h"
33 : #include "llvm/CodeGen/MachineInstr.h"
34 : #include "llvm/CodeGen/MachineInstrBuilder.h"
35 : #include "llvm/CodeGen/MachineLoopInfo.h"
36 : #include "llvm/CodeGen/MachineOperand.h"
37 : #include "llvm/CodeGen/MachineRegisterInfo.h"
38 : #include "llvm/CodeGen/TargetPassConfig.h"
39 : #include "llvm/CodeGen/TargetRegisterInfo.h"
40 : #include "llvm/IR/DebugLoc.h"
41 : #include "llvm/IR/Dominators.h"
42 : #include "llvm/IR/Function.h"
43 : #include "llvm/IR/Instruction.h"
44 : #include "llvm/IR/Instructions.h"
45 : #include "llvm/IR/Metadata.h"
46 : #include "llvm/Pass.h"
47 : #include "llvm/Support/Casting.h"
48 : #include "llvm/Support/Debug.h"
49 : #include "llvm/Support/DebugCounter.h"
50 : #include "llvm/Support/raw_ostream.h"
51 : #include <cassert>
52 : #include <iterator>
53 : #include <utility>
54 :
55 : using namespace llvm;
56 :
57 : #define DEBUG_TYPE "falkor-hwpf-fix"
58 :
59 : STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked");
60 : STATISTIC(NumCollisionsAvoided,
61 : "Number of HW prefetch tag collisions avoided");
62 : STATISTIC(NumCollisionsNotAvoided,
63 : "Number of HW prefetch tag collisions not avoided due to lack of registers");
64 : DEBUG_COUNTER(FixCounter, "falkor-hwpf",
65 : "Controls which tag collisions are avoided");
66 :
67 : namespace {
68 :
69 : class FalkorMarkStridedAccesses {
70 : public:
71 : FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE)
72 69 : : LI(LI), SE(SE) {}
73 :
74 : bool run();
75 :
76 : private:
77 : bool runOnLoop(Loop &L);
78 :
79 : LoopInfo &LI;
80 : ScalarEvolution &SE;
81 : };
82 :
83 : class FalkorMarkStridedAccessesLegacy : public FunctionPass {
84 : public:
85 : static char ID; // Pass ID, replacement for typeid
86 :
87 : FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) {
88 1122 : initializeFalkorMarkStridedAccessesLegacyPass(
89 1122 : *PassRegistry::getPassRegistry());
90 : }
91 :
92 1116 : void getAnalysisUsage(AnalysisUsage &AU) const override {
93 : AU.addRequired<TargetPassConfig>();
94 : AU.addPreserved<DominatorTreeWrapperPass>();
95 : AU.addRequired<LoopInfoWrapperPass>();
96 : AU.addPreserved<LoopInfoWrapperPass>();
97 : AU.addRequired<ScalarEvolutionWrapperPass>();
98 : AU.addPreserved<ScalarEvolutionWrapperPass>();
99 1116 : }
100 :
101 : bool runOnFunction(Function &F) override;
102 : };
103 :
104 : } // end anonymous namespace
105 :
106 : char FalkorMarkStridedAccessesLegacy::ID = 0;
107 :
108 85109 : INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
109 : "Falkor HW Prefetch Fix", false, false)
110 85109 : INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
111 85109 : INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
112 85109 : INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
113 200154 : INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
114 : "Falkor HW Prefetch Fix", false, false)
115 :
116 1120 : FunctionPass *llvm::createFalkorMarkStridedAccessesPass() {
117 1120 : return new FalkorMarkStridedAccessesLegacy();
118 : }
119 :
120 14140 : bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) {
121 14140 : TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
122 : const AArch64Subtarget *ST =
123 14140 : TPC.getTM<AArch64TargetMachine>().getSubtargetImpl(F);
124 14140 : if (ST->getProcFamily() != AArch64Subtarget::Falkor)
125 : return false;
126 :
127 69 : if (skipFunction(F))
128 : return false;
129 :
130 69 : LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
131 69 : ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
132 :
133 : FalkorMarkStridedAccesses LDP(LI, SE);
134 69 : return LDP.run();
135 : }
136 :
137 69 : bool FalkorMarkStridedAccesses::run() {
138 : bool MadeChange = false;
139 :
140 74 : for (Loop *L : LI)
141 11 : for (auto LIt = df_begin(L), LE = df_end(L); LIt != LE; ++LIt)
142 6 : MadeChange |= runOnLoop(**LIt);
143 :
144 69 : return MadeChange;
145 : }
146 :
147 0 : bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) {
148 : // Only mark strided loads in the inner-most loop
149 0 : if (!L.empty())
150 0 : return false;
151 :
152 : bool MadeChange = false;
153 :
154 0 : for (BasicBlock *BB : L.blocks()) {
155 0 : for (Instruction &I : *BB) {
156 : LoadInst *LoadI = dyn_cast<LoadInst>(&I);
157 0 : if (!LoadI)
158 0 : continue;
159 :
160 : Value *PtrValue = LoadI->getPointerOperand();
161 0 : if (L.isLoopInvariant(PtrValue))
162 0 : continue;
163 :
164 0 : const SCEV *LSCEV = SE.getSCEV(PtrValue);
165 : const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
166 0 : if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
167 0 : continue;
168 :
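// At this point the pointer's SCEV is an affine add-recurrence such as
// {%base,+,4}<%loop> (e.g. &a[i] for a 4-byte element), so the load is
// likely strided and worth tagging for the HW prefetcher.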
169 0 : LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD,
170 0 : MDNode::get(LoadI->getContext(), {}));
171 : ++NumStridedLoadsMarked;
172 : LLVM_DEBUG(dbgs() << "Load: " << I << " marked as strided\n");
173 : MadeChange = true;
174 : }
175 : }
176 :
177 : return MadeChange;
178 : }
179 :
180 : namespace {
181 :
182 : class FalkorHWPFFix : public MachineFunctionPass {
183 : public:
184 : static char ID;
185 :
186 1121 : FalkorHWPFFix() : MachineFunctionPass(ID) {
187 1121 : initializeFalkorHWPFFixPass(*PassRegistry::getPassRegistry());
188 1121 : }
189 :
190 : bool runOnMachineFunction(MachineFunction &Fn) override;
191 :
192 1106 : void getAnalysisUsage(AnalysisUsage &AU) const override {
193 1106 : AU.setPreservesCFG();
194 : AU.addRequired<MachineLoopInfo>();
195 1106 : MachineFunctionPass::getAnalysisUsage(AU);
196 1106 : }
197 :
198 1106 : MachineFunctionProperties getRequiredProperties() const override {
199 1106 : return MachineFunctionProperties().set(
200 1106 : MachineFunctionProperties::Property::NoVRegs);
201 : }
202 :
203 : private:
204 : void runOnLoop(MachineLoop &L, MachineFunction &Fn);
205 :
206 : const AArch64InstrInfo *TII;
207 : const TargetRegisterInfo *TRI;
208 : DenseMap<unsigned, SmallVector<MachineInstr *, 4>> TagMap;
209 : bool Modified;
210 : };
211 :
212 : /// Bits from load opcodes used to compute HW prefetcher instruction tags.
213 : struct LoadInfo {
214 : LoadInfo() = default;
215 :
216 : unsigned DestReg = 0;
217 : unsigned BaseReg = 0;
218 : int BaseRegIdx = -1;
219 : const MachineOperand *OffsetOpnd = nullptr;
220 : bool IsPrePost = false;
221 : };
222 :
223 : } // end anonymous namespace
224 :
225 : char FalkorHWPFFix::ID = 0;
226 :
227 85109 : INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "falkor-hwpf-fix-late",
228 : "Falkor HW Prefetch Fix Late Phase", false, false)
229 85109 : INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
230 200153 : INITIALIZE_PASS_END(FalkorHWPFFix, "falkor-hwpf-fix-late",
231 : "Falkor HW Prefetch Fix Late Phase", false, false)
232 :
233 : static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) {
234 0 : return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8);
235 : }
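// Worked example (values chosen purely for illustration): destination
// encoding 1, base encoding 2 and offset field 3 give
//   (1 & 0xf) | ((2 & 0xf) << 4) | ((3 & 0x3f) << 8) == 0x321.
// Only the low 4 bits of each register encoding and the low 6 bits of the
// offset field take part in the tag, so distinct operands can still collide.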
236 :
237 132 : static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
238 : int DestRegIdx;
239 : int BaseRegIdx;
240 : int OffsetIdx;
241 : bool IsPrePost;
242 :
243 264 : switch (MI.getOpcode()) {
244 : default:
245 : return None;
246 :
247 : case AArch64::LD1i64:
248 : case AArch64::LD2i64:
249 : DestRegIdx = 0;
250 : BaseRegIdx = 3;
251 : OffsetIdx = -1;
252 : IsPrePost = false;
253 : break;
254 :
255 2 : case AArch64::LD1i8:
256 : case AArch64::LD1i16:
257 : case AArch64::LD1i32:
258 : case AArch64::LD2i8:
259 : case AArch64::LD2i16:
260 : case AArch64::LD2i32:
261 : case AArch64::LD3i8:
262 : case AArch64::LD3i16:
263 : case AArch64::LD3i32:
264 : case AArch64::LD3i64:
265 : case AArch64::LD4i8:
266 : case AArch64::LD4i16:
267 : case AArch64::LD4i32:
268 : case AArch64::LD4i64:
269 : DestRegIdx = -1;
270 : BaseRegIdx = 3;
271 : OffsetIdx = -1;
272 : IsPrePost = false;
273 2 : break;
274 :
275 2 : case AArch64::LD1Onev1d:
276 : case AArch64::LD1Onev2s:
277 : case AArch64::LD1Onev4h:
278 : case AArch64::LD1Onev8b:
279 : case AArch64::LD1Onev2d:
280 : case AArch64::LD1Onev4s:
281 : case AArch64::LD1Onev8h:
282 : case AArch64::LD1Onev16b:
283 : case AArch64::LD1Rv1d:
284 : case AArch64::LD1Rv2s:
285 : case AArch64::LD1Rv4h:
286 : case AArch64::LD1Rv8b:
287 : case AArch64::LD1Rv2d:
288 : case AArch64::LD1Rv4s:
289 : case AArch64::LD1Rv8h:
290 : case AArch64::LD1Rv16b:
291 : DestRegIdx = 0;
292 : BaseRegIdx = 1;
293 : OffsetIdx = -1;
294 : IsPrePost = false;
295 2 : break;
296 :
297 2 : case AArch64::LD1Twov1d:
298 : case AArch64::LD1Twov2s:
299 : case AArch64::LD1Twov4h:
300 : case AArch64::LD1Twov8b:
301 : case AArch64::LD1Twov2d:
302 : case AArch64::LD1Twov4s:
303 : case AArch64::LD1Twov8h:
304 : case AArch64::LD1Twov16b:
305 : case AArch64::LD1Threev1d:
306 : case AArch64::LD1Threev2s:
307 : case AArch64::LD1Threev4h:
308 : case AArch64::LD1Threev8b:
309 : case AArch64::LD1Threev2d:
310 : case AArch64::LD1Threev4s:
311 : case AArch64::LD1Threev8h:
312 : case AArch64::LD1Threev16b:
313 : case AArch64::LD1Fourv1d:
314 : case AArch64::LD1Fourv2s:
315 : case AArch64::LD1Fourv4h:
316 : case AArch64::LD1Fourv8b:
317 : case AArch64::LD1Fourv2d:
318 : case AArch64::LD1Fourv4s:
319 : case AArch64::LD1Fourv8h:
320 : case AArch64::LD1Fourv16b:
321 : case AArch64::LD2Twov2s:
322 : case AArch64::LD2Twov4s:
323 : case AArch64::LD2Twov8b:
324 : case AArch64::LD2Twov2d:
325 : case AArch64::LD2Twov4h:
326 : case AArch64::LD2Twov8h:
327 : case AArch64::LD2Twov16b:
328 : case AArch64::LD2Rv1d:
329 : case AArch64::LD2Rv2s:
330 : case AArch64::LD2Rv4s:
331 : case AArch64::LD2Rv8b:
332 : case AArch64::LD2Rv2d:
333 : case AArch64::LD2Rv4h:
334 : case AArch64::LD2Rv8h:
335 : case AArch64::LD2Rv16b:
336 : case AArch64::LD3Threev2s:
337 : case AArch64::LD3Threev4h:
338 : case AArch64::LD3Threev8b:
339 : case AArch64::LD3Threev2d:
340 : case AArch64::LD3Threev4s:
341 : case AArch64::LD3Threev8h:
342 : case AArch64::LD3Threev16b:
343 : case AArch64::LD3Rv1d:
344 : case AArch64::LD3Rv2s:
345 : case AArch64::LD3Rv4h:
346 : case AArch64::LD3Rv8b:
347 : case AArch64::LD3Rv2d:
348 : case AArch64::LD3Rv4s:
349 : case AArch64::LD3Rv8h:
350 : case AArch64::LD3Rv16b:
351 : case AArch64::LD4Fourv2s:
352 : case AArch64::LD4Fourv4h:
353 : case AArch64::LD4Fourv8b:
354 : case AArch64::LD4Fourv2d:
355 : case AArch64::LD4Fourv4s:
356 : case AArch64::LD4Fourv8h:
357 : case AArch64::LD4Fourv16b:
358 : case AArch64::LD4Rv1d:
359 : case AArch64::LD4Rv2s:
360 : case AArch64::LD4Rv4h:
361 : case AArch64::LD4Rv8b:
362 : case AArch64::LD4Rv2d:
363 : case AArch64::LD4Rv4s:
364 : case AArch64::LD4Rv8h:
365 : case AArch64::LD4Rv16b:
366 : DestRegIdx = -1;
367 : BaseRegIdx = 1;
368 : OffsetIdx = -1;
369 : IsPrePost = false;
370 2 : break;
371 :
372 2 : case AArch64::LD1i64_POST:
373 : case AArch64::LD2i64_POST:
374 : DestRegIdx = 1;
375 : BaseRegIdx = 4;
376 : OffsetIdx = 5;
377 : IsPrePost = true;
378 2 : break;
379 :
380 2 : case AArch64::LD1i8_POST:
381 : case AArch64::LD1i16_POST:
382 : case AArch64::LD1i32_POST:
383 : case AArch64::LD2i8_POST:
384 : case AArch64::LD2i16_POST:
385 : case AArch64::LD2i32_POST:
386 : case AArch64::LD3i8_POST:
387 : case AArch64::LD3i16_POST:
388 : case AArch64::LD3i32_POST:
389 : case AArch64::LD3i64_POST:
390 : case AArch64::LD4i8_POST:
391 : case AArch64::LD4i16_POST:
392 : case AArch64::LD4i32_POST:
393 : case AArch64::LD4i64_POST:
394 : DestRegIdx = -1;
395 : BaseRegIdx = 4;
396 : OffsetIdx = 5;
397 : IsPrePost = true;
398 2 : break;
399 :
400 2 : case AArch64::LD1Onev1d_POST:
401 : case AArch64::LD1Onev2s_POST:
402 : case AArch64::LD1Onev4h_POST:
403 : case AArch64::LD1Onev8b_POST:
404 : case AArch64::LD1Onev2d_POST:
405 : case AArch64::LD1Onev4s_POST:
406 : case AArch64::LD1Onev8h_POST:
407 : case AArch64::LD1Onev16b_POST:
408 : case AArch64::LD1Rv1d_POST:
409 : case AArch64::LD1Rv2s_POST:
410 : case AArch64::LD1Rv4h_POST:
411 : case AArch64::LD1Rv8b_POST:
412 : case AArch64::LD1Rv2d_POST:
413 : case AArch64::LD1Rv4s_POST:
414 : case AArch64::LD1Rv8h_POST:
415 : case AArch64::LD1Rv16b_POST:
416 : DestRegIdx = 1;
417 : BaseRegIdx = 2;
418 : OffsetIdx = 3;
419 : IsPrePost = true;
420 2 : break;
421 :
422 2 : case AArch64::LD1Twov1d_POST:
423 : case AArch64::LD1Twov2s_POST:
424 : case AArch64::LD1Twov4h_POST:
425 : case AArch64::LD1Twov8b_POST:
426 : case AArch64::LD1Twov2d_POST:
427 : case AArch64::LD1Twov4s_POST:
428 : case AArch64::LD1Twov8h_POST:
429 : case AArch64::LD1Twov16b_POST:
430 : case AArch64::LD1Threev1d_POST:
431 : case AArch64::LD1Threev2s_POST:
432 : case AArch64::LD1Threev4h_POST:
433 : case AArch64::LD1Threev8b_POST:
434 : case AArch64::LD1Threev2d_POST:
435 : case AArch64::LD1Threev4s_POST:
436 : case AArch64::LD1Threev8h_POST:
437 : case AArch64::LD1Threev16b_POST:
438 : case AArch64::LD1Fourv1d_POST:
439 : case AArch64::LD1Fourv2s_POST:
440 : case AArch64::LD1Fourv4h_POST:
441 : case AArch64::LD1Fourv8b_POST:
442 : case AArch64::LD1Fourv2d_POST:
443 : case AArch64::LD1Fourv4s_POST:
444 : case AArch64::LD1Fourv8h_POST:
445 : case AArch64::LD1Fourv16b_POST:
446 : case AArch64::LD2Twov2s_POST:
447 : case AArch64::LD2Twov4s_POST:
448 : case AArch64::LD2Twov8b_POST:
449 : case AArch64::LD2Twov2d_POST:
450 : case AArch64::LD2Twov4h_POST:
451 : case AArch64::LD2Twov8h_POST:
452 : case AArch64::LD2Twov16b_POST:
453 : case AArch64::LD2Rv1d_POST:
454 : case AArch64::LD2Rv2s_POST:
455 : case AArch64::LD2Rv4s_POST:
456 : case AArch64::LD2Rv8b_POST:
457 : case AArch64::LD2Rv2d_POST:
458 : case AArch64::LD2Rv4h_POST:
459 : case AArch64::LD2Rv8h_POST:
460 : case AArch64::LD2Rv16b_POST:
461 : case AArch64::LD3Threev2s_POST:
462 : case AArch64::LD3Threev4h_POST:
463 : case AArch64::LD3Threev8b_POST:
464 : case AArch64::LD3Threev2d_POST:
465 : case AArch64::LD3Threev4s_POST:
466 : case AArch64::LD3Threev8h_POST:
467 : case AArch64::LD3Threev16b_POST:
468 : case AArch64::LD3Rv1d_POST:
469 : case AArch64::LD3Rv2s_POST:
470 : case AArch64::LD3Rv4h_POST:
471 : case AArch64::LD3Rv8b_POST:
472 : case AArch64::LD3Rv2d_POST:
473 : case AArch64::LD3Rv4s_POST:
474 : case AArch64::LD3Rv8h_POST:
475 : case AArch64::LD3Rv16b_POST:
476 : case AArch64::LD4Fourv2s_POST:
477 : case AArch64::LD4Fourv4h_POST:
478 : case AArch64::LD4Fourv8b_POST:
479 : case AArch64::LD4Fourv2d_POST:
480 : case AArch64::LD4Fourv4s_POST:
481 : case AArch64::LD4Fourv8h_POST:
482 : case AArch64::LD4Fourv16b_POST:
483 : case AArch64::LD4Rv1d_POST:
484 : case AArch64::LD4Rv2s_POST:
485 : case AArch64::LD4Rv4h_POST:
486 : case AArch64::LD4Rv8b_POST:
487 : case AArch64::LD4Rv2d_POST:
488 : case AArch64::LD4Rv4s_POST:
489 : case AArch64::LD4Rv8h_POST:
490 : case AArch64::LD4Rv16b_POST:
491 : DestRegIdx = -1;
492 : BaseRegIdx = 2;
493 : OffsetIdx = 3;
494 : IsPrePost = true;
495 2 : break;
496 :
497 25 : case AArch64::LDRBBroW:
498 : case AArch64::LDRBBroX:
499 : case AArch64::LDRBBui:
500 : case AArch64::LDRBroW:
501 : case AArch64::LDRBroX:
502 : case AArch64::LDRBui:
503 : case AArch64::LDRDl:
504 : case AArch64::LDRDroW:
505 : case AArch64::LDRDroX:
506 : case AArch64::LDRDui:
507 : case AArch64::LDRHHroW:
508 : case AArch64::LDRHHroX:
509 : case AArch64::LDRHHui:
510 : case AArch64::LDRHroW:
511 : case AArch64::LDRHroX:
512 : case AArch64::LDRHui:
513 : case AArch64::LDRQl:
514 : case AArch64::LDRQroW:
515 : case AArch64::LDRQroX:
516 : case AArch64::LDRQui:
517 : case AArch64::LDRSBWroW:
518 : case AArch64::LDRSBWroX:
519 : case AArch64::LDRSBWui:
520 : case AArch64::LDRSBXroW:
521 : case AArch64::LDRSBXroX:
522 : case AArch64::LDRSBXui:
523 : case AArch64::LDRSHWroW:
524 : case AArch64::LDRSHWroX:
525 : case AArch64::LDRSHWui:
526 : case AArch64::LDRSHXroW:
527 : case AArch64::LDRSHXroX:
528 : case AArch64::LDRSHXui:
529 : case AArch64::LDRSWl:
530 : case AArch64::LDRSWroW:
531 : case AArch64::LDRSWroX:
532 : case AArch64::LDRSWui:
533 : case AArch64::LDRSl:
534 : case AArch64::LDRSroW:
535 : case AArch64::LDRSroX:
536 : case AArch64::LDRSui:
537 : case AArch64::LDRWl:
538 : case AArch64::LDRWroW:
539 : case AArch64::LDRWroX:
540 : case AArch64::LDRWui:
541 : case AArch64::LDRXl:
542 : case AArch64::LDRXroW:
543 : case AArch64::LDRXroX:
544 : case AArch64::LDRXui:
545 : case AArch64::LDURBBi:
546 : case AArch64::LDURBi:
547 : case AArch64::LDURDi:
548 : case AArch64::LDURHHi:
549 : case AArch64::LDURHi:
550 : case AArch64::LDURQi:
551 : case AArch64::LDURSBWi:
552 : case AArch64::LDURSBXi:
553 : case AArch64::LDURSHWi:
554 : case AArch64::LDURSHXi:
555 : case AArch64::LDURSWi:
556 : case AArch64::LDURSi:
557 : case AArch64::LDURWi:
558 : case AArch64::LDURXi:
559 : DestRegIdx = 0;
560 : BaseRegIdx = 1;
561 : OffsetIdx = 2;
562 : IsPrePost = false;
563 25 : break;
564 :
565 2 : case AArch64::LDRBBpost:
566 : case AArch64::LDRBBpre:
567 : case AArch64::LDRBpost:
568 : case AArch64::LDRBpre:
569 : case AArch64::LDRDpost:
570 : case AArch64::LDRDpre:
571 : case AArch64::LDRHHpost:
572 : case AArch64::LDRHHpre:
573 : case AArch64::LDRHpost:
574 : case AArch64::LDRHpre:
575 : case AArch64::LDRQpost:
576 : case AArch64::LDRQpre:
577 : case AArch64::LDRSBWpost:
578 : case AArch64::LDRSBWpre:
579 : case AArch64::LDRSBXpost:
580 : case AArch64::LDRSBXpre:
581 : case AArch64::LDRSHWpost:
582 : case AArch64::LDRSHWpre:
583 : case AArch64::LDRSHXpost:
584 : case AArch64::LDRSHXpre:
585 : case AArch64::LDRSWpost:
586 : case AArch64::LDRSWpre:
587 : case AArch64::LDRSpost:
588 : case AArch64::LDRSpre:
589 : case AArch64::LDRWpost:
590 : case AArch64::LDRWpre:
591 : case AArch64::LDRXpost:
592 : case AArch64::LDRXpre:
593 : DestRegIdx = 1;
594 : BaseRegIdx = 2;
595 : OffsetIdx = 3;
596 : IsPrePost = true;
597 2 : break;
598 :
599 2 : case AArch64::LDNPDi:
600 : case AArch64::LDNPQi:
601 : case AArch64::LDNPSi:
602 : case AArch64::LDPQi:
603 : case AArch64::LDPDi:
604 : case AArch64::LDPSi:
605 : DestRegIdx = -1;
606 : BaseRegIdx = 2;
607 : OffsetIdx = 3;
608 : IsPrePost = false;
609 2 : break;
610 :
611 10 : case AArch64::LDPSWi:
612 : case AArch64::LDPWi:
613 : case AArch64::LDPXi:
614 : DestRegIdx = 0;
615 : BaseRegIdx = 2;
616 : OffsetIdx = 3;
617 : IsPrePost = false;
618 10 : break;
619 :
620 2 : case AArch64::LDPQpost:
621 : case AArch64::LDPQpre:
622 : case AArch64::LDPDpost:
623 : case AArch64::LDPDpre:
624 : case AArch64::LDPSpost:
625 : case AArch64::LDPSpre:
626 : DestRegIdx = -1;
627 : BaseRegIdx = 3;
628 : OffsetIdx = 4;
629 : IsPrePost = true;
630 2 : break;
631 :
632 2 : case AArch64::LDPSWpost:
633 : case AArch64::LDPSWpre:
634 : case AArch64::LDPWpost:
635 : case AArch64::LDPWpre:
636 : case AArch64::LDPXpost:
637 : case AArch64::LDPXpre:
638 : DestRegIdx = 1;
639 : BaseRegIdx = 3;
640 : OffsetIdx = 4;
641 : IsPrePost = true;
642 2 : break;
643 : }
644 :
645 : // Loads from the stack pointer don't get prefetched.
646 59 : unsigned BaseReg = MI.getOperand(BaseRegIdx).getReg();
647 59 : if (BaseReg == AArch64::SP || BaseReg == AArch64::WSP)
648 : return None;
649 :
650 : LoadInfo LI;
651 58 : LI.DestReg = DestRegIdx == -1 ? 0 : MI.getOperand(DestRegIdx).getReg();
652 : LI.BaseReg = BaseReg;
653 : LI.BaseRegIdx = BaseRegIdx;
654 58 : LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &MI.getOperand(OffsetIdx);
655 58 : LI.IsPrePost = IsPrePost;
656 : return LI;
657 : }
658 :
659 0 : static Optional<unsigned> getTag(const TargetRegisterInfo *TRI,
660 : const MachineInstr &MI, const LoadInfo &LI) {
661 0 : unsigned Dest = LI.DestReg ? TRI->getEncodingValue(LI.DestReg) : 0;
662 0 : unsigned Base = TRI->getEncodingValue(LI.BaseReg);
663 : unsigned Off;
664 0 : if (LI.OffsetOpnd == nullptr)
665 : Off = 0;
666 0 : else if (LI.OffsetOpnd->isGlobal() || LI.OffsetOpnd->isSymbol() ||
667 : LI.OffsetOpnd->isCPI())
668 : return None;
669 0 : else if (LI.OffsetOpnd->isReg())
670 0 : Off = (1 << 5) | TRI->getEncodingValue(LI.OffsetOpnd->getReg());
671 : else
672 0 : Off = LI.OffsetOpnd->getImm() >> 2;
673 :
674 : return makeTag(Dest, Base, Off);
675 : }
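// For illustration (not exhaustive): an immediate offset of 16 contributes
// Off == 16 >> 2 == 4, a register offset whose encoding is 3 contributes
// Off == (1 << 5) | 3 == 0x23, and global/symbol/CPI offsets produce no tag
// at all, as handled above.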
676 :
677 19 : void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) {
678 : // Build the initial tag map for the whole loop.
679 19 : TagMap.clear();
680 38 : for (MachineBasicBlock *MBB : L.getBlocks())
681 129 : for (MachineInstr &MI : *MBB) {
682 110 : Optional<LoadInfo> LInfo = getLoadInfo(MI);
683 110 : if (!LInfo)
684 : continue;
685 37 : Optional<unsigned> Tag = getTag(TRI, MI, *LInfo);
686 37 : if (!Tag)
687 : continue;
688 37 : TagMap[*Tag].push_back(&MI);
689 : }
690 :
691 : bool AnyCollisions = false;
692 20 : for (auto &P : TagMap) {
693 18 : auto Size = P.second.size();
694 18 : if (Size > 1) {
695 17 : for (auto *MI : P.second) {
696 17 : if (TII->isStridedAccess(*MI)) {
697 : AnyCollisions = true;
698 : break;
699 : }
700 : }
701 : }
702 18 : if (AnyCollisions)
703 : break;
704 : }
705 : // Nothing to fix.
706 19 : if (!AnyCollisions)
707 2 : return;
708 :
709 17 : MachineRegisterInfo &MRI = Fn.getRegInfo();
710 :
711 : // Go through all the basic blocks in the current loop and fix any streaming
712 : // loads to avoid collisions with any other loads.
713 17 : LiveRegUnits LR(*TRI);
714 34 : for (MachineBasicBlock *MBB : L.getBlocks()) {
715 : LR.clear();
716 17 : LR.addLiveOuts(*MBB);
717 249 : for (auto I = MBB->rbegin(); I != MBB->rend(); LR.stepBackward(*I), ++I) {
718 : MachineInstr &MI = *I;
719 116 : if (!TII->isStridedAccess(MI))
720 98 : continue;
721 :
722 22 : Optional<LoadInfo> OptLdI = getLoadInfo(MI);
723 22 : if (!OptLdI)
724 : continue;
725 21 : LoadInfo LdI = *OptLdI;
726 21 : Optional<unsigned> OptOldTag = getTag(TRI, MI, LdI);
727 21 : if (!OptOldTag)
728 : continue;
729 : auto &OldCollisions = TagMap[*OptOldTag];
730 21 : if (OldCollisions.size() <= 1)
731 : continue;
732 :
733 : bool Fixed = false;
734 : LLVM_DEBUG(dbgs() << "Attempting to fix tag collision: " << MI);
735 :
736 : if (!DebugCounter::shouldExecute(FixCounter)) {
737 : LLVM_DEBUG(dbgs() << "Skipping fix due to debug counter:\n " << MI);
738 : continue;
739 : }
740 :
741 : // Add the non-base registers of MI as live so we don't use them as
742 : // scratch registers.
743 91 : for (unsigned OpI = 0, OpE = MI.getNumOperands(); OpI < OpE; ++OpI) {
744 73 : if (OpI == static_cast<unsigned>(LdI.BaseRegIdx))
745 : continue;
746 55 : MachineOperand &MO = MI.getOperand(OpI);
747 55 : if (MO.isReg() && MO.readsReg())
748 9 : LR.addReg(MO.getReg());
749 : }
750 :
751 43 : for (unsigned ScratchReg : AArch64::GPR64RegClass) {
752 43 : if (!LR.available(ScratchReg) || MRI.isReserved(ScratchReg))
753 25 : continue;
754 :
755 18 : LoadInfo NewLdI(LdI);
756 18 : NewLdI.BaseReg = ScratchReg;
757 18 : unsigned NewTag = *getTag(TRI, MI, NewLdI);
758 : // Scratch reg tag would collide too, so don't use it.
759 : if (TagMap.count(NewTag))
760 0 : continue;
761 :
762 : LLVM_DEBUG(dbgs() << "Changing base reg to: "
763 : << printReg(ScratchReg, TRI) << '\n');
764 :
765 : // Rewrite:
766 : // Xd = LOAD Xb, off
767 : // to:
768 : // Xc = MOV Xb
769 : // Xd = LOAD Xc, off
770 : DebugLoc DL = MI.getDebugLoc();
771 18 : BuildMI(*MBB, &MI, DL, TII->get(AArch64::ORRXrs), ScratchReg)
772 18 : .addReg(AArch64::XZR)
773 18 : .addReg(LdI.BaseReg)
774 : .addImm(0);
775 18 : MachineOperand &BaseOpnd = MI.getOperand(LdI.BaseRegIdx);
776 18 : BaseOpnd.setReg(ScratchReg);
777 :
778 : // If the load does a pre/post increment, then insert a MOV after as
779 : // well to update the real base register.
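// Roughly, for a post-increment load (illustrative operand layout):
//   Xd, Xb = LDR_POST Xb, #off
// becomes:
//   Xc = MOV Xb
//   Xd, Xc = LDR_POST Xc, #off
//   Xb = MOV Xc
// where MOV is the ORRXrs-with-XZR emitted here.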
780 18 : if (LdI.IsPrePost) {
781 : LLVM_DEBUG(dbgs() << "Doing post MOV of incremented reg: "
782 : << printReg(ScratchReg, TRI) << '\n');
783 7 : MI.getOperand(0).setReg(
784 : ScratchReg); // Change tied operand pre/post update dest.
785 7 : BuildMI(*MBB, std::next(MachineBasicBlock::iterator(MI)), DL,
786 7 : TII->get(AArch64::ORRXrs), LdI.BaseReg)
787 7 : .addReg(AArch64::XZR)
788 7 : .addReg(ScratchReg)
789 : .addImm(0);
790 : }
791 :
792 21 : for (int I = 0, E = OldCollisions.size(); I != E; ++I)
793 42 : if (OldCollisions[I] == &MI) {
794 18 : std::swap(OldCollisions[I], OldCollisions[E - 1]);
795 : OldCollisions.pop_back();
796 : break;
797 : }
798 :
799 : // Update TagMap to reflect instruction changes to reduce the number
800 : // of later MOVs to be inserted. This needs to be done after
801 : // OldCollisions is updated since it may be relocated by this
802 : // insertion.
803 18 : TagMap[NewTag].push_back(&MI);
804 : ++NumCollisionsAvoided;
805 : Fixed = true;
806 18 : Modified = true;
807 : break;
808 : }
809 : if (!Fixed)
810 : ++NumCollisionsNotAvoided;
811 : }
812 : }
813 : }
814 :
815 14103 : bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) {
816 14103 : auto &ST = static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
817 14103 : if (ST.getProcFamily() != AArch64Subtarget::Falkor)
818 : return false;
819 :
820 83 : if (skipFunction(Fn.getFunction()))
821 : return false;
822 :
823 83 : TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
824 83 : TRI = ST.getRegisterInfo();
825 :
826 : assert(TRI->trackLivenessAfterRegAlloc(Fn) &&
827 : "Register liveness not available!");
828 :
829 83 : MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();
830 :
831 83 : Modified = false;
832 :
833 102 : for (MachineLoop *I : LI)
834 38 : for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L)
835 : // Only process inner-most loops
836 19 : if (L->empty())
837 19 : runOnLoop(**L, Fn);
838 :
839 83 : return Modified;
840 : }
841 :
842 1120 : FunctionPass *llvm::createFalkorHWPFFixPass() { return new FalkorHWPFFix(); }