Go to the documentation of this file.
57 #define DEBUG_TYPE "aarch64-falkor-hwpf-fix"
59 STATISTIC(NumStridedLoadsMarked,
"Number of strided loads marked");
61 "Number of HW prefetch tag collisions avoided");
63 "Number of HW prefetch tag collisions not avoided due to lack of registers");
65 "Controls which tag collisions are avoided");
69 class FalkorMarkStridedAccesses {
77 bool runOnLoop(
Loop &L);
83 class FalkorMarkStridedAccessesLegacy :
public FunctionPass {
109 "Falkor HW Prefetch Fix",
false,
false)
117 return new FalkorMarkStridedAccessesLegacy();
130 LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
131 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
133 FalkorMarkStridedAccesses
LDP(LI, SE);
138 bool MadeChange =
false;
142 MadeChange |= runOnLoop(*LIt);
147 bool FalkorMarkStridedAccesses::runOnLoop(
Loop &L) {
152 bool MadeChange =
false;
156 LoadInst *LoadI = dyn_cast<LoadInst>(&
I);
165 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
166 if (!LSCEVAddRec || !LSCEVAddRec->
isAffine())
171 ++NumStridedLoadsMarked;
220 bool IsPrePost =
false;
228 "Falkor HW Prefetch Fix Late Phase",
false,
false)
233 static
unsigned makeTag(
unsigned Dest,
unsigned Base,
unsigned Offset) {
234 return (Dest & 0xf) | ((
Base & 0xf) << 4) | ((Offset & 0x3f) << 8);
243 switch (
MI.getOpcode()) {
247 case AArch64::LD1i64:
248 case AArch64::LD2i64:
256 case AArch64::LD1i16:
257 case AArch64::LD1i32:
259 case AArch64::LD2i16:
260 case AArch64::LD2i32:
262 case AArch64::LD3i16:
263 case AArch64::LD3i32:
264 case AArch64::LD3i64:
266 case AArch64::LD4i16:
267 case AArch64::LD4i32:
268 case AArch64::LD4i64:
275 case AArch64::LD1Onev1d:
276 case AArch64::LD1Onev2s:
277 case AArch64::LD1Onev4h:
278 case AArch64::LD1Onev8b:
279 case AArch64::LD1Onev2d:
280 case AArch64::LD1Onev4s:
281 case AArch64::LD1Onev8h:
282 case AArch64::LD1Onev16b:
283 case AArch64::LD1Rv1d:
284 case AArch64::LD1Rv2s:
285 case AArch64::LD1Rv4h:
286 case AArch64::LD1Rv8b:
287 case AArch64::LD1Rv2d:
288 case AArch64::LD1Rv4s:
289 case AArch64::LD1Rv8h:
290 case AArch64::LD1Rv16b:
297 case AArch64::LD1Twov1d:
298 case AArch64::LD1Twov2s:
299 case AArch64::LD1Twov4h:
300 case AArch64::LD1Twov8b:
301 case AArch64::LD1Twov2d:
302 case AArch64::LD1Twov4s:
303 case AArch64::LD1Twov8h:
304 case AArch64::LD1Twov16b:
305 case AArch64::LD1Threev1d:
306 case AArch64::LD1Threev2s:
307 case AArch64::LD1Threev4h:
308 case AArch64::LD1Threev8b:
309 case AArch64::LD1Threev2d:
310 case AArch64::LD1Threev4s:
311 case AArch64::LD1Threev8h:
312 case AArch64::LD1Threev16b:
313 case AArch64::LD1Fourv1d:
314 case AArch64::LD1Fourv2s:
315 case AArch64::LD1Fourv4h:
316 case AArch64::LD1Fourv8b:
317 case AArch64::LD1Fourv2d:
318 case AArch64::LD1Fourv4s:
319 case AArch64::LD1Fourv8h:
320 case AArch64::LD1Fourv16b:
321 case AArch64::LD2Twov2s:
322 case AArch64::LD2Twov4s:
323 case AArch64::LD2Twov8b:
324 case AArch64::LD2Twov2d:
325 case AArch64::LD2Twov4h:
326 case AArch64::LD2Twov8h:
327 case AArch64::LD2Twov16b:
328 case AArch64::LD2Rv1d:
329 case AArch64::LD2Rv2s:
330 case AArch64::LD2Rv4s:
331 case AArch64::LD2Rv8b:
332 case AArch64::LD2Rv2d:
333 case AArch64::LD2Rv4h:
334 case AArch64::LD2Rv8h:
335 case AArch64::LD2Rv16b:
336 case AArch64::LD3Threev2s:
337 case AArch64::LD3Threev4h:
338 case AArch64::LD3Threev8b:
339 case AArch64::LD3Threev2d:
340 case AArch64::LD3Threev4s:
341 case AArch64::LD3Threev8h:
342 case AArch64::LD3Threev16b:
343 case AArch64::LD3Rv1d:
344 case AArch64::LD3Rv2s:
345 case AArch64::LD3Rv4h:
346 case AArch64::LD3Rv8b:
347 case AArch64::LD3Rv2d:
348 case AArch64::LD3Rv4s:
349 case AArch64::LD3Rv8h:
350 case AArch64::LD3Rv16b:
351 case AArch64::LD4Fourv2s:
352 case AArch64::LD4Fourv4h:
353 case AArch64::LD4Fourv8b:
354 case AArch64::LD4Fourv2d:
355 case AArch64::LD4Fourv4s:
356 case AArch64::LD4Fourv8h:
357 case AArch64::LD4Fourv16b:
358 case AArch64::LD4Rv1d:
359 case AArch64::LD4Rv2s:
360 case AArch64::LD4Rv4h:
361 case AArch64::LD4Rv8b:
362 case AArch64::LD4Rv2d:
363 case AArch64::LD4Rv4s:
364 case AArch64::LD4Rv8h:
365 case AArch64::LD4Rv16b:
372 case AArch64::LD1i64_POST:
373 case AArch64::LD2i64_POST:
380 case AArch64::LD1i8_POST:
381 case AArch64::LD1i16_POST:
382 case AArch64::LD1i32_POST:
383 case AArch64::LD2i8_POST:
384 case AArch64::LD2i16_POST:
385 case AArch64::LD2i32_POST:
386 case AArch64::LD3i8_POST:
387 case AArch64::LD3i16_POST:
388 case AArch64::LD3i32_POST:
389 case AArch64::LD3i64_POST:
390 case AArch64::LD4i8_POST:
391 case AArch64::LD4i16_POST:
392 case AArch64::LD4i32_POST:
393 case AArch64::LD4i64_POST:
400 case AArch64::LD1Onev1d_POST:
401 case AArch64::LD1Onev2s_POST:
402 case AArch64::LD1Onev4h_POST:
403 case AArch64::LD1Onev8b_POST:
404 case AArch64::LD1Onev2d_POST:
405 case AArch64::LD1Onev4s_POST:
406 case AArch64::LD1Onev8h_POST:
407 case AArch64::LD1Onev16b_POST:
408 case AArch64::LD1Rv1d_POST:
409 case AArch64::LD1Rv2s_POST:
410 case AArch64::LD1Rv4h_POST:
411 case AArch64::LD1Rv8b_POST:
412 case AArch64::LD1Rv2d_POST:
413 case AArch64::LD1Rv4s_POST:
414 case AArch64::LD1Rv8h_POST:
415 case AArch64::LD1Rv16b_POST:
422 case AArch64::LD1Twov1d_POST:
423 case AArch64::LD1Twov2s_POST:
424 case AArch64::LD1Twov4h_POST:
425 case AArch64::LD1Twov8b_POST:
426 case AArch64::LD1Twov2d_POST:
427 case AArch64::LD1Twov4s_POST:
428 case AArch64::LD1Twov8h_POST:
429 case AArch64::LD1Twov16b_POST:
430 case AArch64::LD1Threev1d_POST:
431 case AArch64::LD1Threev2s_POST:
432 case AArch64::LD1Threev4h_POST:
433 case AArch64::LD1Threev8b_POST:
434 case AArch64::LD1Threev2d_POST:
435 case AArch64::LD1Threev4s_POST:
436 case AArch64::LD1Threev8h_POST:
437 case AArch64::LD1Threev16b_POST:
438 case AArch64::LD1Fourv1d_POST:
439 case AArch64::LD1Fourv2s_POST:
440 case AArch64::LD1Fourv4h_POST:
441 case AArch64::LD1Fourv8b_POST:
442 case AArch64::LD1Fourv2d_POST:
443 case AArch64::LD1Fourv4s_POST:
444 case AArch64::LD1Fourv8h_POST:
445 case AArch64::LD1Fourv16b_POST:
446 case AArch64::LD2Twov2s_POST:
447 case AArch64::LD2Twov4s_POST:
448 case AArch64::LD2Twov8b_POST:
449 case AArch64::LD2Twov2d_POST:
450 case AArch64::LD2Twov4h_POST:
451 case AArch64::LD2Twov8h_POST:
452 case AArch64::LD2Twov16b_POST:
453 case AArch64::LD2Rv1d_POST:
454 case AArch64::LD2Rv2s_POST:
455 case AArch64::LD2Rv4s_POST:
456 case AArch64::LD2Rv8b_POST:
457 case AArch64::LD2Rv2d_POST:
458 case AArch64::LD2Rv4h_POST:
459 case AArch64::LD2Rv8h_POST:
460 case AArch64::LD2Rv16b_POST:
461 case AArch64::LD3Threev2s_POST:
462 case AArch64::LD3Threev4h_POST:
463 case AArch64::LD3Threev8b_POST:
464 case AArch64::LD3Threev2d_POST:
465 case AArch64::LD3Threev4s_POST:
466 case AArch64::LD3Threev8h_POST:
467 case AArch64::LD3Threev16b_POST:
468 case AArch64::LD3Rv1d_POST:
469 case AArch64::LD3Rv2s_POST:
470 case AArch64::LD3Rv4h_POST:
471 case AArch64::LD3Rv8b_POST:
472 case AArch64::LD3Rv2d_POST:
473 case AArch64::LD3Rv4s_POST:
474 case AArch64::LD3Rv8h_POST:
475 case AArch64::LD3Rv16b_POST:
476 case AArch64::LD4Fourv2s_POST:
477 case AArch64::LD4Fourv4h_POST:
478 case AArch64::LD4Fourv8b_POST:
479 case AArch64::LD4Fourv2d_POST:
480 case AArch64::LD4Fourv4s_POST:
481 case AArch64::LD4Fourv8h_POST:
482 case AArch64::LD4Fourv16b_POST:
483 case AArch64::LD4Rv1d_POST:
484 case AArch64::LD4Rv2s_POST:
485 case AArch64::LD4Rv4h_POST:
486 case AArch64::LD4Rv8b_POST:
487 case AArch64::LD4Rv2d_POST:
488 case AArch64::LD4Rv4s_POST:
489 case AArch64::LD4Rv8h_POST:
490 case AArch64::LD4Rv16b_POST:
497 case AArch64::LDRBBroW:
498 case AArch64::LDRBBroX:
499 case AArch64::LDRBBui:
500 case AArch64::LDRBroW:
501 case AArch64::LDRBroX:
502 case AArch64::LDRBui:
504 case AArch64::LDRDroW:
505 case AArch64::LDRDroX:
506 case AArch64::LDRDui:
507 case AArch64::LDRHHroW:
508 case AArch64::LDRHHroX:
509 case AArch64::LDRHHui:
510 case AArch64::LDRHroW:
511 case AArch64::LDRHroX:
512 case AArch64::LDRHui:
514 case AArch64::LDRQroW:
515 case AArch64::LDRQroX:
516 case AArch64::LDRQui:
517 case AArch64::LDRSBWroW:
518 case AArch64::LDRSBWroX:
519 case AArch64::LDRSBWui:
520 case AArch64::LDRSBXroW:
521 case AArch64::LDRSBXroX:
522 case AArch64::LDRSBXui:
523 case AArch64::LDRSHWroW:
524 case AArch64::LDRSHWroX:
525 case AArch64::LDRSHWui:
526 case AArch64::LDRSHXroW:
527 case AArch64::LDRSHXroX:
528 case AArch64::LDRSHXui:
529 case AArch64::LDRSWl:
530 case AArch64::LDRSWroW:
531 case AArch64::LDRSWroX:
532 case AArch64::LDRSWui:
534 case AArch64::LDRSroW:
535 case AArch64::LDRSroX:
536 case AArch64::LDRSui:
538 case AArch64::LDRWroW:
539 case AArch64::LDRWroX:
540 case AArch64::LDRWui:
542 case AArch64::LDRXroW:
543 case AArch64::LDRXroX:
544 case AArch64::LDRXui:
545 case AArch64::LDURBBi:
546 case AArch64::LDURBi:
547 case AArch64::LDURDi:
548 case AArch64::LDURHHi:
549 case AArch64::LDURHi:
550 case AArch64::LDURQi:
551 case AArch64::LDURSBWi:
552 case AArch64::LDURSBXi:
553 case AArch64::LDURSHWi:
554 case AArch64::LDURSHXi:
555 case AArch64::LDURSWi:
556 case AArch64::LDURSi:
557 case AArch64::LDURWi:
558 case AArch64::LDURXi:
565 case AArch64::LDRBBpost:
566 case AArch64::LDRBBpre:
567 case AArch64::LDRBpost:
568 case AArch64::LDRBpre:
569 case AArch64::LDRDpost:
570 case AArch64::LDRDpre:
571 case AArch64::LDRHHpost:
572 case AArch64::LDRHHpre:
573 case AArch64::LDRHpost:
574 case AArch64::LDRHpre:
575 case AArch64::LDRQpost:
576 case AArch64::LDRQpre:
577 case AArch64::LDRSBWpost:
578 case AArch64::LDRSBWpre:
579 case AArch64::LDRSBXpost:
580 case AArch64::LDRSBXpre:
581 case AArch64::LDRSHWpost:
582 case AArch64::LDRSHWpre:
583 case AArch64::LDRSHXpost:
584 case AArch64::LDRSHXpre:
585 case AArch64::LDRSWpost:
586 case AArch64::LDRSWpre:
587 case AArch64::LDRSpost:
588 case AArch64::LDRSpre:
589 case AArch64::LDRWpost:
590 case AArch64::LDRWpre:
591 case AArch64::LDRXpost:
592 case AArch64::LDRXpre:
599 case AArch64::LDNPDi:
600 case AArch64::LDNPQi:
601 case AArch64::LDNPSi:
611 case AArch64::LDPSWi:
620 case AArch64::LDPQpost:
621 case AArch64::LDPQpre:
622 case AArch64::LDPDpost:
623 case AArch64::LDPDpre:
624 case AArch64::LDPSpost:
625 case AArch64::LDPSpre:
632 case AArch64::LDPSWpost:
633 case AArch64::LDPSWpre:
634 case AArch64::LDPWpost:
635 case AArch64::LDPWpre:
636 case AArch64::LDPXpost:
637 case AArch64::LDPXpre:
646 Register BaseReg =
MI.getOperand(BaseRegIdx).getReg();
647 if (BaseReg == AArch64::SP || BaseReg == AArch64::WSP)
651 LI.DestReg = DestRegIdx == -1 ?
Register() :
MI.getOperand(DestRegIdx).getReg();
652 LI.BaseReg = BaseReg;
653 LI.BaseRegIdx = BaseRegIdx;
654 LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &
MI.getOperand(OffsetIdx);
655 LI.IsPrePost = IsPrePost;
664 if (LI.OffsetOpnd ==
nullptr)
666 else if (LI.OffsetOpnd->isGlobal() || LI.OffsetOpnd->isSymbol() ||
667 LI.OffsetOpnd->isCPI())
669 else if (LI.OffsetOpnd->isReg())
672 Off = LI.OffsetOpnd->getImm() >> 2;
688 TagMap[*
Tag].push_back(&
MI);
691 bool AnyCollisions =
false;
692 for (
auto &
P : TagMap) {
693 auto Size =
P.second.size();
695 for (
auto *
MI :
P.second) {
696 if (
TII->isStridedAccess(*
MI)) {
697 AnyCollisions =
true;
716 LR.addLiveOuts(*
MBB);
719 if (!
TII->isStridedAccess(
MI))
729 auto &OldCollisions = TagMap[*OptOldTag];
730 if (OldCollisions.size() <= 1)
743 for (
unsigned OpI = 0, OpE =
MI.getNumOperands(); OpI < OpE; ++OpI) {
744 if (OpI ==
static_cast<unsigned>(LdI.BaseRegIdx))
751 for (
unsigned ScratchReg : AArch64::GPR64RegClass) {
756 NewLdI.BaseReg = ScratchReg;
759 if (TagMap.count(NewTag))
776 BaseOpnd.
setReg(ScratchReg);
783 MI.getOperand(0).setReg(
786 TII->get(AArch64::ORRXrs), LdI.BaseReg)
792 for (
int I = 0,
E = OldCollisions.size();
I !=
E; ++
I)
793 if (OldCollisions[
I] == &
MI) {
795 OldCollisions.pop_back();
803 TagMap[NewTag].push_back(&
MI);
804 ++NumCollisionsAvoided;
810 ++NumCollisionsNotAvoided;
824 TRI =
ST.getRegisterInfo();
#define FALKOR_STRIDED_ACCESS_MD
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
This is an optimization pass for GlobalISel generic memory operations.
aarch64 falkor hwpf fix Falkor HW Prefetch Fix Late Phase
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Represents a single loop in the control flow graph.
This currently compiles esp xmm0 movsd esp eax eax esp ret We should use not the dag combiner This is because dagcombine2 needs to be able to see through the X86ISD::Wrapper which DAGCombine can t really do The code for turning x load into a single vector load is target independent and should be moved to the dag combiner The code for turning x load into a vector load can only handle a direct load from a global or a direct load from the stack It should be generalized to handle any load from P
The main scalar evolution driver.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
uint16_t getEncodingValue(MCRegister RegNo) const
Returns the encoding for RegNo.
aarch64 falkor hwpf fix late
The legacy pass manager's analysis pass to compute loop information.
Properties which a MachineFunction may have at a given point in time.
static Optional< LoadInfo > getLoadInfo(const MachineInstr &MI)
Value * getPointerOperand()
static Optional< unsigned > getTag(const TargetRegisterInfo *TRI, const MachineInstr &MI, const LoadInfo &LI)
unsigned const TargetRegisterInfo * TRI
void initializeFalkorHWPFFixPass(PassRegistry &)
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM Basic Block Representation.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
Represent the analysis usage information of a pass.
iterator_range< block_iterator > blocks() const
A record for a potential prefetch made during the initial scan of the loop.
const HexagonInstrInfo * TII
MachineOperand class - Representation of each machine instruction operand.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
MachineFunctionProperties & set(Property P)
static bool shouldExecute(unsigned CounterName)
Legacy analysis pass which computes a DominatorTree.
STATISTIC(NumFunctions, "Total number of functions")
DEBUG_COUNTER(FixCounter, "falkor-hwpf", "Controls which tag collisions are avoided")
A set of register units used to track register liveness.
INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE, "Falkor HW Prefetch Fix", false, false) INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
aarch64 falkor hwpf fix Falkor HW Prefetch Fix Late static false unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
Target-Independent Code Generator Pass Configuration Options.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
FunctionPass * createFalkorHWPFFixPass()
This class represents an analyzed expression in the program.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Representation of each machine instruction.
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Register getReg() const
getReg - Returns the register number.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
reverse_iterator rbegin()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVMContext & getContext() const
All values hold a context through their type.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
TMC & getTM() const
Get the right type of TargetMachine for this target.
An instruction for reading from memory.
unsigned const MachineRegisterInfo * MRI
Wrapper class representing virtual and physical registers.
iterator_range< df_iterator< T > > depth_first(const T &G)
static bool runOnFunction(Function &F, bool PostInlining)
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
Function & getFunction()
Return the LLVM function that this machine code represents.
void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry &)
bool readsReg() const
readsReg - Returns true if this operand reads the previous value of its register.
This node represents a polynomial recurrence on the trip count of the specified loop.
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
void setReg(Register Reg)
Change the register this operand corresponds to.
FunctionPass class - This class is used to implement most global optimizations.
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
AnalysisUsage & addRequired()
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
FunctionPass * createFalkorMarkStridedAccessesPass()
LLVM Value Representation.