LLVM  15.0.0git
ARMLowOverheadLoops.cpp
Go to the documentation of this file.
1 //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// Finalize v8.1-m low-overhead loops by converting the associated pseudo
10 /// instructions into machine operations.
11 /// The expectation is that the loop contains three pseudo instructions:
12 /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
13 /// form should be in the preheader, whereas the while form should be in the
14 /// preheaders only predecessor.
15 /// - t2LoopDec - placed within in the loop body.
16 /// - t2LoopEnd - the loop latch terminator.
17 ///
18 /// In addition to this, we also look for the presence of the VCTP instruction,
19 /// which determines whether we can generated the tail-predicated low-overhead
20 /// loop form.
21 ///
22 /// Assumptions and Dependencies:
23 /// Low-overhead loops are constructed and executed using a setup instruction:
24 /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP.
25 /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range
26 /// but fixed polarity: WLS can only branch forwards and LE can only branch
27 /// backwards. These restrictions mean that this pass is dependent upon block
28 /// layout and block sizes, which is why it's the last pass to run. The same is
29 /// true for ConstantIslands, but this pass does not increase the size of the
30 /// basic blocks, nor does it change the CFG. Instructions are mainly removed
31 /// during the transform and pseudo instructions are replaced by real ones. In
32 /// some cases, when we have to revert to a 'normal' loop, we have to introduce
33 /// multiple instructions for a single pseudo (see RevertWhile and
34 /// RevertLoopEnd). To handle this situation, t2WhileLoopStartLR and t2LoopEnd
35 /// are defined to be as large as this maximum sequence of replacement
36 /// instructions.
37 ///
38 /// A note on VPR.P0 (the lane mask):
39 /// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a
40 /// "VPT Active" context (which includes low-overhead loops and vpt blocks).
41 /// They will simply "and" the result of their calculation with the current
42 /// value of VPR.P0. You can think of it like this:
43 /// \verbatim
44 /// if VPT active: ; Between a DLSTP/LETP, or for predicated instrs
45 /// VPR.P0 &= Value
46 /// else
47 /// VPR.P0 = Value
48 /// \endverbatim
49 /// When we're inside the low-overhead loop (between DLSTP and LETP), we always
50 /// fall in the "VPT active" case, so we can consider that all VPR writes by
51 /// one of those instruction is actually a "and".
52 //===----------------------------------------------------------------------===//
53 
54 #include "ARM.h"
55 #include "ARMBaseInstrInfo.h"
56 #include "ARMBaseRegisterInfo.h"
57 #include "ARMBasicBlockInfo.h"
58 #include "ARMSubtarget.h"
59 #include "MVETailPredUtils.h"
60 #include "Thumb2InstrInfo.h"
61 #include "llvm/ADT/SetOperations.h"
62 #include "llvm/ADT/SetVector.h"
63 #include "llvm/ADT/SmallSet.h"
70 #include "llvm/CodeGen/Passes.h"
72 #include "llvm/MC/MCInstrDesc.h"
73 
74 using namespace llvm;
75 
76 #define DEBUG_TYPE "arm-low-overhead-loops"
77 #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"
78 
79 static cl::opt<bool>
80 DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden,
81  cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass"),
82  cl::init(false));
83 
86  return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR;
87 }
88 
90  return MI->findRegisterDefOperandIdx(ARM::VPR) != -1;
91 }
92 
93 static bool hasVPRUse(MachineInstr &MI) {
94  return MI.findRegisterUseOperandIdx(ARM::VPR) != -1;
95 }
96 
97 static bool isDomainMVE(MachineInstr *MI) {
98  uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask;
99  return Domain == ARMII::DomainMVE;
100 }
101 
102 static int getVecSize(const MachineInstr &MI) {
103  const MCInstrDesc &MCID = MI.getDesc();
104  uint64_t Flags = MCID.TSFlags;
105  return (Flags & ARMII::VecSize) >> ARMII::VecSizeShift;
106 }
107 
109  if (MI.isDebugInstr())
110  return false;
111  return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
112 }
113 
114 namespace {
115 
116  using InstSet = SmallPtrSetImpl<MachineInstr *>;
117 
118  class PostOrderLoopTraversal {
119  MachineLoop &ML;
120  MachineLoopInfo &MLI;
123 
124  public:
125  PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI)
126  : ML(ML), MLI(MLI) { }
127 
128  const SmallVectorImpl<MachineBasicBlock*> &getOrder() const {
129  return Order;
130  }
131 
132  // Visit all the blocks within the loop, as well as exit blocks and any
133  // blocks properly dominating the header.
134  void ProcessLoop() {
135  std::function<void(MachineBasicBlock*)> Search = [this, &Search]
136  (MachineBasicBlock *MBB) -> void {
137  if (Visited.count(MBB))
138  return;
139 
140  Visited.insert(MBB);
141  for (auto *Succ : MBB->successors()) {
142  if (!ML.contains(Succ))
143  continue;
144  Search(Succ);
145  }
146  Order.push_back(MBB);
147  };
148 
149  // Insert exit blocks.
151  ML.getExitBlocks(ExitBlocks);
152  append_range(Order, ExitBlocks);
153 
154  // Then add the loop body.
155  Search(ML.getHeader());
156 
157  // Then try the preheader and its predecessors.
158  std::function<void(MachineBasicBlock*)> GetPredecessor =
159  [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void {
160  Order.push_back(MBB);
161  if (MBB->pred_size() == 1)
162  GetPredecessor(*MBB->pred_begin());
163  };
164 
165  if (auto *Preheader = ML.getLoopPreheader())
166  GetPredecessor(Preheader);
167  else if (auto *Preheader = MLI.findLoopPreheader(&ML, true, true))
168  GetPredecessor(Preheader);
169  }
170  };
171 
172  struct PredicatedMI {
173  MachineInstr *MI = nullptr;
174  SetVector<MachineInstr*> Predicates;
175 
176  public:
177  PredicatedMI(MachineInstr *I, SetVector<MachineInstr *> &Preds) : MI(I) {
178  assert(I && "Instruction must not be null!");
179  Predicates.insert(Preds.begin(), Preds.end());
180  }
181  };
182 
183  // Represent the current state of the VPR and hold all instances which
184  // represent a VPT block, which is a list of instructions that begins with a
185  // VPT/VPST and has a maximum of four proceeding instructions. All
186  // instructions within the block are predicated upon the vpr and we allow
187  // instructions to define the vpr within in the block too.
188  class VPTState {
189  friend struct LowOverheadLoop;
190 
192 
193  static SmallVector<VPTState, 4> Blocks;
194  static SetVector<MachineInstr *> CurrentPredicates;
195  static std::map<MachineInstr *,
196  std::unique_ptr<PredicatedMI>> PredicatedInsts;
197 
198  static void CreateVPTBlock(MachineInstr *MI) {
199  assert((CurrentPredicates.size() || MI->getParent()->isLiveIn(ARM::VPR))
200  && "Can't begin VPT without predicate");
201  Blocks.emplace_back(MI);
202  // The execution of MI is predicated upon the current set of instructions
203  // that are AND'ed together to form the VPR predicate value. In the case
204  // that MI is a VPT, CurrentPredicates will also just be MI.
205  PredicatedInsts.emplace(
206  MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
207  }
208 
209  static void reset() {
210  Blocks.clear();
211  PredicatedInsts.clear();
212  CurrentPredicates.clear();
213  }
214 
215  static void addInst(MachineInstr *MI) {
216  Blocks.back().insert(MI);
217  PredicatedInsts.emplace(
218  MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
219  }
220 
221  static void addPredicate(MachineInstr *MI) {
222  LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI);
223  CurrentPredicates.insert(MI);
224  }
225 
226  static void resetPredicate(MachineInstr *MI) {
227  LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI);
228  CurrentPredicates.clear();
229  CurrentPredicates.insert(MI);
230  }
231 
232  public:
233  // Have we found an instruction within the block which defines the vpr? If
234  // so, not all the instructions in the block will have the same predicate.
235  static bool hasUniformPredicate(VPTState &Block) {
236  return getDivergent(Block) == nullptr;
237  }
238 
239  // If it exists, return the first internal instruction which modifies the
240  // VPR.
241  static MachineInstr *getDivergent(VPTState &Block) {
242  SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
243  for (unsigned i = 1; i < Insts.size(); ++i) {
244  MachineInstr *Next = Insts[i];
245  if (isVectorPredicate(Next))
246  return Next; // Found an instruction altering the vpr.
247  }
248  return nullptr;
249  }
250 
251  // Return whether the given instruction is predicated upon a VCTP.
252  static bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) {
253  SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI]->Predicates;
254  if (Exclusive && Predicates.size() != 1)
255  return false;
256  return llvm::any_of(Predicates, isVCTP);
257  }
258 
259  // Is the VPST, controlling the block entry, predicated upon a VCTP.
260  static bool isEntryPredicatedOnVCTP(VPTState &Block,
261  bool Exclusive = false) {
262  SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
263  return isPredicatedOnVCTP(Insts.front(), Exclusive);
264  }
265 
266  // If this block begins with a VPT, we can check whether it's using
267  // at least one predicated input(s), as well as possible loop invariant
268  // which would result in it being implicitly predicated.
269  static bool hasImplicitlyValidVPT(VPTState &Block,
271  SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
272  MachineInstr *VPT = Insts.front();
273  assert(isVPTOpcode(VPT->getOpcode()) &&
274  "Expected VPT block to begin with VPT/VPST");
275 
276  if (VPT->getOpcode() == ARM::MVE_VPST)
277  return false;
278 
279  auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) {
280  MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx));
281  return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op);
282  };
283 
284  auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) {
285  MachineOperand &MO = MI->getOperand(Idx);
286  if (!MO.isReg() || !MO.getReg())
287  return true;
288 
290  RDA.getGlobalReachingDefs(MI, MO.getReg(), Defs);
291  if (Defs.empty())
292  return true;
293 
294  for (auto *Def : Defs)
295  if (Def->getParent() == VPT->getParent())
296  return false;
297  return true;
298  };
299 
300  // Check that at least one of the operands is directly predicated on a
301  // vctp and allow an invariant value too.
302  return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) &&
303  (IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) &&
304  (IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2));
305  }
306 
307  static bool isValid(ReachingDefAnalysis &RDA) {
308  // All predication within the loop should be based on vctp. If the block
309  // isn't predicated on entry, check whether the vctp is within the block
310  // and that all other instructions are then predicated on it.
311  for (auto &Block : Blocks) {
312  if (isEntryPredicatedOnVCTP(Block, false) ||
313  hasImplicitlyValidVPT(Block, RDA))
314  continue;
315 
316  SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
317  // We don't know how to convert a block with just a VPT;VCTP into
318  // anything valid once we remove the VCTP. For now just bail out.
319  assert(isVPTOpcode(Insts.front()->getOpcode()) &&
320  "Expected VPT block to start with a VPST or VPT!");
321  if (Insts.size() == 2 && Insts.front()->getOpcode() != ARM::MVE_VPST &&
322  isVCTP(Insts.back()))
323  return false;
324 
325  for (auto *MI : Insts) {
326  // Check that any internal VCTPs are 'Then' predicated.
328  return false;
329  // Skip other instructions that build up the predicate.
330  if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI))
331  continue;
332  // Check that any other instructions are predicated upon a vctp.
333  // TODO: We could infer when VPTs are implicitly predicated on the
334  // vctp (when the operands are predicated).
335  if (!isPredicatedOnVCTP(MI)) {
336  LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI);
337  return false;
338  }
339  }
340  }
341  return true;
342  }
343 
344  VPTState(MachineInstr *MI) { Insts.push_back(MI); }
345 
346  void insert(MachineInstr *MI) {
347  Insts.push_back(MI);
348  // VPT/VPST + 4 predicated instructions.
349  assert(Insts.size() <= 5 && "Too many instructions in VPT block!");
350  }
351 
352  bool containsVCTP() const {
353  return llvm::any_of(Insts, isVCTP);
354  }
355 
356  unsigned size() const { return Insts.size(); }
357  SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; }
358  };
359 
360  struct LowOverheadLoop {
361 
362  MachineLoop &ML;
363  MachineBasicBlock *Preheader = nullptr;
364  MachineLoopInfo &MLI;
366  const TargetRegisterInfo &TRI;
367  const ARMBaseInstrInfo &TII;
368  MachineFunction *MF = nullptr;
369  MachineBasicBlock::iterator StartInsertPt;
370  MachineBasicBlock *StartInsertBB = nullptr;
371  MachineInstr *Start = nullptr;
372  MachineInstr *Dec = nullptr;
373  MachineInstr *End = nullptr;
374  MachineOperand TPNumElements;
377  SmallPtrSet<MachineInstr *, 4> BlockMasksToRecompute;
378  SmallPtrSet<MachineInstr *, 4> DoubleWidthResultInstrs;
380  bool Revert = false;
381  bool CannotTailPredicate = false;
382 
383  LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI,
385  const ARMBaseInstrInfo &TII)
386  : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII),
387  TPNumElements(MachineOperand::CreateImm(0)) {
388  MF = ML.getHeader()->getParent();
389  if (auto *MBB = ML.getLoopPreheader())
390  Preheader = MBB;
391  else if (auto *MBB = MLI.findLoopPreheader(&ML, true, true))
392  Preheader = MBB;
393  VPTState::reset();
394  }
395 
396  // If this is an MVE instruction, check that we know how to use tail
397  // predication with it. Record VPT blocks and return whether the
398  // instruction is valid for tail predication.
399  bool ValidateMVEInst(MachineInstr *MI);
400 
401  void AnalyseMVEInst(MachineInstr *MI) {
402  CannotTailPredicate = !ValidateMVEInst(MI);
403  }
404 
405  bool IsTailPredicationLegal() const {
406  // For now, let's keep things really simple and only support a single
407  // block for tail predication.
408  return !Revert && FoundAllComponents() && !VCTPs.empty() &&
409  !CannotTailPredicate && ML.getNumBlocks() == 1;
410  }
411 
412  // Given that MI is a VCTP, check that is equivalent to any other VCTPs
413  // found.
414  bool AddVCTP(MachineInstr *MI);
415 
416  // Check that the predication in the loop will be equivalent once we
417  // perform the conversion. Also ensure that we can provide the number
418  // of elements to the loop start instruction.
419  bool ValidateTailPredicate();
420 
421  // Check that any values available outside of the loop will be the same
422  // after tail predication conversion.
423  bool ValidateLiveOuts();
424 
425  // Is it safe to define LR with DLS/WLS?
426  // LR can be defined if it is the operand to start, because it's the same
427  // value, or if it's going to be equivalent to the operand to Start.
428  MachineInstr *isSafeToDefineLR();
429 
430  // Check the branch targets are within range and we satisfy our
431  // restrictions.
432  void Validate(ARMBasicBlockUtils *BBUtils);
433 
434  bool FoundAllComponents() const {
435  return Start && Dec && End;
436  }
437 
438  SmallVectorImpl<VPTState> &getVPTBlocks() {
439  return VPTState::Blocks;
440  }
441 
442  // Return the operand for the loop start instruction. This will be the loop
443  // iteration count, or the number of elements if we're tail predicating.
444  MachineOperand &getLoopStartOperand() {
445  if (IsTailPredicationLegal())
446  return TPNumElements;
447  return Start->getOperand(1);
448  }
449 
450  unsigned getStartOpcode() const {
451  bool IsDo = isDoLoopStart(*Start);
452  if (!IsTailPredicationLegal())
453  return IsDo ? ARM::t2DLS : ARM::t2WLS;
454 
455  return VCTPOpcodeToLSTP(VCTPs.back()->getOpcode(), IsDo);
456  }
457 
458  void dump() const {
459  if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
460  if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
461  if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
462  if (!VCTPs.empty()) {
463  dbgs() << "ARM Loops: Found VCTP(s):\n";
464  for (auto *MI : VCTPs)
465  dbgs() << " - " << *MI;
466  }
467  if (!FoundAllComponents())
468  dbgs() << "ARM Loops: Not a low-overhead loop.\n";
469  else if (!(Start && Dec && End))
470  dbgs() << "ARM Loops: Failed to find all loop components.\n";
471  }
472  };
473 
474  class ARMLowOverheadLoops : public MachineFunctionPass {
475  MachineFunction *MF = nullptr;
476  MachineLoopInfo *MLI = nullptr;
477  ReachingDefAnalysis *RDA = nullptr;
478  const ARMBaseInstrInfo *TII = nullptr;
479  MachineRegisterInfo *MRI = nullptr;
480  const TargetRegisterInfo *TRI = nullptr;
481  std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
482 
483  public:
484  static char ID;
485 
486  ARMLowOverheadLoops() : MachineFunctionPass(ID) { }
487 
488  void getAnalysisUsage(AnalysisUsage &AU) const override {
489  AU.setPreservesCFG();
493  }
494 
495  bool runOnMachineFunction(MachineFunction &MF) override;
496 
497  MachineFunctionProperties getRequiredProperties() const override {
501  }
502 
503  StringRef getPassName() const override {
505  }
506 
507  private:
508  bool ProcessLoop(MachineLoop *ML);
509 
510  bool RevertNonLoops();
511 
512  void RevertWhile(MachineInstr *MI) const;
513  void RevertDo(MachineInstr *MI) const;
514 
515  bool RevertLoopDec(MachineInstr *MI) const;
516 
517  void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
518 
519  void RevertLoopEndDec(MachineInstr *MI) const;
520 
521  void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
522 
523  MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
524 
525  void Expand(LowOverheadLoop &LoLoop);
526 
527  void IterationCountDCE(LowOverheadLoop &LoLoop);
528  };
529 }
530 
531 char ARMLowOverheadLoops::ID = 0;
532 
533 SmallVector<VPTState, 4> VPTState::Blocks;
534 SetVector<MachineInstr *> VPTState::CurrentPredicates;
535 std::map<MachineInstr *,
536  std::unique_ptr<PredicatedMI>> VPTState::PredicatedInsts;
537 
539  false, false)
540 
541 static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA,
542  InstSet &ToRemove, InstSet &Ignore) {
543 
544  // Check that we can remove all of Killed without having to modify any IT
545  // blocks.
546  auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) {
547  // Collect the dead code and the MBBs in which they reside.
549  for (auto *Dead : Killed)
550  BasicBlocks.insert(Dead->getParent());
551 
552  // Collect IT blocks in all affected basic blocks.
553  std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks;
554  for (auto *MBB : BasicBlocks) {
555  for (auto &IT : *MBB) {
556  if (IT.getOpcode() != ARM::t2IT)
557  continue;
559  ITBlocks[&IT]);
560  }
561  }
562 
563  // If we're removing all of the instructions within an IT block, then
564  // also remove the IT instruction.
565  SmallPtrSet<MachineInstr *, 2> ModifiedITs;
567  for (auto *Dead : Killed) {
568  if (MachineOperand *MO = Dead->findRegisterUseOperand(ARM::ITSTATE)) {
570  RemoveITs.insert(IT);
571  auto &CurrentBlock = ITBlocks[IT];
572  CurrentBlock.erase(Dead);
573  if (CurrentBlock.empty())
574  ModifiedITs.erase(IT);
575  else
576  ModifiedITs.insert(IT);
577  }
578  }
579  if (!ModifiedITs.empty())
580  return false;
581  Killed.insert(RemoveITs.begin(), RemoveITs.end());
582  return true;
583  };
584 
586  if (!RDA.isSafeToRemove(MI, Uses, Ignore))
587  return false;
588 
589  if (WontCorruptITs(Uses, RDA)) {
590  ToRemove.insert(Uses.begin(), Uses.end());
591  LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI
592  << " - can also remove:\n";
593  for (auto *Use : Uses)
594  dbgs() << " - " << *Use);
595 
597  RDA.collectKilledOperands(MI, Killed);
598  if (WontCorruptITs(Killed, RDA)) {
599  ToRemove.insert(Killed.begin(), Killed.end());
600  LLVM_DEBUG(for (auto *Dead : Killed)
601  dbgs() << " - " << *Dead);
602  }
603  return true;
604  }
605  return false;
606 }
607 
608 bool LowOverheadLoop::ValidateTailPredicate() {
609  if (!IsTailPredicationLegal()) {
610  LLVM_DEBUG(if (VCTPs.empty())
611  dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
612  dbgs() << "ARM Loops: Tail-predication is not valid.\n");
613  return false;
614  }
615 
616  assert(!VCTPs.empty() && "VCTP instruction expected but is not set");
617  assert(ML.getBlocks().size() == 1 &&
618  "Shouldn't be processing a loop with more than one block");
619 
621  LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n");
622  return false;
623  }
624 
625  if (!VPTState::isValid(RDA)) {
626  LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n");
627  return false;
628  }
629 
630  if (!ValidateLiveOuts()) {
631  LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n");
632  return false;
633  }
634 
635  // For tail predication, we need to provide the number of elements, instead
636  // of the iteration count, to the loop start instruction. The number of
637  // elements is provided to the vctp instruction, so we need to check that
638  // we can use this register at InsertPt.
639  MachineInstr *VCTP = VCTPs.back();
640  if (Start->getOpcode() == ARM::t2DoLoopStartTP ||
641  Start->getOpcode() == ARM::t2WhileLoopStartTP) {
642  TPNumElements = Start->getOperand(2);
643  StartInsertPt = Start;
644  StartInsertBB = Start->getParent();
645  } else {
646  TPNumElements = VCTP->getOperand(1);
647  MCRegister NumElements = TPNumElements.getReg().asMCReg();
648 
649  // If the register is defined within loop, then we can't perform TP.
650  // TODO: Check whether this is just a mov of a register that would be
651  // available.
652  if (RDA.hasLocalDefBefore(VCTP, NumElements)) {
653  LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
654  return false;
655  }
656 
657  // The element count register maybe defined after InsertPt, in which case we
658  // need to try to move either InsertPt or the def so that the [w|d]lstp can
659  // use the value.
660 
661  if (StartInsertPt != StartInsertBB->end() &&
662  !RDA.isReachingDefLiveOut(&*StartInsertPt, NumElements)) {
663  if (auto *ElemDef =
664  RDA.getLocalLiveOutMIDef(StartInsertBB, NumElements)) {
665  if (RDA.isSafeToMoveForwards(ElemDef, &*StartInsertPt)) {
666  ElemDef->removeFromParent();
667  StartInsertBB->insert(StartInsertPt, ElemDef);
668  LLVM_DEBUG(dbgs()
669  << "ARM Loops: Moved element count def: " << *ElemDef);
670  } else if (RDA.isSafeToMoveBackwards(&*StartInsertPt, ElemDef)) {
671  StartInsertPt->removeFromParent();
672  StartInsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef),
673  &*StartInsertPt);
674  LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
675  } else {
676  // If we fail to move an instruction and the element count is provided
677  // by a mov, use the mov operand if it will have the same value at the
678  // insertion point
679  MachineOperand Operand = ElemDef->getOperand(1);
680  if (isMovRegOpcode(ElemDef->getOpcode()) &&
681  RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg().asMCReg()) ==
682  RDA.getUniqueReachingMIDef(&*StartInsertPt,
683  Operand.getReg().asMCReg())) {
684  TPNumElements = Operand;
685  NumElements = TPNumElements.getReg();
686  } else {
687  LLVM_DEBUG(dbgs()
688  << "ARM Loops: Unable to move element count to loop "
689  << "start instruction.\n");
690  return false;
691  }
692  }
693  }
694  }
695 
696  // Especially in the case of while loops, InsertBB may not be the
697  // preheader, so we need to check that the register isn't redefined
698  // before entering the loop.
699  auto CannotProvideElements = [this](MachineBasicBlock *MBB,
700  MCRegister NumElements) {
701  if (MBB->empty())
702  return false;
703  // NumElements is redefined in this block.
704  if (RDA.hasLocalDefBefore(&MBB->back(), NumElements))
705  return true;
706 
707  // Don't continue searching up through multiple predecessors.
708  if (MBB->pred_size() > 1)
709  return true;
710 
711  return false;
712  };
713 
714  // Search backwards for a def, until we get to InsertBB.
715  MachineBasicBlock *MBB = Preheader;
716  while (MBB && MBB != StartInsertBB) {
717  if (CannotProvideElements(MBB, NumElements)) {
718  LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n");
719  return false;
720  }
721  MBB = *MBB->pred_begin();
722  }
723  }
724 
725  // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect
726  // world the [w|d]lstp instruction would be last instruction in the preheader
727  // and so it would only affect instructions within the loop body. But due to
728  // scheduling, and/or the logic in this pass (above), the insertion point can
729  // be moved earlier. So if the Loop Start isn't the last instruction in the
730  // preheader, and if the initial element count is smaller than the vector
731  // width, the Loop Start instruction will immediately generate one or more
732  // false lane mask which can, incorrectly, affect the proceeding MVE
733  // instructions in the preheader.
734  if (std::any_of(StartInsertPt, StartInsertBB->end(), shouldInspect)) {
735  LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP\n");
736  return false;
737  }
738 
739  // For any DoubleWidthResultInstrs we found whilst scanning instructions, they
740  // need to compute an output size that is smaller than the VCTP mask operates
741  // on. The VecSize of the DoubleWidthResult is the larger vector size - the
742  // size it extends into, so any VCTP VecSize <= is valid.
743  unsigned VCTPVecSize = getVecSize(*VCTP);
744  for (MachineInstr *MI : DoubleWidthResultInstrs) {
745  unsigned InstrVecSize = getVecSize(*MI);
746  if (InstrVecSize > VCTPVecSize) {
747  LLVM_DEBUG(dbgs() << "ARM Loops: Double width result larger than VCTP "
748  << "VecSize:\n" << *MI);
749  return false;
750  }
751  }
752 
753  // Check that the value change of the element count is what we expect and
754  // that the predication will be equivalent. For this we need:
755  // NumElements = NumElements - VectorWidth. The sub will be a sub immediate
756  // and we can also allow register copies within the chain too.
757  auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) {
758  return -getAddSubImmediate(*MI) == ExpectedVecWidth;
759  };
760 
761  MachineBasicBlock *MBB = VCTP->getParent();
762  // Remove modifications to the element count since they have no purpose in a
763  // tail predicated loop. Explicitly refer to the vctp operand no matter which
764  // register NumElements has been assigned to, since that is what the
765  // modifications will be using
766  if (auto *Def = RDA.getUniqueReachingMIDef(
767  &MBB->back(), VCTP->getOperand(1).getReg().asMCReg())) {
768  SmallPtrSet<MachineInstr*, 2> ElementChain;
770  unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());
771 
772  Ignore.insert(VCTPs.begin(), VCTPs.end());
773 
774  if (TryRemove(Def, RDA, ElementChain, Ignore)) {
775  bool FoundSub = false;
776 
777  for (auto *MI : ElementChain) {
778  if (isMovRegOpcode(MI->getOpcode()))
779  continue;
780 
781  if (isSubImmOpcode(MI->getOpcode())) {
782  if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) {
783  LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
784  " count: " << *MI);
785  return false;
786  }
787  FoundSub = true;
788  } else {
789  LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
790  " count: " << *MI);
791  return false;
792  }
793  }
794  ToRemove.insert(ElementChain.begin(), ElementChain.end());
795  }
796  }
797 
798  // If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we
799  // can also remove any extra instructions in the preheader, which often
800  // includes a now unused MOV.
801  if ((Start->getOpcode() == ARM::t2DoLoopStartTP ||
802  Start->getOpcode() == ARM::t2WhileLoopStartTP) &&
803  Preheader && !Preheader->empty() &&
804  !RDA.hasLocalDefBefore(VCTP, VCTP->getOperand(1).getReg())) {
805  if (auto *Def = RDA.getUniqueReachingMIDef(
806  &Preheader->back(), VCTP->getOperand(1).getReg().asMCReg())) {
808  Ignore.insert(VCTPs.begin(), VCTPs.end());
809  TryRemove(Def, RDA, ToRemove, Ignore);
810  }
811  }
812 
813  return true;
814 }
815 
816 static bool isRegInClass(const MachineOperand &MO,
817  const TargetRegisterClass *Class) {
818  return MO.isReg() && MO.getReg() && Class->contains(MO.getReg());
819 }
820 
821 // MVE 'narrowing' operate on half a lane, reading from half and writing
822 // to half, which are referred to has the top and bottom half. The other
823 // half retains its previous value.
825  const MCInstrDesc &MCID = MI.getDesc();
826  uint64_t Flags = MCID.TSFlags;
827  return (Flags & ARMII::RetainsPreviousHalfElement) != 0;
828 }
829 
830 // Some MVE instructions read from the top/bottom halves of their operand(s)
831 // and generate a vector result with result elements that are double the
832 // width of the input.
834  const MCInstrDesc &MCID = MI.getDesc();
835  uint64_t Flags = MCID.TSFlags;
836  return (Flags & ARMII::DoubleWidthResult) != 0;
837 }
838 
839 static bool isHorizontalReduction(const MachineInstr &MI) {
840  const MCInstrDesc &MCID = MI.getDesc();
841  uint64_t Flags = MCID.TSFlags;
842  return (Flags & ARMII::HorizontalReduction) != 0;
843 }
844 
845 // Can this instruction generate a non-zero result when given only zeroed
846 // operands? This allows us to know that, given operands with false bytes
847 // zeroed by masked loads, that the result will also contain zeros in those
848 // bytes.
849 static bool canGenerateNonZeros(const MachineInstr &MI) {
850 
851  // Check for instructions which can write into a larger element size,
852  // possibly writing into a previous zero'd lane.
854  return true;
855 
856  switch (MI.getOpcode()) {
857  default:
858  break;
859  // FIXME: VNEG FP and -0? I think we'll need to handle this once we allow
860  // fp16 -> fp32 vector conversions.
861  // Instructions that perform a NOT will generate 1s from 0s.
862  case ARM::MVE_VMVN:
863  case ARM::MVE_VORN:
864  // Count leading zeros will do just that!
865  case ARM::MVE_VCLZs8:
866  case ARM::MVE_VCLZs16:
867  case ARM::MVE_VCLZs32:
868  return true;
869  }
870  return false;
871 }
872 
873 // Look at its register uses to see if it only can only receive zeros
874 // into its false lanes which would then produce zeros. Also check that
875 // the output register is also defined by an FalseLanesZero instruction
876 // so that if tail-predication happens, the lanes that aren't updated will
877 // still be zeros.
879  const TargetRegisterClass *QPRs,
880  const ReachingDefAnalysis &RDA,
881  InstSet &FalseLanesZero) {
882  if (canGenerateNonZeros(MI))
883  return false;
884 
886  // Predicated loads will write zeros to the falsely predicated bytes of the
887  // destination register.
888  if (MI.mayLoad())
889  return isPredicated;
890 
891  auto IsZeroInit = [](MachineInstr *Def) {
892  return !isVectorPredicated(Def) &&
893  Def->getOpcode() == ARM::MVE_VMOVimmi32 &&
894  Def->getOperand(1).getImm() == 0;
895  };
896 
897  bool AllowScalars = isHorizontalReduction(MI);
898  for (auto &MO : MI.operands()) {
899  if (!MO.isReg() || !MO.getReg())
900  continue;
901  if (!isRegInClass(MO, QPRs) && AllowScalars)
902  continue;
903  // Skip the lr predicate reg
905  if (PIdx != -1 && (int)MI.getOperandNo(&MO) == PIdx + 2)
906  continue;
907 
908  // Check that this instruction will produce zeros in its false lanes:
909  // - If it only consumes false lanes zero or constant 0 (vmov #0)
910  // - If it's predicated, it only matters that it's def register already has
911  // false lane zeros, so we can ignore the uses.
913  RDA.getGlobalReachingDefs(&MI, MO.getReg(), Defs);
914  for (auto *Def : Defs) {
915  if (Def == &MI || FalseLanesZero.count(Def) || IsZeroInit(Def))
916  continue;
917  if (MO.isUse() && isPredicated)
918  continue;
919  return false;
920  }
921  }
922  LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
923  return true;
924 }
925 
926 bool LowOverheadLoop::ValidateLiveOuts() {
927  // We want to find out if the tail-predicated version of this loop will
928  // produce the same values as the loop in its original form. For this to
929  // be true, the newly inserted implicit predication must not change the
930  // the (observable) results.
931  // We're doing this because many instructions in the loop will not be
932  // predicated and so the conversion from VPT predication to tail-predication
933  // can result in different values being produced; due to the tail-predication
934  // preventing many instructions from updating their falsely predicated
935  // lanes. This analysis assumes that all the instructions perform lane-wise
936  // operations and don't perform any exchanges.
937  // A masked load, whether through VPT or tail predication, will write zeros
938  // to any of the falsely predicated bytes. So, from the loads, we know that
939  // the false lanes are zeroed and here we're trying to track that those false
940  // lanes remain zero, or where they change, the differences are masked away
941  // by their user(s).
942  // All MVE stores have to be predicated, so we know that any predicate load
943  // operands, or stored results are equivalent already. Other explicitly
944  // predicated instructions will perform the same operation in the original
945  // loop and the tail-predicated form too. Because of this, we can insert
946  // loads, stores and other predicated instructions into our Predicated
947  // set and build from there.
948  const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID);
949  SetVector<MachineInstr *> FalseLanesUnknown;
952  MachineBasicBlock *Header = ML.getHeader();
953 
954  LLVM_DEBUG(dbgs() << "ARM Loops: Validating Live outs\n");
955 
956  for (auto &MI : *Header) {
957  if (!shouldInspect(MI))
958  continue;
959 
960  if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode()))
961  continue;
962 
964  bool retainsOrReduces =
966 
967  if (isPredicated)
968  Predicated.insert(&MI);
970  FalseLanesZero.insert(&MI);
971  else if (MI.getNumDefs() == 0)
972  continue;
973  else if (!isPredicated && retainsOrReduces) {
974  LLVM_DEBUG(dbgs() << " Unpredicated instruction that retainsOrReduces: " << MI);
975  return false;
976  } else if (!isPredicated && MI.getOpcode() != ARM::MQPRCopy)
977  FalseLanesUnknown.insert(&MI);
978  }
979 
980  LLVM_DEBUG({
981  dbgs() << " Predicated:\n";
982  for (auto *I : Predicated)
983  dbgs() << " " << *I;
984  dbgs() << " FalseLanesZero:\n";
985  for (auto *I : FalseLanesZero)
986  dbgs() << " " << *I;
987  dbgs() << " FalseLanesUnknown:\n";
988  for (auto *I : FalseLanesUnknown)
989  dbgs() << " " << *I;
990  });
991 
992  auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO,
993  SmallPtrSetImpl<MachineInstr *> &Predicated) {
995  RDA.getGlobalUses(MI, MO.getReg().asMCReg(), Uses);
996  for (auto *Use : Uses) {
997  if (Use != MI && !Predicated.count(Use))
998  return false;
999  }
1000  return true;
1001  };
1002 
1003  // Visit the unknowns in reverse so that we can start at the values being
1004  // stored and then we can work towards the leaves, hopefully adding more
1005  // instructions to Predicated. Successfully terminating the loop means that
1006  // all the unknown values have to found to be masked by predicated user(s).
1007  // For any unpredicated values, we store them in NonPredicated so that we
1008  // can later check whether these form a reduction.
1009  SmallPtrSet<MachineInstr*, 2> NonPredicated;
1010  for (auto *MI : reverse(FalseLanesUnknown)) {
1011  for (auto &MO : MI->operands()) {
1012  if (!isRegInClass(MO, QPRs) || !MO.isDef())
1013  continue;
1014  if (!HasPredicatedUsers(MI, MO, Predicated)) {
1015  LLVM_DEBUG(dbgs() << " Found an unknown def of : "
1016  << TRI.getRegAsmName(MO.getReg()) << " at " << *MI);
1017  NonPredicated.insert(MI);
1018  break;
1019  }
1020  }
1021  // Any unknown false lanes have been masked away by the user(s).
1022  if (!NonPredicated.contains(MI))
1023  Predicated.insert(MI);
1024  }
1025 
1026  SmallPtrSet<MachineInstr *, 2> LiveOutMIs;
1028  ML.getExitBlocks(ExitBlocks);
1029  assert(ML.getNumBlocks() == 1 && "Expected single block loop!");
1030  assert(ExitBlocks.size() == 1 && "Expected a single exit block");
1031  MachineBasicBlock *ExitBB = ExitBlocks.front();
1032  for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) {
1033  // TODO: Instead of blocking predication, we could move the vctp to the exit
1034  // block and calculate it's operand there in or the preheader.
1035  if (RegMask.PhysReg == ARM::VPR) {
1036  LLVM_DEBUG(dbgs() << " VPR is live in to the exit block.");
1037  return false;
1038  }
1039  // Check Q-regs that are live in the exit blocks. We don't collect scalars
1040  // because they won't be affected by lane predication.
1041  if (QPRs->contains(RegMask.PhysReg))
1042  if (auto *MI = RDA.getLocalLiveOutMIDef(Header, RegMask.PhysReg))
1043  LiveOutMIs.insert(MI);
1044  }
1045 
1046  // We've already validated that any VPT predication within the loop will be
1047  // equivalent when we perform the predication transformation; so we know that
1048  // any VPT predicated instruction is predicated upon VCTP. Any live-out
1049  // instruction needs to be predicated, so check this here. The instructions
1050  // in NonPredicated have been found to be a reduction that we can ensure its
1051  // legality. Any MQPRCopy found will need to validate its input as if it was
1052  // live out.
1053  SmallVector<MachineInstr *> Worklist(LiveOutMIs.begin(), LiveOutMIs.end());
1054  while (!Worklist.empty()) {
1055  MachineInstr *MI = Worklist.pop_back_val();
1056  if (MI->getOpcode() == ARM::MQPRCopy) {
1057  VMOVCopies.insert(MI);
1058  MachineInstr *CopySrc =
1059  RDA.getUniqueReachingMIDef(MI, MI->getOperand(1).getReg());
1060  if (CopySrc)
1061  Worklist.push_back(CopySrc);
1062  } else if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) {
1063  LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI);
1064  VMOVCopies.clear();
1065  return false;
1066  }
1067  }
1068 
1069  return true;
1070 }
1071 
1072 void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
1073  if (Revert)
1074  return;
1075 
1076  // Check branch target ranges: WLS[TP] can only branch forwards and LE[TP]
1077  // can only jump back.
1078  auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End,
1079  ARMBasicBlockUtils *BBUtils, MachineLoop &ML) {
1080  MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd
1081  ? End->getOperand(1).getMBB()
1082  : End->getOperand(2).getMBB();
1083  // TODO Maybe there's cases where the target doesn't have to be the header,
1084  // but for now be safe and revert.
1085  if (TgtBB != ML.getHeader()) {
1086  LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n");
1087  return false;
1088  }
1089 
1090  // The WLS and LE instructions have 12-bits for the label offset. WLS
1091  // requires a positive offset, while LE uses negative.
1092  if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) ||
1093  !BBUtils->isBBInRange(End, ML.getHeader(), 4094)) {
1094  LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
1095  return false;
1096  }
1097 
1098  if (isWhileLoopStart(*Start)) {
1099  MachineBasicBlock *TargetBB = getWhileLoopStartTargetBB(*Start);
1100  if (BBUtils->getOffsetOf(Start) > BBUtils->getOffsetOf(TargetBB) ||
1101  !BBUtils->isBBInRange(Start, TargetBB, 4094)) {
1102  LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
1103  return false;
1104  }
1105  }
1106  return true;
1107  };
1108 
1109  StartInsertPt = MachineBasicBlock::iterator(Start);
1110  StartInsertBB = Start->getParent();
1111  LLVM_DEBUG(dbgs() << "ARM Loops: Will insert LoopStart at "
1112  << *StartInsertPt);
1113 
1114  Revert = !ValidateRanges(Start, End, BBUtils, ML);
1115  CannotTailPredicate = !ValidateTailPredicate();
1116 }
1117 
1118 bool LowOverheadLoop::AddVCTP(MachineInstr *MI) {
1119  LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI);
1120  if (VCTPs.empty()) {
1121  VCTPs.push_back(MI);
1122  return true;
1123  }
1124 
1125  // If we find another VCTP, check whether it uses the same value as the main VCTP.
1126  // If it does, store it in the VCTPs set, else refuse it.
1127  MachineInstr *Prev = VCTPs.back();
1128  if (!Prev->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
1129  !RDA.hasSameReachingDef(Prev, MI, MI->getOperand(1).getReg().asMCReg())) {
1130  LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
1131  "definition from the main VCTP");
1132  return false;
1133  }
1134  VCTPs.push_back(MI);
1135  return true;
1136 }
1137 
1139 
1140  auto GetFrameIndex = [](MachineMemOperand *Operand) {
1141  const PseudoSourceValue *PseudoValue = Operand->getPseudoValue();
1142  if (PseudoValue && PseudoValue->kind() == PseudoSourceValue::FixedStack) {
1143  if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(PseudoValue)) {
1144  return FS->getFrameIndex();
1145  }
1146  }
1147  return -1;
1148  };
1149 
1150  auto IsStackOp = [GetFrameIndex](MachineInstr *I) {
1151  switch (I->getOpcode()) {
1152  case ARM::MVE_VSTRWU32:
1153  case ARM::MVE_VLDRWU32: {
1154  return I->getOperand(1).getReg() == ARM::SP &&
1155  I->memoperands().size() == 1 &&
1156  GetFrameIndex(I->memoperands().front()) >= 0;
1157  }
1158  default:
1159  return false;
1160  }
1161  };
1162 
1163  // An unpredicated vector register spill is allowed if all of the uses of the
1164  // stack slot are within the loop
1165  if (MI->getOpcode() != ARM::MVE_VSTRWU32 || !IsStackOp(MI))
1166  return false;
1167 
1168  // Search all blocks after the loop for accesses to the same stack slot.
1169  // ReachingDefAnalysis doesn't work for sp as it relies on registers being
1170  // live-out (which sp never is) to know what blocks to look in
1171  if (MI->memoperands().size() == 0)
1172  return false;
1173  int FI = GetFrameIndex(MI->memoperands().front());
1174 
1175  auto &FrameInfo = MI->getParent()->getParent()->getFrameInfo();
1176  if (FI == -1 || !FrameInfo.isSpillSlotObjectIndex(FI))
1177  return false;
1178 
1180  ML->getExitBlocks(Frontier);
1181  SmallPtrSet<MachineBasicBlock *, 4> Visited{MI->getParent()};
1182  unsigned Idx = 0;
1183  while (Idx < Frontier.size()) {
1184  MachineBasicBlock *BB = Frontier[Idx];
1185  bool LookAtSuccessors = true;
1186  for (auto &I : *BB) {
1187  if (!IsStackOp(&I) || I.memoperands().size() == 0)
1188  continue;
1189  if (GetFrameIndex(I.memoperands().front()) != FI)
1190  continue;
1191  // If this block has a store to the stack slot before any loads then we
1192  // can ignore the block
1193  if (I.getOpcode() == ARM::MVE_VSTRWU32) {
1194  LookAtSuccessors = false;
1195  break;
1196  }
1197  // If the store and the load are using the same stack slot then the
1198  // store isn't valid for tail predication
1199  if (I.getOpcode() == ARM::MVE_VLDRWU32)
1200  return false;
1201  }
1202 
1203  if (LookAtSuccessors) {
1204  for (auto Succ : BB->successors()) {
1205  if (!Visited.contains(Succ) && !is_contained(Frontier, Succ))
1206  Frontier.push_back(Succ);
1207  }
1208  }
1209  Visited.insert(BB);
1210  Idx++;
1211  }
1212 
1213  return true;
1214 }
1215 
1216 bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) {
1217  if (CannotTailPredicate)
1218  return false;
1219 
1220  if (!shouldInspect(*MI))
1221  return true;
1222 
1223  if (MI->getOpcode() == ARM::MVE_VPSEL ||
1224  MI->getOpcode() == ARM::MVE_VPNOT) {
1225  // TODO: Allow VPSEL and VPNOT, we currently cannot because:
1226  // 1) It will use the VPR as a predicate operand, but doesn't have to be
1227  // instead a VPT block, which means we can assert while building up
1228  // the VPT block because we don't find another VPT or VPST to being a new
1229  // one.
1230  // 2) VPSEL still requires a VPR operand even after tail predicating,
1231  // which means we can't remove it unless there is another
1232  // instruction, such as vcmp, that can provide the VPR def.
1233  return false;
1234  }
1235 
1236  // Record all VCTPs and check that they're equivalent to one another.
1237  if (isVCTP(MI) && !AddVCTP(MI))
1238  return false;
1239 
1240  // Inspect uses first so that any instructions that alter the VPR don't
1241  // alter the predicate upon themselves.
1242  const MCInstrDesc &MCID = MI->getDesc();
1243  bool IsUse = false;
1244  unsigned LastOpIdx = MI->getNumOperands() - 1;
1245  for (auto &Op : enumerate(reverse(MCID.operands()))) {
1246  const MachineOperand &MO = MI->getOperand(LastOpIdx - Op.index());
1247  if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR)
1248  continue;
1249 
1250  if (ARM::isVpred(Op.value().OperandType)) {
1251  VPTState::addInst(MI);
1252  IsUse = true;
1253  } else if (MI->getOpcode() != ARM::MVE_VPST) {
1254  LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
1255  return false;
1256  }
1257  }
1258 
1259  // If we find an instruction that has been marked as not valid for tail
1260  // predication, only allow the instruction if it's contained within a valid
1261  // VPT block.
1262  bool RequiresExplicitPredication =
1264  if (isDomainMVE(MI) && RequiresExplicitPredication) {
1265  if (MI->getOpcode() == ARM::MQPRCopy)
1266  return true;
1267  if (!IsUse && producesDoubleWidthResult(*MI)) {
1268  DoubleWidthResultInstrs.insert(MI);
1269  return true;
1270  }
1271 
1272  LLVM_DEBUG(if (!IsUse) dbgs()
1273  << "ARM Loops: Can't tail predicate: " << *MI);
1274  return IsUse;
1275  }
1276 
1277  // If the instruction is already explicitly predicated, then the conversion
1278  // will be fine, but ensure that all store operations are predicated.
1279  if (MI->mayStore() && !ValidateMVEStore(MI, &ML))
1280  return IsUse;
1281 
1282  // If this instruction defines the VPR, update the predicate for the
1283  // proceeding instructions.
1284  if (isVectorPredicate(MI)) {
1285  // Clear the existing predicate when we're not in VPT Active state,
1286  // otherwise we add to it.
1287  if (!isVectorPredicated(MI))
1288  VPTState::resetPredicate(MI);
1289  else
1290  VPTState::addPredicate(MI);
1291  }
1292 
1293  // Finally once the predicate has been modified, we can start a new VPT
1294  // block if necessary.
1295  if (isVPTOpcode(MI->getOpcode()))
1297 
1298  return true;
1299 }
1300 
1301 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
1302  const ARMSubtarget &ST = mf.getSubtarget<ARMSubtarget>();
1303  if (!ST.hasLOB())
1304  return false;
1305 
1306  MF = &mf;
1307  LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");
1308 
1309  MLI = &getAnalysis<MachineLoopInfo>();
1310  RDA = &getAnalysis<ReachingDefAnalysis>();
1312  MRI = &MF->getRegInfo();
1313  TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
1314  TRI = ST.getRegisterInfo();
1315  BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF));
1316  BBUtils->computeAllBlockSizes();
1317  BBUtils->adjustBBOffsetsAfter(&MF->front());
1318 
1319  bool Changed = false;
1320  for (auto ML : *MLI) {
1321  if (ML->isOutermost())
1322  Changed |= ProcessLoop(ML);
1323  }
1324  Changed |= RevertNonLoops();
1325  return Changed;
1326 }
1327 
1328 bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
1329 
1330  bool Changed = false;
1331 
1332  // Process inner loops first.
1333  for (MachineLoop *L : *ML)
1334  Changed |= ProcessLoop(L);
1335 
1336  LLVM_DEBUG({
1337  dbgs() << "ARM Loops: Processing loop containing:\n";
1338  if (auto *Preheader = ML->getLoopPreheader())
1339  dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
1340  else if (auto *Preheader = MLI->findLoopPreheader(ML, true, true))
1341  dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
1342  for (auto *MBB : ML->getBlocks())
1343  dbgs() << " - Block: " << printMBBReference(*MBB) << "\n";
1344  });
1345 
1346  // Search the given block for a loop start instruction. If one isn't found,
1347  // and there's only one predecessor block, search that one too.
1348  std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
1349  [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
1350  for (auto &MI : *MBB) {
1351  if (isLoopStart(MI))
1352  return &MI;
1353  }
1354  if (MBB->pred_size() == 1)
1355  return SearchForStart(*MBB->pred_begin());
1356  return nullptr;
1357  };
1358 
1359  LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII);
1360  // Search the preheader for the start intrinsic.
1361  // FIXME: I don't see why we shouldn't be supporting multiple predecessors
1362  // with potentially multiple set.loop.iterations, so we need to enable this.
1363  if (LoLoop.Preheader)
1364  LoLoop.Start = SearchForStart(LoLoop.Preheader);
1365  else
1366  return Changed;
1367 
1368  // Find the low-overhead loop components and decide whether or not to fall
1369  // back to a normal loop. Also look for a vctp instructions and decide
1370  // whether we can convert that predicate using tail predication.
1371  for (auto *MBB : reverse(ML->getBlocks())) {
1372  for (auto &MI : *MBB) {
1373  if (MI.isDebugValue())
1374  continue;
1375  else if (MI.getOpcode() == ARM::t2LoopDec)
1376  LoLoop.Dec = &MI;
1377  else if (MI.getOpcode() == ARM::t2LoopEnd)
1378  LoLoop.End = &MI;
1379  else if (MI.getOpcode() == ARM::t2LoopEndDec)
1380  LoLoop.End = LoLoop.Dec = &MI;
1381  else if (isLoopStart(MI))
1382  LoLoop.Start = &MI;
1383  else if (MI.getDesc().isCall()) {
1384  // TODO: Though the call will require LE to execute again, does this
1385  // mean we should revert? Always executing LE hopefully should be
1386  // faster than performing a sub,cmp,br or even subs,br.
1387  LoLoop.Revert = true;
1388  LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
1389  } else {
1390  // Record VPR defs and build up their corresponding vpt blocks.
1391  // Check we know how to tail predicate any mve instructions.
1392  LoLoop.AnalyseMVEInst(&MI);
1393  }
1394  }
1395  }
1396 
1397  LLVM_DEBUG(LoLoop.dump());
1398  if (!LoLoop.FoundAllComponents()) {
1399  LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n");
1400  return Changed;
1401  }
1402 
1403  assert(LoLoop.Start->getOpcode() != ARM::t2WhileLoopStart &&
1404  "Expected t2WhileLoopStart to be removed before regalloc!");
1405 
1406  // Check that the only instruction using LoopDec is LoopEnd. This can only
1407  // happen when the Dec and End are separate, not a single t2LoopEndDec.
1408  // TODO: Check for copy chains that really have no effect.
1409  if (LoLoop.Dec != LoLoop.End) {
1411  RDA->getReachingLocalUses(LoLoop.Dec, MCRegister::from(ARM::LR), Uses);
1412  if (Uses.size() > 1 || !Uses.count(LoLoop.End)) {
1413  LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n");
1414  LoLoop.Revert = true;
1415  }
1416  }
1417  LoLoop.Validate(BBUtils.get());
1418  Expand(LoLoop);
1419  return true;
1420 }
1421 
1422 // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
1423 // beq that branches to the exit branch.
1424 // TODO: We could also try to generate a cbz if the value in LR is also in
1425 // another low register.
1426 void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
1427  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
1429  unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
1430  ARM::tBcc : ARM::t2Bcc;
1431 
1432  RevertWhileLoopStartLR(MI, TII, BrOpc);
1433 }
1434 
1435 void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const {
1436  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI);
1438 }
1439 
1441  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
1442  MachineBasicBlock *MBB = MI->getParent();
1444  for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) {
1445  if (I->getOpcode() == ARM::t2LoopEnd) {
1446  Ignore.insert(&*I);
1447  break;
1448  }
1449  }
1450 
1451  // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
1452  bool SetFlags =
1454 
1455  llvm::RevertLoopDec(MI, TII, SetFlags);
1456  return SetFlags;
1457 }
1458 
1459 // Generate a subs, or sub and cmp, and a branch instead of an LE.
1460 void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
1461  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
1462 
1463  MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
1464  unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
1465  ARM::tBcc : ARM::t2Bcc;
1466 
1467  llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp);
1468 }
1469 
1470 // Generate a subs, or sub and cmp, and a branch instead of an LE.
1471 void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const {
1472  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI);
1473  assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!");
1474  MachineBasicBlock *MBB = MI->getParent();
1475 
1476  MachineInstrBuilder MIB =
1477  BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
1478  MIB.addDef(ARM::LR);
1479  MIB.add(MI->getOperand(1));
1480  MIB.addImm(1);
1481  MIB.addImm(ARMCC::AL);
1482  MIB.addReg(ARM::NoRegister);
1483  MIB.addReg(ARM::CPSR);
1484  MIB->getOperand(5).setIsDef(true);
1485 
1486  MachineBasicBlock *DestBB = MI->getOperand(2).getMBB();
1487  unsigned BrOpc =
1488  BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc;
1489 
1490  // Create bne
1491  MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
1492  MIB.add(MI->getOperand(2)); // branch target
1493  MIB.addImm(ARMCC::NE); // condition code
1494  MIB.addReg(ARM::CPSR);
1495 
1496  MI->eraseFromParent();
1497 }
1498 
1499 // Perform dead code elimation on the loop iteration count setup expression.
1500 // If we are tail-predicating, the number of elements to be processed is the
1501 // operand of the VCTP instruction in the vector body, see getCount(), which is
1502 // register $r3 in this example:
1503 //
1504 // $lr = big-itercount-expression
1505 // ..
1506 // $lr = t2DoLoopStart renamable $lr
1507 // vector.body:
1508 // ..
1509 // $vpr = MVE_VCTP32 renamable $r3
1510 // renamable $lr = t2LoopDec killed renamable $lr, 1
1511 // t2LoopEnd renamable $lr, %vector.body
1512 // tB %end
1513 //
1514 // What we would like achieve here is to replace the do-loop start pseudo
1515 // instruction t2DoLoopStart with:
1516 //
1517 // $lr = MVE_DLSTP_32 killed renamable $r3
1518 //
1519 // Thus, $r3 which defines the number of elements, is written to $lr,
1520 // and then we want to delete the whole chain that used to define $lr,
1521 // see the comment below how this chain could look like.
1522 //
1523 void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) {
1524  if (!LoLoop.IsTailPredicationLegal())
1525  return;
1526 
1527  LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n");
1528 
1529  MachineInstr *Def = RDA->getMIOperand(LoLoop.Start, 1);
1530  if (!Def) {
1531  LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n");
1532  return;
1533  }
1534 
1535  // Collect and remove the users of iteration count.
1536  SmallPtrSet<MachineInstr*, 4> Killed = { LoLoop.Start, LoLoop.Dec,
1537  LoLoop.End };
1538  if (!TryRemove(Def, *RDA, LoLoop.ToRemove, Killed))
1539  LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n");
1540 }
1541 
1542 MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
1543  LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n");
1544  // When using tail-predication, try to delete the dead code that was used to
1545  // calculate the number of loop iterations.
1546  IterationCountDCE(LoLoop);
1547 
1548  MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt;
1549  MachineInstr *Start = LoLoop.Start;
1550  MachineBasicBlock *MBB = LoLoop.StartInsertBB;
1551  unsigned Opc = LoLoop.getStartOpcode();
1552  MachineOperand &Count = LoLoop.getLoopStartOperand();
1553 
1554  // A DLS lr, lr we needn't emit
1555  MachineInstr* NewStart;
1556  if (Opc == ARM::t2DLS && Count.isReg() && Count.getReg() == ARM::LR) {
1557  LLVM_DEBUG(dbgs() << "ARM Loops: Didn't insert start: DLS lr, lr");
1558  NewStart = nullptr;
1559  } else {
1560  MachineInstrBuilder MIB =
1561  BuildMI(*MBB, InsertPt, Start->getDebugLoc(), TII->get(Opc));
1562 
1563  MIB.addDef(ARM::LR);
1564  MIB.add(Count);
1565  if (isWhileLoopStart(*Start))
1566  MIB.addMBB(getWhileLoopStartTargetBB(*Start));
1567 
1568  LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
1569  NewStart = &*MIB;
1570  }
1571 
1572  LoLoop.ToRemove.insert(Start);
1573  return NewStart;
1574 }
1575 
1576 void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
1577  auto RemovePredicate = [](MachineInstr *MI) {
1578  if (MI->isDebugInstr())
1579  return;
1580  LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
1581  int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
1582  assert(PIdx >= 1 && "Trying to unpredicate a non-predicated instruction");
1583  assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then &&
1584  "Expected Then predicate!");
1585  MI->getOperand(PIdx).setImm(ARMVCC::None);
1586  MI->getOperand(PIdx + 1).setReg(0);
1587  };
1588 
1589  for (auto &Block : LoLoop.getVPTBlocks()) {
1590  SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
1591 
1592  auto ReplaceVCMPWithVPT = [&](MachineInstr *&TheVCMP, MachineInstr *At) {
1593  assert(TheVCMP && "Replacing a removed or non-existent VCMP");
1594  // Replace the VCMP with a VPT
1595  MachineInstrBuilder MIB =
1596  BuildMI(*At->getParent(), At, At->getDebugLoc(),
1597  TII->get(VCMPOpcodeToVPT(TheVCMP->getOpcode())));
1598  MIB.addImm(ARMVCC::Then);
1599  // Register one
1600  MIB.add(TheVCMP->getOperand(1));
1601  // Register two
1602  MIB.add(TheVCMP->getOperand(2));
1603  // The comparison code, e.g. ge, eq, lt
1604  MIB.add(TheVCMP->getOperand(3));
1605  LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
1606  LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
1607  LoLoop.ToRemove.insert(TheVCMP);
1608  TheVCMP = nullptr;
1609  };
1610 
1611  if (VPTState::isEntryPredicatedOnVCTP(Block, /*exclusive*/ true)) {
1612  MachineInstr *VPST = Insts.front();
1613  if (VPTState::hasUniformPredicate(Block)) {
1614  // A vpt block starting with VPST, is only predicated upon vctp and has no
1615  // internal vpr defs:
1616  // - Remove vpst.
1617  // - Unpredicate the remaining instructions.
1618  LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1619  for (unsigned i = 1; i < Insts.size(); ++i)
1620  RemovePredicate(Insts[i]);
1621  } else {
1622  // The VPT block has a non-uniform predicate but it uses a vpst and its
1623  // entry is guarded only by a vctp, which means we:
1624  // - Need to remove the original vpst.
1625  // - Then need to unpredicate any following instructions, until
1626  // we come across the divergent vpr def.
1627  // - Insert a new vpst to predicate the instruction(s) that following
1628  // the divergent vpr def.
1629  MachineInstr *Divergent = VPTState::getDivergent(Block);
1630  MachineBasicBlock *MBB = Divergent->getParent();
1631  auto DivergentNext = ++MachineBasicBlock::iterator(Divergent);
1632  while (DivergentNext != MBB->end() && DivergentNext->isDebugInstr())
1633  ++DivergentNext;
1634 
1635  bool DivergentNextIsPredicated =
1636  DivergentNext != MBB->end() &&
1637  getVPTInstrPredicate(*DivergentNext) != ARMVCC::None;
1638 
1639  for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext;
1640  I != E; ++I)
1641  RemovePredicate(&*I);
1642 
1643  // Check if the instruction defining vpr is a vcmp so it can be combined
1644  // with the VPST This should be the divergent instruction
1645  MachineInstr *VCMP =
1646  VCMPOpcodeToVPT(Divergent->getOpcode()) != 0 ? Divergent : nullptr;
1647 
1648  if (DivergentNextIsPredicated) {
1649  // Insert a VPST at the divergent only if the next instruction
1650  // would actually use it. A VCMP following a VPST can be
1651  // merged into a VPT so do that instead if the VCMP exists.
1652  if (!VCMP) {
1653  // Create a VPST (with a null mask for now, we'll recompute it
1654  // later)
1655  MachineInstrBuilder MIB =
1656  BuildMI(*Divergent->getParent(), Divergent,
1657  Divergent->getDebugLoc(), TII->get(ARM::MVE_VPST));
1658  MIB.addImm(0);
1659  LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
1660  LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
1661  } else {
1662  // No RDA checks are necessary here since the VPST would have been
1663  // directly after the VCMP
1664  ReplaceVCMPWithVPT(VCMP, VCMP);
1665  }
1666  }
1667  }
1668  LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1669  LoLoop.ToRemove.insert(VPST);
1670  } else if (Block.containsVCTP()) {
1671  // The vctp will be removed, so either the entire block will be dead or
1672  // the block mask of the vp(s)t will need to be recomputed.
1673  MachineInstr *VPST = Insts.front();
1674  if (Block.size() == 2) {
1675  assert(VPST->getOpcode() == ARM::MVE_VPST &&
1676  "Found a VPST in an otherwise empty vpt block");
1677  LoLoop.ToRemove.insert(VPST);
1678  } else
1679  LoLoop.BlockMasksToRecompute.insert(VPST);
1680  } else if (Insts.front()->getOpcode() == ARM::MVE_VPST) {
1681  // If this block starts with a VPST then attempt to merge it with the
1682  // preceeding un-merged VCMP into a VPT. This VCMP comes from a VPT
1683  // block that no longer exists
1684  MachineInstr *VPST = Insts.front();
1685  auto Next = ++MachineBasicBlock::iterator(VPST);
1687  "The instruction after a VPST must be predicated");
1688  (void)Next;
1689  MachineInstr *VprDef = RDA->getUniqueReachingMIDef(VPST, ARM::VPR);
1690  if (VprDef && VCMPOpcodeToVPT(VprDef->getOpcode()) &&
1691  !LoLoop.ToRemove.contains(VprDef)) {
1692  MachineInstr *VCMP = VprDef;
1693  // The VCMP and VPST can only be merged if the VCMP's operands will have
1694  // the same values at the VPST.
1695  // If any of the instructions between the VCMP and VPST are predicated
1696  // then a different code path is expected to have merged the VCMP and
1697  // VPST already.
1700  RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(1).getReg()) &&
1701  RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(2).getReg())) {
1702  ReplaceVCMPWithVPT(VCMP, VPST);
1703  LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1704  LoLoop.ToRemove.insert(VPST);
1705  }
1706  }
1707  }
1708  }
1709 
1710  LoLoop.ToRemove.insert(LoLoop.VCTPs.begin(), LoLoop.VCTPs.end());
1711 }
1712 
1713 void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
1714 
1715  // Combine the LoopDec and LoopEnd instructions into LE(TP).
1716  auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) {
1717  MachineInstr *End = LoLoop.End;
1718  MachineBasicBlock *MBB = End->getParent();
1719  unsigned Opc = LoLoop.IsTailPredicationLegal() ?
1720  ARM::MVE_LETP : ARM::t2LEUpdate;
1721  MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
1722  TII->get(Opc));
1723  MIB.addDef(ARM::LR);
1724  unsigned Off = LoLoop.Dec == LoLoop.End ? 1 : 0;
1725  MIB.add(End->getOperand(Off + 0));
1726  MIB.add(End->getOperand(Off + 1));
1727  LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
1728  LoLoop.ToRemove.insert(LoLoop.Dec);
1729  LoLoop.ToRemove.insert(End);
1730  return &*MIB;
1731  };
1732 
1733  // TODO: We should be able to automatically remove these branches before we
1734  // get here - probably by teaching analyzeBranch about the pseudo
1735  // instructions.
1736  // If there is an unconditional branch, after I, that just branches to the
1737  // next block, remove it.
1738  auto RemoveDeadBranch = [](MachineInstr *I) {
1739  MachineBasicBlock *BB = I->getParent();
1740  MachineInstr *Terminator = &BB->instr_back();
1741  if (Terminator->isUnconditionalBranch() && I != Terminator) {
1742  MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
1743  if (BB->isLayoutSuccessor(Succ)) {
1744  LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
1745  Terminator->eraseFromParent();
1746  }
1747  }
1748  };
1749 
1750  // And VMOVCopies need to become 2xVMOVD for tail predication to be valid.
1751  // Anything other MQPRCopy can be converted to MVE_VORR later on.
1752  auto ExpandVMOVCopies = [this](SmallPtrSet<MachineInstr *, 4> &VMOVCopies) {
1753  for (auto *MI : VMOVCopies) {
1754  LLVM_DEBUG(dbgs() << "Converting copy to VMOVD: " << *MI);
1755  assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
1756  MachineBasicBlock *MBB = MI->getParent();
1757  Register Dst = MI->getOperand(0).getReg();
1758  Register Src = MI->getOperand(1).getReg();
1759  auto MIB1 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
1760  ARM::D0 + (Dst - ARM::Q0) * 2)
1761  .addReg(ARM::D0 + (Src - ARM::Q0) * 2)
1762  .add(predOps(ARMCC::AL));
1763  (void)MIB1;
1764  LLVM_DEBUG(dbgs() << " into " << *MIB1);
1765  auto MIB2 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
1766  ARM::D0 + (Dst - ARM::Q0) * 2 + 1)
1767  .addReg(ARM::D0 + (Src - ARM::Q0) * 2 + 1)
1768  .add(predOps(ARMCC::AL));
1769  LLVM_DEBUG(dbgs() << " and " << *MIB2);
1770  (void)MIB2;
1771  MI->eraseFromParent();
1772  }
1773  };
1774 
1775  if (LoLoop.Revert) {
1776  if (isWhileLoopStart(*LoLoop.Start))
1777  RevertWhile(LoLoop.Start);
1778  else
1779  RevertDo(LoLoop.Start);
1780  if (LoLoop.Dec == LoLoop.End)
1781  RevertLoopEndDec(LoLoop.End);
1782  else
1783  RevertLoopEnd(LoLoop.End, RevertLoopDec(LoLoop.Dec));
1784  } else {
1785  ExpandVMOVCopies(LoLoop.VMOVCopies);
1786  LoLoop.Start = ExpandLoopStart(LoLoop);
1787  if (LoLoop.Start)
1788  RemoveDeadBranch(LoLoop.Start);
1789  LoLoop.End = ExpandLoopEnd(LoLoop);
1790  RemoveDeadBranch(LoLoop.End);
1791  if (LoLoop.IsTailPredicationLegal())
1792  ConvertVPTBlocks(LoLoop);
1793  for (auto *I : LoLoop.ToRemove) {
1794  LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I);
1795  I->eraseFromParent();
1796  }
1797  for (auto *I : LoLoop.BlockMasksToRecompute) {
1798  LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I);
1800  LLVM_DEBUG(dbgs() << " ... done: " << *I);
1801  }
1802  }
1803 
1804  PostOrderLoopTraversal DFS(LoLoop.ML, *MLI);
1805  DFS.ProcessLoop();
1806  const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder();
1807  for (auto *MBB : PostOrder) {
1809  // FIXME: For some reason, the live-in print order is non-deterministic for
1810  // our tests and I can't out why... So just sort them.
1812  }
1813 
1814  for (auto *MBB : reverse(PostOrder))
1816 
1817  // We've moved, removed and inserted new instructions, so update RDA.
1818  RDA->reset();
1819 }
1820 
1821 bool ARMLowOverheadLoops::RevertNonLoops() {
1822  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n");
1823  bool Changed = false;
1824 
1825  for (auto &MBB : *MF) {
1830  SmallVector<MachineInstr *, 4> MQPRCopies;
1831 
1832  for (auto &I : MBB) {
1833  if (isLoopStart(I))
1834  Starts.push_back(&I);
1835  else if (I.getOpcode() == ARM::t2LoopDec)
1836  Decs.push_back(&I);
1837  else if (I.getOpcode() == ARM::t2LoopEnd)
1838  Ends.push_back(&I);
1839  else if (I.getOpcode() == ARM::t2LoopEndDec)
1840  EndDecs.push_back(&I);
1841  else if (I.getOpcode() == ARM::MQPRCopy)
1842  MQPRCopies.push_back(&I);
1843  }
1844 
1845  if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty() &&
1846  MQPRCopies.empty())
1847  continue;
1848 
1849  Changed = true;
1850 
1851  for (auto *Start : Starts) {
1852  if (isWhileLoopStart(*Start))
1853  RevertWhile(Start);
1854  else
1855  RevertDo(Start);
1856  }
1857  for (auto *Dec : Decs)
1858  RevertLoopDec(Dec);
1859 
1860  for (auto *End : Ends)
1861  RevertLoopEnd(End);
1862  for (auto *End : EndDecs)
1863  RevertLoopEndDec(End);
1864  for (auto *MI : MQPRCopies) {
1865  LLVM_DEBUG(dbgs() << "Converting copy to VORR: " << *MI);
1866  assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
1867  MachineBasicBlock *MBB = MI->getParent();
1868  auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::MVE_VORR),
1869  MI->getOperand(0).getReg())
1870  .add(MI->getOperand(1))
1871  .add(MI->getOperand(1));
1872  addUnpredicatedMveVpredROp(MIB, MI->getOperand(0).getReg());
1873  MI->eraseFromParent();
1874  }
1875  }
1876  return Changed;
1877 }
1878 
1880  return new ARMLowOverheadLoops();
1881 }
isVectorPredicated
static bool isVectorPredicated(MachineInstr *MI)
Definition: ARMLowOverheadLoops.cpp:84
llvm::TargetRegisterInfo::getRegAsmName
virtual StringRef getRegAsmName(MCRegister Reg) const
Return the assembly name for Reg.
Definition: TargetRegisterInfo.h:1022
i
i
Definition: README.txt:29
ARMSubtarget.h
INITIALIZE_PASS
INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, false, false) static bool TryRemove(MachineInstr *MI
producesFalseLanesZero
static bool producesFalseLanesZero(MachineInstr &MI, const TargetRegisterClass *QPRs, const ReachingDefAnalysis &RDA, InstSet &FalseLanesZero)
Definition: ARMLowOverheadLoops.cpp:878
llvm::ReachingDefAnalysis::reset
void reset()
Re-run the analysis.
Definition: ReachingDefAnalysis.cpp:230
llvm::MachineBasicBlock::pred_begin
pred_iterator pred_begin()
Definition: MachineBasicBlock.h:326
MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:104
llvm::ReachingDefAnalysis::hasSameReachingDef
bool hasSameReachingDef(MachineInstr *A, MachineInstr *B, MCRegister PhysReg) const
Return whether A and B use the same def of PhysReg.
Definition: ReachingDefAnalysis.cpp:291
llvm::MachineInstrBuilder::addImm
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Definition: MachineInstrBuilder.h:131
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:17
llvm::tgtok::Def
@ Def
Definition: TGLexer.h:50
llvm::ReachingDefAnalysis::isSafeToRemove
bool isSafeToRemove(MachineInstr *MI, InstSet &ToRemove) const
Return whether removing this instruction will have no effect on the program, returning the redundant ...
Definition: ReachingDefAnalysis.cpp:605
llvm::ARM::isVpred
bool isVpred(OperandType op)
Definition: ARMMCTargetDesc.h:120
llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1631
llvm::LoopBase::getExitBlocks
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
Definition: LoopInfoImpl.h:61
llvm::ReachingDefAnalysis::getUniqueReachingMIDef
MachineInstr * getUniqueReachingMIDef(MachineInstr *MI, MCRegister PhysReg) const
If a single MachineInstr creates the reaching definition, then return it.
Definition: ReachingDefAnalysis.cpp:438
llvm::SmallPtrSetImpl::erase
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:379
llvm::ARMII::VecSize
@ VecSize
Definition: ARMBaseInfo.h:421
llvm::ReachingDefAnalysis
This class provides the reaching def analysis.
Definition: ReachingDefAnalysis.h:69
MCInstrDesc.h
llvm::ARMSubtarget
Definition: ARMSubtarget.h:47
SetOperations.h
llvm::MachineRegisterInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Definition: MachineRegisterInfo.h:50
llvm::MCRegister::from
static MCRegister from(unsigned Val)
Check the provided unsigned value is a valid MCRegister.
Definition: MCRegister.h:67
llvm::MachineInstrBuilder::add
const MachineInstrBuilder & add(const MachineOperand &MO) const
Definition: MachineInstrBuilder.h:224
llvm::createARMLowOverheadLoopsPass
FunctionPass * createARMLowOverheadLoopsPass()
Definition: ARMLowOverheadLoops.cpp:1879
llvm::AArch64::FalseLanesZero
@ FalseLanesZero
Definition: AArch64InstrInfo.h:539
llvm::recomputeLivenessFlags
void recomputeLivenessFlags(MachineBasicBlock &MBB)
Recomputes dead and kill flags in MBB.
Definition: LivePhysRegs.cpp:281
llvm::ARMII::DoubleWidthResult
@ DoubleWidthResult
Definition: ARMBaseInfo.h:413
llvm::LoopBase::contains
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:138
llvm::MachineLoopInfo::findLoopPreheader
MachineBasicBlock * findLoopPreheader(MachineLoop *L, bool SpeculativePreheader=false, bool FindMultiLoopPreheader=false) const
Find the block that either is the loop preheader, or could speculatively be used as the preheader.
Definition: MachineLoopInfo.cpp:118
llvm::SetVector::size
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:77
llvm::isDoLoopStart
static bool isDoLoopStart(const MachineInstr &MI)
Definition: MVETailPredUtils.h:71
Ignore
ReachingDefAnalysis InstSet InstSet & Ignore
Definition: ARMLowOverheadLoops.cpp:542
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1185
llvm::printMBBReference
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
Definition: MachineBasicBlock.cpp:116
llvm::enumerate
detail::enumerator< R > enumerate(R &&TheRange)
Given an input range, returns a new range whose values are are pair (A,B) such that A is the 0-based ...
Definition: STLExtras.h:2057
ToRemove
ReachingDefAnalysis InstSet & ToRemove
Definition: ARMLowOverheadLoops.cpp:542
llvm::recomputeLiveIns
static void recomputeLiveIns(MachineBasicBlock &MBB)
Convenience function for recomputing live-in's for MBB.
Definition: LivePhysRegs.h:198
isDomainMVE
static bool isDomainMVE(MachineInstr *MI)
Definition: ARMLowOverheadLoops.cpp:97
llvm::MachineFunctionPass
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
Definition: MachineFunctionPass.h:30
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:139
IT
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
hasVPRUse
static bool hasVPRUse(MachineInstr &MI)
Definition: ARMLowOverheadLoops.cpp:93
llvm::TargetRegisterInfo
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Definition: TargetRegisterInfo.h:234
MachineLoopUtils.h
llvm::MachineBasicBlock::liveins
iterator_range< livein_iterator > liveins() const
Definition: MachineBasicBlock.h:422
llvm::reverse
auto reverse(ContainerTy &&C, std::enable_if_t< has_rbegin< ContainerTy >::value > *=nullptr)
Definition: STLExtras.h:380
llvm::MachineMemOperand
A description of a memory reference used in the backend.
Definition: MachineMemOperand.h:127
shouldInspect
static bool shouldInspect(MachineInstr &MI)
Definition: ARMLowOverheadLoops.cpp:108
llvm::MachineFunctionProperties
Properties which a MachineFunction may have at a given point in time.
Definition: MachineFunction.h:127
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
llvm::ARMVCC::Then
@ Then
Definition: ARMBaseInfo.h:91
CreateVPTBlock
static ARM::PredBlockMask CreateVPTBlock(MachineBasicBlock::instr_iterator &Iter, MachineBasicBlock::instr_iterator EndIter, SmallVectorImpl< MachineInstr * > &DeadInstructions)
Definition: MVEVPTBlockPass.cpp:163
llvm::dump
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Definition: SparseBitVector.h:877
llvm::MachineBasicBlock::back
MachineInstr & back()
Definition: MachineBasicBlock.h:258
TRI
unsigned const TargetRegisterInfo * TRI
Definition: MachineSink.cpp:1628
llvm::VCTPOpcodeToLSTP
static unsigned VCTPOpcodeToLSTP(unsigned Opcode, bool IsDoLoop)
Definition: MVETailPredUtils.h:25
llvm::ARMII::ValidForTailPredication
@ ValidForTailPredication
Definition: ARMBaseInfo.h:402
llvm::LoopBase::getNumBlocks
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:201
llvm::ReachingDefAnalysis::hasLocalDefBefore
bool hasLocalDefBefore(MachineInstr *MI, MCRegister PhysReg) const
Provide whether the register has been defined in the same basic block as, and before,...
Definition: ReachingDefAnalysis.cpp:326
llvm::ReachingDefAnalysis::getLocalLiveOutMIDef
MachineInstr * getLocalLiveOutMIDef(MachineBasicBlock *MBB, MCRegister PhysReg) const
Return the local MI that produces the live out value for PhysReg, or nullptr for a non-live out or no...
Definition: ReachingDefAnalysis.cpp:527
llvm::MachineFunctionPass::getAnalysisUsage
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Definition: MachineFunctionPass.cpp:103
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::MachineLoopInfo
Definition: MachineLoopInfo.h:89
MachineRegisterInfo.h
Uses
SmallPtrSet< MachineInstr *, 2 > Uses
Definition: ARMLowOverheadLoops.cpp:585
llvm::ReachingDefAnalysis::isSafeToMoveBackwards
bool isSafeToMoveBackwards(MachineInstr *From, MachineInstr *To) const
Return whether From can be moved backwards to just after To.
Definition: ReachingDefAnalysis.cpp:595
llvm::getAddSubImmediate
int getAddSubImmediate(MachineInstr &MI)
Definition: ARMBaseInstrInfo.h:871
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::MachineInstrBuilder::addDef
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Definition: MachineInstrBuilder.h:116
llvm::MachineBasicBlock::pred_size
unsigned pred_size() const
Definition: MachineBasicBlock.h:338
llvm::TargetRegisterClass::contains
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
Definition: TargetRegisterInfo.h:94
llvm::MachineFunction::front
const MachineBasicBlock & front() const
Definition: MachineFunction.h:865
llvm::MachineFunction::getRegInfo
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Definition: MachineFunction.h:666
llvm::MCInstrDesc::TSFlags
uint64_t TSFlags
Definition: MCInstrDesc.h:205
llvm::PseudoSourceValue::kind
unsigned kind() const
Definition: PseudoSourceValue.h:66
llvm::MachineBasicBlock::insertAfter
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
Definition: MachineBasicBlock.h:903
MachineLoopInfo.h
llvm::SetVector::begin
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:82
llvm::MachineInstrBuilder::addMBB
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Definition: MachineInstrBuilder.h:146
llvm::ARM_MC::isPredicated
bool isPredicated(const MCInst &MI, const MCInstrInfo *MCII)
Definition: ARMMCTargetDesc.cpp:169
llvm::ReachingDefAnalysis::getGlobalUses
void getGlobalUses(MachineInstr *MI, MCRegister PhysReg, InstSet &Uses) const
Collect the users of the value stored in PhysReg, which is defined by MI.
Definition: ReachingDefAnalysis.cpp:375
producesDoubleWidthResult
static bool producesDoubleWidthResult(const MachineInstr &MI)
Definition: ARMLowOverheadLoops.cpp:833
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::MachineOperand::isUse
bool isUse() const
Definition: MachineOperand.h:369
llvm::MachineInstr::getOperand
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:501
llvm::RevertLoopDec
void RevertLoopDec(MachineInstr *MI, const TargetInstrInfo *TII, bool SetFlags=false)
Definition: MVETailPredUtils.h:145
llvm::findFirstVPTPredOperandIdx
int findFirstVPTPredOperandIdx(const MachineInstr &MI)
Definition: Thumb2InstrInfo.cpp:774
llvm::RevertLoopEnd
void RevertLoopEnd(MachineInstr *MI, const TargetInstrInfo *TII, unsigned BrOpc=ARM::t2Bcc, bool SkipCmp=false)
Definition: MVETailPredUtils.h:167
llvm::TargetRegisterClass
Definition: TargetRegisterInfo.h:45
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
Domain
Domain
Definition: CorrelatedValuePropagation.cpp:710
getVecSize
static int getVecSize(const MachineInstr &MI)
Definition: ARMLowOverheadLoops.cpp:102
llvm::MachineFunction::getProperties
const MachineFunctionProperties & getProperties() const
Get the function properties.
Definition: MachineFunction.h:747
MVETailPredUtils.h
llvm::getVPTInstrPredicate
ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI, Register &PredReg)
Definition: Thumb2InstrInfo.cpp:787
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:125
llvm::MCInstrDesc
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:197
llvm::MachineOperand
MachineOperand class - Representation of each machine instruction operand.
Definition: MachineOperand.h:48
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:187
llvm::MachineFunctionProperties::set
MachineFunctionProperties & set(Property P)
Definition: MachineFunction.h:196
llvm::MachineBasicBlock::RegisterMaskPair
Pair of physical register and lane mask.
Definition: MachineBasicBlock.h:100
llvm::ARMISD::VCMP
@ VCMP
Definition: ARMISelLowering.h:147
llvm::PseudoSourceValue::FixedStack
@ FixedStack
Definition: PseudoSourceValue.h:42
DisableTailPredication
static cl::opt< bool > DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden, cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass"), cl::init(false))
llvm::PseudoSourceValue
Special value supplied for machine level alias analysis.
Definition: PseudoSourceValue.h:35
llvm::ARMII::HorizontalReduction
@ HorizontalReduction
Definition: ARMBaseInfo.h:409
isVectorPredicate
static bool isVectorPredicate(MachineInstr *MI)
Definition: ARMLowOverheadLoops.cpp:89
llvm::VCMPOpcodeToVPT
static unsigned VCMPOpcodeToVPT(unsigned Opcode)
Definition: ARMBaseInstrInfo.h:587
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
llvm::MachineBasicBlock
Definition: MachineBasicBlock.h:94
llvm::MachineFunctionProperties::Property::NoVRegs
@ NoVRegs
llvm::ARMCC::NE
@ NE
Definition: ARMBaseInfo.h:32
Passes.h
llvm::ARMCC::AL
@ AL
Definition: ARMBaseInfo.h:45
llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition: MachineFunction.h:656
llvm::cl::opt< bool >
llvm::MachineInstr::getDebugLoc
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:420
llvm::MachineLoop
Definition: MachineLoopInfo.h:44
llvm::SetVector::contains
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:209
ARMBaseRegisterInfo.h
llvm::isVPTOpcode
static bool isVPTOpcode(int Opc)
Definition: ARMBaseInstrInfo.h:571
llvm::TargetRegisterInfo::getRegClass
const TargetRegisterClass * getRegClass(unsigned i) const
Returns the register class associated with the enumeration value.
Definition: TargetRegisterInfo.h:750
llvm::MachineFunctionProperties::Property::TracksLiveness
@ TracksLiveness
llvm::MachineOperand::isReg
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Definition: MachineOperand.h:320
llvm::MachineInstr
Representation of each machine instruction.
Definition: MachineInstr.h:66
llvm::MachineInstrBuilder
Definition: MachineInstrBuilder.h:69
uint64_t
llvm::RevertDoLoopStart
void RevertDoLoopStart(MachineInstr *MI, const TargetInstrInfo *TII)
Definition: MVETailPredUtils.h:135
llvm::SmallPtrSetImpl::end
iterator end() const
Definition: SmallPtrSet.h:408
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::ARMBaseInstrInfo
Definition: ARMBaseInstrInfo.h:37
llvm::ARMII::RetainsPreviousHalfElement
@ RetainsPreviousHalfElement
Definition: ARMBaseInfo.h:406
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
llvm::LoopBase::getLoopPreheader
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:166
llvm::SmallPtrSetImpl::begin
iterator begin() const
Definition: SmallPtrSet.h:403
llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:1682
MachineFunctionPass.h
llvm::MachineFunction::getName
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
Definition: MachineFunction.cpp:565
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::X86AS::FS
@ FS
Definition: X86.h:192
llvm::isVCTP
static bool isVCTP(const MachineInstr *MI)
Definition: MVETailPredUtils.h:58
llvm::RegState::Dead
@ Dead
Unused definition.
Definition: MachineInstrBuilder.h:50
llvm::MachineBasicBlock::getParent
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
Definition: MachineBasicBlock.h:234
DEBUG_TYPE
#define DEBUG_TYPE
Definition: ARMLowOverheadLoops.cpp:76
ARMBaseInstrInfo.h
function
print Print MemDeps of function
Definition: MemDepPrinter.cpp:82
llvm::MachineInstrBuilder::addReg
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Definition: MachineInstrBuilder.h:97
llvm::MachineOperand::getReg
Register getReg() const
getReg - Returns the register number.
Definition: MachineOperand.h:359
llvm::ARMVCC::None
@ None
Definition: ARMBaseInfo.h:90
llvm::Register::asMCReg
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition: Register.h:120
ARM.h
llvm::ARMII::VecSizeShift
@ VecSizeShift
Definition: ARMBaseInfo.h:420
llvm::AMDGPU::IsaInfo::TargetIDSetting::Off
@ Off
isRegInClass
static bool isRegInClass(const MachineOperand &MO, const TargetRegisterClass *Class)
Definition: ARMLowOverheadLoops.cpp:816
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
llvm::MachineFunction
Definition: MachineFunction.h:257
llvm::SetVector::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
llvm::size
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1598
llvm::MachineOperand::setIsDef
void setIsDef(bool Val=true)
Change a def to a use, or a use to a def.
Definition: MachineOperand.cpp:99
llvm::SmallPtrSetImplBase::clear
void clear()
Definition: SmallPtrSet.h:95
llvm::MachineBasicBlock::iterator
MachineInstrBundleIterator< MachineInstr > iterator
Definition: MachineBasicBlock.h:242
isHorizontalReduction
static bool isHorizontalReduction(const MachineInstr &MI)
Definition: ARMLowOverheadLoops.cpp:839
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1624
llvm::MachineBasicBlock::successors
iterator_range< succ_iterator > successors()
Definition: MachineBasicBlock.h:365
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:58
llvm::AnalysisUsage::setPreservesCFG
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:263
llvm::MachineInstr::getOpcode
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:491
llvm::ReachingDefAnalysis::getReachingLocalUses
void getReachingLocalUses(MachineInstr *MI, MCRegister PhysReg, InstSet &Uses) const
Provides the uses, in the same block as MI, of register that MI defines.
Definition: ReachingDefAnalysis.cpp:331
llvm::LoopBase::isOutermost
bool isOutermost() const
Return true if the loop does not have a parent (natural) loop.
Definition: LoopInfo.h:184
llvm::RevertWhileLoopStartLR
void RevertWhileLoopStartLR(MachineInstr *MI, const TargetInstrInfo *TII, unsigned BrOpc=ARM::t2Bcc, bool UseCmp=false)
Definition: MVETailPredUtils.h:98
llvm::append_range
void append_range(Container &C, Range &&R)
Wrapper function to append a range to a container.
Definition: STLExtras.h:1823
llvm::isWhileLoopStart
static bool isWhileLoopStart(const MachineInstr &MI)
Definition: MVETailPredUtils.h:76
ARMBasicBlockInfo.h
llvm::getWhileLoopStartTargetBB
MachineBasicBlock * getWhileLoopStartTargetBB(const MachineInstr &MI)
Definition: MVETailPredUtils.h:87
llvm::MachineInstr::getParent
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:288
llvm::MachineInstrBuilder::getInstr
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Definition: MachineInstrBuilder.h:89
MRI
unsigned const MachineRegisterInfo * MRI
Definition: AArch64AdvSIMDScalarPass.cpp:105
llvm::Register
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
llvm::ReachingDefAnalysis::collectKilledOperands
void collectKilledOperands(MachineInstr *MI, InstSet &Dead) const
Assuming MI is dead, recursively search the incoming operands which are killed by MI and collect thos...
Definition: ReachingDefAnalysis.cpp:649
llvm::SetVector::clear
void clear()
Completely clear the SetVector.
Definition: SetVector.h:220
MBB
MachineBasicBlock & MBB
Definition: AArch64SLSHardening.cpp:74
ValidateMVEStore
static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML)
Definition: ARMLowOverheadLoops.cpp:1138
llvm::ReachingDefAnalysis::isSafeToDefRegAt
bool isSafeToDefRegAt(MachineInstr *MI, MCRegister PhysReg) const
Return whether a MachineInstr could be inserted at MI and safely define the given register without af...
Definition: ReachingDefAnalysis.cpp:681
Thumb2InstrInfo.h
llvm::MachineBasicBlock::sortUniqueLiveIns
void sortUniqueLiveIns()
Sorts and uniques the LiveIns vector.
Definition: MachineBasicBlock.cpp:582
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:348
llvm::MachineBasicBlock::insert
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
Definition: MachineBasicBlock.cpp:1312
MachineFrameInfo.h
ReachingDefAnalysis.h
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
llvm::SmallVectorImpl::clear
void clear()
Definition: SmallVector.h:591
llvm::recomputeVPTBlockMask
void recomputeVPTBlockMask(MachineInstr &Instr)
Definition: Thumb2InstrInfo.cpp:799
llvm::ReachingDefAnalysis::getGlobalReachingDefs
void getGlobalReachingDefs(MachineInstr *MI, MCRegister PhysReg, InstSet &Defs) const
Collect all possible definitions of the value stored in PhysReg, which is used by MI.
Definition: ReachingDefAnalysis.cpp:400
canGenerateNonZeros
static bool canGenerateNonZeros(const MachineInstr &MI)
Definition: ARMLowOverheadLoops.cpp:849
llvm::SetVector::end
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:92
ARM_LOW_OVERHEAD_LOOPS_NAME
#define ARM_LOW_OVERHEAD_LOOPS_NAME
Definition: ARMLowOverheadLoops.cpp:77
isValid
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
Definition: RustDemangle.cpp:187
llvm::ARMII::DomainMVE
@ DomainMVE
Definition: ARMBaseInfo.h:431
llvm::BuildMI
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
Definition: MachineInstrBuilder.h:328
RDA
ReachingDefAnalysis & RDA
Definition: ARMLowOverheadLoops.cpp:541
llvm::SmallPtrSetImplBase::empty
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:92
llvm::ReachingDefAnalysis::isReachingDefLiveOut
bool isReachingDefLiveOut(MachineInstr *MI, MCRegister PhysReg) const
Return whether the reaching def for MI also is live out of its parent block.
Definition: ReachingDefAnalysis.cpp:505
llvm::MachineBasicBlock::empty
bool empty() const
Definition: MachineBasicBlock.h:250
llvm::addUnpredicatedMveVpredROp
void addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB, Register DestReg)
Definition: ARMBaseInstrInfo.cpp:870
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:42
llvm::SmallPtrSetImpl
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:344
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:308
llvm::predOps
static std::array< MachineOperand, 2 > predOps(ARMCC::CondCodes Pred, unsigned PredReg=0)
Get the operands corresponding to the given Pred value.
Definition: ARMBaseInstrInfo.h:542
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
llvm::ReachingDefAnalysis::getMIOperand
MachineInstr * getMIOperand(MachineInstr *MI, unsigned Idx) const
If a single MachineInstr creates the reaching definition, for MIs operand at Idx, then return it.
Definition: ReachingDefAnalysis.cpp:458
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::cl::desc
Definition: CommandLine.h:405
llvm::SetVector
A vector that has set insertion semantics.
Definition: SetVector.h:40
llvm::SmallPtrSetImpl::contains
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:389
llvm::MachineInstrBundleIterator< MachineInstr >
llvm::pdb::PDB_SymType::Block
@ Block
retainsPreviousHalfElement
static bool retainsPreviousHalfElement(const MachineInstr &MI)
Definition: ARMLowOverheadLoops.cpp:824
llvm::ARMII::DomainMask
@ DomainMask
Definition: ARMBaseInfo.h:426
llvm::isMovRegOpcode
static bool isMovRegOpcode(int Opc)
Definition: ARMBaseInstrInfo.h:734
llvm::MCID::Terminator
@ Terminator
Definition: MCInstrDesc.h:157
llvm::MCInstrDesc::operands
iterator_range< const_opInfo_iterator > operands() const
Definition: MCInstrDesc.h:237
llvm::MachineBasicBlock::end
iterator end()
Definition: MachineBasicBlock.h:280
llvm::isSubImmOpcode
static bool isSubImmOpcode(int Opc)
Definition: ARMBaseInstrInfo.h:727
llvm::ARMBasicBlockUtils
Definition: ARMBasicBlockInfo.h:109
llvm::getTailPredVectorWidth
static unsigned getTailPredVectorWidth(unsigned Opcode)
Definition: MVETailPredUtils.h:42
SetVector.h
llvm::MachineOperand::isIdenticalTo
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Definition: MachineOperand.cpp:285
llvm::Use
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
llvm::SmallVectorImpl::emplace_back
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:927
llvm::ReachingDefAnalysis::isSafeToMoveForwards
bool isSafeToMoveForwards(MachineInstr *From, MachineInstr *To) const
Return whether From can be moved forwards to just before To.
Definition: ReachingDefAnalysis.cpp:585
llvm::MCRegister
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:24
llvm::isLoopStart
static bool isLoopStart(const MachineInstr &MI)
Definition: MVETailPredUtils.h:82
SmallSet.h
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38
llvm::SmallVectorImpl::insert
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:792
LivePhysRegs.h