LLVM  16.0.0git
ARMLowOverheadLoops.cpp
Go to the documentation of this file.
1 //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// Finalize v8.1-m low-overhead loops by converting the associated pseudo
10 /// instructions into machine operations.
11 /// The expectation is that the loop contains three pseudo instructions:
12 /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
13 /// form should be in the preheader, whereas the while form should be in the
14 /// preheaders only predecessor.
15 /// - t2LoopDec - placed within in the loop body.
16 /// - t2LoopEnd - the loop latch terminator.
17 ///
18 /// In addition to this, we also look for the presence of the VCTP instruction,
19 /// which determines whether we can generated the tail-predicated low-overhead
20 /// loop form.
21 ///
22 /// Assumptions and Dependencies:
23 /// Low-overhead loops are constructed and executed using a setup instruction:
24 /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP.
25 /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range
26 /// but fixed polarity: WLS can only branch forwards and LE can only branch
27 /// backwards. These restrictions mean that this pass is dependent upon block
28 /// layout and block sizes, which is why it's the last pass to run. The same is
29 /// true for ConstantIslands, but this pass does not increase the size of the
30 /// basic blocks, nor does it change the CFG. Instructions are mainly removed
31 /// during the transform and pseudo instructions are replaced by real ones. In
32 /// some cases, when we have to revert to a 'normal' loop, we have to introduce
33 /// multiple instructions for a single pseudo (see RevertWhile and
34 /// RevertLoopEnd). To handle this situation, t2WhileLoopStartLR and t2LoopEnd
35 /// are defined to be as large as this maximum sequence of replacement
36 /// instructions.
37 ///
38 /// A note on VPR.P0 (the lane mask):
39 /// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a
40 /// "VPT Active" context (which includes low-overhead loops and vpt blocks).
41 /// They will simply "and" the result of their calculation with the current
42 /// value of VPR.P0. You can think of it like this:
43 /// \verbatim
44 /// if VPT active: ; Between a DLSTP/LETP, or for predicated instrs
45 /// VPR.P0 &= Value
46 /// else
47 /// VPR.P0 = Value
48 /// \endverbatim
49 /// When we're inside the low-overhead loop (between DLSTP and LETP), we always
50 /// fall in the "VPT active" case, so we can consider that all VPR writes by
51 /// one of those instruction is actually a "and".
52 //===----------------------------------------------------------------------===//
53 
54 #include "ARM.h"
55 #include "ARMBaseInstrInfo.h"
56 #include "ARMBaseRegisterInfo.h"
57 #include "ARMBasicBlockInfo.h"
58 #include "ARMSubtarget.h"
59 #include "MVETailPredUtils.h"
60 #include "Thumb2InstrInfo.h"
61 #include "llvm/ADT/SetOperations.h"
62 #include "llvm/ADT/SetVector.h"
63 #include "llvm/ADT/SmallSet.h"
70 #include "llvm/CodeGen/Passes.h"
72 #include "llvm/MC/MCInstrDesc.h"
73 
74 using namespace llvm;
75 
76 #define DEBUG_TYPE "arm-low-overhead-loops"
77 #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"
78 
79 static cl::opt<bool>
80 DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden,
81  cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass"),
82  cl::init(false));
83 
84 static cl::opt<bool>
85  DisableOmitDLS("arm-disable-omit-dls", cl::Hidden,
86  cl::desc("Disable omitting 'dls lr, lr' instructions"),
87  cl::init(false));
88 
91  return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR;
92 }
93 
95  return MI->findRegisterDefOperandIdx(ARM::VPR) != -1;
96 }
97 
98 static bool hasVPRUse(MachineInstr &MI) {
99  return MI.findRegisterUseOperandIdx(ARM::VPR) != -1;
100 }
101 
102 static bool isDomainMVE(MachineInstr *MI) {
103  uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask;
104  return Domain == ARMII::DomainMVE;
105 }
106 
107 static int getVecSize(const MachineInstr &MI) {
108  const MCInstrDesc &MCID = MI.getDesc();
109  uint64_t Flags = MCID.TSFlags;
110  return (Flags & ARMII::VecSize) >> ARMII::VecSizeShift;
111 }
112 
114  if (MI.isDebugInstr())
115  return false;
116  return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
117 }
118 
119 namespace {
120 
121  using InstSet = SmallPtrSetImpl<MachineInstr *>;
122 
123  class PostOrderLoopTraversal {
124  MachineLoop &ML;
125  MachineLoopInfo &MLI;
128 
129  public:
130  PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI)
131  : ML(ML), MLI(MLI) { }
132 
133  const SmallVectorImpl<MachineBasicBlock*> &getOrder() const {
134  return Order;
135  }
136 
137  // Visit all the blocks within the loop, as well as exit blocks and any
138  // blocks properly dominating the header.
139  void ProcessLoop() {
140  std::function<void(MachineBasicBlock*)> Search = [this, &Search]
141  (MachineBasicBlock *MBB) -> void {
142  if (Visited.count(MBB))
143  return;
144 
145  Visited.insert(MBB);
146  for (auto *Succ : MBB->successors()) {
147  if (!ML.contains(Succ))
148  continue;
149  Search(Succ);
150  }
151  Order.push_back(MBB);
152  };
153 
154  // Insert exit blocks.
156  ML.getExitBlocks(ExitBlocks);
157  append_range(Order, ExitBlocks);
158 
159  // Then add the loop body.
160  Search(ML.getHeader());
161 
162  // Then try the preheader and its predecessors.
163  std::function<void(MachineBasicBlock*)> GetPredecessor =
164  [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void {
165  Order.push_back(MBB);
166  if (MBB->pred_size() == 1)
167  GetPredecessor(*MBB->pred_begin());
168  };
169 
170  if (auto *Preheader = ML.getLoopPreheader())
171  GetPredecessor(Preheader);
172  else if (auto *Preheader = MLI.findLoopPreheader(&ML, true, true))
173  GetPredecessor(Preheader);
174  }
175  };
176 
177  struct PredicatedMI {
178  MachineInstr *MI = nullptr;
179  SetVector<MachineInstr*> Predicates;
180 
181  public:
182  PredicatedMI(MachineInstr *I, SetVector<MachineInstr *> &Preds) : MI(I) {
183  assert(I && "Instruction must not be null!");
184  Predicates.insert(Preds.begin(), Preds.end());
185  }
186  };
187 
188  // Represent the current state of the VPR and hold all instances which
189  // represent a VPT block, which is a list of instructions that begins with a
190  // VPT/VPST and has a maximum of four proceeding instructions. All
191  // instructions within the block are predicated upon the vpr and we allow
192  // instructions to define the vpr within in the block too.
193  class VPTState {
194  friend struct LowOverheadLoop;
195 
197 
198  static SmallVector<VPTState, 4> Blocks;
199  static SetVector<MachineInstr *> CurrentPredicates;
200  static std::map<MachineInstr *,
201  std::unique_ptr<PredicatedMI>> PredicatedInsts;
202 
203  static void CreateVPTBlock(MachineInstr *MI) {
204  assert((CurrentPredicates.size() || MI->getParent()->isLiveIn(ARM::VPR))
205  && "Can't begin VPT without predicate");
206  Blocks.emplace_back(MI);
207  // The execution of MI is predicated upon the current set of instructions
208  // that are AND'ed together to form the VPR predicate value. In the case
209  // that MI is a VPT, CurrentPredicates will also just be MI.
210  PredicatedInsts.emplace(
211  MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
212  }
213 
214  static void reset() {
215  Blocks.clear();
216  PredicatedInsts.clear();
217  CurrentPredicates.clear();
218  }
219 
220  static void addInst(MachineInstr *MI) {
221  Blocks.back().insert(MI);
222  PredicatedInsts.emplace(
223  MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
224  }
225 
226  static void addPredicate(MachineInstr *MI) {
227  LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI);
228  CurrentPredicates.insert(MI);
229  }
230 
231  static void resetPredicate(MachineInstr *MI) {
232  LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI);
233  CurrentPredicates.clear();
234  CurrentPredicates.insert(MI);
235  }
236 
237  public:
238  // Have we found an instruction within the block which defines the vpr? If
239  // so, not all the instructions in the block will have the same predicate.
240  static bool hasUniformPredicate(VPTState &Block) {
241  return getDivergent(Block) == nullptr;
242  }
243 
244  // If it exists, return the first internal instruction which modifies the
245  // VPR.
246  static MachineInstr *getDivergent(VPTState &Block) {
247  SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
248  for (unsigned i = 1; i < Insts.size(); ++i) {
249  MachineInstr *Next = Insts[i];
250  if (isVectorPredicate(Next))
251  return Next; // Found an instruction altering the vpr.
252  }
253  return nullptr;
254  }
255 
256  // Return whether the given instruction is predicated upon a VCTP.
257  static bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) {
258  SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI]->Predicates;
259  if (Exclusive && Predicates.size() != 1)
260  return false;
261  return llvm::any_of(Predicates, isVCTP);
262  }
263 
264  // Is the VPST, controlling the block entry, predicated upon a VCTP.
265  static bool isEntryPredicatedOnVCTP(VPTState &Block,
266  bool Exclusive = false) {
267  SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
268  return isPredicatedOnVCTP(Insts.front(), Exclusive);
269  }
270 
271  // If this block begins with a VPT, we can check whether it's using
272  // at least one predicated input(s), as well as possible loop invariant
273  // which would result in it being implicitly predicated.
274  static bool hasImplicitlyValidVPT(VPTState &Block,
276  SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
277  MachineInstr *VPT = Insts.front();
278  assert(isVPTOpcode(VPT->getOpcode()) &&
279  "Expected VPT block to begin with VPT/VPST");
280 
281  if (VPT->getOpcode() == ARM::MVE_VPST)
282  return false;
283 
284  auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) {
285  MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx));
286  return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op);
287  };
288 
289  auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) {
290  MachineOperand &MO = MI->getOperand(Idx);
291  if (!MO.isReg() || !MO.getReg())
292  return true;
293 
295  RDA.getGlobalReachingDefs(MI, MO.getReg(), Defs);
296  if (Defs.empty())
297  return true;
298 
299  for (auto *Def : Defs)
300  if (Def->getParent() == VPT->getParent())
301  return false;
302  return true;
303  };
304 
305  // Check that at least one of the operands is directly predicated on a
306  // vctp and allow an invariant value too.
307  return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) &&
308  (IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) &&
309  (IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2));
310  }
311 
312  static bool isValid(ReachingDefAnalysis &RDA) {
313  // All predication within the loop should be based on vctp. If the block
314  // isn't predicated on entry, check whether the vctp is within the block
315  // and that all other instructions are then predicated on it.
316  for (auto &Block : Blocks) {
317  if (isEntryPredicatedOnVCTP(Block, false) ||
318  hasImplicitlyValidVPT(Block, RDA))
319  continue;
320 
321  SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
322  // We don't know how to convert a block with just a VPT;VCTP into
323  // anything valid once we remove the VCTP. For now just bail out.
324  assert(isVPTOpcode(Insts.front()->getOpcode()) &&
325  "Expected VPT block to start with a VPST or VPT!");
326  if (Insts.size() == 2 && Insts.front()->getOpcode() != ARM::MVE_VPST &&
327  isVCTP(Insts.back()))
328  return false;
329 
330  for (auto *MI : Insts) {
331  // Check that any internal VCTPs are 'Then' predicated.
333  return false;
334  // Skip other instructions that build up the predicate.
335  if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI))
336  continue;
337  // Check that any other instructions are predicated upon a vctp.
338  // TODO: We could infer when VPTs are implicitly predicated on the
339  // vctp (when the operands are predicated).
340  if (!isPredicatedOnVCTP(MI)) {
341  LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI);
342  return false;
343  }
344  }
345  }
346  return true;
347  }
348 
349  VPTState(MachineInstr *MI) { Insts.push_back(MI); }
350 
351  void insert(MachineInstr *MI) {
352  Insts.push_back(MI);
353  // VPT/VPST + 4 predicated instructions.
354  assert(Insts.size() <= 5 && "Too many instructions in VPT block!");
355  }
356 
357  bool containsVCTP() const {
358  return llvm::any_of(Insts, isVCTP);
359  }
360 
361  unsigned size() const { return Insts.size(); }
362  SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; }
363  };
364 
365  struct LowOverheadLoop {
366 
367  MachineLoop &ML;
368  MachineBasicBlock *Preheader = nullptr;
369  MachineLoopInfo &MLI;
371  const TargetRegisterInfo &TRI;
372  const ARMBaseInstrInfo &TII;
373  MachineFunction *MF = nullptr;
374  MachineBasicBlock::iterator StartInsertPt;
375  MachineBasicBlock *StartInsertBB = nullptr;
376  MachineInstr *Start = nullptr;
377  MachineInstr *Dec = nullptr;
378  MachineInstr *End = nullptr;
379  MachineOperand TPNumElements;
382  SmallPtrSet<MachineInstr *, 4> BlockMasksToRecompute;
383  SmallPtrSet<MachineInstr *, 4> DoubleWidthResultInstrs;
385  bool Revert = false;
386  bool CannotTailPredicate = false;
387 
388  LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI,
390  const ARMBaseInstrInfo &TII)
391  : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII),
392  TPNumElements(MachineOperand::CreateImm(0)) {
393  MF = ML.getHeader()->getParent();
394  if (auto *MBB = ML.getLoopPreheader())
395  Preheader = MBB;
396  else if (auto *MBB = MLI.findLoopPreheader(&ML, true, true))
397  Preheader = MBB;
398  VPTState::reset();
399  }
400 
401  // If this is an MVE instruction, check that we know how to use tail
402  // predication with it. Record VPT blocks and return whether the
403  // instruction is valid for tail predication.
404  bool ValidateMVEInst(MachineInstr *MI);
405 
406  void AnalyseMVEInst(MachineInstr *MI) {
407  CannotTailPredicate = !ValidateMVEInst(MI);
408  }
409 
410  bool IsTailPredicationLegal() const {
411  // For now, let's keep things really simple and only support a single
412  // block for tail predication.
413  return !Revert && FoundAllComponents() && !VCTPs.empty() &&
414  !CannotTailPredicate && ML.getNumBlocks() == 1;
415  }
416 
417  // Given that MI is a VCTP, check that is equivalent to any other VCTPs
418  // found.
419  bool AddVCTP(MachineInstr *MI);
420 
421  // Check that the predication in the loop will be equivalent once we
422  // perform the conversion. Also ensure that we can provide the number
423  // of elements to the loop start instruction.
424  bool ValidateTailPredicate();
425 
426  // Check that any values available outside of the loop will be the same
427  // after tail predication conversion.
428  bool ValidateLiveOuts();
429 
430  // Is it safe to define LR with DLS/WLS?
431  // LR can be defined if it is the operand to start, because it's the same
432  // value, or if it's going to be equivalent to the operand to Start.
433  MachineInstr *isSafeToDefineLR();
434 
435  // Check the branch targets are within range and we satisfy our
436  // restrictions.
437  void Validate(ARMBasicBlockUtils *BBUtils);
438 
439  bool FoundAllComponents() const {
440  return Start && Dec && End;
441  }
442 
443  SmallVectorImpl<VPTState> &getVPTBlocks() {
444  return VPTState::Blocks;
445  }
446 
447  // Return the operand for the loop start instruction. This will be the loop
448  // iteration count, or the number of elements if we're tail predicating.
449  MachineOperand &getLoopStartOperand() {
450  if (IsTailPredicationLegal())
451  return TPNumElements;
452  return Start->getOperand(1);
453  }
454 
455  unsigned getStartOpcode() const {
456  bool IsDo = isDoLoopStart(*Start);
457  if (!IsTailPredicationLegal())
458  return IsDo ? ARM::t2DLS : ARM::t2WLS;
459 
460  return VCTPOpcodeToLSTP(VCTPs.back()->getOpcode(), IsDo);
461  }
462 
463  void dump() const {
464  if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
465  if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
466  if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
467  if (!VCTPs.empty()) {
468  dbgs() << "ARM Loops: Found VCTP(s):\n";
469  for (auto *MI : VCTPs)
470  dbgs() << " - " << *MI;
471  }
472  if (!FoundAllComponents())
473  dbgs() << "ARM Loops: Not a low-overhead loop.\n";
474  else if (!(Start && Dec && End))
475  dbgs() << "ARM Loops: Failed to find all loop components.\n";
476  }
477  };
478 
479  class ARMLowOverheadLoops : public MachineFunctionPass {
480  MachineFunction *MF = nullptr;
481  MachineLoopInfo *MLI = nullptr;
482  ReachingDefAnalysis *RDA = nullptr;
483  const ARMBaseInstrInfo *TII = nullptr;
484  MachineRegisterInfo *MRI = nullptr;
485  const TargetRegisterInfo *TRI = nullptr;
486  std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
487 
488  public:
489  static char ID;
490 
491  ARMLowOverheadLoops() : MachineFunctionPass(ID) { }
492 
493  void getAnalysisUsage(AnalysisUsage &AU) const override {
494  AU.setPreservesCFG();
498  }
499 
500  bool runOnMachineFunction(MachineFunction &MF) override;
501 
502  MachineFunctionProperties getRequiredProperties() const override {
506  }
507 
508  StringRef getPassName() const override {
510  }
511 
512  private:
513  bool ProcessLoop(MachineLoop *ML);
514 
515  bool RevertNonLoops();
516 
517  void RevertWhile(MachineInstr *MI) const;
518  void RevertDo(MachineInstr *MI) const;
519 
520  bool RevertLoopDec(MachineInstr *MI) const;
521 
522  void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
523 
524  void RevertLoopEndDec(MachineInstr *MI) const;
525 
526  void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
527 
528  MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
529 
530  void Expand(LowOverheadLoop &LoLoop);
531 
532  void IterationCountDCE(LowOverheadLoop &LoLoop);
533  };
534 }
535 
536 char ARMLowOverheadLoops::ID = 0;
537 
538 SmallVector<VPTState, 4> VPTState::Blocks;
539 SetVector<MachineInstr *> VPTState::CurrentPredicates;
540 std::map<MachineInstr *,
541  std::unique_ptr<PredicatedMI>> VPTState::PredicatedInsts;
542 
544  false, false)
545 
546 static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA,
547  InstSet &ToRemove, InstSet &Ignore) {
548 
549  // Check that we can remove all of Killed without having to modify any IT
550  // blocks.
551  auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) {
552  // Collect the dead code and the MBBs in which they reside.
554  for (auto *Dead : Killed)
555  BasicBlocks.insert(Dead->getParent());
556 
557  // Collect IT blocks in all affected basic blocks.
558  std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks;
559  for (auto *MBB : BasicBlocks) {
560  for (auto &IT : *MBB) {
561  if (IT.getOpcode() != ARM::t2IT)
562  continue;
564  ITBlocks[&IT]);
565  }
566  }
567 
568  // If we're removing all of the instructions within an IT block, then
569  // also remove the IT instruction.
570  SmallPtrSet<MachineInstr *, 2> ModifiedITs;
572  for (auto *Dead : Killed) {
573  if (MachineOperand *MO = Dead->findRegisterUseOperand(ARM::ITSTATE)) {
575  RemoveITs.insert(IT);
576  auto &CurrentBlock = ITBlocks[IT];
577  CurrentBlock.erase(Dead);
578  if (CurrentBlock.empty())
579  ModifiedITs.erase(IT);
580  else
581  ModifiedITs.insert(IT);
582  }
583  }
584  if (!ModifiedITs.empty())
585  return false;
586  Killed.insert(RemoveITs.begin(), RemoveITs.end());
587  return true;
588  };
589 
591  if (!RDA.isSafeToRemove(MI, Uses, Ignore))
592  return false;
593 
594  if (WontCorruptITs(Uses, RDA)) {
595  ToRemove.insert(Uses.begin(), Uses.end());
596  LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI
597  << " - can also remove:\n";
598  for (auto *Use : Uses)
599  dbgs() << " - " << *Use);
600 
602  RDA.collectKilledOperands(MI, Killed);
603  if (WontCorruptITs(Killed, RDA)) {
604  ToRemove.insert(Killed.begin(), Killed.end());
605  LLVM_DEBUG(for (auto *Dead : Killed)
606  dbgs() << " - " << *Dead);
607  }
608  return true;
609  }
610  return false;
611 }
612 
613 bool LowOverheadLoop::ValidateTailPredicate() {
614  if (!IsTailPredicationLegal()) {
615  LLVM_DEBUG(if (VCTPs.empty())
616  dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
617  dbgs() << "ARM Loops: Tail-predication is not valid.\n");
618  return false;
619  }
620 
621  assert(!VCTPs.empty() && "VCTP instruction expected but is not set");
622  assert(ML.getBlocks().size() == 1 &&
623  "Shouldn't be processing a loop with more than one block");
624 
626  LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n");
627  return false;
628  }
629 
630  if (!VPTState::isValid(RDA)) {
631  LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n");
632  return false;
633  }
634 
635  if (!ValidateLiveOuts()) {
636  LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n");
637  return false;
638  }
639 
640  // For tail predication, we need to provide the number of elements, instead
641  // of the iteration count, to the loop start instruction. The number of
642  // elements is provided to the vctp instruction, so we need to check that
643  // we can use this register at InsertPt.
644  MachineInstr *VCTP = VCTPs.back();
645  if (Start->getOpcode() == ARM::t2DoLoopStartTP ||
646  Start->getOpcode() == ARM::t2WhileLoopStartTP) {
647  TPNumElements = Start->getOperand(2);
648  StartInsertPt = Start;
649  StartInsertBB = Start->getParent();
650  } else {
651  TPNumElements = VCTP->getOperand(1);
652  MCRegister NumElements = TPNumElements.getReg().asMCReg();
653 
654  // If the register is defined within loop, then we can't perform TP.
655  // TODO: Check whether this is just a mov of a register that would be
656  // available.
657  if (RDA.hasLocalDefBefore(VCTP, NumElements)) {
658  LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
659  return false;
660  }
661 
662  // The element count register maybe defined after InsertPt, in which case we
663  // need to try to move either InsertPt or the def so that the [w|d]lstp can
664  // use the value.
665 
666  if (StartInsertPt != StartInsertBB->end() &&
667  !RDA.isReachingDefLiveOut(&*StartInsertPt, NumElements)) {
668  if (auto *ElemDef =
669  RDA.getLocalLiveOutMIDef(StartInsertBB, NumElements)) {
670  if (RDA.isSafeToMoveForwards(ElemDef, &*StartInsertPt)) {
671  ElemDef->removeFromParent();
672  StartInsertBB->insert(StartInsertPt, ElemDef);
673  LLVM_DEBUG(dbgs()
674  << "ARM Loops: Moved element count def: " << *ElemDef);
675  } else if (RDA.isSafeToMoveBackwards(&*StartInsertPt, ElemDef)) {
676  StartInsertPt->removeFromParent();
677  StartInsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef),
678  &*StartInsertPt);
679  LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
680  } else {
681  // If we fail to move an instruction and the element count is provided
682  // by a mov, use the mov operand if it will have the same value at the
683  // insertion point
684  MachineOperand Operand = ElemDef->getOperand(1);
685  if (isMovRegOpcode(ElemDef->getOpcode()) &&
686  RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg().asMCReg()) ==
687  RDA.getUniqueReachingMIDef(&*StartInsertPt,
688  Operand.getReg().asMCReg())) {
689  TPNumElements = Operand;
690  NumElements = TPNumElements.getReg();
691  } else {
692  LLVM_DEBUG(dbgs()
693  << "ARM Loops: Unable to move element count to loop "
694  << "start instruction.\n");
695  return false;
696  }
697  }
698  }
699  }
700 
701  // Especially in the case of while loops, InsertBB may not be the
702  // preheader, so we need to check that the register isn't redefined
703  // before entering the loop.
704  auto CannotProvideElements = [this](MachineBasicBlock *MBB,
705  MCRegister NumElements) {
706  if (MBB->empty())
707  return false;
708  // NumElements is redefined in this block.
709  if (RDA.hasLocalDefBefore(&MBB->back(), NumElements))
710  return true;
711 
712  // Don't continue searching up through multiple predecessors.
713  if (MBB->pred_size() > 1)
714  return true;
715 
716  return false;
717  };
718 
719  // Search backwards for a def, until we get to InsertBB.
720  MachineBasicBlock *MBB = Preheader;
721  while (MBB && MBB != StartInsertBB) {
722  if (CannotProvideElements(MBB, NumElements)) {
723  LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n");
724  return false;
725  }
726  MBB = *MBB->pred_begin();
727  }
728  }
729 
730  // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect
731  // world the [w|d]lstp instruction would be last instruction in the preheader
732  // and so it would only affect instructions within the loop body. But due to
733  // scheduling, and/or the logic in this pass (above), the insertion point can
734  // be moved earlier. So if the Loop Start isn't the last instruction in the
735  // preheader, and if the initial element count is smaller than the vector
736  // width, the Loop Start instruction will immediately generate one or more
737  // false lane mask which can, incorrectly, affect the proceeding MVE
738  // instructions in the preheader.
739  if (std::any_of(StartInsertPt, StartInsertBB->end(), shouldInspect)) {
740  LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP\n");
741  return false;
742  }
743 
744  // For any DoubleWidthResultInstrs we found whilst scanning instructions, they
745  // need to compute an output size that is smaller than the VCTP mask operates
746  // on. The VecSize of the DoubleWidthResult is the larger vector size - the
747  // size it extends into, so any VCTP VecSize <= is valid.
748  unsigned VCTPVecSize = getVecSize(*VCTP);
749  for (MachineInstr *MI : DoubleWidthResultInstrs) {
750  unsigned InstrVecSize = getVecSize(*MI);
751  if (InstrVecSize > VCTPVecSize) {
752  LLVM_DEBUG(dbgs() << "ARM Loops: Double width result larger than VCTP "
753  << "VecSize:\n" << *MI);
754  return false;
755  }
756  }
757 
758  // Check that the value change of the element count is what we expect and
759  // that the predication will be equivalent. For this we need:
760  // NumElements = NumElements - VectorWidth. The sub will be a sub immediate
761  // and we can also allow register copies within the chain too.
762  auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) {
763  return -getAddSubImmediate(*MI) == ExpectedVecWidth;
764  };
765 
766  MachineBasicBlock *MBB = VCTP->getParent();
767  // Remove modifications to the element count since they have no purpose in a
768  // tail predicated loop. Explicitly refer to the vctp operand no matter which
769  // register NumElements has been assigned to, since that is what the
770  // modifications will be using
771  if (auto *Def = RDA.getUniqueReachingMIDef(
772  &MBB->back(), VCTP->getOperand(1).getReg().asMCReg())) {
773  SmallPtrSet<MachineInstr*, 2> ElementChain;
775  unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());
776 
777  Ignore.insert(VCTPs.begin(), VCTPs.end());
778 
779  if (TryRemove(Def, RDA, ElementChain, Ignore)) {
780  bool FoundSub = false;
781 
782  for (auto *MI : ElementChain) {
783  if (isMovRegOpcode(MI->getOpcode()))
784  continue;
785 
786  if (isSubImmOpcode(MI->getOpcode())) {
787  if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) {
788  LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
789  " count: " << *MI);
790  return false;
791  }
792  FoundSub = true;
793  } else {
794  LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
795  " count: " << *MI);
796  return false;
797  }
798  }
799  ToRemove.insert(ElementChain.begin(), ElementChain.end());
800  }
801  }
802 
803  // If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we
804  // can also remove any extra instructions in the preheader, which often
805  // includes a now unused MOV.
806  if ((Start->getOpcode() == ARM::t2DoLoopStartTP ||
807  Start->getOpcode() == ARM::t2WhileLoopStartTP) &&
808  Preheader && !Preheader->empty() &&
809  !RDA.hasLocalDefBefore(VCTP, VCTP->getOperand(1).getReg())) {
810  if (auto *Def = RDA.getUniqueReachingMIDef(
811  &Preheader->back(), VCTP->getOperand(1).getReg().asMCReg())) {
813  Ignore.insert(VCTPs.begin(), VCTPs.end());
814  TryRemove(Def, RDA, ToRemove, Ignore);
815  }
816  }
817 
818  return true;
819 }
820 
821 static bool isRegInClass(const MachineOperand &MO,
822  const TargetRegisterClass *Class) {
823  return MO.isReg() && MO.getReg() && Class->contains(MO.getReg());
824 }
825 
826 // MVE 'narrowing' operate on half a lane, reading from half and writing
827 // to half, which are referred to has the top and bottom half. The other
828 // half retains its previous value.
830  const MCInstrDesc &MCID = MI.getDesc();
831  uint64_t Flags = MCID.TSFlags;
832  return (Flags & ARMII::RetainsPreviousHalfElement) != 0;
833 }
834 
835 // Some MVE instructions read from the top/bottom halves of their operand(s)
836 // and generate a vector result with result elements that are double the
837 // width of the input.
839  const MCInstrDesc &MCID = MI.getDesc();
840  uint64_t Flags = MCID.TSFlags;
841  return (Flags & ARMII::DoubleWidthResult) != 0;
842 }
843 
844 static bool isHorizontalReduction(const MachineInstr &MI) {
845  const MCInstrDesc &MCID = MI.getDesc();
846  uint64_t Flags = MCID.TSFlags;
847  return (Flags & ARMII::HorizontalReduction) != 0;
848 }
849 
850 // Can this instruction generate a non-zero result when given only zeroed
851 // operands? This allows us to know that, given operands with false bytes
852 // zeroed by masked loads, that the result will also contain zeros in those
853 // bytes.
854 static bool canGenerateNonZeros(const MachineInstr &MI) {
855 
856  // Check for instructions which can write into a larger element size,
857  // possibly writing into a previous zero'd lane.
859  return true;
860 
861  switch (MI.getOpcode()) {
862  default:
863  break;
864  // FIXME: VNEG FP and -0? I think we'll need to handle this once we allow
865  // fp16 -> fp32 vector conversions.
866  // Instructions that perform a NOT will generate 1s from 0s.
867  case ARM::MVE_VMVN:
868  case ARM::MVE_VORN:
869  // Count leading zeros will do just that!
870  case ARM::MVE_VCLZs8:
871  case ARM::MVE_VCLZs16:
872  case ARM::MVE_VCLZs32:
873  return true;
874  }
875  return false;
876 }
877 
878 // Look at its register uses to see if it only can only receive zeros
879 // into its false lanes which would then produce zeros. Also check that
880 // the output register is also defined by an FalseLanesZero instruction
881 // so that if tail-predication happens, the lanes that aren't updated will
882 // still be zeros.
884  const TargetRegisterClass *QPRs,
885  const ReachingDefAnalysis &RDA,
886  InstSet &FalseLanesZero) {
887  if (canGenerateNonZeros(MI))
888  return false;
889 
891  // Predicated loads will write zeros to the falsely predicated bytes of the
892  // destination register.
893  if (MI.mayLoad())
894  return isPredicated;
895 
896  auto IsZeroInit = [](MachineInstr *Def) {
897  return !isVectorPredicated(Def) &&
898  Def->getOpcode() == ARM::MVE_VMOVimmi32 &&
899  Def->getOperand(1).getImm() == 0;
900  };
901 
902  bool AllowScalars = isHorizontalReduction(MI);
903  for (auto &MO : MI.operands()) {
904  if (!MO.isReg() || !MO.getReg())
905  continue;
906  if (!isRegInClass(MO, QPRs) && AllowScalars)
907  continue;
908  // Skip the lr predicate reg
910  if (PIdx != -1 && (int)MI.getOperandNo(&MO) == PIdx + 2)
911  continue;
912 
913  // Check that this instruction will produce zeros in its false lanes:
914  // - If it only consumes false lanes zero or constant 0 (vmov #0)
915  // - If it's predicated, it only matters that it's def register already has
916  // false lane zeros, so we can ignore the uses.
918  RDA.getGlobalReachingDefs(&MI, MO.getReg(), Defs);
919  for (auto *Def : Defs) {
920  if (Def == &MI || FalseLanesZero.count(Def) || IsZeroInit(Def))
921  continue;
922  if (MO.isUse() && isPredicated)
923  continue;
924  return false;
925  }
926  }
927  LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
928  return true;
929 }
930 
931 bool LowOverheadLoop::ValidateLiveOuts() {
932  // We want to find out if the tail-predicated version of this loop will
933  // produce the same values as the loop in its original form. For this to
934  // be true, the newly inserted implicit predication must not change the
935  // the (observable) results.
936  // We're doing this because many instructions in the loop will not be
937  // predicated and so the conversion from VPT predication to tail-predication
938  // can result in different values being produced; due to the tail-predication
939  // preventing many instructions from updating their falsely predicated
940  // lanes. This analysis assumes that all the instructions perform lane-wise
941  // operations and don't perform any exchanges.
942  // A masked load, whether through VPT or tail predication, will write zeros
943  // to any of the falsely predicated bytes. So, from the loads, we know that
944  // the false lanes are zeroed and here we're trying to track that those false
945  // lanes remain zero, or where they change, the differences are masked away
946  // by their user(s).
947  // All MVE stores have to be predicated, so we know that any predicate load
948  // operands, or stored results are equivalent already. Other explicitly
949  // predicated instructions will perform the same operation in the original
950  // loop and the tail-predicated form too. Because of this, we can insert
951  // loads, stores and other predicated instructions into our Predicated
952  // set and build from there.
953  const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID);
954  SetVector<MachineInstr *> FalseLanesUnknown;
957  MachineBasicBlock *Header = ML.getHeader();
958 
959  LLVM_DEBUG(dbgs() << "ARM Loops: Validating Live outs\n");
960 
961  for (auto &MI : *Header) {
962  if (!shouldInspect(MI))
963  continue;
964 
965  if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode()))
966  continue;
967 
969  bool retainsOrReduces =
971 
972  if (isPredicated)
973  Predicated.insert(&MI);
975  FalseLanesZero.insert(&MI);
976  else if (MI.getNumDefs() == 0)
977  continue;
978  else if (!isPredicated && retainsOrReduces) {
979  LLVM_DEBUG(dbgs() << " Unpredicated instruction that retainsOrReduces: " << MI);
980  return false;
981  } else if (!isPredicated && MI.getOpcode() != ARM::MQPRCopy)
982  FalseLanesUnknown.insert(&MI);
983  }
984 
985  LLVM_DEBUG({
986  dbgs() << " Predicated:\n";
987  for (auto *I : Predicated)
988  dbgs() << " " << *I;
989  dbgs() << " FalseLanesZero:\n";
990  for (auto *I : FalseLanesZero)
991  dbgs() << " " << *I;
992  dbgs() << " FalseLanesUnknown:\n";
993  for (auto *I : FalseLanesUnknown)
994  dbgs() << " " << *I;
995  });
996 
997  auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO,
998  SmallPtrSetImpl<MachineInstr *> &Predicated) {
1000  RDA.getGlobalUses(MI, MO.getReg().asMCReg(), Uses);
1001  for (auto *Use : Uses) {
1002  if (Use != MI && !Predicated.count(Use))
1003  return false;
1004  }
1005  return true;
1006  };
1007 
1008  // Visit the unknowns in reverse so that we can start at the values being
1009  // stored and then we can work towards the leaves, hopefully adding more
1010  // instructions to Predicated. Successfully terminating the loop means that
1011  // all the unknown values have to found to be masked by predicated user(s).
1012  // For any unpredicated values, we store them in NonPredicated so that we
1013  // can later check whether these form a reduction.
1014  SmallPtrSet<MachineInstr*, 2> NonPredicated;
1015  for (auto *MI : reverse(FalseLanesUnknown)) {
1016  for (auto &MO : MI->operands()) {
1017  if (!isRegInClass(MO, QPRs) || !MO.isDef())
1018  continue;
1019  if (!HasPredicatedUsers(MI, MO, Predicated)) {
1020  LLVM_DEBUG(dbgs() << " Found an unknown def of : "
1021  << TRI.getRegAsmName(MO.getReg()) << " at " << *MI);
1022  NonPredicated.insert(MI);
1023  break;
1024  }
1025  }
1026  // Any unknown false lanes have been masked away by the user(s).
1027  if (!NonPredicated.contains(MI))
1028  Predicated.insert(MI);
1029  }
1030 
1031  SmallPtrSet<MachineInstr *, 2> LiveOutMIs;
1033  ML.getExitBlocks(ExitBlocks);
1034  assert(ML.getNumBlocks() == 1 && "Expected single block loop!");
1035  assert(ExitBlocks.size() == 1 && "Expected a single exit block");
1036  MachineBasicBlock *ExitBB = ExitBlocks.front();
1037  for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) {
1038  // TODO: Instead of blocking predication, we could move the vctp to the exit
1039  // block and calculate it's operand there in or the preheader.
1040  if (RegMask.PhysReg == ARM::VPR) {
1041  LLVM_DEBUG(dbgs() << " VPR is live in to the exit block.");
1042  return false;
1043  }
1044  // Check Q-regs that are live in the exit blocks. We don't collect scalars
1045  // because they won't be affected by lane predication.
1046  if (QPRs->contains(RegMask.PhysReg))
1047  if (auto *MI = RDA.getLocalLiveOutMIDef(Header, RegMask.PhysReg))
1048  LiveOutMIs.insert(MI);
1049  }
1050 
1051  // We've already validated that any VPT predication within the loop will be
1052  // equivalent when we perform the predication transformation; so we know that
1053  // any VPT predicated instruction is predicated upon VCTP. Any live-out
1054  // instruction needs to be predicated, so check this here. The instructions
1055  // in NonPredicated have been found to be a reduction that we can ensure its
1056  // legality. Any MQPRCopy found will need to validate its input as if it was
1057  // live out.
1058  SmallVector<MachineInstr *> Worklist(LiveOutMIs.begin(), LiveOutMIs.end());
1059  while (!Worklist.empty()) {
1060  MachineInstr *MI = Worklist.pop_back_val();
1061  if (MI->getOpcode() == ARM::MQPRCopy) {
1062  VMOVCopies.insert(MI);
1063  MachineInstr *CopySrc =
1064  RDA.getUniqueReachingMIDef(MI, MI->getOperand(1).getReg());
1065  if (CopySrc)
1066  Worklist.push_back(CopySrc);
1067  } else if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) {
1068  LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI);
1069  VMOVCopies.clear();
1070  return false;
1071  }
1072  }
1073 
1074  return true;
1075 }
1076 
1077 void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
1078  if (Revert)
1079  return;
1080 
1081  // Check branch target ranges: WLS[TP] can only branch forwards and LE[TP]
1082  // can only jump back.
1083  auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End,
1084  ARMBasicBlockUtils *BBUtils, MachineLoop &ML) {
1085  MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd
1086  ? End->getOperand(1).getMBB()
1087  : End->getOperand(2).getMBB();
1088  // TODO Maybe there's cases where the target doesn't have to be the header,
1089  // but for now be safe and revert.
1090  if (TgtBB != ML.getHeader()) {
1091  LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n");
1092  return false;
1093  }
1094 
1095  // The WLS and LE instructions have 12-bits for the label offset. WLS
1096  // requires a positive offset, while LE uses negative.
1097  if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) ||
1098  !BBUtils->isBBInRange(End, ML.getHeader(), 4094)) {
1099  LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
1100  return false;
1101  }
1102 
1103  if (isWhileLoopStart(*Start)) {
1104  MachineBasicBlock *TargetBB = getWhileLoopStartTargetBB(*Start);
1105  if (BBUtils->getOffsetOf(Start) > BBUtils->getOffsetOf(TargetBB) ||
1106  !BBUtils->isBBInRange(Start, TargetBB, 4094)) {
1107  LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
1108  return false;
1109  }
1110  }
1111  return true;
1112  };
1113 
1114  StartInsertPt = MachineBasicBlock::iterator(Start);
1115  StartInsertBB = Start->getParent();
1116  LLVM_DEBUG(dbgs() << "ARM Loops: Will insert LoopStart at "
1117  << *StartInsertPt);
1118 
1119  Revert = !ValidateRanges(Start, End, BBUtils, ML);
1120  CannotTailPredicate = !ValidateTailPredicate();
1121 }
1122 
1123 bool LowOverheadLoop::AddVCTP(MachineInstr *MI) {
1124  LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI);
1125  if (VCTPs.empty()) {
1126  VCTPs.push_back(MI);
1127  return true;
1128  }
1129 
1130  // If we find another VCTP, check whether it uses the same value as the main VCTP.
1131  // If it does, store it in the VCTPs set, else refuse it.
1132  MachineInstr *Prev = VCTPs.back();
1133  if (!Prev->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
1134  !RDA.hasSameReachingDef(Prev, MI, MI->getOperand(1).getReg().asMCReg())) {
1135  LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
1136  "definition from the main VCTP");
1137  return false;
1138  }
1139  VCTPs.push_back(MI);
1140  return true;
1141 }
1142 
1144 
1145  auto GetFrameIndex = [](MachineMemOperand *Operand) {
1146  const PseudoSourceValue *PseudoValue = Operand->getPseudoValue();
1147  if (PseudoValue && PseudoValue->kind() == PseudoSourceValue::FixedStack) {
1148  if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(PseudoValue)) {
1149  return FS->getFrameIndex();
1150  }
1151  }
1152  return -1;
1153  };
1154 
1155  auto IsStackOp = [GetFrameIndex](MachineInstr *I) {
1156  switch (I->getOpcode()) {
1157  case ARM::MVE_VSTRWU32:
1158  case ARM::MVE_VLDRWU32: {
1159  return I->getOperand(1).getReg() == ARM::SP &&
1160  I->memoperands().size() == 1 &&
1161  GetFrameIndex(I->memoperands().front()) >= 0;
1162  }
1163  default:
1164  return false;
1165  }
1166  };
1167 
1168  // An unpredicated vector register spill is allowed if all of the uses of the
1169  // stack slot are within the loop
1170  if (MI->getOpcode() != ARM::MVE_VSTRWU32 || !IsStackOp(MI))
1171  return false;
1172 
1173  // Search all blocks after the loop for accesses to the same stack slot.
1174  // ReachingDefAnalysis doesn't work for sp as it relies on registers being
1175  // live-out (which sp never is) to know what blocks to look in
1176  if (MI->memoperands().size() == 0)
1177  return false;
1178  int FI = GetFrameIndex(MI->memoperands().front());
1179 
1180  auto &FrameInfo = MI->getParent()->getParent()->getFrameInfo();
1181  if (FI == -1 || !FrameInfo.isSpillSlotObjectIndex(FI))
1182  return false;
1183 
1185  ML->getExitBlocks(Frontier);
1186  SmallPtrSet<MachineBasicBlock *, 4> Visited{MI->getParent()};
1187  unsigned Idx = 0;
1188  while (Idx < Frontier.size()) {
1189  MachineBasicBlock *BB = Frontier[Idx];
1190  bool LookAtSuccessors = true;
1191  for (auto &I : *BB) {
1192  if (!IsStackOp(&I) || I.memoperands().size() == 0)
1193  continue;
1194  if (GetFrameIndex(I.memoperands().front()) != FI)
1195  continue;
1196  // If this block has a store to the stack slot before any loads then we
1197  // can ignore the block
1198  if (I.getOpcode() == ARM::MVE_VSTRWU32) {
1199  LookAtSuccessors = false;
1200  break;
1201  }
1202  // If the store and the load are using the same stack slot then the
1203  // store isn't valid for tail predication
1204  if (I.getOpcode() == ARM::MVE_VLDRWU32)
1205  return false;
1206  }
1207 
1208  if (LookAtSuccessors) {
1209  for (auto *Succ : BB->successors()) {
1210  if (!Visited.contains(Succ) && !is_contained(Frontier, Succ))
1211  Frontier.push_back(Succ);
1212  }
1213  }
1214  Visited.insert(BB);
1215  Idx++;
1216  }
1217 
1218  return true;
1219 }
1220 
1221 bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) {
1222  if (CannotTailPredicate)
1223  return false;
1224 
1225  if (!shouldInspect(*MI))
1226  return true;
1227 
1228  if (MI->getOpcode() == ARM::MVE_VPSEL ||
1229  MI->getOpcode() == ARM::MVE_VPNOT) {
1230  // TODO: Allow VPSEL and VPNOT, we currently cannot because:
1231  // 1) It will use the VPR as a predicate operand, but doesn't have to be
1232  // instead a VPT block, which means we can assert while building up
1233  // the VPT block because we don't find another VPT or VPST to being a new
1234  // one.
1235  // 2) VPSEL still requires a VPR operand even after tail predicating,
1236  // which means we can't remove it unless there is another
1237  // instruction, such as vcmp, that can provide the VPR def.
1238  return false;
1239  }
1240 
1241  // Record all VCTPs and check that they're equivalent to one another.
1242  if (isVCTP(MI) && !AddVCTP(MI))
1243  return false;
1244 
1245  // Inspect uses first so that any instructions that alter the VPR don't
1246  // alter the predicate upon themselves.
1247  const MCInstrDesc &MCID = MI->getDesc();
1248  bool IsUse = false;
1249  unsigned LastOpIdx = MI->getNumOperands() - 1;
1250  for (auto &Op : enumerate(reverse(MCID.operands()))) {
1251  const MachineOperand &MO = MI->getOperand(LastOpIdx - Op.index());
1252  if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR)
1253  continue;
1254 
1255  if (ARM::isVpred(Op.value().OperandType)) {
1256  VPTState::addInst(MI);
1257  IsUse = true;
1258  } else if (MI->getOpcode() != ARM::MVE_VPST) {
1259  LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
1260  return false;
1261  }
1262  }
1263 
1264  // If we find an instruction that has been marked as not valid for tail
1265  // predication, only allow the instruction if it's contained within a valid
1266  // VPT block.
1267  bool RequiresExplicitPredication =
1269  if (isDomainMVE(MI) && RequiresExplicitPredication) {
1270  if (MI->getOpcode() == ARM::MQPRCopy)
1271  return true;
1272  if (!IsUse && producesDoubleWidthResult(*MI)) {
1273  DoubleWidthResultInstrs.insert(MI);
1274  return true;
1275  }
1276 
1277  LLVM_DEBUG(if (!IsUse) dbgs()
1278  << "ARM Loops: Can't tail predicate: " << *MI);
1279  return IsUse;
1280  }
1281 
1282  // If the instruction is already explicitly predicated, then the conversion
1283  // will be fine, but ensure that all store operations are predicated.
1284  if (MI->mayStore() && !ValidateMVEStore(MI, &ML))
1285  return IsUse;
1286 
1287  // If this instruction defines the VPR, update the predicate for the
1288  // proceeding instructions.
1289  if (isVectorPredicate(MI)) {
1290  // Clear the existing predicate when we're not in VPT Active state,
1291  // otherwise we add to it.
1292  if (!isVectorPredicated(MI))
1293  VPTState::resetPredicate(MI);
1294  else
1295  VPTState::addPredicate(MI);
1296  }
1297 
1298  // Finally once the predicate has been modified, we can start a new VPT
1299  // block if necessary.
1300  if (isVPTOpcode(MI->getOpcode()))
1302 
1303  return true;
1304 }
1305 
1306 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
1307  const ARMSubtarget &ST = mf.getSubtarget<ARMSubtarget>();
1308  if (!ST.hasLOB())
1309  return false;
1310 
1311  MF = &mf;
1312  LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");
1313 
1314  MLI = &getAnalysis<MachineLoopInfo>();
1315  RDA = &getAnalysis<ReachingDefAnalysis>();
1317  MRI = &MF->getRegInfo();
1318  TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
1319  TRI = ST.getRegisterInfo();
1320  BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF));
1321  BBUtils->computeAllBlockSizes();
1322  BBUtils->adjustBBOffsetsAfter(&MF->front());
1323 
1324  bool Changed = false;
1325  for (auto *ML : *MLI) {
1326  if (ML->isOutermost())
1327  Changed |= ProcessLoop(ML);
1328  }
1329  Changed |= RevertNonLoops();
1330  return Changed;
1331 }
1332 
1333 bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
1334 
1335  bool Changed = false;
1336 
1337  // Process inner loops first.
1338  for (MachineLoop *L : *ML)
1339  Changed |= ProcessLoop(L);
1340 
1341  LLVM_DEBUG({
1342  dbgs() << "ARM Loops: Processing loop containing:\n";
1343  if (auto *Preheader = ML->getLoopPreheader())
1344  dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
1345  else if (auto *Preheader = MLI->findLoopPreheader(ML, true, true))
1346  dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
1347  for (auto *MBB : ML->getBlocks())
1348  dbgs() << " - Block: " << printMBBReference(*MBB) << "\n";
1349  });
1350 
1351  // Search the given block for a loop start instruction. If one isn't found,
1352  // and there's only one predecessor block, search that one too.
1353  std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
1354  [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
1355  for (auto &MI : *MBB) {
1356  if (isLoopStart(MI))
1357  return &MI;
1358  }
1359  if (MBB->pred_size() == 1)
1360  return SearchForStart(*MBB->pred_begin());
1361  return nullptr;
1362  };
1363 
1364  LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII);
1365  // Search the preheader for the start intrinsic.
1366  // FIXME: I don't see why we shouldn't be supporting multiple predecessors
1367  // with potentially multiple set.loop.iterations, so we need to enable this.
1368  if (LoLoop.Preheader)
1369  LoLoop.Start = SearchForStart(LoLoop.Preheader);
1370  else
1371  return Changed;
1372 
1373  // Find the low-overhead loop components and decide whether or not to fall
1374  // back to a normal loop. Also look for a vctp instructions and decide
1375  // whether we can convert that predicate using tail predication.
1376  for (auto *MBB : reverse(ML->getBlocks())) {
1377  for (auto &MI : *MBB) {
1378  if (MI.isDebugValue())
1379  continue;
1380  else if (MI.getOpcode() == ARM::t2LoopDec)
1381  LoLoop.Dec = &MI;
1382  else if (MI.getOpcode() == ARM::t2LoopEnd)
1383  LoLoop.End = &MI;
1384  else if (MI.getOpcode() == ARM::t2LoopEndDec)
1385  LoLoop.End = LoLoop.Dec = &MI;
1386  else if (isLoopStart(MI))
1387  LoLoop.Start = &MI;
1388  else if (MI.getDesc().isCall()) {
1389  // TODO: Though the call will require LE to execute again, does this
1390  // mean we should revert? Always executing LE hopefully should be
1391  // faster than performing a sub,cmp,br or even subs,br.
1392  LoLoop.Revert = true;
1393  LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
1394  } else {
1395  // Record VPR defs and build up their corresponding vpt blocks.
1396  // Check we know how to tail predicate any mve instructions.
1397  LoLoop.AnalyseMVEInst(&MI);
1398  }
1399  }
1400  }
1401 
1402  LLVM_DEBUG(LoLoop.dump());
1403  if (!LoLoop.FoundAllComponents()) {
1404  LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n");
1405  return Changed;
1406  }
1407 
1408  assert(LoLoop.Start->getOpcode() != ARM::t2WhileLoopStart &&
1409  "Expected t2WhileLoopStart to be removed before regalloc!");
1410 
1411  // Check that the only instruction using LoopDec is LoopEnd. This can only
1412  // happen when the Dec and End are separate, not a single t2LoopEndDec.
1413  // TODO: Check for copy chains that really have no effect.
1414  if (LoLoop.Dec != LoLoop.End) {
1416  RDA->getReachingLocalUses(LoLoop.Dec, MCRegister::from(ARM::LR), Uses);
1417  if (Uses.size() > 1 || !Uses.count(LoLoop.End)) {
1418  LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n");
1419  LoLoop.Revert = true;
1420  }
1421  }
1422  LoLoop.Validate(BBUtils.get());
1423  Expand(LoLoop);
1424  return true;
1425 }
1426 
1427 // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
1428 // beq that branches to the exit branch.
1429 // TODO: We could also try to generate a cbz if the value in LR is also in
1430 // another low register.
1431 void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
1432  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
1434  unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
1435  ARM::tBcc : ARM::t2Bcc;
1436 
1437  RevertWhileLoopStartLR(MI, TII, BrOpc);
1438 }
1439 
1440 void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const {
1441  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI);
1443 }
1444 
1446  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
1447  MachineBasicBlock *MBB = MI->getParent();
1449  for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) {
1450  if (I->getOpcode() == ARM::t2LoopEnd) {
1451  Ignore.insert(&*I);
1452  break;
1453  }
1454  }
1455 
1456  // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
1457  bool SetFlags =
1459 
1460  llvm::RevertLoopDec(MI, TII, SetFlags);
1461  return SetFlags;
1462 }
1463 
1464 // Generate a subs, or sub and cmp, and a branch instead of an LE.
1465 void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
1466  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
1467 
1468  MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
1469  unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
1470  ARM::tBcc : ARM::t2Bcc;
1471 
1472  llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp);
1473 }
1474 
1475 // Generate a subs, or sub and cmp, and a branch instead of an LE.
1476 void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const {
1477  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI);
1478  assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!");
1479  MachineBasicBlock *MBB = MI->getParent();
1480 
1481  MachineInstrBuilder MIB =
1482  BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
1483  MIB.addDef(ARM::LR);
1484  MIB.add(MI->getOperand(1));
1485  MIB.addImm(1);
1486  MIB.addImm(ARMCC::AL);
1487  MIB.addReg(ARM::NoRegister);
1488  MIB.addReg(ARM::CPSR);
1489  MIB->getOperand(5).setIsDef(true);
1490 
1491  MachineBasicBlock *DestBB = MI->getOperand(2).getMBB();
1492  unsigned BrOpc =
1493  BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc;
1494 
1495  // Create bne
1496  MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
1497  MIB.add(MI->getOperand(2)); // branch target
1498  MIB.addImm(ARMCC::NE); // condition code
1499  MIB.addReg(ARM::CPSR);
1500 
1501  MI->eraseFromParent();
1502 }
1503 
1504 // Perform dead code elimation on the loop iteration count setup expression.
1505 // If we are tail-predicating, the number of elements to be processed is the
1506 // operand of the VCTP instruction in the vector body, see getCount(), which is
1507 // register $r3 in this example:
1508 //
1509 // $lr = big-itercount-expression
1510 // ..
1511 // $lr = t2DoLoopStart renamable $lr
1512 // vector.body:
1513 // ..
1514 // $vpr = MVE_VCTP32 renamable $r3
1515 // renamable $lr = t2LoopDec killed renamable $lr, 1
1516 // t2LoopEnd renamable $lr, %vector.body
1517 // tB %end
1518 //
1519 // What we would like achieve here is to replace the do-loop start pseudo
1520 // instruction t2DoLoopStart with:
1521 //
1522 // $lr = MVE_DLSTP_32 killed renamable $r3
1523 //
1524 // Thus, $r3 which defines the number of elements, is written to $lr,
1525 // and then we want to delete the whole chain that used to define $lr,
1526 // see the comment below how this chain could look like.
1527 //
1528 void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) {
1529  if (!LoLoop.IsTailPredicationLegal())
1530  return;
1531 
1532  LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n");
1533 
1534  MachineInstr *Def = RDA->getMIOperand(LoLoop.Start, 1);
1535  if (!Def) {
1536  LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n");
1537  return;
1538  }
1539 
1540  // Collect and remove the users of iteration count.
1541  SmallPtrSet<MachineInstr*, 4> Killed = { LoLoop.Start, LoLoop.Dec,
1542  LoLoop.End };
1543  if (!TryRemove(Def, *RDA, LoLoop.ToRemove, Killed))
1544  LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n");
1545 }
1546 
1547 MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
1548  LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n");
1549  // When using tail-predication, try to delete the dead code that was used to
1550  // calculate the number of loop iterations.
1551  IterationCountDCE(LoLoop);
1552 
1553  MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt;
1554  MachineInstr *Start = LoLoop.Start;
1555  MachineBasicBlock *MBB = LoLoop.StartInsertBB;
1556  unsigned Opc = LoLoop.getStartOpcode();
1557  MachineOperand &Count = LoLoop.getLoopStartOperand();
1558 
1559  // A DLS lr, lr we needn't emit
1560  MachineInstr* NewStart;
1561  if (!DisableOmitDLS && Opc == ARM::t2DLS && Count.isReg() &&
1562  Count.getReg() == ARM::LR) {
1563  LLVM_DEBUG(dbgs() << "ARM Loops: Didn't insert start: DLS lr, lr");
1564  NewStart = nullptr;
1565  } else {
1566  MachineInstrBuilder MIB =
1567  BuildMI(*MBB, InsertPt, Start->getDebugLoc(), TII->get(Opc));
1568 
1569  MIB.addDef(ARM::LR);
1570  MIB.add(Count);
1571  if (isWhileLoopStart(*Start))
1572  MIB.addMBB(getWhileLoopStartTargetBB(*Start));
1573 
1574  LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
1575  NewStart = &*MIB;
1576  }
1577 
1578  LoLoop.ToRemove.insert(Start);
1579  return NewStart;
1580 }
1581 
1582 void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
1583  auto RemovePredicate = [](MachineInstr *MI) {
1584  if (MI->isDebugInstr())
1585  return;
1586  LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
1587  int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
1588  assert(PIdx >= 1 && "Trying to unpredicate a non-predicated instruction");
1589  assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then &&
1590  "Expected Then predicate!");
1591  MI->getOperand(PIdx).setImm(ARMVCC::None);
1592  MI->getOperand(PIdx + 1).setReg(0);
1593  };
1594 
1595  for (auto &Block : LoLoop.getVPTBlocks()) {
1596  SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
1597 
1598  auto ReplaceVCMPWithVPT = [&](MachineInstr *&TheVCMP, MachineInstr *At) {
1599  assert(TheVCMP && "Replacing a removed or non-existent VCMP");
1600  // Replace the VCMP with a VPT
1601  MachineInstrBuilder MIB =
1602  BuildMI(*At->getParent(), At, At->getDebugLoc(),
1603  TII->get(VCMPOpcodeToVPT(TheVCMP->getOpcode())));
1604  MIB.addImm(ARMVCC::Then);
1605  // Register one
1606  MIB.add(TheVCMP->getOperand(1));
1607  // Register two
1608  MIB.add(TheVCMP->getOperand(2));
1609  // The comparison code, e.g. ge, eq, lt
1610  MIB.add(TheVCMP->getOperand(3));
1611  LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
1612  LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
1613  LoLoop.ToRemove.insert(TheVCMP);
1614  TheVCMP = nullptr;
1615  };
1616 
1617  if (VPTState::isEntryPredicatedOnVCTP(Block, /*exclusive*/ true)) {
1618  MachineInstr *VPST = Insts.front();
1619  if (VPTState::hasUniformPredicate(Block)) {
1620  // A vpt block starting with VPST, is only predicated upon vctp and has no
1621  // internal vpr defs:
1622  // - Remove vpst.
1623  // - Unpredicate the remaining instructions.
1624  LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1625  for (unsigned i = 1; i < Insts.size(); ++i)
1626  RemovePredicate(Insts[i]);
1627  } else {
1628  // The VPT block has a non-uniform predicate but it uses a vpst and its
1629  // entry is guarded only by a vctp, which means we:
1630  // - Need to remove the original vpst.
1631  // - Then need to unpredicate any following instructions, until
1632  // we come across the divergent vpr def.
1633  // - Insert a new vpst to predicate the instruction(s) that following
1634  // the divergent vpr def.
1635  MachineInstr *Divergent = VPTState::getDivergent(Block);
1636  MachineBasicBlock *MBB = Divergent->getParent();
1637  auto DivergentNext = ++MachineBasicBlock::iterator(Divergent);
1638  while (DivergentNext != MBB->end() && DivergentNext->isDebugInstr())
1639  ++DivergentNext;
1640 
1641  bool DivergentNextIsPredicated =
1642  DivergentNext != MBB->end() &&
1643  getVPTInstrPredicate(*DivergentNext) != ARMVCC::None;
1644 
1645  for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext;
1646  I != E; ++I)
1647  RemovePredicate(&*I);
1648 
1649  // Check if the instruction defining vpr is a vcmp so it can be combined
1650  // with the VPST This should be the divergent instruction
1651  MachineInstr *VCMP =
1652  VCMPOpcodeToVPT(Divergent->getOpcode()) != 0 ? Divergent : nullptr;
1653 
1654  if (DivergentNextIsPredicated) {
1655  // Insert a VPST at the divergent only if the next instruction
1656  // would actually use it. A VCMP following a VPST can be
1657  // merged into a VPT so do that instead if the VCMP exists.
1658  if (!VCMP) {
1659  // Create a VPST (with a null mask for now, we'll recompute it
1660  // later)
1661  MachineInstrBuilder MIB =
1662  BuildMI(*Divergent->getParent(), Divergent,
1663  Divergent->getDebugLoc(), TII->get(ARM::MVE_VPST));
1664  MIB.addImm(0);
1665  LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
1666  LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
1667  } else {
1668  // No RDA checks are necessary here since the VPST would have been
1669  // directly after the VCMP
1670  ReplaceVCMPWithVPT(VCMP, VCMP);
1671  }
1672  }
1673  }
1674  LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1675  LoLoop.ToRemove.insert(VPST);
1676  } else if (Block.containsVCTP()) {
1677  // The vctp will be removed, so either the entire block will be dead or
1678  // the block mask of the vp(s)t will need to be recomputed.
1679  MachineInstr *VPST = Insts.front();
1680  if (Block.size() == 2) {
1681  assert(VPST->getOpcode() == ARM::MVE_VPST &&
1682  "Found a VPST in an otherwise empty vpt block");
1683  LoLoop.ToRemove.insert(VPST);
1684  } else
1685  LoLoop.BlockMasksToRecompute.insert(VPST);
1686  } else if (Insts.front()->getOpcode() == ARM::MVE_VPST) {
1687  // If this block starts with a VPST then attempt to merge it with the
1688  // preceeding un-merged VCMP into a VPT. This VCMP comes from a VPT
1689  // block that no longer exists
1690  MachineInstr *VPST = Insts.front();
1691  auto Next = ++MachineBasicBlock::iterator(VPST);
1693  "The instruction after a VPST must be predicated");
1694  (void)Next;
1695  MachineInstr *VprDef = RDA->getUniqueReachingMIDef(VPST, ARM::VPR);
1696  if (VprDef && VCMPOpcodeToVPT(VprDef->getOpcode()) &&
1697  !LoLoop.ToRemove.contains(VprDef)) {
1698  MachineInstr *VCMP = VprDef;
1699  // The VCMP and VPST can only be merged if the VCMP's operands will have
1700  // the same values at the VPST.
1701  // If any of the instructions between the VCMP and VPST are predicated
1702  // then a different code path is expected to have merged the VCMP and
1703  // VPST already.
1706  RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(1).getReg()) &&
1707  RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(2).getReg())) {
1708  ReplaceVCMPWithVPT(VCMP, VPST);
1709  LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1710  LoLoop.ToRemove.insert(VPST);
1711  }
1712  }
1713  }
1714  }
1715 
1716  LoLoop.ToRemove.insert(LoLoop.VCTPs.begin(), LoLoop.VCTPs.end());
1717 }
1718 
1719 void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
1720 
1721  // Combine the LoopDec and LoopEnd instructions into LE(TP).
1722  auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) {
1723  MachineInstr *End = LoLoop.End;
1724  MachineBasicBlock *MBB = End->getParent();
1725  unsigned Opc = LoLoop.IsTailPredicationLegal() ?
1726  ARM::MVE_LETP : ARM::t2LEUpdate;
1727  MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
1728  TII->get(Opc));
1729  MIB.addDef(ARM::LR);
1730  unsigned Off = LoLoop.Dec == LoLoop.End ? 1 : 0;
1731  MIB.add(End->getOperand(Off + 0));
1732  MIB.add(End->getOperand(Off + 1));
1733  LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
1734  LoLoop.ToRemove.insert(LoLoop.Dec);
1735  LoLoop.ToRemove.insert(End);
1736  return &*MIB;
1737  };
1738 
1739  // TODO: We should be able to automatically remove these branches before we
1740  // get here - probably by teaching analyzeBranch about the pseudo
1741  // instructions.
1742  // If there is an unconditional branch, after I, that just branches to the
1743  // next block, remove it.
1744  auto RemoveDeadBranch = [](MachineInstr *I) {
1745  MachineBasicBlock *BB = I->getParent();
1746  MachineInstr *Terminator = &BB->instr_back();
1747  if (Terminator->isUnconditionalBranch() && I != Terminator) {
1748  MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
1749  if (BB->isLayoutSuccessor(Succ)) {
1750  LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
1751  Terminator->eraseFromParent();
1752  }
1753  }
1754  };
1755 
1756  // And VMOVCopies need to become 2xVMOVD for tail predication to be valid.
1757  // Anything other MQPRCopy can be converted to MVE_VORR later on.
1758  auto ExpandVMOVCopies = [this](SmallPtrSet<MachineInstr *, 4> &VMOVCopies) {
1759  for (auto *MI : VMOVCopies) {
1760  LLVM_DEBUG(dbgs() << "Converting copy to VMOVD: " << *MI);
1761  assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
1762  MachineBasicBlock *MBB = MI->getParent();
1763  Register Dst = MI->getOperand(0).getReg();
1764  Register Src = MI->getOperand(1).getReg();
1765  auto MIB1 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
1766  ARM::D0 + (Dst - ARM::Q0) * 2)
1767  .addReg(ARM::D0 + (Src - ARM::Q0) * 2)
1768  .add(predOps(ARMCC::AL));
1769  (void)MIB1;
1770  LLVM_DEBUG(dbgs() << " into " << *MIB1);
1771  auto MIB2 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
1772  ARM::D0 + (Dst - ARM::Q0) * 2 + 1)
1773  .addReg(ARM::D0 + (Src - ARM::Q0) * 2 + 1)
1774  .add(predOps(ARMCC::AL));
1775  LLVM_DEBUG(dbgs() << " and " << *MIB2);
1776  (void)MIB2;
1777  MI->eraseFromParent();
1778  }
1779  };
1780 
1781  if (LoLoop.Revert) {
1782  if (isWhileLoopStart(*LoLoop.Start))
1783  RevertWhile(LoLoop.Start);
1784  else
1785  RevertDo(LoLoop.Start);
1786  if (LoLoop.Dec == LoLoop.End)
1787  RevertLoopEndDec(LoLoop.End);
1788  else
1789  RevertLoopEnd(LoLoop.End, RevertLoopDec(LoLoop.Dec));
1790  } else {
1791  ExpandVMOVCopies(LoLoop.VMOVCopies);
1792  LoLoop.Start = ExpandLoopStart(LoLoop);
1793  if (LoLoop.Start)
1794  RemoveDeadBranch(LoLoop.Start);
1795  LoLoop.End = ExpandLoopEnd(LoLoop);
1796  RemoveDeadBranch(LoLoop.End);
1797  if (LoLoop.IsTailPredicationLegal())
1798  ConvertVPTBlocks(LoLoop);
1799  for (auto *I : LoLoop.ToRemove) {
1800  LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I);
1801  I->eraseFromParent();
1802  }
1803  for (auto *I : LoLoop.BlockMasksToRecompute) {
1804  LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I);
1806  LLVM_DEBUG(dbgs() << " ... done: " << *I);
1807  }
1808  }
1809 
1810  PostOrderLoopTraversal DFS(LoLoop.ML, *MLI);
1811  DFS.ProcessLoop();
1812  const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder();
1813  for (auto *MBB : PostOrder) {
1815  // FIXME: For some reason, the live-in print order is non-deterministic for
1816  // our tests and I can't out why... So just sort them.
1818  }
1819 
1820  for (auto *MBB : reverse(PostOrder))
1822 
1823  // We've moved, removed and inserted new instructions, so update RDA.
1824  RDA->reset();
1825 }
1826 
1827 bool ARMLowOverheadLoops::RevertNonLoops() {
1828  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n");
1829  bool Changed = false;
1830 
1831  for (auto &MBB : *MF) {
1836  SmallVector<MachineInstr *, 4> MQPRCopies;
1837 
1838  for (auto &I : MBB) {
1839  if (isLoopStart(I))
1840  Starts.push_back(&I);
1841  else if (I.getOpcode() == ARM::t2LoopDec)
1842  Decs.push_back(&I);
1843  else if (I.getOpcode() == ARM::t2LoopEnd)
1844  Ends.push_back(&I);
1845  else if (I.getOpcode() == ARM::t2LoopEndDec)
1846  EndDecs.push_back(&I);
1847  else if (I.getOpcode() == ARM::MQPRCopy)
1848  MQPRCopies.push_back(&I);
1849  }
1850 
1851  if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty() &&
1852  MQPRCopies.empty())
1853  continue;
1854 
1855  Changed = true;
1856 
1857  for (auto *Start : Starts) {
1858  if (isWhileLoopStart(*Start))
1859  RevertWhile(Start);
1860  else
1861  RevertDo(Start);
1862  }
1863  for (auto *Dec : Decs)
1864  RevertLoopDec(Dec);
1865 
1866  for (auto *End : Ends)
1867  RevertLoopEnd(End);
1868  for (auto *End : EndDecs)
1869  RevertLoopEndDec(End);
1870  for (auto *MI : MQPRCopies) {
1871  LLVM_DEBUG(dbgs() << "Converting copy to VORR: " << *MI);
1872  assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
1873  MachineBasicBlock *MBB = MI->getParent();
1874  auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::MVE_VORR),
1875  MI->getOperand(0).getReg())
1876  .add(MI->getOperand(1))
1877  .add(MI->getOperand(1));
1878  addUnpredicatedMveVpredROp(MIB, MI->getOperand(0).getReg());
1879  MI->eraseFromParent();
1880  }
1881  }
1882  return Changed;
1883 }
1884 
1886  return new ARMLowOverheadLoops();
1887 }
isVectorPredicated
static bool isVectorPredicated(MachineInstr *MI)
Definition: ARMLowOverheadLoops.cpp:89
llvm::TargetRegisterInfo::getRegAsmName
virtual StringRef getRegAsmName(MCRegister Reg) const
Return the assembly name for Reg.
Definition: TargetRegisterInfo.h:1050
i
i
Definition: README.txt:29
ARMSubtarget.h
INITIALIZE_PASS
INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, false, false) static bool TryRemove(MachineInstr *MI
producesFalseLanesZero
static bool producesFalseLanesZero(MachineInstr &MI, const TargetRegisterClass *QPRs, const ReachingDefAnalysis &RDA, InstSet &FalseLanesZero)
Definition: ARMLowOverheadLoops.cpp:883
llvm::ReachingDefAnalysis::reset
void reset()
Re-run the analysis.
Definition: ReachingDefAnalysis.cpp:230
llvm::MachineBasicBlock::pred_begin
pred_iterator pred_begin()
Definition: MachineBasicBlock.h:353
MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:109
llvm::ReachingDefAnalysis::hasSameReachingDef
bool hasSameReachingDef(MachineInstr *A, MachineInstr *B, MCRegister PhysReg) const
Return whether A and B use the same def of PhysReg.
Definition: ReachingDefAnalysis.cpp:291
llvm::MachineInstrBuilder::addImm
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Definition: MachineInstrBuilder.h:131
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
llvm::tgtok::Def
@ Def
Definition: TGLexer.h:50
llvm::ReachingDefAnalysis::isSafeToRemove
bool isSafeToRemove(MachineInstr *MI, InstSet &ToRemove) const
Return whether removing this instruction will have no effect on the program, returning the redundant ...
Definition: ReachingDefAnalysis.cpp:605
llvm::ARM::isVpred
bool isVpred(OperandType op)
Definition: ARMMCTargetDesc.h:120
llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1748
llvm::ReachingDefAnalysis::getUniqueReachingMIDef
MachineInstr * getUniqueReachingMIDef(MachineInstr *MI, MCRegister PhysReg) const
If a single MachineInstr creates the reaching definition, then return it.
Definition: ReachingDefAnalysis.cpp:438
llvm::SmallPtrSetImpl::erase
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:379
llvm::AArch64PACKey::ID
ID
Definition: AArch64BaseInfo.h:818
llvm::ReachingDefAnalysis
This class provides the reaching def analysis.
Definition: ReachingDefAnalysis.h:69
MCInstrDesc.h
llvm::ARMSubtarget
Definition: ARMSubtarget.h:47
SetOperations.h
llvm::MachineRegisterInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Definition: MachineRegisterInfo.h:50
llvm::MCRegister::from
static MCRegister from(unsigned Val)
Check the provided unsigned value is a valid MCRegister.
Definition: MCRegister.h:67
llvm::MachineInstrBuilder::add
const MachineInstrBuilder & add(const MachineOperand &MO) const
Definition: MachineInstrBuilder.h:224
llvm::createARMLowOverheadLoopsPass
FunctionPass * createARMLowOverheadLoopsPass()
Definition: ARMLowOverheadLoops.cpp:1885
llvm::AArch64::FalseLanesZero
@ FalseLanesZero
Definition: AArch64InstrInfo.h:571
llvm::recomputeLivenessFlags
void recomputeLivenessFlags(MachineBasicBlock &MBB)
Recomputes dead and kill flags in MBB.
Definition: LivePhysRegs.cpp:281
llvm::MachineLoopInfo::findLoopPreheader
MachineBasicBlock * findLoopPreheader(MachineLoop *L, bool SpeculativePreheader=false, bool FindMultiLoopPreheader=false) const
Find the block that either is the loop preheader, or could speculatively be used as the preheader.
Definition: MachineLoopInfo.cpp:118
llvm::SetVector::size
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:77
llvm::isDoLoopStart
static bool isDoLoopStart(const MachineInstr &MI)
Definition: MVETailPredUtils.h:71
Ignore
ReachingDefAnalysis InstSet InstSet & Ignore
Definition: ARMLowOverheadLoops.cpp:547
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1199
llvm::printMBBReference
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
Definition: MachineBasicBlock.cpp:117
llvm::enumerate
detail::enumerator< R > enumerate(R &&TheRange)
Given an input range, returns a new range whose values are are pair (A,B) such that A is the 0-based ...
Definition: STLExtras.h:2263
ToRemove
ReachingDefAnalysis InstSet & ToRemove
Definition: ARMLowOverheadLoops.cpp:547
llvm::recomputeLiveIns
static void recomputeLiveIns(MachineBasicBlock &MBB)
Convenience function for recomputing live-in's for MBB.
Definition: LivePhysRegs.h:198
isDomainMVE
static bool isDomainMVE(MachineInstr *MI)
Definition: ARMLowOverheadLoops.cpp:102
llvm::MachineFunctionPass
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
Definition: MachineFunctionPass.h:30
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:139
IT
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
hasVPRUse
static bool hasVPRUse(MachineInstr &MI)
Definition: ARMLowOverheadLoops.cpp:98
llvm::TargetRegisterInfo
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Definition: TargetRegisterInfo.h:236
MachineLoopUtils.h
DisableOmitDLS
static cl::opt< bool > DisableOmitDLS("arm-disable-omit-dls", cl::Hidden, cl::desc("Disable omitting 'dls lr, lr' instructions"), cl::init(false))
llvm::MachineBasicBlock::liveins
iterator_range< livein_iterator > liveins() const
Definition: MachineBasicBlock.h:449
llvm::MachineMemOperand
A description of a memory reference used in the backend.
Definition: MachineMemOperand.h:127
shouldInspect
static bool shouldInspect(MachineInstr &MI)
Definition: ARMLowOverheadLoops.cpp:113
llvm::ARMII::VecSizeShift
@ VecSizeShift
Definition: ARMBaseInfo.h:420
llvm::MachineFunctionProperties
Properties which a MachineFunction may have at a given point in time.
Definition: MachineFunction.h:127
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
llvm::ARMVCC::Then
@ Then
Definition: ARMBaseInfo.h:91
CreateVPTBlock
static ARM::PredBlockMask CreateVPTBlock(MachineBasicBlock::instr_iterator &Iter, MachineBasicBlock::instr_iterator EndIter, SmallVectorImpl< MachineInstr * > &DeadInstructions)
Definition: MVEVPTBlockPass.cpp:163
llvm::dump
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Definition: SparseBitVector.h:877
llvm::MachineBasicBlock::back
MachineInstr & back()
Definition: MachineBasicBlock.h:285
TRI
unsigned const TargetRegisterInfo * TRI
Definition: MachineSink.cpp:1628
llvm::VCTPOpcodeToLSTP
static unsigned VCTPOpcodeToLSTP(unsigned Opcode, bool IsDoLoop)
Definition: MVETailPredUtils.h:25
llvm::ReachingDefAnalysis::hasLocalDefBefore
bool hasLocalDefBefore(MachineInstr *MI, MCRegister PhysReg) const
Provide whether the register has been defined in the same basic block as, and before,...
Definition: ReachingDefAnalysis.cpp:326
llvm::ReachingDefAnalysis::getLocalLiveOutMIDef
MachineInstr * getLocalLiveOutMIDef(MachineBasicBlock *MBB, MCRegister PhysReg) const
Return the local MI that produces the live out value for PhysReg, or nullptr for a non-live out or no...
Definition: ReachingDefAnalysis.cpp:527
llvm::MachineFunctionPass::getAnalysisUsage
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Definition: MachineFunctionPass.cpp:167
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::MachineLoopInfo
Definition: MachineLoopInfo.h:89
MachineRegisterInfo.h
Uses
SmallPtrSet< MachineInstr *, 2 > Uses
Definition: ARMLowOverheadLoops.cpp:590
llvm::ReachingDefAnalysis::isSafeToMoveBackwards
bool isSafeToMoveBackwards(MachineInstr *From, MachineInstr *To) const
Return whether From can be moved backwards to just after To.
Definition: ReachingDefAnalysis.cpp:595
llvm::getAddSubImmediate
int getAddSubImmediate(MachineInstr &MI)
Definition: ARMBaseInstrInfo.h:870
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::MachineInstrBuilder::addDef
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Definition: MachineInstrBuilder.h:116
llvm::MachineBasicBlock::pred_size
unsigned pred_size() const
Definition: MachineBasicBlock.h:365
llvm::TargetRegisterClass::contains
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
Definition: TargetRegisterInfo.h:96
llvm::MachineFunction::front
const MachineBasicBlock & front() const
Definition: MachineFunction.h:866
llvm::MachineFunction::getRegInfo
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Definition: MachineFunction.h:667
llvm::MCInstrDesc::TSFlags
uint64_t TSFlags
Definition: MCInstrDesc.h:205
llvm::PseudoSourceValue::kind
unsigned kind() const
Definition: PseudoSourceValue.h:66
llvm::MachineBasicBlock::insertAfter
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
Definition: MachineBasicBlock.h:930
MachineLoopInfo.h
llvm::SetVector::begin
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:82
llvm::MachineInstrBuilder::addMBB
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Definition: MachineInstrBuilder.h:146
llvm::ARM_MC::isPredicated
bool isPredicated(const MCInst &MI, const MCInstrInfo *MCII)
Definition: ARMMCTargetDesc.cpp:170
llvm::ReachingDefAnalysis::getGlobalUses
void getGlobalUses(MachineInstr *MI, MCRegister PhysReg, InstSet &Uses) const
Collect the users of the value stored in PhysReg, which is defined by MI.
Definition: ReachingDefAnalysis.cpp:375
producesDoubleWidthResult
static bool producesDoubleWidthResult(const MachineInstr &MI)
Definition: ARMLowOverheadLoops.cpp:838
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::MachineOperand::isUse
bool isUse() const
Definition: MachineOperand.h:369
llvm::MachineInstr::getOperand
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:526
llvm::RevertLoopDec
void RevertLoopDec(MachineInstr *MI, const TargetInstrInfo *TII, bool SetFlags=false)
Definition: MVETailPredUtils.h:145
llvm::findFirstVPTPredOperandIdx
int findFirstVPTPredOperandIdx(const MachineInstr &MI)
Definition: Thumb2InstrInfo.cpp:774
llvm::RevertLoopEnd
void RevertLoopEnd(MachineInstr *MI, const TargetInstrInfo *TII, unsigned BrOpc=ARM::t2Bcc, bool SkipCmp=false)
Definition: MVETailPredUtils.h:167
llvm::TargetRegisterClass
Definition: TargetRegisterInfo.h:45
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
Domain
Domain
Definition: CorrelatedValuePropagation.cpp:708
getVecSize
static int getVecSize(const MachineInstr &MI)
Definition: ARMLowOverheadLoops.cpp:107
llvm::MachineFunction::getProperties
const MachineFunctionProperties & getProperties() const
Get the function properties.
Definition: MachineFunction.h:748
MVETailPredUtils.h
llvm::getVPTInstrPredicate
ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI, Register &PredReg)
Definition: Thumb2InstrInfo.cpp:787
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:125
llvm::MCInstrDesc
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:197
llvm::MachineOperand
MachineOperand class - Representation of each machine instruction operand.
Definition: MachineOperand.h:48
llvm::MachineFunctionProperties::set
MachineFunctionProperties & set(Property P)
Definition: MachineFunction.h:196
llvm::MachineBasicBlock::RegisterMaskPair
Pair of physical register and lane mask.
Definition: MachineBasicBlock.h:100
llvm::ARMISD::VCMP
@ VCMP
Definition: ARMISelLowering.h:148
llvm::PseudoSourceValue::FixedStack
@ FixedStack
Definition: PseudoSourceValue.h:42
DisableTailPredication
static cl::opt< bool > DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden, cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass"), cl::init(false))
llvm::PseudoSourceValue
Special value supplied for machine level alias analysis.
Definition: PseudoSourceValue.h:35
isVectorPredicate
static bool isVectorPredicate(MachineInstr *MI)
Definition: ARMLowOverheadLoops.cpp:94
llvm::VCMPOpcodeToVPT
static unsigned VCMPOpcodeToVPT(unsigned Opcode)
Definition: ARMBaseInstrInfo.h:586
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
llvm::MachineBasicBlock
Definition: MachineBasicBlock.h:94
llvm::ARMII::ValidForTailPredication
@ ValidForTailPredication
Definition: ARMBaseInfo.h:402
llvm::MachineFunctionProperties::Property::NoVRegs
@ NoVRegs
llvm::ARMCC::NE
@ NE
Definition: ARMBaseInfo.h:32
Passes.h
llvm::ARMCC::AL
@ AL
Definition: ARMBaseInfo.h:45
llvm::SmallPtrSetImplBase::empty
bool empty() const
Definition: SmallPtrSet.h:92
llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition: MachineFunction.h:657
llvm::ARMII::DoubleWidthResult
@ DoubleWidthResult
Definition: ARMBaseInfo.h:413
llvm::cl::opt< bool >
llvm::MachineInstr::getDebugLoc
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:445
llvm::MachineLoop
Definition: MachineLoopInfo.h:44
llvm::SetVector::contains
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:209
ARMBaseRegisterInfo.h
llvm::isVPTOpcode
static bool isVPTOpcode(int Opc)
Definition: ARMBaseInstrInfo.h:570
llvm::TargetRegisterInfo::getRegClass
const TargetRegisterClass * getRegClass(unsigned i) const
Returns the register class associated with the enumeration value.
Definition: TargetRegisterInfo.h:770
llvm::MachineFunctionProperties::Property::TracksLiveness
@ TracksLiveness
llvm::MachineOperand::isReg
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Definition: MachineOperand.h:320
llvm::MachineInstr
Representation of each machine instruction.
Definition: MachineInstr.h:66
llvm::MachineInstrBuilder
Definition: MachineInstrBuilder.h:69
uint64_t
llvm::RevertDoLoopStart
void RevertDoLoopStart(MachineInstr *MI, const TargetInstrInfo *TII)
Definition: MVETailPredUtils.h:135
llvm::SmallPtrSetImpl::end
iterator end() const
Definition: SmallPtrSet.h:408
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::ARMBaseInstrInfo
Definition: ARMBaseInstrInfo.h:37
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:446
llvm::SmallPtrSetImpl::begin
iterator begin() const
Definition: SmallPtrSet.h:403
llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:1868
MachineFunctionPass.h
llvm::MachineFunction::getName
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
Definition: MachineFunction.cpp:567
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::isVCTP
static bool isVCTP(const MachineInstr *MI)
Definition: MVETailPredUtils.h:58
llvm::X86AS::FS
@ FS
Definition: X86.h:200
DEBUG_TYPE
#define DEBUG_TYPE
Definition: ARMLowOverheadLoops.cpp:76
ARMBaseInstrInfo.h
function
print Print MemDeps of function
Definition: MemDepPrinter.cpp:82
InlinePriorityMode::ML
@ ML
llvm::MachineInstrBuilder::addReg
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Definition: MachineInstrBuilder.h:97
llvm::MachineOperand::getReg
Register getReg() const
getReg - Returns the register number.
Definition: MachineOperand.h:359
llvm::ARMVCC::None
@ None
Definition: ARMBaseInfo.h:90
llvm::Register::asMCReg
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition: Register.h:120
ARM.h
llvm::AMDGPU::IsaInfo::TargetIDSetting::Off
@ Off
isRegInClass
static bool isRegInClass(const MachineOperand &MO, const TargetRegisterClass *Class)
Definition: ARMLowOverheadLoops.cpp:821
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
llvm::MachineFunction
Definition: MachineFunction.h:257
llvm::SetVector::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
llvm::size
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1715
llvm::MachineOperand::setIsDef
void setIsDef(bool Val=true)
Change a def to a use, or a use to a def.
Definition: MachineOperand.cpp:102
llvm::SmallPtrSetImplBase::clear
void clear()
Definition: SmallPtrSet.h:95
llvm::MachineBasicBlock::iterator
MachineInstrBundleIterator< MachineInstr > iterator
Definition: MachineBasicBlock.h:269
isHorizontalReduction
static bool isHorizontalReduction(const MachineInstr &MI)
Definition: ARMLowOverheadLoops.cpp:844
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1741
llvm::MachineBasicBlock::successors
iterator_range< succ_iterator > successors()
Definition: MachineBasicBlock.h:392
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
llvm::AnalysisUsage::setPreservesCFG
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:265
llvm::MachineInstr::getOpcode
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:516
llvm::ReachingDefAnalysis::getReachingLocalUses
void getReachingLocalUses(MachineInstr *MI, MCRegister PhysReg, InstSet &Uses) const
Provides the uses, in the same block as MI, of register that MI defines.
Definition: ReachingDefAnalysis.cpp:331
llvm::RevertWhileLoopStartLR
void RevertWhileLoopStartLR(MachineInstr *MI, const TargetInstrInfo *TII, unsigned BrOpc=ARM::t2Bcc, bool UseCmp=false)
Definition: MVETailPredUtils.h:98
llvm::append_range
void append_range(Container &C, Range &&R)
Wrapper function to append a range to a container.
Definition: STLExtras.h:2013
llvm::isWhileLoopStart
static bool isWhileLoopStart(const MachineInstr &MI)
Definition: MVETailPredUtils.h:76
ARMBasicBlockInfo.h
llvm::getWhileLoopStartTargetBB
MachineBasicBlock * getWhileLoopStartTargetBB(const MachineInstr &MI)
Definition: MVETailPredUtils.h:87
llvm::MachineInstr::getParent
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:313
llvm::MachineInstrBuilder::getInstr
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Definition: MachineInstrBuilder.h:89
MRI
unsigned const MachineRegisterInfo * MRI
Definition: AArch64AdvSIMDScalarPass.cpp:105
llvm::Register
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
llvm::ReachingDefAnalysis::collectKilledOperands
void collectKilledOperands(MachineInstr *MI, InstSet &Dead) const
Assuming MI is dead, recursively search the incoming operands which are killed by MI and collect thos...
Definition: ReachingDefAnalysis.cpp:649
llvm::SetVector::clear
void clear()
Completely clear the SetVector.
Definition: SetVector.h:220
MBB
MachineBasicBlock & MBB
Definition: AArch64SLSHardening.cpp:74
llvm::ARMII::RetainsPreviousHalfElement
@ RetainsPreviousHalfElement
Definition: ARMBaseInfo.h:406
ValidateMVEStore
static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML)
Definition: ARMLowOverheadLoops.cpp:1143
llvm::ReachingDefAnalysis::isSafeToDefRegAt
bool isSafeToDefRegAt(MachineInstr *MI, MCRegister PhysReg) const
Return whether a MachineInstr could be inserted at MI and safely define the given register without af...
Definition: ReachingDefAnalysis.cpp:681
Thumb2InstrInfo.h
llvm::MachineBasicBlock::sortUniqueLiveIns
void sortUniqueLiveIns()
Sorts and uniques the LiveIns vector.
Definition: MachineBasicBlock.cpp:597
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:351
llvm::MachineBasicBlock::insert
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
Definition: MachineBasicBlock.cpp:1327
MachineFrameInfo.h
ReachingDefAnalysis.h
llvm::ARMII::DomainMVE
@ DomainMVE
Definition: ARMBaseInfo.h:431
llvm::SmallVectorImpl::clear
void clear()
Definition: SmallVector.h:614
llvm::recomputeVPTBlockMask
void recomputeVPTBlockMask(MachineInstr &Instr)
Definition: Thumb2InstrInfo.cpp:799
llvm::ReachingDefAnalysis::getGlobalReachingDefs
void getGlobalReachingDefs(MachineInstr *MI, MCRegister PhysReg, InstSet &Defs) const
Collect all possible definitions of the value stored in PhysReg, which is used by MI.
Definition: ReachingDefAnalysis.cpp:400
llvm::ARMII::VecSize
@ VecSize
Definition: ARMBaseInfo.h:421
canGenerateNonZeros
static bool canGenerateNonZeros(const MachineInstr &MI)
Definition: ARMLowOverheadLoops.cpp:854
llvm::SetVector::end
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:92
llvm::ARMII::HorizontalReduction
@ HorizontalReduction
Definition: ARMBaseInfo.h:409
ARM_LOW_OVERHEAD_LOOPS_NAME
#define ARM_LOW_OVERHEAD_LOOPS_NAME
Definition: ARMLowOverheadLoops.cpp:77
isValid
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
Definition: RustDemangle.cpp:184
llvm::BuildMI
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
Definition: MachineInstrBuilder.h:357
RDA
ReachingDefAnalysis & RDA
Definition: ARMLowOverheadLoops.cpp:546
llvm::ReachingDefAnalysis::isReachingDefLiveOut
bool isReachingDefLiveOut(MachineInstr *MI, MCRegister PhysReg) const
Return whether the reaching def for MI also is live out of its parent block.
Definition: ReachingDefAnalysis.cpp:505
llvm::MachineBasicBlock::empty
bool empty() const
Definition: MachineBasicBlock.h:277
llvm::addUnpredicatedMveVpredROp
void addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB, Register DestReg)
Definition: ARMBaseInstrInfo.cpp:872
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:42
llvm::reverse
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:485
llvm::SmallPtrSetImpl
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:344
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:308
llvm::RegState::Dead
@ Dead
Unused definition.
Definition: MachineInstrBuilder.h:50
llvm::predOps
static std::array< MachineOperand, 2 > predOps(ARMCC::CondCodes Pred, unsigned PredReg=0)
Get the operands corresponding to the given Pred value.
Definition: ARMBaseInstrInfo.h:541
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
llvm::ReachingDefAnalysis::getMIOperand
MachineInstr * getMIOperand(MachineInstr *MI, unsigned Idx) const
If a single MachineInstr creates the reaching definition, for MIs operand at Idx, then return it.
Definition: ReachingDefAnalysis.cpp:458
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::cl::desc
Definition: CommandLine.h:412
llvm::ARMII::DomainMask
@ DomainMask
Definition: ARMBaseInfo.h:426
llvm::SetVector
A vector that has set insertion semantics.
Definition: SetVector.h:40
llvm::SmallPtrSetImpl::contains
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:389
llvm::MachineInstrBundleIterator< MachineInstr >
llvm::pdb::PDB_SymType::Block
@ Block
retainsPreviousHalfElement
static bool retainsPreviousHalfElement(const MachineInstr &MI)
Definition: ARMLowOverheadLoops.cpp:829
llvm::isMovRegOpcode
static bool isMovRegOpcode(int Opc)
Definition: ARMBaseInstrInfo.h:733
llvm::MCID::Terminator
@ Terminator
Definition: MCInstrDesc.h:157
llvm::MCInstrDesc::operands
iterator_range< const_opInfo_iterator > operands() const
Definition: MCInstrDesc.h:237
llvm::MachineBasicBlock::end
iterator end()
Definition: MachineBasicBlock.h:307
llvm::isSubImmOpcode
static bool isSubImmOpcode(int Opc)
Definition: ARMBaseInstrInfo.h:726
llvm::ARMBasicBlockUtils
Definition: ARMBasicBlockInfo.h:109
llvm::getTailPredVectorWidth
static unsigned getTailPredVectorWidth(unsigned Opcode)
Definition: MVETailPredUtils.h:42
SetVector.h
llvm::MachineOperand::isIdenticalTo
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Definition: MachineOperand.cpp:288
llvm::Use
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
llvm::SmallVectorImpl::emplace_back
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:941
llvm::ReachingDefAnalysis::isSafeToMoveForwards
bool isSafeToMoveForwards(MachineInstr *From, MachineInstr *To) const
Return whether From can be moved forwards to just before To.
Definition: ReachingDefAnalysis.cpp:585
llvm::MCRegister
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:24
llvm::isLoopStart
static bool isLoopStart(const MachineInstr &MI)
Definition: MVETailPredUtils.h:82
SmallSet.h
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
llvm::SmallVectorImpl::insert
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:809
LivePhysRegs.h