1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64InstrInfo.h"
15 #include "AArch64Subtarget.h"
17 #include "Utils/AArch64BaseInfo.h"
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/CodeGen/StackMaps.h"
35 #include "llvm/IR/DebugLoc.h"
36 #include "llvm/IR/GlobalValue.h"
37 #include "llvm/MC/MCAsmInfo.h"
38 #include "llvm/MC/MCInst.h"
39 #include "llvm/MC/MCInstBuilder.h"
40 #include "llvm/MC/MCInstrDesc.h"
41 #include "llvm/Support/Casting.h"
42 #include "llvm/Support/CodeGen.h"
44 #include "llvm/Support/Compiler.h"
46 #include "llvm/Support/LEB128.h"
50 #include <cassert>
51 #include <cstdint>
52 #include <iterator>
53 #include <utility>
54 
55 using namespace llvm;
56 
57 #define GET_INSTRINFO_CTOR_DTOR
58 #include "AArch64GenInstrInfo.inc"
59 
60 static cl::opt<unsigned> TBZDisplacementBits(
61  "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
62  cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
63 
64 static cl::opt<unsigned> CBZDisplacementBits(
65  "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
66  cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
67 
68 static cl::opt<unsigned>
69  BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
70  cl::desc("Restrict range of Bcc instructions (DEBUG)"));
71 
72 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
73  : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
74  AArch64::CATCHRET),
75  RI(STI.getTargetTriple()), Subtarget(STI) {}
76 
77 /// getInstSizeInBytes - Return the number of bytes of code the specified
78 /// instruction may be. This returns the maximum number of bytes.
79 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
80  const MachineBasicBlock &MBB = *MI.getParent();
81  const MachineFunction *MF = MBB.getParent();
82  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
83 
84  {
85  auto Op = MI.getOpcode();
86  if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
87  return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
88  }
89 
90  // Meta-instructions emit no code.
91  if (MI.isMetaInstruction())
92  return 0;
93 
94  // FIXME: We currently only handle pseudoinstructions that don't get expanded
95  // before the assembly printer.
96  unsigned NumBytes = 0;
97  const MCInstrDesc &Desc = MI.getDesc();
98 
99  // Size should be preferably set in
100  // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
101  // Specific cases handle instructions of variable sizes
102  switch (Desc.getOpcode()) {
103  default:
104  if (Desc.getSize())
105  return Desc.getSize();
106 
107  // Anything not explicitly designated otherwise (i.e. pseudo-instructions
108  // with fixed constant size but not specified in .td file) is a normal
109  // 4-byte insn.
110  NumBytes = 4;
111  break;
112  case TargetOpcode::STACKMAP:
113  // The upper bound for a stackmap intrinsic is the full length of its shadow
114  NumBytes = StackMapOpers(&MI).getNumPatchBytes();
115  assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
116  break;
117  case TargetOpcode::PATCHPOINT:
118  // The size of the patchpoint intrinsic is the number of bytes requested
119  NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
120  assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
121  break;
122  case TargetOpcode::STATEPOINT:
123  NumBytes = StatepointOpers(&MI).getNumPatchBytes();
124  assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
125  // No patch bytes means a normal call inst is emitted
126  if (NumBytes == 0)
127  NumBytes = 4;
128  break;
129  case AArch64::SPACE:
130  NumBytes = MI.getOperand(1).getImm();
131  break;
132  case TargetOpcode::BUNDLE:
133  NumBytes = getInstBundleLength(MI);
134  break;
135  }
136 
137  return NumBytes;
138 }
139 
140 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
141  unsigned Size = 0;
142  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
143  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
144  while (++I != E && I->isInsideBundle()) {
145  assert(!I->isBundle() && "No nested bundle!");
146  Size += getInstSizeInBytes(*I);
147  }
148  return Size;
149 }
150 
151 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
152  SmallVectorImpl<MachineOperand> &Cond) {
153  // Block ends with fall-through condbranch.
154  switch (LastInst->getOpcode()) {
155  default:
156  llvm_unreachable("Unknown branch instruction?");
157  case AArch64::Bcc:
158  Target = LastInst->getOperand(1).getMBB();
159  Cond.push_back(LastInst->getOperand(0));
160  break;
161  case AArch64::CBZW:
162  case AArch64::CBZX:
163  case AArch64::CBNZW:
164  case AArch64::CBNZX:
165  Target = LastInst->getOperand(1).getMBB();
166  Cond.push_back(MachineOperand::CreateImm(-1));
167  Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
168  Cond.push_back(LastInst->getOperand(0));
169  break;
170  case AArch64::TBZW:
171  case AArch64::TBZX:
172  case AArch64::TBNZW:
173  case AArch64::TBNZX:
174  Target = LastInst->getOperand(2).getMBB();
175  Cond.push_back(MachineOperand::CreateImm(-1));
176  Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
177  Cond.push_back(LastInst->getOperand(0));
178  Cond.push_back(LastInst->getOperand(1));
179  }
180 }
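// The Cond vector built above is consumed by reverseBranchCondition,
// instantiateCondBranch and insertSelect below, using one of three layouts
// (summarized here for reference; the encoding comes from the code above):
//   Bcc            -> Cond = { cc }
//   CBZ/CBNZ (W/X) -> Cond = { -1, opcode, reg }
//   TBZ/TBNZ (W/X) -> Cond = { -1, opcode, reg, bit }
// The leading -1 immediate marks a folded compare-and-branch rather than a
// plain condition code.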
181 
182 static unsigned getBranchDisplacementBits(unsigned Opc) {
183  switch (Opc) {
184  default:
185  llvm_unreachable("unexpected opcode!");
186  case AArch64::B:
187  return 64;
188  case AArch64::TBNZW:
189  case AArch64::TBZW:
190  case AArch64::TBNZX:
191  case AArch64::TBZX:
192  return TBZDisplacementBits;
193  case AArch64::CBNZW:
194  case AArch64::CBZW:
195  case AArch64::CBNZX:
196  case AArch64::CBZX:
197  return CBZDisplacementBits;
198  case AArch64::Bcc:
199  return BCCDisplacementBits;
200  }
201 }
202 
203 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
204  int64_t BrOffset) const {
205  unsigned Bits = getBranchDisplacementBits(BranchOp);
206  assert(Bits >= 3 && "max branch displacement must be enough to jump"
207  "over conditional branch expansion");
208  return isIntN(Bits, BrOffset / 4);
209 }
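// For example, with the default option values above, TB(N)Z has a 14-bit
// signed displacement scaled by 4, so byte offsets within roughly +/-32 KiB
// are in range; CB(N)Z and Bcc (19 bits) cover roughly +/-1 MiB. B is
// treated as unlimited here (64 bits) even though its encoding uses a 26-bit
// word offset (+/-128 MiB).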
210 
211 MachineBasicBlock *AArch64InstrInfo::getBranchDestBlock(
212  const MachineInstr &MI) const {
213  switch (MI.getOpcode()) {
214  default:
215  llvm_unreachable("unexpected opcode!");
216  case AArch64::B:
217  return MI.getOperand(0).getMBB();
218  case AArch64::TBZW:
219  case AArch64::TBNZW:
220  case AArch64::TBZX:
221  case AArch64::TBNZX:
222  return MI.getOperand(2).getMBB();
223  case AArch64::CBZW:
224  case AArch64::CBNZW:
225  case AArch64::CBZX:
226  case AArch64::CBNZX:
227  case AArch64::Bcc:
228  return MI.getOperand(1).getMBB();
229  }
230 }
231 
232 // Branch analysis.
233 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
234  MachineBasicBlock *&TBB,
235  MachineBasicBlock *&FBB,
236  SmallVectorImpl<MachineOperand> &Cond,
237  bool AllowModify) const {
238  // If the block has no terminators, it just falls into the block after it.
239  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
240  if (I == MBB.end())
241  return false;
242 
243  // Skip over SpeculationBarrierEndBB terminators
244  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
245  I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
246  --I;
247  }
248 
249  if (!isUnpredicatedTerminator(*I))
250  return false;
251 
252  // Get the last instruction in the block.
253  MachineInstr *LastInst = &*I;
254 
255  // If there is only one terminator instruction, process it.
256  unsigned LastOpc = LastInst->getOpcode();
257  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
258  if (isUncondBranchOpcode(LastOpc)) {
259  TBB = LastInst->getOperand(0).getMBB();
260  return false;
261  }
262  if (isCondBranchOpcode(LastOpc)) {
263  // Block ends with fall-through condbranch.
264  parseCondBranch(LastInst, TBB, Cond);
265  return false;
266  }
267  return true; // Can't handle indirect branch.
268  }
269 
270  // Get the instruction before it if it is a terminator.
271  MachineInstr *SecondLastInst = &*I;
272  unsigned SecondLastOpc = SecondLastInst->getOpcode();
273 
274  // If AllowModify is true and the block ends with two or more unconditional
275  // branches, delete all but the first unconditional branch.
276  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
277  while (isUncondBranchOpcode(SecondLastOpc)) {
278  LastInst->eraseFromParent();
279  LastInst = SecondLastInst;
280  LastOpc = LastInst->getOpcode();
281  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
282  // Return now; the only terminator is an unconditional branch.
283  TBB = LastInst->getOperand(0).getMBB();
284  return false;
285  } else {
286  SecondLastInst = &*I;
287  SecondLastOpc = SecondLastInst->getOpcode();
288  }
289  }
290  }
291 
292  // If we're allowed to modify and the block ends in an unconditional branch
293  // which could simply fallthrough, remove the branch. (Note: This case only
294  // matters when we can't understand the whole sequence, otherwise it's also
295  // handled by BranchFolding.cpp.)
296  if (AllowModify && isUncondBranchOpcode(LastOpc) &&
297  MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
298  LastInst->eraseFromParent();
299  LastInst = SecondLastInst;
300  LastOpc = LastInst->getOpcode();
301  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
302  assert(!isUncondBranchOpcode(LastOpc) &&
303  "unreachable unconditional branches removed above");
304 
305  if (isCondBranchOpcode(LastOpc)) {
306  // Block ends with fall-through condbranch.
307  parseCondBranch(LastInst, TBB, Cond);
308  return false;
309  }
310  return true; // Can't handle indirect branch.
311  } else {
312  SecondLastInst = &*I;
313  SecondLastOpc = SecondLastInst->getOpcode();
314  }
315  }
316 
317  // If there are three terminators, we don't know what sort of block this is.
318  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
319  return true;
320 
321  // If the block ends with a B and a Bcc, handle it.
322  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
323  parseCondBranch(SecondLastInst, TBB, Cond);
324  FBB = LastInst->getOperand(0).getMBB();
325  return false;
326  }
327 
328  // If the block ends with two unconditional branches, handle it. The second
329  // one is not executed, so remove it.
330  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
331  TBB = SecondLastInst->getOperand(0).getMBB();
332  I = LastInst;
333  if (AllowModify)
334  I->eraseFromParent();
335  return false;
336  }
337 
338  // ...likewise if it ends with an indirect branch followed by an unconditional
339  // branch.
340  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
341  I = LastInst;
342  if (AllowModify)
343  I->eraseFromParent();
344  return true;
345  }
346 
347  // Otherwise, can't handle this.
348  return true;
349 }
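// For example (block and condition names illustrative), a block ending in
//   Bcc eq, %bb.1
//   B %bb.2
// is reported with TBB = %bb.1, FBB = %bb.2, Cond = { eq } and a false
// return value, while a lone unconditional B fills in only TBB with an empty
// Cond. A true return value means the terminator sequence was not understood.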
350 
351 bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
352  MachineBranchPredicate &MBP,
353  bool AllowModify) const {
354  // For the moment, handle only a block which ends with a cb(n)zx followed by
355  // a fallthrough. Why this? Because it is a common form.
356  // TODO: Should we handle b.cc?
357 
358  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
359  if (I == MBB.end())
360  return true;
361 
362  // Skip over SpeculationBarrierEndBB terminators
363  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
364  I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
365  --I;
366  }
367 
368  if (!isUnpredicatedTerminator(*I))
369  return true;
370 
371  // Get the last instruction in the block.
372  MachineInstr *LastInst = &*I;
373  unsigned LastOpc = LastInst->getOpcode();
374  if (!isCondBranchOpcode(LastOpc))
375  return true;
376 
377  switch (LastOpc) {
378  default:
379  return true;
380  case AArch64::CBZW:
381  case AArch64::CBZX:
382  case AArch64::CBNZW:
383  case AArch64::CBNZX:
384  break;
385  };
386 
387  MBP.TrueDest = LastInst->getOperand(1).getMBB();
388  assert(MBP.TrueDest && "expected!");
389  MBP.FalseDest = MBB.getNextNode();
390 
391  MBP.ConditionDef = nullptr;
392  MBP.SingleUseCondition = false;
393 
394  MBP.LHS = LastInst->getOperand(0);
395  MBP.RHS = MachineOperand::CreateImm(0);
396  MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
397  : MachineBranchPredicate::PRED_EQ;
398  return false;
399 }
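// For example (names illustrative), a block ending in
//   CBNZX %x0, %bb.2
// with %bb.3 next in layout yields MBP.LHS = %x0, MBP.RHS = 0,
// MBP.Predicate = PRED_NE, MBP.TrueDest = %bb.2 and MBP.FalseDest = %bb.3.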
400 
401 bool AArch64InstrInfo::reverseBranchCondition(
402  SmallVectorImpl<MachineOperand> &Cond) const {
403  if (Cond[0].getImm() != -1) {
404  // Regular Bcc
405  AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
406  Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
407  } else {
408  // Folded compare-and-branch
409  switch (Cond[1].getImm()) {
410  default:
411  llvm_unreachable("Unknown conditional branch!");
412  case AArch64::CBZW:
413  Cond[1].setImm(AArch64::CBNZW);
414  break;
415  case AArch64::CBNZW:
416  Cond[1].setImm(AArch64::CBZW);
417  break;
418  case AArch64::CBZX:
419  Cond[1].setImm(AArch64::CBNZX);
420  break;
421  case AArch64::CBNZX:
422  Cond[1].setImm(AArch64::CBZX);
423  break;
424  case AArch64::TBZW:
425  Cond[1].setImm(AArch64::TBNZW);
426  break;
427  case AArch64::TBNZW:
428  Cond[1].setImm(AArch64::TBZW);
429  break;
430  case AArch64::TBZX:
431  Cond[1].setImm(AArch64::TBNZX);
432  break;
433  case AArch64::TBNZX:
434  Cond[1].setImm(AArch64::TBZX);
435  break;
436  }
437  }
438 
439  return false;
440 }
441 
442 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
443  int *BytesRemoved) const {
444  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
445  if (I == MBB.end())
446  return 0;
447 
448  if (!isUncondBranchOpcode(I->getOpcode()) &&
449  !isCondBranchOpcode(I->getOpcode()))
450  return 0;
451 
452  // Remove the branch.
453  I->eraseFromParent();
454 
455  I = MBB.end();
456 
457  if (I == MBB.begin()) {
458  if (BytesRemoved)
459  *BytesRemoved = 4;
460  return 1;
461  }
462  --I;
463  if (!isCondBranchOpcode(I->getOpcode())) {
464  if (BytesRemoved)
465  *BytesRemoved = 4;
466  return 1;
467  }
468 
469  // Remove the branch.
470  I->eraseFromParent();
471  if (BytesRemoved)
472  *BytesRemoved = 8;
473 
474  return 2;
475 }
476 
477 void AArch64InstrInfo::instantiateCondBranch(
478  MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
479  ArrayRef<MachineOperand> Cond) const {
480  if (Cond[0].getImm() != -1) {
481  // Regular Bcc
482  BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
483  } else {
484  // Folded compare-and-branch
485  // Note that we use addOperand instead of addReg to keep the flags.
486  const MachineInstrBuilder MIB =
487  BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
488  if (Cond.size() > 3)
489  MIB.addImm(Cond[3].getImm());
490  MIB.addMBB(TBB);
491  }
492 }
493 
494 unsigned AArch64InstrInfo::insertBranch(
495  MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
496  ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
497  // Shouldn't be a fall through.
498  assert(TBB && "insertBranch must not be told to insert a fallthrough");
499 
500  if (!FBB) {
501  if (Cond.empty()) // Unconditional branch?
502  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
503  else
504  instantiateCondBranch(MBB, DL, TBB, Cond);
505 
506  if (BytesAdded)
507  *BytesAdded = 4;
508 
509  return 1;
510  }
511 
512  // Two-way conditional branch.
513  instantiateCondBranch(MBB, DL, TBB, Cond);
514  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
515 
516  if (BytesAdded)
517  *BytesAdded = 8;
518 
519  return 2;
520 }
521 
522 // Find the original register that VReg is copied from.
523 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
524  while (Register::isVirtualRegister(VReg)) {
525  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
526  if (!DefMI->isFullCopy())
527  return VReg;
528  VReg = DefMI->getOperand(1).getReg();
529  }
530  return VReg;
531 }
532 
533 // Determine if VReg is defined by an instruction that can be folded into a
534 // csel instruction. If so, return the folded opcode, and the replacement
535 // register.
536 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
537  unsigned *NewVReg = nullptr) {
538  VReg = removeCopies(MRI, VReg);
539  if (!Register::isVirtualRegister(VReg))
540  return 0;
541 
542  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
543  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
544  unsigned Opc = 0;
545  unsigned SrcOpNum = 0;
546  switch (DefMI->getOpcode()) {
547  case AArch64::ADDSXri:
548  case AArch64::ADDSWri:
549  // if NZCV is used, do not fold.
550  if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
551  return 0;
552  // fall-through to ADDXri and ADDWri.
553  [[fallthrough]];
554  case AArch64::ADDXri:
555  case AArch64::ADDWri:
556  // add x, 1 -> csinc.
557  if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
558  DefMI->getOperand(3).getImm() != 0)
559  return 0;
560  SrcOpNum = 1;
561  Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
562  break;
563 
564  case AArch64::ORNXrr:
565  case AArch64::ORNWrr: {
566  // not x -> csinv, represented as orn dst, xzr, src.
567  unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
568  if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
569  return 0;
570  SrcOpNum = 2;
571  Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
572  break;
573  }
574 
575  case AArch64::SUBSXrr:
576  case AArch64::SUBSWrr:
577  // if NZCV is used, do not fold.
578  if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
579  return 0;
580  // fall-through to SUBXrr and SUBWrr.
581  [[fallthrough]];
582  case AArch64::SUBXrr:
583  case AArch64::SUBWrr: {
584  // neg x -> csneg, represented as sub dst, xzr, src.
585  unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
586  if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
587  return 0;
588  SrcOpNum = 2;
589  Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
590  break;
591  }
592  default:
593  return 0;
594  }
595  assert(Opc && SrcOpNum && "Missing parameters");
596 
597  if (NewVReg)
598  *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
599  return Opc;
600 }
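// For example (virtual register names illustrative), given
//   %t = ADDWri %a, 1, 0
// this returns CSINCWr with *NewVReg = %a, so insertSelect below can turn
//   %d = csel %t, %f, cc
// into
//   %d = CSINCWr %f, %a, InvertedCC(cc)
// applying the +1 to the false operand of the csinc instead of emitting the
// add.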
601 
602 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
603  ArrayRef<MachineOperand> Cond,
604  Register DstReg, Register TrueReg,
605  Register FalseReg, int &CondCycles,
606  int &TrueCycles,
607  int &FalseCycles) const {
608  // Check register classes.
609  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
610  const TargetRegisterClass *RC =
611  RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
612  if (!RC)
613  return false;
614 
615  // Also need to check the dest regclass, in case we're trying to optimize
616  // something like:
617  // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
618  if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
619  return false;
620 
621  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
622  unsigned ExtraCondLat = Cond.size() != 1;
623 
624  // GPRs are handled by csel.
625  // FIXME: Fold in x+1, -x, and ~x when applicable.
626  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
627  AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
628  // Single-cycle csel, csinc, csinv, and csneg.
629  CondCycles = 1 + ExtraCondLat;
630  TrueCycles = FalseCycles = 1;
631  if (canFoldIntoCSel(MRI, TrueReg))
632  TrueCycles = 0;
633  else if (canFoldIntoCSel(MRI, FalseReg))
634  FalseCycles = 0;
635  return true;
636  }
637 
638  // Scalar floating point is handled by fcsel.
639  // FIXME: Form fabs, fmin, and fmax when applicable.
640  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
641  AArch64::FPR32RegClass.hasSubClassEq(RC)) {
642  CondCycles = 5 + ExtraCondLat;
643  TrueCycles = FalseCycles = 2;
644  return true;
645  }
646 
647  // Can't do vectors.
648  return false;
649 }
650 
651 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
652  MachineBasicBlock::iterator I,
653  const DebugLoc &DL, Register DstReg,
654  ArrayRef<MachineOperand> Cond,
655  Register TrueReg, Register FalseReg) const {
656  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
657 
658  // Parse the condition code, see parseCondBranch() above.
659  AArch64CC::CondCode CC;
660  switch (Cond.size()) {
661  default:
662  llvm_unreachable("Unknown condition opcode in Cond");
663  case 1: // b.cc
664  CC = AArch64CC::CondCode(Cond[0].getImm());
665  break;
666  case 3: { // cbz/cbnz
667  // We must insert a compare against 0.
668  bool Is64Bit;
669  switch (Cond[1].getImm()) {
670  default:
671  llvm_unreachable("Unknown branch opcode in Cond");
672  case AArch64::CBZW:
673  Is64Bit = false;
674  CC = AArch64CC::EQ;
675  break;
676  case AArch64::CBZX:
677  Is64Bit = true;
678  CC = AArch64CC::EQ;
679  break;
680  case AArch64::CBNZW:
681  Is64Bit = false;
682  CC = AArch64CC::NE;
683  break;
684  case AArch64::CBNZX:
685  Is64Bit = true;
686  CC = AArch64CC::NE;
687  break;
688  }
689  Register SrcReg = Cond[2].getReg();
690  if (Is64Bit) {
691  // cmp reg, #0 is actually subs xzr, reg, #0.
692  MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
693  BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
694  .addReg(SrcReg)
695  .addImm(0)
696  .addImm(0);
697  } else {
698  MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
699  BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
700  .addReg(SrcReg)
701  .addImm(0)
702  .addImm(0);
703  }
704  break;
705  }
706  case 4: { // tbz/tbnz
707  // We must insert a tst instruction.
708  switch (Cond[1].getImm()) {
709  default:
710  llvm_unreachable("Unknown branch opcode in Cond");
711  case AArch64::TBZW:
712  case AArch64::TBZX:
713  CC = AArch64CC::EQ;
714  break;
715  case AArch64::TBNZW:
716  case AArch64::TBNZX:
717  CC = AArch64CC::NE;
718  break;
719  }
720  // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
721  if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
722  BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
723  .addReg(Cond[2].getReg())
724  .addImm(
725  AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
726  else
727  BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
728  .addReg(Cond[2].getReg())
729  .addImm(
730  AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
731  break;
732  }
733  }
734 
735  unsigned Opc = 0;
736  const TargetRegisterClass *RC = nullptr;
737  bool TryFold = false;
738  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
739  RC = &AArch64::GPR64RegClass;
740  Opc = AArch64::CSELXr;
741  TryFold = true;
742  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
743  RC = &AArch64::GPR32RegClass;
744  Opc = AArch64::CSELWr;
745  TryFold = true;
746  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
747  RC = &AArch64::FPR64RegClass;
748  Opc = AArch64::FCSELDrrr;
749  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
750  RC = &AArch64::FPR32RegClass;
751  Opc = AArch64::FCSELSrrr;
752  }
753  assert(RC && "Unsupported regclass");
754 
755  // Try folding simple instructions into the csel.
756  if (TryFold) {
757  unsigned NewVReg = 0;
758  unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
759  if (FoldedOpc) {
760  // The folded opcodes csinc, csinv and csneg apply the operation to
761  // FalseReg, so we need to invert the condition.
762  CC = AArch64CC::getInvertedCondCode(CC);
763  TrueReg = FalseReg;
764  } else
765  FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
766 
767  // Fold the operation. Leave any dead instructions for DCE to clean up.
768  if (FoldedOpc) {
769  FalseReg = NewVReg;
770  Opc = FoldedOpc;
771  // This extends the live range of NewVReg.
772  MRI.clearKillFlags(NewVReg);
773  }
774  }
775 
776  // Pull all virtual registers into the appropriate class.
777  MRI.constrainRegClass(TrueReg, RC);
778  MRI.constrainRegClass(FalseReg, RC);
779 
780  // Insert the csel.
781  BuildMI(MBB, I, DL, get(Opc), DstReg)
782  .addReg(TrueReg)
783  .addReg(FalseReg)
784  .addImm(CC);
785 }
786 
787 /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
788 static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
789  uint64_t Imm = MI.getOperand(1).getImm();
790  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
791  uint64_t Encoding;
792  return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
793 }
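// For example, MOVi64imm of a value such as 0x00ff00ff00ff00ff is a valid
// logical immediate and can be materialized as a single ORRXri from XZR,
// whereas an arbitrary constant like 0x123456789 generally is not and needs
// a MOVZ/MOVK sequence instead.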
794 
795 // FIXME: this implementation should be micro-architecture dependent, so a
796 // micro-architecture target hook should be introduced here in future.
797 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
798  if (!Subtarget.hasCustomCheapAsMoveHandling())
799  return MI.isAsCheapAsAMove();
800 
801  const unsigned Opcode = MI.getOpcode();
802 
803  // Firstly, check cases gated by features.
804 
805  if (Subtarget.hasZeroCycleZeroingFP()) {
806  if (Opcode == AArch64::FMOVH0 ||
807  Opcode == AArch64::FMOVS0 ||
808  Opcode == AArch64::FMOVD0)
809  return true;
810  }
811 
812  if (Subtarget.hasZeroCycleZeroingGP()) {
813  if (Opcode == TargetOpcode::COPY &&
814  (MI.getOperand(1).getReg() == AArch64::WZR ||
815  MI.getOperand(1).getReg() == AArch64::XZR))
816  return true;
817  }
818 
819  // Secondly, check cases specific to sub-targets.
820 
821  if (Subtarget.hasExynosCheapAsMoveHandling()) {
822  if (isExynosCheapAsMove(MI))
823  return true;
824 
825  return MI.isAsCheapAsAMove();
826  }
827 
828  // Finally, check generic cases.
829 
830  switch (Opcode) {
831  default:
832  return false;
833 
834  // add/sub on register without shift
835  case AArch64::ADDWri:
836  case AArch64::ADDXri:
837  case AArch64::SUBWri:
838  case AArch64::SUBXri:
839  return (MI.getOperand(3).getImm() == 0);
840 
841  // logical ops on immediate
842  case AArch64::ANDWri:
843  case AArch64::ANDXri:
844  case AArch64::EORWri:
845  case AArch64::EORXri:
846  case AArch64::ORRWri:
847  case AArch64::ORRXri:
848  return true;
849 
850  // logical ops on register without shift
851  case AArch64::ANDWrr:
852  case AArch64::ANDXrr:
853  case AArch64::BICWrr:
854  case AArch64::BICXrr:
855  case AArch64::EONWrr:
856  case AArch64::EONXrr:
857  case AArch64::EORWrr:
858  case AArch64::EORXrr:
859  case AArch64::ORNWrr:
860  case AArch64::ORNXrr:
861  case AArch64::ORRWrr:
862  case AArch64::ORRXrr:
863  return true;
864 
865  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
866  // ORRXri, it is as cheap as MOV
867  case AArch64::MOVi32imm:
868  return canBeExpandedToORR(MI, 32);
869  case AArch64::MOVi64imm:
870  return canBeExpandedToORR(MI, 64);
871  }
872 
873  llvm_unreachable("Unknown opcode to check as cheap as a move!");
874 }
875 
876 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
877  switch (MI.getOpcode()) {
878  default:
879  return false;
880 
881  case AArch64::ADDWrs:
882  case AArch64::ADDXrs:
883  case AArch64::ADDSWrs:
884  case AArch64::ADDSXrs: {
885  unsigned Imm = MI.getOperand(3).getImm();
886  unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
887  if (ShiftVal == 0)
888  return true;
889  return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
890  }
891 
892  case AArch64::ADDWrx:
893  case AArch64::ADDXrx:
894  case AArch64::ADDXrx64:
895  case AArch64::ADDSWrx:
896  case AArch64::ADDSXrx:
897  case AArch64::ADDSXrx64: {
898  unsigned Imm = MI.getOperand(3).getImm();
899  switch (AArch64_AM::getArithExtendType(Imm)) {
900  default:
901  return false;
902  case AArch64_AM::UXTB:
903  case AArch64_AM::UXTH:
904  case AArch64_AM::UXTW:
905  case AArch64_AM::UXTX:
906  return AArch64_AM::getArithShiftValue(Imm) <= 4;
907  }
908  }
909 
910  case AArch64::SUBWrs:
911  case AArch64::SUBSWrs: {
912  unsigned Imm = MI.getOperand(3).getImm();
913  unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
914  return ShiftVal == 0 ||
915  (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
916  }
917 
918  case AArch64::SUBXrs:
919  case AArch64::SUBSXrs: {
920  unsigned Imm = MI.getOperand(3).getImm();
921  unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
922  return ShiftVal == 0 ||
923  (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
924  }
925 
926  case AArch64::SUBWrx:
927  case AArch64::SUBXrx:
928  case AArch64::SUBXrx64:
929  case AArch64::SUBSWrx:
930  case AArch64::SUBSXrx:
931  case AArch64::SUBSXrx64: {
932  unsigned Imm = MI.getOperand(3).getImm();
933  switch (AArch64_AM::getArithExtendType(Imm)) {
934  default:
935  return false;
936  case AArch64_AM::UXTB:
937  case AArch64_AM::UXTH:
938  case AArch64_AM::UXTW:
939  case AArch64_AM::UXTX:
940  return AArch64_AM::getArithShiftValue(Imm) == 0;
941  }
942  }
943 
944  case AArch64::LDRBBroW:
945  case AArch64::LDRBBroX:
946  case AArch64::LDRBroW:
947  case AArch64::LDRBroX:
948  case AArch64::LDRDroW:
949  case AArch64::LDRDroX:
950  case AArch64::LDRHHroW:
951  case AArch64::LDRHHroX:
952  case AArch64::LDRHroW:
953  case AArch64::LDRHroX:
954  case AArch64::LDRQroW:
955  case AArch64::LDRQroX:
956  case AArch64::LDRSBWroW:
957  case AArch64::LDRSBWroX:
958  case AArch64::LDRSBXroW:
959  case AArch64::LDRSBXroX:
960  case AArch64::LDRSHWroW:
961  case AArch64::LDRSHWroX:
962  case AArch64::LDRSHXroW:
963  case AArch64::LDRSHXroX:
964  case AArch64::LDRSWroW:
965  case AArch64::LDRSWroX:
966  case AArch64::LDRSroW:
967  case AArch64::LDRSroX:
968  case AArch64::LDRWroW:
969  case AArch64::LDRWroX:
970  case AArch64::LDRXroW:
971  case AArch64::LDRXroX:
972  case AArch64::PRFMroW:
973  case AArch64::PRFMroX:
974  case AArch64::STRBBroW:
975  case AArch64::STRBBroX:
976  case AArch64::STRBroW:
977  case AArch64::STRBroX:
978  case AArch64::STRDroW:
979  case AArch64::STRDroX:
980  case AArch64::STRHHroW:
981  case AArch64::STRHHroX:
982  case AArch64::STRHroW:
983  case AArch64::STRHroX:
984  case AArch64::STRQroW:
985  case AArch64::STRQroX:
986  case AArch64::STRSroW:
987  case AArch64::STRSroX:
988  case AArch64::STRWroW:
989  case AArch64::STRWroX:
990  case AArch64::STRXroW:
991  case AArch64::STRXroX: {
992  unsigned IsSigned = MI.getOperand(3).getImm();
993  return !IsSigned;
994  }
995  }
996 }
997 
998 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
999  unsigned Opc = MI.getOpcode();
1000  switch (Opc) {
1001  default:
1002  return false;
1003  case AArch64::SEH_StackAlloc:
1004  case AArch64::SEH_SaveFPLR:
1005  case AArch64::SEH_SaveFPLR_X:
1006  case AArch64::SEH_SaveReg:
1007  case AArch64::SEH_SaveReg_X:
1008  case AArch64::SEH_SaveRegP:
1009  case AArch64::SEH_SaveRegP_X:
1010  case AArch64::SEH_SaveFReg:
1011  case AArch64::SEH_SaveFReg_X:
1012  case AArch64::SEH_SaveFRegP:
1013  case AArch64::SEH_SaveFRegP_X:
1014  case AArch64::SEH_SetFP:
1015  case AArch64::SEH_AddFP:
1016  case AArch64::SEH_Nop:
1017  case AArch64::SEH_PrologEnd:
1018  case AArch64::SEH_EpilogStart:
1019  case AArch64::SEH_EpilogEnd:
1020  case AArch64::SEH_PACSignLR:
1021  return true;
1022  }
1023 }
1024 
1025 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1026  Register &SrcReg, Register &DstReg,
1027  unsigned &SubIdx) const {
1028  switch (MI.getOpcode()) {
1029  default:
1030  return false;
1031  case AArch64::SBFMXri: // aka sxtw
1032  case AArch64::UBFMXri: // aka uxtw
1033  // Check for the 32 -> 64 bit extension case, these instructions can do
1034  // much more.
1035  if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1036  return false;
1037  // This is a signed or unsigned 32 -> 64 bit extension.
1038  SrcReg = MI.getOperand(1).getReg();
1039  DstReg = MI.getOperand(0).getReg();
1040  SubIdx = AArch64::sub_32;
1041  return true;
1042  }
1043 }
1044 
1045 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1046  const MachineInstr &MIa, const MachineInstr &MIb) const {
1047  const TargetRegisterInfo *TRI = &getRegisterInfo();
1048  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1049  int64_t OffsetA = 0, OffsetB = 0;
1050  unsigned WidthA = 0, WidthB = 0;
1051  bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1052 
1053  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1054  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1055 
1056  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1057  MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1058  return false;
1059 
1060  // Retrieve the base, offset from the base and width. Width
1061  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
1062  // the bases are identical, and the offset of a lower memory access +
1063  // the width doesn't overlap the offset of a higher memory access,
1064  // then the memory accesses are different.
1065  // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1066  // are assumed to have the same scale (vscale).
1067  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1068  WidthA, TRI) &&
1069  getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1070  WidthB, TRI)) {
1071  if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1072  OffsetAIsScalable == OffsetBIsScalable) {
1073  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1074  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1075  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1076  if (LowOffset + LowWidth <= HighOffset)
1077  return true;
1078  }
1079  }
1080  return false;
1081 }
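// For example, two accesses off the same base operand, one at offset 0 with
// width 8 and one at offset 8 with width 4, satisfy
// LowOffset + LowWidth <= HighOffset (0 + 8 <= 8) and are reported as
// trivially disjoint; accesses whose base operands differ conservatively
// return false here.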
1082 
1083 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1084  const MachineBasicBlock *MBB,
1085  const MachineFunction &MF) const {
1086  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1087  return true;
1088  switch (MI.getOpcode()) {
1089  case AArch64::HINT:
1090  // CSDB hints are scheduling barriers.
1091  if (MI.getOperand(0).getImm() == 0x14)
1092  return true;
1093  break;
1094  case AArch64::DSB:
1095  case AArch64::ISB:
1096  // DSB and ISB also are scheduling barriers.
1097  return true;
1098  case AArch64::MSRpstatesvcrImm1:
1099  // SMSTART and SMSTOP are also scheduling barriers.
1100  return true;
1101  default:;
1102  }
1103  if (isSEHInstruction(MI))
1104  return true;
1105  auto Next = std::next(MI.getIterator());
1106  return Next != MBB->end() && Next->isCFIInstruction();
1107 }
1108 
1109 /// analyzeCompare - For a comparison instruction, return the source registers
1110 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1111 /// Return true if the comparison instruction can be analyzed.
1112 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1113  Register &SrcReg2, int64_t &CmpMask,
1114  int64_t &CmpValue) const {
1115  // The first operand can be a frame index where we'd normally expect a
1116  // register.
1117  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1118  if (!MI.getOperand(1).isReg())
1119  return false;
1120 
1121  switch (MI.getOpcode()) {
1122  default:
1123  break;
1124  case AArch64::PTEST_PP:
1125  case AArch64::PTEST_PP_ANY:
1126  SrcReg = MI.getOperand(0).getReg();
1127  SrcReg2 = MI.getOperand(1).getReg();
1128  // Not sure about the mask and value for now...
1129  CmpMask = ~0;
1130  CmpValue = 0;
1131  return true;
1132  case AArch64::SUBSWrr:
1133  case AArch64::SUBSWrs:
1134  case AArch64::SUBSWrx:
1135  case AArch64::SUBSXrr:
1136  case AArch64::SUBSXrs:
1137  case AArch64::SUBSXrx:
1138  case AArch64::ADDSWrr:
1139  case AArch64::ADDSWrs:
1140  case AArch64::ADDSWrx:
1141  case AArch64::ADDSXrr:
1142  case AArch64::ADDSXrs:
1143  case AArch64::ADDSXrx:
1144  // Replace SUBSWrr with SUBWrr if NZCV is not used.
1145  SrcReg = MI.getOperand(1).getReg();
1146  SrcReg2 = MI.getOperand(2).getReg();
1147  CmpMask = ~0;
1148  CmpValue = 0;
1149  return true;
1150  case AArch64::SUBSWri:
1151  case AArch64::ADDSWri:
1152  case AArch64::SUBSXri:
1153  case AArch64::ADDSXri:
1154  SrcReg = MI.getOperand(1).getReg();
1155  SrcReg2 = 0;
1156  CmpMask = ~0;
1157  CmpValue = MI.getOperand(2).getImm();
1158  return true;
1159  case AArch64::ANDSWri:
1160  case AArch64::ANDSXri:
1161  // ANDS does not use the same immediate encoding scheme as the other
1162  // xxxS instructions.
1163  SrcReg = MI.getOperand(1).getReg();
1164  SrcReg2 = 0;
1165  CmpMask = ~0;
1166  CmpValue = AArch64_AM::decodeLogicalImmediate(
1167  MI.getOperand(2).getImm(),
1168  MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1169  return true;
1170  }
1171 
1172  return false;
1173 }
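// For example (virtual register names illustrative), a compare such as
//   %2 = SUBSWri %1, 42, 0    ; cmp w, #42
// is reported as SrcReg = %1, SrcReg2 = 0, CmpMask = ~0, CmpValue = 42,
// while the register-register forms leave CmpValue at 0 and put the second
// source in SrcReg2.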
1174 
1175 static bool UpdateOperandRegClass(MachineInstr &Instr) {
1176  MachineBasicBlock *MBB = Instr.getParent();
1177  assert(MBB && "Can't get MachineBasicBlock here");
1178  MachineFunction *MF = MBB->getParent();
1179  assert(MF && "Can't get MachineFunction here");
1180  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1181  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1182  MachineRegisterInfo *MRI = &MF->getRegInfo();
1183 
1184  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1185  ++OpIdx) {
1186  MachineOperand &MO = Instr.getOperand(OpIdx);
1187  const TargetRegisterClass *OpRegCstraints =
1188  Instr.getRegClassConstraint(OpIdx, TII, TRI);
1189 
1190  // If there's no constraint, there's nothing to do.
1191  if (!OpRegCstraints)
1192  continue;
1193  // If the operand is a frame index, there's nothing to do here.
1194  // A frame index operand will resolve correctly during PEI.
1195  if (MO.isFI())
1196  continue;
1197 
1198  assert(MO.isReg() &&
1199  "Operand has register constraints without being a register!");
1200 
1201  Register Reg = MO.getReg();
1202  if (Reg.isPhysical()) {
1203  if (!OpRegCstraints->contains(Reg))
1204  return false;
1205  } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1206  !MRI->constrainRegClass(Reg, OpRegCstraints))
1207  return false;
1208  }
1209 
1210  return true;
1211 }
1212 
1213 /// Return the opcode that does not set flags when possible - otherwise
1214 /// return the original opcode. The caller is responsible to do the actual
1215 /// substitution and legality checking.
1216 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1217  // Don't convert all compare instructions, because for some the zero register
1218  // encoding becomes the sp register.
1219  bool MIDefinesZeroReg = false;
1220  if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
1221  MIDefinesZeroReg = true;
1222 
1223  switch (MI.getOpcode()) {
1224  default:
1225  return MI.getOpcode();
1226  case AArch64::ADDSWrr:
1227  return AArch64::ADDWrr;
1228  case AArch64::ADDSWri:
1229  return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1230  case AArch64::ADDSWrs:
1231  return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1232  case AArch64::ADDSWrx:
1233  return AArch64::ADDWrx;
1234  case AArch64::ADDSXrr:
1235  return AArch64::ADDXrr;
1236  case AArch64::ADDSXri:
1237  return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1238  case AArch64::ADDSXrs:
1239  return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1240  case AArch64::ADDSXrx:
1241  return AArch64::ADDXrx;
1242  case AArch64::SUBSWrr:
1243  return AArch64::SUBWrr;
1244  case AArch64::SUBSWri:
1245  return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1246  case AArch64::SUBSWrs:
1247  return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1248  case AArch64::SUBSWrx:
1249  return AArch64::SUBWrx;
1250  case AArch64::SUBSXrr:
1251  return AArch64::SUBXrr;
1252  case AArch64::SUBSXri:
1253  return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1254  case AArch64::SUBSXrs:
1255  return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1256  case AArch64::SUBSXrx:
1257  return AArch64::SUBXrx;
1258  }
1259 }
1260 
1261 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1262 
1263 /// True when condition flags are accessed (either by writing or reading)
1264 /// on the instruction trace starting at From and ending at To.
1265 ///
1266 /// Note: If From and To are from different blocks it's assumed CC are accessed
1267 /// on the path.
1268 static bool areCFlagsAccessedBetweenInstrs(
1269  MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1270  const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1271  // Early exit if To is at the beginning of the BB.
1272  if (To == To->getParent()->begin())
1273  return true;
1274 
1275  // Check whether the instructions are in the same basic block
1276  // If not, assume the condition flags might get modified somewhere.
1277  if (To->getParent() != From->getParent())
1278  return true;
1279 
1280  // From must be above To.
1281  assert(std::any_of(
1282  ++To.getReverse(), To->getParent()->rend(),
1283  [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1284 
1285  // We iterate backward starting at \p To until we hit \p From.
1286  for (const MachineInstr &Instr :
1287  instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
1288  if (((AccessToCheck & AK_Write) &&
1289  Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1290  ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1291  return true;
1292  }
1293  return false;
1294 }
1295 
1296 /// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1297 /// operation which could set the flags in an identical manner
1298 bool AArch64InstrInfo::optimizePTestInstr(
1299  MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1300  const MachineRegisterInfo *MRI) const {
1301  auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1302  auto *Pred = MRI->getUniqueVRegDef(PredReg);
1303  auto NewOp = Pred->getOpcode();
1304  bool OpChanged = false;
1305 
1306  unsigned MaskOpcode = Mask->getOpcode();
1307  unsigned PredOpcode = Pred->getOpcode();
1308  bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1309  bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1310 
1311  if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike) &&
1312  getElementSizeForOpcode(MaskOpcode) ==
1313  getElementSizeForOpcode(PredOpcode) &&
1314  Mask->getOperand(1).getImm() == 31) {
1315  // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1316  // redundant since WHILE performs an implicit PTEST with an all active
1317  // mask. Must be an all active predicate of matching element size.
1318 
1319  // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
1320  // PTEST_LIKE instruction uses the same all active mask and the element
1321  // size matches. If the PTEST has a condition of any then it is always
1322  // redundant.
1323  if (PredIsPTestLike) {
1324  auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1325  if (Mask != PTestLikeMask && PTest->getOpcode() != AArch64::PTEST_PP_ANY)
1326  return false;
1327  }
1328 
1329  // Fall through to simply remove the PTEST.
1330  } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike) &&
1331  PTest->getOpcode() == AArch64::PTEST_PP_ANY) {
1332  // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1333  // instruction that sets the flags as PTEST would. This is only valid when
1334  // the condition is any.
1335 
1336  // Fall through to simply remove the PTEST.
1337  } else if (PredIsPTestLike) {
1338  // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1339  // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1340  // on 8-bit predicates like the PTEST. Otherwise, for instructions like
1341  // compare that also support 16/32/64-bit predicates, the implicit PTEST
1342  // performed by the compare could consider fewer lanes for these element
1343  // sizes.
1344  //
1345  // For example, consider
1346  //
1347  // ptrue p0.b ; P0=1111-1111-1111-1111
1348  // index z0.s, #0, #1 ; Z0=<0,1,2,3>
1349  // index z1.s, #1, #1 ; Z1=<1,2,3,4>
1350  // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001
1351  // ; ^ last active
1352  // ptest p0, p1.b ; P1=0001-0001-0001-0001
1353  // ; ^ last active
1354  //
1355  // where the compare generates a canonical all active 32-bit predicate
1356  // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1357  // active flag, whereas the PTEST instruction with the same mask doesn't.
1358  // For PTEST_ANY this doesn't apply as the flags in this case would be
1359  // identical regardless of element size.
1360  auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1361  uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1362  if ((Mask != PTestLikeMask) ||
1363  (PredElementSize != AArch64::ElementSizeB &&
1364  PTest->getOpcode() != AArch64::PTEST_PP_ANY))
1365  return false;
1366 
1367  // Fall through to simply remove the PTEST.
1368  } else {
1369  // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1370  // opcode so the PTEST becomes redundant.
1371  switch (PredOpcode) {
1372  case AArch64::AND_PPzPP:
1373  case AArch64::BIC_PPzPP:
1374  case AArch64::EOR_PPzPP:
1375  case AArch64::NAND_PPzPP:
1376  case AArch64::NOR_PPzPP:
1377  case AArch64::ORN_PPzPP:
1378  case AArch64::ORR_PPzPP:
1379  case AArch64::BRKA_PPzP:
1380  case AArch64::BRKPA_PPzPP:
1381  case AArch64::BRKB_PPzP:
1382  case AArch64::BRKPB_PPzPP:
1383  case AArch64::RDFFR_PPz: {
1384  // Check to see if our mask is the same. If not the resulting flag bits
1385  // may be different and we can't remove the ptest.
1386  auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1387  if (Mask != PredMask)
1388  return false;
1389  break;
1390  }
1391  case AArch64::BRKN_PPzP: {
1392  // BRKN uses an all active implicit mask to set flags unlike the other
1393  // flag-setting instructions.
1394  // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1395  if ((MaskOpcode != AArch64::PTRUE_B) ||
1396  (Mask->getOperand(1).getImm() != 31))
1397  return false;
1398  break;
1399  }
1400  case AArch64::PTRUE_B:
1401  // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1402  break;
1403  default:
1404  // Bail out if we don't recognize the input
1405  return false;
1406  }
1407 
1408  NewOp = convertToFlagSettingOpc(PredOpcode);
1409  OpChanged = true;
1410  }
1411 
1412  const TargetRegisterInfo *TRI = &getRegisterInfo();
1413 
1414  // If another instruction between Pred and PTest accesses flags, don't remove
1415  // the ptest or update the earlier instruction to modify them.
1416  if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1417  return false;
1418 
1419  // If we pass all the checks, it's safe to remove the PTEST and use the flags
1420  // as they are prior to PTEST. Sometimes this requires the tested PTEST
1421  // operand to be replaced with an equivalent instruction that also sets the
1422  // flags.
1423  Pred->setDesc(get(NewOp));
1424  PTest->eraseFromParent();
1425  if (OpChanged) {
1426  bool succeeded = UpdateOperandRegClass(*Pred);
1427  (void)succeeded;
1428  assert(succeeded && "Operands have incompatible register classes!");
1429  Pred->addRegisterDefined(AArch64::NZCV, TRI);
1430  }
1431 
1432  // Ensure that the flags def is live.
1433  if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1434  unsigned i = 0, e = Pred->getNumOperands();
1435  for (; i != e; ++i) {
1436  MachineOperand &MO = Pred->getOperand(i);
1437  if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1438  MO.setIsDead(false);
1439  break;
1440  }
1441  }
1442  }
1443  return true;
1444 }
1445 
1446 /// Try to optimize a compare instruction. A compare instruction is an
1447 /// instruction which produces AArch64::NZCV. It is truly a compare
1448 /// instruction
1449 /// only when there are no uses of its destination register.
1450 ///
1451 /// The following steps are tried in order:
1452 /// 1. Convert CmpInstr into an unconditional version.
1453 /// 2. Remove CmpInstr if above there is an instruction producing a needed
1454 /// condition code or an instruction which can be converted into such an
1455 /// instruction.
1456 /// Only comparison with zero is supported.
1457 bool AArch64InstrInfo::optimizeCompareInstr(
1458  MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1459  int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1460  assert(CmpInstr.getParent());
1461  assert(MRI);
1462 
1463  // Replace SUBSWrr with SUBWrr if NZCV is not used.
1464  int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
1465  if (DeadNZCVIdx != -1) {
1466  if (CmpInstr.definesRegister(AArch64::WZR) ||
1467  CmpInstr.definesRegister(AArch64::XZR)) {
1468  CmpInstr.eraseFromParent();
1469  return true;
1470  }
1471  unsigned Opc = CmpInstr.getOpcode();
1472  unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1473  if (NewOpc == Opc)
1474  return false;
1475  const MCInstrDesc &MCID = get(NewOpc);
1476  CmpInstr.setDesc(MCID);
1477  CmpInstr.removeOperand(DeadNZCVIdx);
1478  bool succeeded = UpdateOperandRegClass(CmpInstr);
1479  (void)succeeded;
1480  assert(succeeded && "Some operands reg class are incompatible!");
1481  return true;
1482  }
1483 
1484  if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1485  CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
1486  return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
1487 
1488  if (SrcReg2 != 0)
1489  return false;
1490 
1491  // CmpInstr is a Compare instruction if destination register is not used.
1492  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1493  return false;
1494 
1495  if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
1496  return true;
1497  return (CmpValue == 0 || CmpValue == 1) &&
1498  removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
1499 }
1500 
1501 /// Get opcode of S version of Instr.
1502 /// If Instr is S version its opcode is returned.
1503 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
1504 /// or we are not interested in it.
1505 static unsigned sForm(MachineInstr &Instr) {
1506  switch (Instr.getOpcode()) {
1507  default:
1508  return AArch64::INSTRUCTION_LIST_END;
1509 
1510  case AArch64::ADDSWrr:
1511  case AArch64::ADDSWri:
1512  case AArch64::ADDSXrr:
1513  case AArch64::ADDSXri:
1514  case AArch64::SUBSWrr:
1515  case AArch64::SUBSWri:
1516  case AArch64::SUBSXrr:
1517  case AArch64::SUBSXri:
1518  return Instr.getOpcode();
1519 
1520  case AArch64::ADDWrr:
1521  return AArch64::ADDSWrr;
1522  case AArch64::ADDWri:
1523  return AArch64::ADDSWri;
1524  case AArch64::ADDXrr:
1525  return AArch64::ADDSXrr;
1526  case AArch64::ADDXri:
1527  return AArch64::ADDSXri;
1528  case AArch64::ADCWr:
1529  return AArch64::ADCSWr;
1530  case AArch64::ADCXr:
1531  return AArch64::ADCSXr;
1532  case AArch64::SUBWrr:
1533  return AArch64::SUBSWrr;
1534  case AArch64::SUBWri:
1535  return AArch64::SUBSWri;
1536  case AArch64::SUBXrr:
1537  return AArch64::SUBSXrr;
1538  case AArch64::SUBXri:
1539  return AArch64::SUBSXri;
1540  case AArch64::SBCWr:
1541  return AArch64::SBCSWr;
1542  case AArch64::SBCXr:
1543  return AArch64::SBCSXr;
1544  case AArch64::ANDWri:
1545  return AArch64::ANDSWri;
1546  case AArch64::ANDXri:
1547  return AArch64::ANDSXri;
1548  }
1549 }
1550 
1551 /// Check if AArch64::NZCV should be alive in successors of MBB.
1552 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
1553  for (auto *BB : MBB->successors())
1554  if (BB->isLiveIn(AArch64::NZCV))
1555  return true;
1556  return false;
1557 }
1558 
1559 /// \returns The condition code operand index for \p Instr if it is a branch
1560 /// or select and -1 otherwise.
1561 static int
1562 findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
1563  switch (Instr.getOpcode()) {
1564  default:
1565  return -1;
1566 
1567  case AArch64::Bcc: {
1568  int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1569  assert(Idx >= 2);
1570  return Idx - 2;
1571  }
1572 
1573  case AArch64::CSINVWr:
1574  case AArch64::CSINVXr:
1575  case AArch64::CSINCWr:
1576  case AArch64::CSINCXr:
1577  case AArch64::CSELWr:
1578  case AArch64::CSELXr:
1579  case AArch64::CSNEGWr:
1580  case AArch64::CSNEGXr:
1581  case AArch64::FCSELSrrr:
1582  case AArch64::FCSELDrrr: {
1583  int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1584  assert(Idx >= 1);
1585  return Idx - 1;
1586  }
1587  }
1588 }
1589 
1590 /// Find a condition code used by the instruction.
1591 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1592 /// codes or we don't optimize CmpInstr in the presence of such instructions.
1593 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1594  int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
1595  return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
1596  Instr.getOperand(CCIdx).getImm())
1597  : AArch64CC::Invalid;
1598 }
1599 
1602  UsedNZCV UsedFlags;
1603  switch (CC) {
1604  default:
1605  break;
1606 
1607  case AArch64CC::EQ: // Z set
1608  case AArch64CC::NE: // Z clear
1609  UsedFlags.Z = true;
1610  break;
1611 
1612  case AArch64CC::HI: // Z clear and C set
1613  case AArch64CC::LS: // Z set or C clear
1614  UsedFlags.Z = true;
1615  [[fallthrough]];
1616  case AArch64CC::HS: // C set
1617  case AArch64CC::LO: // C clear
1618  UsedFlags.C = true;
1619  break;
1620 
1621  case AArch64CC::MI: // N set
1622  case AArch64CC::PL: // N clear
1623  UsedFlags.N = true;
1624  break;
1625 
1626  case AArch64CC::VS: // V set
1627  case AArch64CC::VC: // V clear
1628  UsedFlags.V = true;
1629  break;
1630 
1631  case AArch64CC::GT: // Z clear, N and V the same
1632  case AArch64CC::LE: // Z set, N and V differ
1633  UsedFlags.Z = true;
1634  [[fallthrough]];
1635  case AArch64CC::GE: // N and V the same
1636  case AArch64CC::LT: // N and V differ
1637  UsedFlags.N = true;
1638  UsedFlags.V = true;
1639  break;
1640  }
1641  return UsedFlags;
1642 }
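// For example, getUsedNZCV(AArch64CC::GT) reports Z, N and V as used (GT
// tests Z clear with N == V), while getUsedNZCV(AArch64CC::EQ) reports only
// Z. The rewrites below (substituteCmpToZero, removeCmpToZeroOrOne) rely on
// this to bail out whenever C or V would still be observed.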
1643 
1644 /// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV
1645 /// flags are not alive in successors of the same \p CmpInstr and \p MI parent.
1646 /// \returns None otherwise.
1647 ///
1648 /// Collect instructions using that flags in \p CCUseInstrs if provided.
1649 static Optional<UsedNZCV>
1650 examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
1651  const TargetRegisterInfo &TRI,
1652  SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
1653  MachineBasicBlock *CmpParent = CmpInstr.getParent();
1654  if (MI.getParent() != CmpParent)
1655  return None;
1656 
1657  if (areCFlagsAliveInSuccessors(CmpParent))
1658  return None;
1659 
1660  UsedNZCV NZCVUsedAfterCmp;
1661  for (MachineInstr &Instr : instructionsWithoutDebug(
1662  std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
1663  if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
1664  AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1665  if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1666  return None;
1667  NZCVUsedAfterCmp |= getUsedNZCV(CC);
1668  if (CCUseInstrs)
1669  CCUseInstrs->push_back(&Instr);
1670  }
1671  if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
1672  break;
1673  }
1674  return NZCVUsedAfterCmp;
1675 }
1676 
1677 static bool isADDSRegImm(unsigned Opcode) {
1678  return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1679 }
1680 
1681 static bool isSUBSRegImm(unsigned Opcode) {
1682  return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1683 }
1684 
1685 /// Check if CmpInstr can be substituted by MI.
1686 ///
1687 /// CmpInstr can be substituted:
1688 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1689 /// - and, MI and CmpInstr are from the same MachineBB
1690 /// - and, condition flags are not alive in successors of the CmpInstr parent
1691 /// - and, if MI opcode is the S form there must be no defs of flags between
1692 /// MI and CmpInstr
1693 /// or if MI opcode is not the S form there must be neither defs of flags
1694 /// nor uses of flags between MI and CmpInstr.
1695 /// - and C/V flags are not used after CmpInstr
1696 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
1697  const TargetRegisterInfo &TRI) {
1698  assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
1699 
1700  const unsigned CmpOpcode = CmpInstr.getOpcode();
1701  if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1702  return false;
1703 
1704  Optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
1705  if (!NZVCUsed || NZVCUsed->C || NZVCUsed->V)
1706  return false;
1707 
1708  AccessKind AccessToCheck = AK_Write;
1709  if (sForm(MI) != MI.getOpcode())
1710  AccessToCheck = AK_All;
1711  return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
1712 }
1713 
1714 /// Substitute an instruction comparing to zero with another instruction
1715 /// which produces needed condition flags.
1716 ///
1717 /// Return true on success.
1718 bool AArch64InstrInfo::substituteCmpToZero(
1719  MachineInstr &CmpInstr, unsigned SrcReg,
1720  const MachineRegisterInfo &MRI) const {
1721  // Get the unique definition of SrcReg.
1722  MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1723  if (!MI)
1724  return false;
1725 
1726  const TargetRegisterInfo &TRI = getRegisterInfo();
1727 
1728  unsigned NewOpc = sForm(*MI);
1729  if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1730  return false;
1731 
1732  if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
1733  return false;
1734 
1735  // Update the instruction to set NZCV.
1736  MI->setDesc(get(NewOpc));
1737  CmpInstr.eraseFromParent();
1738  bool succeeded = UpdateOperandRegClass(*MI);
1739  (void)succeeded;
1740  assert(succeeded && "Some operands reg class are incompatible!");
1741  MI->addRegisterDefined(AArch64::NZCV, &TRI);
1742  return true;
1743 }
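// For example (virtual register names illustrative):
//   %3 = SUBWri %2, 5, 0
//   SUBS %3, 0              ; cmp %3, #0
//   b.eq ...
// can become
//   %3 = SUBSWri %2, 5, 0   ; now sets NZCV itself
//   b.eq ...
// provided nothing between the def and the compare touches NZCV and only
// flags the S-form produces identically (here Z and N) are read afterwards.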
1744 
1745 /// \returns True if \p CmpInstr can be removed.
1746 ///
1747 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition
1748 /// codes used in \p CCUseInstrs must be inverted.
1749 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
1750  int CmpValue, const TargetRegisterInfo &TRI,
1751  SmallVectorImpl<MachineInstr *> &CCUseInstrs,
1752  bool &IsInvertCC) {
1753  assert((CmpValue == 0 || CmpValue == 1) &&
1754  "Only comparisons to 0 or 1 considered for removal!");
1755 
1756  // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
1757  unsigned MIOpc = MI.getOpcode();
1758  if (MIOpc == AArch64::CSINCWr) {
1759  if (MI.getOperand(1).getReg() != AArch64::WZR ||
1760  MI.getOperand(2).getReg() != AArch64::WZR)
1761  return false;
1762  } else if (MIOpc == AArch64::CSINCXr) {
1763  if (MI.getOperand(1).getReg() != AArch64::XZR ||
1764  MI.getOperand(2).getReg() != AArch64::XZR)
1765  return false;
1766  } else {
1767  return false;
1768  }
1769  AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
1770  if (MICC == AArch64CC::Invalid)
1771  return false;
1772 
1773  // NZCV needs to be defined
1774  if (MI.findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
1775  return false;
1776 
1777  // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
1778  const unsigned CmpOpcode = CmpInstr.getOpcode();
1779  bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
1780  if (CmpValue && !IsSubsRegImm)
1781  return false;
1782  if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
1783  return false;
1784 
1785  // MI conditions allowed: eq, ne, mi, pl
1786  UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
1787  if (MIUsedNZCV.C || MIUsedNZCV.V)
1788  return false;
1789 
1790  Optional<UsedNZCV> NZCVUsedAfterCmp =
1791  examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
1792  // Condition flags are not used in CmpInstr basic block successors and only
1793  // Z or N flags allowed to be used after CmpInstr within its basic block
1794  if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
1795  return false;
1796  // Z or N flag used after CmpInstr must correspond to the flag used in MI
1797  if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
1798  (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
1799  return false;
1800  // If CmpInstr is a comparison to zero, MI conditions are limited to eq, ne
1801  if (MIUsedNZCV.N && !CmpValue)
1802  return false;
1803 
1804  // There must be no defs of flags between MI and CmpInstr
1805  if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
1806  return false;
1807 
1808  // Condition code is inverted in the following cases:
1809  // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1810  // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
1811  IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
1812  (!CmpValue && MICC == AArch64CC::NE);
1813  return true;
1814 }
1815 
1816 /// Remove comparison in csinc-cmp sequence
1817 ///
1818 /// Examples:
1819 /// 1. \code
1820 /// csinc w9, wzr, wzr, ne
1821 /// cmp w9, #0
1822 /// b.eq
1823 /// \endcode
1824 /// to
1825 /// \code
1826 /// csinc w9, wzr, wzr, ne
1827 /// b.ne
1828 /// \endcode
1829 ///
1830 /// 2. \code
1831 /// csinc x2, xzr, xzr, mi
1832 /// cmp x2, #1
1833 /// b.pl
1834 /// \endcode
1835 /// to
1836 /// \code
1837 /// csinc x2, xzr, xzr, mi
1838 /// b.pl
1839 /// \endcode
1840 ///
1841 /// \param CmpInstr comparison instruction
1842 /// \return True when comparison removed
1843 bool AArch64InstrInfo::removeCmpToZeroOrOne(
1844  MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
1845  const MachineRegisterInfo &MRI) const {
1846  MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1847  if (!MI)
1848  return false;
1849  const TargetRegisterInfo &TRI = getRegisterInfo();
1850  SmallVector<MachineInstr *, 4> CCUseInstrs;
1851  bool IsInvertCC = false;
1852  if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
1853  IsInvertCC))
1854  return false;
1855  // Make transformation
1856  CmpInstr.eraseFromParent();
1857  if (IsInvertCC) {
1858  // Invert condition codes in CmpInstr CC users
1859  for (MachineInstr *CCUseInstr : CCUseInstrs) {
1860  int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
1861  assert(Idx >= 0 && "Unexpected instruction using CC.");
1862  MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
1863  AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
1864  static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
1865  CCOperand.setImm(CCUse);
1866  }
1867  }
1868  return true;
1869 }
1870 
1871 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1872  if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1873  MI.getOpcode() != AArch64::CATCHRET)
1874  return false;
1875 
1876  MachineBasicBlock &MBB = *MI.getParent();
1877  auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1878  auto TRI = Subtarget.getRegisterInfo();
1879  DebugLoc DL = MI.getDebugLoc();
1880 
1881  if (MI.getOpcode() == AArch64::CATCHRET) {
1882  // Skip to the first instruction before the epilog.
1883  const TargetInstrInfo *TII =
1884  MBB.getParent()->getSubtarget().getInstrInfo();
1885  MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1886  auto MBBI = MachineBasicBlock::iterator(MI);
1887  MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1888  while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1889  FirstEpilogSEH != MBB.begin())
1890  FirstEpilogSEH = std::prev(FirstEpilogSEH);
1891  if (FirstEpilogSEH != MBB.begin())
1892  FirstEpilogSEH = std::next(FirstEpilogSEH);
1893  BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1894  .addReg(AArch64::X0, RegState::Define)
1895  .addMBB(TargetMBB);
1896  BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1897  .addReg(AArch64::X0, RegState::Define)
1898  .addReg(AArch64::X0)
1899  .addMBB(TargetMBB)
1900  .addImm(0);
1901  return true;
1902  }
1903 
1904  Register Reg = MI.getOperand(0).getReg();
1905  Module &M = *MBB.getParent()->getFunction().getParent();
1906  if (M.getStackProtectorGuard() == "sysreg") {
1907  const AArch64SysReg::SysReg *SrcReg =
1908  AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
1909  if (!SrcReg)
1910  report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
1911 
1912  // mrs xN, sysreg
1913  BuildMI(MBB, MI, DL, get(AArch64::MRS))
1914  .addDef(Reg)
1915  .addImm(SrcReg->Encoding);
1916  int Offset = M.getStackProtectorGuardOffset();
1917  if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
1918  // ldr xN, [xN, #offset]
1919  BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
1920  .addDef(Reg)
1921  .addUse(Reg, RegState::Kill)
1922  .addImm(Offset / 8);
1923  } else if (Offset >= -256 && Offset <= 255) {
1924  // ldur xN, [xN, #offset]
1925  BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
1926  .addDef(Reg)
1927  .addUse(Reg, RegState::Kill)
1928  .addImm(Offset);
1929  } else if (Offset >= -4095 && Offset <= 4095) {
1930  if (Offset > 0) {
1931  // add xN, xN, #offset
1932  BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
1933  .addDef(Reg)
1934  .addUse(Reg, RegState::Kill)
1935  .addImm(Offset)
1936  .addImm(0);
1937  } else {
1938  // sub xN, xN, #offset
1939  BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
1940  .addDef(Reg)
1941  .addUse(Reg, RegState::Kill)
1942  .addImm(-Offset)
1943  .addImm(0);
1944  }
1945  // ldr xN, [xN]
1946  BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
1947  .addDef(Reg)
1948  .addUse(Reg, RegState::Kill)
1949  .addImm(0);
1950  } else {
1951  // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
1952  // than 32760.
1953  // It might be nice to use AArch64::MOVi32imm here, which would get
1954  // expanded in PreSched2 after PostRA, but our lone scratch Reg already
1955  // contains the MRS result. findScratchNonCalleeSaveRegister() in
1956  // AArch64FrameLowering might help us find such a scratch register
1957  // though. If we failed to find a scratch register, we could emit a
1958  // stream of add instructions to build up the immediate. Or, we could try
1959  // to insert a AArch64::MOVi32imm before register allocation so that we
1960  // didn't need to scavenge for a scratch register.
1961  report_fatal_error("Unable to encode Stack Protector Guard Offset");
1962  }
1963  MBB.erase(MI);
1964  return true;
1965  }
1966 
1967  const GlobalValue *GV =
1968  cast<GlobalValue>((*MI.memoperands_begin())->getValue());
1969  const TargetMachine &TM = MBB.getParent()->getTarget();
1970  unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
1971  const unsigned char MO_NC = AArch64II::MO_NC;
1972 
1973  if ((OpFlags & AArch64II::MO_GOT) != 0) {
1974  BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
1975  .addGlobalAddress(GV, 0, OpFlags);
1976  if (Subtarget.isTargetILP32()) {
1977  unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
1978  BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
1979  .addDef(Reg32, RegState::Dead)
1980  .addUse(Reg, RegState::Kill)
1981  .addImm(0)
1982  .addMemOperand(*MI.memoperands_begin())
1983  .addDef(Reg, RegState::Implicit);
1984  } else {
1985  BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1986  .addReg(Reg, RegState::Kill)
1987  .addImm(0)
1988  .addMemOperand(*MI.memoperands_begin());
1989  }
1990  } else if (TM.getCodeModel() == CodeModel::Large) {
1991  assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
1992  BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
1993  .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
1994  .addImm(0);
1995  BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1996  .addReg(Reg, RegState::Kill)
1997  .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
1998  .addImm(16);
1999  BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2000  .addReg(Reg, RegState::Kill)
2001  .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2002  .addImm(32);
2003  BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2004  .addReg(Reg, RegState::Kill)
2005  .addGlobalAddress(GV, 0, AArch64II::MO_G3)
2006  .addImm(48);
2007  BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2008  .addReg(Reg, RegState::Kill)
2009  .addImm(0)
2010  .addMemOperand(*MI.memoperands_begin());
2011  } else if (TM.getCodeModel() == CodeModel::Tiny) {
2012  BuildMI(MBB, MI, DL, get(AArch64::LDRXl), Reg)
2013  .addGlobalAddress(GV, 0, OpFlags);
2014  } else {
2015  BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2016  .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2017  unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2018  if (Subtarget.isTargetILP32()) {
2019  unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2020  BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2021  .addDef(Reg32, RegState::Dead)
2022  .addUse(Reg, RegState::Kill)
2023  .addGlobalAddress(GV, 0, LoFlags)
2024  .addMemOperand(*MI.memoperands_begin())
2025  .addDef(Reg, RegState::Implicit);
2026  } else {
2027  BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2028  .addReg(Reg, RegState::Kill)
2029  .addGlobalAddress(GV, 0, LoFlags)
2030  .addMemOperand(*MI.memoperands_begin());
2031  }
2032  }
2033 
2034  MBB.erase(MI);
2035 
2036  return true;
2037 }
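// Illustrative sketch (not part of the original source): for the common
// GOT-relative case, LOAD_STACK_GUARD typically ends up as something like
//   adrp x0, :got:__stack_chk_guard
//   ldr  x0, [x0, :got_lo12:__stack_chk_guard]
//   ldr  x0, [x0]
// where the first two instructions come from the LOADgot pseudo built above
// and the trailing load reads the guard value itself. The exact sequence
// depends on the code model and the "sysreg" stack-protector mode handled
// earlier in this function.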
2038 
2039 // Return true if this instruction simply sets its single destination register
2040 // to zero. This is equivalent to a register rename of the zero-register.
2041 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2042  switch (MI.getOpcode()) {
2043  default:
2044  break;
2045  case AArch64::MOVZWi:
2046  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2047  if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2048  assert(MI.getDesc().getNumOperands() == 3 &&
2049  MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2050  return true;
2051  }
2052  break;
2053  case AArch64::ANDWri: // and Rd, Rzr, #imm
2054  return MI.getOperand(1).getReg() == AArch64::WZR;
2055  case AArch64::ANDXri:
2056  return MI.getOperand(1).getReg() == AArch64::XZR;
2057  case TargetOpcode::COPY:
2058  return MI.getOperand(1).getReg() == AArch64::WZR;
2059  }
2060  return false;
2061 }
2062 
2063 // Return true if this instruction simply renames a general register without
2064 // modifying bits.
2065 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2066  switch (MI.getOpcode()) {
2067  default:
2068  break;
2069  case TargetOpcode::COPY: {
2070  // GPR32 copies will be lowered to ORRXrs
2071  Register DstReg = MI.getOperand(0).getReg();
2072  return (AArch64::GPR32RegClass.contains(DstReg) ||
2073  AArch64::GPR64RegClass.contains(DstReg));
2074  }
2075  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2076  if (MI.getOperand(1).getReg() == AArch64::XZR) {
2077  assert(MI.getDesc().getNumOperands() == 4 &&
2078  MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2079  return true;
2080  }
2081  break;
2082  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2083  if (MI.getOperand(2).getImm() == 0) {
2084  assert(MI.getDesc().getNumOperands() == 4 &&
2085  MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2086  return true;
2087  }
2088  break;
2089  }
2090  return false;
2091 }
2092 
2093 // Return true if this instruction simply renames a floating-point/vector
2094 // register without modifying bits.
2095 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2096  switch (MI.getOpcode()) {
2097  default:
2098  break;
2099  case TargetOpcode::COPY: {
2100  Register DstReg = MI.getOperand(0).getReg();
2101  return AArch64::FPR128RegClass.contains(DstReg);
2102  }
2103  case AArch64::ORRv16i8:
2104  if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2105  assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2106  "invalid ORRv16i8 operands");
2107  return true;
2108  }
2109  break;
2110  }
2111  return false;
2112 }
2113 
2114 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2115  int &FrameIndex) const {
2116  switch (MI.getOpcode()) {
2117  default:
2118  break;
2119  case AArch64::LDRWui:
2120  case AArch64::LDRXui:
2121  case AArch64::LDRBui:
2122  case AArch64::LDRHui:
2123  case AArch64::LDRSui:
2124  case AArch64::LDRDui:
2125  case AArch64::LDRQui:
2126  if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2127  MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2128  FrameIndex = MI.getOperand(1).getIndex();
2129  return MI.getOperand(0).getReg();
2130  }
2131  break;
2132  }
2133 
2134  return 0;
2135 }
2136 
2137 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2138  int &FrameIndex) const {
2139  switch (MI.getOpcode()) {
2140  default:
2141  break;
2142  case AArch64::STRWui:
2143  case AArch64::STRXui:
2144  case AArch64::STRBui:
2145  case AArch64::STRHui:
2146  case AArch64::STRSui:
2147  case AArch64::STRDui:
2148  case AArch64::STRQui:
2149  case AArch64::LDR_PXI:
2150  case AArch64::STR_PXI:
2151  if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2152  MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2153  FrameIndex = MI.getOperand(1).getIndex();
2154  return MI.getOperand(0).getReg();
2155  }
2156  break;
2157  }
2158  return 0;
2159 }
2160 
2161 /// Check all MachineMemOperands for a hint to suppress pairing.
2162 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2163  return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2164  return MMO->getFlags() & MOSuppressPair;
2165  });
2166 }
2167 
2168 /// Set a flag on the first MachineMemOperand to suppress pairing.
2169 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2170  if (MI.memoperands_empty())
2171  return;
2172  (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2173 }
2174 
2175 /// Check all MachineMemOperands for a hint that the load/store is strided.
2176 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2177  return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2178  return MMO->getFlags() & MOStridedAccess;
2179  });
2180 }
2181 
2182 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2183  switch (Opc) {
2184  default:
2185  return false;
2186  case AArch64::STURSi:
2187  case AArch64::STRSpre:
2188  case AArch64::STURDi:
2189  case AArch64::STRDpre:
2190  case AArch64::STURQi:
2191  case AArch64::STRQpre:
2192  case AArch64::STURBBi:
2193  case AArch64::STURHHi:
2194  case AArch64::STURWi:
2195  case AArch64::STRWpre:
2196  case AArch64::STURXi:
2197  case AArch64::STRXpre:
2198  case AArch64::LDURSi:
2199  case AArch64::LDRSpre:
2200  case AArch64::LDURDi:
2201  case AArch64::LDRDpre:
2202  case AArch64::LDURQi:
2203  case AArch64::LDRQpre:
2204  case AArch64::LDURWi:
2205  case AArch64::LDRWpre:
2206  case AArch64::LDURXi:
2207  case AArch64::LDRXpre:
2208  case AArch64::LDURSWi:
2209  case AArch64::LDURHHi:
2210  case AArch64::LDURBBi:
2211  case AArch64::LDURSBWi:
2212  case AArch64::LDURSHWi:
2213  return true;
2214  }
2215 }
2216 
2217 Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2218  switch (Opc) {
2219  default: return {};
2220  case AArch64::PRFMui: return AArch64::PRFUMi;
2221  case AArch64::LDRXui: return AArch64::LDURXi;
2222  case AArch64::LDRWui: return AArch64::LDURWi;
2223  case AArch64::LDRBui: return AArch64::LDURBi;
2224  case AArch64::LDRHui: return AArch64::LDURHi;
2225  case AArch64::LDRSui: return AArch64::LDURSi;
2226  case AArch64::LDRDui: return AArch64::LDURDi;
2227  case AArch64::LDRQui: return AArch64::LDURQi;
2228  case AArch64::LDRBBui: return AArch64::LDURBBi;
2229  case AArch64::LDRHHui: return AArch64::LDURHHi;
2230  case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2231  case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2232  case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2233  case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2234  case AArch64::LDRSWui: return AArch64::LDURSWi;
2235  case AArch64::STRXui: return AArch64::STURXi;
2236  case AArch64::STRWui: return AArch64::STURWi;
2237  case AArch64::STRBui: return AArch64::STURBi;
2238  case AArch64::STRHui: return AArch64::STURHi;
2239  case AArch64::STRSui: return AArch64::STURSi;
2240  case AArch64::STRDui: return AArch64::STURDi;
2241  case AArch64::STRQui: return AArch64::STURQi;
2242  case AArch64::STRBBui: return AArch64::STURBBi;
2243  case AArch64::STRHHui: return AArch64::STURHHi;
2244  }
2245 }
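// Illustrative use (not part of the original source, assuming the accessor
// shown above): a pass that needs to re-encode a scaled access whose offset
// no longer fits the scaled immediate can query this mapping, e.g.
//   if (Optional<unsigned> Unscaled =
//           AArch64InstrInfo::getUnscaledLdSt(AArch64::LDRXui))
//     NewOpc = *Unscaled; // AArch64::LDURXi, byte-granular signed offset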
2246 
2247 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2248  switch (Opc) {
2249  default:
2250  return 2;
2251  case AArch64::LDPXi:
2252  case AArch64::LDPDi:
2253  case AArch64::STPXi:
2254  case AArch64::STPDi:
2255  case AArch64::LDNPXi:
2256  case AArch64::LDNPDi:
2257  case AArch64::STNPXi:
2258  case AArch64::STNPDi:
2259  case AArch64::LDPQi:
2260  case AArch64::STPQi:
2261  case AArch64::LDNPQi:
2262  case AArch64::STNPQi:
2263  case AArch64::LDPWi:
2264  case AArch64::LDPSi:
2265  case AArch64::STPWi:
2266  case AArch64::STPSi:
2267  case AArch64::LDNPWi:
2268  case AArch64::LDNPSi:
2269  case AArch64::STNPWi:
2270  case AArch64::STNPSi:
2271  case AArch64::LDG:
2272  case AArch64::STGPi:
2273 
2274  case AArch64::LD1B_IMM:
2275  case AArch64::LD1B_H_IMM:
2276  case AArch64::LD1B_S_IMM:
2277  case AArch64::LD1B_D_IMM:
2278  case AArch64::LD1SB_H_IMM:
2279  case AArch64::LD1SB_S_IMM:
2280  case AArch64::LD1SB_D_IMM:
2281  case AArch64::LD1H_IMM:
2282  case AArch64::LD1H_S_IMM:
2283  case AArch64::LD1H_D_IMM:
2284  case AArch64::LD1SH_S_IMM:
2285  case AArch64::LD1SH_D_IMM:
2286  case AArch64::LD1W_IMM:
2287  case AArch64::LD1W_D_IMM:
2288  case AArch64::LD1SW_D_IMM:
2289  case AArch64::LD1D_IMM:
2290 
2291  case AArch64::LD2B_IMM:
2292  case AArch64::LD2H_IMM:
2293  case AArch64::LD2W_IMM:
2294  case AArch64::LD2D_IMM:
2295  case AArch64::LD3B_IMM:
2296  case AArch64::LD3H_IMM:
2297  case AArch64::LD3W_IMM:
2298  case AArch64::LD3D_IMM:
2299  case AArch64::LD4B_IMM:
2300  case AArch64::LD4H_IMM:
2301  case AArch64::LD4W_IMM:
2302  case AArch64::LD4D_IMM:
2303 
2304  case AArch64::ST1B_IMM:
2305  case AArch64::ST1B_H_IMM:
2306  case AArch64::ST1B_S_IMM:
2307  case AArch64::ST1B_D_IMM:
2308  case AArch64::ST1H_IMM:
2309  case AArch64::ST1H_S_IMM:
2310  case AArch64::ST1H_D_IMM:
2311  case AArch64::ST1W_IMM:
2312  case AArch64::ST1W_D_IMM:
2313  case AArch64::ST1D_IMM:
2314 
2315  case AArch64::ST2B_IMM:
2316  case AArch64::ST2H_IMM:
2317  case AArch64::ST2W_IMM:
2318  case AArch64::ST2D_IMM:
2319  case AArch64::ST3B_IMM:
2320  case AArch64::ST3H_IMM:
2321  case AArch64::ST3W_IMM:
2322  case AArch64::ST3D_IMM:
2323  case AArch64::ST4B_IMM:
2324  case AArch64::ST4H_IMM:
2325  case AArch64::ST4W_IMM:
2326  case AArch64::ST4D_IMM:
2327 
2328  case AArch64::LD1RB_IMM:
2329  case AArch64::LD1RB_H_IMM:
2330  case AArch64::LD1RB_S_IMM:
2331  case AArch64::LD1RB_D_IMM:
2332  case AArch64::LD1RSB_H_IMM:
2333  case AArch64::LD1RSB_S_IMM:
2334  case AArch64::LD1RSB_D_IMM:
2335  case AArch64::LD1RH_IMM:
2336  case AArch64::LD1RH_S_IMM:
2337  case AArch64::LD1RH_D_IMM:
2338  case AArch64::LD1RSH_S_IMM:
2339  case AArch64::LD1RSH_D_IMM:
2340  case AArch64::LD1RW_IMM:
2341  case AArch64::LD1RW_D_IMM:
2342  case AArch64::LD1RSW_IMM:
2343  case AArch64::LD1RD_IMM:
2344 
2345  case AArch64::LDNT1B_ZRI:
2346  case AArch64::LDNT1H_ZRI:
2347  case AArch64::LDNT1W_ZRI:
2348  case AArch64::LDNT1D_ZRI:
2349  case AArch64::STNT1B_ZRI:
2350  case AArch64::STNT1H_ZRI:
2351  case AArch64::STNT1W_ZRI:
2352  case AArch64::STNT1D_ZRI:
2353 
2354  case AArch64::LDNF1B_IMM:
2355  case AArch64::LDNF1B_H_IMM:
2356  case AArch64::LDNF1B_S_IMM:
2357  case AArch64::LDNF1B_D_IMM:
2358  case AArch64::LDNF1SB_H_IMM:
2359  case AArch64::LDNF1SB_S_IMM:
2360  case AArch64::LDNF1SB_D_IMM:
2361  case AArch64::LDNF1H_IMM:
2362  case AArch64::LDNF1H_S_IMM:
2363  case AArch64::LDNF1H_D_IMM:
2364  case AArch64::LDNF1SH_S_IMM:
2365  case AArch64::LDNF1SH_D_IMM:
2366  case AArch64::LDNF1W_IMM:
2367  case AArch64::LDNF1W_D_IMM:
2368  case AArch64::LDNF1SW_D_IMM:
2369  case AArch64::LDNF1D_IMM:
2370  return 3;
2371  case AArch64::ADDG:
2372  case AArch64::STGOffset:
2373  case AArch64::LDR_PXI:
2374  case AArch64::STR_PXI:
2375  return 2;
2376  }
2377 }
2378 
2379 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
2380  switch (MI.getOpcode()) {
2381  default:
2382  return false;
2383  // Scaled instructions.
2384  case AArch64::STRSui:
2385  case AArch64::STRDui:
2386  case AArch64::STRQui:
2387  case AArch64::STRXui:
2388  case AArch64::STRWui:
2389  case AArch64::LDRSui:
2390  case AArch64::LDRDui:
2391  case AArch64::LDRQui:
2392  case AArch64::LDRXui:
2393  case AArch64::LDRWui:
2394  case AArch64::LDRSWui:
2395  // Unscaled instructions.
2396  case AArch64::STURSi:
2397  case AArch64::STRSpre:
2398  case AArch64::STURDi:
2399  case AArch64::STRDpre:
2400  case AArch64::STURQi:
2401  case AArch64::STRQpre:
2402  case AArch64::STURWi:
2403  case AArch64::STRWpre:
2404  case AArch64::STURXi:
2405  case AArch64::STRXpre:
2406  case AArch64::LDURSi:
2407  case AArch64::LDRSpre:
2408  case AArch64::LDURDi:
2409  case AArch64::LDRDpre:
2410  case AArch64::LDURQi:
2411  case AArch64::LDRQpre:
2412  case AArch64::LDURWi:
2413  case AArch64::LDRWpre:
2414  case AArch64::LDURXi:
2415  case AArch64::LDRXpre:
2416  case AArch64::LDURSWi:
2417  return true;
2418  }
2419 }
2420 
2421 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
2422  switch (Opc) {
2423  default:
2424  llvm_unreachable("Opcode has no flag setting equivalent!");
2425  // 32-bit cases:
2426  case AArch64::ADDWri:
2427  return AArch64::ADDSWri;
2428  case AArch64::ADDWrr:
2429  return AArch64::ADDSWrr;
2430  case AArch64::ADDWrs:
2431  return AArch64::ADDSWrs;
2432  case AArch64::ADDWrx:
2433  return AArch64::ADDSWrx;
2434  case AArch64::ANDWri:
2435  return AArch64::ANDSWri;
2436  case AArch64::ANDWrr:
2437  return AArch64::ANDSWrr;
2438  case AArch64::ANDWrs:
2439  return AArch64::ANDSWrs;
2440  case AArch64::BICWrr:
2441  return AArch64::BICSWrr;
2442  case AArch64::BICWrs:
2443  return AArch64::BICSWrs;
2444  case AArch64::SUBWri:
2445  return AArch64::SUBSWri;
2446  case AArch64::SUBWrr:
2447  return AArch64::SUBSWrr;
2448  case AArch64::SUBWrs:
2449  return AArch64::SUBSWrs;
2450  case AArch64::SUBWrx:
2451  return AArch64::SUBSWrx;
2452  // 64-bit cases:
2453  case AArch64::ADDXri:
2454  return AArch64::ADDSXri;
2455  case AArch64::ADDXrr:
2456  return AArch64::ADDSXrr;
2457  case AArch64::ADDXrs:
2458  return AArch64::ADDSXrs;
2459  case AArch64::ADDXrx:
2460  return AArch64::ADDSXrx;
2461  case AArch64::ANDXri:
2462  return AArch64::ANDSXri;
2463  case AArch64::ANDXrr:
2464  return AArch64::ANDSXrr;
2465  case AArch64::ANDXrs:
2466  return AArch64::ANDSXrs;
2467  case AArch64::BICXrr:
2468  return AArch64::BICSXrr;
2469  case AArch64::BICXrs:
2470  return AArch64::BICSXrs;
2471  case AArch64::SUBXri:
2472  return AArch64::SUBSXri;
2473  case AArch64::SUBXrr:
2474  return AArch64::SUBSXrr;
2475  case AArch64::SUBXrs:
2476  return AArch64::SUBSXrs;
2477  case AArch64::SUBXrx:
2478  return AArch64::SUBSXrx;
2479  // SVE instructions:
2480  case AArch64::AND_PPzPP:
2481  return AArch64::ANDS_PPzPP;
2482  case AArch64::BIC_PPzPP:
2483  return AArch64::BICS_PPzPP;
2484  case AArch64::EOR_PPzPP:
2485  return AArch64::EORS_PPzPP;
2486  case AArch64::NAND_PPzPP:
2487  return AArch64::NANDS_PPzPP;
2488  case AArch64::NOR_PPzPP:
2489  return AArch64::NORS_PPzPP;
2490  case AArch64::ORN_PPzPP:
2491  return AArch64::ORNS_PPzPP;
2492  case AArch64::ORR_PPzPP:
2493  return AArch64::ORRS_PPzPP;
2494  case AArch64::BRKA_PPzP:
2495  return AArch64::BRKAS_PPzP;
2496  case AArch64::BRKPA_PPzPP:
2497  return AArch64::BRKPAS_PPzPP;
2498  case AArch64::BRKB_PPzP:
2499  return AArch64::BRKBS_PPzP;
2500  case AArch64::BRKPB_PPzPP:
2501  return AArch64::BRKPBS_PPzPP;
2502  case AArch64::BRKN_PPzP:
2503  return AArch64::BRKNS_PPzP;
2504  case AArch64::RDFFR_PPz:
2505  return AArch64::RDFFRS_PPz;
2506  case AArch64::PTRUE_B:
2507  return AArch64::PTRUES_B;
2508  }
2509 }
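// Illustrative sketch (not part of the original source): the compare-folding
// peepholes above rely on this table, e.g. turning
//   and w8, w0, #0xff      ; ANDWri
//   cmp w8, #0
// into
//   ands w8, w0, #0xff     ; ANDSWri == convertToFlagSettingOpc(ANDWri)
// so users of NZCV can branch on the flags set by the AND directly.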
2510 
2511 // Is this a candidate for ld/st merging or pairing? For example, we don't
2512 // touch volatiles or load/stores that have a hint to avoid pair formation.
2513 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
2514 
2515  bool IsPreLdSt = isPreLdSt(MI);
2516 
2517  // If this is a volatile load/store, don't mess with it.
2518  if (MI.hasOrderedMemoryRef())
2519  return false;
2520 
2521  // Make sure this is a reg/fi+imm (as opposed to an address reloc).
2522  // For Pre-inc LD/ST, the operand is shifted by one.
2523  assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
2524  MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
2525  "Expected a reg or frame index operand.");
2526 
2527  // For Pre-indexed addressing quadword instructions, the third operand is the
2528  // immediate value.
2529  bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
2530 
2531  if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
2532  return false;
2533 
2534  // Can't merge/pair if the instruction modifies the base register.
2535  // e.g., ldr x0, [x0]
2536  // This case will never occur with an FI base.
2537  // However, if the instruction is an LDR/STR<S,D,Q,W,X>pre, it can be merged.
2538  // For example:
2539  // ldr q0, [x11, #32]!
2540  // ldr q1, [x11, #16]
2541  // to
2542  // ldp q0, q1, [x11, #32]!
2543  if (MI.getOperand(1).isReg() && !IsPreLdSt) {
2544  Register BaseReg = MI.getOperand(1).getReg();
2545  const TargetRegisterInfo *TRI = &getRegisterInfo();
2546  if (MI.modifiesRegister(BaseReg, TRI))
2547  return false;
2548  }
2549 
2550  // Check if this load/store has a hint to avoid pair formation.
2551  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2552  if (isLdStPairSuppressed(MI))
2553  return false;
2554 
2555  // Do not pair any callee-save store/reload instructions in the
2556  // prologue/epilogue if the CFI information encoded the operations as separate
2557  // instructions, as that will cause the size of the actual prologue to mismatch
2558  // with the prologue size recorded in the Windows CFI.
2559  const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2560  bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2561  MI.getMF()->getFunction().needsUnwindTableEntry();
2562  if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
2563  MI.getFlag(MachineInstr::FrameDestroy)))
2564  return false;
2565 
2566  // On some CPUs quad load/store pairs are slower than two single load/stores.
2567  if (Subtarget.isPaired128Slow()) {
2568  switch (MI.getOpcode()) {
2569  default:
2570  break;
2571  case AArch64::LDURQi:
2572  case AArch64::STURQi:
2573  case AArch64::LDRQui:
2574  case AArch64::STRQui:
2575  return false;
2576  }
2577  }
2578 
2579  return true;
2580 }
2581 
2582 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
2583  const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
2584  int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
2585  const TargetRegisterInfo *TRI) const {
2586  if (!LdSt.mayLoadOrStore())
2587  return false;
2588 
2589  const MachineOperand *BaseOp;
2590  if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
2591  Width, TRI))
2592  return false;
2593  BaseOps.push_back(BaseOp);
2594  return true;
2595 }
2596 
2597 Optional<ExtAddrMode>
2598 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
2599  const TargetRegisterInfo *TRI) const {
2600  const MachineOperand *Base; // Filled with the base operand of MI.
2601  int64_t Offset; // Filled with the offset of MI.
2602  bool OffsetIsScalable;
2603  if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
2604  return None;
2605 
2606  if (!Base->isReg())
2607  return None;
2608  ExtAddrMode AM;
2609  AM.BaseReg = Base->getReg();
2610  AM.Displacement = Offset;
2611  AM.ScaledReg = 0;
2612  AM.Scale = 0;
2613  return AM;
2614 }
2615 
2616 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
2617  const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
2618  bool &OffsetIsScalable, unsigned &Width,
2619  const TargetRegisterInfo *TRI) const {
2620  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
2621  // Handle only loads/stores with base register followed by immediate offset.
2622  if (LdSt.getNumExplicitOperands() == 3) {
2623  // Non-paired instruction (e.g., ldr x1, [x0, #8]).
2624  if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
2625  !LdSt.getOperand(2).isImm())
2626  return false;
2627  } else if (LdSt.getNumExplicitOperands() == 4) {
2628  // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
2629  if (!LdSt.getOperand(1).isReg() ||
2630  (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
2631  !LdSt.getOperand(3).isImm())
2632  return false;
2633  } else
2634  return false;
2635 
2636  // Get the scaling factor for the instruction and set the width for the
2637  // instruction.
2638  TypeSize Scale(0U, false);
2639  int64_t Dummy1, Dummy2;
2640 
2641  // If this returns false, then it's an instruction we don't want to handle.
2642  if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
2643  return false;
2644 
2645  // Compute the offset. Offset is calculated as the immediate operand
2646  // multiplied by the scaling factor. Unscaled instructions have scaling factor
2647  // set to 1.
2648  if (LdSt.getNumExplicitOperands() == 3) {
2649  BaseOp = &LdSt.getOperand(1);
2650  Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinSize();
2651  } else {
2652  assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
2653  BaseOp = &LdSt.getOperand(2);
2654  Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinSize();
2655  }
2656  OffsetIsScalable = Scale.isScalable();
2657 
2658  if (!BaseOp->isReg() && !BaseOp->isFI())
2659  return false;
2660 
2661  return true;
2662 }
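// Worked example (not part of the original source): for "ldr x1, [x0, #16]"
// (LDRXui with immediate 2) getMemOpInfo reports Scale = 8, so the returned
// byte Offset is 2 * 8 = 16, Width = 8 and OffsetIsScalable = false. For an
// SVE access such as "ld1d { z0.d }, p0/z, [x0, #1, mul vl]" (LD1D_IMM) the
// scale is a scalable 16 bytes, so Offset = 1 * 16 with OffsetIsScalable =
// true, i.e. the real byte offset is that value multiplied by vscale.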
2663 
2664 MachineOperand &
2665 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
2666  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
2667  MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
2668  assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
2669  return OfsOp;
2670 }
2671 
2672 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
2673  unsigned &Width, int64_t &MinOffset,
2674  int64_t &MaxOffset) {
2675  const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8;
2676  switch (Opcode) {
2677  // Not a memory operation or something we want to handle.
2678  default:
2679  Scale = TypeSize::Fixed(0);
2680  Width = 0;
2681  MinOffset = MaxOffset = 0;
2682  return false;
2683  case AArch64::STRWpost:
2684  case AArch64::LDRWpost:
2685  Width = 32;
2686  Scale = TypeSize::Fixed(4);
2687  MinOffset = -256;
2688  MaxOffset = 255;
2689  break;
2690  case AArch64::LDURQi:
2691  case AArch64::STURQi:
2692  Width = 16;
2693  Scale = TypeSize::Fixed(1);
2694  MinOffset = -256;
2695  MaxOffset = 255;
2696  break;
2697  case AArch64::PRFUMi:
2698  case AArch64::LDURXi:
2699  case AArch64::LDURDi:
2700  case AArch64::STURXi:
2701  case AArch64::STURDi:
2702  Width = 8;
2703  Scale = TypeSize::Fixed(1);
2704  MinOffset = -256;
2705  MaxOffset = 255;
2706  break;
2707  case AArch64::LDURWi:
2708  case AArch64::LDURSi:
2709  case AArch64::LDURSWi:
2710  case AArch64::STURWi:
2711  case AArch64::STURSi:
2712  Width = 4;
2713  Scale = TypeSize::Fixed(1);
2714  MinOffset = -256;
2715  MaxOffset = 255;
2716  break;
2717  case AArch64::LDURHi:
2718  case AArch64::LDURHHi:
2719  case AArch64::LDURSHXi:
2720  case AArch64::LDURSHWi:
2721  case AArch64::STURHi:
2722  case AArch64::STURHHi:
2723  Width = 2;
2724  Scale = TypeSize::Fixed(1);
2725  MinOffset = -256;
2726  MaxOffset = 255;
2727  break;
2728  case AArch64::LDURBi:
2729  case AArch64::LDURBBi:
2730  case AArch64::LDURSBXi:
2731  case AArch64::LDURSBWi:
2732  case AArch64::STURBi:
2733  case AArch64::STURBBi:
2734  Width = 1;
2735  Scale = TypeSize::Fixed(1);
2736  MinOffset = -256;
2737  MaxOffset = 255;
2738  break;
2739  case AArch64::LDPQi:
2740  case AArch64::LDNPQi:
2741  case AArch64::STPQi:
2742  case AArch64::STNPQi:
2743  Scale = TypeSize::Fixed(16);
2744  Width = 32;
2745  MinOffset = -64;
2746  MaxOffset = 63;
2747  break;
2748  case AArch64::LDRQui:
2749  case AArch64::STRQui:
2750  Scale = TypeSize::Fixed(16);
2751  Width = 16;
2752  MinOffset = 0;
2753  MaxOffset = 4095;
2754  break;
2755  case AArch64::LDPXi:
2756  case AArch64::LDPDi:
2757  case AArch64::LDNPXi:
2758  case AArch64::LDNPDi:
2759  case AArch64::STPXi:
2760  case AArch64::STPDi:
2761  case AArch64::STNPXi:
2762  case AArch64::STNPDi:
2763  Scale = TypeSize::Fixed(8);
2764  Width = 16;
2765  MinOffset = -64;
2766  MaxOffset = 63;
2767  break;
2768  case AArch64::PRFMui:
2769  case AArch64::LDRXui:
2770  case AArch64::LDRDui:
2771  case AArch64::STRXui:
2772  case AArch64::STRDui:
2773  Scale = TypeSize::Fixed(8);
2774  Width = 8;
2775  MinOffset = 0;
2776  MaxOffset = 4095;
2777  break;
2778  case AArch64::StoreSwiftAsyncContext:
2779  // Store is an STRXui, but there might be an ADDXri in the expansion too.
2780  Scale = TypeSize::Fixed(1);
2781  Width = 8;
2782  MinOffset = 0;
2783  MaxOffset = 4095;
2784  break;
2785  case AArch64::LDPWi:
2786  case AArch64::LDPSi:
2787  case AArch64::LDNPWi:
2788  case AArch64::LDNPSi:
2789  case AArch64::STPWi:
2790  case AArch64::STPSi:
2791  case AArch64::STNPWi:
2792  case AArch64::STNPSi:
2793  Scale = TypeSize::Fixed(4);
2794  Width = 8;
2795  MinOffset = -64;
2796  MaxOffset = 63;
2797  break;
2798  case AArch64::LDRWui:
2799  case AArch64::LDRSui:
2800  case AArch64::LDRSWui:
2801  case AArch64::STRWui:
2802  case AArch64::STRSui:
2803  Scale = TypeSize::Fixed(4);
2804  Width = 4;
2805  MinOffset = 0;
2806  MaxOffset = 4095;
2807  break;
2808  case AArch64::LDRHui:
2809  case AArch64::LDRHHui:
2810  case AArch64::LDRSHWui:
2811  case AArch64::LDRSHXui:
2812  case AArch64::STRHui:
2813  case AArch64::STRHHui:
2814  Scale = TypeSize::Fixed(2);
2815  Width = 2;
2816  MinOffset = 0;
2817  MaxOffset = 4095;
2818  break;
2819  case AArch64::LDRBui:
2820  case AArch64::LDRBBui:
2821  case AArch64::LDRSBWui:
2822  case AArch64::LDRSBXui:
2823  case AArch64::STRBui:
2824  case AArch64::STRBBui:
2825  Scale = TypeSize::Fixed(1);
2826  Width = 1;
2827  MinOffset = 0;
2828  MaxOffset = 4095;
2829  break;
2830  case AArch64::STPXpre:
2831  case AArch64::LDPXpost:
2832  case AArch64::STPDpre:
2833  case AArch64::LDPDpost:
2834  Scale = TypeSize::Fixed(8);
2835  Width = 8;
2836  MinOffset = -512;
2837  MaxOffset = 504;
2838  break;
2839  case AArch64::STPQpre:
2840  case AArch64::LDPQpost:
2841  Scale = TypeSize::Fixed(16);
2842  Width = 16;
2843  MinOffset = -1024;
2844  MaxOffset = 1008;
2845  break;
2846  case AArch64::STRXpre:
2847  case AArch64::STRDpre:
2848  case AArch64::LDRXpost:
2849  case AArch64::LDRDpost:
2850  Scale = TypeSize::Fixed(1);
2851  Width = 8;
2852  MinOffset = -256;
2853  MaxOffset = 255;
2854  break;
2855  case AArch64::STRQpre:
2856  case AArch64::LDRQpost:
2857  Scale = TypeSize::Fixed(1);
2858  Width = 16;
2859  MinOffset = -256;
2860  MaxOffset = 255;
2861  break;
2862  case AArch64::ADDG:
2863  Scale = TypeSize::Fixed(16);
2864  Width = 0;
2865  MinOffset = 0;
2866  MaxOffset = 63;
2867  break;
2868  case AArch64::TAGPstack:
2869  Scale = TypeSize::Fixed(16);
2870  Width = 0;
2871  // TAGP with a negative offset turns into SUBP, which has a maximum offset
2872  // of 63 (not 64!).
2873  MinOffset = -63;
2874  MaxOffset = 63;
2875  break;
2876  case AArch64::LDG:
2877  case AArch64::STGOffset:
2878  case AArch64::STZGOffset:
2879  Scale = TypeSize::Fixed(16);
2880  Width = 16;
2881  MinOffset = -256;
2882  MaxOffset = 255;
2883  break;
2884  case AArch64::STR_ZZZZXI:
2885  case AArch64::LDR_ZZZZXI:
2886  Scale = TypeSize::Scalable(16);
2887  Width = SVEMaxBytesPerVector * 4;
2888  MinOffset = -256;
2889  MaxOffset = 252;
2890  break;
2891  case AArch64::STR_ZZZXI:
2892  case AArch64::LDR_ZZZXI:
2893  Scale = TypeSize::Scalable(16);
2894  Width = SVEMaxBytesPerVector * 3;
2895  MinOffset = -256;
2896  MaxOffset = 253;
2897  break;
2898  case AArch64::STR_ZZXI:
2899  case AArch64::LDR_ZZXI:
2900  Scale = TypeSize::Scalable(16);
2901  Width = SVEMaxBytesPerVector * 2;
2902  MinOffset = -256;
2903  MaxOffset = 254;
2904  break;
2905  case AArch64::LDR_PXI:
2906  case AArch64::STR_PXI:
2907  Scale = TypeSize::Scalable(2);
2908  Width = SVEMaxBytesPerVector / 8;
2909  MinOffset = -256;
2910  MaxOffset = 255;
2911  break;
2912  case AArch64::LDR_ZXI:
2913  case AArch64::STR_ZXI:
2914  Scale = TypeSize::Scalable(16);
2915  Width = SVEMaxBytesPerVector;
2916  MinOffset = -256;
2917  MaxOffset = 255;
2918  break;
2919  case AArch64::LD1B_IMM:
2920  case AArch64::LD1H_IMM:
2921  case AArch64::LD1W_IMM:
2922  case AArch64::LD1D_IMM:
2923  case AArch64::LDNT1B_ZRI:
2924  case AArch64::LDNT1H_ZRI:
2925  case AArch64::LDNT1W_ZRI:
2926  case AArch64::LDNT1D_ZRI:
2927  case AArch64::ST1B_IMM:
2928  case AArch64::ST1H_IMM:
2929  case AArch64::ST1W_IMM:
2930  case AArch64::ST1D_IMM:
2931  case AArch64::STNT1B_ZRI:
2932  case AArch64::STNT1H_ZRI:
2933  case AArch64::STNT1W_ZRI:
2934  case AArch64::STNT1D_ZRI:
2935  case AArch64::LDNF1B_IMM:
2936  case AArch64::LDNF1H_IMM:
2937  case AArch64::LDNF1W_IMM:
2938  case AArch64::LDNF1D_IMM:
2939  // A full vector's worth of data
2940  // Width = mbytes * elements
2941  Scale = TypeSize::Scalable(16);
2942  Width = SVEMaxBytesPerVector;
2943  MinOffset = -8;
2944  MaxOffset = 7;
2945  break;
2946  case AArch64::LD2B_IMM:
2947  case AArch64::LD2H_IMM:
2948  case AArch64::LD2W_IMM:
2949  case AArch64::LD2D_IMM:
2950  case AArch64::ST2B_IMM:
2951  case AArch64::ST2H_IMM:
2952  case AArch64::ST2W_IMM:
2953  case AArch64::ST2D_IMM:
2954  Scale = TypeSize::Scalable(32);
2955  Width = SVEMaxBytesPerVector * 2;
2956  MinOffset = -8;
2957  MaxOffset = 7;
2958  break;
2959  case AArch64::LD3B_IMM:
2960  case AArch64::LD3H_IMM:
2961  case AArch64::LD3W_IMM:
2962  case AArch64::LD3D_IMM:
2963  case AArch64::ST3B_IMM:
2964  case AArch64::ST3H_IMM:
2965  case AArch64::ST3W_IMM:
2966  case AArch64::ST3D_IMM:
2967  Scale = TypeSize::Scalable(48);
2968  Width = SVEMaxBytesPerVector * 3;
2969  MinOffset = -8;
2970  MaxOffset = 7;
2971  break;
2972  case AArch64::LD4B_IMM:
2973  case AArch64::LD4H_IMM:
2974  case AArch64::LD4W_IMM:
2975  case AArch64::LD4D_IMM:
2976  case AArch64::ST4B_IMM:
2977  case AArch64::ST4H_IMM:
2978  case AArch64::ST4W_IMM:
2979  case AArch64::ST4D_IMM:
2980  Scale = TypeSize::Scalable(64);
2981  Width = SVEMaxBytesPerVector * 4;
2982  MinOffset = -8;
2983  MaxOffset = 7;
2984  break;
2985  case AArch64::LD1B_H_IMM:
2986  case AArch64::LD1SB_H_IMM:
2987  case AArch64::LD1H_S_IMM:
2988  case AArch64::LD1SH_S_IMM:
2989  case AArch64::LD1W_D_IMM:
2990  case AArch64::LD1SW_D_IMM:
2991  case AArch64::ST1B_H_IMM:
2992  case AArch64::ST1H_S_IMM:
2993  case AArch64::ST1W_D_IMM:
2994  case AArch64::LDNF1B_H_IMM:
2995  case AArch64::LDNF1SB_H_IMM:
2996  case AArch64::LDNF1H_S_IMM:
2997  case AArch64::LDNF1SH_S_IMM:
2998  case AArch64::LDNF1W_D_IMM:
2999  case AArch64::LDNF1SW_D_IMM:
3000  // A half vector's worth of data
3001  // Width = mbytes * elements
3002  Scale = TypeSize::Scalable(8);
3003  Width = SVEMaxBytesPerVector / 2;
3004  MinOffset = -8;
3005  MaxOffset = 7;
3006  break;
3007  case AArch64::LD1B_S_IMM:
3008  case AArch64::LD1SB_S_IMM:
3009  case AArch64::LD1H_D_IMM:
3010  case AArch64::LD1SH_D_IMM:
3011  case AArch64::ST1B_S_IMM:
3012  case AArch64::ST1H_D_IMM:
3013  case AArch64::LDNF1B_S_IMM:
3014  case AArch64::LDNF1SB_S_IMM:
3015  case AArch64::LDNF1H_D_IMM:
3016  case AArch64::LDNF1SH_D_IMM:
3017  // A quarter vector's worth of data
3018  // Width = mbytes * elements
3019  Scale = TypeSize::Scalable(4);
3020  Width = SVEMaxBytesPerVector / 4;
3021  MinOffset = -8;
3022  MaxOffset = 7;
3023  break;
3024  case AArch64::LD1B_D_IMM:
3025  case AArch64::LD1SB_D_IMM:
3026  case AArch64::ST1B_D_IMM:
3027  case AArch64::LDNF1B_D_IMM:
3028  case AArch64::LDNF1SB_D_IMM:
3029  // An eighth vector's worth of data
3030  // Width = mbytes * elements
3031  Scale = TypeSize::Scalable(2);
3032  Width = SVEMaxBytesPerVector / 8;
3033  MinOffset = -8;
3034  MaxOffset = 7;
3035  break;
3036  case AArch64::ST2GOffset:
3037  case AArch64::STZ2GOffset:
3038  Scale = TypeSize::Fixed(16);
3039  Width = 32;
3040  MinOffset = -256;
3041  MaxOffset = 255;
3042  break;
3043  case AArch64::STGPi:
3044  Scale = TypeSize::Fixed(16);
3045  Width = 16;
3046  MinOffset = -64;
3047  MaxOffset = 63;
3048  break;
3049  case AArch64::LD1RB_IMM:
3050  case AArch64::LD1RB_H_IMM:
3051  case AArch64::LD1RB_S_IMM:
3052  case AArch64::LD1RB_D_IMM:
3053  case AArch64::LD1RSB_H_IMM:
3054  case AArch64::LD1RSB_S_IMM:
3055  case AArch64::LD1RSB_D_IMM:
3056  Scale = TypeSize::Fixed(1);
3057  Width = 1;
3058  MinOffset = 0;
3059  MaxOffset = 63;
3060  break;
3061  case AArch64::LD1RH_IMM:
3062  case AArch64::LD1RH_S_IMM:
3063  case AArch64::LD1RH_D_IMM:
3064  case AArch64::LD1RSH_S_IMM:
3065  case AArch64::LD1RSH_D_IMM:
3066  Scale = TypeSize::Fixed(2);
3067  Width = 2;
3068  MinOffset = 0;
3069  MaxOffset = 63;
3070  break;
3071  case AArch64::LD1RW_IMM:
3072  case AArch64::LD1RW_D_IMM:
3073  case AArch64::LD1RSW_IMM:
3074  Scale = TypeSize::Fixed(4);
3075  Width = 4;
3076  MinOffset = 0;
3077  MaxOffset = 63;
3078  break;
3079  case AArch64::LD1RD_IMM:
3080  Scale = TypeSize::Fixed(8);
3081  Width = 8;
3082  MinOffset = 0;
3083  MaxOffset = 63;
3084  break;
3085  }
3086 
3087  return true;
3088 }
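// Worked example (not part of the original source): for AArch64::LDRXui the
// table above gives Scale = 8, Width = 8, MinOffset = 0 and MaxOffset = 4095,
// i.e. the instruction can address byte offsets 0 .. 4095 * 8 = 32760 in
// steps of 8; anything outside that has to use the unscaled LDURXi form or an
// explicit add/sub plus load, as the stack-protector expansion earlier in
// this file does.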
3089 
3090 // Scaling factor for unscaled load or store.
3091 int AArch64InstrInfo::getMemScale(unsigned Opc) {
3092  switch (Opc) {
3093  default:
3094  llvm_unreachable("Opcode has unknown scale!");
3095  case AArch64::LDRBBui:
3096  case AArch64::LDURBBi:
3097  case AArch64::LDRSBWui:
3098  case AArch64::LDURSBWi:
3099  case AArch64::STRBBui:
3100  case AArch64::STURBBi:
3101  return 1;
3102  case AArch64::LDRHHui:
3103  case AArch64::LDURHHi:
3104  case AArch64::LDRSHWui:
3105  case AArch64::LDURSHWi:
3106  case AArch64::STRHHui:
3107  case AArch64::STURHHi:
3108  return 2;
3109  case AArch64::LDRSui:
3110  case AArch64::LDURSi:
3111  case AArch64::LDRSpre:
3112  case AArch64::LDRSWui:
3113  case AArch64::LDURSWi:
3114  case AArch64::LDRWpre:
3115  case AArch64::LDRWui:
3116  case AArch64::LDURWi:
3117  case AArch64::STRSui:
3118  case AArch64::STURSi:
3119  case AArch64::STRSpre:
3120  case AArch64::STRWui:
3121  case AArch64::STURWi:
3122  case AArch64::STRWpre:
3123  case AArch64::LDPSi:
3124  case AArch64::LDPSWi:
3125  case AArch64::LDPWi:
3126  case AArch64::STPSi:
3127  case AArch64::STPWi:
3128  return 4;
3129  case AArch64::LDRDui:
3130  case AArch64::LDURDi:
3131  case AArch64::LDRDpre:
3132  case AArch64::LDRXui:
3133  case AArch64::LDURXi:
3134  case AArch64::LDRXpre:
3135  case AArch64::STRDui:
3136  case AArch64::STURDi:
3137  case AArch64::STRDpre:
3138  case AArch64::STRXui:
3139  case AArch64::STURXi:
3140  case AArch64::STRXpre:
3141  case AArch64::LDPDi:
3142  case AArch64::LDPXi:
3143  case AArch64::STPDi:
3144  case AArch64::STPXi:
3145  return 8;
3146  case AArch64::LDRQui:
3147  case AArch64::LDURQi:
3148  case AArch64::STRQui:
3149  case AArch64::STURQi:
3150  case AArch64::STRQpre:
3151  case AArch64::LDPQi:
3152  case AArch64::LDRQpre:
3153  case AArch64::STPQi:
3154  case AArch64::STGOffset:
3155  case AArch64::STZGOffset:
3156  case AArch64::ST2GOffset:
3157  case AArch64::STZ2GOffset:
3158  case AArch64::STGPi:
3159  return 16;
3160  }
3161 }
3162 
3163 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
3164  switch (MI.getOpcode()) {
3165  default:
3166  return false;
3167  case AArch64::LDRWpre:
3168  case AArch64::LDRXpre:
3169  case AArch64::LDRSpre:
3170  case AArch64::LDRDpre:
3171  case AArch64::LDRQpre:
3172  return true;
3173  }
3174 }
3175 
3176 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
3177  switch (MI.getOpcode()) {
3178  default:
3179  return false;
3180  case AArch64::STRWpre:
3181  case AArch64::STRXpre:
3182  case AArch64::STRSpre:
3183  case AArch64::STRDpre:
3184  case AArch64::STRQpre:
3185  return true;
3186  }
3187 }
3188 
3189 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
3190  return isPreLd(MI) || isPreSt(MI);
3191 }
3192 
3193 bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
3194  switch (MI.getOpcode()) {
3195  default:
3196  return false;
3197  case AArch64::LDPSi:
3198  case AArch64::LDPSWi:
3199  case AArch64::LDPDi:
3200  case AArch64::LDPQi:
3201  case AArch64::LDPWi:
3202  case AArch64::LDPXi:
3203  case AArch64::STPSi:
3204  case AArch64::STPDi:
3205  case AArch64::STPQi:
3206  case AArch64::STPWi:
3207  case AArch64::STPXi:
3208  case AArch64::STGPi:
3209  return true;
3210  }
3211 }
3212 
3213 const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
3214  unsigned Idx =
3215  AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
3216  : 1;
3217  return MI.getOperand(Idx);
3218 }
3219 
3220 const MachineOperand &
3221 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
3222  unsigned Idx =
3223  AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
3224  : 2;
3225  return MI.getOperand(Idx);
3226 }
3227 
3228 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
3229  Register Reg) {
3230  if (MI.getParent() == nullptr)
3231  return nullptr;
3232  const MachineFunction *MF = MI.getParent()->getParent();
3233  return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
3234 }
3235 
3236 bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
3237  auto IsQFPR = [&](const MachineOperand &Op) {
3238  if (!Op.isReg())
3239  return false;
3240  auto Reg = Op.getReg();
3241  if (Reg.isPhysical())
3242  return AArch64::FPR128RegClass.contains(Reg);
3243  const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
3244  return TRC == &AArch64::FPR128RegClass ||
3245  TRC == &AArch64::FPR128_loRegClass;
3246  };
3247  return llvm::any_of(MI.operands(), IsQFPR);
3248 }
3249 
3250 bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
3251  auto IsFPR = [&](const MachineOperand &Op) {
3252  if (!Op.isReg())
3253  return false;
3254  auto Reg = Op.getReg();
3255  if (Reg.isPhysical())
3256  return AArch64::FPR128RegClass.contains(Reg) ||
3257  AArch64::FPR64RegClass.contains(Reg) ||
3258  AArch64::FPR32RegClass.contains(Reg) ||
3259  AArch64::FPR16RegClass.contains(Reg) ||
3260  AArch64::FPR8RegClass.contains(Reg);
3261 
3262  const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
3263  return TRC == &AArch64::FPR128RegClass ||
3264  TRC == &AArch64::FPR128_loRegClass ||
3265  TRC == &AArch64::FPR64RegClass ||
3266  TRC == &AArch64::FPR64_loRegClass ||
3267  TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
3268  TRC == &AArch64::FPR8RegClass;
3269  };
3270  return llvm::any_of(MI.operands(), IsFPR);
3271 }
3272 
3273 // Scale the unscaled offsets. Returns false if the unscaled offset can't be
3274 // scaled.
3275 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
3276  int Scale = AArch64InstrInfo::getMemScale(Opc);
3277 
3278  // If the byte-offset isn't a multiple of the stride, we can't scale this
3279  // offset.
3280  if (Offset % Scale != 0)
3281  return false;
3282 
3283  // Convert the byte-offset used by unscaled into an "element" offset used
3284  // by the scaled pair load/store instructions.
3285  Offset /= Scale;
3286  return true;
3287 }
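// Example (not part of the original source): two LDURXi accesses at byte
// offsets 16 and 24 are divided by getMemScale(LDURXi) == 8, giving element
// offsets 2 and 3 as expected by the LDP encoding; a byte offset of 12 fails
// the modulo check above and blocks pairing.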
3288 
3289 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
3290  if (FirstOpc == SecondOpc)
3291  return true;
3292  // We can also pair sign-ext and zero-ext instructions.
3293  switch (FirstOpc) {
3294  default:
3295  return false;
3296  case AArch64::LDRWui:
3297  case AArch64::LDURWi:
3298  return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
3299  case AArch64::LDRSWui:
3300  case AArch64::LDURSWi:
3301  return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
3302  }
3303  // These instructions can't be paired based on their opcodes.
3304  return false;
3305 }
3306 
3307 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
3308  int64_t Offset1, unsigned Opcode1, int FI2,
3309  int64_t Offset2, unsigned Opcode2) {
3310  // Accesses through fixed stack object frame indices may access a different
3311  // fixed stack slot. Check that the object offsets + offsets match.
3312  if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
3313  int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
3314  int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
3315  assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
3316  // Convert to scaled object offsets.
3317  int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
3318  if (ObjectOffset1 % Scale1 != 0)
3319  return false;
3320  ObjectOffset1 /= Scale1;
3321  int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
3322  if (ObjectOffset2 % Scale2 != 0)
3323  return false;
3324  ObjectOffset2 /= Scale2;
3325  ObjectOffset1 += Offset1;
3326  ObjectOffset2 += Offset2;
3327  return ObjectOffset1 + 1 == ObjectOffset2;
3328  }
3329 
3330  return FI1 == FI2;
3331 }
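// Worked example (not part of the original source; the frame-object offsets
// are hypothetical): with fixed objects FI1 at object offset 0 and FI2 at
// object offset 8, two LDRXui loads (scale 8) with instruction offsets of 0
// yield scaled offsets 0 + 0 = 0 and 1 + 0 = 1, so the accesses are treated
// as adjacent and may be clustered.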
3332 
3333 /// Detect opportunities for ldp/stp formation.
3334 ///
3335 /// Only called for LdSt for which getMemOperandWithOffset returns true.
3336 bool AArch64InstrInfo::shouldClusterMemOps(
3337  ArrayRef<const MachineOperand *> BaseOps1,
3338  ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
3339  unsigned NumBytes) const {
3340  assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
3341  const MachineOperand &BaseOp1 = *BaseOps1.front();
3342  const MachineOperand &BaseOp2 = *BaseOps2.front();
3343  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
3344  const MachineInstr &SecondLdSt = *BaseOp2.getParent();
3345  if (BaseOp1.getType() != BaseOp2.getType())
3346  return false;
3347 
3348  assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
3349  "Only base registers and frame indices are supported.");
3350 
3351  // Check for both base regs and base FI.
3352  if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
3353  return false;
3354 
3355  // Only cluster up to a single pair.
3356  if (NumLoads > 2)
3357  return false;
3358 
3359  if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
3360  return false;
3361 
3362  // Can we pair these instructions based on their opcodes?
3363  unsigned FirstOpc = FirstLdSt.getOpcode();
3364  unsigned SecondOpc = SecondLdSt.getOpcode();
3365  if (!canPairLdStOpc(FirstOpc, SecondOpc))
3366  return false;
3367 
3368  // Can't merge volatiles or load/stores that have a hint to avoid pair
3369  // formation, for example.
3370  if (!isCandidateToMergeOrPair(FirstLdSt) ||
3371  !isCandidateToMergeOrPair(SecondLdSt))
3372  return false;
3373 
3374  // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
3375  int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
3376  if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
3377  return false;
3378 
3379  int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
3380  if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
3381  return false;
3382 
3383  // Pairwise instructions have a 7-bit signed offset field.
3384  if (Offset1 > 63 || Offset1 < -64)
3385  return false;
3386 
3387  // The caller should already have ordered First/SecondLdSt by offset.
3388  // Note: except for non-equal frame index bases
3389  if (BaseOp1.isFI()) {
3390  assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
3391  "Caller should have ordered offsets.");
3392 
3393  const MachineFrameInfo &MFI =
3394  FirstLdSt.getParent()->getParent()->getFrameInfo();
3395  return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
3396  BaseOp2.getIndex(), Offset2, SecondOpc);
3397  }
3398 
3399  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
3400 
3401  return Offset1 + 1 == Offset2;
3402 }
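// Illustrative sketch (not part of the original source): during machine
// scheduling this hook keeps "ldr x1, [x0, #8]" and "ldr x2, [x0, #16]"
// (scaled offsets 1 and 2) adjacent so the load/store optimizer can later
// rewrite them as "ldp x1, x2, [x0, #8]"; a third load on the same base is
// rejected by the NumLoads > 2 check above.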
3403 
3404 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
3405  unsigned Reg, unsigned SubIdx,
3406  unsigned State,
3407  const TargetRegisterInfo *TRI) {
3408  if (!SubIdx)
3409  return MIB.addReg(Reg, State);
3410 
3411  if (Register::isPhysicalRegister(Reg))
3412  return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
3413  return MIB.addReg(Reg, State, SubIdx);
3414 }
3415 
3416 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
3417  unsigned NumRegs) {
3418  // We really want the positive remainder mod 32 here, which happens to be
3419  // easily obtainable with a mask.
3420  return ((DestReg - SrcReg) & 0x1f) < NumRegs;
3421 }
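// Example (not part of the original source): copying the tuple Q1_Q2_Q3 to
// Q2_Q3_Q4 in the forward direction would overwrite Q2 and Q3 before they are
// read; indeed ((2 - 1) & 0x1f) == 1 < 3, so the tuple-copy helpers below
// walk the sub-registers in reverse order instead. Copying Q5_Q6_Q7 to
// Q1_Q2_Q3 gives ((1 - 5) & 0x1f) == 28 >= 3 and can proceed forwards.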
3422 
3423 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
3424  MachineBasicBlock::iterator I,
3425  const DebugLoc &DL, MCRegister DestReg,
3426  MCRegister SrcReg, bool KillSrc,
3427  unsigned Opcode,
3428  ArrayRef<unsigned> Indices) const {
3429  assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
3430  const TargetRegisterInfo *TRI = &getRegisterInfo();
3431  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
3432  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
3433  unsigned NumRegs = Indices.size();
3434 
3435  int SubReg = 0, End = NumRegs, Incr = 1;
3436  if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
3437  SubReg = NumRegs - 1;
3438  End = -1;
3439  Incr = -1;
3440  }
3441 
3442  for (; SubReg != End; SubReg += Incr) {
3443  const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
3444  AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
3445  AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
3446  AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
3447  }
3448 }
3449 
3450 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
3451  MachineBasicBlock::iterator I,
3452  DebugLoc DL, unsigned DestReg,
3453  unsigned SrcReg, bool KillSrc,
3454  unsigned Opcode, unsigned ZeroReg,
3455  llvm::ArrayRef<unsigned> Indices) const {
3456  const TargetRegisterInfo *TRI = &getRegisterInfo();
3457  unsigned NumRegs = Indices.size();
3458 
3459 #ifndef NDEBUG
3460  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
3461  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
3462  assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
3463  "GPR reg sequences should not be able to overlap");
3464 #endif
3465 
3466  for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
3467  const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
3468  AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
3469  MIB.addReg(ZeroReg);
3470  AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
3471  MIB.addImm(0);
3472  }
3473 }
3474 
3475 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
3476  MachineBasicBlock::iterator I,
3477  const DebugLoc &DL, MCRegister DestReg,
3478  MCRegister SrcReg, bool KillSrc) const {
3479  if (AArch64::GPR32spRegClass.contains(DestReg) &&
3480  (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
3481  const TargetRegisterInfo *TRI = &getRegisterInfo();
3482 
3483  if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
3484  // If either operand is WSP, expand to ADD #0.
3485  if (Subtarget.hasZeroCycleRegMove()) {
3486  // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
3487  MCRegister DestRegX = TRI->getMatchingSuperReg(
3488  DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
3489  MCRegister SrcRegX = TRI->getMatchingSuperReg(
3490  SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
3491  // This instruction is reading and writing X registers. This may upset
3492  // the register scavenger and machine verifier, so we need to indicate
3493  // that we are reading an undefined value from SrcRegX, but a proper
3494  // value from SrcReg.
3495  BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
3496  .addReg(SrcRegX, RegState::Undef)
3497  .addImm(0)
3498  .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
3499  .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
3500  } else {
3501  BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
3502  .addReg(SrcReg, getKillRegState(KillSrc))
3503  .addImm(0)
3504  .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
3505  }
3506  } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
3507  BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
3508  .addImm(0)
3509  .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
3510  } else {
3511  if (Subtarget.hasZeroCycleRegMove()) {
3512  // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
3513  MCRegister DestRegX = TRI->getMatchingSuperReg(
3514  DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
3515  MCRegister SrcRegX = TRI->getMatchingSuperReg(
3516  SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
3517  // This instruction is reading and writing X registers. This may upset
3518  // the register scavenger and machine verifier, so we need to indicate
3519  // that we are reading an undefined value from SrcRegX, but a proper
3520  // value from SrcReg.
3521  BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
3522  .addReg(AArch64::XZR)
3523  .addReg(SrcRegX, RegState::Undef)
3524  .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
3525  } else {
3526  // Otherwise, expand to ORR WZR.
3527  BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
3528  .addReg(AArch64::WZR)
3529  .addReg(SrcReg, getKillRegState(KillSrc));
3530  }
3531  }
3532  return;
3533  }
3534 
3535  // Copy a Predicate register by ORRing with itself.
3536  if (AArch64::PPRRegClass.contains(DestReg) &&
3537  AArch64::PPRRegClass.contains(SrcReg)) {
3538  assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
3539  BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
3540  .addReg(SrcReg) // Pg
3541  .addReg(SrcReg)
3542  .addReg(SrcReg, getKillRegState(KillSrc));
3543  return;
3544  }
3545 
3546  // Copy a Z register by ORRing with itself.
3547  if (AArch64::ZPRRegClass.contains(DestReg) &&
3548  AArch64::ZPRRegClass.contains(SrcReg)) {
3549  assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
3550  BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
3551  .addReg(SrcReg)
3552  .addReg(SrcReg, getKillRegState(KillSrc));
3553  return;
3554  }
3555 
3556  // Copy a Z register pair by copying the individual sub-registers.
3557  if (AArch64::ZPR2RegClass.contains(DestReg) &&
3558  AArch64::ZPR2RegClass.contains(SrcReg)) {
3559  assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
3560  static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
3561  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
3562  Indices);
3563  return;
3564  }
3565 
3566  // Copy a Z register triple by copying the individual sub-registers.
3567  if (AArch64::ZPR3RegClass.contains(DestReg) &&
3568  AArch64::ZPR3RegClass.contains(SrcReg)) {
3569  assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
3570  static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
3571  AArch64::zsub2};
3572  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
3573  Indices);
3574  return;
3575  }
3576 
3577  // Copy a Z register quad by copying the individual sub-registers.
3578  if (AArch64::ZPR4RegClass.contains(DestReg) &&
3579  AArch64::ZPR4RegClass.contains(SrcReg)) {
3580  assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
3581  static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
3582  AArch64::zsub2, AArch64::zsub3};
3583  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
3584  Indices);
3585  return;
3586  }
3587 
3588  if (AArch64::GPR64spRegClass.contains(DestReg) &&
3589  (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
3590  if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
3591  // If either operand is SP, expand to ADD #0.
3592  BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
3593  .addReg(SrcReg, getKillRegState(KillSrc))
3594  .addImm(0)
3595  .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
3596  } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
3597  BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
3598  .addImm(0)
3599  .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
3600  } else {
3601  // Otherwise, expand to ORR XZR.
3602  BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
3603  .addReg(AArch64::XZR)
3604  .addReg(SrcReg, getKillRegState(KillSrc));
3605  }
3606  return;
3607  }
3608 
3609  // Copy a DDDD register quad by copying the individual sub-registers.
3610  if (AArch64::DDDDRegClass.contains(DestReg) &&
3611  AArch64::DDDDRegClass.contains(SrcReg)) {
3612  static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
3613  AArch64::dsub2, AArch64::dsub3};
3614  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
3615  Indices);
3616  return;
3617  }
3618 
3619  // Copy a DDD register triple by copying the individual sub-registers.
3620  if (AArch64::DDDRegClass.contains(DestReg) &&
3621  AArch64::DDDRegClass.contains(SrcReg)) {
3622  static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
3623  AArch64::dsub2};
3624  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
3625  Indices);
3626  return;
3627  }
3628 
3629  // Copy a DD register pair by copying the individual sub-registers.
3630  if (AArch64::DDRegClass.contains(DestReg) &&
3631  AArch64::DDRegClass.contains(SrcReg)) {
3632  static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
3633  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
3634  Indices);
3635  return;
3636  }
3637 
3638  // Copy a QQQQ register quad by copying the individual sub-registers.
3639  if (AArch64::QQQQRegClass.contains(DestReg) &&
3640  AArch64::QQQQRegClass.contains(SrcReg)) {
3641  static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
3642  AArch64::qsub2, AArch64::qsub3};
3643  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
3644  Indices);
3645  return;
3646  }
3647 
3648  // Copy a QQQ register triple by copying the individual sub-registers.
3649  if (AArch64::QQQRegClass.contains(DestReg) &&
3650  AArch64::QQQRegClass.contains(SrcReg)) {
3651  static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
3652  AArch64::qsub2};
3653  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
3654  Indices);
3655  return;
3656  }
3657 
3658  // Copy a QQ register pair by copying the individual sub-registers.
3659  if (AArch64::QQRegClass.contains(DestReg) &&
3660  AArch64::QQRegClass.contains(SrcReg)) {
3661  static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
3662  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
3663  Indices);
3664  return;
3665  }
3666 
3667  if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
3668  AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
3669  static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
3670  copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
3671  AArch64::XZR, Indices);
3672  return;
3673  }
3674 
3675  if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
3676  AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
3677  static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
3678  copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
3679  AArch64::WZR, Indices);
3680  return;
3681  }
3682 
3683  if (AArch64::FPR128RegClass.contains(DestReg) &&
3684  AArch64::FPR128RegClass.contains(SrcReg)) {
3685  if (Subtarget.forceStreamingCompatibleSVE()) {
3686  BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
3687  .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
3688  .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
3689  .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
3690  } else if (Subtarget.hasNEON()) {
3691  BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
3692  .addReg(SrcReg)
3693  .addReg(SrcReg, getKillRegState(KillSrc));
3694  } else {
3695  BuildMI(MBB, I, DL, get(AArch64::STRQpre))
3696  .addReg(AArch64::SP, RegState::Define)
3697  .addReg(SrcReg, getKillRegState(KillSrc))
3698  .addReg(AArch64::SP)
3699  .addImm(-16);
3700  BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
3701  .addReg(AArch64::SP, RegState::Define)
3702  .addReg(DestReg, RegState::Define)
3703  .addReg(AArch64::SP)
3704  .addImm(16);
3705  }
3706  return;
3707  }
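  // Illustrative expansion (assumed registers, not from the original source):
  // with neither streaming-compatible SVE nor NEON available, a q0 -> q1 copy
  // is expected to bounce through the stack, roughly:
  //   str q0, [sp, #-16]!   // pre-indexed store, SP -= 16
  //   ldr q1, [sp], #16     // post-indexed reload, SP restored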
3708 
3709  if (AArch64::FPR64RegClass.contains(DestReg) &&
3710  AArch64::FPR64RegClass.contains(SrcReg)) {
3711  BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
3712  .addReg(SrcReg, getKillRegState(KillSrc));
3713  return;
3714  }
3715 
3716  if (AArch64::FPR32RegClass.contains(DestReg) &&
3717  AArch64::FPR32RegClass.contains(SrcReg)) {
3718  BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
3719  .addReg(SrcReg, getKillRegState(KillSrc));
3720  return;
3721  }
3722 
3723  if (AArch64::FPR16RegClass.contains(DestReg) &&
3724  AArch64::FPR16RegClass.contains(SrcReg)) {
3725  DestReg =
3726  RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
3727  SrcReg =
3728  RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
3729  BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
3730  .addReg(SrcReg, getKillRegState(KillSrc));
3731  return;
3732  }
3733 
3734  if (AArch64::FPR8RegClass.contains(DestReg) &&
3735  AArch64::FPR8RegClass.contains(SrcReg)) {
3736  DestReg =
3737  RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
3738  SrcReg =
3739  RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
3740  BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
3741  .addReg(SrcReg, getKillRegState(KillSrc));
3742  return;
3743  }
3744 
3745  // Copies between GPR64 and FPR64.
3746  if (AArch64::FPR64RegClass.contains(DestReg) &&
3747  AArch64::GPR64RegClass.contains(SrcReg)) {
3748  BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
3749  .addReg(SrcReg, getKillRegState(KillSrc));
3750  return;
3751  }
3752  if (AArch64::GPR64RegClass.contains(DestReg) &&
3753  AArch64::FPR64RegClass.contains(SrcReg)) {
3754  BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
3755  .addReg(SrcReg, getKillRegState(KillSrc));
3756  return;
3757  }
3758  // Copies between GPR32 and FPR32.
3759  if (AArch64::FPR32RegClass.contains(DestReg) &&
3760  AArch64::GPR32RegClass.contains(SrcReg)) {
3761  BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
3762  .addReg(SrcReg, getKillRegState(KillSrc));
3763  return;
3764  }
3765  if (AArch64::GPR32RegClass.contains(DestReg) &&
3766  AArch64::FPR32RegClass.contains(SrcReg)) {
3767  BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
3768  .addReg(SrcReg, getKillRegState(KillSrc));
3769  return;
3770  }
3771 
3772  if (DestReg == AArch64::NZCV) {
3773  assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
3774  BuildMI(MBB, I, DL, get(AArch64::MSR))
3775  .addImm(AArch64SysReg::NZCV)
3776  .addReg(SrcReg, getKillRegState(KillSrc))
3777  .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
3778  return;
3779  }
3780 
3781  if (SrcReg == AArch64::NZCV) {
3782  assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
3783  BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
3784  .addImm(AArch64SysReg::NZCV)
3785  .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
3786  return;
3787  }
3788 
3789 #ifndef NDEBUG
3790  const TargetRegisterInfo &TRI = getRegisterInfo();
3791  errs() << TRI.getRegAsmName(DestReg) << " = COPY "
3792  << TRI.getRegAsmName(SrcReg) << "\n";
3793 #endif
3794  llvm_unreachable("unimplemented reg-to-reg copy");
3795 }
3796 
3797 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
3798  MachineBasicBlock &MBB,
3799  MachineBasicBlock::iterator InsertBefore,
3800  const MCInstrDesc &MCID,
3801  Register SrcReg, bool IsKill,
3802  unsigned SubIdx0, unsigned SubIdx1, int FI,
3803  MachineMemOperand *MMO) {
3804  Register SrcReg0 = SrcReg;
3805  Register SrcReg1 = SrcReg;
3806  if (Register::isPhysicalRegister(SrcReg)) {
3807  SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
3808  SubIdx0 = 0;
3809  SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
3810  SubIdx1 = 0;
3811  }
3812  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
3813  .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
3814  .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
3815  .addFrameIndex(FI)
3816  .addImm(0)
3817  .addMemOperand(MMO);
3818 }
3819 
3820 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
3821  MachineBasicBlock::iterator MBBI, Register SrcReg,
3822  bool isKill, int FI, const TargetRegisterClass *RC,
3823  const TargetRegisterInfo *TRI) const {
3824  MachineFunction &MF = *MBB.getParent();
3825  MachineFrameInfo &MFI = MF.getFrameInfo();
3826 
3827  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
3828  MachineMemOperand *MMO =
3829  MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
3830  MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
3831  unsigned Opc = 0;
3832  bool Offset = true;
3833  unsigned StackID = TargetStackID::Default;
3834  switch (TRI->getSpillSize(*RC)) {
3835  case 1:
3836  if (AArch64::FPR8RegClass.hasSubClassEq(RC))
3837  Opc = AArch64::STRBui;
3838  break;
3839  case 2:
3840  if (AArch64::FPR16RegClass.hasSubClassEq(RC))
3841  Opc = AArch64::STRHui;
3842  else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
3843  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3844  Opc = AArch64::STR_PXI;
3845  StackID = TargetStackID::ScalableVector;
3846  }
3847  break;
3848  case 4:
3849  if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
3850  Opc = AArch64::STRWui;
3851  if (Register::isVirtualRegister(SrcReg))
3852  MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
3853  else
3854  assert(SrcReg != AArch64::WSP);
3855  } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
3856  Opc = AArch64::STRSui;
3857  break;
3858  case 8:
3859  if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
3860  Opc = AArch64::STRXui;
3861  if (Register::isVirtualRegister(SrcReg))
3862  MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
3863  else
3864  assert(SrcReg != AArch64::SP);
3865  } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
3866  Opc = AArch64::STRDui;
3867  } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
3868  storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
3869  get(AArch64::STPWi), SrcReg, isKill,
3870  AArch64::sube32, AArch64::subo32, FI, MMO);
3871  return;
3872  }
3873  break;
3874  case 16:
3875  if (AArch64::FPR128RegClass.hasSubClassEq(RC))
3876  Opc = AArch64::STRQui;
3877  else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
3878  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3879  Opc = AArch64::ST1Twov1d;
3880  Offset = false;
3881  } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
3882  storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
3883  get(AArch64::STPXi), SrcReg, isKill,
3884  AArch64::sube64, AArch64::subo64, FI, MMO);
3885  return;
3886  } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
3887  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3888  Opc = AArch64::STR_ZXI;
3889  StackID = TargetStackID::ScalableVector;
3890  }
3891  break;
3892  case 24:
3893  if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
3894  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3895  Opc = AArch64::ST1Threev1d;
3896  Offset = false;
3897  }
3898  break;
3899  case 32:
3900  if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
3901  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3902  Opc = AArch64::ST1Fourv1d;
3903  Offset = false;
3904  } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
3905  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3906  Opc = AArch64::ST1Twov2d;
3907  Offset = false;
3908  } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
3909  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3910  Opc = AArch64::STR_ZZXI;
3911  StackID = TargetStackID::ScalableVector;
3912  }
3913  break;
3914  case 48:
3915  if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
3916  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3917  Opc = AArch64::ST1Threev2d;
3918  Offset = false;
3919  } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
3920  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3921  Opc = AArch64::STR_ZZZXI;
3922  StackID = TargetStackID::ScalableVector;
3923  }
3924  break;
3925  case 64:
3926  if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
3927  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
3928  Opc = AArch64::ST1Fourv2d;
3929  Offset = false;
3930  } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
3931  assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
3932  Opc = AArch64::STR_ZZZZXI;
3933  StackID = TargetStackID::ScalableVector;
3934  }
3935  break;
3936  }
3937  assert(Opc && "Unknown register class");
3938  MFI.setStackID(FI, StackID);
3939 
3940  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
3941  .addReg(SrcReg, getKillRegState(isKill))
3942  .addFrameIndex(FI);
3943 
3944  if (Offset)
3945  MI.addImm(0);
3946  MI.addMemOperand(MMO);
3947 }
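// Usage sketch (hypothetical operands): spilling a killed FPR128 register to
// frame index FI before MBBI
//   TII->storeRegToStackSlot(MBB, MBBI, SrcReg, /*isKill=*/true, FI,
//                            &AArch64::FPR128RegClass, TRI);
// is expected to emit "STRQui killed SrcReg, FI, 0" with a store memory
// operand, while an SVE ZPR spill would instead pick STR_ZXI and tag the slot
// as TargetStackID::ScalableVector.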
3948 
3949 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
3950  MachineBasicBlock &MBB,
3951  MachineBasicBlock::iterator InsertBefore,
3952  const MCInstrDesc &MCID,
3953  Register DestReg, unsigned SubIdx0,
3954  unsigned SubIdx1, int FI,
3955  MachineMemOperand *MMO) {
3956  Register DestReg0 = DestReg;
3957  Register DestReg1 = DestReg;
3958  bool IsUndef = true;
3959  if (Register::isPhysicalRegister(DestReg)) {
3960  DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
3961  SubIdx0 = 0;
3962  DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
3963  SubIdx1 = 0;
3964  IsUndef = false;
3965  }
3966  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
3967  .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
3968  .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
3969  .addFrameIndex(FI)
3970  .addImm(0)
3971  .addMemOperand(MMO);
3972 }
3973 
3974 void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
3975  MachineBasicBlock::iterator MBBI, Register DestReg,
3976  int FI, const TargetRegisterClass *RC,
3977  const TargetRegisterInfo *TRI) const {
3978  MachineFunction &MF = *MBB.getParent();
3979  MachineFrameInfo &MFI = MF.getFrameInfo();
3980  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
3981  MachineMemOperand *MMO =
3982  MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
3983  MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
3984 
3985  unsigned Opc = 0;
3986  bool Offset = true;
3987  unsigned StackID = TargetStackID::Default;
3988  switch (TRI->getSpillSize(*RC)) {
3989  case 1:
3990  if (AArch64::FPR8RegClass.hasSubClassEq(RC))
3991  Opc = AArch64::LDRBui;
3992  break;
3993  case 2:
3994  if (AArch64::FPR16RegClass.hasSubClassEq(RC))
3995  Opc = AArch64::LDRHui;
3996  else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
3997  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
3998  Opc = AArch64::LDR_PXI;
3999  StackID = TargetStackID::ScalableVector;
4000  }
4001  break;
4002  case 4:
4003  if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
4004  Opc = AArch64::LDRWui;
4005  if (Register::isVirtualRegister(DestReg))
4006  MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
4007  else
4008  assert(DestReg != AArch64::WSP);
4009  } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
4010  Opc = AArch64::LDRSui;
4011  break;
4012  case 8:
4013  if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
4014  Opc = AArch64::LDRXui;
4015  if (Register::isVirtualRegister(DestReg))
4016  MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
4017  else
4018  assert(DestReg != AArch64::SP);
4019  } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
4020  Opc = AArch64::LDRDui;
4021  } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
4022  loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
4023  get(AArch64::LDPWi), DestReg, AArch64::sube32,
4024  AArch64::subo32, FI, MMO);
4025  return;
4026  }
4027  break;
4028  case 16:
4029  if (AArch64::FPR128RegClass.hasSubClassEq(RC))
4030  Opc = AArch64::LDRQui;
4031  else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
4032  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
4033  Opc = AArch64::LD1Twov1d;
4034  Offset = false;
4035  } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
4036  loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
4037  get(AArch64::LDPXi), DestReg, AArch64::sube64,
4038  AArch64::subo64, FI, MMO);
4039  return;
4040  } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
4041  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
4042  Opc = AArch64::LDR_ZXI;
4043  StackID = TargetStackID::ScalableVector;
4044  }
4045  break;
4046  case 24:
4047  if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
4048  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
4049  Opc = AArch64::LD1Threev1d;
4050  Offset = false;
4051  }
4052  break;
4053  case 32:
4054  if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
4055  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
4056  Opc = AArch64::LD1Fourv1d;
4057  Offset = false;
4058  } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
4059  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
4060  Opc = AArch64::LD1Twov2d;
4061  Offset = false;
4062  } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
4063  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
4064  Opc = AArch64::LDR_ZZXI;
4065  StackID = TargetStackID::ScalableVector;
4066  }
4067  break;
4068  case 48:
4069  if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
4070  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
4071  Opc = AArch64::LD1Threev2d;
4072  Offset = false;
4073  } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
4074  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
4075  Opc = AArch64::LDR_ZZZXI;
4076  StackID = TargetStackID::ScalableVector;
4077  }
4078  break;
4079  case 64:
4080  if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
4081  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
4082  Opc = AArch64::LD1Fourv2d;
4083  Offset = false;
4084  } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
4085  assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
4086  Opc = AArch64::LDR_ZZZZXI;
4087  StackID = TargetStackID::ScalableVector;
4088  }
4089  break;
4090  }
4091 
4092  assert(Opc && "Unknown register class");
4093  MFI.setStackID(FI, StackID);
4094 
4095  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
4096  .addReg(DestReg, getDefRegState(true))
4097  .addFrameIndex(FI);
4098  if (Offset)
4099  MI.addImm(0);
4100  MI.addMemOperand(MMO);
4101 }
4102 
4103 static bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
4104  const MachineInstr &UseMI,
4105  const TargetRegisterInfo *TRI) {
4106  return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
4107  UseMI.getIterator()),
4108  [TRI](const MachineInstr &I) {
4109  return I.modifiesRegister(AArch64::NZCV, TRI) ||
4110  I.readsRegister(AArch64::NZCV, TRI);
4111  });
4112 }
4113 
4114 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
4115  const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
4116  // The smallest scalable elements supported by scaled SVE addressing
4117  // modes are predicates, which are 2 scalable bytes in size. So the scalable
4118  // byte offset must always be a multiple of 2.
4119  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
4120 
4121  // VGSized offsets are divided by '2', because the VG register is the
4122  // number of 64bit granules as opposed to 128bit vector chunks,
4123  // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
4124  // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
4125  // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
4126  ByteSized = Offset.getFixed();
4127  VGSized = Offset.getScalable() / 2;
4128 }
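// Worked example (assumed values): StackOffset::get(16, 8), i.e. 16 fixed
// bytes plus 8 scalable bytes, decomposes into ByteSized == 16 and
// VGSized == 4; 4 * VG equals 8 * vscale bytes because the VG register holds
// twice vscale (the number of 64bit granules).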
4129 
4130 /// Returns the offset in parts to which this frame offset can be
4131 /// decomposed for the purpose of describing a frame offset.
4132 /// For non-scalable offsets this is simply its byte size.
4133 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
4134  const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
4135  int64_t &NumDataVectors) {
4136  // The smallest scalable elements supported by scaled SVE addressing
4137  // modes are predicates, which are 2 scalable bytes in size. So the scalable
4138  // byte offset must always be a multiple of 2.
4139  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
4140 
4141  NumBytes = Offset.getFixed();
4142  NumDataVectors = 0;
4143  NumPredicateVectors = Offset.getScalable() / 2;
4144  // This method is used to get the offsets to adjust the frame offset.
4145  // If the function requires ADDPL to be used and needs more than two ADDPL
4146  // instructions, part of the offset is folded into NumDataVectors so that it
4147  // uses ADDVL for part of it, reducing the number of ADDPL instructions.
4148  if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
4149  NumPredicateVectors > 62) {
4150  NumDataVectors = NumPredicateVectors / 8;
4151  NumPredicateVectors -= NumDataVectors * 8;
4152  }
4153 }
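// Worked examples (assumed values): a scalable offset of 6 bytes yields
// NumPredicateVectors == 3 and NumDataVectors == 0 (handled with ADDPL), while
// a scalable offset of 144 bytes yields 72 predicate-vector units, which is
// out of the preferred range, so it is refolded into NumDataVectors == 9 and
// NumPredicateVectors == 0 (handled with a single ADDVL).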
4154 
4155 // Convenience function to create a DWARF expression for
4156 // Expr + NumBytes + NumVGScaledBytes * AArch64::VG
4157 static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes,
4158  int NumVGScaledBytes, unsigned VG,
4159  llvm::raw_string_ostream &Comment) {
4160  uint8_t buffer[16];
4161 
4162  if (NumBytes) {
4163  Expr.push_back(dwarf::DW_OP_consts);
4164  Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
4165  Expr.push_back((uint8_t)dwarf::DW_OP_plus);
4166  Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
4167  }
4168 
4169  if (NumVGScaledBytes) {
4170  Expr.push_back((uint8_t)dwarf::DW_OP_consts);
4171  Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));
4172 
4173  Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
4174  Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
4175  Expr.push_back(0);
4176 
4177  Expr.push_back((uint8_t)dwarf::DW_OP_mul);
4178  Expr.push_back((uint8_t)dwarf::DW_OP_plus);
4179 
4180  Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
4181  << std::abs(NumVGScaledBytes) << " * VG";
4182  }
4183 }
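// Illustrative output (assumed inputs): for NumBytes == 16 and
// NumVGScaledBytes == 8 the appended operations are roughly
//   DW_OP_consts 16, DW_OP_plus,
//   DW_OP_consts 8, DW_OP_bregx VG 0, DW_OP_mul, DW_OP_plus
// and the comment stream receives " + 16 + 8 * VG".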
4184 
4185 // Creates an MCCFIInstruction:
4186 // { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
4187 static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
4188  unsigned Reg,
4189  const StackOffset &Offset) {
4190  int64_t NumBytes, NumVGScaledBytes;
4191  AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
4192  NumVGScaledBytes);
4193  std::string CommentBuffer;
4194  llvm::raw_string_ostream Comment(CommentBuffer);
4195 
4196  if (Reg == AArch64::SP)
4197  Comment << "sp";
4198  else if (Reg == AArch64::FP)
4199  Comment << "fp";
4200  else
4201  Comment << printReg(Reg, &TRI);
4202 
4203  // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG)
4204  SmallString<64> Expr;
4205  unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
4206  Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));
4207  Expr.push_back(0);
4208  appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
4209  TRI.getDwarfRegNum(AArch64::VG, true), Comment);
4210 
4211  // Wrap this into DW_CFA_def_cfa.
4212  SmallString<64> DefCfaExpr;
4213  DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
4214  uint8_t buffer[16];
4215  DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer));
4216  DefCfaExpr.append(Expr.str());
4217  return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(),
4218  Comment.str());
4219 }
4220 
4221 MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
4222  unsigned FrameReg, unsigned Reg,
4223  const StackOffset &Offset,
4224  bool LastAdjustmentWasScalable) {
4225  if (Offset.getScalable())
4226  return createDefCFAExpression(TRI, Reg, Offset);
4227 
4228  if (FrameReg == Reg && !LastAdjustmentWasScalable)
4229  return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
4230 
4231  unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
4232  return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
4233 }
4234 
4235 MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
4236  unsigned Reg,
4237  const StackOffset &OffsetFromDefCFA) {
4238  int64_t NumBytes, NumVGScaledBytes;
4239  AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
4240  OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
4241 
4242  unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
4243 
4244  // Non-scalable offsets can use DW_CFA_offset directly.
4245  if (!NumVGScaledBytes)
4246  return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
4247 
4248  std::string CommentBuffer;
4249  llvm::raw_string_ostream Comment(CommentBuffer);
4250  Comment << printReg(Reg, &TRI) << " @ cfa";
4251 
4252  // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
4253  SmallString<64> OffsetExpr;
4254  appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
4255  TRI.getDwarfRegNum(AArch64::VG, true), Comment);
4256 
4257  // Wrap this into DW_CFA_expression
4258  SmallString<64> CfaExpr;
4259  CfaExpr.push_back(dwarf::DW_CFA_expression);
4260  uint8_t buffer[16];
4261  CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
4262  CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
4263  CfaExpr.append(OffsetExpr.str());
4264 
4265  return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str());
4266 }
4267 
4268 // Helper function to emit a frame offset adjustment from a given
4269 // pointer (SrcReg), stored into DestReg. This function is explicit
4270 // in that it requires the opcode.
4271 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
4272  MachineBasicBlock::iterator MBBI,
4273  const DebugLoc &DL, unsigned DestReg,
4274  unsigned SrcReg, int64_t Offset, unsigned Opc,
4275  const TargetInstrInfo *TII,
4276  MachineInstr::MIFlag Flag, bool NeedsWinCFI,
4277  bool *HasWinCFI, bool EmitCFAOffset,
4278  StackOffset CFAOffset, unsigned FrameReg) {
4279  int Sign = 1;
4280  unsigned MaxEncoding, ShiftSize;
4281  switch (Opc) {
4282  case AArch64::ADDXri:
4283  case AArch64::ADDSXri:
4284  case AArch64::SUBXri:
4285  case AArch64::SUBSXri:
4286  MaxEncoding = 0xfff;
4287  ShiftSize = 12;
4288  break;
4289  case AArch64::ADDVL_XXI:
4290  case AArch64::ADDPL_XXI:
4291  case AArch64::ADDSVL_XXI:
4292  case AArch64::ADDSPL_XXI:
4293  MaxEncoding = 31;
4294  ShiftSize = 0;
4295  if (Offset < 0) {
4296  MaxEncoding = 32;
4297  Sign = -1;
4298  Offset = -Offset;
4299  }
4300  break;
4301  default:
4302  llvm_unreachable("Unsupported opcode");
4303  }
4304 
4305  // `Offset` can be in bytes or in "scalable bytes".
4306  int VScale = 1;
4307  if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
4308  VScale = 16;
4309  else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
4310  VScale = 2;
4311 
4312  // FIXME: If the offset won't fit in 24-bits, compute the offset into a
4313  // scratch register. If DestReg is a virtual register, use it as the
4314  // scratch register; otherwise, create a new virtual register (to be
4315  // replaced by the scavenger at the end of PEI). That case can be optimized
4316  // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
4317  // register can be loaded with offset%8 and the add/sub can use an extending
4318  // instruction with LSL#3.
4319  // Currently the function handles any offsets but generates a poor sequence
4320  // of code.
4321  // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
4322 
4323  const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
4324  Register TmpReg = DestReg;
4325  if (TmpReg == AArch64::XZR)
4326  TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
4327  &AArch64::GPR64RegClass);
4328  do {
4329  uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
4330  unsigned LocalShiftSize = 0;
4331  if (ThisVal > MaxEncoding) {
4332  ThisVal = ThisVal >> ShiftSize;
4333  LocalShiftSize = ShiftSize;
4334  }
4335  assert((ThisVal >> ShiftSize) <= MaxEncoding &&
4336  "Encoding cannot handle value that big");
4337 
4338  Offset -= ThisVal << LocalShiftSize;
4339  if (Offset == 0)
4340  TmpReg = DestReg;
4341  auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
4342  .addReg(SrcReg)
4343  .addImm(Sign * (int)ThisVal);
4344  if (ShiftSize)
4345  MBI = MBI.addImm(
4346  AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
4347  MBI = MBI.setMIFlag(Flag);
4348 
4349  auto Change =
4350  VScale == 1
4351  ? StackOffset::getFixed(ThisVal << LocalShiftSize)
4352  : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
4353  if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
4354  CFAOffset += Change;
4355  else
4356  CFAOffset -= Change;
4357  if (EmitCFAOffset && DestReg == TmpReg) {
4358  MachineFunction &MF = *MBB.getParent();
4359  const TargetSubtargetInfo &STI = MF.getSubtarget();
4360  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
4361 
4362  unsigned CFIIndex = MF.addFrameInst(
4363  createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
4364  BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
4365  .addCFIIndex(CFIIndex)
4366  .setMIFlags(Flag);
4367  }
4368 
4369  if (NeedsWinCFI) {
4370  assert(Sign == 1 && "SEH directives should always have a positive sign");
4371  int Imm = (int)(ThisVal << LocalShiftSize);
4372  if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
4373  (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
4374  if (HasWinCFI)
4375  *HasWinCFI = true;
4376  if (Imm == 0)
4377  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
4378  else
4379  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
4380  .addImm(Imm)
4381  .setMIFlag(Flag);
4382  assert(Offset == 0 && "Expected remaining offset to be zero to "
4383  "emit a single SEH directive");
4384  } else if (DestReg == AArch64::SP) {
4385  if (HasWinCFI)
4386  *HasWinCFI = true;
4387  assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
4388  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
4389  .addImm(Imm)
4390  .setMIFlag(Flag);
4391  }
4392  }
4393 
4394  SrcReg = TmpReg;
4395  } while (Offset);
4396 }
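// Illustrative expansion (assumed values, not from the original source):
// adding 0x12345 bytes to SP with ADDXri does not fit a single 12-bit
// immediate, so the loop is expected to emit roughly
//   add sp, sp, #0x12, lsl #12
//   add sp, sp, #0x345
// interleaving CFI or SEH directives when the caller asks for them.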
4397 
4398 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
4399  MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
4400  unsigned DestReg, unsigned SrcReg,
4401  StackOffset Offset, const TargetInstrInfo *TII,
4402  MachineInstr::MIFlag Flag, bool SetNZCV,
4403  bool NeedsWinCFI, bool *HasWinCFI,
4404  bool EmitCFAOffset, StackOffset CFAOffset,
4405  unsigned FrameReg) {
4406  // If a function is marked as arm_locally_streaming, then the runtime value of
4407  // vscale in the prologue/epilogue is different from the runtime value of vscale
4408  // in the function's body. To avoid having to consider multiple vscales,
4409  // we can use `addsvl` to allocate any scalable stack-slots, which under
4410  // most circumstances will be only locals, not callee-save slots.
4411  const Function &F = MBB.getParent()->getFunction();
4412  bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
4413 
4414  int64_t Bytes, NumPredicateVectors, NumDataVectors;
4415  AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
4416  Offset, Bytes, NumPredicateVectors, NumDataVectors);
4417 
4418  // First emit non-scalable frame offsets, or a simple 'mov'.
4419  if (Bytes || (!Offset && SrcReg != DestReg)) {
4420  assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
4421  "SP increment/decrement not 8-byte aligned");
4422  unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
4423  if (Bytes < 0) {
4424  Bytes = -Bytes;
4425  Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
4426  }
4427  emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
4428  NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
4429  FrameReg);
4430  CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
4431  ? StackOffset::getFixed(-Bytes)
4432  : StackOffset::getFixed(Bytes);
4433  SrcReg = DestReg;
4434  FrameReg = DestReg;
4435  }
4436 
4437  assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
4438  "SetNZCV not supported with SVE vectors");
4439  assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
4440  "WinCFI not supported with SVE vectors");
4441 
4442  if (NumDataVectors) {
4443  emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
4444  UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI,
4445  TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
4446  CFAOffset, FrameReg);
4447  CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
4448  SrcReg = DestReg;
4449  }
4450 
4451  if (NumPredicateVectors) {
4452  assert(DestReg != AArch64::SP && "Unaligned access to SP");
4453  emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
4454  UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI,
4455  TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
4456  CFAOffset, FrameReg);
4457  }
4458 }
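// Usage sketch (hypothetical arguments): allocating 48 bytes of stack in a
// prologue could look like
//   emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
//                   StackOffset::getFixed(-48), TII, MachineInstr::FrameSetup);
// and a mixed offset such as StackOffset::get(-48, -32) would additionally
// emit an ADDVL/ADDPL adjustment for the scalable part.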
4459 
4460 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
4461  MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
4462  MachineBasicBlock::iterator InsertPt, int FrameIndex,
4463  LiveIntervals *LIS, VirtRegMap *VRM) const {
4464  // This is a bit of a hack. Consider this instruction:
4465  //
4466  // %0 = COPY %sp; GPR64all:%0
4467  //
4468  // We explicitly chose GPR64all for the virtual register so such a copy might
4469  // be eliminated by RegisterCoalescer. However, that may not be possible, and
4470  // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
4471  // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
4472  //
4473  // To prevent that, we are going to constrain the %0 register class here.
4474  //
4475  // <rdar://problem/11522048>
4476  //
4477  if (MI.isFullCopy()) {
4478  Register DstReg = MI.getOperand(0).getReg();
4479  Register SrcReg = MI.getOperand(1).getReg();
4480  if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) {
4481  MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
4482  return nullptr;
4483  }
4484  if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) {
4485  MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
4486  return nullptr;
4487  }
4488  // Nothing can be folded with a copy from/to NZCV.
4489  if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
4490  return nullptr;
4491  }
4492 
4493  // Handle the case where a copy is being spilled or filled but the source
4494  // and destination register class don't match. For example:
4495  //
4496  // %0 = COPY %xzr; GPR64common:%0
4497  //
4498  // In this case we can still safely fold away the COPY and generate the
4499  // following spill code:
4500  //
4501  // STRXui %xzr, %stack.0
4502  //
4503  // This also eliminates spilled cross register class COPYs (e.g. between x and
4504  // d regs) of the same size. For example:
4505  //
4506  // %0 = COPY %1; GPR64:%0, FPR64:%1
4507  //
4508  // will be filled as
4509  //
4510  // LDRDui %0, fi<#0>
4511  //
4512  // instead of
4513  //
4514  // LDRXui %Temp, fi<#0>
4515  // %0 = FMOV %Temp
4516  //
4517  if (MI.isCopy() && Ops.size() == 1 &&
4518  // Make sure we're only folding the explicit COPY defs/uses.
4519  (Ops[0] == 0 || Ops[0] == 1)) {
4520  bool IsSpill = Ops[0] == 0;
4521  bool IsFill = !IsSpill;
4522  const TargetRegisterInfo &TRI = getRegisterInfo();
4523  const MachineRegisterInfo &MRI = MF.getRegInfo();
4524  MachineBasicBlock &MBB = *MI.getParent();
4525  const MachineOperand &DstMO = MI.getOperand(0);
4526  const MachineOperand &SrcMO = MI.getOperand(1);
4527  Register DstReg = DstMO.getReg();
4528  Register SrcReg = SrcMO.getReg();
4529  // This is slightly expensive to compute for physical regs since
4530  // getMinimalPhysRegClass is slow.
4531  auto getRegClass = [&](unsigned Reg) {
4532  return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
4533  : TRI.getMinimalPhysRegClass(Reg);
4534  };
4535 
4536  if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
4537  assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
4538  TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
4539  "Mismatched register size in non subreg COPY");
4540  if (IsSpill)
4541  storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
4542  getRegClass(SrcReg), &TRI);
4543  else
4544  loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
4545  getRegClass(DstReg), &TRI);
4546  return &*--InsertPt;
4547  }
4548 
4549  // Handle cases like spilling def of:
4550  //
4551  // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
4552  //
4553  // where the physical register source can be widened and stored to the full
4554  // virtual reg destination stack slot, in this case producing:
4555  //
4556  // STRXui %xzr, %stack.0
4557  //
4558  if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) {
4559  assert(SrcMO.getSubReg() == 0 &&
4560  "Unexpected subreg on physical register");
4561  const TargetRegisterClass *SpillRC;
4562  unsigned SpillSubreg;
4563  switch (DstMO.getSubReg()) {
4564  default:
4565  SpillRC = nullptr;
4566  break;
4567  case AArch64::sub_32:
4568  case AArch64::ssub:
4569  if (AArch64::GPR32RegClass.contains(SrcReg)) {
4570  SpillRC = &AArch64::GPR64RegClass;
4571  SpillSubreg = AArch64::sub_32;
4572  } else if (AArch64::FPR32RegClass.contains(SrcReg)) {
4573  SpillRC = &AArch64::FPR64RegClass;
4574  SpillSubreg = AArch64::ssub;
4575  } else
4576  SpillRC = nullptr;
4577  break;
4578  case AArch64::dsub:
4579  if (AArch64::FPR64RegClass.contains(SrcReg)) {
4580  SpillRC = &AArch64::FPR128RegClass;
4581  SpillSubreg = AArch64::dsub;
4582  } else
4583  SpillRC = nullptr;
4584  break;
4585  }
4586 
4587  if (SpillRC)
4588  if (unsigned WidenedSrcReg =
4589  TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
4590  storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
4591  FrameIndex, SpillRC, &TRI);
4592  return &*--InsertPt;
4593  }
4594  }
4595 
4596  // Handle cases like filling use of:
4597  //
4598  // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
4599  //
4600  // where we can load the full virtual reg source stack slot, into the subreg
4601  // destination, in this case producing:
4602  //
4603  // LDRWui %0:sub_32<def,read-undef>, %stack.0
4604  //
4605  if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
4606  const TargetRegisterClass *FillRC;
4607  switch (DstMO.getSubReg()) {
4608  default:
4609  FillRC = nullptr;
4610  break;
4611  case AArch64::sub_32:
4612  FillRC = &AArch64::GPR32RegClass;
4613  break;
4614  case AArch64::ssub:
4615  FillRC = &AArch64::FPR32RegClass;
4616  break;
4617  case AArch64::dsub:
4618  FillRC = &AArch64::FPR64RegClass;
4619  break;
4620  }
4621 
4622  if (FillRC) {
4623  assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
4624  TRI.getRegSizeInBits(*FillRC) &&
4625  "Mismatched regclass size on folded subreg COPY");
4626  loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
4627  MachineInstr &LoadMI = *--InsertPt;
4628  MachineOperand &LoadDst = LoadMI.getOperand(0);
4629  assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
4630  LoadDst.setSubReg(DstMO.getSubReg());
4631  LoadDst.setIsUndef();
4632  return &LoadMI;
4633  }
4634  }
4635  }
4636 
4637  // Cannot fold.
4638  return nullptr;
4639 }
4640 
4641 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
4642  StackOffset &SOffset,
4643  bool *OutUseUnscaledOp,
4644  unsigned *OutUnscaledOp,
4645  int64_t *EmittableOffset) {
4646  // Set output values in case of early exit.
4647  if (EmittableOffset)
4648  *EmittableOffset = 0;
4649  if (OutUseUnscaledOp)
4650  *OutUseUnscaledOp = false;
4651  if (OutUnscaledOp)
4652  *OutUnscaledOp = 0;
4653 
4654  // Exit early for structured vector spills/fills as they can't take an
4655  // immediate offset.
4656  switch (MI.getOpcode()) {
4657  default:
4658  break;
4659  case AArch64::LD1Twov2d:
4660  case AArch64::LD1Threev2d:
4661  case AArch64::LD1Fourv2d:
4662  case AArch64::LD1Twov1d:
4663  case AArch64::LD1Threev1d:
4664  case AArch64::LD1Fourv1d:
4665  case AArch64::ST1Twov2d:
4666  case AArch64::ST1Threev2d:
4667  case AArch64::ST1Fourv2d:
4668  case AArch64::ST1Twov1d:
4669  case AArch64::ST1Threev1d:
4670  case AArch64::ST1Fourv1d:
4671  case AArch64::ST1i8:
4672  case AArch64::ST1i16:
4673  case AArch64::ST1i32:
4674  case AArch64::ST1i64:
4675  case AArch64::IRG:
4676  case AArch64::IRGstack:
4677  case AArch64::STGloop:
4678  case AArch64::STZGloop:
4679  return AArch64FrameOffsetCannotUpdate;
4680  }
4681 
4682  // Get the min/max offset and the scale.
4683  TypeSize ScaleValue(0U, false);
4684  unsigned Width;
4685  int64_t MinOff, MaxOff;
4686  if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
4687  MaxOff))
4688  llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
4689 
4690  // Construct the complete offset.
4691  bool IsMulVL = ScaleValue.isScalable();
4692  unsigned Scale = ScaleValue.getKnownMinSize();
4693  int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
4694 
4695  const MachineOperand &ImmOpnd =
4696  MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
4697  Offset += ImmOpnd.getImm() * Scale;
4698 
4699  // If the offset doesn't match the scale, we rewrite the instruction to
4700  // use the unscaled instruction instead. Likewise, if we have a negative
4701  // offset and there is an unscaled op to use.
4702  Optional<unsigned> UnscaledOp =
4703  AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
4704  bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
4705  if (useUnscaledOp &&
4706  !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
4707  MaxOff))
4708  llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
4709 
4710  Scale = ScaleValue.getKnownMinSize();
4711  assert(IsMulVL == ScaleValue.isScalable() &&
4712  "Unscaled opcode has different value for scalable");
4713 
4714  int64_t Remainder = Offset % Scale;
4715  assert(!(Remainder && useUnscaledOp) &&
4716  "Cannot have remainder when using unscaled op");
4717 
4718  assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
4719  int64_t NewOffset = Offset / Scale;
4720  if (MinOff <= NewOffset && NewOffset <= MaxOff)
4721  Offset = Remainder;
4722  else {
4723  NewOffset = NewOffset < 0 ? MinOff : MaxOff;
4724  Offset = Offset - NewOffset * Scale + Remainder;
4725  }
4726 
4727  if (EmittableOffset)
4728  *EmittableOffset = NewOffset;
4729  if (OutUseUnscaledOp)
4730  *OutUseUnscaledOp = useUnscaledOp;
4731  if (OutUnscaledOp && UnscaledOp)
4732  *OutUnscaledOp = *UnscaledOp;
4733 
4734  if (IsMulVL)
4735  SOffset = StackOffset::get(SOffset.getFixed(), Offset);
4736  else
4737  SOffset = StackOffset::get(Offset, SOffset.getScalable());
4738  return AArch64FrameOffsetCanUpdate |
4739  (SOffset ? 0 : AArch64FrameOffsetIsLegal);
4740 }
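// Worked example (assumed instruction and no scalable part): for an LDRXui
// whose combined byte offset is 20, the offset is not a multiple of the scale
// (8), so the routine is expected to pick the unscaled LDURXi form, report
// EmittableOffset == 20, leave no residual offset, and return
// AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal.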
4741 
4742 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
4743  unsigned FrameReg, StackOffset &Offset,
4744  const AArch64InstrInfo *TII) {
4745  unsigned Opcode = MI.getOpcode();
4746  unsigned ImmIdx = FrameRegIdx + 1;
4747 
4748  if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
4749  Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
4750  emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
4751  MI.getOperand(0).getReg(), FrameReg, Offset, TII,
4752  MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
4753  MI.eraseFromParent();
4754  Offset = StackOffset();
4755  return true;
4756  }
4757 
4758  int64_t NewOffset;
4759  unsigned UnscaledOp;
4760  bool UseUnscaledOp;
4761  int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
4762  &UnscaledOp, &NewOffset);
4763  if (Status & AArch64FrameOffsetCanUpdate) {
4764  if (Status & AArch64FrameOffsetIsLegal)
4765  // Replace the FrameIndex with FrameReg.
4766  MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
4767  if (UseUnscaledOp)
4768  MI.setDesc(TII->get(UnscaledOp));
4769 
4770  MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
4771  return !Offset;
4772  }
4773 
4774  return false;
4775 }
4776 
4777 MCInst AArch64InstrInfo::getNop() const {
4778  return MCInstBuilder(AArch64::HINT).addImm(0);
4779 }
4780 
4781 // AArch64 supports MachineCombiner.
4782 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
4783 
4784 // True when Opc sets flag
4785 static bool isCombineInstrSettingFlag(unsigned Opc) {
4786  switch (Opc) {
4787  case AArch64::ADDSWrr:
4788  case AArch64::ADDSWri:
4789  case AArch64::ADDSXrr:
4790  case AArch64::ADDSXri:
4791  case AArch64::SUBSWrr:
4792  case AArch64::SUBSXrr:
4793  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
4794  case AArch64::SUBSWri:
4795  case AArch64::SUBSXri:
4796  return true;
4797  default:
4798  break;
4799  }
4800  return false;
4801 }
4802 
4803 // 32b Opcodes that can be combined with a MUL
4804 static bool isCombineInstrCandidate32(unsigned Opc) {
4805  switch (Opc) {
4806  case AArch64::ADDWrr:
4807  case AArch64::ADDWri:
4808  case AArch64::SUBWrr:
4809  case AArch64::ADDSWrr:
4810  case AArch64::ADDSWri:
4811  case AArch64::SUBSWrr:
4812  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
4813  case AArch64::SUBWri:
4814  case AArch64::SUBSWri:
4815  return true;
4816  default:
4817  break;
4818  }
4819  return false;
4820 }
4821 
4822 // 64b Opcodes that can be combined with a MUL
4823 static bool isCombineInstrCandidate64(unsigned Opc) {
4824  switch (Opc) {
4825  case AArch64::ADDXrr:
4826  case AArch64::ADDXri:
4827  case AArch64::SUBXrr:
4828  case AArch64::ADDSXrr:
4829  case AArch64::ADDSXri:
4830  case AArch64::SUBSXrr:
4831  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
4832  case AArch64::SUBXri:
4833  case AArch64::SUBSXri:
4834  case AArch64::ADDv8i8:
4835  case AArch64::ADDv16i8:
4836  case AArch64::ADDv4i16:
4837  case AArch64::ADDv8i16:
4838  case AArch64::ADDv2i32:
4839  case AArch64::ADDv4i32:
4840  case AArch64::SUBv8i8:
4841  case AArch64::SUBv16i8:
4842  case AArch64::SUBv4i16:
4843  case AArch64::SUBv8i16:
4844  case AArch64::SUBv2i32:
4845  case AArch64::SUBv4i32:
4846  return true;
4847  default:
4848  break;
4849  }
4850  return false;
4851 }
4852 
4853 // FP Opcodes that can be combined with a FMUL.
4854 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
4855  switch (Inst.getOpcode()) {
4856  default:
4857  break;
4858  case AArch64::FADDHrr:
4859  case AArch64::FADDSrr:
4860  case AArch64::FADDDrr:
4861  case AArch64::FADDv4f16:
4862  case AArch64::FADDv8f16:
4863  case AArch64::FADDv2f32:
4864  case AArch64::FADDv2f64:
4865  case AArch64::FADDv4f32:
4866  case AArch64::FSUBHrr:
4867  case AArch64::FSUBSrr:
4868  case AArch64::FSUBDrr:
4869  case AArch64::FSUBv4f16:
4870  case AArch64::FSUBv8f16:
4871  case AArch64::FSUBv2f32:
4872  case AArch64::FSUBv2f64:
4873  case AArch64::FSUBv4f32:
4874  TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
4875  // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
4876  // the target options or if FADD/FSUB has the contract fast-math flag.
4877  return Options.UnsafeFPMath ||
4878  Options.AllowFPOpFusion == FPOpFusion::Fast ||
4879  Inst.getFlag(MachineInstr::FmContract);
4880  return true;
4881  }
4882  return false;
4883 }
4884 
4885 // Opcodes that can be combined with a MUL
4886 static bool isCombineInstrCandidate(unsigned Opc) {
4887  return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
4888 }
4889 
4890 //
4891 // Utility routine that checks if \param MO is defined by an
4892 // \param CombineOpc instruction in the basic block \param MBB
4893 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
4894  unsigned CombineOpc, unsigned ZeroReg = 0,
4895  bool CheckZeroReg = false) {
4896  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4897  MachineInstr *MI = nullptr;
4898 
4899  if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
4900  MI = MRI.getUniqueVRegDef(MO.getReg());
4901  // And it needs to be in the trace (otherwise, it won't have a depth).
4902  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
4903  return false;
4904  // Must only be used by the user we combine with.
4905  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
4906  return false;
4907 
4908  if (CheckZeroReg) {
4909  assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
4910  MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
4911  MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
4912  // The third input reg must be zero.
4913  if (MI->getOperand(3).getReg() != ZeroReg)
4914  return false;
4915  }
4916 
4917  if (isCombineInstrSettingFlag(CombineOpc) &&
4918  MI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
4919  return false;
4920 
4921  return true;
4922 }
4923 
4924 //
4925 // Is \param MO defined by an integer multiply and can be combined?
4926 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
4927  unsigned MulOpc, unsigned ZeroReg) {
4928  return canCombine(MBB, MO, MulOpc, ZeroReg, true);
4929 }
4930 
4931 //
4932 // Is \param MO defined by a floating-point multiply and can be combined?
4933 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
4934  unsigned MulOpc) {
4935  return canCombine(MBB, MO, MulOpc);
4936 }
4937 
4938 // TODO: There are many more machine instruction opcodes to match:
4939 // 1. Other data types (integer, vectors)
4940 // 2. Other math / logic operations (xor, or)
4941 // 3. Other forms of the same operation (intrinsics and other variants)
4942 bool AArch64InstrInfo::isAssociativeAndCommutative(
4943  const MachineInstr &Inst) const {
4944  switch (Inst.getOpcode()) {
4945  case AArch64::FADDDrr:
4946  case AArch64::FADDSrr:
4947  case AArch64::FADDv2f32:
4948  case AArch64::FADDv2f64:
4949  case AArch64::FADDv4f32:
4950  case AArch64::FMULDrr:
4951  case AArch64::FMULSrr:
4952  case AArch64::FMULX32:
4953  case AArch64::FMULX64:
4954  case AArch64::FMULXv2f32:
4955  case AArch64::FMULXv2f64:
4956  case AArch64::FMULXv4f32:
4957  case AArch64::FMULv2f32:
4958  case AArch64::FMULv2f64:
4959  case AArch64::FMULv4f32:
4960  return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
4961  (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
4962  Inst.getFlag(MachineInstr::MIFlag::FmNsz));
4963  case AArch64::ADDXrr:
4964  case AArch64::ANDXrr:
4965  case AArch64::ORRXrr:
4966  case AArch64::EORXrr:
4967  case AArch64::EONXrr:
4968  case AArch64::ADDWrr:
4969  case AArch64::ANDWrr:
4970  case AArch64::ORRWrr:
4971  case AArch64::EORWrr:
4972  case AArch64::EONWrr:
4973  return true;
4974  default:
4975  return false;
4976  }
4977 }
4978 
4979 /// Find instructions that can be turned into madd.
4980 static bool getMaddPatterns(MachineInstr &Root,
4981  SmallVectorImpl<MachineCombinerPattern> &Patterns) {
4982  unsigned Opc = Root.getOpcode();
4983  MachineBasicBlock &MBB = *Root.getParent();
4984  bool Found = false;
4985 
4986  if (!isCombineInstrCandidate(Opc))
4987  return false;
4988  if (isCombineInstrSettingFlag(Opc)) {
4989  int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
4990  // When NZCV is live bail out.
4991  if (Cmp_NZCV == -1)
4992  return false;
4993  unsigned NewOpc = convertToNonFlagSettingOpc(Root);
4994  // When opcode can't change bail out.
4995  // CHECKME: do we miss any cases for opcode conversion?
4996  if (NewOpc == Opc)
4997  return false;
4998  Opc = NewOpc;
4999  }
5000 
5001  auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
5002  MachineCombinerPattern Pattern) {
5003  if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
5004  Patterns.push_back(Pattern);
5005  Found = true;
5006  }
5007  };
5008 
5009  auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) {
5010  if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
5011  Patterns.push_back(Pattern);
5012  Found = true;
5013  }
5014  };
5015 
5016  typedef MachineCombinerPattern MCP;
5017 
5018  switch (Opc) {
5019  default:
5020  break;
5021  case AArch64::ADDWrr:
5022  assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
5023  "ADDWrr does not have register operands");
5024  setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
5025  setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
5026  break;
5027  case AArch64::ADDXrr:
5028  setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
5029  setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
5030  break;
5031  case AArch64::SUBWrr:
5032  setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
5033  setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
5034  break;
5035  case AArch64::SUBXrr:
5036  setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
5037  setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
5038  break;
5039  case AArch64::ADDWri:
5040  setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
5041  break;
5042  case AArch64::ADDXri:
5043  setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
5044  break;
5045  case AArch64::SUBWri:
5046  setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
5047  break;
5048  case AArch64::SUBXri:
5049  setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
5050  break;
5051  case AArch64::ADDv8i8:
5052  setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
5053  setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
5054  break;
5055  case AArch64::ADDv16i8:
5056  setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
5057  setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
5058  break;
5059  case AArch64::ADDv4i16:
5060  setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
5061  setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
5062  setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
5063  setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
5064  break;
5065  case AArch64::ADDv8i16:
5066  setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
5067  setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
5068  setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
5069  setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
5070  break;
5071  case AArch64::ADDv2i32:
5072  setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
5073  setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
5074  setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
5075  setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
5076  break;
5077  case AArch64::ADDv4i32:
5078  setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
5079  setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
5080  setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
5081  setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
5082  break;
5083  case AArch64::SUBv8i8:
5084  setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
5085  setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
5086  break;
5087  case AArch64::SUBv16i8:
5088  setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
5089  setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
5090  break;
5091  case AArch64::SUBv4i16:
5092  setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
5093  setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
5094  setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
5095  setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
5096  break;
5097  case AArch64::SUBv8i16:
5098  setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
5099  setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
5100  setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
5101  setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
5102  break;
5103  case AArch64::SUBv2i32:
5104  setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
5105  setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
5106  setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
5107  setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
5108  break;
5109  case AArch64::SUBv4i32:
5110  setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
5111  setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
5112  setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
5113  setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
5114  break;
5115  }
5116  return Found;
5117 }
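// Illustrative match (hypothetical virtual registers):
//   %3:gpr32 = MADDWrrr %1, %2, $wzr   ; a plain multiply
//   %4:gpr32 = ADDWrr %0, %3
// records MULADDW_OP2, letting the machine combiner later rewrite the pair as
// a single MADDWrrr %1, %2, %0.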
5118 /// Floating-Point Support
5119 
5120 /// Find instructions that can be turned into madd.
5121 static bool getFMAPatterns(MachineInstr &Root,
5122  SmallVectorImpl<MachineCombinerPattern> &Patterns) {
5123 
5124  if (!isCombineInstrCandidateFP(Root))
5125  return false;
5126 
5127  MachineBasicBlock &MBB = *Root.getParent();
5128  bool Found = false;
5129 
5130  auto Match = [&](int Opcode, int Operand,
5131  MachineCombinerPattern Pattern) -> bool {
5132  if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
5133  Patterns.push_back(Pattern);
5134  return true;
5135  }
5136  return false;
5137  };
5138 
5139  typedef MachineCombinerPattern MCP;
5140 
5141  switch (Root.getOpcode()) {
5142  default:
5143  assert(false && "Unsupported FP instruction in combiner\n");
5144  break;
5145  case AArch64::FADDHrr:
5146  assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
5147  "FADDHrr does not have register operands");
5148 
5149  Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
5150  Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
5151  break;
5152  case AArch64::FADDSrr:
5153  assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
5154  "FADDSrr does not have register operands");
5155 
5156  Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
5157  Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
5158 
5159  Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
5160  Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
5161  break;
5162  case AArch64::FADDDrr:
5163  Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
5164  Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
5165 
5166  Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
5167  Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
5168  break;
5169  case AArch64::FADDv4f16:
5170  Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
5171  Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
5172 
5173  Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
5174  Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
5175  break;
5176  case AArch64::FADDv8f16:
5177  Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
5178  Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
5179 
5180  Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
5181  Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
5182  break;
5183  case AArch64::FADDv2f32:
5184  Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
5185  Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
5186 
5187  Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
5188  Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
5189  break;
5190  case AArch64::FADDv2f64:
5191  Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
5192  Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
5193 
5194  Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
5195  Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
5196  break;
5197  case AArch64::FADDv4f32:
5198  Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
5199  Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
5200 
5201  Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
5202  Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
5203  break;
5204  case AArch64::FSUBHrr:
5205  Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
5206  Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
5207  Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
5208  break;
5209  case AArch64::FSUBSrr:
5210  Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
5211 
5212  Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
5213  Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
5214 
5215  Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
5216  break;
5217  case AArch64::FSUBDrr:
5218  Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
5219 
5220  Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
5221  Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
5222 
5223  Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
5224  break;
5225  case AArch64::FSUBv4f16:
5226  Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
5227  Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
5228 
5229  Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
5230  Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
5231  break;
5232  case AArch64::FSUBv8f16:
5233  Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
5234  Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
5235 
5236  Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
5237  Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
5238  break;
5239  case AArch64::FSUBv2f32:
5240  Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
5241  Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
5242 
5243  Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
5244  Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
5245  break;
5246  case AArch64::FSUBv2f64:
5247  Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
5248  Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
5249 
5250  Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
5251  Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
5252  break;
5253  case AArch64::FSUBv4f32:
5254  Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
5255  Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
5256 
5257  Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
5258  Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
5259  break;
5260  }
5261  return Found;
5262 }
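// Illustrative match (hypothetical virtual registers, fusion allowed):
//   %3:fpr32 = FMULSrr %1, %2
//   %4:fpr32 = FADDSrr %0, %3
// records FMULADDS_OP2, which the machine combiner can turn into a single
// FMADDSrrr %1, %2, %0.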
5263 
5264 static bool getFMULPatterns(MachineInstr &Root,
5265  SmallVectorImpl<MachineCombinerPattern> &Patterns) {
5266  MachineBasicBlock &MBB = *Root.getParent();
5267  bool Found = false;
5268 
5269  auto Match = [&](unsigned Opcode, int Operand,
5270  MachineCombinerPattern Pattern) -> bool {
5271  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
5272  MachineOperand &MO = Root.getOperand(Operand);
5273  MachineInstr *MI = nullptr;
5274  if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
5275  MI = MRI.getUniqueVRegDef(MO.getReg());
5276  // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
5277  if (MI && MI->getOpcode() == TargetOpcode::COPY &&
5278  MI->getOperand(1).getReg().isVirtual())
5279  MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
5280  if (MI && MI->getOpcode() == Opcode) {
5281  Patterns.push_back(Pattern);
5282  return true;
5283  }
5284  return false;
5285  };
5286 
5287  typedef MachineCombinerPattern MCP;
5288 
5289  switch (Root.getOpcode()) {
5290  default:
5291  return false;
5292  case AArch64::FMULv2f32:
5293  Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
5294  Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
5295  break;
5296  case AArch64::FMULv2f64:
5297  Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
5298  Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
5299  break;