1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64InstrInfo.h"
14 #include "AArch64MachineFunctionInfo.h"
15 #include "AArch64Subtarget.h"
16 #include "MCTargetDesc/AArch64AddressingModes.h"
17 #include "Utils/AArch64BaseInfo.h"
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFrameInfo.h"
23 #include "llvm/CodeGen/MachineFunction.h"
24 #include "llvm/CodeGen/MachineInstr.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineMemOperand.h"
27 #include "llvm/CodeGen/MachineModuleInfo.h"
28 #include "llvm/CodeGen/MachineOperand.h"
29 #include "llvm/CodeGen/MachineRegisterInfo.h"
30 #include "llvm/CodeGen/StackMaps.h"
31 #include "llvm/CodeGen/TargetRegisterInfo.h"
32 #include "llvm/CodeGen/TargetSubtargetInfo.h"
33 #include "llvm/IR/DebugLoc.h"
34 #include "llvm/IR/GlobalValue.h"
35 #include "llvm/MC/MCInst.h"
36 #include "llvm/MC/MCInstrDesc.h"
37 #include "llvm/Support/Casting.h"
38 #include "llvm/Support/CodeGen.h"
39 #include "llvm/Support/CommandLine.h"
40 #include "llvm/Support/Compiler.h"
41 #include "llvm/Support/ErrorHandling.h"
42 #include "llvm/Support/MathExtras.h"
43 #include "llvm/Target/TargetMachine.h"
44 #include "llvm/Target/TargetOptions.h"
45 #include <cassert>
46 #include <cstdint>
47 #include <iterator>
48 #include <utility>
49 
50 using namespace llvm;
51 
52 #define GET_INSTRINFO_CTOR_DTOR
53 #include "AArch64GenInstrInfo.inc"
54 
55 static cl::opt<unsigned> TBZDisplacementBits(
56     "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
57     cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
58 
59 static cl::opt<unsigned> CBZDisplacementBits(
60     "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
61     cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
62 
63 static cl::opt<unsigned>
64  BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
65  cl::desc("Restrict range of Bcc instructions (DEBUG)"));
66 
67 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
68     : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
69  AArch64::CATCHRET),
70  RI(STI.getTargetTriple()), Subtarget(STI) {}
71 
72 /// GetInstSizeInBytes - Return the number of bytes of code the specified
73 /// instruction may be.  This returns the maximum number of bytes.
74 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
75   const MachineBasicBlock &MBB = *MI.getParent();
76  const MachineFunction *MF = MBB.getParent();
77  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
78 
79  if (MI.getOpcode() == AArch64::INLINEASM)
80  return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
81 
82  // FIXME: We currently only handle pseudoinstructions that don't get expanded
83  // before the assembly printer.
84  unsigned NumBytes = 0;
85  const MCInstrDesc &Desc = MI.getDesc();
86  switch (Desc.getOpcode()) {
87  default:
88  // Anything not explicitly designated otherwise is a normal 4-byte insn.
89  NumBytes = 4;
90  break;
91  case TargetOpcode::DBG_VALUE:
93  case TargetOpcode::IMPLICIT_DEF:
94  case TargetOpcode::KILL:
95  NumBytes = 0;
96  break;
97  case TargetOpcode::STACKMAP:
98  // The upper bound for a stackmap intrinsic is the full length of its shadow
99  NumBytes = StackMapOpers(&MI).getNumPatchBytes();
100  assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
101  break;
102  case TargetOpcode::PATCHPOINT:
103  // The size of the patchpoint intrinsic is the number of bytes requested
104  NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
105  assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
106  break;
107   case AArch64::TLSDESC_CALLSEQ:
108     // This gets lowered to an instruction sequence which takes 16 bytes
109  NumBytes = 16;
110  break;
111  case AArch64::JumpTableDest32:
112  case AArch64::JumpTableDest16:
113  case AArch64::JumpTableDest8:
114  NumBytes = 12;
115  break;
116  case AArch64::SPACE:
117  NumBytes = MI.getOperand(1).getImm();
118  break;
119  }
120 
121  return NumBytes;
122 }
123 
124 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
125                             SmallVectorImpl<MachineOperand> &Cond) {
126   // Block ends with fall-through condbranch.
127  switch (LastInst->getOpcode()) {
128  default:
129  llvm_unreachable("Unknown branch instruction?");
130  case AArch64::Bcc:
131  Target = LastInst->getOperand(1).getMBB();
132  Cond.push_back(LastInst->getOperand(0));
133  break;
134  case AArch64::CBZW:
135  case AArch64::CBZX:
136  case AArch64::CBNZW:
137  case AArch64::CBNZX:
138  Target = LastInst->getOperand(1).getMBB();
139     Cond.push_back(MachineOperand::CreateImm(-1));
140     Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
141  Cond.push_back(LastInst->getOperand(0));
142  break;
143  case AArch64::TBZW:
144  case AArch64::TBZX:
145  case AArch64::TBNZW:
146  case AArch64::TBNZX:
147  Target = LastInst->getOperand(2).getMBB();
148     Cond.push_back(MachineOperand::CreateImm(-1));
149     Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
150  Cond.push_back(LastInst->getOperand(0));
151  Cond.push_back(LastInst->getOperand(1));
152  }
153 }
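
// Illustration: the Cond vector built above uses three layouts, which the
// hooks below (reverseBranchCondition, instantiateCondBranch, insertSelect)
// rely on:
//   b.cc    -> { cc }
//   cb(n)z  -> { -1, opcode, reg }
//   tb(n)z  -> { -1, opcode, reg, bit }
// For example, "tbnz w0, #3, bb.2" is recorded as
// { -1, AArch64::TBNZW, w0, 3 } with Target = bb.2.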
154 
155 static unsigned getBranchDisplacementBits(unsigned Opc) {
156  switch (Opc) {
157  default:
158  llvm_unreachable("unexpected opcode!");
159  case AArch64::B:
160  return 64;
161  case AArch64::TBNZW:
162  case AArch64::TBZW:
163  case AArch64::TBNZX:
164  case AArch64::TBZX:
165  return TBZDisplacementBits;
166  case AArch64::CBNZW:
167  case AArch64::CBZW:
168  case AArch64::CBNZX:
169  case AArch64::CBZX:
170  return CBZDisplacementBits;
171  case AArch64::Bcc:
172  return BCCDisplacementBits;
173  }
174 }
175 
176 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
177                                              int64_t BrOffset) const {
178   unsigned Bits = getBranchDisplacementBits(BranchOp);
179   assert(Bits >= 3 && "max branch displacement must be enough to jump "
180                       "over conditional branch expansion");
181   return isIntN(Bits, BrOffset / 4);
182 }
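
// Illustration: offsets are scaled by the 4-byte instruction size before the
// isIntN() check, so an opcode with N displacement bits reaches byte offsets
// in [-2^(N+1), 2^(N+1) - 4].  With the defaults above that is roughly
// +/-32KB for TB[N]Z (14 bits) and +/-1MB for CB[N]Z and Bcc (19 bits).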
183 
184 MachineBasicBlock *AArch64InstrInfo::getBranchDestBlock(
185     const MachineInstr &MI) const {
186   switch (MI.getOpcode()) {
187  default:
188  llvm_unreachable("unexpected opcode!");
189  case AArch64::B:
190  return MI.getOperand(0).getMBB();
191  case AArch64::TBZW:
192  case AArch64::TBNZW:
193  case AArch64::TBZX:
194  case AArch64::TBNZX:
195  return MI.getOperand(2).getMBB();
196  case AArch64::CBZW:
197  case AArch64::CBNZW:
198  case AArch64::CBZX:
199  case AArch64::CBNZX:
200  case AArch64::Bcc:
201  return MI.getOperand(1).getMBB();
202  }
203 }
204 
205 // Branch analysis.
206 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
207                                      MachineBasicBlock *&TBB,
208                                      MachineBasicBlock *&FBB,
209                                      SmallVectorImpl<MachineOperand> &Cond,
210                                      bool AllowModify) const {
211   // If the block has no terminators, it just falls into the block after it.
212   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
213   if (I == MBB.end())
214  return false;
215 
216  if (!isUnpredicatedTerminator(*I))
217  return false;
218 
219  // Get the last instruction in the block.
220  MachineInstr *LastInst = &*I;
221 
222  // If there is only one terminator instruction, process it.
223  unsigned LastOpc = LastInst->getOpcode();
224  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
225  if (isUncondBranchOpcode(LastOpc)) {
226  TBB = LastInst->getOperand(0).getMBB();
227  return false;
228  }
229  if (isCondBranchOpcode(LastOpc)) {
230  // Block ends with fall-through condbranch.
231  parseCondBranch(LastInst, TBB, Cond);
232  return false;
233  }
234  return true; // Can't handle indirect branch.
235  }
236 
237  // Get the instruction before it if it is a terminator.
238  MachineInstr *SecondLastInst = &*I;
239  unsigned SecondLastOpc = SecondLastInst->getOpcode();
240 
241  // If AllowModify is true and the block ends with two or more unconditional
242  // branches, delete all but the first unconditional branch.
243  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
244  while (isUncondBranchOpcode(SecondLastOpc)) {
245  LastInst->eraseFromParent();
246  LastInst = SecondLastInst;
247  LastOpc = LastInst->getOpcode();
248  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
249       // Return now that the only terminator is an unconditional branch.
250  TBB = LastInst->getOperand(0).getMBB();
251  return false;
252  } else {
253  SecondLastInst = &*I;
254  SecondLastOpc = SecondLastInst->getOpcode();
255  }
256  }
257  }
258 
259  // If there are three terminators, we don't know what sort of block this is.
260  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
261  return true;
262 
263  // If the block ends with a B and a Bcc, handle it.
264  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
265  parseCondBranch(SecondLastInst, TBB, Cond);
266  FBB = LastInst->getOperand(0).getMBB();
267  return false;
268  }
269 
270  // If the block ends with two unconditional branches, handle it. The second
271  // one is not executed, so remove it.
272  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
273  TBB = SecondLastInst->getOperand(0).getMBB();
274  I = LastInst;
275  if (AllowModify)
276  I->eraseFromParent();
277  return false;
278  }
279 
280  // ...likewise if it ends with an indirect branch followed by an unconditional
281  // branch.
282  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
283  I = LastInst;
284  if (AllowModify)
285  I->eraseFromParent();
286  return true;
287  }
288 
289  // Otherwise, can't handle this.
290  return true;
291 }
292 
293 bool AArch64InstrInfo::reverseBranchCondition(
294     SmallVectorImpl<MachineOperand> &Cond) const {
295  if (Cond[0].getImm() != -1) {
296  // Regular Bcc
297  AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
298  Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
299  } else {
300  // Folded compare-and-branch
301  switch (Cond[1].getImm()) {
302  default:
303  llvm_unreachable("Unknown conditional branch!");
304  case AArch64::CBZW:
305  Cond[1].setImm(AArch64::CBNZW);
306  break;
307  case AArch64::CBNZW:
308  Cond[1].setImm(AArch64::CBZW);
309  break;
310  case AArch64::CBZX:
311  Cond[1].setImm(AArch64::CBNZX);
312  break;
313  case AArch64::CBNZX:
314  Cond[1].setImm(AArch64::CBZX);
315  break;
316  case AArch64::TBZW:
317  Cond[1].setImm(AArch64::TBNZW);
318  break;
319  case AArch64::TBNZW:
320  Cond[1].setImm(AArch64::TBZW);
321  break;
322  case AArch64::TBZX:
323  Cond[1].setImm(AArch64::TBNZX);
324  break;
325  case AArch64::TBNZX:
326  Cond[1].setImm(AArch64::TBZX);
327  break;
328  }
329  }
330 
331  return false;
332 }
333 
334 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
335                                         int *BytesRemoved) const {
336   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
337   if (I == MBB.end())
338  return 0;
339 
340  if (!isUncondBranchOpcode(I->getOpcode()) &&
341  !isCondBranchOpcode(I->getOpcode()))
342  return 0;
343 
344  // Remove the branch.
345  I->eraseFromParent();
346 
347  I = MBB.end();
348 
349  if (I == MBB.begin()) {
350  if (BytesRemoved)
351  *BytesRemoved = 4;
352  return 1;
353  }
354  --I;
355  if (!isCondBranchOpcode(I->getOpcode())) {
356  if (BytesRemoved)
357  *BytesRemoved = 4;
358  return 1;
359  }
360 
361  // Remove the branch.
362  I->eraseFromParent();
363  if (BytesRemoved)
364  *BytesRemoved = 8;
365 
366  return 2;
367 }
368 
369 void AArch64InstrInfo::instantiateCondBranch(
370  MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
371  ArrayRef<MachineOperand> Cond) const {
372  if (Cond[0].getImm() != -1) {
373  // Regular Bcc
374  BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
375  } else {
376  // Folded compare-and-branch
377  // Note that we use addOperand instead of addReg to keep the flags.
378  const MachineInstrBuilder MIB =
379  BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
380  if (Cond.size() > 3)
381  MIB.addImm(Cond[3].getImm());
382  MIB.addMBB(TBB);
383  }
384 }
385 
386 unsigned AArch64InstrInfo::insertBranch(
387     MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
388     ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
389  // Shouldn't be a fall through.
390  assert(TBB && "insertBranch must not be told to insert a fallthrough");
391 
392  if (!FBB) {
393  if (Cond.empty()) // Unconditional branch?
394  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
395  else
396  instantiateCondBranch(MBB, DL, TBB, Cond);
397 
398  if (BytesAdded)
399  *BytesAdded = 4;
400 
401  return 1;
402  }
403 
404  // Two-way conditional branch.
405  instantiateCondBranch(MBB, DL, TBB, Cond);
406  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
407 
408  if (BytesAdded)
409  *BytesAdded = 8;
410 
411  return 2;
412 }
413 
414 // Find the original register that VReg is copied from.
415 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
416   while (TargetRegisterInfo::isVirtualRegister(VReg)) {
417     const MachineInstr *DefMI = MRI.getVRegDef(VReg);
418  if (!DefMI->isFullCopy())
419  return VReg;
420  VReg = DefMI->getOperand(1).getReg();
421  }
422  return VReg;
423 }
424 
425 // Determine if VReg is defined by an instruction that can be folded into a
426 // csel instruction. If so, return the folded opcode, and the replacement
427 // register.
428 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
429  unsigned *NewVReg = nullptr) {
430  VReg = removeCopies(MRI, VReg);
431   if (!TargetRegisterInfo::isVirtualRegister(VReg))
432     return 0;
433 
434  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
435  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
436  unsigned Opc = 0;
437  unsigned SrcOpNum = 0;
438  switch (DefMI->getOpcode()) {
439  case AArch64::ADDSXri:
440  case AArch64::ADDSWri:
441  // if NZCV is used, do not fold.
442  if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
443  return 0;
444     // fall-through to ADDXri and ADDWri.
445     LLVM_FALLTHROUGH;
446  case AArch64::ADDXri:
447  case AArch64::ADDWri:
448  // add x, 1 -> csinc.
449  if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
450  DefMI->getOperand(3).getImm() != 0)
451  return 0;
452  SrcOpNum = 1;
453  Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
454  break;
455 
456  case AArch64::ORNXrr:
457  case AArch64::ORNWrr: {
458  // not x -> csinv, represented as orn dst, xzr, src.
459  unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
460  if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
461  return 0;
462  SrcOpNum = 2;
463  Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
464  break;
465  }
466 
467  case AArch64::SUBSXrr:
468  case AArch64::SUBSWrr:
469  // if NZCV is used, do not fold.
470  if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
471  return 0;
472     // fall-through to SUBXrr and SUBWrr.
473     LLVM_FALLTHROUGH;
474  case AArch64::SUBXrr:
475  case AArch64::SUBWrr: {
476  // neg x -> csneg, represented as sub dst, xzr, src.
477  unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
478  if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
479  return 0;
480  SrcOpNum = 2;
481  Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
482  break;
483  }
484  default:
485  return 0;
486  }
487  assert(Opc && SrcOpNum && "Missing parameters");
488 
489  if (NewVReg)
490  *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
491  return Opc;
492 }
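
// For example, when the true operand of a select is defined by
//   %1 = ADDWri %0, 1, 0      (i.e. "%1 = %0 + 1")
// canFoldIntoCSel() returns AArch64::CSINCWr with *NewVReg = %0, so
// insertSelect() below can emit a single csinc instead of an add feeding a
// csel.  Likewise ORN from wzr/xzr folds to csinv and SUB from wzr/xzr folds
// to csneg.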
493 
494 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
495                                        ArrayRef<MachineOperand> Cond,
496                                        unsigned TrueReg, unsigned FalseReg,
497  int &CondCycles, int &TrueCycles,
498  int &FalseCycles) const {
499  // Check register classes.
500  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
501  const TargetRegisterClass *RC =
502  RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
503  if (!RC)
504  return false;
505 
506  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
507  unsigned ExtraCondLat = Cond.size() != 1;
508 
509  // GPRs are handled by csel.
510  // FIXME: Fold in x+1, -x, and ~x when applicable.
511  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
512  AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
513  // Single-cycle csel, csinc, csinv, and csneg.
514  CondCycles = 1 + ExtraCondLat;
515  TrueCycles = FalseCycles = 1;
516  if (canFoldIntoCSel(MRI, TrueReg))
517  TrueCycles = 0;
518  else if (canFoldIntoCSel(MRI, FalseReg))
519  FalseCycles = 0;
520  return true;
521  }
522 
523  // Scalar floating point is handled by fcsel.
524  // FIXME: Form fabs, fmin, and fmax when applicable.
525  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
526  AArch64::FPR32RegClass.hasSubClassEq(RC)) {
527  CondCycles = 5 + ExtraCondLat;
528  TrueCycles = FalseCycles = 2;
529  return true;
530  }
531 
532  // Can't do vectors.
533  return false;
534 }
535 
536 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
537                                     MachineBasicBlock::iterator I,
538                                     const DebugLoc &DL, unsigned DstReg,
539                                     ArrayRef<MachineOperand> Cond,
540                                     unsigned TrueReg, unsigned FalseReg) const {
541   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
542 
543   // Parse the condition code, see parseCondBranch() above.
544   AArch64CC::CondCode CC;
545   switch (Cond.size()) {
546  default:
547  llvm_unreachable("Unknown condition opcode in Cond");
548  case 1: // b.cc
549  CC = AArch64CC::CondCode(Cond[0].getImm());
550  break;
551  case 3: { // cbz/cbnz
552  // We must insert a compare against 0.
553  bool Is64Bit;
554  switch (Cond[1].getImm()) {
555  default:
556  llvm_unreachable("Unknown branch opcode in Cond");
557  case AArch64::CBZW:
558  Is64Bit = false;
559  CC = AArch64CC::EQ;
560  break;
561  case AArch64::CBZX:
562  Is64Bit = true;
563  CC = AArch64CC::EQ;
564  break;
565  case AArch64::CBNZW:
566  Is64Bit = false;
567  CC = AArch64CC::NE;
568  break;
569  case AArch64::CBNZX:
570  Is64Bit = true;
571  CC = AArch64CC::NE;
572  break;
573  }
574  unsigned SrcReg = Cond[2].getReg();
575  if (Is64Bit) {
576  // cmp reg, #0 is actually subs xzr, reg, #0.
577  MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
578  BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
579  .addReg(SrcReg)
580  .addImm(0)
581  .addImm(0);
582  } else {
583  MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
584  BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
585  .addReg(SrcReg)
586  .addImm(0)
587  .addImm(0);
588  }
589  break;
590  }
591  case 4: { // tbz/tbnz
592  // We must insert a tst instruction.
593  switch (Cond[1].getImm()) {
594  default:
595  llvm_unreachable("Unknown branch opcode in Cond");
596  case AArch64::TBZW:
597  case AArch64::TBZX:
598  CC = AArch64CC::EQ;
599  break;
600  case AArch64::TBNZW:
601  case AArch64::TBNZX:
602  CC = AArch64CC::NE;
603  break;
604  }
605  // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
606  if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
607  BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
608  .addReg(Cond[2].getReg())
609  .addImm(
610  AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
611  else
612  BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
613  .addReg(Cond[2].getReg())
614  .addImm(
615  AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
616  break;
617  }
618  }
619 
620  unsigned Opc = 0;
621  const TargetRegisterClass *RC = nullptr;
622  bool TryFold = false;
623  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
624  RC = &AArch64::GPR64RegClass;
625  Opc = AArch64::CSELXr;
626  TryFold = true;
627  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
628  RC = &AArch64::GPR32RegClass;
629  Opc = AArch64::CSELWr;
630  TryFold = true;
631  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
632  RC = &AArch64::FPR64RegClass;
633  Opc = AArch64::FCSELDrrr;
634  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
635  RC = &AArch64::FPR32RegClass;
636  Opc = AArch64::FCSELSrrr;
637  }
638  assert(RC && "Unsupported regclass");
639 
640  // Try folding simple instructions into the csel.
641  if (TryFold) {
642  unsigned NewVReg = 0;
643  unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
644  if (FoldedOpc) {
645       // The folded opcodes csinc, csinv and csneg apply the operation to
646       // FalseReg, so we need to invert the condition.
647       CC = AArch64CC::getInvertedCondCode(CC);
648       TrueReg = FalseReg;
649  } else
650  FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
651 
652  // Fold the operation. Leave any dead instructions for DCE to clean up.
653  if (FoldedOpc) {
654  FalseReg = NewVReg;
655  Opc = FoldedOpc;
656       // This extends the live range of NewVReg.
657  MRI.clearKillFlags(NewVReg);
658  }
659  }
660 
661   // Pull all virtual registers into the appropriate class.
662  MRI.constrainRegClass(TrueReg, RC);
663  MRI.constrainRegClass(FalseReg, RC);
664 
665  // Insert the csel.
666  BuildMI(MBB, I, DL, get(Opc), DstReg)
667  .addReg(TrueReg)
668  .addReg(FalseReg)
669  .addImm(CC);
670 }
671 
672 /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
673 static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
674  uint64_t Imm = MI.getOperand(1).getImm();
675  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
676  uint64_t Encoding;
677  return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
678 }
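
// For example, 0x00ff00ff is a valid 32-bit logical immediate (a repeating
// 0x00ff pattern), so "MOVi32imm 0x00ff00ff" can later be expanded to
// "orr wN, wzr, #0x00ff00ff" and is treated as cheap as a move below, whereas
// an arbitrary constant such as 0x12345678 is not encodable this way.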
679 
680 // FIXME: this implementation should be micro-architecture dependent, so a
681 // micro-architecture target hook should be introduced here in future.
682 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
683   if (!Subtarget.hasCustomCheapAsMoveHandling())
684  return MI.isAsCheapAsAMove();
685 
686  const unsigned Opcode = MI.getOpcode();
687 
688  // Firstly, check cases gated by features.
689 
690  if (Subtarget.hasZeroCycleZeroingFP()) {
691  if (Opcode == AArch64::FMOVH0 ||
692  Opcode == AArch64::FMOVS0 ||
693  Opcode == AArch64::FMOVD0)
694  return true;
695  }
696 
697  if (Subtarget.hasZeroCycleZeroingGP()) {
698  if (Opcode == TargetOpcode::COPY &&
699  (MI.getOperand(1).getReg() == AArch64::WZR ||
700  MI.getOperand(1).getReg() == AArch64::XZR))
701  return true;
702  }
703 
704  // Secondly, check cases specific to sub-targets.
705 
706  if (Subtarget.hasExynosCheapAsMoveHandling()) {
707  if (isExynosCheapAsMove(MI))
708  return true;
709 
710  return MI.isAsCheapAsAMove();
711  }
712 
713  // Finally, check generic cases.
714 
715  switch (Opcode) {
716  default:
717  return false;
718 
719   // add/sub with an immediate and no shift
720  case AArch64::ADDWri:
721  case AArch64::ADDXri:
722  case AArch64::SUBWri:
723  case AArch64::SUBXri:
724  return (MI.getOperand(3).getImm() == 0);
725 
726  // logical ops on immediate
727  case AArch64::ANDWri:
728  case AArch64::ANDXri:
729  case AArch64::EORWri:
730  case AArch64::EORXri:
731  case AArch64::ORRWri:
732  case AArch64::ORRXri:
733  return true;
734 
735  // logical ops on register without shift
736  case AArch64::ANDWrr:
737  case AArch64::ANDXrr:
738  case AArch64::BICWrr:
739  case AArch64::BICXrr:
740  case AArch64::EONWrr:
741  case AArch64::EONXrr:
742  case AArch64::EORWrr:
743  case AArch64::EORXrr:
744  case AArch64::ORNWrr:
745  case AArch64::ORNXrr:
746  case AArch64::ORRWrr:
747  case AArch64::ORRXrr:
748  return true;
749 
750  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
751  // ORRXri, it is as cheap as MOV
752  case AArch64::MOVi32imm:
753  return canBeExpandedToORR(MI, 32);
754  case AArch64::MOVi64imm:
755  return canBeExpandedToORR(MI, 64);
756  }
757 
758  llvm_unreachable("Unknown opcode to check as cheap as a move!");
759 }
760 
761 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
762   switch (MI.getOpcode()) {
763  default:
764  return false;
765 
766  case AArch64::ADDWrs:
767  case AArch64::ADDXrs:
768  case AArch64::ADDSWrs:
769  case AArch64::ADDSXrs: {
770  unsigned Imm = MI.getOperand(3).getImm();
771  unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
772  if (ShiftVal == 0)
773  return true;
774  return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
775  }
776 
777  case AArch64::ADDWrx:
778  case AArch64::ADDXrx:
779  case AArch64::ADDXrx64:
780  case AArch64::ADDSWrx:
781  case AArch64::ADDSXrx:
782  case AArch64::ADDSXrx64: {
783  unsigned Imm = MI.getOperand(3).getImm();
784  switch (AArch64_AM::getArithExtendType(Imm)) {
785  default:
786  return false;
787  case AArch64_AM::UXTB:
788  case AArch64_AM::UXTH:
789  case AArch64_AM::UXTW:
790  case AArch64_AM::UXTX:
791  return AArch64_AM::getArithShiftValue(Imm) <= 4;
792  }
793  }
794 
795  case AArch64::SUBWrs:
796  case AArch64::SUBSWrs: {
797  unsigned Imm = MI.getOperand(3).getImm();
798  unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
799  return ShiftVal == 0 ||
800  (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
801  }
802 
803  case AArch64::SUBXrs:
804  case AArch64::SUBSXrs: {
805  unsigned Imm = MI.getOperand(3).getImm();
806  unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
807  return ShiftVal == 0 ||
808  (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
809  }
810 
811  case AArch64::SUBWrx:
812  case AArch64::SUBXrx:
813  case AArch64::SUBXrx64:
814  case AArch64::SUBSWrx:
815  case AArch64::SUBSXrx:
816  case AArch64::SUBSXrx64: {
817  unsigned Imm = MI.getOperand(3).getImm();
818  switch (AArch64_AM::getArithExtendType(Imm)) {
819  default:
820  return false;
821  case AArch64_AM::UXTB:
822  case AArch64_AM::UXTH:
823  case AArch64_AM::UXTW:
824  case AArch64_AM::UXTX:
825  return AArch64_AM::getArithShiftValue(Imm) == 0;
826  }
827  }
828 
829  case AArch64::LDRBBroW:
830  case AArch64::LDRBBroX:
831  case AArch64::LDRBroW:
832  case AArch64::LDRBroX:
833  case AArch64::LDRDroW:
834  case AArch64::LDRDroX:
835  case AArch64::LDRHHroW:
836  case AArch64::LDRHHroX:
837  case AArch64::LDRHroW:
838  case AArch64::LDRHroX:
839  case AArch64::LDRQroW:
840  case AArch64::LDRQroX:
841  case AArch64::LDRSBWroW:
842  case AArch64::LDRSBWroX:
843  case AArch64::LDRSBXroW:
844  case AArch64::LDRSBXroX:
845  case AArch64::LDRSHWroW:
846  case AArch64::LDRSHWroX:
847  case AArch64::LDRSHXroW:
848  case AArch64::LDRSHXroX:
849  case AArch64::LDRSWroW:
850  case AArch64::LDRSWroX:
851  case AArch64::LDRSroW:
852  case AArch64::LDRSroX:
853  case AArch64::LDRWroW:
854  case AArch64::LDRWroX:
855  case AArch64::LDRXroW:
856  case AArch64::LDRXroX:
857  case AArch64::PRFMroW:
858  case AArch64::PRFMroX:
859  case AArch64::STRBBroW:
860  case AArch64::STRBBroX:
861  case AArch64::STRBroW:
862  case AArch64::STRBroX:
863  case AArch64::STRDroW:
864  case AArch64::STRDroX:
865  case AArch64::STRHHroW:
866  case AArch64::STRHHroX:
867  case AArch64::STRHroW:
868  case AArch64::STRHroX:
869  case AArch64::STRQroW:
870  case AArch64::STRQroX:
871  case AArch64::STRSroW:
872  case AArch64::STRSroX:
873  case AArch64::STRWroW:
874  case AArch64::STRWroX:
875  case AArch64::STRXroW:
876  case AArch64::STRXroX: {
877  unsigned IsSigned = MI.getOperand(3).getImm();
878  return !IsSigned;
879  }
880  }
881 }
882 
883 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
884   unsigned Opc = MI.getOpcode();
885  switch (Opc) {
886  default:
887  return false;
888  case AArch64::SEH_StackAlloc:
889  case AArch64::SEH_SaveFPLR:
890  case AArch64::SEH_SaveFPLR_X:
891  case AArch64::SEH_SaveReg:
892  case AArch64::SEH_SaveReg_X:
893  case AArch64::SEH_SaveRegP:
894  case AArch64::SEH_SaveRegP_X:
895  case AArch64::SEH_SaveFReg:
896  case AArch64::SEH_SaveFReg_X:
897  case AArch64::SEH_SaveFRegP:
898  case AArch64::SEH_SaveFRegP_X:
899  case AArch64::SEH_SetFP:
900  case AArch64::SEH_AddFP:
901  case AArch64::SEH_Nop:
902  case AArch64::SEH_PrologEnd:
903  case AArch64::SEH_EpilogStart:
904  case AArch64::SEH_EpilogEnd:
905  return true;
906  }
907 }
908 
909 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
910                                              unsigned &SrcReg, unsigned &DstReg,
911  unsigned &SubIdx) const {
912  switch (MI.getOpcode()) {
913  default:
914  return false;
915  case AArch64::SBFMXri: // aka sxtw
916  case AArch64::UBFMXri: // aka uxtw
917  // Check for the 32 -> 64 bit extension case, these instructions can do
918  // much more.
919  if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
920  return false;
921  // This is a signed or unsigned 32 -> 64 bit extension.
922  SrcReg = MI.getOperand(1).getReg();
923  DstReg = MI.getOperand(0).getReg();
924  SubIdx = AArch64::sub_32;
925  return true;
926  }
927 }
928 
929 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
930     const MachineInstr &MIa, const MachineInstr &MIb, AliasAnalysis *AA) const {
931   const TargetRegisterInfo *TRI = &getRegisterInfo();
932   const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
933  int64_t OffsetA = 0, OffsetB = 0;
934  unsigned WidthA = 0, WidthB = 0;
935 
936  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
937  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
938 
939   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
940       MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
941     return false;
942 
943   // Retrieve the base, offset from the base and width.  Width is the size
944   // of memory that is being loaded/stored (e.g. 1, 2, 4, 8).  If the base
945   // operands are identical and the offset of the lower memory access plus
946   // its width doesn't overlap the offset of the higher memory access,
947   // then the memory accesses are different.
948  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) &&
949  getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) {
950  if (BaseOpA->isIdenticalTo(*BaseOpB)) {
951  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
952  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
953  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
954  if (LowOffset + LowWidth <= HighOffset)
955  return true;
956  }
957  }
958  return false;
959 }
960 
961 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
962                                             const MachineBasicBlock *MBB,
963  const MachineFunction &MF) const {
964  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
965  return true;
966  switch (MI.getOpcode()) {
967  case AArch64::HINT:
968  // CSDB hints are scheduling barriers.
969  if (MI.getOperand(0).getImm() == 0x14)
970  return true;
971  break;
972  case AArch64::DSB:
973  case AArch64::ISB:
974  // DSB and ISB also are scheduling barriers.
975  return true;
976  default:;
977  }
978  return isSEHInstruction(MI);
979 }
980 
981 /// analyzeCompare - For a comparison instruction, return the source registers
982 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
983 /// Return true if the comparison instruction can be analyzed.
984 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
985  unsigned &SrcReg2, int &CmpMask,
986  int &CmpValue) const {
987  // The first operand can be a frame index where we'd normally expect a
988  // register.
989  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
990  if (!MI.getOperand(1).isReg())
991  return false;
992 
993  switch (MI.getOpcode()) {
994  default:
995  break;
996  case AArch64::SUBSWrr:
997  case AArch64::SUBSWrs:
998  case AArch64::SUBSWrx:
999  case AArch64::SUBSXrr:
1000  case AArch64::SUBSXrs:
1001  case AArch64::SUBSXrx:
1002  case AArch64::ADDSWrr:
1003  case AArch64::ADDSWrs:
1004  case AArch64::ADDSWrx:
1005  case AArch64::ADDSXrr:
1006  case AArch64::ADDSXrs:
1007  case AArch64::ADDSXrx:
1008  // Replace SUBSWrr with SUBWrr if NZCV is not used.
1009  SrcReg = MI.getOperand(1).getReg();
1010  SrcReg2 = MI.getOperand(2).getReg();
1011  CmpMask = ~0;
1012  CmpValue = 0;
1013  return true;
1014  case AArch64::SUBSWri:
1015  case AArch64::ADDSWri:
1016  case AArch64::SUBSXri:
1017  case AArch64::ADDSXri:
1018  SrcReg = MI.getOperand(1).getReg();
1019  SrcReg2 = 0;
1020  CmpMask = ~0;
1021  // FIXME: In order to convert CmpValue to 0 or 1
1022  CmpValue = MI.getOperand(2).getImm() != 0;
1023  return true;
1024  case AArch64::ANDSWri:
1025  case AArch64::ANDSXri:
1026  // ANDS does not use the same encoding scheme as the others xxxS
1027  // instructions.
1028  SrcReg = MI.getOperand(1).getReg();
1029  SrcReg2 = 0;
1030  CmpMask = ~0;
1031  // FIXME:The return val type of decodeLogicalImmediate is uint64_t,
1032  // while the type of CmpValue is int. When converting uint64_t to int,
1033  // the high 32 bits of uint64_t will be lost.
1034  // In fact it causes a bug in spec2006-483.xalancbmk
1035  // CmpValue is only used to compare with zero in OptimizeCompareInstr
1036     CmpValue = AArch64_AM::decodeLogicalImmediate(
1037                    MI.getOperand(2).getImm(),
1038  MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
1039  return true;
1040  }
1041 
1042  return false;
1043 }
1044 
1045 static bool UpdateOperandRegClass(MachineInstr &Instr) {
1046   MachineBasicBlock *MBB = Instr.getParent();
1047  assert(MBB && "Can't get MachineBasicBlock here");
1048  MachineFunction *MF = MBB->getParent();
1049  assert(MF && "Can't get MachineFunction here");
1050   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1051   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1052   MachineRegisterInfo *MRI = &MF->getRegInfo();
1053 
1054  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1055  ++OpIdx) {
1056  MachineOperand &MO = Instr.getOperand(OpIdx);
1057  const TargetRegisterClass *OpRegCstraints =
1058  Instr.getRegClassConstraint(OpIdx, TII, TRI);
1059 
1060  // If there's no constraint, there's nothing to do.
1061  if (!OpRegCstraints)
1062  continue;
1063  // If the operand is a frame index, there's nothing to do here.
1064  // A frame index operand will resolve correctly during PEI.
1065  if (MO.isFI())
1066  continue;
1067 
1068  assert(MO.isReg() &&
1069  "Operand has register constraints without being a register!");
1070 
1071  unsigned Reg = MO.getReg();
1072     if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
1073       if (!OpRegCstraints->contains(Reg))
1074  return false;
1075  } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1076  !MRI->constrainRegClass(Reg, OpRegCstraints))
1077  return false;
1078  }
1079 
1080  return true;
1081 }
1082 
1083 /// Return the opcode that does not set flags when possible - otherwise
1084 /// return the original opcode. The caller is responsible to do the actual
1085 /// substitution and legality checking.
1086 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1087  // Don't convert all compare instructions, because for some the zero register
1088  // encoding becomes the sp register.
1089  bool MIDefinesZeroReg = false;
1090  if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
1091  MIDefinesZeroReg = true;
1092 
1093  switch (MI.getOpcode()) {
1094  default:
1095  return MI.getOpcode();
1096  case AArch64::ADDSWrr:
1097  return AArch64::ADDWrr;
1098  case AArch64::ADDSWri:
1099  return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1100  case AArch64::ADDSWrs:
1101  return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1102  case AArch64::ADDSWrx:
1103  return AArch64::ADDWrx;
1104  case AArch64::ADDSXrr:
1105  return AArch64::ADDXrr;
1106  case AArch64::ADDSXri:
1107  return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1108  case AArch64::ADDSXrs:
1109  return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1110  case AArch64::ADDSXrx:
1111  return AArch64::ADDXrx;
1112  case AArch64::SUBSWrr:
1113  return AArch64::SUBWrr;
1114  case AArch64::SUBSWri:
1115  return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1116  case AArch64::SUBSWrs:
1117  return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1118  case AArch64::SUBSWrx:
1119  return AArch64::SUBWrx;
1120  case AArch64::SUBSXrr:
1121  return AArch64::SUBXrr;
1122  case AArch64::SUBSXri:
1123  return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1124  case AArch64::SUBSXrs:
1125  return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1126  case AArch64::SUBSXrx:
1127  return AArch64::SUBXrx;
1128  }
1129 }
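
// For example, when the NZCV result of "subs w1, w0, #4" turns out to be
// dead, optimizeCompareInstr() below uses this mapping to rewrite it as a
// plain "sub w1, w0, #4".  A flag-setting instruction whose GPR result is
// wzr/xzr is instead erased outright, because dropping the 'S' suffix there
// would turn the zero-register operand into sp.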
1130 
1131 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1132 
1133 /// True when condition flags are accessed (either by writing or reading)
1134 /// on the instruction trace starting at From and ending at To.
1135 ///
1136 /// Note: If From and To are from different blocks it's assumed CC are accessed
1137 /// on the path.
1138 static bool areCFlagsAccessedBetweenInstrs(
1139     MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1140     const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1141  // Early exit if To is at the beginning of the BB.
1142  if (To == To->getParent()->begin())
1143  return true;
1144 
1145  // Check whether the instructions are in the same basic block
1146  // If not, assume the condition flags might get modified somewhere.
1147  if (To->getParent() != From->getParent())
1148  return true;
1149 
1150  // From must be above To.
1151  assert(std::find_if(++To.getReverse(), To->getParent()->rend(),
1152  [From](MachineInstr &MI) {
1153  return MI.getIterator() == From;
1154  }) != To->getParent()->rend());
1155 
1156  // We iterate backward starting \p To until we hit \p From.
1157  for (--To; To != From; --To) {
1158  const MachineInstr &Instr = *To;
1159 
1160  if (((AccessToCheck & AK_Write) &&
1161  Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1162  ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1163  return true;
1164  }
1165  return false;
1166 }
1167 
1168 /// Try to optimize a compare instruction. A compare instruction is an
1169 /// instruction which produces AArch64::NZCV. It can be treated as a true
1170 /// compare instruction when there are no uses of its destination register.
1172 ///
1173 /// The following steps are tried in order:
1174 /// 1. Convert CmpInstr into an unconditional version.
1175 /// 2. Remove CmpInstr if above there is an instruction producing a needed
1176 /// condition code or an instruction which can be converted into such an
1177 /// instruction.
1178 /// Only comparison with zero is supported.
1179 bool AArch64InstrInfo::optimizeCompareInstr(
1180     MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
1181  int CmpValue, const MachineRegisterInfo *MRI) const {
1182  assert(CmpInstr.getParent());
1183  assert(MRI);
1184 
1185  // Replace SUBSWrr with SUBWrr if NZCV is not used.
1186  int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
1187  if (DeadNZCVIdx != -1) {
1188  if (CmpInstr.definesRegister(AArch64::WZR) ||
1189  CmpInstr.definesRegister(AArch64::XZR)) {
1190  CmpInstr.eraseFromParent();
1191  return true;
1192  }
1193  unsigned Opc = CmpInstr.getOpcode();
1194  unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1195  if (NewOpc == Opc)
1196  return false;
1197  const MCInstrDesc &MCID = get(NewOpc);
1198  CmpInstr.setDesc(MCID);
1199  CmpInstr.RemoveOperand(DeadNZCVIdx);
1200  bool succeeded = UpdateOperandRegClass(CmpInstr);
1201  (void)succeeded;
1202  assert(succeeded && "Some operands reg class are incompatible!");
1203  return true;
1204  }
1205 
1206  // Continue only if we have a "ri" where immediate is zero.
1207  // FIXME:CmpValue has already been converted to 0 or 1 in analyzeCompare
1208  // function.
1209  assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
1210  if (CmpValue != 0 || SrcReg2 != 0)
1211  return false;
1212 
1213  // CmpInstr is a Compare instruction if destination register is not used.
1214  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1215  return false;
1216 
1217  return substituteCmpToZero(CmpInstr, SrcReg, MRI);
1218 }
1219 
1220 /// Get opcode of S version of Instr.
1221 /// If Instr is S version its opcode is returned.
1222 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
1223 /// or we are not interested in it.
1224 static unsigned sForm(MachineInstr &Instr) {
1225  switch (Instr.getOpcode()) {
1226  default:
1227  return AArch64::INSTRUCTION_LIST_END;
1228 
1229  case AArch64::ADDSWrr:
1230  case AArch64::ADDSWri:
1231  case AArch64::ADDSXrr:
1232  case AArch64::ADDSXri:
1233  case AArch64::SUBSWrr:
1234  case AArch64::SUBSWri:
1235  case AArch64::SUBSXrr:
1236  case AArch64::SUBSXri:
1237  return Instr.getOpcode();
1238 
1239  case AArch64::ADDWrr:
1240  return AArch64::ADDSWrr;
1241  case AArch64::ADDWri:
1242  return AArch64::ADDSWri;
1243  case AArch64::ADDXrr:
1244  return AArch64::ADDSXrr;
1245  case AArch64::ADDXri:
1246  return AArch64::ADDSXri;
1247  case AArch64::ADCWr:
1248  return AArch64::ADCSWr;
1249  case AArch64::ADCXr:
1250  return AArch64::ADCSXr;
1251  case AArch64::SUBWrr:
1252  return AArch64::SUBSWrr;
1253  case AArch64::SUBWri:
1254  return AArch64::SUBSWri;
1255  case AArch64::SUBXrr:
1256  return AArch64::SUBSXrr;
1257  case AArch64::SUBXri:
1258  return AArch64::SUBSXri;
1259  case AArch64::SBCWr:
1260  return AArch64::SBCSWr;
1261  case AArch64::SBCXr:
1262  return AArch64::SBCSXr;
1263  case AArch64::ANDWri:
1264  return AArch64::ANDSWri;
1265  case AArch64::ANDXri:
1266  return AArch64::ANDSXri;
1267  }
1268 }
1269 
1270 /// Check if AArch64::NZCV should be alive in successors of MBB.
1271 static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
1272   for (auto *BB : MBB->successors())
1273  if (BB->isLiveIn(AArch64::NZCV))
1274  return true;
1275  return false;
1276 }
1277 
1278 namespace {
1279 
1280 struct UsedNZCV {
1281  bool N = false;
1282  bool Z = false;
1283  bool C = false;
1284  bool V = false;
1285 
1286  UsedNZCV() = default;
1287 
1288  UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
1289  this->N |= UsedFlags.N;
1290  this->Z |= UsedFlags.Z;
1291  this->C |= UsedFlags.C;
1292  this->V |= UsedFlags.V;
1293  return *this;
1294  }
1295 };
1296 
1297 } // end anonymous namespace
1298 
1299 /// Find a condition code used by the instruction.
1300 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1301 /// codes or we don't optimize CmpInstr in the presence of such instructions.
1302 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1303   switch (Instr.getOpcode()) {
1304  default:
1305  return AArch64CC::Invalid;
1306 
1307  case AArch64::Bcc: {
1308  int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1309  assert(Idx >= 2);
1310  return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm());
1311  }
1312 
1313  case AArch64::CSINVWr:
1314  case AArch64::CSINVXr:
1315  case AArch64::CSINCWr:
1316  case AArch64::CSINCXr:
1317  case AArch64::CSELWr:
1318  case AArch64::CSELXr:
1319  case AArch64::CSNEGWr:
1320  case AArch64::CSNEGXr:
1321  case AArch64::FCSELSrrr:
1322  case AArch64::FCSELDrrr: {
1323  int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1324  assert(Idx >= 1);
1325  return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm());
1326  }
1327  }
1328 }
1329 
1330 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1331  assert(CC != AArch64CC::Invalid);
1332  UsedNZCV UsedFlags;
1333  switch (CC) {
1334  default:
1335  break;
1336 
1337  case AArch64CC::EQ: // Z set
1338  case AArch64CC::NE: // Z clear
1339  UsedFlags.Z = true;
1340  break;
1341 
1342  case AArch64CC::HI: // Z clear and C set
1343  case AArch64CC::LS: // Z set or C clear
1344  UsedFlags.Z = true;
1345     LLVM_FALLTHROUGH;
1346   case AArch64CC::HS: // C set
1347  case AArch64CC::LO: // C clear
1348  UsedFlags.C = true;
1349  break;
1350 
1351  case AArch64CC::MI: // N set
1352  case AArch64CC::PL: // N clear
1353  UsedFlags.N = true;
1354  break;
1355 
1356  case AArch64CC::VS: // V set
1357  case AArch64CC::VC: // V clear
1358  UsedFlags.V = true;
1359  break;
1360 
1361  case AArch64CC::GT: // Z clear, N and V the same
1362  case AArch64CC::LE: // Z set, N and V differ
1363  UsedFlags.Z = true;
1364     LLVM_FALLTHROUGH;
1365   case AArch64CC::GE: // N and V the same
1366  case AArch64CC::LT: // N and V differ
1367  UsedFlags.N = true;
1368  UsedFlags.V = true;
1369  break;
1370  }
1371  return UsedFlags;
1372 }
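
// For example, getUsedNZCV(AArch64CC::GT) reports N, Z and V as used, while
// getUsedNZCV(AArch64CC::HS) reports only C.  canInstrSubstituteCmpInstr()
// below uses this to refuse the optimization whenever C or V is still read
// after the compare, since an ADD/SUB S-form does not produce the same C/V
// as a separate compare against zero.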
1373 
1374 static bool isADDSRegImm(unsigned Opcode) {
1375  return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1376 }
1377 
1378 static bool isSUBSRegImm(unsigned Opcode) {
1379  return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1380 }
1381 
1382 /// Check if CmpInstr can be substituted by MI.
1383 ///
1384 /// CmpInstr can be substituted:
1385 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1386 /// - and, MI and CmpInstr are from the same MachineBB
1387 /// - and, condition flags are not alive in successors of the CmpInstr parent
1388 /// - and, if MI opcode is the S form there must be no defs of flags between
1389 /// MI and CmpInstr
1390 /// or if MI opcode is not the S form there must be neither defs of flags
1391 /// nor uses of flags between MI and CmpInstr.
1392 /// - and C/V flags are not used after CmpInstr
1393 static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
1394                                        const TargetRegisterInfo *TRI) {
1395  assert(MI);
1396  assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
1397  assert(CmpInstr);
1398 
1399  const unsigned CmpOpcode = CmpInstr->getOpcode();
1400  if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1401  return false;
1402 
1403  if (MI->getParent() != CmpInstr->getParent())
1404  return false;
1405 
1406  if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
1407  return false;
1408 
1409  AccessKind AccessToCheck = AK_Write;
1410  if (sForm(*MI) != MI->getOpcode())
1411  AccessToCheck = AK_All;
1412  if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
1413  return false;
1414 
1415  UsedNZCV NZCVUsedAfterCmp;
1416  for (auto I = std::next(CmpInstr->getIterator()),
1417  E = CmpInstr->getParent()->instr_end();
1418  I != E; ++I) {
1419  const MachineInstr &Instr = *I;
1420  if (Instr.readsRegister(AArch64::NZCV, TRI)) {
1421       AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1422       if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1423  return false;
1424  NZCVUsedAfterCmp |= getUsedNZCV(CC);
1425  }
1426 
1427  if (Instr.modifiesRegister(AArch64::NZCV, TRI))
1428  break;
1429  }
1430 
1431  return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
1432 }
1433 
1434 /// Substitute an instruction comparing to zero with another instruction
1435 /// which produces needed condition flags.
1436 ///
1437 /// Return true on success.
1438 bool AArch64InstrInfo::substituteCmpToZero(
1439  MachineInstr &CmpInstr, unsigned SrcReg,
1440  const MachineRegisterInfo *MRI) const {
1441  assert(MRI);
1442  // Get the unique definition of SrcReg.
1443  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
1444  if (!MI)
1445  return false;
1446 
1447   const TargetRegisterInfo *TRI = &getRegisterInfo();
1448 
1449  unsigned NewOpc = sForm(*MI);
1450  if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1451  return false;
1452 
1453  if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
1454  return false;
1455 
1456  // Update the instruction to set NZCV.
1457  MI->setDesc(get(NewOpc));
1458  CmpInstr.eraseFromParent();
1459  bool succeeded = UpdateOperandRegClass(*MI);
1460  (void)succeeded;
1461  assert(succeeded && "Some operands reg class are incompatible!");
1462  MI->addRegisterDefined(AArch64::NZCV, TRI);
1463  return true;
1464 }
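
// For example, if w8 is produced by "add w8, w0, w1" and then compared with
// "subs wzr, w8, #0" whose only later use is a b.ne, the compare is erased
// and the add is rewritten to "adds w8, w0, w1", which sets the same N and Z
// flags.  The substitution is refused if any later reader needs C or V.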
1465 
1466 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1467   if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1468  MI.getOpcode() != AArch64::CATCHRET)
1469  return false;
1470 
1471  MachineBasicBlock &MBB = *MI.getParent();
1472  DebugLoc DL = MI.getDebugLoc();
1473 
1474  if (MI.getOpcode() == AArch64::CATCHRET) {
1475  // Skip to the first instruction before the epilog.
1476  const TargetInstrInfo *TII =
1477  MBB.getParent()->getSubtarget().getInstrInfo();
1478  MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1479  auto MBBI = MachineBasicBlock::iterator(MI);
1480  MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1481  while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1482  FirstEpilogSEH != MBB.begin())
1483  FirstEpilogSEH = std::prev(FirstEpilogSEH);
1484  if (FirstEpilogSEH != MBB.begin())
1485  FirstEpilogSEH = std::next(FirstEpilogSEH);
1486  BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1487  .addReg(AArch64::X0, RegState::Define)
1488  .addMBB(TargetMBB);
1489  BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1490  .addReg(AArch64::X0, RegState::Define)
1491  .addReg(AArch64::X0)
1492  .addMBB(TargetMBB)
1493  .addImm(0);
1494  return true;
1495  }
1496 
1497  unsigned Reg = MI.getOperand(0).getReg();
1498  const GlobalValue *GV =
1499  cast<GlobalValue>((*MI.memoperands_begin())->getValue());
1500  const TargetMachine &TM = MBB.getParent()->getTarget();
1501  unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
1502  const unsigned char MO_NC = AArch64II::MO_NC;
1503 
1504  if ((OpFlags & AArch64II::MO_GOT) != 0) {
1505  BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
1506  .addGlobalAddress(GV, 0, OpFlags);
1507  BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1508  .addReg(Reg, RegState::Kill)
1509         .addImm(0)
1510         .addMemOperand(*MI.memoperands_begin());
1511   } else if (TM.getCodeModel() == CodeModel::Large) {
1512  BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
1513  .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
1514  .addImm(0);
1515  BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1516  .addReg(Reg, RegState::Kill)
1517  .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
1518  .addImm(16);
1519  BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1520  .addReg(Reg, RegState::Kill)
1521  .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
1522  .addImm(32);
1523  BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1524  .addReg(Reg, RegState::Kill)
1525         .addGlobalAddress(GV, 0, AArch64II::MO_G3)
1526         .addImm(48);
1527  BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1528  .addReg(Reg, RegState::Kill)
1529         .addImm(0)
1530         .addMemOperand(*MI.memoperands_begin());
1531   } else if (TM.getCodeModel() == CodeModel::Tiny) {
1532  BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
1533  .addGlobalAddress(GV, 0, OpFlags);
1534  } else {
1535  BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
1536  .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
1537  unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
1538  BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1539  .addReg(Reg, RegState::Kill)
1540         .addGlobalAddress(GV, 0, LoFlags)
1541         .addMemOperand(*MI.memoperands_begin());
1542   }
1543 
1544  MBB.erase(MI);
1545 
1546  return true;
1547 }
1548 
1549 // Return true if this instruction simply sets its single destination register
1550 // to zero. This is equivalent to a register rename of the zero-register.
1551 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
1552   switch (MI.getOpcode()) {
1553  default:
1554  break;
1555  case AArch64::MOVZWi:
1556  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
1557  if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
1558  assert(MI.getDesc().getNumOperands() == 3 &&
1559  MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
1560  return true;
1561  }
1562  break;
1563  case AArch64::ANDWri: // and Rd, Rzr, #imm
1564  return MI.getOperand(1).getReg() == AArch64::WZR;
1565  case AArch64::ANDXri:
1566  return MI.getOperand(1).getReg() == AArch64::XZR;
1567  case TargetOpcode::COPY:
1568  return MI.getOperand(1).getReg() == AArch64::WZR;
1569  }
1570  return false;
1571 }
1572 
1573 // Return true if this instruction simply renames a general register without
1574 // modifying bits.
1575 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
1576   switch (MI.getOpcode()) {
1577  default:
1578  break;
1579  case TargetOpcode::COPY: {
1580     // GPR32 copies will be lowered to ORRXrs
1581  unsigned DstReg = MI.getOperand(0).getReg();
1582  return (AArch64::GPR32RegClass.contains(DstReg) ||
1583  AArch64::GPR64RegClass.contains(DstReg));
1584  }
1585  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
1586  if (MI.getOperand(1).getReg() == AArch64::XZR) {
1587  assert(MI.getDesc().getNumOperands() == 4 &&
1588  MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
1589  return true;
1590  }
1591  break;
1592  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
1593  if (MI.getOperand(2).getImm() == 0) {
1594  assert(MI.getDesc().getNumOperands() == 4 &&
1595  MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
1596  return true;
1597  }
1598  break;
1599  }
1600  return false;
1601 }
1602 
1603 // Return true if this instruction simply renames a floating-point register
1604 // without modifying bits.
1605 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
1606   switch (MI.getOpcode()) {
1607  default:
1608  break;
1609  case TargetOpcode::COPY: {
1610     // FPR64 copies will be lowered to ORR.16b
1611  unsigned DstReg = MI.getOperand(0).getReg();
1612  return (AArch64::FPR64RegClass.contains(DstReg) ||
1613  AArch64::FPR128RegClass.contains(DstReg));
1614  }
1615  case AArch64::ORRv16i8:
1616  if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
1617  assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
1618  "invalid ORRv16i8 operands");
1619  return true;
1620  }
1621  break;
1622  }
1623  return false;
1624 }
1625 
1626 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
1627                                                int &FrameIndex) const {
1628  switch (MI.getOpcode()) {
1629  default:
1630  break;
1631  case AArch64::LDRWui:
1632  case AArch64::LDRXui:
1633  case AArch64::LDRBui:
1634  case AArch64::LDRHui:
1635  case AArch64::LDRSui:
1636  case AArch64::LDRDui:
1637  case AArch64::LDRQui:
1638  if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1639  MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1640  FrameIndex = MI.getOperand(1).getIndex();
1641  return MI.getOperand(0).getReg();
1642  }
1643  break;
1644  }
1645 
1646  return 0;
1647 }
1648 
1649 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
1650                                               int &FrameIndex) const {
1651  switch (MI.getOpcode()) {
1652  default:
1653  break;
1654  case AArch64::STRWui:
1655  case AArch64::STRXui:
1656  case AArch64::STRBui:
1657  case AArch64::STRHui:
1658  case AArch64::STRSui:
1659  case AArch64::STRDui:
1660  case AArch64::STRQui:
1661  if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1662  MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1663  FrameIndex = MI.getOperand(1).getIndex();
1664  return MI.getOperand(0).getReg();
1665  }
1666  break;
1667  }
1668  return 0;
1669 }
1670 
1671 /// Check all MachineMemOperands for a hint to suppress pairing.
1672 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
1673   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1674  return MMO->getFlags() & MOSuppressPair;
1675  });
1676 }
1677 
1678 /// Set a flag on the first MachineMemOperand to suppress pairing.
1679 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
1680   if (MI.memoperands_empty())
1681  return;
1682  (*MI.memoperands_begin())->setFlags(MOSuppressPair);
1683 }
1684 
1685 /// Check all MachineMemOperands for a hint that the load/store is strided.
1686 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
1687   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1688  return MMO->getFlags() & MOStridedAccess;
1689  });
1690 }
1691 
1692 bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) {
1693   switch (Opc) {
1694  default:
1695  return false;
1696  case AArch64::STURSi:
1697  case AArch64::STURDi:
1698  case AArch64::STURQi:
1699  case AArch64::STURBBi:
1700  case AArch64::STURHHi:
1701  case AArch64::STURWi:
1702  case AArch64::STURXi:
1703  case AArch64::LDURSi:
1704  case AArch64::LDURDi:
1705  case AArch64::LDURQi:
1706  case AArch64::LDURWi:
1707  case AArch64::LDURXi:
1708  case AArch64::LDURSWi:
1709  case AArch64::LDURHHi:
1710  case AArch64::LDURBBi:
1711  case AArch64::LDURSBWi:
1712  case AArch64::LDURSHWi:
1713  return true;
1714  }
1715 }
1716 
1717 Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
1718   switch (Opc) {
1719  default: return {};
1720  case AArch64::PRFMui: return AArch64::PRFUMi;
1721  case AArch64::LDRXui: return AArch64::LDURXi;
1722  case AArch64::LDRWui: return AArch64::LDURWi;
1723  case AArch64::LDRBui: return AArch64::LDURBi;
1724  case AArch64::LDRHui: return AArch64::LDURHi;
1725  case AArch64::LDRSui: return AArch64::LDURSi;
1726  case AArch64::LDRDui: return AArch64::LDURDi;
1727  case AArch64::LDRQui: return AArch64::LDURQi;
1728  case AArch64::LDRBBui: return AArch64::LDURBBi;
1729  case AArch64::LDRHHui: return AArch64::LDURHHi;
1730  case AArch64::LDRSBXui: return AArch64::LDURSBXi;
1731  case AArch64::LDRSBWui: return AArch64::LDURSBWi;
1732  case AArch64::LDRSHXui: return AArch64::LDURSHXi;
1733  case AArch64::LDRSHWui: return AArch64::LDURSHWi;
1734  case AArch64::LDRSWui: return AArch64::LDURSWi;
1735  case AArch64::STRXui: return AArch64::STURXi;
1736  case AArch64::STRWui: return AArch64::STURWi;
1737  case AArch64::STRBui: return AArch64::STURBi;
1738  case AArch64::STRHui: return AArch64::STURHi;
1739  case AArch64::STRSui: return AArch64::STURSi;
1740  case AArch64::STRDui: return AArch64::STURDi;
1741  case AArch64::STRQui: return AArch64::STURQi;
1742  case AArch64::STRBBui: return AArch64::STURBBi;
1743  case AArch64::STRHHui: return AArch64::STURHHi;
1744  }
1745 }
1746 
1747 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
1748   switch (Opc) {
1749  default:
1750  return 2;
1751  case AArch64::LDPXi:
1752  case AArch64::LDPDi:
1753  case AArch64::STPXi:
1754  case AArch64::STPDi:
1755  case AArch64::LDNPXi:
1756  case AArch64::LDNPDi:
1757  case AArch64::STNPXi:
1758  case AArch64::STNPDi:
1759  case AArch64::LDPQi:
1760  case AArch64::STPQi:
1761  case AArch64::LDNPQi:
1762  case AArch64::STNPQi:
1763  case AArch64::LDPWi:
1764  case AArch64::LDPSi:
1765  case AArch64::STPWi:
1766  case AArch64::STPSi:
1767  case AArch64::LDNPWi:
1768  case AArch64::LDNPSi:
1769  case AArch64::STNPWi:
1770  case AArch64::STNPSi:
1771  return 3;
1772  }
1773 }
1774 
1775 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
1776   switch (MI.getOpcode()) {
1777  default:
1778  return false;
1779  // Scaled instructions.
1780  case AArch64::STRSui:
1781  case AArch64::STRDui:
1782  case AArch64::STRQui:
1783  case AArch64::STRXui:
1784  case AArch64::STRWui:
1785  case AArch64::LDRSui:
1786  case AArch64::LDRDui:
1787  case AArch64::LDRQui:
1788  case AArch64::LDRXui:
1789  case AArch64::LDRWui:
1790  case AArch64::LDRSWui:
1791  // Unscaled instructions.
1792  case AArch64::STURSi:
1793  case AArch64::STURDi:
1794  case AArch64::STURQi:
1795  case AArch64::STURWi:
1796  case AArch64::STURXi:
1797  case AArch64::LDURSi:
1798  case AArch64::LDURDi:
1799  case AArch64::LDURQi:
1800  case AArch64::LDURWi:
1801  case AArch64::LDURXi:
1802  case AArch64::LDURSWi:
1803  return true;
1804  }
1805 }
1806 
1807 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc,
1808                                                    bool &Is64Bit) {
1809  switch (Opc) {
1810  default:
1811  llvm_unreachable("Opcode has no flag setting equivalent!");
1812  // 32-bit cases:
1813  case AArch64::ADDWri:
1814  Is64Bit = false;
1815  return AArch64::ADDSWri;
1816  case AArch64::ADDWrr:
1817  Is64Bit = false;
1818  return AArch64::ADDSWrr;
1819  case AArch64::ADDWrs:
1820  Is64Bit = false;
1821  return AArch64::ADDSWrs;
1822  case AArch64::ADDWrx:
1823  Is64Bit = false;
1824  return AArch64::ADDSWrx;
1825  case AArch64::ANDWri:
1826  Is64Bit = false;
1827  return AArch64::ANDSWri;
1828  case AArch64::ANDWrr:
1829  Is64Bit = false;
1830  return AArch64::ANDSWrr;
1831  case AArch64::ANDWrs:
1832  Is64Bit = false;
1833  return AArch64::ANDSWrs;
1834  case AArch64::BICWrr:
1835  Is64Bit = false;
1836  return AArch64::BICSWrr;
1837  case AArch64::BICWrs:
1838  Is64Bit = false;
1839  return AArch64::BICSWrs;
1840  case AArch64::SUBWri:
1841  Is64Bit = false;
1842  return AArch64::SUBSWri;
1843  case AArch64::SUBWrr:
1844  Is64Bit = false;
1845  return AArch64::SUBSWrr;
1846  case AArch64::SUBWrs:
1847  Is64Bit = false;
1848  return AArch64::SUBSWrs;
1849  case AArch64::SUBWrx:
1850  Is64Bit = false;
1851  return AArch64::SUBSWrx;
1852  // 64-bit cases:
1853  case AArch64::ADDXri:
1854  Is64Bit = true;
1855  return AArch64::ADDSXri;
1856  case AArch64::ADDXrr:
1857  Is64Bit = true;
1858  return AArch64::ADDSXrr;
1859  case AArch64::ADDXrs:
1860  Is64Bit = true;
1861  return AArch64::ADDSXrs;
1862  case AArch64::ADDXrx:
1863  Is64Bit = true;
1864  return AArch64::ADDSXrx;
1865  case AArch64::ANDXri:
1866  Is64Bit = true;
1867  return AArch64::ANDSXri;
1868  case AArch64::ANDXrr:
1869  Is64Bit = true;
1870  return AArch64::ANDSXrr;
1871  case AArch64::ANDXrs:
1872  Is64Bit = true;
1873  return AArch64::ANDSXrs;
1874  case AArch64::BICXrr:
1875  Is64Bit = true;
1876  return AArch64::BICSXrr;
1877  case AArch64::BICXrs:
1878  Is64Bit = true;
1879  return AArch64::BICSXrs;
1880  case AArch64::SUBXri:
1881  Is64Bit = true;
1882  return AArch64::SUBSXri;
1883  case AArch64::SUBXrr:
1884  Is64Bit = true;
1885  return AArch64::SUBSXrr;
1886  case AArch64::SUBXrs:
1887  Is64Bit = true;
1888  return AArch64::SUBSXrs;
1889  case AArch64::SUBXrx:
1890  Is64Bit = true;
1891  return AArch64::SUBSXrx;
1892  }
1893 }
1894 
1895 // Is this a candidate for ld/st merging or pairing? For example, we don't
1896 // touch volatiles or load/stores that have a hint to avoid pair formation.
1898  // If this is a volatile load/store, don't mess with it.
1899  if (MI.hasOrderedMemoryRef())
1900  return false;
1901 
1902  // Make sure this is a reg/fi+imm (as opposed to an address reloc).
1903  assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
1904  "Expected a reg or frame index operand.");
1905  if (!MI.getOperand(2).isImm())
1906  return false;
1907 
1908  // Can't merge/pair if the instruction modifies the base register.
1909  // e.g., ldr x0, [x0]
1910  // This case will never occur with an FI base.
1911  if (MI.getOperand(1).isReg()) {
1912  unsigned BaseReg = MI.getOperand(1).getReg();
1914  if (MI.modifiesRegister(BaseReg, TRI))
1915  return false;
1916  }
1917 
1918  // Check if this load/store has a hint to avoid pair formation.
1919  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
1920  if (isLdStPairSuppressed(MI))
1921  return false;
1922 
1923  // On some CPUs quad load/store pairs are slower than two single load/stores.
1924  if (Subtarget.isPaired128Slow()) {
1925  switch (MI.getOpcode()) {
1926  default:
1927  break;
1928  case AArch64::LDURQi:
1929  case AArch64::STURQi:
1930  case AArch64::LDRQui:
1931  case AArch64::STRQui:
1932  return false;
1933  }
1934  }
1935 
1936  return true;
1937 }
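// Worked example (editorial sketch, not part of the upstream file): the
// checks above reject, for instance,
//   ldr x0, [x0, #8]        ; the load also writes its own base register
//   ldr x1, [x2, :lo12:g]   ; operand 2 is a relocation, not an immediate
// as well as any access whose pairing was suppressed by
// AArch64StorePairSuppress, and LDRQui/STRQui/LDURQi/STURQi accesses on
// subtargets where isPaired128Slow() is set.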
1938 
1940  const MachineOperand *&BaseOp,
1941  int64_t &Offset,
1942  const TargetRegisterInfo *TRI) const {
1943  unsigned Width;
1944  return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI);
1945 }
1946 
1948  const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
1949  unsigned &Width, const TargetRegisterInfo *TRI) const {
1950  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
1951  // Handle only loads/stores with base register followed by immediate offset.
1952  if (LdSt.getNumExplicitOperands() == 3) {
1953  // Non-paired instruction (e.g., ldr x1, [x0, #8]).
1954  if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
1955  !LdSt.getOperand(2).isImm())
1956  return false;
1957  } else if (LdSt.getNumExplicitOperands() == 4) {
1958  // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
1959  if (!LdSt.getOperand(1).isReg() ||
1960  (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
1961  !LdSt.getOperand(3).isImm())
1962  return false;
1963  } else
1964  return false;
1965 
 1966  // Get the scaling factor for the instruction and set the width of the
 1967  // memory access.
1968  unsigned Scale = 0;
1969  int64_t Dummy1, Dummy2;
1970 
1971  // If this returns false, then it's an instruction we don't want to handle.
1972  if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
1973  return false;
1974 
1975  // Compute the offset. Offset is calculated as the immediate operand
1976  // multiplied by the scaling factor. Unscaled instructions have scaling factor
1977  // set to 1.
1978  if (LdSt.getNumExplicitOperands() == 3) {
1979  BaseOp = &LdSt.getOperand(1);
1980  Offset = LdSt.getOperand(2).getImm() * Scale;
1981  } else {
1982  assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
1983  BaseOp = &LdSt.getOperand(2);
1984  Offset = LdSt.getOperand(3).getImm() * Scale;
1985  }
1986 
1987  assert((BaseOp->isReg() || BaseOp->isFI()) &&
1988  "getMemOperandWithOffset only supports base "
1989  "operands of type register or frame index.");
1990 
1991  return true;
1992 }
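// Worked example (editorial sketch, not from the upstream file): for
//   %1:gpr64 = LDRXui %0, 2          ; ldr x1, [x0, #16]
// the base is operand 1 and the immediate 2 is scaled by 8, so BaseOp = %0,
// Offset = 16 bytes and Width = 8.  For the paired form
//   %1, %2 = LDPXi %0, 3             ; ldp x1, x2, [x0, #24]
// the base is operand 2 and the immediate is operand 3, giving Offset = 24
// and Width = 16.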
1993 
1996  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
1997  MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
1998  assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
1999  return OfsOp;
2000 }
2001 
2002 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
2003  unsigned &Width, int64_t &MinOffset,
2004  int64_t &MaxOffset) {
2005  switch (Opcode) {
 2006  // Not a memory operation, or not something we want to handle.
2007  default:
2008  Scale = Width = 0;
2009  MinOffset = MaxOffset = 0;
2010  return false;
2011  case AArch64::STRWpost:
2012  case AArch64::LDRWpost:
2013  Width = 32;
2014  Scale = 4;
2015  MinOffset = -256;
2016  MaxOffset = 255;
2017  break;
2018  case AArch64::LDURQi:
2019  case AArch64::STURQi:
2020  Width = 16;
2021  Scale = 1;
2022  MinOffset = -256;
2023  MaxOffset = 255;
2024  break;
2025  case AArch64::PRFUMi:
2026  case AArch64::LDURXi:
2027  case AArch64::LDURDi:
2028  case AArch64::STURXi:
2029  case AArch64::STURDi:
2030  Width = 8;
2031  Scale = 1;
2032  MinOffset = -256;
2033  MaxOffset = 255;
2034  break;
2035  case AArch64::LDURWi:
2036  case AArch64::LDURSi:
2037  case AArch64::LDURSWi:
2038  case AArch64::STURWi:
2039  case AArch64::STURSi:
2040  Width = 4;
2041  Scale = 1;
2042  MinOffset = -256;
2043  MaxOffset = 255;
2044  break;
2045  case AArch64::LDURHi:
2046  case AArch64::LDURHHi:
2047  case AArch64::LDURSHXi:
2048  case AArch64::LDURSHWi:
2049  case AArch64::STURHi:
2050  case AArch64::STURHHi:
2051  Width = 2;
2052  Scale = 1;
2053  MinOffset = -256;
2054  MaxOffset = 255;
2055  break;
2056  case AArch64::LDURBi:
2057  case AArch64::LDURBBi:
2058  case AArch64::LDURSBXi:
2059  case AArch64::LDURSBWi:
2060  case AArch64::STURBi:
2061  case AArch64::STURBBi:
2062  Width = 1;
2063  Scale = 1;
2064  MinOffset = -256;
2065  MaxOffset = 255;
2066  break;
2067  case AArch64::LDPQi:
2068  case AArch64::LDNPQi:
2069  case AArch64::STPQi:
2070  case AArch64::STNPQi:
2071  Scale = 16;
2072  Width = 32;
2073  MinOffset = -64;
2074  MaxOffset = 63;
2075  break;
2076  case AArch64::LDRQui:
2077  case AArch64::STRQui:
2078  Scale = Width = 16;
2079  MinOffset = 0;
2080  MaxOffset = 4095;
2081  break;
2082  case AArch64::LDPXi:
2083  case AArch64::LDPDi:
2084  case AArch64::LDNPXi:
2085  case AArch64::LDNPDi:
2086  case AArch64::STPXi:
2087  case AArch64::STPDi:
2088  case AArch64::STNPXi:
2089  case AArch64::STNPDi:
2090  Scale = 8;
2091  Width = 16;
2092  MinOffset = -64;
2093  MaxOffset = 63;
2094  break;
2095  case AArch64::PRFMui:
2096  case AArch64::LDRXui:
2097  case AArch64::LDRDui:
2098  case AArch64::STRXui:
2099  case AArch64::STRDui:
2100  Scale = Width = 8;
2101  MinOffset = 0;
2102  MaxOffset = 4095;
2103  break;
2104  case AArch64::LDPWi:
2105  case AArch64::LDPSi:
2106  case AArch64::LDNPWi:
2107  case AArch64::LDNPSi:
2108  case AArch64::STPWi:
2109  case AArch64::STPSi:
2110  case AArch64::STNPWi:
2111  case AArch64::STNPSi:
2112  Scale = 4;
2113  Width = 8;
2114  MinOffset = -64;
2115  MaxOffset = 63;
2116  break;
2117  case AArch64::LDRWui:
2118  case AArch64::LDRSui:
2119  case AArch64::LDRSWui:
2120  case AArch64::STRWui:
2121  case AArch64::STRSui:
2122  Scale = Width = 4;
2123  MinOffset = 0;
2124  MaxOffset = 4095;
2125  break;
2126  case AArch64::LDRHui:
2127  case AArch64::LDRHHui:
2128  case AArch64::LDRSHWui:
2129  case AArch64::LDRSHXui:
2130  case AArch64::STRHui:
2131  case AArch64::STRHHui:
2132  Scale = Width = 2;
2133  MinOffset = 0;
2134  MaxOffset = 4095;
2135  break;
2136  case AArch64::LDRBui:
2137  case AArch64::LDRBBui:
2138  case AArch64::LDRSBWui:
2139  case AArch64::LDRSBXui:
2140  case AArch64::STRBui:
2141  case AArch64::STRBBui:
2142  Scale = Width = 1;
2143  MinOffset = 0;
2144  MaxOffset = 4095;
2145  break;
2146  }
2147 
2148  return true;
2149 }
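// Editorial sketch (not part of the upstream file): the Scale/MinOffset/
// MaxOffset triple describes the encodable *immediate* range of an opcode;
// multiplying by Scale yields the byte range the addressing mode can reach.
// A hypothetical caller could derive that byte range like this:
LLVM_ATTRIBUTE_UNUSED
static bool getByteOffsetRange(unsigned Opcode, int64_t &MinBytes,
                               int64_t &MaxBytes) {
  unsigned Scale, Width;
  int64_t MinOff, MaxOff;
  if (!AArch64InstrInfo::getMemOpInfo(Opcode, Scale, Width, MinOff, MaxOff))
    return false;
  // E.g. LDRXui: Scale = 8, imm in [0, 4095]   -> bytes [0, 32760]
  //      LDPXi : Scale = 8, imm in [-64, 63]   -> bytes [-512, 504]
  //      LDURXi: Scale = 1, imm in [-256, 255] -> bytes [-256, 255]
  MinBytes = MinOff * int64_t(Scale);
  MaxBytes = MaxOff * int64_t(Scale);
  return true;
}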
2150 
2151 static unsigned getOffsetStride(unsigned Opc) {
2152  switch (Opc) {
2153  default:
2154  return 0;
2155  case AArch64::LDURQi:
2156  case AArch64::STURQi:
2157  return 16;
2158  case AArch64::LDURXi:
2159  case AArch64::LDURDi:
2160  case AArch64::STURXi:
2161  case AArch64::STURDi:
2162  return 8;
2163  case AArch64::LDURWi:
2164  case AArch64::LDURSi:
2165  case AArch64::LDURSWi:
2166  case AArch64::STURWi:
2167  case AArch64::STURSi:
2168  return 4;
2169  }
2170 }
2171 
2172 // Scale the unscaled offsets. Returns false if the unscaled offset can't be
2173 // scaled.
2174 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
2175  unsigned OffsetStride = getOffsetStride(Opc);
2176  if (OffsetStride == 0)
2177  return false;
2178  // If the byte-offset isn't a multiple of the stride, we can't scale this
2179  // offset.
2180  if (Offset % OffsetStride != 0)
2181  return false;
2182 
2183  // Convert the byte-offset used by unscaled into an "element" offset used
2184  // by the scaled pair load/store instructions.
2185  Offset /= OffsetStride;
2186  return true;
2187 }
2188 
2189 // Unscale the scaled offsets. Returns false if the scaled offset can't be
2190 // unscaled.
2191 static bool unscaleOffset(unsigned Opc, int64_t &Offset) {
2192  unsigned OffsetStride = getOffsetStride(Opc);
2193  if (OffsetStride == 0)
2194  return false;
2195 
2196  // Convert the "element" offset used by scaled pair load/store instructions
2197  // into the byte-offset used by unscaled.
2198  Offset *= OffsetStride;
2199  return true;
2200 }
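// Worked example (editorial, not from the upstream file): STURXi has a
// stride of 8, so scaleOffset turns the byte offset 16 into the "element"
// offset 2, and rejects a byte offset of 12 because it is not a multiple of
// 8; unscaleOffset is the inverse mapping and turns element offset 2 back
// into 16 bytes.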
2201 
2202 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
2203  if (FirstOpc == SecondOpc)
2204  return true;
2205  // We can also pair sign-ext and zero-ext instructions.
2206  switch (FirstOpc) {
2207  default:
2208  return false;
2209  case AArch64::LDRWui:
2210  case AArch64::LDURWi:
2211  return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
2212  case AArch64::LDRSWui:
2213  case AArch64::LDURSWi:
2214  return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
2215  }
2216  // These instructions can't be paired based on their opcodes.
2217  return false;
2218 }
2219 
2220 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
2221  int64_t Offset1, unsigned Opcode1, int FI2,
2222  int64_t Offset2, unsigned Opcode2) {
2223  // Accesses through fixed stack object frame indices may access a different
2224  // fixed stack slot. Check that the object offsets + offsets match.
2225  if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
2226  int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
2227  int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
2228  assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
2229  // Get the byte-offset from the object offset.
2230  if (!unscaleOffset(Opcode1, Offset1) || !unscaleOffset(Opcode2, Offset2))
2231  return false;
2232  ObjectOffset1 += Offset1;
2233  ObjectOffset2 += Offset2;
2234  // Get the "element" index in the object.
2235  if (!scaleOffset(Opcode1, ObjectOffset1) ||
2236  !scaleOffset(Opcode2, ObjectOffset2))
2237  return false;
2238  return ObjectOffset1 + 1 == ObjectOffset2;
2239  }
2240 
2241  return FI1 == FI2;
2242 }
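// Worked example (editorial, not from the upstream file): for ordinary stack
// objects the two accesses merely have to use the same frame index.  For two
// LDURXi loads from the same fixed object at byte offsets 8 and 16, the
// combined byte addresses rescale to element indices 1 and 2, which are
// adjacent, so clustering is allowed.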
2243 
2244 /// Detect opportunities for ldp/stp formation.
2245 ///
2246 /// Only called for LdSt for which getMemOperandWithOffset returns true.
2248  const MachineOperand &BaseOp2,
2249  unsigned NumLoads) const {
2250  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
2251  const MachineInstr &SecondLdSt = *BaseOp2.getParent();
2252  if (BaseOp1.getType() != BaseOp2.getType())
2253  return false;
2254 
2255  assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
2256  "Only base registers and frame indices are supported.");
2257 
2258  // Check for both base regs and base FI.
2259  if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
2260  return false;
2261 
2262  // Only cluster up to a single pair.
2263  if (NumLoads > 1)
2264  return false;
2265 
2266  if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
2267  return false;
2268 
2269  // Can we pair these instructions based on their opcodes?
2270  unsigned FirstOpc = FirstLdSt.getOpcode();
2271  unsigned SecondOpc = SecondLdSt.getOpcode();
2272  if (!canPairLdStOpc(FirstOpc, SecondOpc))
2273  return false;
2274 
2275  // Can't merge volatiles or load/stores that have a hint to avoid pair
2276  // formation, for example.
2277  if (!isCandidateToMergeOrPair(FirstLdSt) ||
2278  !isCandidateToMergeOrPair(SecondLdSt))
2279  return false;
2280 
2281  // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
2282  int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
2283  if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
2284  return false;
2285 
2286  int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
2287  if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
2288  return false;
2289 
2290  // Pairwise instructions have a 7-bit signed offset field.
2291  if (Offset1 > 63 || Offset1 < -64)
2292  return false;
2293 
2294  // The caller should already have ordered First/SecondLdSt by offset.
 2295  // Note: this ordering is not guaranteed when the bases are different frame indices.
2296  if (BaseOp1.isFI()) {
2297  assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 >= Offset2) &&
2298  "Caller should have ordered offsets.");
2299 
2300  const MachineFrameInfo &MFI =
2301  FirstLdSt.getParent()->getParent()->getFrameInfo();
2302  return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
2303  BaseOp2.getIndex(), Offset2, SecondOpc);
2304  }
2305 
2306  assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
2307  "Caller should have ordered offsets.");
2308 
2309  return Offset1 + 1 == Offset2;
2310 }
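// Worked example (editorial, not from the upstream file): given
//   %1 = LDRXui %0, 2            ; ldr x1, [x0, #16]
//   %2 = LDRXui %0, 3            ; ldr x2, [x0, #24]
// the bases match, both immediates fit the 7-bit signed LDP field, and the
// element offsets 2 and 3 are adjacent, so the scheduler keeps the two loads
// together and the load/store optimizer can later fuse them into
//   ldp x1, x2, [x0, #16]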
2311 
2313  unsigned Reg, unsigned SubIdx,
2314  unsigned State,
2315  const TargetRegisterInfo *TRI) {
2316  if (!SubIdx)
2317  return MIB.addReg(Reg, State);
2318 
2320  return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
2321  return MIB.addReg(Reg, State, SubIdx);
2322 }
2323 
2324 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
2325  unsigned NumRegs) {
 2326  // We really want the positive remainder mod 32 here; that happens to be
2327  // easily obtainable with a mask.
2328  return ((DestReg - SrcReg) & 0x1f) < NumRegs;
2329 }
2330 
2333  const DebugLoc &DL, unsigned DestReg,
2334  unsigned SrcReg, bool KillSrc,
2335  unsigned Opcode,
2336  ArrayRef<unsigned> Indices) const {
2337  assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
2339  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2340  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2341  unsigned NumRegs = Indices.size();
2342 
2343  int SubReg = 0, End = NumRegs, Incr = 1;
2344  if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
2345  SubReg = NumRegs - 1;
2346  End = -1;
2347  Incr = -1;
2348  }
2349 
2350  for (; SubReg != End; SubReg += Incr) {
2351  const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2352  AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2353  AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
2354  AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2355  }
2356 }
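// Worked example (editorial, not from the upstream file): copying the tuple
// Q0_Q1 into Q1_Q2 gives (DestEncoding - SrcEncoding) & 0x1f == 1 < 2, so a
// forward sub-register copy would overwrite Q1 (still needed as the second
// source element) before reading it; the loop above therefore runs backwards,
// copying Q2 <- Q1 first and then Q1 <- Q0.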
2357 
2360  DebugLoc DL, unsigned DestReg,
2361  unsigned SrcReg, bool KillSrc,
2362  unsigned Opcode, unsigned ZeroReg,
2363  llvm::ArrayRef<unsigned> Indices) const {
2365  unsigned NumRegs = Indices.size();
2366 
2367 #ifndef NDEBUG
2368  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2369  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2370  assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
2371  "GPR reg sequences should not be able to overlap");
2372 #endif
2373 
2374  for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
2375  const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2376  AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2377  MIB.addReg(ZeroReg);
2378  AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2379  MIB.addImm(0);
2380  }
2381 }
2382 
2385  const DebugLoc &DL, unsigned DestReg,
2386  unsigned SrcReg, bool KillSrc) const {
2387  if (AArch64::GPR32spRegClass.contains(DestReg) &&
2388  (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
2390 
2391  if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
2392  // If either operand is WSP, expand to ADD #0.
2393  if (Subtarget.hasZeroCycleRegMove()) {
2394  // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
2395  unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
2396  &AArch64::GPR64spRegClass);
2397  unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
2398  &AArch64::GPR64spRegClass);
2399  // This instruction is reading and writing X registers. This may upset
2400  // the register scavenger and machine verifier, so we need to indicate
2401  // that we are reading an undefined value from SrcRegX, but a proper
2402  // value from SrcReg.
2403  BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
2404  .addReg(SrcRegX, RegState::Undef)
2405  .addImm(0)
2407  .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2408  } else {
2409  BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
2410  .addReg(SrcReg, getKillRegState(KillSrc))
2411  .addImm(0)
2413  }
2414  } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
2415  BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
2416  .addImm(0)
2418  } else {
2419  if (Subtarget.hasZeroCycleRegMove()) {
2420  // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
2421  unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
2422  &AArch64::GPR64spRegClass);
2423  unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
2424  &AArch64::GPR64spRegClass);
2425  // This instruction is reading and writing X registers. This may upset
2426  // the register scavenger and machine verifier, so we need to indicate
2427  // that we are reading an undefined value from SrcRegX, but a proper
2428  // value from SrcReg.
2429  BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
2430  .addReg(AArch64::XZR)
2431  .addReg(SrcRegX, RegState::Undef)
2432  .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2433  } else {
2434  // Otherwise, expand to ORR WZR.
2435  BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
2436  .addReg(AArch64::WZR)
2437  .addReg(SrcReg, getKillRegState(KillSrc));
2438  }
2439  }
2440  return;
2441  }
2442 
2443  if (AArch64::GPR64spRegClass.contains(DestReg) &&
2444  (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
2445  if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
2446  // If either operand is SP, expand to ADD #0.
2447  BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
2448  .addReg(SrcReg, getKillRegState(KillSrc))
2449  .addImm(0)
2451  } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
2452  BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
2453  .addImm(0)
2455  } else {
2456  // Otherwise, expand to ORR XZR.
2457  BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
2458  .addReg(AArch64::XZR)
2459  .addReg(SrcReg, getKillRegState(KillSrc));
2460  }
2461  return;
2462  }
2463 
2464  // Copy a DDDD register quad by copying the individual sub-registers.
2465  if (AArch64::DDDDRegClass.contains(DestReg) &&
2466  AArch64::DDDDRegClass.contains(SrcReg)) {
2467  static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2468  AArch64::dsub2, AArch64::dsub3};
2469  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2470  Indices);
2471  return;
2472  }
2473 
2474  // Copy a DDD register triple by copying the individual sub-registers.
2475  if (AArch64::DDDRegClass.contains(DestReg) &&
2476  AArch64::DDDRegClass.contains(SrcReg)) {
2477  static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2478  AArch64::dsub2};
2479  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2480  Indices);
2481  return;
2482  }
2483 
2484  // Copy a DD register pair by copying the individual sub-registers.
2485  if (AArch64::DDRegClass.contains(DestReg) &&
2486  AArch64::DDRegClass.contains(SrcReg)) {
2487  static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
2488  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2489  Indices);
2490  return;
2491  }
2492 
2493  // Copy a QQQQ register quad by copying the individual sub-registers.
2494  if (AArch64::QQQQRegClass.contains(DestReg) &&
2495  AArch64::QQQQRegClass.contains(SrcReg)) {
2496  static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2497  AArch64::qsub2, AArch64::qsub3};
2498  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2499  Indices);
2500  return;
2501  }
2502 
2503  // Copy a QQQ register triple by copying the individual sub-registers.
2504  if (AArch64::QQQRegClass.contains(DestReg) &&
2505  AArch64::QQQRegClass.contains(SrcReg)) {
2506  static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2507  AArch64::qsub2};
2508  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2509  Indices);
2510  return;
2511  }
2512 
2513  // Copy a QQ register pair by copying the individual sub-registers.
2514  if (AArch64::QQRegClass.contains(DestReg) &&
2515  AArch64::QQRegClass.contains(SrcReg)) {
2516  static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
2517  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2518  Indices);
2519  return;
2520  }
2521 
2522  if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
2523  AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
2524  static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
2525  copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
2526  AArch64::XZR, Indices);
2527  return;
2528  }
2529 
2530  if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
2531  AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
2532  static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
2533  copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
2534  AArch64::WZR, Indices);
2535  return;
2536  }
2537 
2538  if (AArch64::FPR128RegClass.contains(DestReg) &&
2539  AArch64::FPR128RegClass.contains(SrcReg)) {
2540  if (Subtarget.hasNEON()) {
2541  BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2542  .addReg(SrcReg)
2543  .addReg(SrcReg, getKillRegState(KillSrc));
2544  } else {
2545  BuildMI(MBB, I, DL, get(AArch64::STRQpre))
2546  .addReg(AArch64::SP, RegState::Define)
2547  .addReg(SrcReg, getKillRegState(KillSrc))
2548  .addReg(AArch64::SP)
2549  .addImm(-16);
 2550  BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
2551  .addReg(AArch64::SP, RegState::Define)
2552  .addReg(DestReg, RegState::Define)
2553  .addReg(AArch64::SP)
2554  .addImm(16);
2555  }
2556  return;
2557  }
2558 
2559  if (AArch64::FPR64RegClass.contains(DestReg) &&
2560  AArch64::FPR64RegClass.contains(SrcReg)) {
2561  if (Subtarget.hasNEON()) {
2562  DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
2563  &AArch64::FPR128RegClass);
2564  SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
2565  &AArch64::FPR128RegClass);
2566  BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2567  .addReg(SrcReg)
2568  .addReg(SrcReg, getKillRegState(KillSrc));
2569  } else {
2570  BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
2571  .addReg(SrcReg, getKillRegState(KillSrc));
2572  }
2573  return;
2574  }
2575 
2576  if (AArch64::FPR32RegClass.contains(DestReg) &&
2577  AArch64::FPR32RegClass.contains(SrcReg)) {
2578  if (Subtarget.hasNEON()) {
2579  DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
2580  &AArch64::FPR128RegClass);
2581  SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
2582  &AArch64::FPR128RegClass);
2583  BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2584  .addReg(SrcReg)
2585  .addReg(SrcReg, getKillRegState(KillSrc));
2586  } else {
2587  BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2588  .addReg(SrcReg, getKillRegState(KillSrc));
2589  }
2590  return;
2591  }
2592 
2593  if (AArch64::FPR16RegClass.contains(DestReg) &&
2594  AArch64::FPR16RegClass.contains(SrcReg)) {
2595  if (Subtarget.hasNEON()) {
2596  DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2597  &AArch64::FPR128RegClass);
2598  SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2599  &AArch64::FPR128RegClass);
2600  BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2601  .addReg(SrcReg)
2602  .addReg(SrcReg, getKillRegState(KillSrc));
2603  } else {
2604  DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2605  &AArch64::FPR32RegClass);
2606  SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2607  &AArch64::FPR32RegClass);
2608  BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2609  .addReg(SrcReg, getKillRegState(KillSrc));
2610  }
2611  return;
2612  }
2613 
2614  if (AArch64::FPR8RegClass.contains(DestReg) &&
2615  AArch64::FPR8RegClass.contains(SrcReg)) {
2616  if (Subtarget.hasNEON()) {
2617  DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
2618  &AArch64::FPR128RegClass);
2619  SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
2620  &AArch64::FPR128RegClass);
2621  BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2622  .addReg(SrcReg)
2623  .addReg(SrcReg, getKillRegState(KillSrc));
2624  } else {
2625  DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
2626  &AArch64::FPR32RegClass);
2627  SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
2628  &AArch64::FPR32RegClass);
2629  BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2630  .addReg(SrcReg, getKillRegState(KillSrc));
2631  }
2632  return;
2633  }
2634 
2635  // Copies between GPR64 and FPR64.
2636  if (AArch64::FPR64RegClass.contains(DestReg) &&
2637  AArch64::GPR64RegClass.contains(SrcReg)) {
2638  BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
2639  .addReg(SrcReg, getKillRegState(KillSrc));
2640  return;
2641  }
2642  if (AArch64::GPR64RegClass.contains(DestReg) &&
2643  AArch64::FPR64RegClass.contains(SrcReg)) {
2644  BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
2645  .addReg(SrcReg, getKillRegState(KillSrc));
2646  return;
2647  }
2648  // Copies between GPR32 and FPR32.
2649  if (AArch64::FPR32RegClass.contains(DestReg) &&
2650  AArch64::GPR32RegClass.contains(SrcReg)) {
2651  BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
2652  .addReg(SrcReg, getKillRegState(KillSrc));
2653  return;
2654  }
2655  if (AArch64::GPR32RegClass.contains(DestReg) &&
2656  AArch64::FPR32RegClass.contains(SrcReg)) {
2657  BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
2658  .addReg(SrcReg, getKillRegState(KillSrc));
2659  return;
2660  }
2661 
2662  if (DestReg == AArch64::NZCV) {
2663  assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
2664  BuildMI(MBB, I, DL, get(AArch64::MSR))
2665  .addImm(AArch64SysReg::NZCV)
2666  .addReg(SrcReg, getKillRegState(KillSrc))
2667  .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
2668  return;
2669  }
2670 
2671  if (SrcReg == AArch64::NZCV) {
2672  assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
2673  BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
2674  .addImm(AArch64SysReg::NZCV)
2675  .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
2676  return;
2677  }
2678 
2679  llvm_unreachable("unimplemented reg-to-reg copy");
2680 }
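// Worked examples (editorial, not from the upstream file) of the expansions
// chosen above for a plain "mov w0, w1":
//   - with the zero-cycle-regmove feature (e.g. Cyclone): orr x0, xzr, x1,
//     written on the X registers so the core treats it as a zero-cycle move,
//     with w1 kept as an implicit use to satisfy the machine verifier;
//   - otherwise: orr w0, wzr, w1.
// An FPR128 copy on a subtarget without NEON is bounced through the stack:
//   str q<src>, [sp, #-16]!  followed by  ldr q<dst>, [sp], #16.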
2681 
2683  MachineBasicBlock &MBB,
2684  MachineBasicBlock::iterator InsertBefore,
2685  const MCInstrDesc &MCID,
2686  unsigned SrcReg, bool IsKill,
2687  unsigned SubIdx0, unsigned SubIdx1, int FI,
2688  MachineMemOperand *MMO) {
2689  unsigned SrcReg0 = SrcReg;
2690  unsigned SrcReg1 = SrcReg;
2692  SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
2693  SubIdx0 = 0;
2694  SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
2695  SubIdx1 = 0;
2696  }
2697  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
2698  .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
2699  .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
2700  .addFrameIndex(FI)
2701  .addImm(0)
2702  .addMemOperand(MMO);
2703 }
2704 
2706  MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg,
2707  bool isKill, int FI, const TargetRegisterClass *RC,
2708  const TargetRegisterInfo *TRI) const {
2709  MachineFunction &MF = *MBB.getParent();
2710  MachineFrameInfo &MFI = MF.getFrameInfo();
2711  unsigned Align = MFI.getObjectAlignment(FI);
2712 
2715  PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
2716  unsigned Opc = 0;
2717  bool Offset = true;
2718  switch (TRI->getSpillSize(*RC)) {
2719  case 1:
2720  if (AArch64::FPR8RegClass.hasSubClassEq(RC))
2721  Opc = AArch64::STRBui;
2722  break;
2723  case 2:
2724  if (AArch64::FPR16RegClass.hasSubClassEq(RC))
2725  Opc = AArch64::STRHui;
2726  break;
2727  case 4:
2728  if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
2729  Opc = AArch64::STRWui;
2731  MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
2732  else
2733  assert(SrcReg != AArch64::WSP);
2734  } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
2735  Opc = AArch64::STRSui;
2736  break;
2737  case 8:
2738  if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
2739  Opc = AArch64::STRXui;
2741  MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
2742  else
2743  assert(SrcReg != AArch64::SP);
2744  } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
2745  Opc = AArch64::STRDui;
2746  } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
2748  get(AArch64::STPWi), SrcReg, isKill,
2749  AArch64::sube32, AArch64::subo32, FI, MMO);
2750  return;
2751  }
2752  break;
2753  case 16:
2754  if (AArch64::FPR128RegClass.hasSubClassEq(RC))
2755  Opc = AArch64::STRQui;
2756  else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
2757  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2758  Opc = AArch64::ST1Twov1d;
2759  Offset = false;
2760  } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
2762  get(AArch64::STPXi), SrcReg, isKill,
2763  AArch64::sube64, AArch64::subo64, FI, MMO);
2764  return;
2765  }
2766  break;
2767  case 24:
2768  if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
2769  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2770  Opc = AArch64::ST1Threev1d;
2771  Offset = false;
2772  }
2773  break;
2774  case 32:
2775  if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
2776  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2777  Opc = AArch64::ST1Fourv1d;
2778  Offset = false;
2779  } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
2780  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2781  Opc = AArch64::ST1Twov2d;
2782  Offset = false;
2783  }
2784  break;
2785  case 48:
2786  if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
2787  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2788  Opc = AArch64::ST1Threev2d;
2789  Offset = false;
2790  }
2791  break;
2792  case 64:
2793  if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
2794  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2795  Opc = AArch64::ST1Fourv2d;
2796  Offset = false;
2797  }
2798  break;
2799  }
2800  assert(Opc && "Unknown register class");
2801 
2802  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
2803  .addReg(SrcReg, getKillRegState(isKill))
2804  .addFrameIndex(FI);
2805 
2806  if (Offset)
2807  MI.addImm(0);
2808  MI.addMemOperand(MMO);
2809 }
2810 
2812  MachineBasicBlock &MBB,
2813  MachineBasicBlock::iterator InsertBefore,
2814  const MCInstrDesc &MCID,
2815  unsigned DestReg, unsigned SubIdx0,
2816  unsigned SubIdx1, int FI,
2817  MachineMemOperand *MMO) {
2818  unsigned DestReg0 = DestReg;
2819  unsigned DestReg1 = DestReg;
2820  bool IsUndef = true;
2822  DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
2823  SubIdx0 = 0;
2824  DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
2825  SubIdx1 = 0;
2826  IsUndef = false;
2827  }
2828  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
2829  .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
2830  .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
2831  .addFrameIndex(FI)
2832  .addImm(0)
2833  .addMemOperand(MMO);
2834 }
2835 
2837  MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg,
2838  int FI, const TargetRegisterClass *RC,
2839  const TargetRegisterInfo *TRI) const {
2840  MachineFunction &MF = *MBB.getParent();
2841  MachineFrameInfo &MFI = MF.getFrameInfo();
2842  unsigned Align = MFI.getObjectAlignment(FI);
2845  PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align);
2846 
2847  unsigned Opc = 0;
2848  bool Offset = true;
2849  switch (TRI->getSpillSize(*RC)) {
2850  case 1:
2851  if (AArch64::FPR8RegClass.hasSubClassEq(RC))
2852  Opc = AArch64::LDRBui;
2853  break;
2854  case 2:
2855  if (AArch64::FPR16RegClass.hasSubClassEq(RC))
2856  Opc = AArch64::LDRHui;
2857  break;
2858  case 4:
2859  if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
2860  Opc = AArch64::LDRWui;
2862  MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
2863  else
2864  assert(DestReg != AArch64::WSP);
2865  } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
2866  Opc = AArch64::LDRSui;
2867  break;
2868  case 8:
2869  if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
2870  Opc = AArch64::LDRXui;
2872  MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
2873  else
2874  assert(DestReg != AArch64::SP);
2875  } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
2876  Opc = AArch64::LDRDui;
2877  } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
2879  get(AArch64::LDPWi), DestReg, AArch64::sube32,
2880  AArch64::subo32, FI, MMO);
2881  return;
2882  }
2883  break;
2884  case 16:
2885  if (AArch64::FPR128RegClass.hasSubClassEq(RC))
2886  Opc = AArch64::LDRQui;
2887  else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
2888  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2889  Opc = AArch64::LD1Twov1d;
2890  Offset = false;
2891  } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
2893  get(AArch64::LDPXi), DestReg, AArch64::sube64,
2894  AArch64::subo64, FI, MMO);
2895  return;
2896  }
2897  break;
2898  case 24:
2899  if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
2900  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2901  Opc = AArch64::LD1Threev1d;
2902  Offset = false;
2903  }
2904  break;
2905  case 32:
2906  if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
2907  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2908  Opc = AArch64::LD1Fourv1d;
2909  Offset = false;
2910  } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
2911  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2912  Opc = AArch64::LD1Twov2d;
2913  Offset = false;
2914  }
2915  break;
2916  case 48:
2917  if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
2918  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2919  Opc = AArch64::LD1Threev2d;
2920  Offset = false;
2921  }
2922  break;
2923  case 64:
2924  if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
2925  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2926  Opc = AArch64::LD1Fourv2d;
2927  Offset = false;
2928  }
2929  break;
2930  }
2931  assert(Opc && "Unknown register class");
2932 
2933  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
2934  .addReg(DestReg, getDefRegState(true))
2935  .addFrameIndex(FI);
2936  if (Offset)
2937  MI.addImm(0);
2938  MI.addMemOperand(MMO);
2939 }
2940 
2942  MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
2943  unsigned DestReg, unsigned SrcReg, int Offset,
2944  const TargetInstrInfo *TII,
2945  MachineInstr::MIFlag Flag, bool SetNZCV,
2946  bool NeedsWinCFI) {
2947  if (DestReg == SrcReg && Offset == 0)
2948  return;
2949 
2950  assert((DestReg != AArch64::SP || Offset % 16 == 0) &&
2951  "SP increment/decrement not 16-byte aligned");
2952 
2953  bool isSub = Offset < 0;
2954  if (isSub)
2955  Offset = -Offset;
2956 
2957  // FIXME: If the offset won't fit in 24-bits, compute the offset into a
2958  // scratch register. If DestReg is a virtual register, use it as the
2959  // scratch register; otherwise, create a new virtual register (to be
2960  // replaced by the scavenger at the end of PEI). That case can be optimized
2961  // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
2962  // register can be loaded with offset%8 and the add/sub can use an extending
2963  // instruction with LSL#3.
2964  // Currently the function handles any offsets but generates a poor sequence
2965  // of code.
2966  // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
2967 
2968  unsigned Opc;
2969  if (SetNZCV)
2970  Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri;
2971  else
2972  Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri;
2973  const unsigned MaxEncoding = 0xfff;
2974  const unsigned ShiftSize = 12;
2975  const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
2976  while (((unsigned)Offset) >= (1 << ShiftSize)) {
2977  unsigned ThisVal;
2978  if (((unsigned)Offset) > MaxEncodableValue) {
2979  ThisVal = MaxEncodableValue;
2980  } else {
2981  ThisVal = Offset & MaxEncodableValue;
2982  }
2983  assert((ThisVal >> ShiftSize) <= MaxEncoding &&
2984  "Encoding cannot handle value that big");
2985  BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
2986  .addReg(SrcReg)
2987  .addImm(ThisVal >> ShiftSize)
2989  .setMIFlag(Flag);
2990 
2991  if (NeedsWinCFI && SrcReg == AArch64::SP && DestReg == AArch64::SP)
2992  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
2993  .addImm(ThisVal)
2994  .setMIFlag(Flag);
2995 
2996  SrcReg = DestReg;
2997  Offset -= ThisVal;
2998  if (Offset == 0)
2999  return;
3000  }
3001  BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
3002  .addReg(SrcReg)
3003  .addImm(Offset)
3005  .setMIFlag(Flag);
3006 
3007  if (NeedsWinCFI) {
3008  if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
3009  (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
3010  if (Offset == 0)
3011  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).
3012  setMIFlag(Flag);
3013  else
3014  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)).
3015  addImm(Offset).setMIFlag(Flag);
3016  } else if (DestReg == AArch64::SP) {
3017  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)).
3018  addImm(Offset).setMIFlag(Flag);
3019  }
3020  }
3021 }
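// Worked example (editorial, not from the upstream file): emitting
// "add sp, sp, #8208" is split into chunks because ADD/SUB only encode a
// 12-bit immediate, optionally shifted left by 12:
//   add sp, sp, #2, lsl #12     ; 8192, the high chunk
//   add sp, sp, #16             ; the remaining low chunk
// Offsets too large for one shifted immediate repeat the maximal chunk
// (0xfff << 12) until what remains is encodable.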
3022 
3026  LiveIntervals *LIS) const {
3027  // This is a bit of a hack. Consider this instruction:
3028  //
3029  // %0 = COPY %sp; GPR64all:%0
3030  //
3031  // We explicitly chose GPR64all for the virtual register so such a copy might
3032  // be eliminated by RegisterCoalescer. However, that may not be possible, and
3033  // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
3034  // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
3035  //
3036  // To prevent that, we are going to constrain the %0 register class here.
3037  //
3038  // <rdar://problem/11522048>
3039  //
3040  if (MI.isFullCopy()) {
3041  unsigned DstReg = MI.getOperand(0).getReg();
3042  unsigned SrcReg = MI.getOperand(1).getReg();
3043  if (SrcReg == AArch64::SP &&
3045  MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
3046  return nullptr;
3047  }
3048  if (DstReg == AArch64::SP &&
3050  MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
3051  return nullptr;
3052  }
3053  }
3054 
3055  // Handle the case where a copy is being spilled or filled but the source
3056  // and destination register class don't match. For example:
3057  //
3058  // %0 = COPY %xzr; GPR64common:%0
3059  //
3060  // In this case we can still safely fold away the COPY and generate the
3061  // following spill code:
3062  //
3063  // STRXui %xzr, %stack.0
3064  //
3065  // This also eliminates spilled cross register class COPYs (e.g. between x and
3066  // d regs) of the same size. For example:
3067  //
3068  // %0 = COPY %1; GPR64:%0, FPR64:%1
3069  //
3070  // will be filled as
3071  //
3072  // LDRDui %0, fi<#0>
3073  //
3074  // instead of
3075  //
3076  // LDRXui %Temp, fi<#0>
3077  // %0 = FMOV %Temp
3078  //
3079  if (MI.isCopy() && Ops.size() == 1 &&
3080  // Make sure we're only folding the explicit COPY defs/uses.
3081  (Ops[0] == 0 || Ops[0] == 1)) {
3082  bool IsSpill = Ops[0] == 0;
3083  bool IsFill = !IsSpill;
3085  const MachineRegisterInfo &MRI = MF.getRegInfo();
3086  MachineBasicBlock &MBB = *MI.getParent();
3087  const MachineOperand &DstMO = MI.getOperand(0);
3088  const MachineOperand &SrcMO = MI.getOperand(1);
3089  unsigned DstReg = DstMO.getReg();
3090  unsigned SrcReg = SrcMO.getReg();
3091  // This is slightly expensive to compute for physical regs since
3092  // getMinimalPhysRegClass is slow.
3093  auto getRegClass = [&](unsigned Reg) {
3095  ? MRI.getRegClass(Reg)
3096  : TRI.getMinimalPhysRegClass(Reg);
3097  };
3098 
3099  if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
3100  assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
3101  TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
3102  "Mismatched register size in non subreg COPY");
3103  if (IsSpill)
3104  storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
3105  getRegClass(SrcReg), &TRI);
3106  else
3107  loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
3108  getRegClass(DstReg), &TRI);
3109  return &*--InsertPt;
3110  }
3111 
3112  // Handle cases like spilling def of:
3113  //
3114  // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
3115  //
3116  // where the physical register source can be widened and stored to the full
3117  // virtual reg destination stack slot, in this case producing:
3118  //
3119  // STRXui %xzr, %stack.0
3120  //
3121  if (IsSpill && DstMO.isUndef() &&
3123  assert(SrcMO.getSubReg() == 0 &&
3124  "Unexpected subreg on physical register");
3125  const TargetRegisterClass *SpillRC;
3126  unsigned SpillSubreg;
3127  switch (DstMO.getSubReg()) {
3128  default:
3129  SpillRC = nullptr;
3130  break;
3131  case AArch64::sub_32:
3132  case AArch64::ssub:
3133  if (AArch64::GPR32RegClass.contains(SrcReg)) {
3134  SpillRC = &AArch64::GPR64RegClass;
3135  SpillSubreg = AArch64::sub_32;
3136  } else if (AArch64::FPR32RegClass.contains(SrcReg)) {
3137  SpillRC = &AArch64::FPR64RegClass;
3138  SpillSubreg = AArch64::ssub;
3139  } else
3140  SpillRC = nullptr;
3141  break;
3142  case AArch64::dsub:
3143  if (AArch64::FPR64RegClass.contains(SrcReg)) {
3144  SpillRC = &AArch64::FPR128RegClass;
3145  SpillSubreg = AArch64::dsub;
3146  } else
3147  SpillRC = nullptr;
3148  break;
3149  }
3150 
3151  if (SpillRC)
3152  if (unsigned WidenedSrcReg =
3153  TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
3154  storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
3155  FrameIndex, SpillRC, &TRI);
3156  return &*--InsertPt;
3157  }
3158  }
3159 
3160  // Handle cases like filling use of:
3161  //
3162  // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
3163  //
3164  // where we can load the full virtual reg source stack slot, into the subreg
3165  // destination, in this case producing:
3166  //
3167  // LDRWui %0:sub_32<def,read-undef>, %stack.0
3168  //
3169  if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
3170  const TargetRegisterClass *FillRC;
3171  switch (DstMO.getSubReg()) {
3172  default:
3173  FillRC = nullptr;
3174  break;
3175  case AArch64::sub_32:
3176  FillRC = &AArch64::GPR32RegClass;
3177  break;
3178  case AArch64::ssub:
3179  FillRC = &AArch64::FPR32RegClass;
3180  break;
3181  case AArch64::dsub:
3182  FillRC = &AArch64::FPR64RegClass;
3183  break;
3184  }
3185 
3186  if (FillRC) {
3187  assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
3188  TRI.getRegSizeInBits(*FillRC) &&
3189  "Mismatched regclass size on folded subreg COPY");
3190  loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
3191  MachineInstr &LoadMI = *--InsertPt;
3192  MachineOperand &LoadDst = LoadMI.getOperand(0);
3193  assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
3194  LoadDst.setSubReg(DstMO.getSubReg());
3195  LoadDst.setIsUndef();
3196  return &LoadMI;
3197  }
3198  }
3199  }
3200 
3201  // Cannot fold.
3202  return nullptr;
3203 }
3204 
3206  bool *OutUseUnscaledOp,
3207  unsigned *OutUnscaledOp,
3208  int *EmittableOffset) {
3209  // Set output values in case of early exit.
3210  if (EmittableOffset)
3211  *EmittableOffset = 0;
3212  if (OutUseUnscaledOp)
3213  *OutUseUnscaledOp = false;
3214  if (OutUnscaledOp)
3215  *OutUnscaledOp = 0;
3216 
3217  // Exit early for structured vector spills/fills as they can't take an
3218  // immediate offset.
3219  switch (MI.getOpcode()) {
3220  default:
3221  break;
3222  case AArch64::LD1Twov2d:
3223  case AArch64::LD1Threev2d:
3224  case AArch64::LD1Fourv2d:
3225  case AArch64::LD1Twov1d:
3226  case AArch64::LD1Threev1d:
3227  case AArch64::LD1Fourv1d:
3228  case AArch64::ST1Twov2d:
3229  case AArch64::ST1Threev2d:
3230  case AArch64::ST1Fourv2d:
3231  case AArch64::ST1Twov1d:
3232  case AArch64::ST1Threev1d:
3233  case AArch64::ST1Fourv1d:
3235  }
3236 
3237  // Get the min/max offset and the scale.
3238  unsigned Scale, Width;
3239  int64_t MinOff, MaxOff;
3240  if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), Scale, Width, MinOff,
3241  MaxOff))
3242  llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
3243 
3244  // Construct the complete offset.
3245  const MachineOperand &ImmOpnd =
3247  Offset += ImmOpnd.getImm() * Scale;
3248 
 3249  // If the offset isn't a multiple of the scale, rewrite the instruction to
 3250  // use the unscaled variant instead. Likewise, if the offset is negative
 3251  // and there is an unscaled op to use.
3252  Optional<unsigned> UnscaledOp =
3254  bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
3255  if (useUnscaledOp &&
3256  !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, Scale, Width, MinOff, MaxOff))
3257  llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
3258 
3259  int64_t Remainder = Offset % Scale;
3260  assert(!(Remainder && useUnscaledOp) &&
3261  "Cannot have remainder when using unscaled op");
3262 
3263  assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
3264  int NewOffset = Offset / Scale;
3265  if (MinOff <= NewOffset && NewOffset <= MaxOff)
3266  Offset = Remainder;
3267  else {
3268  NewOffset = NewOffset < 0 ? MinOff : MaxOff;
3269  Offset = Offset - NewOffset * Scale + Remainder;
3270  }
3271 
3272  if (EmittableOffset)
3273  *EmittableOffset = NewOffset;
3274  if (OutUseUnscaledOp)
3275  *OutUseUnscaledOp = useUnscaledOp;
3276  if (OutUnscaledOp && UnscaledOp)
3277  *OutUnscaledOp = *UnscaledOp;
3278 
3280  (Offset == 0 ? AArch64FrameOffsetIsLegal : 0);
3281 }
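// Worked examples (editorial, not from the upstream file):
//  * STRXui with a total byte offset of 20: 20 is not a multiple of the
//    scale 8, but STURXi exists, has scale 1 and accepts [-256, 255], so the
//    result is CanUpdate|IsLegal with an emittable offset of 20 and the
//    unscaled opcode reported through OutUnscaledOp.
//  * STRXui with a byte offset of 40000: 40000/8 = 5000 exceeds the 4095
//    immediate limit, so the immediate is clamped to 4095 and the residual
//    40000 - 4095*8 = 7240 bytes are left in Offset for the caller (e.g.
//    rewriteAArch64FrameIndex below) to materialize separately.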
3282 
3283 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
3284  unsigned FrameReg, int &Offset,
3285  const AArch64InstrInfo *TII) {
3286  unsigned Opcode = MI.getOpcode();
3287  unsigned ImmIdx = FrameRegIdx + 1;
3288 
3289  if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
3290  Offset += MI.getOperand(ImmIdx).getImm();
3291  emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
3292  MI.getOperand(0).getReg(), FrameReg, Offset, TII,
3293  MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
3294  MI.eraseFromParent();
3295  Offset = 0;
3296  return true;
3297  }
3298 
3299  int NewOffset;
3300  unsigned UnscaledOp;
3301  bool UseUnscaledOp;
3302  int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
3303  &UnscaledOp, &NewOffset);
3304  if (Status & AArch64FrameOffsetCanUpdate) {
3305  if (Status & AArch64FrameOffsetIsLegal)
3306  // Replace the FrameIndex with FrameReg.
3307  MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
3308  if (UseUnscaledOp)
3309  MI.setDesc(TII->get(UnscaledOp));
3310 
3311  MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
3312  return Offset == 0;
3313  }
3314 
3315  return false;
3316 }
3317 
3318 void AArch64InstrInfo::getNoop(MCInst &NopInst) const {
3319  NopInst.setOpcode(AArch64::HINT);
3320  NopInst.addOperand(MCOperand::createImm(0));
3321 }
3322 
3323 // AArch64 supports MachineCombiner.
3324 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
3325 
3326 // True when Opc sets flag
3327 static bool isCombineInstrSettingFlag(unsigned Opc) {
3328  switch (Opc) {
3329  case AArch64::ADDSWrr:
3330  case AArch64::ADDSWri:
3331  case AArch64::ADDSXrr:
3332  case AArch64::ADDSXri:
3333  case AArch64::SUBSWrr:
3334  case AArch64::SUBSXrr:
3335  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3336  case AArch64::SUBSWri:
3337  case AArch64::SUBSXri:
3338  return true;
3339  default:
3340  break;
3341  }
3342  return false;
3343 }
3344 
3345 // 32b Opcodes that can be combined with a MUL
3346 static bool isCombineInstrCandidate32(unsigned Opc) {
3347  switch (Opc) {
3348  case AArch64::ADDWrr:
3349  case AArch64::ADDWri:
3350  case AArch64::SUBWrr:
3351  case AArch64::ADDSWrr:
3352  case AArch64::ADDSWri:
3353  case AArch64::SUBSWrr:
3354  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3355  case AArch64::SUBWri:
3356  case AArch64::SUBSWri:
3357  return true;
3358  default:
3359  break;
3360  }
3361  return false;
3362 }
3363 
3364 // 64b Opcodes that can be combined with a MUL
3365 static bool isCombineInstrCandidate64(unsigned Opc) {
3366  switch (Opc) {
3367  case AArch64::ADDXrr:
3368  case AArch64::ADDXri:
3369  case AArch64::SUBXrr:
3370  case AArch64::ADDSXrr:
3371  case AArch64::ADDSXri:
3372  case AArch64::SUBSXrr:
3373  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3374  case AArch64::SUBXri:
3375  case AArch64::SUBSXri:
3376  return true;
3377  default:
3378  break;
3379  }
3380  return false;
3381 }
3382 
3383 // FP Opcodes that can be combined with a FMUL
3384 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
3385  switch (Inst.getOpcode()) {
3386  default:
3387  break;
3388  case AArch64::FADDSrr:
3389  case AArch64::FADDDrr:
3390  case AArch64::FADDv2f32:
3391  case AArch64::FADDv2f64:
3392  case AArch64::FADDv4f32:
3393  case AArch64::FSUBSrr:
3394  case AArch64::FSUBDrr:
3395  case AArch64::FSUBv2f32:
3396  case AArch64::FSUBv2f64:
3397  case AArch64::FSUBv4f32:
3398  TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
3399  return (Options.UnsafeFPMath ||
3400  Options.AllowFPOpFusion == FPOpFusion::Fast);
3401  }
3402  return false;
3403 }
3404 
3405 // Opcodes that can be combined with a MUL
3406 static bool isCombineInstrCandidate(unsigned Opc) {
3408 }
3409 
3410 //
3411 // Utility routine that checks if \param MO is defined by an
3412 // \param CombineOpc instruction in the basic block \param MBB
3414  unsigned CombineOpc, unsigned ZeroReg = 0,
3415  bool CheckZeroReg = false) {
3416  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3417  MachineInstr *MI = nullptr;
3418 
3420  MI = MRI.getUniqueVRegDef(MO.getReg());
3421  // And it needs to be in the trace (otherwise, it won't have a depth).
3422  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
3423  return false;
 3424  // Must only be used by the user we combine with.
3425  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
3426  return false;
3427 
3428  if (CheckZeroReg) {
3429  assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
3430  MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
 3431  MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
3432  // The third input reg must be zero.
3433  if (MI->getOperand(3).getReg() != ZeroReg)
3434  return false;
3435  }
3436 
3437  return true;
3438 }
3439 
3440 //
3441 // Is \param MO defined by an integer multiply and can be combined?
3443  unsigned MulOpc, unsigned ZeroReg) {
3444  return canCombine(MBB, MO, MulOpc, ZeroReg, true);
3445 }
3446 
3447 //
3448 // Is \param MO defined by a floating-point multiply and can be combined?
3450  unsigned MulOpc) {
3451  return canCombine(MBB, MO, MulOpc);
3452 }
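// Worked example (editorial, not from the upstream file): in MIR a plain
// 64-bit multiply is represented as an MADD whose addend is the zero
// register, e.g.
//   %2:gpr64 = MADDXrrr %0, %1, $xzr      ; mul x2, x0, x1
// canCombineWithMUL checks that an add/sub operand is defined by exactly such
// an instruction in the same block and that the multiply has no other user,
// so the add and the multiply can later be fused into a real MADD/MSUB.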
3453 
3454 // TODO: There are many more machine instruction opcodes to match:
3455 // 1. Other data types (integer, vectors)
3456 // 2. Other math / logic operations (xor, or)
3457 // 3. Other forms of the same operation (intrinsics and other variants)
3459  const MachineInstr &Inst) const {
3460  switch (Inst.getOpcode()) {
3461  case AArch64::FADDDrr:
3462  case AArch64::FADDSrr:
3463  case AArch64::FADDv2f32:
3464  case AArch64::FADDv2f64:
3465  case AArch64::FADDv4f32:
3466  case AArch64::FMULDrr:
3467  case AArch64::FMULSrr:
3468  case AArch64::FMULX32:
3469  case AArch64::FMULX64:
3470  case AArch64::FMULXv2f32:
3471  case AArch64::FMULXv2f64:
3472  case AArch64::FMULXv4f32:
3473  case AArch64::FMULv2f32:
3474  case AArch64::FMULv2f64:
3475  case AArch64::FMULv4f32:
3476  return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
3477  default:
3478  return false;
3479  }
3480 }
3481 
3482 /// Find instructions that can be turned into madd.
3483 static bool getMaddPatterns(MachineInstr &Root,
3485  unsigned Opc = Root.getOpcode();
3486  MachineBasicBlock &MBB = *Root.getParent();
3487  bool Found = false;
3488 
3489  if (!isCombineInstrCandidate(Opc))
3490  return false;
3491  if (isCombineInstrSettingFlag(Opc)) {
3492  int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
 3493  // When NZCV is live, bail out.
3494  if (Cmp_NZCV == -1)
3495  return false;
3496  unsigned NewOpc = convertToNonFlagSettingOpc(Root);
 3498  // When the opcode can't change, bail out.
3498  // CHECKME: do we miss any cases for opcode conversion?
3499  if (NewOpc == Opc)
3500  return false;
3501  Opc = NewOpc;
3502  }
3503 
3504  switch (Opc) {
3505  default:
3506  break;
3507  case AArch64::ADDWrr:
3508  assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
3509  "ADDWrr does not have register operands");
3510  if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3511  AArch64::WZR)) {
3513  Found = true;
3514  }
3515  if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
3516  AArch64::WZR)) {
3518  Found = true;
3519  }
3520  break;
3521  case AArch64::ADDXrr:
3522  if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3523  AArch64::XZR)) {
3525  Found = true;
3526  }
3527  if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
3528  AArch64::XZR)) {
3530  Found = true;
3531  }
3532  break;
3533  case AArch64::SUBWrr:
3534  if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3535  AArch64::WZR)) {
3537  Found = true;
3538  }
3539  if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
3540  AArch64::WZR)) {
3542  Found = true;
3543  }
3544  break;
3545  case AArch64::SUBXrr:
3546  if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3547  AArch64::XZR)) {
3549  Found = true;
3550  }
3551  if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
3552  AArch64::XZR)) {
3554  Found = true;
3555  }
3556  break;
3557  case AArch64::ADDWri:
3558  if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3559  AArch64::WZR)) {
3561  Found = true;
3562  }
3563  break;
3564  case AArch64::ADDXri:
3565  if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3566  AArch64::XZR)) {
3568  Found = true;
3569  }
3570  break;
3571  case AArch64::SUBWri:
3572  if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3573  AArch64::WZR)) {
3575  Found = true;
3576  }
3577  break;
3578  case AArch64::SUBXri:
3579  if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3580  AArch64::XZR)) {
3582  Found = true;
3583  }
3584  break;
3585  }
3586  return Found;
3587 }
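// Worked example (editorial, not from the upstream file): for
//   %3:gpr32 = MADDWrrr %0, %1, $wzr      ; mul w3, w0, w1
//   %4:gpr32 = ADDWrr %2, %3
// the ADDWrr case above records a multiply-accumulate pattern for operand 2,
// and the machine combiner can then rewrite the pair as
//   %4:gpr32 = MADDWrrr %0, %1, %2        ; madd w4, w0, w1, w2
// provided its latency/throughput model considers that profitable.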
3588 /// Floating-Point Support
3589 
 3590 /// Find instructions that can be turned into fmadd.
3591 static bool getFMAPatterns(MachineInstr &Root,
3593 
3594  if (!isCombineInstrCandidateFP(Root))
3595  return false;
3596 
3597  MachineBasicBlock &MBB = *Root.getParent();
3598  bool Found = false;
3599 
3600  switch (Root.getOpcode()) {
3601  default:
3602  assert(false && "Unsupported FP instruction in combiner\n");
3603  break;
3604  case AArch64::FADDSrr:
3605  assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
3606  "FADDWrr does not have register operands");
3607  if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
3609  Found = true;
3610  } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3611  AArch64::FMULv1i32_indexed)) {
3613  Found = true;
3614  }
3615  if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
3617  Found = true;
3618  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3619  AArch64::FMULv1i32_indexed)) {
3621  Found = true;
3622  }
3623  break;
3624  case AArch64::FADDDrr:
3625  if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
3627  Found = true;
3628  } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3629  AArch64::FMULv1i64_indexed)) {
3631  Found = true;
3632  }
3633  if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
3635  Found = true;
3636  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3637  AArch64::FMULv1i64_indexed)) {
3639  Found = true;
3640  }
3641  break;
3642  case AArch64::FADDv2f32:
3643  if (canCombineWithFMUL(MBB, Root.getOperand(1),
3644  AArch64::FMULv2i32_indexed)) {
3646  Found = true;
3647  } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3648  AArch64::FMULv2f32)) {
3650  Found = true;
3651  }
3652  if (canCombineWithFMUL(MBB, Root.getOperand(2),
3653  AArch64::FMULv2i32_indexed)) {
3655  Found = true;
3656  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3657  AArch64::FMULv2f32)) {
3659  Found = true;
3660  }
3661  break;
3662  case AArch64::FADDv2f64:
3663  if (canCombineWithFMUL(MBB, Root.getOperand(1),
3664  AArch64::FMULv2i64_indexed)) {
3666  Found = true;
3667  } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3668  AArch64::FMULv2f64)) {
3670  Found = true;
3671  }
3672  if (canCombineWithFMUL(MBB, Root.getOperand(2),
3673  AArch64::FMULv2i64_indexed)) {
3675  Found = true;
3676  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3677  AArch64::FMULv2f64)) {
3679  Found = true;
3680  }
3681  break;
3682  case AArch64::FADDv4f32:
3683  if (canCombineWithFMUL(MBB, Root.getOperand(1),
3684  AArch64::FMULv4i32_indexed)) {
3686  Found = true;
3687  } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3688  AArch64::FMULv4f32)) {
3690  Found = true;
3691  }
3692  if (canCombineWithFMUL(MBB, Root.getOperand(2),
3693  AArch64::FMULv4i32_indexed)) {
3695  Found = true;
3696  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3697  AArch64::FMULv4f32)) {
3699  Found = true;
3700  }
3701  break;
3702 
3703  case AArch64::FSUBSrr:
3704  if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
3706  Found = true;
3707  }
3708  if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
3710  Found = true;
3711  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3712  AArch64::FMULv1i32_indexed)) {
3714  Found = true;
3715  }
3716  if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) {
3718  Found = true;
3719  }
3720  break;
3721  case AArch64::FSUBDrr:
3722  if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
3724  Found = true;
3725  }
3726  if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
3728  Found = true;
3729  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3730  AArch64::FMULv1i64_indexed)) {
3732  Found = true;
3733  }
3734  if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) {
3736  Found = true;
3737  }
3738  break;
3739  case AArch64::FSUBv2f32:
3740  if (canCombineWithFMUL(MBB, Root.getOperand(2),
3741  AArch64::FMULv2i32_indexed)) {
3743  Found = true;
3744  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3745  AArch64::FMULv2f32)) {
3747  Found = true;
3748  }
3749  if (canCombineWithFMUL(MBB, Root.getOperand(1),
3750  AArch64::FMULv2i32_indexed)) {
3752  Found = true;
3753  } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3754  AArch64::FMULv2f32)) {
3756  Found = true;
3757  }
3758  break;
3759  case AArch64::FSUBv2f64:
3760  if (canCombineWithFMUL(MBB, Root.getOperand(2),
3761  AArch64::FMULv2i64_indexed)) {
3763  Found = true;
3764  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3765  AArch64::FMULv2f64)) {
3767  Found = true;
3768  }
3769  if (canCombineWithFMUL(MBB, Root.getOperand(1),
3770  AArch64::FMULv2i64_indexed)) {
3772  Found = true;
3773  } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3774  AArch64::FMULv2f64)) {
3776  Found = true;
3777  }
3778  break;
3779  case AArch64::FSUBv4f32:
3780  if (canCombineWithFMUL(MBB, Root.getOperand(2),
3781  AArch64::FMULv4i32_indexed)) {
3783  Found = true;
3784  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3785  AArch64::FMULv4f32)) {
3787  Found = true;
3788  }
3789  if (canCombineWithFMUL(MBB, Root.getOperand(1),
3790  AArch64::FMULv4i32_indexed)) {
3792  Found = true;
3793  } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3794  AArch64::FMULv4f32)) {
3796  Found = true;
3797  }
3798  break;
3799  }
3800  return Found;
3801 }
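// For illustration (virtual register numbers are arbitrary), a v2f32 FADD
// whose first operand is the result of a plain FMUL is recorded as an
// accumulator-style pattern, which genFusedMultiply later rewrites roughly as:
//
//   %2:fpr64 = FMULv2f32 %0, %1
//   %3:fpr64 = FADDv2f32 killed %2, %4
// ==>
//   %3:fpr64 = FMLAv2f32 %4, %0, %1     // accumulator first, then multiplicands
//
// The *_indexed FMUL forms are matched separately so that the lane index can
// be preserved in an FMLAv*_indexed instruction instead.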
3802 
3803 /// Return true when a code sequence can improve throughput. It
3804 /// should be called only for instructions in loops.
3805 /// \param Pattern - combiner pattern
3806 bool AArch64InstrInfo::isThroughputPattern(
3807     MachineCombinerPattern Pattern) const {
3808  switch (Pattern) {
3809  default:
3810  break;
3845  return true;
3846  } // end switch (Pattern)
3847  return false;
3848 }
3849 /// Return true when there is potentially a faster code sequence for an
3850 /// instruction chain ending in \p Root. All potential patterns are listed in
3851 /// the \p Pattern vector. Pattern should be sorted in priority order since the
3852 /// pattern evaluator stops checking as soon as it finds a faster sequence.
3853 
3854 bool AArch64InstrInfo::getMachineCombinerPatterns(
3855     MachineInstr &Root,
3856  SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
3857  // Integer patterns
3858  if (getMaddPatterns(Root, Patterns))
3859  return true;
3860  // Floating point patterns
3861  if (getFMAPatterns(Root, Patterns))
3862  return true;
3863 
3864  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
3865 }
3866 
3868 /// genFusedMultiply - Generate fused multiply instructions.
3869 /// This function supports both integer and floating point instructions.
3870 /// A typical example:
3871 /// F|MUL I=A,B,0
3872 /// F|ADD R,I,C
3873 /// ==> F|MADD R,A,B,C
3874 /// \param MF Containing MachineFunction
3875 /// \param MRI Register information
3876 /// \param TII Target information
3877 /// \param Root is the F|ADD instruction
3878 /// \param [out] InsInstrs is a vector of machine instructions and will
3879 /// contain the generated madd instruction
3880 /// \param IdxMulOpd is index of operand in Root that is the result of
3881 /// the F|MUL. In the example above IdxMulOpd is 1.
3882 /// \param MaddOpc the opcode of the f|madd instruction
3883 /// \param RC Register class of operands
3884 /// \param kind The kind of FMA instruction (addressing mode) to be generated
3885 /// \param ReplacedAddend is the result register from the instruction
3886 /// replacing the non-combined operand, if any.
3887 static MachineInstr *
3888 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
3889  const TargetInstrInfo *TII, MachineInstr &Root,
3890  SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
3891  unsigned MaddOpc, const TargetRegisterClass *RC,
3892  FMAInstKind kind = FMAInstKind::Default,
3893  const unsigned *ReplacedAddend = nullptr) {
3894  assert(IdxMulOpd == 1 || IdxMulOpd == 2);
3895 
3896  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
3897  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
3898  unsigned ResultReg = Root.getOperand(0).getReg();
3899  unsigned SrcReg0 = MUL->getOperand(1).getReg();
3900  bool Src0IsKill = MUL->getOperand(1).isKill();
3901  unsigned SrcReg1 = MUL->getOperand(2).getReg();
3902  bool Src1IsKill = MUL->getOperand(2).isKill();
3903 
3904  unsigned SrcReg2;
3905  bool Src2IsKill;
3906  if (ReplacedAddend) {
3907  // If we just generated a new addend, we must be its only use.
3908  SrcReg2 = *ReplacedAddend;
3909  Src2IsKill = true;
3910  } else {
3911  SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
3912  Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
3913  }
3914 
3916  MRI.constrainRegClass(ResultReg, RC);
3918  MRI.constrainRegClass(SrcReg0, RC);
3920  MRI.constrainRegClass(SrcReg1, RC);
3922  MRI.constrainRegClass(SrcReg2, RC);
3923 
3924  MachineInstrBuilder MIB;
3925  if (kind == FMAInstKind::Default)
3926  MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
3927  .addReg(SrcReg0, getKillRegState(Src0IsKill))
3928  .addReg(SrcReg1, getKillRegState(Src1IsKill))
3929  .addReg(SrcReg2, getKillRegState(Src2IsKill));
3930  else if (kind == FMAInstKind::Indexed)
3931  MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
3932  .addReg(SrcReg2, getKillRegState(Src2IsKill))
3933  .addReg(SrcReg0, getKillRegState(Src0IsKill))
3934  .addReg(SrcReg1, getKillRegState(Src1IsKill))
3935  .addImm(MUL->getOperand(3).getImm());
3936  else if (kind == FMAInstKind::Accumulator)
3937  MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
3938  .addReg(SrcReg2, getKillRegState(Src2IsKill))
3939  .addReg(SrcReg0, getKillRegState(Src0IsKill))
3940  .addReg(SrcReg1, getKillRegState(Src1IsKill));
3941  else
3942  assert(false && "Invalid FMA instruction kind \n");
3943  // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
3944  InsInstrs.push_back(MIB);
3945  return MUL;
3946 }
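// For illustration, the three FMAInstKind cases above differ only in how the
// new instruction's operands are ordered relative to the combined MUL and the
// remaining addend (SrcReg2):
//
//   Default:     MaddOpc ResultReg, SrcReg0, SrcReg1, SrcReg2        // madd/fmadd
//   Indexed:     MaddOpc ResultReg, SrcReg2, SrcReg0, SrcReg1, lane
//   Accumulator: MaddOpc ResultReg, SrcReg2, SrcReg0, SrcReg1        // fmla/fmls
//
// i.e. scalar MADD-style opcodes take the addend last, while the vector
// FMLA/FMLS-style opcodes take the accumulator first, with the lane index
// appended for the indexed forms.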
3947 
3948 /// genMaddR - Generate madd instruction and combine mul and add using
3949 /// an extra virtual register
3950 /// Example - an ADD intermediate needs to be stored in a register:
3951 /// MUL I=A,B,0
3952 /// ADD R,I,Imm
3953 /// ==> ORR V, ZR, Imm
3954 /// ==> MADD R,A,B,V
3955 /// \param MF Containing MachineFunction
3956 /// \param MRI Register information
3957 /// \param TII Target information
3958 /// \param Root is the ADD instruction
3959 /// \param [out] InsInstrs is a vector of machine instructions and will
3960 /// contain the generated madd instruction
3961 /// \param IdxMulOpd is index of operand in Root that is the result of
3962 /// the MUL. In the example above IdxMulOpd is 1.
3963 /// \param MaddOpc the opcode of the madd instruction
3964 /// \param VR is a virtual register that holds the value of an ADD operand
3965 /// (V in the example above).
3966 /// \param RC Register class of operands
3967 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
3968                               const TargetInstrInfo *TII, MachineInstr &Root,
3969                               SmallVectorImpl<MachineInstr *> &InsInstrs,
3970  unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
3971  const TargetRegisterClass *RC) {
3972  assert(IdxMulOpd == 1 || IdxMulOpd == 2);
3973 
3974  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
3975  unsigned ResultReg = Root.getOperand(0).getReg();
3976  unsigned SrcReg0 = MUL->getOperand(1).getReg();
3977  bool Src0IsKill = MUL->getOperand(1).isKill();
3978  unsigned SrcReg1 = MUL->getOperand(2).getReg();
3979  bool Src1IsKill = MUL->getOperand(2).isKill();
3980 
3982  MRI.constrainRegClass(ResultReg, RC);
3984  MRI.constrainRegClass(SrcReg0, RC);
3986  MRI.constrainRegClass(SrcReg1, RC);
3988  MRI.constrainRegClass(VR, RC);
3989 
3990  MachineInstrBuilder MIB =
3991  BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
3992  .addReg(SrcReg0, getKillRegState(Src0IsKill))
3993  .addReg(SrcReg1, getKillRegState(Src1IsKill))
3994  .addReg(VR);
3995  // Insert the MADD
3996  InsInstrs.push_back(MIB);
3997  return MUL;
3998 }
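// For illustration (assuming #240 is a valid logical immediate), the
// MULADDWI_OP1 handling in genAlternativeCodeSequence uses genMaddR to turn
//
//   mul  w8, w0, w1
//   add  w9, w8, #240
// into
//   orr  w10, wzr, #240       // ORRWri materializes the addend (VR)
//   madd w9, w0, w1, w10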
3999 
4000 /// When getMachineCombinerPatterns() finds potential patterns,
4001 /// this function generates the instructions that could replace the
4002 /// original code sequence
4003 void AArch64InstrInfo::genAlternativeCodeSequence(
4004     MachineInstr &Root, MachineCombinerPattern Pattern,
4005     SmallVectorImpl<MachineInstr *> &InsInstrs,
4006     SmallVectorImpl<MachineInstr *> &DelInstrs,
4007     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
4008  MachineBasicBlock &MBB = *Root.getParent();
4009  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4010  MachineFunction &MF = *MBB.getParent();
4011  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
4012 
4013  MachineInstr *MUL;
4014  const TargetRegisterClass *RC;
4015  unsigned Opc;
4016  switch (Pattern) {
4017  default:
4018  // Reassociate instructions.
4019  TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
4020  DelInstrs, InstrIdxForVirtReg);
4021  return;
4024  // MUL I=A,B,0
4025  // ADD R,I,C
4026  // ==> MADD R,A,B,C
4027  // --- Create(MADD);
4028  if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
4029  Opc = AArch64::MADDWrrr;
4030  RC = &AArch64::GPR32RegClass;
4031  } else {
4032  Opc = AArch64::MADDXrrr;
4033  RC = &AArch64::GPR64RegClass;
4034  }
4035  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4036  break;
4039  // MUL I=A,B,0
4040  // ADD R,C,I
4041  // ==> MADD R,A,B,C
4042  // --- Create(MADD);
4043  if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
4044  Opc = AArch64::MADDWrrr;
4045  RC = &AArch64::GPR32RegClass;
4046  } else {
4047  Opc = AArch64::MADDXrrr;
4048  RC = &AArch64::GPR64RegClass;
4049  }
4050  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4051  break;
4054  // MUL I=A,B,0
4055  // ADD R,I,Imm
4056  // ==> ORR V, ZR, Imm
4057  // ==> MADD R,A,B,V
4058  // --- Create(MADD);
4059  const TargetRegisterClass *OrrRC;
4060  unsigned BitSize, OrrOpc, ZeroReg;
4061  if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
4062  OrrOpc = AArch64::ORRWri;
4063  OrrRC = &AArch64::GPR32spRegClass;
4064  BitSize = 32;
4065  ZeroReg = AArch64::WZR;
4066  Opc = AArch64::MADDWrrr;
4067  RC = &AArch64::GPR32RegClass;
4068  } else {
4069  OrrOpc = AArch64::ORRXri;
4070  OrrRC = &AArch64::GPR64spRegClass;
4071  BitSize = 64;
4072  ZeroReg = AArch64::XZR;
4073  Opc = AArch64::MADDXrrr;
4074  RC = &AArch64::GPR64RegClass;
4075  }
4076  unsigned NewVR = MRI.createVirtualRegister(OrrRC);
4077  uint64_t Imm = Root.getOperand(2).getImm();
4078 
4079  if (Root.getOperand(3).isImm()) {
4080  unsigned Val = Root.getOperand(3).getImm();
4081  Imm = Imm << Val;
4082  }
4083  uint64_t UImm = SignExtend64(Imm, BitSize);
4084  uint64_t Encoding;
4085  if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4086  MachineInstrBuilder MIB1 =
4087  BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4088  .addReg(ZeroReg)
4089  .addImm(Encoding);
4090  InsInstrs.push_back(MIB1);
4091  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4092  MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4093  }
4094  break;
4095  }
4098  // MUL I=A,B,0
4099  // SUB R,I, C
4100  // ==> SUB V, 0, C
4101  // ==> MADD R,A,B,V // = -C + A*B
4102  // --- Create(MADD);
4103  const TargetRegisterClass *SubRC;
4104  unsigned SubOpc, ZeroReg;
4105  if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
4106  SubOpc = AArch64::SUBWrr;
4107  SubRC = &AArch64::GPR32spRegClass;
4108  ZeroReg = AArch64::WZR;
4109  Opc = AArch64::MADDWrrr;
4110  RC = &AArch64::GPR32RegClass;
4111  } else {
4112  SubOpc = AArch64::SUBXrr;
4113  SubRC = &AArch64::GPR64spRegClass;
4114  ZeroReg = AArch64::XZR;
4115  Opc = AArch64::MADDXrrr;
4116  RC = &AArch64::GPR64RegClass;
4117  }
4118  unsigned NewVR = MRI.createVirtualRegister(SubRC);
4119  // SUB NewVR, 0, C
4120  MachineInstrBuilder MIB1 =
4121  BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
4122  .addReg(ZeroReg)
4123  .add(Root.getOperand(2));
4124  InsInstrs.push_back(MIB1);
4125  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4126  MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4127  break;
4128  }
4131  // MUL I=A,B,0
4132  // SUB R,C,I
4133  // ==> MSUB R,A,B,C (computes C - A*B)
4134  // --- Create(MSUB);
4135  if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
4136  Opc = AArch64::MSUBWrrr;
4137  RC = &AArch64::GPR32RegClass;
4138  } else {
4139  Opc = AArch64::MSUBXrrr;
4140  RC = &AArch64::GPR64RegClass;
4141  }
4142  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4143  break;
4146  // MUL I=A,B,0
4147  // SUB R,I, Imm
4148  // ==> ORR V, ZR, -Imm
4149  // ==> MADD R,A,B,V // = -Imm + A*B
4150  // --- Create(MADD);
4151  const TargetRegisterClass *OrrRC;
4152  unsigned BitSize, OrrOpc, ZeroReg;
4153  if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
4154  OrrOpc = AArch64::ORRWri;
4155  OrrRC = &AArch64::GPR32spRegClass;
4156  BitSize = 32;
4157  ZeroReg = AArch64::WZR;
4158  Opc = AArch64::MADDWrrr;
4159  RC = &AArch64::GPR32RegClass;
4160  } else {
4161  OrrOpc = AArch64::ORRXri;
4162  OrrRC = &AArch64::GPR64spRegClass;
4163  BitSize = 64;
4164  ZeroReg = AArch64::XZR;
4165  Opc = AArch64::MADDXrrr;
4166  RC = &AArch64::GPR64RegClass;
4167  }
4168  unsigned NewVR = MRI.createVirtualRegister(OrrRC);
4169  uint64_t Imm = Root.getOperand(2).getImm();
4170  if (Root.getOperand(3).isImm()) {
4171  unsigned Val = Root.getOperand(3).getImm();
4172  Imm = Imm << Val;
4173  }
4174  uint64_t UImm = SignExtend64(-Imm, BitSize);
4175  uint64_t Encoding;
4176  if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4177  MachineInstrBuilder MIB1 =
4178  BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4179  .addReg(ZeroReg)
4180  .addImm(Encoding);
4181  InsInstrs.push_back(MIB1);
4182  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4183  MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4184  }
4185  break;
4186  }
4187  // Floating Point Support
4190  // MUL I=A,B,0
4191  // ADD R,I,C
4192  // ==> MADD R,A,B,C
4193  // --- Create(MADD);
4194  if (Pattern == MachineCombinerPattern::FMULADDS_OP1) {
4195  Opc = AArch64::FMADDSrrr;
4196  RC = &AArch64::FPR32RegClass;
4197  } else {
4198  Opc = AArch64::FMADDDrrr;
4199  RC = &AArch64::FPR64RegClass;
4200  }
4201  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4202  break;
4205  // FMUL I=A,B,0
4206  // FADD R,C,I
4207  // ==> FMADD R,A,B,C
4208  // --- Create(FMADD);
4209  if (Pattern == MachineCombinerPattern::FMULADDS_OP2) {
4210  Opc = AArch64::FMADDSrrr;
4211  RC = &AArch64::FPR32RegClass;
4212  } else {
4213  Opc = AArch64::FMADDDrrr;
4214  RC = &AArch64::FPR64RegClass;
4215  }
4216  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4217  break;
4218 
4220  Opc = AArch64::FMLAv1i32_indexed;
4221  RC = &AArch64::FPR32RegClass;
4222  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4224  break;
4226  Opc = AArch64::FMLAv1i32_indexed;
4227  RC = &AArch64::FPR32RegClass;
4228  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4230  break;
4231 
4233  Opc = AArch64::FMLAv1i64_indexed;
4234  RC = &AArch64::FPR64RegClass;
4235  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4237  break;
4239  Opc = AArch64::FMLAv1i64_indexed;
4240  RC = &AArch64::FPR64RegClass;
4241  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4243  break;
4244 
4247  RC = &AArch64::FPR64RegClass;
4249  Opc = AArch64::FMLAv2i32_indexed;
4250  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4252  } else {
4253  Opc = AArch64::FMLAv2f32;
4254  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4256  }
4257  break;
4260  RC = &AArch64::FPR64RegClass;
4262  Opc = AArch64::FMLAv2i32_indexed;
4263  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4265  } else {
4266  Opc = AArch64::FMLAv2f32;
4267  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4269  }
4270  break;
4271 
4274  RC = &AArch64::FPR128RegClass;
4276  Opc = AArch64::FMLAv2i64_indexed;
4277  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4279  } else {
4280  Opc = AArch64::FMLAv2f64;
4281  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4283  }
4284  break;
4287  RC = &AArch64::FPR128RegClass;
4289  Opc = AArch64::FMLAv2i64_indexed;
4290  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4292  } else {
4293  Opc = AArch64::FMLAv2f64;
4294  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4296  }
4297  break;
4298 
4301  RC = &AArch64::FPR128RegClass;
4303  Opc = AArch64::FMLAv4i32_indexed;
4304  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4306  } else {
4307  Opc = AArch64::FMLAv4f32;
4308  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4310  }
4311  break;
4312 
4315  RC = &AArch64::FPR128RegClass;
4317  Opc = AArch64::FMLAv4i32_indexed;
4318  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4320  } else {
4321  Opc = AArch64::FMLAv4f32;
4322  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4324  }
4325  break;
4326 
4329  // FMUL I=A,B,0
4330  // FSUB R,I,C
4331  // ==> FNMSUB R,A,B,C // = -C + A*B
4332  // --- Create(FNMSUB);
4333  if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) {
4334  Opc = AArch64::FNMSUBSrrr;
4335  RC = &AArch64::FPR32RegClass;
4336  } else {
4337  Opc = AArch64::FNMSUBDrrr;
4338  RC = &AArch64::FPR64RegClass;
4339  }
4340  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4341  break;
4342  }
4343 
4346  // FNMUL I=A,B,0
4347  // FSUB R,I,C
4348  // ==> FNMADD R,A,B,C // = -A*B - C
4349  // --- Create(FNMADD);
4350  if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) {
4351  Opc = AArch64::FNMADDSrrr;
4352  RC = &AArch64::FPR32RegClass;
4353  } else {
4354  Opc = AArch64::FNMADDDrrr;
4355  RC = &AArch64::FPR64RegClass;
4356  }
4357  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4358  break;
4359  }
4360 
4363  // FMUL I=A,B,0
4364  // FSUB R,C,I
4365  // ==> FMSUB R,A,B,C (computes C - A*B)
4366  // --- Create(FMSUB);
4367  if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) {
4368  Opc = AArch64::FMSUBSrrr;
4369  RC = &AArch64::FPR32RegClass;
4370  } else {
4371  Opc = AArch64::FMSUBDrrr;
4372  RC = &AArch64::FPR64RegClass;
4373  }
4374  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4375  break;
4376  }
4377 
4379  Opc = AArch64::FMLSv1i32_indexed;
4380  RC = &AArch64::FPR32RegClass;
4381  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4383  break;
4384 
4386  Opc = AArch64::FMLSv1i64_indexed;
4387  RC = &AArch64::FPR64RegClass;
4388  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4390  break;
4391 
4394  RC = &AArch64::FPR64RegClass;
4396  Opc = AArch64::FMLSv2i32_indexed;
4397  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4399  } else {
4400  Opc = AArch64::FMLSv2f32;
4401  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4403  }
4404  break;
4405 
4408  RC = &AArch64::FPR128RegClass;
4410  Opc = AArch64::FMLSv2i64_indexed;
4411  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4413  } else {
4414  Opc = AArch64::FMLSv2f64;
4415  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4417  }
4418  break;
4419 
4422  RC = &AArch64::FPR128RegClass;
4424  Opc = AArch64::FMLSv4i32_indexed;
4425  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4427  } else {
4428  Opc = AArch64::FMLSv4f32;
4429  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4431  }
4432  break;
4435  RC = &AArch64::FPR64RegClass;
4436  unsigned NewVR = MRI.createVirtualRegister(RC);
4437  MachineInstrBuilder MIB1 =
4438  BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR)
4439  .add(Root.getOperand(2));
4440  InsInstrs.push_back(MIB1);
4441  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4443  Opc = AArch64::FMLAv2i32_indexed;
4444  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4445  FMAInstKind::Indexed, &NewVR);
4446  } else {
4447  Opc = AArch64::FMLAv2f32;
4448  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4449  FMAInstKind::Accumulator, &NewVR);
4450  }
4451  break;
4452  }
4455  RC = &AArch64::FPR128RegClass;
4456  unsigned NewVR = MRI.createVirtualRegister(RC);
4457  MachineInstrBuilder MIB1 =
4458  BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR)
4459  .add(Root.getOperand(2));
4460  InsInstrs.push_back(MIB1);
4461  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4463  Opc = AArch64::FMLAv4i32_indexed;
4464  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4465  FMAInstKind::Indexed, &NewVR);
4466  } else {
4467  Opc = AArch64::FMLAv4f32;
4468  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4469  FMAInstKind::Accumulator, &NewVR);
4470  }
4471  break;
4472  }
4475  RC = &AArch64::FPR128RegClass;
4476  unsigned NewVR = MRI.createVirtualRegister(RC);
4477  MachineInstrBuilder MIB1 =
4478  BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR)
4479  .add(Root.getOperand(2));
4480  InsInstrs.push_back(MIB1);
4481  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4483  Opc = AArch64::FMLAv2i64_indexed;
4484  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4485  FMAInstKind::Indexed, &NewVR);
4486  } else {
4487  Opc = AArch64::FMLAv2f64;
4488  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4489  FMAInstKind::Accumulator, &NewVR);
4490  }
4491  break;
4492  }
4493  } // end switch (Pattern)
4494  // Record MUL and ADD/SUB for deletion
4495  DelInstrs.push_back(MUL);
4496  DelInstrs.push_back(&Root);
4497 }
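// For illustration, the FMLS*_OP1 vector cases above cannot subtract the
// accumulator directly, so they negate it first and then use an FMLA-style
// accumulate; roughly:
//
//   %2:fpr64 = FMULv2f32 %0, %1
//   %3:fpr64 = FSUBv2f32 killed %2, %4        // %3 = %0*%1 - %4
// ==>
//   %5:fpr64 = FNEGv2f32 %4                   // %5 = -%4
//   %3:fpr64 = FMLAv2f32 %5, %0, %1           // %3 = %5 + %0*%1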
4498 
4499 /// Replace csincr-branch sequence by simple conditional branch
4500 ///
4501 /// Examples:
4502 /// 1. \code
4503 /// csinc w9, wzr, wzr, <condition code>
4504 /// tbnz w9, #0, 0x44
4505 /// \endcode
4506 /// to
4507 /// \code
4508 /// b.<inverted condition code>
4509 /// \endcode
4510 ///
4511 /// 2. \code
4512 /// csinc w9, wzr, wzr, <condition code>
4513 /// tbz w9, #0, 0x44
4514 /// \endcode
4515 /// to
4516 /// \code
4517 /// b.<condition code>
4518 /// \endcode
4519 ///
4520 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the
4521 /// compare's constant operand is power of 2.
4522 ///
4523 /// Examples:
4524 /// \code
4525 /// and w8, w8, #0x400
4526 /// cbnz w8, L1
4527 /// \endcode
4528 /// to
4529 /// \code
4530 /// tbnz w8, #10, L1
4531 /// \endcode
4532 ///
4533 /// \param MI Conditional Branch
4534 /// \return True when the simple conditional branch is generated
4535 ///
4536 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
4537  bool IsNegativeBranch = false;
4538  bool IsTestAndBranch = false;
4539  unsigned TargetBBInMI = 0;
4540  switch (MI.getOpcode()) {
4541  default:
4542  llvm_unreachable("Unknown branch instruction?");
4543  case AArch64::Bcc:
4544  return false;
4545  case AArch64::CBZW:
4546  case AArch64::CBZX:
4547  TargetBBInMI = 1;
4548  break;
4549  case AArch64::CBNZW:
4550  case AArch64::CBNZX:
4551  TargetBBInMI = 1;
4552  IsNegativeBranch = true;
4553  break;
4554  case AArch64::TBZW:
4555  case AArch64::TBZX:
4556  TargetBBInMI = 2;
4557  IsTestAndBranch = true;
4558  break;
4559  case AArch64::TBNZW:
4560  case AArch64::TBNZX:
4561  TargetBBInMI = 2;
4562  IsNegativeBranch = true;
4563  IsTestAndBranch = true;
4564  break;
4565  }
4566  // So we increment a zero register and test for bits other
4567  // than bit 0? Conservatively bail out in case the verifier
4568  // missed this case.
4569  if (IsTestAndBranch && MI.getOperand(1).getImm())
4570  return false;
4571 
4572  // Find Definition.
4573  assert(MI.getParent() && "Incomplete machine instruciton\n");
4574  MachineBasicBlock *MBB = MI.getParent();
4575  MachineFunction *MF = MBB->getParent();
4576  MachineRegisterInfo *MRI = &MF->getRegInfo();
4577  unsigned VReg = MI.getOperand(0).getReg();
4579  return false;
4580 
4581  MachineInstr *DefMI = MRI->getVRegDef(VReg);
4582 
4583  // Look through COPY instructions to find definition.
4584  while (DefMI->isCopy()) {
4585  unsigned CopyVReg = DefMI->getOperand(1).getReg();
4586  if (!MRI->hasOneNonDBGUse(CopyVReg))
4587  return false;
4588  if (!MRI->hasOneDef(CopyVReg))
4589  return false;
4590  DefMI = MRI->getVRegDef(CopyVReg);
4591  }
4592 
4593  switch (DefMI->getOpcode()) {
4594  default:
4595  return false;
4596  // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
4597  case AArch64::ANDWri:
4598  case AArch64::ANDXri: {
4599  if (IsTestAndBranch)
4600  return false;
4601  if (DefMI->getParent() != MBB)
4602  return false;
4603  if (!MRI->hasOneNonDBGUse(VReg))
4604  return false;
4605 
4606  bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
4607  uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
4608  DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
4609  if (!isPowerOf2_64(Mask))
4610  return false;
4611 
4612  MachineOperand &MO = DefMI->getOperand(1);
4613  unsigned NewReg = MO.getReg();
4615  return false;
4616 
4617  assert(!MRI->def_empty(NewReg) && "Register must be defined.");
4618 
4619  MachineBasicBlock &RefToMBB = *MBB;
4620  MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
4621  DebugLoc DL = MI.getDebugLoc();
4622  unsigned Imm = Log2_64(Mask);
4623  unsigned Opc = (Imm < 32)
4624  ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
4625  : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
4626  MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
4627  .addReg(NewReg)
4628  .addImm(Imm)
4629  .addMBB(TBB);
4630  // Register lives on to the CBZ now.
4631  MO.setIsKill(false);
4632 
4633  // For immediates smaller than 32, we need to use the 32-bit
4634  // variant (W) in all cases, since the 64-bit variant cannot
4635  // encode them.
4636  // Therefore, if the input register is 64-bit, we need to take the
4637  // 32-bit sub-register.
4638  if (!Is32Bit && Imm < 32)
4639  NewMI->getOperand(0).setSubReg(AArch64::sub_32);
4640  MI.eraseFromParent();
4641  return true;
4642  }
4643  // Look for CSINC
4644  case AArch64::CSINCWr:
4645  case AArch64::CSINCXr: {
4646  if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
4647  DefMI->getOperand(2).getReg() == AArch64::WZR) &&
4648  !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
4649  DefMI->getOperand(2).getReg() == AArch64::XZR))
4650  return false;
4651 
4652  if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
4653  return false;
4654 
4656  // Convert only when the condition code is not modified between
4657  // the CSINC and the branch. The CC may be used by other
4658  // instructions in between.
4660  return false;
4661  MachineBasicBlock &RefToMBB = *MBB;
4662  MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
4663  DebugLoc DL = MI.getDebugLoc();
4664  if (IsNegativeBranch)
4666  BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
4667  MI.eraseFromParent();
4668  return true;
4669  }
4670  }
4671 }
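// For illustration (condition code chosen arbitrarily), the CSINC case above
// rewrites a materialized condition plus a test-and-branch into a single Bcc:
//
//   csinc w9, wzr, wzr, ne     // w9 = 1 iff the EQ condition held
//   tbnz  w9, #0, .Ltarget
// ==>
//   b.eq  .Ltarget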
4672 
4673 std::pair<unsigned, unsigned>
4675  const unsigned Mask = AArch64II::MO_FRAGMENT;
4676  return std::make_pair(TF & Mask, TF & ~Mask);
4677 }
4678 
4681  using namespace AArch64II;
4682 
4683  static const std::pair<unsigned, const char *> TargetFlags[] = {
4684  {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
4685  {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
4686  {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
4687  {MO_HI12, "aarch64-hi12"}};
4688  return makeArrayRef(TargetFlags);
4689 }
4690 
4693  using namespace AArch64II;
4694 
4695  static const std::pair<unsigned, const char *> TargetFlags[] = {
4696  {MO_COFFSTUB, "aarch64-coffstub"},
4697  {MO_GOT, "aarch64-got"}, {MO_NC, "aarch64-nc"},
4698  {MO_S, "aarch64-s"}, {MO_TLS, "aarch64-tls"},
4699  {MO_DLLIMPORT, "aarch64-dllimport"}};
4700  return makeArrayRef(TargetFlags);
4701 }
4702 
4705  static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
4706  {{MOSuppressPair, "aarch64-suppress-pair"},
4707  {MOStridedAccess, "aarch64-strided-access"}};
4708  return makeArrayRef(TargetFlags);
4709 }
4710 
4711 /// Constants defining how certain sequences should be outlined.
4712 /// This encompasses how an outlined function should be called, and what kind of
4713 /// frame should be emitted for that outlined function.
4714 ///
4715 /// \p MachineOutlinerDefault implies that the function should be called with
4716 /// a save and restore of LR to the stack.
4717 ///
4718 /// That is,
4719 ///
4720 /// I1     Save LR                    OUTLINED_FUNCTION:
4721 /// I2 --> BL OUTLINED_FUNCTION       I1
4722 /// I3     Restore LR                 I2
4723 ///                                   I3
4724 ///                                   RET
4725 ///
4726 /// * Call construction overhead: 3 (save + BL + restore)
4727 /// * Frame construction overhead: 1 (ret)
4728 /// * Requires stack fixups? Yes
4729 ///
4730 /// \p MachineOutlinerTailCall implies that the function is being created from
4731 /// a sequence of instructions ending in a return.
4732 ///
4733 /// That is,
4734 ///
4735 /// I1                                OUTLINED_FUNCTION:
4736 /// I2 --> B OUTLINED_FUNCTION        I1
4737 /// RET                               I2
4738 ///                                   RET
4739 ///
4740 /// * Call construction overhead: 1 (B)
4741 /// * Frame construction overhead: 0 (Return included in sequence)
4742 /// * Requires stack fixups? No
4743 ///
4744 /// \p MachineOutlinerNoLRSave implies that the function should be called using
4745 /// a BL instruction, but doesn't require LR to be saved and restored. This
4746 /// happens when LR is known to be dead.
4747 ///
4748 /// That is,
4749 ///
4750 /// I1                                OUTLINED_FUNCTION:
4751 /// I2 --> BL OUTLINED_FUNCTION       I1
4752 /// I3                                I2
4753 ///                                   I3
4754 ///                                   RET
4755 ///
4756 /// * Call construction overhead: 1 (BL)
4757 /// * Frame construction overhead: 1 (RET)
4758 /// * Requires stack fixups? No
4759 ///
4760 /// \p MachineOutlinerThunk implies that the function is being created from
4761 /// a sequence of instructions ending in a call. The outlined function is
4762 /// called with a BL instruction, and the outlined function tail-calls the
4763 /// original call destination.
4764 ///
4765 /// That is,
4766 ///
4767 /// I1                                OUTLINED_FUNCTION:
4768 /// I2 --> BL OUTLINED_FUNCTION       I1
4769 /// BL f                              I2
4770 ///                                   B f
4771 /// * Call construction overhead: 1 (BL)
4772 /// * Frame construction overhead: 0
4773 /// * Requires stack fixups? No
4774 ///
4775 /// \p MachineOutlinerRegSave implies that the function should be called with a
4776 /// save and restore of LR to an available register. This allows us to avoid
4777 /// stack fixups. Note that this outlining variant is compatible with the
4778 /// NoLRSave case.
4779 ///
4780 /// That is,
4781 ///
4782 /// I1     Save LR                    OUTLINED_FUNCTION:
4783 /// I2 --> BL OUTLINED_FUNCTION       I1
4784 /// I3     Restore LR                 I2
4785 ///                                   I3
4786 ///                                   RET
4787 ///
4788 /// * Call construction overhead: 3 (save + BL + restore)
4789 /// * Frame construction overhead: 1 (ret)
4790 /// * Requires stack fixups? No
4791 enum MachineOutlinerClass {
4792  MachineOutlinerDefault, /// Emit a save, restore, call, and return.
4793  MachineOutlinerTailCall, /// Only emit a branch.
4794  MachineOutlinerNoLRSave, /// Emit a call and return.
4795  MachineOutlinerThunk, /// Emit a call and tail-call.
4796  MachineOutlinerRegSave /// Same as default, but save to a register.
4797 };
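// Worked example (illustrative; instructions are 4 bytes each): a repeated
// 3-instruction sequence (12 bytes) occurring 4 times costs 48 bytes left
// inline.  Outlined with MachineOutlinerTailCall (4 bytes per call site, no
// frame overhead) it costs 4 x 4 + 12 = 28 bytes, saving 20 bytes.  With
// MachineOutlinerDefault (12 bytes per call site for the save + BL + restore,
// plus a 4-byte RET in the frame) the same sequence would cost at least
// 4 x 12 + 12 + 4 = 64 bytes, so it would not be profitable that way.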
4798 
4799 enum MachineOutlinerMBBFlags {
4800  LRUnavailableSomewhere = 0x2,
4801  HasCalls = 0x4,
4802  UnsafeRegsDead = 0x8
4803 };
4804 
4805 unsigned
4806 AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
4807  assert(C.LRUWasSet && "LRU wasn't set?");
4808  MachineFunction *MF = C.getMF();
4809  const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
4810  MF->getSubtarget().getRegisterInfo());
4811 
4812  // Check if there is an available register across the sequence that we can
4813  // use.
4814  for (unsigned Reg : AArch64::GPR64RegClass) {
4815  if (!ARI->isReservedReg(*MF, Reg) &&
4816  Reg != AArch64::LR && // LR is not reserved, but don't use it.
4817  Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
4818  Reg != AArch64::X17 && // Ditto for X17.
4820  return Reg;
4821  }
4822 
4823  // No suitable register. Return 0.
4824  return 0u;
4825 }
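// For illustration: if, say, x9 is free across the candidate, the outliner
// later uses it in insertOutlinedCall to avoid touching the stack:
//
//   mov x9, lr                // ORRXrs x9, xzr, lr (save LR)
//   bl  OUTLINED_FUNCTION
//   mov lr, x9                // ORRXrs lr, xzr, x9 (restore LR)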
4826 
4827 outliner::OutlinedFunction
4828 AArch64InstrInfo::getOutliningCandidateInfo(
4829     std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
4830  outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
4831  unsigned SequenceSize =
4832  std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
4833  [this](unsigned Sum, const MachineInstr &MI) {
4834  return Sum + getInstSizeInBytes(MI);
4835  });
4836 
4837  // Properties about candidate MBBs that hold for all of them.
4838  unsigned FlagsSetInAll = 0xF;
4839 
4840  // Compute liveness information for each candidate, and set FlagsSetInAll.
4842  std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
4843  [&FlagsSetInAll](outliner::Candidate &C) {
4844  FlagsSetInAll &= C.Flags;
4845  });
4846 
4847  // According to the AArch64 Procedure Call Standard, the following are
4848  // undefined on entry/exit from a function call:
4849  //
4850  // * Registers x16, x17, (and thus w16, w17)
4851  // * Condition codes (and thus the NZCV register)
4852  //
4853  // Because of this, we can't outline any sequence of instructions where
4854  // one of these registers is live into/across it. Thus, we need to
4855  // delete those candidates.
4858  auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
4859  // If the unsafe registers in this block are all dead, then we don't need
4860  // to compute liveness here.
4861  if (C.Flags & UnsafeRegsDead)
4862  return false;
4863  C.initLRU(TRI);
4864  LiveRegUnits LRU = C.LRU;
4865  return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
4866  !LRU.available(AArch64::NZCV));
4867  };
4868 
4869  // Are there any candidates where those registers are live?
4870  if (!(FlagsSetInAll & UnsafeRegsDead)) {
4871  // Erase every candidate that violates the restrictions above. (It could be
4872  // true that we have viable candidates, so it's not worth bailing out in
4873  // the case that, say, 1 out of 20 candidates violate the restrictions.)
4874  RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
4875  RepeatedSequenceLocs.end(),
4876  CantGuaranteeValueAcrossCall),
4877  RepeatedSequenceLocs.end());
4878 
4879  // If the sequence doesn't have enough candidates left, then we're done.
4880  if (RepeatedSequenceLocs.size() < 2)
4881  return outliner::OutlinedFunction();
4882  }
4883 
4884  // At this point, we have only "safe" candidates to outline. Figure out
4885  // frame + call instruction information.
4886 
4887  unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();
4888 
4889  // Helper lambda which sets call information for every candidate.
4890  auto SetCandidateCallInfo =
4891  [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
4892  for (outliner::Candidate &C : RepeatedSequenceLocs)
4893  C.setCallInfo(CallID, NumBytesForCall);
4894  };
4895 
4896  unsigned FrameID = MachineOutlinerDefault;
4897  unsigned NumBytesToCreateFrame = 4;
4898 
4899  bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
4900  return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
4901  });
4902 
4903  // Returns true if an instruction is safe to fix up, false otherwise.
4904  auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
4905  if (MI.isCall())
4906  return true;
4907 
4908  if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
4909  !MI.readsRegister(AArch64::SP, &TRI))
4910  return true;
4911 
4912  // Any modification of SP will break our code to save/restore LR.
4913  // FIXME: We could handle some instructions which add a constant
4914  // offset to SP, with a bit more work.
4915  if (MI.modifiesRegister(AArch64::SP, &TRI))
4916  return false;
4917 
4918  // At this point, we have a stack instruction that we might need to
4919  // fix up. We'll handle it if it's a load or store.
4920  if (MI.mayLoadOrStore()) {
4921  const MachineOperand *Base; // Filled with the base operand of MI.
4922  int64_t Offset; // Filled with the offset of MI.
4923 
4924  // Does it allow us to offset the base operand and is the base the
4925  // register SP?
4926  if (!getMemOperandWithOffset(MI, Base, Offset, &TRI) || !Base->isReg() ||
4927  Base->getReg() != AArch64::SP)
4928  return false;
4929 
4930  // Find the minimum/maximum offset for this instruction and check
4931  // if fixing it up would be in range.
4932  int64_t MinOffset,
4933  MaxOffset; // Unscaled offsets for the instruction.
4934  unsigned Scale; // The scale to multiply the offsets by.
4935  unsigned DummyWidth;
4936  getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
4937 
4938  Offset += 16; // Update the offset to what it would be if we outlined.
4939  if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale)
4940  return false;
4941 
4942  // It's in range, so we can outline it.
4943  return true;
4944  }
4945 
4946  // FIXME: Add handling for instructions like "add x0, sp, #8".
4947 
4948  // We can't fix it up, so don't outline it.
4949  return false;
4950  };
4951 
4952  // True if it's possible to fix up each stack instruction in this sequence.
4953  // Important for frames/call variants that modify the stack.
4954  bool AllStackInstrsSafe = std::all_of(
4955  FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);
4956 
4957  // If the last instruction in any candidate is a terminator, then we should
4958  // tail call all of the candidates.
4959  if (RepeatedSequenceLocs[0].back()->isTerminator()) {
4960  FrameID = MachineOutlinerTailCall;
4961  NumBytesToCreateFrame = 0;
4962  SetCandidateCallInfo(MachineOutlinerTailCall, 4);
4963  }
4964 
4965  else if (LastInstrOpcode == AArch64::BL ||
4966  (LastInstrOpcode == AArch64::BLR && !HasBTI)) {
4967  // FIXME: Do we need to check if the code after this uses the value of LR?
4968  FrameID = MachineOutlinerThunk;
4969  NumBytesToCreateFrame = 0;
4970  SetCandidateCallInfo(MachineOutlinerThunk, 4);
4971  }
4972 
4973  else {
4974  // We need to decide how to emit calls + frames. We can always emit the same
4975  // frame if we don't need to save to the stack. If we have to save to the
4976  // stack, then we need a different frame.
4977  unsigned NumBytesNoStackCalls = 0;
4978  std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
4979 
4980  for (outliner::Candidate &C : RepeatedSequenceLocs) {
4981  C.initLRU(TRI);
4982 
4983  // Is LR available? If so, we don't need a save.
4984  if (C.LRU.available(AArch64::LR)) {
4985  NumBytesNoStackCalls += 4;
4987  CandidatesWithoutStackFixups.push_back(C);
4988  }
4989 
4990  // Is an unused register available? If so, we won't modify the stack, so
4991  // we can outline with the same frame type as those that don't save LR.
4992  else if (findRegisterToSaveLRTo(C)) {
4993  NumBytesNoStackCalls += 12;
4995  CandidatesWithoutStackFixups.push_back(C);
4996  }
4997 
4998  // Is SP used in the sequence at all? If not, we don't have to modify
4999  // the stack, so we are guaranteed to get the same frame.
5000  else if (C.UsedInSequence.available(AArch64::SP)) {
5001  NumBytesNoStackCalls += 12;
5003  CandidatesWithoutStackFixups.push_back(C);
5004  }
5005 
5006  // If we outline this, we need to modify the stack. Pretend we don't
5007  // outline this by saving all of its bytes.
5008  else {
5009  NumBytesNoStackCalls += SequenceSize;
5010  }
5011  }
5012 
5013  // If there are no places where we have to save LR, then note that we
5014  // don't have to update the stack. Otherwise, give every candidate the
5015  // default call type, as long as it's safe to do so.
5016  if (!AllStackInstrsSafe ||
5017  NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
5018  RepeatedSequenceLocs = CandidatesWithoutStackFixups;
5019  FrameID = MachineOutlinerNoLRSave;
5020  } else {
5021  SetCandidateCallInfo(MachineOutlinerDefault, 12);
5022  }
5023 
5024  // If we dropped all of the candidates, bail out here.
5025  if (RepeatedSequenceLocs.size() < 2) {
5026  RepeatedSequenceLocs.clear();
5027  return outliner::OutlinedFunction();
5028  }
5029  }
5030 
5031  // Does every candidate's MBB contain a call? If so, then we might have a call
5032  // in the range.
5033  if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
5034  // Check if the range contains a call. These require a save + restore of the
5035  // link register.
5036  bool ModStackToSaveLR = false;
5037  if (std::any_of(FirstCand.front(), FirstCand.back(),
5038  [](const MachineInstr &MI) { return MI.isCall(); }))
5039  ModStackToSaveLR = true;
5040 
5041  // Handle the last instruction separately. If this is a tail call, then the
5042  // last instruction is a call. We don't want to save + restore in this case.
5043  // However, it could be possible that the last instruction is a call without
5044  // it being valid to tail call this sequence. We should consider this as
5045  // well.
5046  else if (FrameID != MachineOutlinerThunk &&
5047  FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall())
5048  ModStackToSaveLR = true;
5049 
5050  if (ModStackToSaveLR) {
5051  // We can't fix up the stack. Bail out.
5052  if (!AllStackInstrsSafe) {
5053  RepeatedSequenceLocs.clear();
5054  return outliner::OutlinedFunction();
5055  }
5056 
5057  // Save + restore LR.
5058  NumBytesToCreateFrame += 8;
5059  }
5060  }
5061 
5062  return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
5063  NumBytesToCreateFrame, FrameID);
5064 }
5065 
5066 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
5067     MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
5068  const Function &F = MF.getFunction();
5069 
5070  // Can F be deduplicated by the linker? If it can, don't outline from it.
5071  if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
5072  return false;
5073 
5074  // Don't outline from functions with section markings; the program could
5075  // expect that all the code is in the named section.
5076  // FIXME: Allow outlining from multiple functions with the same section
5077  // marking.
5078  if (F.hasSection())
5079  return false;
5080 
5081  // Outlining from functions with redzones is unsafe since the outliner may
5082  // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
5083  // outline from it.
5085  if (!AFI || AFI->hasRedZone().getValueOr(true))
5086  return false;
5087 
5088  // It's safe to outline from MF.
5089  return true;
5090 }
5091 
5092 bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
5093                                               unsigned &Flags) const {
5094  // Check if LR is available through all of the MBB. If it's not, then set
5095  // a flag.
5097  "Suitable Machine Function for outlining must track liveness");
5099 
5100  std::for_each(MBB.rbegin(), MBB.rend(),
5101  [&LRU](MachineInstr &MI) { LRU.accumulate(MI); });
5102 
5103  // Check if each of the unsafe registers are available...
5104  bool W16AvailableInBlock = LRU.available(AArch64::W16);
5105  bool W17AvailableInBlock = LRU.available(AArch64::W17);
5106  bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV);
5107 
5108  // If all of these are dead (and not live out), we know we don't have to check
5109  // them later.
5110  if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock)
5112 
5113  // Now, add the live outs to the set.
5114  LRU.addLiveOuts(MBB);
5115 
5116  // If any of these registers is available in the MBB, but also a live out of
5117  // the block, then we know outlining is unsafe.
5118  if (W16AvailableInBlock && !LRU.available(AArch64::W16))
5119  return false;
5120  if (W17AvailableInBlock && !LRU.available(AArch64::W17))
5121  return false;
5122  if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV))
5123  return false;
5124 
5125  // Check if there's a call inside this MachineBasicBlock. If there is, then
5126  // set a flag.
5127  if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
5129 
5130  MachineFunction *MF = MBB.getParent();
5131 
5132  // In the event that we outline, we may have to save LR. If there is an
5133  // available register in the MBB, then we'll always save LR there. Check if
5134  // this is true.
5135  bool CanSaveLR = false;
5136  const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
5137  MF->getSubtarget().getRegisterInfo());
5138 
5139  // Check if there is an available register across the sequence that we can
5140  // use.
5141  for (unsigned Reg : AArch64::GPR64RegClass) {
5142  if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR &&
5143  Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) {
5144  CanSaveLR = true;
5145  break;
5146  }
5147  }
5148 
5149  // Check if we have a register we can save LR to, and if LR was used
5150  // somewhere. If both of those things are true, then we need to evaluate the
5151  // safety of outlining stack instructions later.
5152  if (!CanSaveLR && !LRU.available(AArch64::LR))
5154 
5155  return true;
5156 }
5157 
5158 outliner::InstrType
5159 AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
5160                                    unsigned Flags) const {
5161  MachineInstr &MI = *MIT;
5162  MachineBasicBlock *MBB = MI.getParent();
5163  MachineFunction *MF = MBB->getParent();
5164  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
5165 
5166  // Don't outline LOHs.
5167  if (FuncInfo->getLOHRelated().count(&MI))
5169 
5170  // Don't allow debug values to impact outlining type.
5171  if (MI.isDebugInstr() || MI.isIndirectDebugValue())
5173 
5174  // At this point, KILL instructions don't really tell us much so we can go
5175  // ahead and skip over them.
5176  if (MI.isKill())
5178 
5179  // Is this a terminator for a basic block?
5180  if (MI.isTerminator()) {
5181 
5182  // Is this the end of a function?
5183  if (MI.getParent()->succ_empty())
5185 
5186  // It's not, so don't outline it.
5188  }
5189 
5190  // Make sure none of the operands are un-outlinable.
5191  for (const MachineOperand &MOP : MI.operands()) {
5192  if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
5193  MOP.isTargetIndex())
5195 
5196  // If it uses LR or W30 explicitly, then don't touch it.
5197  if (MOP.isReg() && !MOP.isImplicit() &&
5198  (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
5200  }
5201 
5202  // Special cases for instructions that can always be outlined, but will fail
5203  // the later tests, e.g., ADRPs, which are PC-relative, use LR, but can always
5204  // be outlined because they don't require a *specific* value to be in LR.
5205  if (MI.getOpcode() == AArch64::ADRP)
5207 
5208  // If MI is a call we might be able to outline it. We don't want to outline
5209  // any calls that rely on the position of items on the stack. When we outline
5210  // something containing a call, we have to emit a save and restore of LR in
5211  // the outlined function. Currently, this always happens by saving LR to the
5212  // stack. Thus, if we outline, say, half the parameters for a function call
5213  // plus the call, then we'll break the callee's expectations for the layout
5214  // of the stack.
5215  //
5216  // FIXME: Allow calls to functions which construct a stack frame, as long
5217  // as they don't access arguments on the stack.
5218  // FIXME: Figure out some way to analyze functions defined in other modules.
5219  // We should be able to compute the memory usage based on the IR calling
5220  // convention, even if we can't see the definition.
5221  if (MI.isCall()) {
5222  // Get the function associated with the call. Look at each operand and find
5223  // the one that represents the callee and get its name.
5224  const Function *Callee = nullptr;
5225  for (const MachineOperand &MOP : MI.operands()) {
5226  if (MOP.isGlobal()) {
5227  Callee = dyn_cast<Function>(MOP.getGlobal());
5228  break;
5229  }
5230  }
5231 
5232  // Never outline calls to mcount. There isn't any rule that would require
5233  // this, but the Linux kernel's "ftrace" feature depends on it.
5234  if (Callee && Callee->getName() == "\01_mcount")
5236 
5237  // If we don't know anything about the callee, assume it depends on the
5238  // stack layout of the caller. In that case, it's only legal to outline
5239  // as a tail-call. Whitelist the call instructions we know about so we
5240  // don't get unexpected results with call pseudo-instructions.
5241  auto UnknownCallOutlineType = outliner::InstrType::Illegal;
5242  if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL)
5243  UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
5244 
5245  if (!Callee)
5246  return UnknownCallOutlineType;
5247 
5248  // We have a function we have information about. Check if it's something
5249  // we can safely outline.
5250  MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
5251 
5252  // We don't know what's going on with the callee at all. Don't touch it.
5253  if (!CalleeMF)
5254  return UnknownCallOutlineType;
5255 
5256  // Check if we know anything about the callee saves on the function. If we
5257  // don't, then don't touch it, since that implies that we haven't
5258  // computed anything about its stack frame yet.
5259  MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
5260  if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
5261  MFI.getNumObjects() > 0)
5262  return UnknownCallOutlineType;
5263 
5264  // At this point, we can say that CalleeMF ought to not pass anything on the
5265  // stack. Therefore, we can outline it.
5267  }
5268 
5269  // Don't outline positions.
5270  if (MI.isPosition())
5272 
5273  // Don't touch the link register or W30.
5274  if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
5275  MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
5277 
5278  // Don't outline BTI instructions, because that will prevent the outlining
5279  // site from being indirectly callable.
5280  if (MI.getOpcode() == AArch64::HINT) {
5281  int64_t Imm = MI.getOperand(0).getImm();
5282  if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
5284  }
5285 
5287 }
5288 
5289 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
5290  for (MachineInstr &MI : MBB) {
5291  const MachineOperand *Base;
5292  unsigned Width;
5293  int64_t Offset;
5294 
5295  // Is this a load or store with an immediate offset with SP as the base?
5296  if (!MI.mayLoadOrStore() ||
5297  !getMemOperandWithOffsetWidth(MI, Base, Offset, Width, &RI) ||
5298  (Base->isReg() && Base->getReg() != AArch64::SP))
5299  continue;
5300 
5301  // It is, so we have to fix it up.
5302  unsigned Scale;
5303  int64_t Dummy1, Dummy2;
5304 
5305  MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
5306  assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
5307  getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
5308  assert(Scale != 0 && "Unexpected opcode!");
5309 
5310  // We've pushed the return address to the stack, so add 16 to the offset.
5311  // This is safe, since we already checked if it would overflow when we
5312  // checked if this instruction was legal to outline.
5313  int64_t NewImm = (Offset + 16) / Scale;
5314  StackOffsetOperand.setImm(NewImm);
5315  }
5316 }
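// For illustration: with the 16-byte LR spill added by the default frame, an
// SP-relative load inside the outlined body such as
//
//   ldr x0, [sp, #8]          // LDRXui, scaled immediate 1 (scale 8)
//
// is rewritten to
//
//   ldr x0, [sp, #24]         // scaled immediate (8 + 16) / 8 = 3
//
// so that it still addresses the caller's slot underneath the new spill.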
5317 
5318 void AArch64InstrInfo::buildOutlinedFrame(
5319     MachineBasicBlock &MBB, MachineFunction &MF,
5320     const outliner::OutlinedFunction &OF) const {
5321  // For thunk outlining, rewrite the last instruction from a call to a
5322  // tail-call.
5324  MachineInstr *Call = &*--MBB.instr_end();
5325  unsigned TailOpcode;
5326  if (Call->getOpcode() == AArch64::BL) {
5327  TailOpcode = AArch64::TCRETURNdi;
5328  } else {
5329  assert(Call->getOpcode() == AArch64::BLR);
5330  TailOpcode = AArch64::TCRETURNriALL;
5331  }
5332  MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
5333  .add(Call->getOperand(0))
5334  .addImm(0);
5335  MBB.insert(MBB.end(), TC);
5336  Call->eraseFromParent();
5337  }
5338 
5339  // Is there a call in the outlined range?
5340  auto IsNonTailCall = [](MachineInstr &MI) {
5341  return MI.isCall() && !MI.isReturn();
5342  };
5343  if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) {
5344  // Fix up the instructions in the range, since we're going to modify the
5345  // stack.
5347  "Can only fix up stack references once");
5348  fixupPostOutline(MBB);
5349 
5350  // LR has to be a live in so that we can save it.
5351  MBB.addLiveIn(AArch64::LR);
5352 
5354  MachineBasicBlock::iterator Et = MBB.end();
5355 
5358  Et = std::prev(MBB.end());
5359 
5360  // Insert a save before the outlined region
5361  MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
5362  .addReg(AArch64::SP, RegState::Define)
5363  .addReg(AArch64::LR)
5364  .addReg(AArch64::SP)
5365  .addImm(-16);
5366  It = MBB.insert(It, STRXpre);
5367 
5368  const TargetSubtargetInfo &STI = MF.getSubtarget();
5369  const MCRegisterInfo *MRI = STI.getRegisterInfo();
5370  unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
5371 
5372  // Add a CFI saying the stack was moved 16 B down.
5373  int64_t StackPosEntry =
5375  BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
5376  .addCFIIndex(StackPosEntry)
5378 
5379  // Add a CFI saying that the LR that we want to find is now 16 B higher than
5380  // before.
5381  int64_t LRPosEntry =
5382  MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, 16));
5383  BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
5384  .addCFIIndex(LRPosEntry)
5386 
5387  // Insert a restore before the terminator for the function.
5388  MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
5389  .addReg(AArch64::SP, RegState::Define)
5390  .addReg(AArch64::LR, RegState::Define)
5391  .addReg(AArch64::SP)
5392  .addImm(16);
5393  Et = MBB.insert(Et, LDRXpost);
5394  }
5395 
5396  // If this is a tail call outlined function, then there's already a return.
5399  return;
5400 
5401  // It's not a tail call, so we have to insert the return ourselves.
5402  MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
5403  .addReg(AArch64::LR, RegState::Undef);
5404  MBB.insert(MBB.end(), ret);
5405 
5406  // Did we have to modify the stack by saving the link register?
5408  return;
5409 
5410  // We modified the stack.
5411  // Walk over the basic block and fix up all the stack accesses.
5412  fixupPostOutline(MBB);
5413 }
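// For illustration, a non-tail-call outlined function that itself contains
// calls ends up framed roughly as:
//
//   OUTLINED_FUNCTION:
//     str x30, [sp, #-16]!    // STRXpre: spill LR, CFI recorded
//     ...                     // outlined body (SP offsets already fixed up)
//     ldr x30, [sp], #16      // LDRXpost: reload LR
//     ret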
5414 
5415 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
5416     Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
5417     MachineFunction &MF, const outliner::Candidate &C) const {
5418 
5419  // Are we tail calling?
5421  // If yes, then we can just branch to the label.
5422  It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
5423  .addGlobalAddress(M.getNamedValue(MF.getName()))
5424  .addImm(0));
5425  return It;
5426  }
5427 
5428  // Are we saving the link register?
5431  // No, so just insert the call.
5432  It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
5433  .addGlobalAddress(M.getNamedValue(MF.getName())));
5434  return It;
5435  }
5436 
5437  // We want to return the spot where we inserted the call.
5439 
5440  // Instructions for saving and restoring LR around the call instruction we're
5441  // going to insert.
5442  MachineInstr *Save;
5443  MachineInstr *Restore;
5444  // Can we save to a register?
5446  // FIXME: This logic should be sunk into a target-specific interface so that
5447  // we don't have to recompute the register.
5448  unsigned Reg = findRegisterToSaveLRTo(C);
5449  assert(Reg != 0 && "No callee-saved register available?");
5450 
5451  // Save and restore LR from that register.
5452  Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
5453  .addReg(AArch64::XZR)
5454  .addReg(AArch64::LR)
5455  .addImm(0);
5456  Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
5457  .addReg(AArch64::XZR)
5458  .addReg(Reg)
5459  .addImm(0);
5460  } else {
5461  // We have the default case. Save and restore from SP.
5462  Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
5463  .addReg(AArch64::SP, RegState::Define)
5464  .addReg(AArch64::LR)
5465  .addReg(AArch64::SP)
5466  .addImm(-16);
5467  Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
5468  .addReg(AArch64::SP, RegState::Define)
5469  .addReg(AArch64::LR, RegState::Define)
5470  .addReg(AArch64::SP)
5471  .addImm(16);
5472  }
5473 
5474  It = MBB.insert(It, Save);
5475  It++;
5476 
5477  // Insert the call.
5478  It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
5479  .addGlobalAddress(M.getNamedValue(MF.getName())));
5480  CallPt = It;
5481  It++;
5482 
5483  It = MBB.insert(It, Restore);
5484  return CallPt;
5485 }
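// For illustration, the call-insertion strategies above expand to:
//
//   tail call:      b   OUTLINED_FUNCTION                              // TCRETURNdi
//   no LR save:     bl  OUTLINED_FUNCTION
//   register save:  mov x<n>, lr; bl OUTLINED_FUNCTION; mov lr, x<n>
//   default:        str x30, [sp, #-16]!; bl OUTLINED_FUNCTION; ldr x30, [sp], #16
//
// where x<n> is the free register found by findRegisterToSaveLRTo.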
5486 
5487 bool