LLVM 9.0.0svn
AArch64InstrInfo.cpp
1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64InstrInfo.h"
14 #include "AArch64MachineFunctionInfo.h"
15 #include "AArch64Subtarget.h"
16 #include "MCTargetDesc/AArch64AddressingModes.h"
17 #include "Utils/AArch64BaseInfo.h"
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
30 #include "llvm/CodeGen/StackMaps.h"
33 #include "llvm/IR/DebugLoc.h"
34 #include "llvm/IR/GlobalValue.h"
35 #include "llvm/MC/MCInst.h"
36 #include "llvm/MC/MCInstrDesc.h"
37 #include "llvm/Support/Casting.h"
38 #include "llvm/Support/CodeGen.h"
39 #include "llvm/Support/CommandLine.h"
40 #include "llvm/Support/Compiler.h"
41 #include "llvm/Support/ErrorHandling.h"
42 #include "llvm/Support/MathExtras.h"
43 #include "llvm/Target/TargetMachine.h"
44 #include "llvm/Target/TargetOptions.h"
45 #include <cassert>
46 #include <cstdint>
47 #include <iterator>
48 #include <utility>
49 
50 using namespace llvm;
51 
52 #define GET_INSTRINFO_CTOR_DTOR
53 #include "AArch64GenInstrInfo.inc"
54 
55 static cl::opt<unsigned> TBZDisplacementBits(
56  "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
57  cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
58 
59 static cl::opt<unsigned> CBZDisplacementBits(
60  "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
61  cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
62 
63 static cl::opt<unsigned>
64  BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
65  cl::desc("Restrict range of Bcc instructions (DEBUG)"));
66 
67 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
68  : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
69  AArch64::CATCHRET),
70  RI(STI.getTargetTriple()), Subtarget(STI) {}
71 
72 /// getInstSizeInBytes - Return the number of bytes of code the specified
73 /// instruction may be. This returns the maximum number of bytes.
74 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
75  const MachineBasicBlock &MBB = *MI.getParent();
76  const MachineFunction *MF = MBB.getParent();
77  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
78 
79  if (MI.getOpcode() == AArch64::INLINEASM)
80  return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
81 
82  // FIXME: We currently only handle pseudoinstructions that don't get expanded
83  // before the assembly printer.
84  unsigned NumBytes = 0;
85  const MCInstrDesc &Desc = MI.getDesc();
86  switch (Desc.getOpcode()) {
87  default:
88  // Anything not explicitly designated otherwise is a normal 4-byte insn.
89  NumBytes = 4;
90  break;
91  case TargetOpcode::DBG_VALUE:
92  case TargetOpcode::EH_LABEL:
93  case TargetOpcode::IMPLICIT_DEF:
94  case TargetOpcode::KILL:
95  NumBytes = 0;
96  break;
97  case TargetOpcode::STACKMAP:
98  // The upper bound for a stackmap intrinsic is the full length of its shadow
99  NumBytes = StackMapOpers(&MI).getNumPatchBytes();
100  assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
101  break;
102  case TargetOpcode::PATCHPOINT:
103  // The size of the patchpoint intrinsic is the number of bytes requested
104  NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
105  assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
106  break;
107  case AArch64::TLSDESC_CALLSEQ:
108  // This gets lowered to an instruction sequence which takes 16 bytes
109  NumBytes = 16;
110  break;
111  case AArch64::JumpTableDest32:
112  case AArch64::JumpTableDest16:
113  case AArch64::JumpTableDest8:
114  NumBytes = 12;
115  break;
116  case AArch64::SPACE:
117  NumBytes = MI.getOperand(1).getImm();
118  break;
119  }
120 
121  return NumBytes;
122 }
123 
124 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
125  SmallVectorImpl<MachineOperand> &Cond) {
126  // Block ends with fall-through condbranch.
127  switch (LastInst->getOpcode()) {
128  default:
129  llvm_unreachable("Unknown branch instruction?");
130  case AArch64::Bcc:
131  Target = LastInst->getOperand(1).getMBB();
132  Cond.push_back(LastInst->getOperand(0));
133  break;
134  case AArch64::CBZW:
135  case AArch64::CBZX:
136  case AArch64::CBNZW:
137  case AArch64::CBNZX:
138  Target = LastInst->getOperand(1).getMBB();
139  Cond.push_back(MachineOperand::CreateImm(-1));
140  Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
141  Cond.push_back(LastInst->getOperand(0));
142  break;
143  case AArch64::TBZW:
144  case AArch64::TBZX:
145  case AArch64::TBNZW:
146  case AArch64::TBNZX:
147  Target = LastInst->getOperand(2).getMBB();
148  Cond.push_back(MachineOperand::CreateImm(-1));
149  Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
150  Cond.push_back(LastInst->getOperand(0));
151  Cond.push_back(LastInst->getOperand(1));
152  }
153 }
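// Illustrative note (editorial sketch, not in the original source): for
// "cbnz w8, %bb.2" the code above produces Target = %bb.2 and
// Cond = { -1, AArch64::CBNZW, w8 }; for "tbz x3, #5, %bb.4" it produces
// Cond = { -1, AArch64::TBZX, x3, 5 }. The leading -1 marks a folded
// compare-and-branch, while a plain Bcc stores only its condition code.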
154 
155 static unsigned getBranchDisplacementBits(unsigned Opc) {
156  switch (Opc) {
157  default:
158  llvm_unreachable("unexpected opcode!");
159  case AArch64::B:
160  return 64;
161  case AArch64::TBNZW:
162  case AArch64::TBZW:
163  case AArch64::TBNZX:
164  case AArch64::TBZX:
165  return TBZDisplacementBits;
166  case AArch64::CBNZW:
167  case AArch64::CBZW:
168  case AArch64::CBNZX:
169  case AArch64::CBZX:
170  return CBZDisplacementBits;
171  case AArch64::Bcc:
172  return BCCDisplacementBits;
173  }
174 }
175 
176 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
177  int64_t BrOffset) const {
178  unsigned Bits = getBranchDisplacementBits(BranchOp);
179  assert(Bits >= 3 && "max branch displacement must be enough to jump "
180  "over conditional branch expansion");
181  return isIntN(Bits, BrOffset / 4);
182 }
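// Illustrative note (editorial sketch): offsets are checked in units of 4-byte
// instructions, so an N-bit displacement reaches roughly +/-2^(N+1) bytes.
// With the default aarch64-tbz-offset-bits=14, a TB[N]Z byte offset must lie
// in [-32768, 32764]; the B case uses 64 bits, i.e. it is always in range.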
183 
184 MachineBasicBlock *AArch64InstrInfo::getBranchDestBlock(
185  const MachineInstr &MI) const {
186  switch (MI.getOpcode()) {
187  default:
188  llvm_unreachable("unexpected opcode!");
189  case AArch64::B:
190  return MI.getOperand(0).getMBB();
191  case AArch64::TBZW:
192  case AArch64::TBNZW:
193  case AArch64::TBZX:
194  case AArch64::TBNZX:
195  return MI.getOperand(2).getMBB();
196  case AArch64::CBZW:
197  case AArch64::CBNZW:
198  case AArch64::CBZX:
199  case AArch64::CBNZX:
200  case AArch64::Bcc:
201  return MI.getOperand(1).getMBB();
202  }
203 }
204 
205 // Branch analysis.
206 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
207  MachineBasicBlock *&TBB,
208  MachineBasicBlock *&FBB,
209  SmallVectorImpl<MachineOperand> &Cond,
210  bool AllowModify) const {
211  // If the block has no terminators, it just falls into the block after it.
212  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
213  if (I == MBB.end())
214  return false;
215 
216  if (!isUnpredicatedTerminator(*I))
217  return false;
218 
219  // Get the last instruction in the block.
220  MachineInstr *LastInst = &*I;
221 
222  // If there is only one terminator instruction, process it.
223  unsigned LastOpc = LastInst->getOpcode();
224  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
225  if (isUncondBranchOpcode(LastOpc)) {
226  TBB = LastInst->getOperand(0).getMBB();
227  return false;
228  }
229  if (isCondBranchOpcode(LastOpc)) {
230  // Block ends with fall-through condbranch.
231  parseCondBranch(LastInst, TBB, Cond);
232  return false;
233  }
234  return true; // Can't handle indirect branch.
235  }
236 
237  // Get the instruction before it if it is a terminator.
238  MachineInstr *SecondLastInst = &*I;
239  unsigned SecondLastOpc = SecondLastInst->getOpcode();
240 
241  // If AllowModify is true and the block ends with two or more unconditional
242  // branches, delete all but the first unconditional branch.
243  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
244  while (isUncondBranchOpcode(SecondLastOpc)) {
245  LastInst->eraseFromParent();
246  LastInst = SecondLastInst;
247  LastOpc = LastInst->getOpcode();
248  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
249  // Return now; the only terminator is an unconditional branch.
250  TBB = LastInst->getOperand(0).getMBB();
251  return false;
252  } else {
253  SecondLastInst = &*I;
254  SecondLastOpc = SecondLastInst->getOpcode();
255  }
256  }
257  }
258 
259  // If there are three terminators, we don't know what sort of block this is.
260  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
261  return true;
262 
263  // If the block ends with a B and a Bcc, handle it.
264  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
265  parseCondBranch(SecondLastInst, TBB, Cond);
266  FBB = LastInst->getOperand(0).getMBB();
267  return false;
268  }
269 
270  // If the block ends with two unconditional branches, handle it. The second
271  // one is not executed, so remove it.
272  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
273  TBB = SecondLastInst->getOperand(0).getMBB();
274  I = LastInst;
275  if (AllowModify)
276  I->eraseFromParent();
277  return false;
278  }
279 
280  // ...likewise if it ends with an indirect branch followed by an unconditional
281  // branch.
282  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
283  I = LastInst;
284  if (AllowModify)
285  I->eraseFromParent();
286  return true;
287  }
288 
289  // Otherwise, can't handle this.
290  return true;
291 }
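// Illustrative note (editorial sketch; block names are hypothetical): for a
// block ending in
//   b.lt %bb.then
//   b    %bb.else
// analyzeBranch returns false with TBB = %bb.then, FBB = %bb.else and
// Cond = { AArch64CC::LT }, while a lone indirect terminator such as "br x9"
// makes it return true (cannot analyze).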
292 
293 bool AArch64InstrInfo::reverseBranchCondition(
294  SmallVectorImpl<MachineOperand> &Cond) const {
295  if (Cond[0].getImm() != -1) {
296  // Regular Bcc
297  AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
298  Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
299  } else {
300  // Folded compare-and-branch
301  switch (Cond[1].getImm()) {
302  default:
303  llvm_unreachable("Unknown conditional branch!");
304  case AArch64::CBZW:
305  Cond[1].setImm(AArch64::CBNZW);
306  break;
307  case AArch64::CBNZW:
308  Cond[1].setImm(AArch64::CBZW);
309  break;
310  case AArch64::CBZX:
311  Cond[1].setImm(AArch64::CBNZX);
312  break;
313  case AArch64::CBNZX:
314  Cond[1].setImm(AArch64::CBZX);
315  break;
316  case AArch64::TBZW:
317  Cond[1].setImm(AArch64::TBNZW);
318  break;
319  case AArch64::TBNZW:
320  Cond[1].setImm(AArch64::TBZW);
321  break;
322  case AArch64::TBZX:
323  Cond[1].setImm(AArch64::TBNZX);
324  break;
325  case AArch64::TBNZX:
326  Cond[1].setImm(AArch64::TBZX);
327  break;
328  }
329  }
330 
331  return false;
332 }
333 
334 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
335  int *BytesRemoved) const {
336  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
337  if (I == MBB.end())
338  return 0;
339 
340  if (!isUncondBranchOpcode(I->getOpcode()) &&
341  !isCondBranchOpcode(I->getOpcode()))
342  return 0;
343 
344  // Remove the branch.
345  I->eraseFromParent();
346 
347  I = MBB.end();
348 
349  if (I == MBB.begin()) {
350  if (BytesRemoved)
351  *BytesRemoved = 4;
352  return 1;
353  }
354  --I;
355  if (!isCondBranchOpcode(I->getOpcode())) {
356  if (BytesRemoved)
357  *BytesRemoved = 4;
358  return 1;
359  }
360 
361  // Remove the branch.
362  I->eraseFromParent();
363  if (BytesRemoved)
364  *BytesRemoved = 8;
365 
366  return 2;
367 }
368 
369 void AArch64InstrInfo::instantiateCondBranch(
370  MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
371  ArrayRef<MachineOperand> Cond) const {
372  if (Cond[0].getImm() != -1) {
373  // Regular Bcc
374  BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
375  } else {
376  // Folded compare-and-branch
377  // Note that we use addOperand instead of addReg to keep the flags.
378  const MachineInstrBuilder MIB =
379  BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
380  if (Cond.size() > 3)
381  MIB.addImm(Cond[3].getImm());
382  MIB.addMBB(TBB);
383  }
384 }
385 
386 unsigned AArch64InstrInfo::insertBranch(
387  MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
388  ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
389  // Shouldn't be a fall through.
390  assert(TBB && "insertBranch must not be told to insert a fallthrough");
391 
392  if (!FBB) {
393  if (Cond.empty()) // Unconditional branch?
394  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
395  else
396  instantiateCondBranch(MBB, DL, TBB, Cond);
397 
398  if (BytesAdded)
399  *BytesAdded = 4;
400 
401  return 1;
402  }
403 
404  // Two-way conditional branch.
405  instantiateCondBranch(MBB, DL, TBB, Cond);
406  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
407 
408  if (BytesAdded)
409  *BytesAdded = 8;
410 
411  return 2;
412 }
413 
414 // Find the original register that VReg is copied from.
415 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
416  while (TargetRegisterInfo::isVirtualRegister(VReg)) {
417  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
418  if (!DefMI->isFullCopy())
419  return VReg;
420  VReg = DefMI->getOperand(1).getReg();
421  }
422  return VReg;
423 }
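// Illustrative note (editorial sketch; register numbers are hypothetical): if
// %5 = COPY %4, %4 = COPY %3 and %3 is defined by an ADDWri, then
// removeCopies(MRI, %5) walks the full-copy chain and returns %3, whose
// defining instruction is what canFoldIntoCSel() below inspects.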
424 
425 // Determine if VReg is defined by an instruction that can be folded into a
426 // csel instruction. If so, return the folded opcode, and the replacement
427 // register.
428 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
429  unsigned *NewVReg = nullptr) {
430  VReg = removeCopies(MRI, VReg);
431  if (!TargetRegisterInfo::isVirtualRegister(VReg))
432  return 0;
433 
434  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
435  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
436  unsigned Opc = 0;
437  unsigned SrcOpNum = 0;
438  switch (DefMI->getOpcode()) {
439  case AArch64::ADDSXri:
440  case AArch64::ADDSWri:
441  // if NZCV is used, do not fold.
442  if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
443  return 0;
444  // fall-through to ADDXri and ADDWri.
445  LLVM_FALLTHROUGH;
446  case AArch64::ADDXri:
447  case AArch64::ADDWri:
448  // add x, 1 -> csinc.
449  if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
450  DefMI->getOperand(3).getImm() != 0)
451  return 0;
452  SrcOpNum = 1;
453  Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
454  break;
455 
456  case AArch64::ORNXrr:
457  case AArch64::ORNWrr: {
458  // not x -> csinv, represented as orn dst, xzr, src.
459  unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
460  if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
461  return 0;
462  SrcOpNum = 2;
463  Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
464  break;
465  }
466 
467  case AArch64::SUBSXrr:
468  case AArch64::SUBSWrr:
469  // if NZCV is used, do not fold.
470  if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
471  return 0;
472  // fall-through to SUBXrr and SUBWrr.
473  LLVM_FALLTHROUGH;
474  case AArch64::SUBXrr:
475  case AArch64::SUBWrr: {
476  // neg x -> csneg, represented as sub dst, xzr, src.
477  unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
478  if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
479  return 0;
480  SrcOpNum = 2;
481  Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
482  break;
483  }
484  default:
485  return 0;
486  }
487  assert(Opc && SrcOpNum && "Missing parameters");
488 
489  if (NewVReg)
490  *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
491  return Opc;
492 }
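// Illustrative note (editorial sketch; register numbers are hypothetical):
// given "%1 = ADDWri %0, 1, 0" (i.e. %1 = %0 + 1), canFoldIntoCSel returns
// AArch64::CSINCWr with *NewVReg = %0, so a "csel %d, %t, %1, cc" can be
// emitted as "csinc %d, %t, %0, cc" instead, saving the add.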
493 
494 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
495  ArrayRef<MachineOperand> Cond,
496  unsigned TrueReg, unsigned FalseReg,
497  int &CondCycles, int &TrueCycles,
498  int &FalseCycles) const {
499  // Check register classes.
500  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
501  const TargetRegisterClass *RC =
502  RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
503  if (!RC)
504  return false;
505 
506  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
507  unsigned ExtraCondLat = Cond.size() != 1;
508 
509  // GPRs are handled by csel.
510  // FIXME: Fold in x+1, -x, and ~x when applicable.
511  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
512  AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
513  // Single-cycle csel, csinc, csinv, and csneg.
514  CondCycles = 1 + ExtraCondLat;
515  TrueCycles = FalseCycles = 1;
516  if (canFoldIntoCSel(MRI, TrueReg))
517  TrueCycles = 0;
518  else if (canFoldIntoCSel(MRI, FalseReg))
519  FalseCycles = 0;
520  return true;
521  }
522 
523  // Scalar floating point is handled by fcsel.
524  // FIXME: Form fabs, fmin, and fmax when applicable.
525  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
526  AArch64::FPR32RegClass.hasSubClassEq(RC)) {
527  CondCycles = 5 + ExtraCondLat;
528  TrueCycles = FalseCycles = 2;
529  return true;
530  }
531 
532  // Can't do vectors.
533  return false;
534 }
535 
536 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
537  MachineBasicBlock::iterator I,
538  const DebugLoc &DL, unsigned DstReg,
539  ArrayRef<MachineOperand> Cond,
540  unsigned TrueReg, unsigned FalseReg) const {
541  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
542 
543  // Parse the condition code, see parseCondBranch() above.
544  AArch64CC::CondCode CC;
545  switch (Cond.size()) {
546  default:
547  llvm_unreachable("Unknown condition opcode in Cond");
548  case 1: // b.cc
549  CC = AArch64CC::CondCode(Cond[0].getImm());
550  break;
551  case 3: { // cbz/cbnz
552  // We must insert a compare against 0.
553  bool Is64Bit;
554  switch (Cond[1].getImm()) {
555  default:
556  llvm_unreachable("Unknown branch opcode in Cond");
557  case AArch64::CBZW:
558  Is64Bit = false;
559  CC = AArch64CC::EQ;
560  break;
561  case AArch64::CBZX:
562  Is64Bit = true;
563  CC = AArch64CC::EQ;
564  break;
565  case AArch64::CBNZW:
566  Is64Bit = false;
567  CC = AArch64CC::NE;
568  break;
569  case AArch64::CBNZX:
570  Is64Bit = true;
571  CC = AArch64CC::NE;
572  break;
573  }
574  unsigned SrcReg = Cond[2].getReg();
575  if (Is64Bit) {
576  // cmp reg, #0 is actually subs xzr, reg, #0.
577  MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
578  BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
579  .addReg(SrcReg)
580  .addImm(0)
581  .addImm(0);
582  } else {
583  MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
584  BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
585  .addReg(SrcReg)
586  .addImm(0)
587  .addImm(0);
588  }
589  break;
590  }
591  case 4: { // tbz/tbnz
592  // We must insert a tst instruction.
593  switch (Cond[1].getImm()) {
594  default:
595  llvm_unreachable("Unknown branch opcode in Cond");
596  case AArch64::TBZW:
597  case AArch64::TBZX:
598  CC = AArch64CC::EQ;
599  break;
600  case AArch64::TBNZW:
601  case AArch64::TBNZX:
602  CC = AArch64CC::NE;
603  break;
604  }
605  // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
606  if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
607  BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
608  .addReg(Cond[2].getReg())
609  .addImm(
610  AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
611  else
612  BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
613  .addReg(Cond[2].getReg())
614  .addImm(
615  AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
616  break;
617  }
618  }
619 
620  unsigned Opc = 0;
621  const TargetRegisterClass *RC = nullptr;
622  bool TryFold = false;
623  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
624  RC = &AArch64::GPR64RegClass;
625  Opc = AArch64::CSELXr;
626  TryFold = true;
627  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
628  RC = &AArch64::GPR32RegClass;
629  Opc = AArch64::CSELWr;
630  TryFold = true;
631  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
632  RC = &AArch64::FPR64RegClass;
633  Opc = AArch64::FCSELDrrr;
634  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
635  RC = &AArch64::FPR32RegClass;
636  Opc = AArch64::FCSELSrrr;
637  }
638  assert(RC && "Unsupported regclass");
639 
640  // Try folding simple instructions into the csel.
641  if (TryFold) {
642  unsigned NewVReg = 0;
643  unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
644  if (FoldedOpc) {
645  // The folded opcodes csinc, csinv and csneg apply the operation to
646  // FalseReg, so we need to invert the condition.
647  CC = AArch64CC::getInvertedCondCode(CC);
648  TrueReg = FalseReg;
649  } else
650  FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
651 
652  // Fold the operation. Leave any dead instructions for DCE to clean up.
653  if (FoldedOpc) {
654  FalseReg = NewVReg;
655  Opc = FoldedOpc;
656  // This extends the live range of NewVReg.
657  MRI.clearKillFlags(NewVReg);
658  }
659  }
660 
661  // Pull all virtual registers into the appropriate class.
662  MRI.constrainRegClass(TrueReg, RC);
663  MRI.constrainRegClass(FalseReg, RC);
664 
665  // Insert the csel.
666  BuildMI(MBB, I, DL, get(Opc), DstReg)
667  .addReg(TrueReg)
668  .addReg(FalseReg)
669  .addImm(CC);
670 }
671 
672 /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
673 static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
674  uint64_t Imm = MI.getOperand(1).getImm();
675  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
676  uint64_t Encoding;
677  return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
678 }
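// Illustrative note (editorial sketch): MOVi32imm #0x00ff00ff is a valid
// logical immediate (a replicated 16-bit pattern), so it can be emitted as
// "orr wD, wzr, #0x00ff00ff"; a constant like 0x12345678 is not encodable and
// keeps the usual MOVZ/MOVK expansion.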
679 
680 // FIXME: this implementation should be micro-architecture dependent, so a
681 // micro-architecture target hook should be introduced here in future.
682 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
683  if (!Subtarget.hasCustomCheapAsMoveHandling())
684  return MI.isAsCheapAsAMove();
685 
686  const unsigned Opcode = MI.getOpcode();
687 
688  // Firstly, check cases gated by features.
689 
690  if (Subtarget.hasZeroCycleZeroingFP()) {
691  if (Opcode == AArch64::FMOVH0 ||
692  Opcode == AArch64::FMOVS0 ||
693  Opcode == AArch64::FMOVD0)
694  return true;
695  }
696 
697  if (Subtarget.hasZeroCycleZeroingGP()) {
698  if (Opcode == TargetOpcode::COPY &&
699  (MI.getOperand(1).getReg() == AArch64::WZR ||
700  MI.getOperand(1).getReg() == AArch64::XZR))
701  return true;
702  }
703 
704  // Secondly, check cases specific to sub-targets.
705 
706  if (Subtarget.hasExynosCheapAsMoveHandling()) {
707  if (isExynosCheapAsMove(MI))
708  return true;
709 
710  return MI.isAsCheapAsAMove();
711  }
712 
713  // Finally, check generic cases.
714 
715  switch (Opcode) {
716  default:
717  return false;
718 
719  // add/sub on register without shift
720  case AArch64::ADDWri:
721  case AArch64::ADDXri:
722  case AArch64::SUBWri:
723  case AArch64::SUBXri:
724  return (MI.getOperand(3).getImm() == 0);
725 
726  // logical ops on immediate
727  case AArch64::ANDWri:
728  case AArch64::ANDXri:
729  case AArch64::EORWri:
730  case AArch64::EORXri:
731  case AArch64::ORRWri:
732  case AArch64::ORRXri:
733  return true;
734 
735  // logical ops on register without shift
736  case AArch64::ANDWrr:
737  case AArch64::ANDXrr:
738  case AArch64::BICWrr:
739  case AArch64::BICXrr:
740  case AArch64::EONWrr:
741  case AArch64::EONXrr:
742  case AArch64::EORWrr:
743  case AArch64::EORXrr:
744  case AArch64::ORNWrr:
745  case AArch64::ORNXrr:
746  case AArch64::ORRWrr:
747  case AArch64::ORRXrr:
748  return true;
749 
750  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
751  // ORRXri, it is as cheap as MOV
752  case AArch64::MOVi32imm:
753  return canBeExpandedToORR(MI, 32);
754  case AArch64::MOVi64imm:
755  return canBeExpandedToORR(MI, 64);
756  }
757 
758  llvm_unreachable("Unknown opcode to check as cheap as a move!");
759 }
760 
761 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
762  switch (MI.getOpcode()) {
763  default:
764  return false;
765 
766  case AArch64::ADDWrs:
767  case AArch64::ADDXrs:
768  case AArch64::ADDSWrs:
769  case AArch64::ADDSXrs: {
770  unsigned Imm = MI.getOperand(3).getImm();
771  unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
772  if (ShiftVal == 0)
773  return true;
774  return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
775  }
776 
777  case AArch64::ADDWrx:
778  case AArch64::ADDXrx:
779  case AArch64::ADDXrx64:
780  case AArch64::ADDSWrx:
781  case AArch64::ADDSXrx:
782  case AArch64::ADDSXrx64: {
783  unsigned Imm = MI.getOperand(3).getImm();
784  switch (AArch64_AM::getArithExtendType(Imm)) {
785  default:
786  return false;
787  case AArch64_AM::UXTB:
788  case AArch64_AM::UXTH:
789  case AArch64_AM::UXTW:
790  case AArch64_AM::UXTX:
791  return AArch64_AM::getArithShiftValue(Imm) <= 4;
792  }
793  }
794 
795  case AArch64::SUBWrs:
796  case AArch64::SUBSWrs: {
797  unsigned Imm = MI.getOperand(3).getImm();
798  unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
799  return ShiftVal == 0 ||
800  (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
801  }
802 
803  case AArch64::SUBXrs:
804  case AArch64::SUBSXrs: {
805  unsigned Imm = MI.getOperand(3).getImm();
806  unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
807  return ShiftVal == 0 ||
808  (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
809  }
810 
811  case AArch64::SUBWrx:
812  case AArch64::SUBXrx:
813  case AArch64::SUBXrx64:
814  case AArch64::SUBSWrx:
815  case AArch64::SUBSXrx:
816  case AArch64::SUBSXrx64: {
817  unsigned Imm = MI.getOperand(3).getImm();
818  switch (AArch64_AM::getArithExtendType(Imm)) {
819  default:
820  return false;
821  case AArch64_AM::UXTB:
822  case AArch64_AM::UXTH:
823  case AArch64_AM::UXTW:
824  case AArch64_AM::UXTX:
825  return AArch64_AM::getArithShiftValue(Imm) == 0;
826  }
827  }
828 
829  case AArch64::LDRBBroW:
830  case AArch64::LDRBBroX:
831  case AArch64::LDRBroW:
832  case AArch64::LDRBroX:
833  case AArch64::LDRDroW:
834  case AArch64::LDRDroX:
835  case AArch64::LDRHHroW:
836  case AArch64::LDRHHroX:
837  case AArch64::LDRHroW:
838  case AArch64::LDRHroX:
839  case AArch64::LDRQroW:
840  case AArch64::LDRQroX:
841  case AArch64::LDRSBWroW:
842  case AArch64::LDRSBWroX:
843  case AArch64::LDRSBXroW:
844  case AArch64::LDRSBXroX:
845  case AArch64::LDRSHWroW:
846  case AArch64::LDRSHWroX:
847  case AArch64::LDRSHXroW:
848  case AArch64::LDRSHXroX:
849  case AArch64::LDRSWroW:
850  case AArch64::LDRSWroX:
851  case AArch64::LDRSroW:
852  case AArch64::LDRSroX:
853  case AArch64::LDRWroW:
854  case AArch64::LDRWroX:
855  case AArch64::LDRXroW:
856  case AArch64::LDRXroX:
857  case AArch64::PRFMroW:
858  case AArch64::PRFMroX:
859  case AArch64::STRBBroW:
860  case AArch64::STRBBroX:
861  case AArch64::STRBroW:
862  case AArch64::STRBroX:
863  case AArch64::STRDroW:
864  case AArch64::STRDroX:
865  case AArch64::STRHHroW:
866  case AArch64::STRHHroX:
867  case AArch64::STRHroW:
868  case AArch64::STRHroX:
869  case AArch64::STRQroW:
870  case AArch64::STRQroX:
871  case AArch64::STRSroW:
872  case AArch64::STRSroX:
873  case AArch64::STRWroW:
874  case AArch64::STRWroX:
875  case AArch64::STRXroW:
876  case AArch64::STRXroX: {
877  unsigned IsSigned = MI.getOperand(3).getImm();
878  return !IsSigned;
879  }
880  }
881 }
882 
883 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
884  unsigned Opc = MI.getOpcode();
885  switch (Opc) {
886  default:
887  return false;
888  case AArch64::SEH_StackAlloc:
889  case AArch64::SEH_SaveFPLR:
890  case AArch64::SEH_SaveFPLR_X:
891  case AArch64::SEH_SaveReg:
892  case AArch64::SEH_SaveReg_X:
893  case AArch64::SEH_SaveRegP:
894  case AArch64::SEH_SaveRegP_X:
895  case AArch64::SEH_SaveFReg:
896  case AArch64::SEH_SaveFReg_X:
897  case AArch64::SEH_SaveFRegP:
898  case AArch64::SEH_SaveFRegP_X:
899  case AArch64::SEH_SetFP:
900  case AArch64::SEH_AddFP:
901  case AArch64::SEH_Nop:
902  case AArch64::SEH_PrologEnd:
903  case AArch64::SEH_EpilogStart:
904  case AArch64::SEH_EpilogEnd:
905  return true;
906  }
907 }
908 
909 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
910  unsigned &SrcReg, unsigned &DstReg,
911  unsigned &SubIdx) const {
912  switch (MI.getOpcode()) {
913  default:
914  return false;
915  case AArch64::SBFMXri: // aka sxtw
916  case AArch64::UBFMXri: // aka uxtw
917  // Check for the 32 -> 64 bit extension case, these instructions can do
918  // much more.
919  if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
920  return false;
921  // This is a signed or unsigned 32 -> 64 bit extension.
922  SrcReg = MI.getOperand(1).getReg();
923  DstReg = MI.getOperand(0).getReg();
924  SubIdx = AArch64::sub_32;
925  return true;
926  }
927 }
928 
929 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
930  MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const {
931  const TargetRegisterInfo *TRI = &getRegisterInfo();
932  MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
933  int64_t OffsetA = 0, OffsetB = 0;
934  unsigned WidthA = 0, WidthB = 0;
935 
936  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
937  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
938 
939  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
940  MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
941  return false;
942 
943  // Retrieve the base, offset from the base and width. Width
944  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
945  // the base operands are identical, and the offset of a lower memory access +
946  // the width doesn't overlap the offset of a higher memory access,
947  // then the memory accesses are different.
948  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) &&
949  getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) {
950  if (BaseOpA->isIdenticalTo(*BaseOpB)) {
951  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
952  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
953  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
954  if (LowOffset + LowWidth <= HighOffset)
955  return true;
956  }
957  }
958  return false;
959 }
960 
961 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
962  const MachineBasicBlock *MBB,
963  const MachineFunction &MF) const {
964  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
965  return true;
966  switch (MI.getOpcode()) {
967  case AArch64::HINT:
968  // CSDB hints are scheduling barriers.
969  if (MI.getOperand(0).getImm() == 0x14)
970  return true;
971  break;
972  case AArch64::DSB:
973  case AArch64::ISB:
974  // DSB and ISB also are scheduling barriers.
975  return true;
976  default:;
977  }
978  return isSEHInstruction(MI);
979 }
980 
981 /// analyzeCompare - For a comparison instruction, return the source registers
982 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
983 /// Return true if the comparison instruction can be analyzed.
984 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
985  unsigned &SrcReg2, int &CmpMask,
986  int &CmpValue) const {
987  // The first operand can be a frame index where we'd normally expect a
988  // register.
989  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
990  if (!MI.getOperand(1).isReg())
991  return false;
992 
993  switch (MI.getOpcode()) {
994  default:
995  break;
996  case AArch64::SUBSWrr:
997  case AArch64::SUBSWrs:
998  case AArch64::SUBSWrx:
999  case AArch64::SUBSXrr:
1000  case AArch64::SUBSXrs:
1001  case AArch64::SUBSXrx:
1002  case AArch64::ADDSWrr:
1003  case AArch64::ADDSWrs:
1004  case AArch64::ADDSWrx:
1005  case AArch64::ADDSXrr:
1006  case AArch64::ADDSXrs:
1007  case AArch64::ADDSXrx:
1008  // Replace SUBSWrr with SUBWrr if NZCV is not used.
1009  SrcReg = MI.getOperand(1).getReg();
1010  SrcReg2 = MI.getOperand(2).getReg();
1011  CmpMask = ~0;
1012  CmpValue = 0;
1013  return true;
1014  case AArch64::SUBSWri:
1015  case AArch64::ADDSWri:
1016  case AArch64::SUBSXri:
1017  case AArch64::ADDSXri:
1018  SrcReg = MI.getOperand(1).getReg();
1019  SrcReg2 = 0;
1020  CmpMask = ~0;
1021  // FIXME: In order to convert CmpValue to 0 or 1
1022  CmpValue = MI.getOperand(2).getImm() != 0;
1023  return true;
1024  case AArch64::ANDSWri:
1025  case AArch64::ANDSXri:
1026  // ANDS does not use the same encoding scheme as the other xxxS
1027  // instructions.
1028  SrcReg = MI.getOperand(1).getReg();
1029  SrcReg2 = 0;
1030  CmpMask = ~0;
1031  // FIXME:The return val type of decodeLogicalImmediate is uint64_t,
1032  // while the type of CmpValue is int. When converting uint64_t to int,
1033  // the high 32 bits of uint64_t will be lost.
1034  // In fact it causes a bug in spec2006-483.xalancbmk
1035  // CmpValue is only used to compare with zero in OptimizeCompareInstr
1036  CmpValue = AArch64_AM::decodeLogicalImmediate(
1037  MI.getOperand(2).getImm(),
1038  MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
1039  return true;
1040  }
1041 
1042  return false;
1043 }
1044 
1045 static bool UpdateOperandRegClass(MachineInstr &Instr) {
1046  MachineBasicBlock *MBB = Instr.getParent();
1047  assert(MBB && "Can't get MachineBasicBlock here");
1048  MachineFunction *MF = MBB->getParent();
1049  assert(MF && "Can't get MachineFunction here");
1050  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1051  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1052  MachineRegisterInfo *MRI = &MF->getRegInfo();
1053 
1054  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1055  ++OpIdx) {
1056  MachineOperand &MO = Instr.getOperand(OpIdx);
1057  const TargetRegisterClass *OpRegCstraints =
1058  Instr.getRegClassConstraint(OpIdx, TII, TRI);
1059 
1060  // If there's no constraint, there's nothing to do.
1061  if (!OpRegCstraints)
1062  continue;
1063  // If the operand is a frame index, there's nothing to do here.
1064  // A frame index operand will resolve correctly during PEI.
1065  if (MO.isFI())
1066  continue;
1067 
1068  assert(MO.isReg() &&
1069  "Operand has register constraints without being a register!");
1070 
1071  unsigned Reg = MO.getReg();
1072  if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
1073  if (!OpRegCstraints->contains(Reg))
1074  return false;
1075  } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1076  !MRI->constrainRegClass(Reg, OpRegCstraints))
1077  return false;
1078  }
1079 
1080  return true;
1081 }
1082 
1083 /// Return the opcode that does not set flags when possible - otherwise
1084 /// return the original opcode. The caller is responsible to do the actual
1085 /// substitution and legality checking.
1086 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1087  // Don't convert all compare instructions, because for some the zero register
1088  // encoding becomes the sp register.
1089  bool MIDefinesZeroReg = false;
1090  if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
1091  MIDefinesZeroReg = true;
1092 
1093  switch (MI.getOpcode()) {
1094  default:
1095  return MI.getOpcode();
1096  case AArch64::ADDSWrr:
1097  return AArch64::ADDWrr;
1098  case AArch64::ADDSWri:
1099  return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1100  case AArch64::ADDSWrs:
1101  return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1102  case AArch64::ADDSWrx:
1103  return AArch64::ADDWrx;
1104  case AArch64::ADDSXrr:
1105  return AArch64::ADDXrr;
1106  case AArch64::ADDSXri:
1107  return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1108  case AArch64::ADDSXrs:
1109  return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1110  case AArch64::ADDSXrx:
1111  return AArch64::ADDXrx;
1112  case AArch64::SUBSWrr:
1113  return AArch64::SUBWrr;
1114  case AArch64::SUBSWri:
1115  return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1116  case AArch64::SUBSWrs:
1117  return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1118  case AArch64::SUBSWrx:
1119  return AArch64::SUBWrx;
1120  case AArch64::SUBSXrr:
1121  return AArch64::SUBXrr;
1122  case AArch64::SUBSXri:
1123  return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1124  case AArch64::SUBSXrs:
1125  return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1126  case AArch64::SUBSXrx:
1127  return AArch64::SUBXrx;
1128  }
1129 }
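// Illustrative note (editorial sketch): "cmp w0, #5" is really
// "SUBSWri $wzr, $w0, 5, 0"; because it defines WZR the function keeps
// SUBSWri (dropping the S would re-encode the destination as WSP), whereas an
// ordinary "SUBSWri %2, %1, 5, 0" with a dead NZCV def is rewritten to SUBWri.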
1130 
1131 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1132 
1133 /// True when condition flags are accessed (either by writing or reading)
1134 /// on the instruction trace starting at From and ending at To.
1135 ///
1136 /// Note: If From and To are from different blocks it's assumed CC are accessed
1137 /// on the path.
1138 static bool areCFlagsAccessedBetweenInstrs(
1139  MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1140  const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1141  // Early exit if To is at the beginning of the BB.
1142  if (To == To->getParent()->begin())
1143  return true;
1144 
1145  // Check whether the instructions are in the same basic block
1146  // If not, assume the condition flags might get modified somewhere.
1147  if (To->getParent() != From->getParent())
1148  return true;
1149 
1150  // From must be above To.
1151  assert(std::find_if(++To.getReverse(), To->getParent()->rend(),
1152  [From](MachineInstr &MI) {
1153  return MI.getIterator() == From;
1154  }) != To->getParent()->rend());
1155 
1156  // We iterate backward starting \p To until we hit \p From.
1157  for (--To; To != From; --To) {
1158  const MachineInstr &Instr = *To;
1159 
1160  if (((AccessToCheck & AK_Write) &&
1161  Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1162  ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1163  return true;
1164  }
1165  return false;
1166 }
1167 
1168 /// Try to optimize a compare instruction. A compare instruction is an
1169 /// instruction which produces AArch64::NZCV. It is a true compare
1170 /// instruction only when there are no uses of its destination register.
1172 ///
1173 /// The following steps are tried in order:
1174 /// 1. Convert CmpInstr into an unconditional version.
1175 /// 2. Remove CmpInstr if above there is an instruction producing a needed
1176 /// condition code or an instruction which can be converted into such an
1177 /// instruction.
1178 /// Only comparison with zero is supported.
1179 bool AArch64InstrInfo::optimizeCompareInstr(
1180  MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
1181  int CmpValue, const MachineRegisterInfo *MRI) const {
1182  assert(CmpInstr.getParent());
1183  assert(MRI);
1184 
1185  // Replace SUBSWrr with SUBWrr if NZCV is not used.
1186  int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
1187  if (DeadNZCVIdx != -1) {
1188  if (CmpInstr.definesRegister(AArch64::WZR) ||
1189  CmpInstr.definesRegister(AArch64::XZR)) {
1190  CmpInstr.eraseFromParent();
1191  return true;
1192  }
1193  unsigned Opc = CmpInstr.getOpcode();
1194  unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1195  if (NewOpc == Opc)
1196  return false;
1197  const MCInstrDesc &MCID = get(NewOpc);
1198  CmpInstr.setDesc(MCID);
1199  CmpInstr.RemoveOperand(DeadNZCVIdx);
1200  bool succeeded = UpdateOperandRegClass(CmpInstr);
1201  (void)succeeded;
1202  assert(succeeded && "Some operands reg class are incompatible!");
1203  return true;
1204  }
1205 
1206  // Continue only if we have a "ri" where immediate is zero.
1207  // FIXME:CmpValue has already been converted to 0 or 1 in analyzeCompare
1208  // function.
1209  assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
1210  if (CmpValue != 0 || SrcReg2 != 0)
1211  return false;
1212 
1213  // CmpInstr is a compare instruction if the destination register is not used.
1214  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1215  return false;
1216 
1217  return substituteCmpToZero(CmpInstr, SrcReg, MRI);
1218 }
1219 
1220 /// Get opcode of S version of Instr.
1221 /// If Instr is S version its opcode is returned.
1222 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
1223 /// or we are not interested in it.
1224 static unsigned sForm(MachineInstr &Instr) {
1225  switch (Instr.getOpcode()) {
1226  default:
1227  return AArch64::INSTRUCTION_LIST_END;
1228 
1229  case AArch64::ADDSWrr:
1230  case AArch64::ADDSWri:
1231  case AArch64::ADDSXrr:
1232  case AArch64::ADDSXri:
1233  case AArch64::SUBSWrr:
1234  case AArch64::SUBSWri:
1235  case AArch64::SUBSXrr:
1236  case AArch64::SUBSXri:
1237  return Instr.getOpcode();
1238 
1239  case AArch64::ADDWrr:
1240  return AArch64::ADDSWrr;
1241  case AArch64::ADDWri:
1242  return AArch64::ADDSWri;
1243  case AArch64::ADDXrr:
1244  return AArch64::ADDSXrr;
1245  case AArch64::ADDXri:
1246  return AArch64::ADDSXri;
1247  case AArch64::ADCWr:
1248  return AArch64::ADCSWr;
1249  case AArch64::ADCXr:
1250  return AArch64::ADCSXr;
1251  case AArch64::SUBWrr:
1252  return AArch64::SUBSWrr;
1253  case AArch64::SUBWri:
1254  return AArch64::SUBSWri;
1255  case AArch64::SUBXrr:
1256  return AArch64::SUBSXrr;
1257  case AArch64::SUBXri:
1258  return AArch64::SUBSXri;
1259  case AArch64::SBCWr:
1260  return AArch64::SBCSWr;
1261  case AArch64::SBCXr:
1262  return AArch64::SBCSXr;
1263  case AArch64::ANDWri:
1264  return AArch64::ANDSWri;
1265  case AArch64::ANDXri:
1266  return AArch64::ANDSXri;
1267  }
1268 }
1269 
1270 /// Check if AArch64::NZCV should be alive in successors of MBB.
1271 static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
1272  for (auto *BB : MBB->successors())
1273  if (BB->isLiveIn(AArch64::NZCV))
1274  return true;
1275  return false;
1276 }
1277 
1278 namespace {
1279 
1280 struct UsedNZCV {
1281  bool N = false;
1282  bool Z = false;
1283  bool C = false;
1284  bool V = false;
1285 
1286  UsedNZCV() = default;
1287 
1288  UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
1289  this->N |= UsedFlags.N;
1290  this->Z |= UsedFlags.Z;
1291  this->C |= UsedFlags.C;
1292  this->V |= UsedFlags.V;
1293  return *this;
1294  }
1295 };
1296 
1297 } // end anonymous namespace
1298 
1299 /// Find a condition code used by the instruction.
1300 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1301 /// codes or we don't optimize CmpInstr in the presence of such instructions.
1302 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1303  switch (Instr.getOpcode()) {
1304  default:
1305  return AArch64CC::Invalid;
1306 
1307  case AArch64::Bcc: {
1308  int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1309  assert(Idx >= 2);
1310  return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm());
1311  }
1312 
1313  case AArch64::CSINVWr:
1314  case AArch64::CSINVXr:
1315  case AArch64::CSINCWr:
1316  case AArch64::CSINCXr:
1317  case AArch64::CSELWr:
1318  case AArch64::CSELXr:
1319  case AArch64::CSNEGWr:
1320  case AArch64::CSNEGXr:
1321  case AArch64::FCSELSrrr:
1322  case AArch64::FCSELDrrr: {
1323  int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1324  assert(Idx >= 1);
1325  return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm());
1326  }
1327  }
1328 }
1329 
1330 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1331  assert(CC != AArch64CC::Invalid);
1332  UsedNZCV UsedFlags;
1333  switch (CC) {
1334  default:
1335  break;
1336 
1337  case AArch64CC::EQ: // Z set
1338  case AArch64CC::NE: // Z clear
1339  UsedFlags.Z = true;
1340  break;
1341 
1342  case AArch64CC::HI: // Z clear and C set
1343  case AArch64CC::LS: // Z set or C clear
1344  UsedFlags.Z = true;
1345  LLVM_FALLTHROUGH;
1346  case AArch64CC::HS: // C set
1347  case AArch64CC::LO: // C clear
1348  UsedFlags.C = true;
1349  break;
1350 
1351  case AArch64CC::MI: // N set
1352  case AArch64CC::PL: // N clear
1353  UsedFlags.N = true;
1354  break;
1355 
1356  case AArch64CC::VS: // V set
1357  case AArch64CC::VC: // V clear
1358  UsedFlags.V = true;
1359  break;
1360 
1361  case AArch64CC::GT: // Z clear, N and V the same
1362  case AArch64CC::LE: // Z set, N and V differ
1363  UsedFlags.Z = true;
1364  LLVM_FALLTHROUGH;
1365  case AArch64CC::GE: // N and V the same
1366  case AArch64CC::LT: // N and V differ
1367  UsedFlags.N = true;
1368  UsedFlags.V = true;
1369  break;
1370  }
1371  return UsedFlags;
1372 }
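// Illustrative note (editorial sketch): for a "b.hi" user,
// getUsedNZCV(AArch64CC::HI) reports both Z and C as used, so a compare
// feeding it may only be removed when the substituted instruction produces
// those flags with identical meaning (see canInstrSubstituteCmpInstr below).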
1373 
1374 static bool isADDSRegImm(unsigned Opcode) {
1375  return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1376 }
1377 
1378 static bool isSUBSRegImm(unsigned Opcode) {
1379  return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1380 }
1381 
1382 /// Check if CmpInstr can be substituted by MI.
1383 ///
1384 /// CmpInstr can be substituted:
1385 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1386 /// - and, MI and CmpInstr are from the same MachineBB
1387 /// - and, condition flags are not alive in successors of the CmpInstr parent
1388 /// - and, if MI opcode is the S form there must be no defs of flags between
1389 /// MI and CmpInstr
1390 /// or if MI opcode is not the S form there must be neither defs of flags
1391 /// nor uses of flags between MI and CmpInstr.
1392 /// - and C/V flags are not used after CmpInstr
1393 static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
1394  const TargetRegisterInfo *TRI) {
1395  assert(MI);
1396  assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
1397  assert(CmpInstr);
1398 
1399  const unsigned CmpOpcode = CmpInstr->getOpcode();
1400  if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1401  return false;
1402 
1403  if (MI->getParent() != CmpInstr->getParent())
1404  return false;
1405 
1406  if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
1407  return false;
1408 
1409  AccessKind AccessToCheck = AK_Write;
1410  if (sForm(*MI) != MI->getOpcode())
1411  AccessToCheck = AK_All;
1412  if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
1413  return false;
1414 
1415  UsedNZCV NZCVUsedAfterCmp;
1416  for (auto I = std::next(CmpInstr->getIterator()),
1417  E = CmpInstr->getParent()->instr_end();
1418  I != E; ++I) {
1419  const MachineInstr &Instr = *I;
1420  if (Instr.readsRegister(AArch64::NZCV, TRI)) {
1421  AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1422  if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1423  return false;
1424  NZCVUsedAfterCmp |= getUsedNZCV(CC);
1425  }
1426 
1427  if (Instr.modifiesRegister(AArch64::NZCV, TRI))
1428  break;
1429  }
1430 
1431  return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
1432 }
1433 
1434 /// Substitute an instruction comparing to zero with another instruction
1435 /// which produces needed condition flags.
1436 ///
1437 /// Return true on success.
1438 bool AArch64InstrInfo::substituteCmpToZero(
1439  MachineInstr &CmpInstr, unsigned SrcReg,
1440  const MachineRegisterInfo *MRI) const {
1441  assert(MRI);
1442  // Get the unique definition of SrcReg.
1443  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
1444  if (!MI)
1445  return false;
1446 
1447  const TargetRegisterInfo *TRI = &getRegisterInfo();
1448 
1449  unsigned NewOpc = sForm(*MI);
1450  if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1451  return false;
1452 
1453  if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
1454  return false;
1455 
1456  // Update the instruction to set NZCV.
1457  MI->setDesc(get(NewOpc));
1458  CmpInstr.eraseFromParent();
1459  bool succeeded = UpdateOperandRegClass(*MI);
1460  (void)succeeded;
1461  assert(succeeded && "Some operands reg class are incompatible!");
1462  MI->addRegisterDefined(AArch64::NZCV, TRI);
1463  return true;
1464 }
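// Illustrative example (editorial sketch; virtual registers are hypothetical)
// of the substitution performed above. Before:
//   %3 = SUBWri %2, 1, 0
//   %4 = SUBSWri %3, 0, 0      ; cmp %3, #0 with %4 unused
//   Bcc 0, %bb.then            ; b.eq, reads only Z
// After:
//   %3 = SUBSWri %2, 1, 0      ; the defining instruction now sets NZCV
//   Bcc 0, %bb.then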
1465 
1466 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1467  if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1468  MI.getOpcode() != AArch64::CATCHRET)
1469  return false;
1470 
1471  MachineBasicBlock &MBB = *MI.getParent();
1472  DebugLoc DL = MI.getDebugLoc();
1473 
1474  if (MI.getOpcode() == AArch64::CATCHRET) {
1475  // Skip to the first instruction before the epilog.
1476  const TargetInstrInfo *TII =
1477  MBB.getParent()->getSubtarget().getInstrInfo();
1478  MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1479  auto MBBI = MachineBasicBlock::iterator(MI);
1480  MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1481  while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1482  FirstEpilogSEH != MBB.begin())
1483  FirstEpilogSEH = std::prev(FirstEpilogSEH);
1484  if (FirstEpilogSEH != MBB.begin())
1485  FirstEpilogSEH = std::next(FirstEpilogSEH);
1486  BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1487  .addReg(AArch64::X0, RegState::Define)
1488  .addMBB(TargetMBB);
1489  BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1490  .addReg(AArch64::X0, RegState::Define)
1491  .addReg(AArch64::X0)
1492  .addMBB(TargetMBB)
1493  .addImm(0);
1494  return true;
1495  }
1496 
1497  unsigned Reg = MI.getOperand(0).getReg();
1498  const GlobalValue *GV =
1499  cast<GlobalValue>((*MI.memoperands_begin())->getValue());
1500  const TargetMachine &TM = MBB.getParent()->getTarget();
1501  unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
1502  const unsigned char MO_NC = AArch64II::MO_NC;
1503 
1504  if ((OpFlags & AArch64II::MO_GOT) != 0) {
1505  BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
1506  .addGlobalAddress(GV, 0, OpFlags);
1507  BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1508  .addReg(Reg, RegState::Kill)
1509  .addImm(0)
1510  .addMemOperand(*MI.memoperands_begin());
1511  } else if (TM.getCodeModel() == CodeModel::Large) {
1512  BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
1513  .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
1514  .addImm(0);
1515  BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1516  .addReg(Reg, RegState::Kill)
1517  .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
1518  .addImm(16);
1519  BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1520  .addReg(Reg, RegState::Kill)
1521  .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
1522  .addImm(32);
1523  BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
1524  .addReg(Reg, RegState::Kill)
1525  .addGlobalAddress(GV, 0, AArch64II::MO_G3)
1526  .addImm(48);
1527  BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1528  .addReg(Reg, RegState::Kill)
1529  .addImm(0)
1530  .addMemOperand(*MI.memoperands_begin());
1531  } else if (TM.getCodeModel() == CodeModel::Tiny) {
1532  BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
1533  .addGlobalAddress(GV, 0, OpFlags);
1534  } else {
1535  BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
1536  .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
1537  unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
1538  BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
1539  .addReg(Reg, RegState::Kill)
1540  .addGlobalAddress(GV, 0, LoFlags)
1541  .addMemOperand(*MI.memoperands_begin());
1542  }
1543 
1544  MBB.erase(MI);
1545 
1546  return true;
1547 }
1548 
1549 // Return true if this instruction simply sets its single destination register
1550 // to zero. This is equivalent to a register rename of the zero-register.
1551 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
1552  switch (MI.getOpcode()) {
1553  default:
1554  break;
1555  case AArch64::MOVZWi:
1556  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
1557  if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
1558  assert(MI.getDesc().getNumOperands() == 3 &&
1559  MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
1560  return true;
1561  }
1562  break;
1563  case AArch64::ANDWri: // and Rd, Rzr, #imm
1564  return MI.getOperand(1).getReg() == AArch64::WZR;
1565  case AArch64::ANDXri:
1566  return MI.getOperand(1).getReg() == AArch64::XZR;
1567  case TargetOpcode::COPY:
1568  return MI.getOperand(1).getReg() == AArch64::WZR;
1569  }
1570  return false;
1571 }
1572 
1573 // Return true if this instruction simply renames a general register without
1574 // modifying bits.
1575 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
1576  switch (MI.getOpcode()) {
1577  default:
1578  break;
1579  case TargetOpcode::COPY: {
1580  // GPR32 copies will be lowered to ORRXrs
1581  unsigned DstReg = MI.getOperand(0).getReg();
1582  return (AArch64::GPR32RegClass.contains(DstReg) ||
1583  AArch64::GPR64RegClass.contains(DstReg));
1584  }
1585  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
1586  if (MI.getOperand(1).getReg() == AArch64::XZR) {
1587  assert(MI.getDesc().getNumOperands() == 4 &&
1588  MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
1589  return true;
1590  }
1591  break;
1592  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
1593  if (MI.getOperand(2).getImm() == 0) {
1594  assert(MI.getDesc().getNumOperands() == 4 &&
1595  MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
1596  return true;
1597  }
1598  break;
1599  }
1600  return false;
1601 }
1602 
1603 // Return true if this instruction simply renames a floating-point register
1604 // without modifying bits.
1605 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
1606  switch (MI.getOpcode()) {
1607  default:
1608  break;
1609  case TargetOpcode::COPY: {
1610  // FPR64 copies will be lowered to ORR.16b
1611  unsigned DstReg = MI.getOperand(0).getReg();
1612  return (AArch64::FPR64RegClass.contains(DstReg) ||
1613  AArch64::FPR128RegClass.contains(DstReg));
1614  }
1615  case AArch64::ORRv16i8:
1616  if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
1617  assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
1618  "invalid ORRv16i8 operands");
1619  return true;
1620  }
1621  break;
1622  }
1623  return false;
1624 }
1625 
1626 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
1627  int &FrameIndex) const {
1628  switch (MI.getOpcode()) {
1629  default:
1630  break;
1631  case AArch64::LDRWui:
1632  case AArch64::LDRXui:
1633  case AArch64::LDRBui:
1634  case AArch64::LDRHui:
1635  case AArch64::LDRSui:
1636  case AArch64::LDRDui:
1637  case AArch64::LDRQui:
1638  if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1639  MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1640  FrameIndex = MI.getOperand(1).getIndex();
1641  return MI.getOperand(0).getReg();
1642  }
1643  break;
1644  }
1645 
1646  return 0;
1647 }
1648 
1649 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
1650  int &FrameIndex) const {
1651  switch (MI.getOpcode()) {
1652  default:
1653  break;
1654  case AArch64::STRWui:
1655  case AArch64::STRXui:
1656  case AArch64::STRBui:
1657  case AArch64::STRHui:
1658  case AArch64::STRSui:
1659  case AArch64::STRDui:
1660  case AArch64::STRQui:
1661  if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1662  MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1663  FrameIndex = MI.getOperand(1).getIndex();
1664  return MI.getOperand(0).getReg();
1665  }
1666  break;
1667  }
1668  return 0;
1669 }
1670 
1671 /// Check all MachineMemOperands for a hint to suppress pairing.
1672 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
1673  return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1674  return MMO->getFlags() & MOSuppressPair;
1675  });
1676 }
1677 
1678 /// Set a flag on the first MachineMemOperand to suppress pairing.
1679 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
1680  if (MI.memoperands_empty())
1681  return;
1682  (*MI.memoperands_begin())->setFlags(MOSuppressPair);
1683 }
1684 
1685 /// Check all MachineMemOperands for a hint that the load/store is strided.
1686 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
1687  return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1688  return MMO->getFlags() & MOStridedAccess;
1689  });
1690 }
1691 
1692 bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) {
1693  switch (Opc) {
1694  default:
1695  return false;
1696  case AArch64::STURSi:
1697  case AArch64::STURDi:
1698  case AArch64::STURQi:
1699  case AArch64::STURBBi:
1700  case AArch64::STURHHi:
1701  case AArch64::STURWi:
1702  case AArch64::STURXi:
1703  case AArch64::LDURSi:
1704  case AArch64::LDURDi:
1705  case AArch64::LDURQi:
1706  case AArch64::LDURWi:
1707  case AArch64::LDURXi:
1708  case AArch64::LDURSWi:
1709  case AArch64::LDURHHi:
1710  case AArch64::LDURBBi:
1711  case AArch64::LDURSBWi:
1712  case AArch64::LDURSHWi:
1713  return true;
1714  }
1715 }
1716 
1717 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
1718  switch (MI.getOpcode()) {
1719  default:
1720  return false;
1721  // Scaled instructions.
1722  case AArch64::STRSui:
1723  case AArch64::STRDui:
1724  case AArch64::STRQui:
1725  case AArch64::STRXui:
1726  case AArch64::STRWui:
1727  case AArch64::LDRSui:
1728  case AArch64::LDRDui:
1729  case AArch64::LDRQui:
1730  case AArch64::LDRXui:
1731  case AArch64::LDRWui:
1732  case AArch64::LDRSWui:
1733  // Unscaled instructions.
1734  case AArch64::STURSi:
1735  case AArch64::STURDi:
1736  case AArch64::STURQi:
1737  case AArch64::STURWi:
1738  case AArch64::STURXi:
1739  case AArch64::LDURSi:
1740  case AArch64::LDURDi:
1741  case AArch64::LDURQi:
1742  case AArch64::LDURWi:
1743  case AArch64::LDURXi:
1744  case AArch64::LDURSWi:
1745  return true;
1746  }
1747 }
1748 
1749 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc,
1750  bool &Is64Bit) {
1751  switch (Opc) {
1752  default:
1753  llvm_unreachable("Opcode has no flag setting equivalent!");
1754  // 32-bit cases:
1755  case AArch64::ADDWri:
1756  Is64Bit = false;
1757  return AArch64::ADDSWri;
1758  case AArch64::ADDWrr:
1759  Is64Bit = false;
1760  return AArch64::ADDSWrr;
1761  case AArch64::ADDWrs:
1762  Is64Bit = false;
1763  return AArch64::ADDSWrs;
1764  case AArch64::ADDWrx:
1765  Is64Bit = false;
1766  return AArch64::ADDSWrx;
1767  case AArch64::ANDWri:
1768  Is64Bit = false;
1769  return AArch64::ANDSWri;
1770  case AArch64::ANDWrr:
1771  Is64Bit = false;
1772  return AArch64::ANDSWrr;
1773  case AArch64::ANDWrs:
1774  Is64Bit = false;
1775  return AArch64::ANDSWrs;
1776  case AArch64::BICWrr:
1777  Is64Bit = false;
1778  return AArch64::BICSWrr;
1779  case AArch64::BICWrs:
1780  Is64Bit = false;
1781  return AArch64::BICSWrs;
1782  case AArch64::SUBWri:
1783  Is64Bit = false;
1784  return AArch64::SUBSWri;
1785  case AArch64::SUBWrr:
1786  Is64Bit = false;
1787  return AArch64::SUBSWrr;
1788  case AArch64::SUBWrs:
1789  Is64Bit = false;
1790  return AArch64::SUBSWrs;
1791  case AArch64::SUBWrx:
1792  Is64Bit = false;
1793  return AArch64::SUBSWrx;
1794  // 64-bit cases:
1795  case AArch64::ADDXri:
1796  Is64Bit = true;
1797  return AArch64::ADDSXri;
1798  case AArch64::ADDXrr:
1799  Is64Bit = true;
1800  return AArch64::ADDSXrr;
1801  case AArch64::ADDXrs:
1802  Is64Bit = true;
1803  return AArch64::ADDSXrs;
1804  case AArch64::ADDXrx:
1805  Is64Bit = true;
1806  return AArch64::ADDSXrx;
1807  case AArch64::ANDXri:
1808  Is64Bit = true;
1809  return AArch64::ANDSXri;
1810  case AArch64::ANDXrr:
1811  Is64Bit = true;
1812  return AArch64::ANDSXrr;
1813  case AArch64::ANDXrs:
1814  Is64Bit = true;
1815  return AArch64::ANDSXrs;
1816  case AArch64::BICXrr:
1817  Is64Bit = true;
1818  return AArch64::BICSXrr;
1819  case AArch64::BICXrs:
1820  Is64Bit = true;
1821  return AArch64::BICSXrs;
1822  case AArch64::SUBXri:
1823  Is64Bit = true;
1824  return AArch64::SUBSXri;
1825  case AArch64::SUBXrr:
1826  Is64Bit = true;
1827  return AArch64::SUBSXrr;
1828  case AArch64::SUBXrs:
1829  Is64Bit = true;
1830  return AArch64::SUBSXrs;
1831  case AArch64::SUBXrx:
1832  Is64Bit = true;
1833  return AArch64::SUBSXrx;
1834  }
1835 }
1836 
1837 // Is this a candidate for ld/st merging or pairing? For example, we don't
1838 // touch volatiles or load/stores that have a hint to avoid pair formation.
1839 bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const {
1840  // If this is a volatile load/store, don't mess with it.
1841  if (MI.hasOrderedMemoryRef())
1842  return false;
1843 
1844  // Make sure this is a reg/fi+imm (as opposed to an address reloc).
1845  assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) &&
1846  "Expected a reg or frame index operand.");
1847  if (!MI.getOperand(2).isImm())
1848  return false;
1849 
1850  // Can't merge/pair if the instruction modifies the base register.
1851  // e.g., ldr x0, [x0]
1852  // This case will never occur with an FI base.
1853  if (MI.getOperand(1).isReg()) {
1854  unsigned BaseReg = MI.getOperand(1).getReg();
1855  const TargetRegisterInfo *TRI = &getRegisterInfo();
1856  if (MI.modifiesRegister(BaseReg, TRI))
1857  return false;
1858  }
1859 
1860  // Check if this load/store has a hint to avoid pair formation.
1861  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
1862  if (isLdStPairSuppressed(MI))
1863  return false;
1864 
1865  // On some CPUs quad load/store pairs are slower than two single load/stores.
1866  if (Subtarget.isPaired128Slow()) {
1867  switch (MI.getOpcode()) {
1868  default:
1869  break;
1870  case AArch64::LDURQi:
1871  case AArch64::STURQi:
1872  case AArch64::LDRQui:
1873  case AArch64::STRQui:
1874  return false;
1875  }
1876  }
1877 
1878  return true;
1879 }
1880 
1881 bool AArch64InstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
1882  MachineOperand *&BaseOp,
1883  int64_t &Offset,
1884  const TargetRegisterInfo *TRI) const {
1885  unsigned Width;
1886  return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI);
1887 }
1888 
1889 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
1890  MachineInstr &LdSt, MachineOperand *&BaseOp, int64_t &Offset,
1891  unsigned &Width, const TargetRegisterInfo *TRI) const {
1892  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
1893  // Handle only loads/stores with base register followed by immediate offset.
1894  if (LdSt.getNumExplicitOperands() == 3) {
1895  // Non-paired instruction (e.g., ldr x1, [x0, #8]).
1896  if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
1897  !LdSt.getOperand(2).isImm())
1898  return false;
1899  } else if (LdSt.getNumExplicitOperands() == 4) {
1900  // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
1901  if (!LdSt.getOperand(1).isReg() ||
1902  (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
1903  !LdSt.getOperand(3).isImm())
1904  return false;
1905  } else
1906  return false;
1907 
1908  // Get the scaling factor for the instruction and set the width for the
1909  // instruction.
1910  unsigned Scale = 0;
1911  int64_t Dummy1, Dummy2;
1912 
1913  // If this returns false, then it's an instruction we don't want to handle.
1914  if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
1915  return false;
1916 
1917  // Compute the offset. Offset is calculated as the immediate operand
1918  // multiplied by the scaling factor. Unscaled instructions have scaling factor
1919  // set to 1.
1920  if (LdSt.getNumExplicitOperands() == 3) {
1921  BaseOp = &LdSt.getOperand(1);
1922  Offset = LdSt.getOperand(2).getImm() * Scale;
1923  } else {
1924  assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
1925  BaseOp = &LdSt.getOperand(2);
1926  Offset = LdSt.getOperand(3).getImm() * Scale;
1927  }
1928 
1929  assert((BaseOp->isReg() || BaseOp->isFI()) &&
1930  "getMemOperandWithOffset only supports base "
1931  "operands of type register or frame index.");
1932 
1933  return true;
1934 }
1935 
1936 MachineOperand &
1937 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
1938  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
1939  MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
1940  assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
1941  return OfsOp;
1942 }
1943 
1944 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
1945  unsigned &Width, int64_t &MinOffset,
1946  int64_t &MaxOffset) const {
1947  switch (Opcode) {
1948  // Not a memory operation, or not something we want to handle.
1949  default:
1950  Scale = Width = 0;
1951  MinOffset = MaxOffset = 0;
1952  return false;
1953  case AArch64::STRWpost:
1954  case AArch64::LDRWpost:
1955  Width = 32;
1956  Scale = 4;
1957  MinOffset = -256;
1958  MaxOffset = 255;
1959  break;
1960  case AArch64::LDURQi:
1961  case AArch64::STURQi:
1962  Width = 16;
1963  Scale = 1;
1964  MinOffset = -256;
1965  MaxOffset = 255;
1966  break;
1967  case AArch64::LDURXi:
1968  case AArch64::LDURDi:
1969  case AArch64::STURXi:
1970  case AArch64::STURDi:
1971  Width = 8;
1972  Scale = 1;
1973  MinOffset = -256;
1974  MaxOffset = 255;
1975  break;
1976  case AArch64::LDURWi:
1977  case AArch64::LDURSi:
1978  case AArch64::LDURSWi:
1979  case AArch64::STURWi:
1980  case AArch64::STURSi:
1981  Width = 4;
1982  Scale = 1;
1983  MinOffset = -256;
1984  MaxOffset = 255;
1985  break;
1986  case AArch64::LDURHi:
1987  case AArch64::LDURHHi:
1988  case AArch64::LDURSHXi:
1989  case AArch64::LDURSHWi:
1990  case AArch64::STURHi:
1991  case AArch64::STURHHi:
1992  Width = 2;
1993  Scale = 1;
1994  MinOffset = -256;
1995  MaxOffset = 255;
1996  break;
1997  case AArch64::LDURBi:
1998  case AArch64::LDURBBi:
1999  case AArch64::LDURSBXi:
2000  case AArch64::LDURSBWi:
2001  case AArch64::STURBi:
2002  case AArch64::STURBBi:
2003  Width = 1;
2004  Scale = 1;
2005  MinOffset = -256;
2006  MaxOffset = 255;
2007  break;
2008  case AArch64::LDPQi:
2009  case AArch64::LDNPQi:
2010  case AArch64::STPQi:
2011  case AArch64::STNPQi:
2012  Scale = 16;
2013  Width = 32;
2014  MinOffset = -64;
2015  MaxOffset = 63;
2016  break;
2017  case AArch64::LDRQui:
2018  case AArch64::STRQui:
2019  Scale = Width = 16;
2020  MinOffset = 0;
2021  MaxOffset = 4095;
2022  break;
2023  case AArch64::LDPXi:
2024  case AArch64::LDPDi:
2025  case AArch64::LDNPXi:
2026  case AArch64::LDNPDi:
2027  case AArch64::STPXi:
2028  case AArch64::STPDi:
2029  case AArch64::STNPXi:
2030  case AArch64::STNPDi:
2031  Scale = 8;
2032  Width = 16;
2033  MinOffset = -64;
2034  MaxOffset = 63;
2035  break;
2036  case AArch64::LDRXui:
2037  case AArch64::LDRDui:
2038  case AArch64::STRXui:
2039  case AArch64::STRDui:
2040  Scale = Width = 8;
2041  MinOffset = 0;
2042  MaxOffset = 4095;
2043  break;
2044  case AArch64::LDPWi:
2045  case AArch64::LDPSi:
2046  case AArch64::LDNPWi:
2047  case AArch64::LDNPSi:
2048  case AArch64::STPWi:
2049  case AArch64::STPSi:
2050  case AArch64::STNPWi:
2051  case AArch64::STNPSi:
2052  Scale = 4;
2053  Width = 8;
2054  MinOffset = -64;
2055  MaxOffset = 63;
2056  break;
2057  case AArch64::LDRWui:
2058  case AArch64::LDRSui:
2059  case AArch64::LDRSWui:
2060  case AArch64::STRWui:
2061  case AArch64::STRSui:
2062  Scale = Width = 4;
2063  MinOffset = 0;
2064  MaxOffset = 4095;
2065  break;
2066  case AArch64::LDRHui:
2067  case AArch64::LDRHHui:
2068  case AArch64::STRHui:
2069  case AArch64::STRHHui:
2070  Scale = Width = 2;
2071  MinOffset = 0;
2072  MaxOffset = 4095;
2073  break;
2074  case AArch64::LDRBui:
2075  case AArch64::LDRBBui:
2076  case AArch64::STRBui:
2077  case AArch64::STRBBui:
2078  Scale = Width = 1;
2079  MinOffset = 0;
2080  MaxOffset = 4095;
2081  break;
2082  }
2083 
2084  return true;
2085 }
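// Illustrative query against the table above; TII is assumed to point at this
// AArch64InstrInfo instance.
//   unsigned Scale, Width;
//   int64_t MinOffset, MaxOffset;
//   if (TII->getMemOpInfo(AArch64::LDRXui, Scale, Width, MinOffset, MaxOffset)) {
//     // Scaled 64-bit load: Scale == Width == 8, MinOffset == 0,
//     // MaxOffset == 4095, i.e. an encodable byte range of [0, 8 * 4095].
//   }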
2086 
2087 static unsigned getOffsetStride(unsigned Opc) {
2088  switch (Opc) {
2089  default:
2090  return 0;
2091  case AArch64::LDURQi:
2092  case AArch64::STURQi:
2093  return 16;
2094  case AArch64::LDURXi:
2095  case AArch64::LDURDi:
2096  case AArch64::STURXi:
2097  case AArch64::STURDi:
2098  return 8;
2099  case AArch64::LDURWi:
2100  case AArch64::LDURSi:
2101  case AArch64::LDURSWi:
2102  case AArch64::STURWi:
2103  case AArch64::STURSi:
2104  return 4;
2105  }
2106 }
2107 
2108 // Scale the unscaled offsets. Returns false if the unscaled offset can't be
2109 // scaled.
2110 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
2111  unsigned OffsetStride = getOffsetStride(Opc);
2112  if (OffsetStride == 0)
2113  return false;
2114  // If the byte-offset isn't a multiple of the stride, we can't scale this
2115  // offset.
2116  if (Offset % OffsetStride != 0)
2117  return false;
2118 
2119  // Convert the byte-offset used by unscaled into an "element" offset used
2120  // by the scaled pair load/store instructions.
2121  Offset /= OffsetStride;
2122  return true;
2123 }
2124 
2125 // Unscale the scaled offsets. Returns false if the scaled offset can't be
2126 // unscaled.
2127 static bool unscaleOffset(unsigned Opc, int64_t &Offset) {
2128  unsigned OffsetStride = getOffsetStride(Opc);
2129  if (OffsetStride == 0)
2130  return false;
2131 
2132  // Convert the "element" offset used by scaled pair load/store instructions
2133  // into the byte-offset used by unscaled.
2134  Offset *= OffsetStride;
2135  return true;
2136 }
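// Worked example for the two helpers above, using LDURXi (stride 8):
//   int64_t Off = 16;
//   scaleOffset(AArch64::LDURXi, Off);   // true, Off becomes 2 (element index)
//   unscaleOffset(AArch64::LDURXi, Off); // true, Off is back to 16 bytes
// A byte offset of 12 cannot be scaled for this opcode since 12 % 8 != 0.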
2137 
2138 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
2139  if (FirstOpc == SecondOpc)
2140  return true;
2141  // We can also pair sign-ext and zero-ext instructions.
2142  switch (FirstOpc) {
2143  default:
2144  return false;
2145  case AArch64::LDRWui:
2146  case AArch64::LDURWi:
2147  return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
2148  case AArch64::LDRSWui:
2149  case AArch64::LDURSWi:
2150  return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
2151  }
2152  // These instructions can't be paired based on their opcodes.
2153  return false;
2154 }
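// Example of the extension special case above: a zero-extending LDRWui and a
// sign-extending LDRSWui may still form a pair, so
// canPairLdStOpc(AArch64::LDRWui, AArch64::LDRSWui) returns true, while two
// opcodes of different access sizes fall through to the default and return
// false.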
2155 
2156 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
2157  int64_t Offset1, unsigned Opcode1, int FI2,
2158  int64_t Offset2, unsigned Opcode2) {
2159  // Accesses through fixed stack object frame indices may access a different
2160  // fixed stack slot. Check that the object offsets + offsets match.
2161  if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
2162  int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
2163  int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
2164  assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
2165  // Get the byte-offset from the object offset.
2166  if (!unscaleOffset(Opcode1, Offset1) || !unscaleOffset(Opcode2, Offset2))
2167  return false;
2168  ObjectOffset1 += Offset1;
2169  ObjectOffset2 += Offset2;
2170  // Get the "element" index in the object.
2171  if (!scaleOffset(Opcode1, ObjectOffset1) ||
2172  !scaleOffset(Opcode2, ObjectOffset2))
2173  return false;
2174  return ObjectOffset1 + 1 == ObjectOffset2;
2175  }
2176 
2177  return FI1 == FI2;
2178 }
2179 
2180 /// Detect opportunities for ldp/stp formation.
2181 ///
2182 /// Only called for LdSt for which getMemOperandWithOffset returns true.
2183 bool AArch64InstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,
2184  MachineOperand &BaseOp2,
2185  unsigned NumLoads) const {
2186  MachineInstr &FirstLdSt = *BaseOp1.getParent();
2187  MachineInstr &SecondLdSt = *BaseOp2.getParent();
2188  if (BaseOp1.getType() != BaseOp2.getType())
2189  return false;
2190 
2191  assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
2192  "Only base registers and frame indices are supported.");
2193 
2194  // Check for both base regs and base FI.
2195  if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
2196  return false;
2197 
2198  // Only cluster up to a single pair.
2199  if (NumLoads > 1)
2200  return false;
2201 
2202  if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
2203  return false;
2204 
2205  // Can we pair these instructions based on their opcodes?
2206  unsigned FirstOpc = FirstLdSt.getOpcode();
2207  unsigned SecondOpc = SecondLdSt.getOpcode();
2208  if (!canPairLdStOpc(FirstOpc, SecondOpc))
2209  return false;
2210 
2211  // Can't merge volatiles or load/stores that have a hint to avoid pair
2212  // formation, for example.
2213  if (!isCandidateToMergeOrPair(FirstLdSt) ||
2214  !isCandidateToMergeOrPair(SecondLdSt))
2215  return false;
2216 
2217  // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
2218  int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
2219  if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
2220  return false;
2221 
2222  int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
2223  if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
2224  return false;
2225 
2226  // Pairwise instructions have a 7-bit signed offset field.
2227  if (Offset1 > 63 || Offset1 < -64)
2228  return false;
2229 
2230  // The caller should already have ordered First/SecondLdSt by offset.
2231  // Note: except for non-equal frame index bases
2232  if (BaseOp1.isFI()) {
2233  assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 >= Offset2) &&
2234  "Caller should have ordered offsets.");
2235 
2236  const MachineFrameInfo &MFI =
2237  FirstLdSt.getParent()->getParent()->getFrameInfo();
2238  return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
2239  BaseOp2.getIndex(), Offset2, SecondOpc);
2240  }
2241 
2242  assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
2243  "Caller should have ordered offsets.");
2244 
2245  return Offset1 + 1 == Offset2;
2246 }
2247 
2248 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
2249  unsigned Reg, unsigned SubIdx,
2250  unsigned State,
2251  const TargetRegisterInfo *TRI) {
2252  if (!SubIdx)
2253  return MIB.addReg(Reg, State);
2254 
2255  if (TargetRegisterInfo::isPhysicalRegister(Reg))
2256  return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
2257  return MIB.addReg(Reg, State, SubIdx);
2258 }
2259 
2260 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
2261  unsigned NumRegs) {
2262  // We really want the positive remainder mod 32 here; that happens to be
2263  // easily obtainable with a mask.
2264  return ((DestReg - SrcReg) & 0x1f) < NumRegs;
2265 }
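// Example: copying a D-register triple that starts at d1 into the triple that
// starts at d2 gives ((2 - 1) & 0x1f) == 1, which is < 3, so a front-to-back
// copy would clobber d2/d3 before they are read; copyPhysRegTuple() below
// therefore walks the sub-registers back-to-front in that case.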
2266 
2267 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
2268  MachineBasicBlock::iterator I,
2269  const DebugLoc &DL, unsigned DestReg,
2270  unsigned SrcReg, bool KillSrc,
2271  unsigned Opcode,
2272  ArrayRef<unsigned> Indices) const {
2273  assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
2274  const TargetRegisterInfo *TRI = &getRegisterInfo();
2275  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2276  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2277  unsigned NumRegs = Indices.size();
2278 
2279  int SubReg = 0, End = NumRegs, Incr = 1;
2280  if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
2281  SubReg = NumRegs - 1;
2282  End = -1;
2283  Incr = -1;
2284  }
2285 
2286  for (; SubReg != End; SubReg += Incr) {
2287  const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2288  AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2289  AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
2290  AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2291  }
2292 }
2293 
2294 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
2295  MachineBasicBlock::iterator I,
2296  DebugLoc DL, unsigned DestReg,
2297  unsigned SrcReg, bool KillSrc,
2298  unsigned Opcode, unsigned ZeroReg,
2299  llvm::ArrayRef<unsigned> Indices) const {
2300  const TargetRegisterInfo *TRI = &getRegisterInfo();
2301  unsigned NumRegs = Indices.size();
2302 
2303 #ifndef NDEBUG
2304  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
2305  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
2306  assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
2307  "GPR reg sequences should not be able to overlap");
2308 #endif
2309 
2310  for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
2311  const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
2312  AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
2313  MIB.addReg(ZeroReg);
2314  AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
2315  MIB.addImm(0);
2316  }
2317 }
2318 
2319 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
2320  MachineBasicBlock::iterator I,
2321  const DebugLoc &DL, unsigned DestReg,
2322  unsigned SrcReg, bool KillSrc) const {
2323  if (AArch64::GPR32spRegClass.contains(DestReg) &&
2324  (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
2325  const TargetRegisterInfo *TRI = &getRegisterInfo();
2326 
2327  if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
2328  // If either operand is WSP, expand to ADD #0.
2329  if (Subtarget.hasZeroCycleRegMove()) {
2330  // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
2331  unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
2332  &AArch64::GPR64spRegClass);
2333  unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
2334  &AArch64::GPR64spRegClass);
2335  // This instruction is reading and writing X registers. This may upset
2336  // the register scavenger and machine verifier, so we need to indicate
2337  // that we are reading an undefined value from SrcRegX, but a proper
2338  // value from SrcReg.
2339  BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
2340  .addReg(SrcRegX, RegState::Undef)
2341  .addImm(0)
2342  .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
2343  .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2344  } else {
2345  BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
2346  .addReg(SrcReg, getKillRegState(KillSrc))
2347  .addImm(0)
2348  .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2349  }
2350  } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
2351  BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
2352  .addImm(0)
2353  .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2354  } else {
2355  if (Subtarget.hasZeroCycleRegMove()) {
2356  // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
2357  unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
2358  &AArch64::GPR64spRegClass);
2359  unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
2360  &AArch64::GPR64spRegClass);
2361  // This instruction is reading and writing X registers. This may upset
2362  // the register scavenger and machine verifier, so we need to indicate
2363  // that we are reading an undefined value from SrcRegX, but a proper
2364  // value from SrcReg.
2365  BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
2366  .addReg(AArch64::XZR)
2367  .addReg(SrcRegX, RegState::Undef)
2368  .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
2369  } else {
2370  // Otherwise, expand to ORR WZR.
2371  BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
2372  .addReg(AArch64::WZR)
2373  .addReg(SrcReg, getKillRegState(KillSrc));
2374  }
2375  }
2376  return;
2377  }
2378 
2379  if (AArch64::GPR64spRegClass.contains(DestReg) &&
2380  (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
2381  if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
2382  // If either operand is SP, expand to ADD #0.
2383  BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
2384  .addReg(SrcReg, getKillRegState(KillSrc))
2385  .addImm(0)
2386  .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2387  } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
2388  BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
2389  .addImm(0)
2390  .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
2391  } else {
2392  // Otherwise, expand to ORR XZR.
2393  BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
2394  .addReg(AArch64::XZR)
2395  .addReg(SrcReg, getKillRegState(KillSrc));
2396  }
2397  return;
2398  }
2399 
2400  // Copy a DDDD register quad by copying the individual sub-registers.
2401  if (AArch64::DDDDRegClass.contains(DestReg) &&
2402  AArch64::DDDDRegClass.contains(SrcReg)) {
2403  static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2404  AArch64::dsub2, AArch64::dsub3};
2405  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2406  Indices);
2407  return;
2408  }
2409 
2410  // Copy a DDD register triple by copying the individual sub-registers.
2411  if (AArch64::DDDRegClass.contains(DestReg) &&
2412  AArch64::DDDRegClass.contains(SrcReg)) {
2413  static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
2414  AArch64::dsub2};
2415  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2416  Indices);
2417  return;
2418  }
2419 
2420  // Copy a DD register pair by copying the individual sub-registers.
2421  if (AArch64::DDRegClass.contains(DestReg) &&
2422  AArch64::DDRegClass.contains(SrcReg)) {
2423  static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
2424  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
2425  Indices);
2426  return;
2427  }
2428 
2429  // Copy a QQQQ register quad by copying the individual sub-registers.
2430  if (AArch64::QQQQRegClass.contains(DestReg) &&
2431  AArch64::QQQQRegClass.contains(SrcReg)) {
2432  static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2433  AArch64::qsub2, AArch64::qsub3};
2434  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2435  Indices);
2436  return;
2437  }
2438 
2439  // Copy a QQQ register triple by copying the individual sub-registers.
2440  if (AArch64::QQQRegClass.contains(DestReg) &&
2441  AArch64::QQQRegClass.contains(SrcReg)) {
2442  static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
2443  AArch64::qsub2};
2444  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2445  Indices);
2446  return;
2447  }
2448 
2449  // Copy a QQ register pair by copying the individual sub-registers.
2450  if (AArch64::QQRegClass.contains(DestReg) &&
2451  AArch64::QQRegClass.contains(SrcReg)) {
2452  static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
2453  copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2454  Indices);
2455  return;
2456  }
2457 
2458  if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
2459  AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
2460  static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
2461  copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
2462  AArch64::XZR, Indices);
2463  return;
2464  }
2465 
2466  if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
2467  AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
2468  static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
2469  copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
2470  AArch64::WZR, Indices);
2471  return;
2472  }
2473 
2474  if (AArch64::FPR128RegClass.contains(DestReg) &&
2475  AArch64::FPR128RegClass.contains(SrcReg)) {
2476  if (Subtarget.hasNEON()) {
2477  BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2478  .addReg(SrcReg)
2479  .addReg(SrcReg, getKillRegState(KillSrc));
2480  } else {
2481  BuildMI(MBB, I, DL, get(AArch64::STRQpre))
2482  .addReg(AArch64::SP, RegState::Define)
2483  .addReg(SrcReg, getKillRegState(KillSrc))
2484  .addReg(AArch64::SP)
2485  .addImm(-16);
2486  BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
2487  .addReg(AArch64::SP, RegState::Define)
2488  .addReg(DestReg, RegState::Define)
2489  .addReg(AArch64::SP)
2490  .addImm(16);
2491  }
2492  return;
2493  }
2494 
2495  if (AArch64::FPR64RegClass.contains(DestReg) &&
2496  AArch64::FPR64RegClass.contains(SrcReg)) {
2497  if (Subtarget.hasNEON()) {
2498  DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
2499  &AArch64::FPR128RegClass);
2500  SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
2501  &AArch64::FPR128RegClass);
2502  BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2503  .addReg(SrcReg)
2504  .addReg(SrcReg, getKillRegState(KillSrc));
2505  } else {
2506  BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
2507  .addReg(SrcReg, getKillRegState(KillSrc));
2508  }
2509  return;
2510  }
2511 
2512  if (AArch64::FPR32RegClass.contains(DestReg) &&
2513  AArch64::FPR32RegClass.contains(SrcReg)) {
2514  if (Subtarget.hasNEON()) {
2515  DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
2516  &AArch64::FPR128RegClass);
2517  SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
2518  &AArch64::FPR128RegClass);
2519  BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2520  .addReg(SrcReg)
2521  .addReg(SrcReg, getKillRegState(KillSrc));
2522  } else {
2523  BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2524  .addReg(SrcReg, getKillRegState(KillSrc));
2525  }
2526  return;
2527  }
2528 
2529  if (AArch64::FPR16RegClass.contains(DestReg) &&
2530  AArch64::FPR16RegClass.contains(SrcReg)) {
2531  if (Subtarget.hasNEON()) {
2532  DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2533  &AArch64::FPR128RegClass);
2534  SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2535  &AArch64::FPR128RegClass);
2536  BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2537  .addReg(SrcReg)
2538  .addReg(SrcReg, getKillRegState(KillSrc));
2539  } else {
2540  DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2541  &AArch64::FPR32RegClass);
2542  SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2543  &AArch64::FPR32RegClass);
2544  BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2545  .addReg(SrcReg, getKillRegState(KillSrc));
2546  }
2547  return;
2548  }
2549 
2550  if (AArch64::FPR8RegClass.contains(DestReg) &&
2551  AArch64::FPR8RegClass.contains(SrcReg)) {
2552  if (Subtarget.hasNEON()) {
2553  DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
2554  &AArch64::FPR128RegClass);
2555  SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
2556  &AArch64::FPR128RegClass);
2557  BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2558  .addReg(SrcReg)
2559  .addReg(SrcReg, getKillRegState(KillSrc));
2560  } else {
2561  DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
2562  &AArch64::FPR32RegClass);
2563  SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
2564  &AArch64::FPR32RegClass);
2565  BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2566  .addReg(SrcReg, getKillRegState(KillSrc));
2567  }
2568  return;
2569  }
2570 
2571  // Copies between GPR64 and FPR64.
2572  if (AArch64::FPR64RegClass.contains(DestReg) &&
2573  AArch64::GPR64RegClass.contains(SrcReg)) {
2574  BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
2575  .addReg(SrcReg, getKillRegState(KillSrc));
2576  return;
2577  }
2578  if (AArch64::GPR64RegClass.contains(DestReg) &&
2579  AArch64::FPR64RegClass.contains(SrcReg)) {
2580  BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
2581  .addReg(SrcReg, getKillRegState(KillSrc));
2582  return;
2583  }
2584  // Copies between GPR32 and FPR32.
2585  if (AArch64::FPR32RegClass.contains(DestReg) &&
2586  AArch64::GPR32RegClass.contains(SrcReg)) {
2587  BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
2588  .addReg(SrcReg, getKillRegState(KillSrc));
2589  return;
2590  }
2591  if (AArch64::GPR32RegClass.contains(DestReg) &&
2592  AArch64::FPR32RegClass.contains(SrcReg)) {
2593  BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
2594  .addReg(SrcReg, getKillRegState(KillSrc));
2595  return;
2596  }
2597 
2598  if (DestReg == AArch64::NZCV) {
2599  assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
2600  BuildMI(MBB, I, DL, get(AArch64::MSR))
2601  .addImm(AArch64SysReg::NZCV)
2602  .addReg(SrcReg, getKillRegState(KillSrc))
2603  .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
2604  return;
2605  }
2606 
2607  if (SrcReg == AArch64::NZCV) {
2608  assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
2609  BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
2610  .addImm(AArch64SysReg::NZCV)
2611  .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
2612  return;
2613  }
2614 
2615  llvm_unreachable("unimplemented reg-to-reg copy");
2616 }
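// Example of the GPR32 path above on a subtarget without zero-cycle register
// moves or zeroing: copying w1 into w0 (neither operand being WSP) emits
// ORRWrr w0, wzr, w1, which prints as the "mov w0, w1" alias.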
2617 
2618 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
2619  MachineBasicBlock &MBB,
2620  MachineBasicBlock::iterator InsertBefore,
2621  const MCInstrDesc &MCID,
2622  unsigned SrcReg, bool IsKill,
2623  unsigned SubIdx0, unsigned SubIdx1, int FI,
2624  MachineMemOperand *MMO) {
2625  unsigned SrcReg0 = SrcReg;
2626  unsigned SrcReg1 = SrcReg;
2627  if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
2628  SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
2629  SubIdx0 = 0;
2630  SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
2631  SubIdx1 = 0;
2632  }
2633  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
2634  .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
2635  .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
2636  .addFrameIndex(FI)
2637  .addImm(0)
2638  .addMemOperand(MMO);
2639 }
2640 
2641 void AArch64InstrInfo::storeRegToStackSlot(
2642  MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg,
2643  bool isKill, int FI, const TargetRegisterClass *RC,
2644  const TargetRegisterInfo *TRI) const {
2645  MachineFunction &MF = *MBB.getParent();
2646  MachineFrameInfo &MFI = MF.getFrameInfo();
2647  unsigned Align = MFI.getObjectAlignment(FI);
2648 
2649  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
2650  MachineMemOperand *MMO = MF.getMachineMemOperand(
2651  PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
2652  unsigned Opc = 0;
2653  bool Offset = true;
2654  switch (TRI->getSpillSize(*RC)) {
2655  case 1:
2656  if (AArch64::FPR8RegClass.hasSubClassEq(RC))
2657  Opc = AArch64::STRBui;
2658  break;
2659  case 2:
2660  if (AArch64::FPR16RegClass.hasSubClassEq(RC))
2661  Opc = AArch64::STRHui;
2662  break;
2663  case 4:
2664  if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
2665  Opc = AArch64::STRWui;
2666  if (TargetRegisterInfo::isVirtualRegister(SrcReg))
2667  MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
2668  else
2669  assert(SrcReg != AArch64::WSP);
2670  } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
2671  Opc = AArch64::STRSui;
2672  break;
2673  case 8:
2674  if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
2675  Opc = AArch64::STRXui;
2676  if (TargetRegisterInfo::isVirtualRegister(SrcReg))
2677  MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
2678  else
2679  assert(SrcReg != AArch64::SP);
2680  } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
2681  Opc = AArch64::STRDui;
2682  } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
2683  storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
2684  get(AArch64::STPWi), SrcReg, isKill,
2685  AArch64::sube32, AArch64::subo32, FI, MMO);
2686  return;
2687  }
2688  break;
2689  case 16:
2690  if (AArch64::FPR128RegClass.hasSubClassEq(RC))
2691  Opc = AArch64::STRQui;
2692  else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
2693  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2694  Opc = AArch64::ST1Twov1d;
2695  Offset = false;
2696  } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
2697  storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
2698  get(AArch64::STPXi), SrcReg, isKill,
2699  AArch64::sube64, AArch64::subo64, FI, MMO);
2700  return;
2701  }
2702  break;
2703  case 24:
2704  if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
2705  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2706  Opc = AArch64::ST1Threev1d;
2707  Offset = false;
2708  }
2709  break;
2710  case 32:
2711  if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
2712  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2713  Opc = AArch64::ST1Fourv1d;
2714  Offset = false;
2715  } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
2716  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2717  Opc = AArch64::ST1Twov2d;
2718  Offset = false;
2719  }
2720  break;
2721  case 48:
2722  if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
2723  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2724  Opc = AArch64::ST1Threev2d;
2725  Offset = false;
2726  }
2727  break;
2728  case 64:
2729  if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
2730  assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
2731  Opc = AArch64::ST1Fourv2d;
2732  Offset = false;
2733  }
2734  break;
2735  }
2736  assert(Opc && "Unknown register class");
2737 
2738  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
2739  .addReg(SrcReg, getKillRegState(isKill))
2740  .addFrameIndex(FI);
2741 
2742  if (Offset)
2743  MI.addImm(0);
2744  MI.addMemOperand(MMO);
2745 }
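// Example: spilling a 128-bit vector register such as q0 hits the 16-byte case
// above and builds (roughly, in MIR terms)
//   STRQui killed $q0, %stack.N, 0 :: (store 16 into %stack.N)
// for whatever frame index N the caller passed in.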
2746 
2747 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
2748  MachineBasicBlock &MBB,
2749  MachineBasicBlock::iterator InsertBefore,
2750  const MCInstrDesc &MCID,
2751  unsigned DestReg, unsigned SubIdx0,
2752  unsigned SubIdx1, int FI,
2753  MachineMemOperand *MMO) {
2754  unsigned DestReg0 = DestReg;
2755  unsigned DestReg1 = DestReg;
2756  bool IsUndef = true;
2757  if (TargetRegisterInfo::isPhysicalRegister(DestReg)) {
2758  DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
2759  SubIdx0 = 0;
2760  DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
2761  SubIdx1 = 0;
2762  IsUndef = false;
2763  }
2764  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
2765  .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
2766  .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
2767  .addFrameIndex(FI)
2768  .addImm(0)
2769  .addMemOperand(MMO);
2770 }
2771 
2772 void AArch64InstrInfo::loadRegFromStackSlot(
2773  MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg,
2774  int FI, const TargetRegisterClass *RC,
2775  const TargetRegisterInfo *TRI) const {
2776  MachineFunction &MF = *MBB.getParent();
2777  MachineFrameInfo &MFI = MF.getFrameInfo();
2778  unsigned Align = MFI.getObjectAlignment(FI);
2779  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
2780  MachineMemOperand *MMO = MF.getMachineMemOperand(
2781  PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align);
2782 
2783  unsigned Opc = 0;
2784  bool Offset = true;
2785  switch (TRI->getSpillSize(*RC)) {
2786  case 1:
2787  if (AArch64::FPR8RegClass.hasSubClassEq(RC))
2788  Opc = AArch64::LDRBui;
2789  break;
2790  case 2:
2791  if (AArch64::FPR16RegClass.hasSubClassEq(RC))
2792  Opc = AArch64::LDRHui;
2793  break;
2794  case 4:
2795  if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
2796  Opc = AArch64::LDRWui;
2797  if (TargetRegisterInfo::isVirtualRegister(DestReg))
2798  MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
2799  else
2800  assert(DestReg != AArch64::WSP);
2801  } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
2802  Opc = AArch64::LDRSui;
2803  break;
2804  case 8:
2805  if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
2806  Opc = AArch64::LDRXui;
2807  if (TargetRegisterInfo::isVirtualRegister(DestReg))
2808  MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
2809  else
2810  assert(DestReg != AArch64::SP);
2811  } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
2812  Opc = AArch64::LDRDui;
2813  } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
2814  loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
2815  get(AArch64::LDPWi), DestReg, AArch64::sube32,
2816  AArch64::subo32, FI, MMO);
2817  return;
2818  }
2819  break;
2820  case 16:
2821  if (AArch64::FPR128RegClass.hasSubClassEq(RC))
2822  Opc = AArch64::LDRQui;
2823  else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
2824  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2825  Opc = AArch64::LD1Twov1d;
2826  Offset = false;
2827  } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
2828  loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
2829  get(AArch64::LDPXi), DestReg, AArch64::sube64,
2830  AArch64::subo64, FI, MMO);
2831  return;
2832  }
2833  break;
2834  case 24:
2835  if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
2836  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2837  Opc = AArch64::LD1Threev1d;
2838  Offset = false;
2839  }
2840  break;
2841  case 32:
2842  if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
2843  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2844  Opc = AArch64::LD1Fourv1d;
2845  Offset = false;
2846  } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
2847  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2848  Opc = AArch64::LD1Twov2d;
2849  Offset = false;
2850  }
2851  break;
2852  case 48:
2853  if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
2854  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2855  Opc = AArch64::LD1Threev2d;
2856  Offset = false;
2857  }
2858  break;
2859  case 64:
2860  if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
2861  assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
2862  Opc = AArch64::LD1Fourv2d;
2863  Offset = false;
2864  }
2865  break;
2866  }
2867  assert(Opc && "Unknown register class");
2868 
2869  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
2870  .addReg(DestReg, getDefRegState(true))
2871  .addFrameIndex(FI);
2872  if (Offset)
2873  MI.addImm(0);
2874  MI.addMemOperand(MMO);
2875 }
2876 
2877 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
2878  MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
2879  unsigned DestReg, unsigned SrcReg, int Offset,
2880  const TargetInstrInfo *TII,
2881  MachineInstr::MIFlag Flag, bool SetNZCV,
2882  bool NeedsWinCFI) {
2883  if (DestReg == SrcReg && Offset == 0)
2884  return;
2885 
2886  assert((DestReg != AArch64::SP || Offset % 16 == 0) &&
2887  "SP increment/decrement not 16-byte aligned");
2888 
2889  bool isSub = Offset < 0;
2890  if (isSub)
2891  Offset = -Offset;
2892 
2893  // FIXME: If the offset won't fit in 24-bits, compute the offset into a
2894  // scratch register. If DestReg is a virtual register, use it as the
2895  // scratch register; otherwise, create a new virtual register (to be
2896  // replaced by the scavenger at the end of PEI). That case can be optimized
2897  // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
2898  // register can be loaded with offset%8 and the add/sub can use an extending
2899  // instruction with LSL#3.
2900  // Currently the function handles any offsets but generates a poor sequence
2901  // of code.
2902  // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
2903 
2904  unsigned Opc;
2905  if (SetNZCV)
2906  Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri;
2907  else
2908  Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri;
2909  const unsigned MaxEncoding = 0xfff;
2910  const unsigned ShiftSize = 12;
2911  const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
2912  while (((unsigned)Offset) >= (1 << ShiftSize)) {
2913  unsigned ThisVal;
2914  if (((unsigned)Offset) > MaxEncodableValue) {
2915  ThisVal = MaxEncodableValue;
2916  } else {
2917  ThisVal = Offset & MaxEncodableValue;
2918  }
2919  assert((ThisVal >> ShiftSize) <= MaxEncoding &&
2920  "Encoding cannot handle value that big");
2921  BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
2922  .addReg(SrcReg)
2923  .addImm(ThisVal >> ShiftSize)
2924  .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize))
2925  .setMIFlag(Flag);
2926 
2927  if (NeedsWinCFI && SrcReg == AArch64::SP && DestReg == AArch64::SP)
2928  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
2929  .addImm(ThisVal)
2930  .setMIFlag(Flag);
2931 
2932  SrcReg = DestReg;
2933  Offset -= ThisVal;
2934  if (Offset == 0)
2935  return;
2936  }
2937  BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
2938  .addReg(SrcReg)
2939  .addImm(Offset)
2940  .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
2941  .setMIFlag(Flag);
2942 
2943  if (NeedsWinCFI) {
2944  if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
2945  (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
2946  if (Offset == 0)
2947  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).
2948  setMIFlag(Flag);
2949  else
2950  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)).
2951  addImm(Offset).setMIFlag(Flag);
2952  } else if (DestReg == AArch64::SP) {
2953  BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)).
2954  addImm(Offset).setMIFlag(Flag);
2955  }
2956  }
2957 }
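// Worked example of the chunking loop above: emitFrameOffset(MBB, MBBI, DL,
// AArch64::SP, AArch64::SP, 0x1001010, ...) decomposes the offset into three
// adds, since only a 12-bit immediate (optionally shifted left by 12) fits in
// each instruction:
//   add sp, sp, #0xfff, lsl #12   // consumes 0xfff000 bytes
//   add sp, sp, #2, lsl #12       // consumes 0x2000 bytes
//   add sp, sp, #16               // remaining 0x10 bytes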
2958 
2959 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
2960  MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
2961  MachineBasicBlock::iterator InsertPt, int FrameIndex,
2962  LiveIntervals *LIS) const {
2963  // This is a bit of a hack. Consider this instruction:
2964  //
2965  // %0 = COPY %sp; GPR64all:%0
2966  //
2967  // We explicitly chose GPR64all for the virtual register so such a copy might
2968  // be eliminated by RegisterCoalescer. However, that may not be possible, and
2969  // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
2970  // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
2971  //
2972  // To prevent that, we are going to constrain the %0 register class here.
2973  //
2974  // <rdar://problem/11522048>
2975  //
2976  if (MI.isFullCopy()) {
2977  unsigned DstReg = MI.getOperand(0).getReg();
2978  unsigned SrcReg = MI.getOperand(1).getReg();
2979  if (SrcReg == AArch64::SP &&
2980  TargetRegisterInfo::isVirtualRegister(DstReg)) {
2981  MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
2982  return nullptr;
2983  }
2984  if (DstReg == AArch64::SP &&
2985  TargetRegisterInfo::isVirtualRegister(SrcReg)) {
2986  MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
2987  return nullptr;
2988  }
2989  }
2990 
2991  // Handle the case where a copy is being spilled or filled but the source
2992  // and destination register classes don't match. For example:
2993  //
2994  // %0 = COPY %xzr; GPR64common:%0
2995  //
2996  // In this case we can still safely fold away the COPY and generate the
2997  // following spill code:
2998  //
2999  // STRXui %xzr, %stack.0
3000  //
3001  // This also eliminates spilled cross register class COPYs (e.g. between x and
3002  // d regs) of the same size. For example:
3003  //
3004  // %0 = COPY %1; GPR64:%0, FPR64:%1
3005  //
3006  // will be filled as
3007  //
3008  // LDRDui %0, fi<#0>
3009  //
3010  // instead of
3011  //
3012  // LDRXui %Temp, fi<#0>
3013  // %0 = FMOV %Temp
3014  //
3015  if (MI.isCopy() && Ops.size() == 1 &&
3016  // Make sure we're only folding the explicit COPY defs/uses.
3017  (Ops[0] == 0 || Ops[0] == 1)) {
3018  bool IsSpill = Ops[0] == 0;
3019  bool IsFill = !IsSpill;
3020  const TargetRegisterInfo &TRI = getRegisterInfo();
3021  const MachineRegisterInfo &MRI = MF.getRegInfo();
3022  MachineBasicBlock &MBB = *MI.getParent();
3023  const MachineOperand &DstMO = MI.getOperand(0);
3024  const MachineOperand &SrcMO = MI.getOperand(1);
3025  unsigned DstReg = DstMO.getReg();
3026  unsigned SrcReg = SrcMO.getReg();
3027  // This is slightly expensive to compute for physical regs since
3028  // getMinimalPhysRegClass is slow.
3029  auto getRegClass = [&](unsigned Reg) {
3030  return TargetRegisterInfo::isVirtualRegister(Reg)
3031  ? MRI.getRegClass(Reg)
3032  : TRI.getMinimalPhysRegClass(Reg);
3033  };
3034 
3035  if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
3036  assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
3037  TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
3038  "Mismatched register size in non subreg COPY");
3039  if (IsSpill)
3040  storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
3041  getRegClass(SrcReg), &TRI);
3042  else
3043  loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
3044  getRegClass(DstReg), &TRI);
3045  return &*--InsertPt;
3046  }
3047 
3048  // Handle cases like spilling def of:
3049  //
3050  // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
3051  //
3052  // where the physical register source can be widened and stored to the full
3053  // virtual reg destination stack slot, in this case producing:
3054  //
3055  // STRXui %xzr, %stack.0
3056  //
3057  if (IsSpill && DstMO.isUndef() &&
3058  TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
3059  assert(SrcMO.getSubReg() == 0 &&
3060  "Unexpected subreg on physical register");
3061  const TargetRegisterClass *SpillRC;
3062  unsigned SpillSubreg;
3063  switch (DstMO.getSubReg()) {
3064  default:
3065  SpillRC = nullptr;
3066  break;
3067  case AArch64::sub_32:
3068  case AArch64::ssub:
3069  if (AArch64::GPR32RegClass.contains(SrcReg)) {
3070  SpillRC = &AArch64::GPR64RegClass;
3071  SpillSubreg = AArch64::sub_32;
3072  } else if (AArch64::FPR32RegClass.contains(SrcReg)) {
3073  SpillRC = &AArch64::FPR64RegClass;
3074  SpillSubreg = AArch64::ssub;
3075  } else
3076  SpillRC = nullptr;
3077  break;
3078  case AArch64::dsub:
3079  if (AArch64::FPR64RegClass.contains(SrcReg)) {
3080  SpillRC = &AArch64::FPR128RegClass;
3081  SpillSubreg = AArch64::dsub;
3082  } else
3083  SpillRC = nullptr;
3084  break;
3085  }
3086 
3087  if (SpillRC)
3088  if (unsigned WidenedSrcReg =
3089  TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) {
3090  storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(),
3091  FrameIndex, SpillRC, &TRI);
3092  return &*--InsertPt;
3093  }
3094  }
3095 
3096  // Handle cases like filling use of:
3097  //
3098  // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
3099  //
3100  // where we can load the full virtual reg source stack slot, into the subreg
3101  // destination, in this case producing:
3102  //
3103  // LDRWui %0:sub_32<def,read-undef>, %stack.0
3104  //
3105  if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
3106  const TargetRegisterClass *FillRC;
3107  switch (DstMO.getSubReg()) {
3108  default:
3109  FillRC = nullptr;
3110  break;
3111  case AArch64::sub_32:
3112  FillRC = &AArch64::GPR32RegClass;
3113  break;
3114  case AArch64::ssub:
3115  FillRC = &AArch64::FPR32RegClass;
3116  break;
3117  case AArch64::dsub:
3118  FillRC = &AArch64::FPR64RegClass;
3119  break;
3120  }
3121 
3122  if (FillRC) {
3123  assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
3124  TRI.getRegSizeInBits(*FillRC) &&
3125  "Mismatched regclass size on folded subreg COPY");
3126  loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
3127  MachineInstr &LoadMI = *--InsertPt;
3128  MachineOperand &LoadDst = LoadMI.getOperand(0);
3129  assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
3130  LoadDst.setSubReg(DstMO.getSubReg());
3131  LoadDst.setIsUndef();
3132  return &LoadMI;
3133  }
3134  }
3135  }
3136 
3137  // Cannot fold.
3138  return nullptr;
3139 }
3140 
3141 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
3142  bool *OutUseUnscaledOp,
3143  unsigned *OutUnscaledOp,
3144  int *EmittableOffset) {
3145  int Scale = 1;
3146  bool IsSigned = false;
3147  // The ImmIdx should be changed case by case if it is not 2.
3148  unsigned ImmIdx = 2;
3149  unsigned UnscaledOp = 0;
3150  // Set output values in case of early exit.
3151  if (EmittableOffset)
3152  *EmittableOffset = 0;
3153  if (OutUseUnscaledOp)
3154  *OutUseUnscaledOp = false;
3155  if (OutUnscaledOp)
3156  *OutUnscaledOp = 0;
3157  switch (MI.getOpcode()) {
3158  default:
3159  llvm_unreachable("unhandled opcode in rewriteAArch64FrameIndex");
3160  // Vector spills/fills can't take an immediate offset.
3161  case AArch64::LD1Twov2d:
3162  case AArch64::LD1Threev2d:
3163  case AArch64::LD1Fourv2d:
3164  case AArch64::LD1Twov1d:
3165  case AArch64::LD1Threev1d:
3166  case AArch64::LD1Fourv1d:
3167  case AArch64::ST1Twov2d:
3168  case AArch64::ST1Threev2d:
3169  case AArch64::ST1Fourv2d:
3170  case AArch64::ST1Twov1d:
3171  case AArch64::ST1Threev1d:
3172  case AArch64::ST1Fourv1d:
3173  return AArch64FrameOffsetCannotUpdate;
3174  case AArch64::PRFMui:
3175  Scale = 8;
3176  UnscaledOp = AArch64::PRFUMi;
3177  break;
3178  case AArch64::LDRXui:
3179  Scale = 8;
3180  UnscaledOp = AArch64::LDURXi;
3181  break;
3182  case AArch64::LDRWui:
3183  Scale = 4;
3184  UnscaledOp = AArch64::LDURWi;
3185  break;
3186  case AArch64::LDRBui:
3187  Scale = 1;
3188  UnscaledOp = AArch64::LDURBi;
3189  break;
3190  case AArch64::LDRHui:
3191  Scale = 2;
3192  UnscaledOp = AArch64::LDURHi;
3193  break;
3194  case AArch64::LDRSui:
3195  Scale = 4;
3196  UnscaledOp = AArch64::LDURSi;
3197  break;
3198  case AArch64::LDRDui:
3199  Scale = 8;
3200  UnscaledOp = AArch64::LDURDi;
3201  break;
3202  case AArch64::LDRQui:
3203  Scale = 16;
3204  UnscaledOp = AArch64::LDURQi;
3205  break;
3206  case AArch64::LDRBBui:
3207  Scale = 1;
3208  UnscaledOp = AArch64::LDURBBi;
3209  break;
3210  case AArch64::LDRHHui:
3211  Scale = 2;
3212  UnscaledOp = AArch64::LDURHHi;
3213  break;
3214  case AArch64::LDRSBXui:
3215  Scale = 1;
3216  UnscaledOp = AArch64::LDURSBXi;
3217  break;
3218  case AArch64::LDRSBWui:
3219  Scale = 1;
3220  UnscaledOp = AArch64::LDURSBWi;
3221  break;
3222  case AArch64::LDRSHXui:
3223  Scale = 2;
3224  UnscaledOp = AArch64::LDURSHXi;
3225  break;
3226  case AArch64::LDRSHWui:
3227  Scale = 2;
3228  UnscaledOp = AArch64::LDURSHWi;
3229  break;
3230  case AArch64::LDRSWui:
3231  Scale = 4;
3232  UnscaledOp = AArch64::LDURSWi;
3233  break;
3234 
3235  case AArch64::STRXui:
3236  Scale = 8;
3237  UnscaledOp = AArch64::STURXi;
3238  break;
3239  case AArch64::STRWui:
3240  Scale = 4;
3241  UnscaledOp = AArch64::STURWi;
3242  break;
3243  case AArch64::STRBui:
3244  Scale = 1;
3245  UnscaledOp = AArch64::STURBi;
3246  break;
3247  case AArch64::STRHui:
3248  Scale = 2;
3249  UnscaledOp = AArch64::STURHi;
3250  break;
3251  case AArch64::STRSui:
3252  Scale = 4;
3253  UnscaledOp = AArch64::STURSi;
3254  break;
3255  case AArch64::STRDui:
3256  Scale = 8;
3257  UnscaledOp = AArch64::STURDi;
3258  break;
3259  case AArch64::STRQui:
3260  Scale = 16;
3261  UnscaledOp = AArch64::STURQi;
3262  break;
3263  case AArch64::STRBBui:
3264  Scale = 1;
3265  UnscaledOp = AArch64::STURBBi;
3266  break;
3267  case AArch64::STRHHui:
3268  Scale = 2;
3269  UnscaledOp = AArch64::STURHHi;
3270  break;
3271 
3272  case AArch64::LDPXi:
3273  case AArch64::LDPDi:
3274  case AArch64::STPXi:
3275  case AArch64::STPDi:
3276  case AArch64::LDNPXi:
3277  case AArch64::LDNPDi:
3278  case AArch64::STNPXi:
3279  case AArch64::STNPDi:
3280  ImmIdx = 3;
3281  IsSigned = true;
3282  Scale = 8;
3283  break;
3284  case AArch64::LDPQi:
3285  case AArch64::STPQi:
3286  case AArch64::LDNPQi:
3287  case AArch64::STNPQi:
3288  ImmIdx = 3;
3289  IsSigned = true;
3290  Scale = 16;
3291  break;
3292  case AArch64::LDPWi:
3293  case AArch64::LDPSi:
3294  case AArch64::STPWi:
3295  case AArch64::STPSi:
3296  case AArch64::LDNPWi:
3297  case AArch64::LDNPSi:
3298  case AArch64::STNPWi:
3299  case AArch64::STNPSi:
3300  ImmIdx = 3;
3301  IsSigned = true;
3302  Scale = 4;
3303  break;
3304 
3305  case AArch64::LDURXi:
3306  case AArch64::LDURWi:
3307  case AArch64::LDURBi:
3308  case AArch64::LDURHi:
3309  case AArch64::LDURSi:
3310  case AArch64::LDURDi:
3311  case AArch64::LDURQi:
3312  case AArch64::LDURHHi:
3313  case AArch64::LDURBBi:
3314  case AArch64::LDURSBXi:
3315  case AArch64::LDURSBWi:
3316  case AArch64::LDURSHXi:
3317  case AArch64::LDURSHWi:
3318  case AArch64::LDURSWi:
3319  case AArch64::STURXi:
3320  case AArch64::STURWi:
3321  case AArch64::STURBi:
3322  case AArch64::STURHi:
3323  case AArch64::STURSi:
3324  case AArch64::STURDi:
3325  case AArch64::STURQi:
3326  case AArch64::STURBBi:
3327  case AArch64::STURHHi:
3328  Scale = 1;
3329  break;
3330  }
3331 
3332  Offset += MI.getOperand(ImmIdx).getImm() * Scale;
3333 
3334  bool useUnscaledOp = false;
3335  // If the offset doesn't match the scale, we rewrite the instruction to
3336  // use the unscaled instruction instead. Likewise, if we have a negative
3337  // offset (and have an unscaled op to use).
3338  if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0))
3339  useUnscaledOp = true;
3340 
3341  // Use an unscaled addressing mode if the instruction has a negative offset
3342  // (or if the instruction is already using an unscaled addressing mode).
3343  unsigned MaskBits;
3344  if (IsSigned) {
3345  // ldp/stp instructions.
3346  MaskBits = 7;
3347  Offset /= Scale;
3348  } else if (UnscaledOp == 0 || useUnscaledOp) {
3349  MaskBits = 9;
3350  IsSigned = true;
3351  Scale = 1;
3352  } else {
3353  MaskBits = 12;
3354  IsSigned = false;
3355  Offset /= Scale;
3356  }
3357 
3358  // Attempt to fold address computation.
3359  int MaxOff = (1 << (MaskBits - IsSigned)) - 1;
3360  int MinOff = (IsSigned ? (-MaxOff - 1) : 0);
3361  if (Offset >= MinOff && Offset <= MaxOff) {
3362  if (EmittableOffset)
3363  *EmittableOffset = Offset;
3364  Offset = 0;
3365  } else {
3366  int NewOff = Offset < 0 ? MinOff : MaxOff;
3367  if (EmittableOffset)
3368  *EmittableOffset = NewOff;
3369  Offset = (Offset - NewOff) * Scale;
3370  }
3371  if (OutUseUnscaledOp)
3372  *OutUseUnscaledOp = useUnscaledOp;
3373  if (OutUnscaledOp)
3374  *OutUnscaledOp = UnscaledOp;
3375  return AArch64FrameOffsetCanUpdate |
3376  (Offset == 0 ? AArch64FrameOffsetIsLegal : 0);
3377 }
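// Worked example: for an LDRXui whose immediate operand is 0 and an incoming
// Offset of 40, Scale is 8, the offset divides evenly and is non-negative, so
// the scaled value 40 / 8 == 5 fits the unsigned 12-bit field. The function
// sets *EmittableOffset to 5, leaves a residual Offset of 0, and returns
// AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal.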
3378 
3379 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
3380  unsigned FrameReg, int &Offset,
3381  const AArch64InstrInfo *TII) {
3382  unsigned Opcode = MI.getOpcode();
3383  unsigned ImmIdx = FrameRegIdx + 1;
3384 
3385  if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
3386  Offset += MI.getOperand(ImmIdx).getImm();
3387  emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
3388  MI.getOperand(0).getReg(), FrameReg, Offset, TII,
3389  MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
3390  MI.eraseFromParent();
3391  Offset = 0;
3392  return true;
3393  }
3394 
3395  int NewOffset;
3396  unsigned UnscaledOp;
3397  bool UseUnscaledOp;
3398  int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
3399  &UnscaledOp, &NewOffset);
3400  if (Status & AArch64FrameOffsetCanUpdate) {
3401  if (Status & AArch64FrameOffsetIsLegal)
3402  // Replace the FrameIndex with FrameReg.
3403  MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
3404  if (UseUnscaledOp)
3405  MI.setDesc(TII->get(UnscaledOp));
3406 
3407  MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
3408  return Offset == 0;
3409  }
3410 
3411  return false;
3412 }
3413 
3414 void AArch64InstrInfo::getNoop(MCInst &NopInst) const {
3415  NopInst.setOpcode(AArch64::HINT);
3416  NopInst.addOperand(MCOperand::createImm(0));
3417 }
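// HINT #0 is the architectural NOP encoding, so the MCInst built above prints
// and disassembles as a plain "nop".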
3418 
3419 // AArch64 supports MachineCombiner.
3420 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
3421 
3422 // True when Opc sets flag
3423 static bool isCombineInstrSettingFlag(unsigned Opc) {
3424  switch (Opc) {
3425  case AArch64::ADDSWrr:
3426  case AArch64::ADDSWri:
3427  case AArch64::ADDSXrr:
3428  case AArch64::ADDSXri:
3429  case AArch64::SUBSWrr:
3430  case AArch64::SUBSXrr:
3431  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3432  case AArch64::SUBSWri:
3433  case AArch64::SUBSXri:
3434  return true;
3435  default:
3436  break;
3437  }
3438  return false;
3439 }
3440 
3441 // 32b Opcodes that can be combined with a MUL
3442 static bool isCombineInstrCandidate32(unsigned Opc) {
3443  switch (Opc) {
3444  case AArch64::ADDWrr:
3445  case AArch64::ADDWri:
3446  case AArch64::SUBWrr:
3447  case AArch64::ADDSWrr:
3448  case AArch64::ADDSWri:
3449  case AArch64::SUBSWrr:
3450  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3451  case AArch64::SUBWri:
3452  case AArch64::SUBSWri:
3453  return true;
3454  default:
3455  break;
3456  }
3457  return false;
3458 }
3459 
3460 // 64b Opcodes that can be combined with a MUL
3461 static bool isCombineInstrCandidate64(unsigned Opc) {
3462  switch (Opc) {
3463  case AArch64::ADDXrr:
3464  case AArch64::ADDXri:
3465  case AArch64::SUBXrr:
3466  case AArch64::ADDSXrr:
3467  case AArch64::ADDSXri:
3468  case AArch64::SUBSXrr:
3469  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
3470  case AArch64::SUBXri:
3471  case AArch64::SUBSXri:
3472  return true;
3473  default:
3474  break;
3475  }
3476  return false;
3477 }
3478 
3479 // FP Opcodes that can be combined with a FMUL
3480 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
3481  switch (Inst.getOpcode()) {
3482  default:
3483  break;
3484  case AArch64::FADDSrr:
3485  case AArch64::FADDDrr:
3486  case AArch64::FADDv2f32:
3487  case AArch64::FADDv2f64:
3488  case AArch64::FADDv4f32:
3489  case AArch64::FSUBSrr:
3490  case AArch64::FSUBDrr:
3491  case AArch64::FSUBv2f32:
3492  case AArch64::FSUBv2f64:
3493  case AArch64::FSUBv4f32:
3494  TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
3495  return (Options.UnsafeFPMath ||
3496  Options.AllowFPOpFusion == FPOpFusion::Fast);
3497  }
3498  return false;
3499 }
3500 
3501 // Opcodes that can be combined with a MUL
3502 static bool isCombineInstrCandidate(unsigned Opc) {
3503  return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
3504 }
3505 
3506 //
3507 // Utility routine that checks if \param MO is defined by an
3508 // \param CombineOpc instruction in the basic block \param MBB
3509 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
3510  unsigned CombineOpc, unsigned ZeroReg = 0,
3511  bool CheckZeroReg = false) {
3512  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3513  MachineInstr *MI = nullptr;
3514 
3515  if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
3516  MI = MRI.getUniqueVRegDef(MO.getReg());
3517  // And it needs to be in the trace (otherwise, it won't have a depth).
3518  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
3519  return false;
3520  // Must only be used by the user we combine with.
3521  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
3522  return false;
3523 
3524  if (CheckZeroReg) {
3525  assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
3526  MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
3527  MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
3528  // The third input reg must be zero.
3529  if (MI->getOperand(3).getReg() != ZeroReg)
3530  return false;
3531  }
3532 
3533  return true;
3534 }
3535 
3536 //
3537 // Is \param MO defined by an integer multiply and can be combined?
3538 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
3539  unsigned MulOpc, unsigned ZeroReg) {
3540  return canCombine(MBB, MO, MulOpc, ZeroReg, true);
3541 }
3542 
3543 //
3544 // Is \param MO defined by a floating-point multiply and can be combined?
3545 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
3546  unsigned MulOpc) {
3547  return canCombine(MBB, MO, MulOpc);
3548 }
3549 
3550 // TODO: There are many more machine instruction opcodes to match:
3551 // 1. Other data types (integer, vectors)
3552 // 2. Other math / logic operations (xor, or)
3553 // 3. Other forms of the same operation (intrinsics and other variants)
3554 bool AArch64InstrInfo::isAssociativeAndCommutative(
3555  const MachineInstr &Inst) const {
3556  switch (Inst.getOpcode()) {
3557  case AArch64::FADDDrr:
3558  case AArch64::FADDSrr:
3559  case AArch64::FADDv2f32:
3560  case AArch64::FADDv2f64:
3561  case AArch64::FADDv4f32:
3562  case AArch64::FMULDrr:
3563  case AArch64::FMULSrr:
3564  case AArch64::FMULX32:
3565  case AArch64::FMULX64:
3566  case AArch64::FMULXv2f32:
3567  case AArch64::FMULXv2f64:
3568  case AArch64::FMULXv4f32:
3569  case AArch64::FMULv2f32:
3570  case AArch64::FMULv2f64:
3571  case AArch64::FMULv4f32:
3572  return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
3573  default:
3574  return false;
3575  }
3576 }
3577 
3578 /// Find instructions that can be turned into madd.
3579 static bool getMaddPatterns(MachineInstr &Root,
3580  SmallVectorImpl<MachineCombinerPattern> &Patterns) {
3581  unsigned Opc = Root.getOpcode();
3582  MachineBasicBlock &MBB = *Root.getParent();
3583  bool Found = false;
3584 
3585  if (!isCombineInstrCandidate(Opc))
3586  return false;
3587  if (isCombineInstrSettingFlag(Opc)) {
3588  int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
3589  // When NZCV is live bail out.
3590  if (Cmp_NZCV == -1)
3591  return false;
3592  unsigned NewOpc = convertToNonFlagSettingOpc(Root);
3593  // When opcode can't change bail out.
3594  // CHECKME: do we miss any cases for opcode conversion?
3595  if (NewOpc == Opc)
3596  return false;
3597  Opc = NewOpc;
3598  }
3599 
3600  switch (Opc) {
3601  default:
3602  break;
3603  case AArch64::ADDWrr:
3604  assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
3605  "ADDWrr does not have register operands");
3606  if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3607  AArch64::WZR)) {
3609  Found = true;
3610  }
3611  if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
3612  AArch64::WZR)) {
3614  Found = true;
3615  }
3616  break;
3617  case AArch64::ADDXrr:
3618  if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3619  AArch64::XZR)) {
3621  Found = true;
3622  }
3623  if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
3624  AArch64::XZR)) {
3626  Found = true;
3627  }
3628  break;
3629  case AArch64::SUBWrr:
3630  if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3631  AArch64::WZR)) {
3633  Found = true;
3634  }
3635  if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
3636  AArch64::WZR)) {
3638  Found = true;
3639  }
3640  break;
3641  case AArch64::SUBXrr:
3642  if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3643  AArch64::XZR)) {
3645  Found = true;
3646  }
3647  if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
3648  AArch64::XZR)) {
3650  Found = true;
3651  }
3652  break;
3653  case AArch64::ADDWri:
3654  if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3655  AArch64::WZR)) {
3657  Found = true;
3658  }
3659  break;
3660  case AArch64::ADDXri:
3661  if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3662  AArch64::XZR)) {
3664  Found = true;
3665  }
3666  break;
3667  case AArch64::SUBWri:
3668  if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
3669  AArch64::WZR)) {
3671  Found = true;
3672  }
3673  break;
3674  case AArch64::SUBXri:
3675  if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
3676  AArch64::XZR)) {
3678  Found = true;
3679  }
3680  break;
3681  }
3682  return Found;
3683 }
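// Concrete example of the integer patterns above (illustrative only, not from
// the original source):
//   mul  w8, w1, w2            ; MADDWrrr w8, w1, w2, wzr
//   add  w0, w8, w3            ; ADDWrr, and w8 has no other use
// can be rewritten by the machine combiner as
//   madd w0, w1, w2, w3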
3684 /// Floating-Point Support
3685 
3686 /// Find instructions that can be turned into madd.
3687 static bool getFMAPatterns(MachineInstr &Root,
3688  SmallVectorImpl<MachineCombinerPattern> &Patterns) {
3689 
3690  if (!isCombineInstrCandidateFP(Root))
3691  return false;
3692 
3693  MachineBasicBlock &MBB = *Root.getParent();
3694  bool Found = false;
3695 
3696  switch (Root.getOpcode()) {
3697  default:
3698  assert(false && "Unsupported FP instruction in combiner\n");
3699  break;
3700  case AArch64::FADDSrr:
3701  assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
3702  "FADDWrr does not have register operands");
3703  if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
3705  Found = true;
3706  } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3707  AArch64::FMULv1i32_indexed)) {
3709  Found = true;
3710  }
3711  if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
3713  Found = true;
3714  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3715  AArch64::FMULv1i32_indexed)) {
3717  Found = true;
3718  }
3719  break;
3720  case AArch64::FADDDrr:
3721  if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
3723  Found = true;
3724  } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3725  AArch64::FMULv1i64_indexed)) {
3727  Found = true;
3728  }
3729  if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
3731  Found = true;
3732  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3733  AArch64::FMULv1i64_indexed)) {
3735  Found = true;
3736  }
3737  break;
3738  case AArch64::FADDv2f32:
3739  if (canCombineWithFMUL(MBB, Root.getOperand(1),
3740  AArch64::FMULv2i32_indexed)) {
3742  Found = true;
3743  } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3744  AArch64::FMULv2f32)) {
3746  Found = true;
3747  }
3748  if (canCombineWithFMUL(MBB, Root.getOperand(2),
3749  AArch64::FMULv2i32_indexed)) {
3751  Found = true;
3752  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3753  AArch64::FMULv2f32)) {
3755  Found = true;
3756  }
3757  break;
3758  case AArch64::FADDv2f64:
3759  if (canCombineWithFMUL(MBB, Root.getOperand(1),
3760  AArch64::FMULv2i64_indexed)) {
3762  Found = true;
3763  } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3764  AArch64::FMULv2f64)) {
3766  Found = true;
3767  }
3768  if (canCombineWithFMUL(MBB, Root.getOperand(2),
3769  AArch64::FMULv2i64_indexed)) {
3771  Found = true;
3772  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3773  AArch64::FMULv2f64)) {
3775  Found = true;
3776  }
3777  break;
3778  case AArch64::FADDv4f32:
3779  if (canCombineWithFMUL(MBB, Root.getOperand(1),
3780  AArch64::FMULv4i32_indexed)) {
3782  Found = true;
3783  } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3784  AArch64::FMULv4f32)) {
3786  Found = true;
3787  }
3788  if (canCombineWithFMUL(MBB, Root.getOperand(2),
3789  AArch64::FMULv4i32_indexed)) {
3791  Found = true;
3792  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3793  AArch64::FMULv4f32)) {
3795  Found = true;
3796  }
3797  break;
3798 
3799  case AArch64::FSUBSrr:
3800  if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
3802  Found = true;
3803  }
3804  if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
3806  Found = true;
3807  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3808  AArch64::FMULv1i32_indexed)) {
3810  Found = true;
3811  }
3812  if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) {
3814  Found = true;
3815  }
3816  break;
3817  case AArch64::FSUBDrr:
3818  if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
3820  Found = true;
3821  }
3822  if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
3824  Found = true;
3825  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3826  AArch64::FMULv1i64_indexed)) {
3828  Found = true;
3829  }
3830  if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) {
3832  Found = true;
3833  }
3834  break;
3835  case AArch64::FSUBv2f32:
3836  if (canCombineWithFMUL(MBB, Root.getOperand(2),
3837  AArch64::FMULv2i32_indexed)) {
3839  Found = true;
3840  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3841  AArch64::FMULv2f32)) {
3843  Found = true;
3844  }
3845  if (canCombineWithFMUL(MBB, Root.getOperand(1),
3846  AArch64::FMULv2i32_indexed)) {
3848  Found = true;
3849  } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3850  AArch64::FMULv2f32)) {
3852  Found = true;
3853  }
3854  break;
3855  case AArch64::FSUBv2f64:
3856  if (canCombineWithFMUL(MBB, Root.getOperand(2),
3857  AArch64::FMULv2i64_indexed)) {
3859  Found = true;
3860  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3861  AArch64::FMULv2f64)) {
3863  Found = true;
3864  }
3865  if (canCombineWithFMUL(MBB, Root.getOperand(1),
3866  AArch64::FMULv2i64_indexed)) {
3868  Found = true;
3869  } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3870  AArch64::FMULv2f64)) {
3872  Found = true;
3873  }
3874  break;
3875  case AArch64::FSUBv4f32:
3876  if (canCombineWithFMUL(MBB, Root.getOperand(2),
3877  AArch64::FMULv4i32_indexed)) {
3879  Found = true;
3880  } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3881  AArch64::FMULv4f32)) {
3883  Found = true;
3884  }
3885  if (canCombineWithFMUL(MBB, Root.getOperand(1),
3886  AArch64::FMULv4i32_indexed)) {
3888  Found = true;
3889  } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3890  AArch64::FMULv4f32)) {
3892  Found = true;
3893  }
3894  break;
3895  }
3896  return Found;
3897 }
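// Concrete example of the FP patterns above (illustrative only, not from the
// original source; it requires the relaxed FP semantics checked by
// isCombineInstrCandidateFP):
//   fmul s1, s2, s3
//   fadd s0, s1, s4            ; s1 has no other use
// can be fused into
//   fmadd s0, s2, s3, s4
// and FSUB roots are matched into the corresponding fmsub/fnmsub forms.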
3898 
3899 /// Return true when a code sequence can improve throughput. It
3900 /// should be called only for instructions in loops.
3901 /// \param Pattern - combiner pattern
3903  MachineCombinerPattern Pattern) const {
3904  switch (Pattern) {
3905  default:
3906  break;
3941  return true;
3942  } // end switch (Pattern)
3943  return false;
3944 }
3945 /// Return true when there is potentially a faster code sequence for an
3946 /// instruction chain ending in \p Root. All potential patterns are listed in
3947 /// the \p Pattern vector. Pattern should be sorted in priority order since the
3948 /// pattern evaluator stops checking as soon as it finds a faster sequence.
3949 
3951  MachineInstr &Root,
3952  SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
3953  // Integer patterns
3954  if (getMaddPatterns(Root, Patterns))
3955  return true;
3956  // Floating point patterns
3957  if (getFMAPatterns(Root, Patterns))
3958  return true;
3959 
3960  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
3961 }
3962 
3964 /// genFusedMultiply - Generate fused multiply instructions.
3965 /// This function supports both integer and floating point instructions.
3966 /// A typical example:
3967 /// F|MUL I=A,B,0
3968 /// F|ADD R,I,C
3969 /// ==> F|MADD R,A,B,C
3970 /// \param MF Containing MachineFunction
3971 /// \param MRI Register information
3972 /// \param TII Target information
3973 /// \param Root is the F|ADD instruction
3974 /// \param [out] InsInstrs is a vector of machine instructions and will
3975 /// contain the generated madd instruction
3976 /// \param IdxMulOpd is index of operand in Root that is the result of
3977 /// the F|MUL. In the example above IdxMulOpd is 1.
3978 /// \param MaddOpc the opcode of the f|madd instruction
3979 /// \param RC Register class of operands
3980 /// \param kind Kind of fma instruction (addressing mode) to be generated
3981 /// \param ReplacedAddend is the result register from the instruction
3982 /// replacing the non-combined operand, if any.
3983 static MachineInstr *
3985  const TargetInstrInfo *TII, MachineInstr &Root,
3986  SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
3987  unsigned MaddOpc, const TargetRegisterClass *RC,
3989  const unsigned *ReplacedAddend = nullptr) {
3990  assert(IdxMulOpd == 1 || IdxMulOpd == 2);
3991 
3992  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
3993  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
3994  unsigned ResultReg = Root.getOperand(0).getReg();
3995  unsigned SrcReg0 = MUL->getOperand(1).getReg();
3996  bool Src0IsKill = MUL->getOperand(1).isKill();
3997  unsigned SrcReg1 = MUL->getOperand(2).getReg();
3998  bool Src1IsKill = MUL->getOperand(2).isKill();
3999 
4000  unsigned SrcReg2;
4001  bool Src2IsKill;
4002  if (ReplacedAddend) {
4003  // If we just generated a new addend, we must be its only use.
4004  SrcReg2 = *ReplacedAddend;
4005  Src2IsKill = true;
4006  } else {
4007  SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
4008  Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
4009  }
4010 
4012  MRI.constrainRegClass(ResultReg, RC);
4014  MRI.constrainRegClass(SrcReg0, RC);
4016  MRI.constrainRegClass(SrcReg1, RC);
4018  MRI.constrainRegClass(SrcReg2, RC);
4019 
4020  MachineInstrBuilder MIB;
4021  if (kind == FMAInstKind::Default)
4022  MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4023  .addReg(SrcReg0, getKillRegState(Src0IsKill))
4024  .addReg(SrcReg1, getKillRegState(Src1IsKill))
4025  .addReg(SrcReg2, getKillRegState(Src2IsKill));
4026  else if (kind == FMAInstKind::Indexed)
4027  MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4028  .addReg(SrcReg2, getKillRegState(Src2IsKill))
4029  .addReg(SrcReg0, getKillRegState(Src0IsKill))
4030  .addReg(SrcReg1, getKillRegState(Src1IsKill))
4031  .addImm(MUL->getOperand(3).getImm());
4032  else if (kind == FMAInstKind::Accumulator)
4033  MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4034  .addReg(SrcReg2, getKillRegState(Src2IsKill))
4035  .addReg(SrcReg0, getKillRegState(Src0IsKill))
4036  .addReg(SrcReg1, getKillRegState(Src1IsKill));
4037  else
4038  assert(false && "Invalid FMA instruction kind \n");
4039  // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
4040  InsInstrs.push_back(MIB);
4041  return MUL;
4042 }
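// Note on the three operand orders built above (descriptive note, not from the
// original source): FMAInstKind::Default emits MaddOpc dst, mul_src0, mul_src1,
// addend (the MADD/FMADD layout); FMAInstKind::Indexed and
// FMAInstKind::Accumulator put the addend first (dst, addend, mul_src0,
// mul_src1), with Indexed also copying the lane immediate from the original
// FMUL.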
4043 
4044 /// genMaddR - Generate madd instruction and combine mul and add using
4045 /// an extra virtual register
4046 /// Example - an ADD intermediate needs to be stored in a register:
4047 /// MUL I=A,B,0
4048 /// ADD R,I,Imm
4049 /// ==> ORR V, ZR, Imm
4050 /// ==> MADD R,A,B,V
4051 /// \param MF Containing MachineFunction
4052 /// \param MRI Register information
4053 /// \param TII Target information
4054 /// \param Root is the ADD instruction
4055 /// \param [out] InsInstrs is a vector of machine instructions and will
4056 /// contain the generated madd instruction
4057 /// \param IdxMulOpd is index of operand in Root that is the result of
4058 /// the MUL. In the example above IdxMulOpd is 1.
4059 /// \param MaddOpc the opcode of the madd instruction
4060 /// \param VR is a virtual register that holds the value of an ADD operand
4061 /// (V in the example above).
4062 /// \param RC Register class of operands
4064  const TargetInstrInfo *TII, MachineInstr &Root,
4066  unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
4067  const TargetRegisterClass *RC) {
4068  assert(IdxMulOpd == 1 || IdxMulOpd == 2);
4069 
4070  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
4071  unsigned ResultReg = Root.getOperand(0).getReg();
4072  unsigned SrcReg0 = MUL->getOperand(1).getReg();
4073  bool Src0IsKill = MUL->getOperand(1).isKill();
4074  unsigned SrcReg1 = MUL->getOperand(2).getReg();
4075  bool Src1IsKill = MUL->getOperand(2).isKill();
4076 
4078  MRI.constrainRegClass(ResultReg, RC);
4080  MRI.constrainRegClass(SrcReg0, RC);
4082  MRI.constrainRegClass(SrcReg1, RC);
4084  MRI.constrainRegClass(VR, RC);
4085 
4086  MachineInstrBuilder MIB =
4087  BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
4088  .addReg(SrcReg0, getKillRegState(Src0IsKill))
4089  .addReg(SrcReg1, getKillRegState(Src1IsKill))
4090  .addReg(VR);
4091  // Insert the MADD
4092  InsInstrs.push_back(MIB);
4093  return MUL;
4094 }
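// Illustrative example (not from the original source): for
//   mul w8, w1, w2
//   add w0, w8, #255
// genMaddR is used with a fresh virtual register that materializes the
// immediate, producing roughly
//   orr  w9, wzr, #0xff
//   madd w0, w1, w2, w9
// where w9 stands in for the new virtual register VR.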
4095 
4096 /// When getMachineCombinerPatterns() finds potential patterns,
4097 /// this function generates the instructions that could replace the
4098 /// original code sequence
4100  MachineInstr &Root, MachineCombinerPattern Pattern,
4103  DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
4104  MachineBasicBlock &MBB = *Root.getParent();
4105  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4106  MachineFunction &MF = *MBB.getParent();
4107  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
4108 
4109  MachineInstr *MUL;
4110  const TargetRegisterClass *RC;
4111  unsigned Opc;
4112  switch (Pattern) {
4113  default:
4114  // Reassociate instructions.
4115  TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
4116  DelInstrs, InstrIdxForVirtReg);
4117  return;
4120  // MUL I=A,B,0
4121  // ADD R,I,C
4122  // ==> MADD R,A,B,C
4123  // --- Create(MADD);
4124  if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
4125  Opc = AArch64::MADDWrrr;
4126  RC = &AArch64::GPR32RegClass;
4127  } else {
4128  Opc = AArch64::MADDXrrr;
4129  RC = &AArch64::GPR64RegClass;
4130  }
4131  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4132  break;
4135  // MUL I=A,B,0
4136  // ADD R,C,I
4137  // ==> MADD R,A,B,C
4138  // --- Create(MADD);
4139  if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
4140  Opc = AArch64::MADDWrrr;
4141  RC = &AArch64::GPR32RegClass;
4142  } else {
4143  Opc = AArch64::MADDXrrr;
4144  RC = &AArch64::GPR64RegClass;
4145  }
4146  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4147  break;
4150  // MUL I=A,B,0
4151  // ADD R,I,Imm
4152  // ==> ORR V, ZR, Imm
4153  // ==> MADD R,A,B,V
4154  // --- Create(MADD);
4155  const TargetRegisterClass *OrrRC;
4156  unsigned BitSize, OrrOpc, ZeroReg;
4157  if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
4158  OrrOpc = AArch64::ORRWri;
4159  OrrRC = &AArch64::GPR32spRegClass;
4160  BitSize = 32;
4161  ZeroReg = AArch64::WZR;
4162  Opc = AArch64::MADDWrrr;
4163  RC = &AArch64::GPR32RegClass;
4164  } else {
4165  OrrOpc = AArch64::ORRXri;
4166  OrrRC = &AArch64::GPR64spRegClass;
4167  BitSize = 64;
4168  ZeroReg = AArch64::XZR;
4169  Opc = AArch64::MADDXrrr;
4170  RC = &AArch64::GPR64RegClass;
4171  }
4172  unsigned NewVR = MRI.createVirtualRegister(OrrRC);
4173  uint64_t Imm = Root.getOperand(2).getImm();
4174 
4175  if (Root.getOperand(3).isImm()) {
4176  unsigned Val = Root.getOperand(3).getImm();
4177  Imm = Imm << Val;
4178  }
4179  uint64_t UImm = SignExtend64(Imm, BitSize);
4180  uint64_t Encoding;
4181  if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4182  MachineInstrBuilder MIB1 =
4183  BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4184  .addReg(ZeroReg)
4185  .addImm(Encoding);
4186  InsInstrs.push_back(MIB1);
4187  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4188  MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4189  }
4190  break;
4191  }
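// Worked example for the shifted-immediate handling above (illustrative, not
// from the original source): for
//   add w0, w8, #1, lsl #12        ; ADDWri with a shift operand of 12
// the addend is materialized from Imm << 12 == 0x1000, which is a valid
// logical immediate, so the ORR + MADD rewrite still applies.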
4194  // MUL I=A,B,0
4195  // SUB R,I, C
4196  // ==> SUB V, 0, C
4197  // ==> MADD R,A,B,V // = -C + A*B
4198  // --- Create(MADD);
4199  const TargetRegisterClass *SubRC;
4200  unsigned SubOpc, ZeroReg;
4201  if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
4202  SubOpc = AArch64::SUBWrr;
4203  SubRC = &AArch64::GPR32spRegClass;
4204  ZeroReg = AArch64::WZR;
4205  Opc = AArch64::MADDWrrr;
4206  RC = &AArch64::GPR32RegClass;
4207  } else {
4208  SubOpc = AArch64::SUBXrr;
4209  SubRC = &AArch64::GPR64spRegClass;
4210  ZeroReg = AArch64::XZR;
4211  Opc = AArch64::MADDXrrr;
4212  RC = &AArch64::GPR64RegClass;
4213  }
4214  unsigned NewVR = MRI.createVirtualRegister(SubRC);
4215  // SUB NewVR, 0, C
4216  MachineInstrBuilder MIB1 =
4217  BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
4218  .addReg(ZeroReg)
4219  .add(Root.getOperand(2));
4220  InsInstrs.push_back(MIB1);
4221  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4222  MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4223  break;
4224  }
4227  // MUL I=A,B,0
4228  // SUB R,C,I
4229  // ==> MSUB R,A,B,C (computes C - A*B)
4230  // --- Create(MSUB);
4231  if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
4232  Opc = AArch64::MSUBWrrr;
4233  RC = &AArch64::GPR32RegClass;
4234  } else {
4235  Opc = AArch64::MSUBXrrr;
4236  RC = &AArch64::GPR64RegClass;
4237  }
4238  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4239  break;
4242  // MUL I=A,B,0
4243  // SUB R,I, Imm
4244  // ==> ORR V, ZR, -Imm
4245  // ==> MADD R,A,B,V // = -Imm + A*B
4246  // --- Create(MADD);
4247  const TargetRegisterClass *OrrRC;
4248  unsigned BitSize, OrrOpc, ZeroReg;
4249  if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
4250  OrrOpc = AArch64::ORRWri;
4251  OrrRC = &AArch64::GPR32spRegClass;
4252  BitSize = 32;
4253  ZeroReg = AArch64::WZR;
4254  Opc = AArch64::MADDWrrr;
4255  RC = &AArch64::GPR32RegClass;
4256  } else {
4257  OrrOpc = AArch64::ORRXri;
4258  OrrRC = &AArch64::GPR64spRegClass;
4259  BitSize = 64;
4260  ZeroReg = AArch64::XZR;
4261  Opc = AArch64::MADDXrrr;
4262  RC = &AArch64::GPR64RegClass;
4263  }
4264  unsigned NewVR = MRI.createVirtualRegister(OrrRC);
4265  uint64_t Imm = Root.getOperand(2).getImm();
4266  if (Root.getOperand(3).isImm()) {
4267  unsigned Val = Root.getOperand(3).getImm();
4268  Imm = Imm << Val;
4269  }
4270  uint64_t UImm = SignExtend64(-Imm, BitSize);
4271  uint64_t Encoding;
4272  if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
4273  MachineInstrBuilder MIB1 =
4274  BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
4275  .addReg(ZeroReg)
4276  .addImm(Encoding);
4277  InsInstrs.push_back(MIB1);
4278  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4279  MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
4280  }
4281  break;
4282  }
4283  // Floating Point Support
4286  // MUL I=A,B,0
4287  // ADD R,I,C
4288  // ==> MADD R,A,B,C
4289  // --- Create(MADD);
4290  if (Pattern == MachineCombinerPattern::FMULADDS_OP1) {
4291  Opc = AArch64::FMADDSrrr;
4292  RC = &AArch64::FPR32RegClass;
4293  } else {
4294  Opc = AArch64::FMADDDrrr;
4295  RC = &AArch64::FPR64RegClass;
4296  }
4297  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4298  break;
4301  // FMUL I=A,B,0
4302  // FADD R,C,I
4303  // ==> FMADD R,A,B,C
4304  // --- Create(FMADD);
4305  if (Pattern == MachineCombinerPattern::FMULADDS_OP2) {
4306  Opc = AArch64::FMADDSrrr;
4307  RC = &AArch64::FPR32RegClass;
4308  } else {
4309  Opc = AArch64::FMADDDrrr;
4310  RC = &AArch64::FPR64RegClass;
4311  }
4312  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4313  break;
4314 
4316  Opc = AArch64::FMLAv1i32_indexed;
4317  RC = &AArch64::FPR32RegClass;
4318  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4320  break;
4322  Opc = AArch64::FMLAv1i32_indexed;
4323  RC = &AArch64::FPR32RegClass;
4324  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4326  break;
4327 
4329  Opc = AArch64::FMLAv1i64_indexed;
4330  RC = &AArch64::FPR64RegClass;
4331  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4333  break;
4335  Opc = AArch64::FMLAv1i64_indexed;
4336  RC = &AArch64::FPR64RegClass;
4337  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4339  break;
4340 
4343  RC = &AArch64::FPR64RegClass;
4345  Opc = AArch64::FMLAv2i32_indexed;
4346  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4348  } else {
4349  Opc = AArch64::FMLAv2f32;
4350  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4352  }
4353  break;
4356  RC = &AArch64::FPR64RegClass;
4358  Opc = AArch64::FMLAv2i32_indexed;
4359  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4361  } else {
4362  Opc = AArch64::FMLAv2f32;
4363  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4365  }
4366  break;
4367 
4370  RC = &AArch64::FPR128RegClass;
4372  Opc = AArch64::FMLAv2i64_indexed;
4373  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4375  } else {
4376  Opc = AArch64::FMLAv2f64;
4377  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4379  }
4380  break;
4383  RC = &AArch64::FPR128RegClass;
4385  Opc = AArch64::FMLAv2i64_indexed;
4386  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4388  } else {
4389  Opc = AArch64::FMLAv2f64;
4390  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4392  }
4393  break;
4394 
4397  RC = &AArch64::FPR128RegClass;
4399  Opc = AArch64::FMLAv4i32_indexed;
4400  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4402  } else {
4403  Opc = AArch64::FMLAv4f32;
4404  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4406  }
4407  break;
4408 
4411  RC = &AArch64::FPR128RegClass;
4413  Opc = AArch64::FMLAv4i32_indexed;
4414  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4416  } else {
4417  Opc = AArch64::FMLAv4f32;
4418  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4420  }
4421  break;
4422 
4425  // FMUL I=A,B,0
4426  // FSUB R,I,C
4427  // ==> FNMSUB R,A,B,C // = -C + A*B
4428  // --- Create(FNMSUB);
4429  if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) {
4430  Opc = AArch64::FNMSUBSrrr;
4431  RC = &AArch64::FPR32RegClass;
4432  } else {
4433  Opc = AArch64::FNMSUBDrrr;
4434  RC = &AArch64::FPR64RegClass;
4435  }
4436  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4437  break;
4438  }
4439 
4442  // FNMUL I=A,B,0
4443  // FSUB R,I,C
4444  // ==> FNMADD R,A,B,C // = -A*B - C
4445  // --- Create(FNMADD);
4446  if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) {
4447  Opc = AArch64::FNMADDSrrr;
4448  RC = &AArch64::FPR32RegClass;
4449  } else {
4450  Opc = AArch64::FNMADDDrrr;
4451  RC = &AArch64::FPR64RegClass;
4452  }
4453  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
4454  break;
4455  }
4456 
4459  // FMUL I=A,B,0
4460  // FSUB R,C,I
4461  // ==> FMSUB R,A,B,C (computes C - A*B)
4462  // --- Create(FMSUB);
4463  if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) {
4464  Opc = AArch64::FMSUBSrrr;
4465  RC = &AArch64::FPR32RegClass;
4466  } else {
4467  Opc = AArch64::FMSUBDrrr;
4468  RC = &AArch64::FPR64RegClass;
4469  }
4470  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
4471  break;
4472  }
4473 
4475  Opc = AArch64::FMLSv1i32_indexed;
4476  RC = &AArch64::FPR32RegClass;
4477  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4479  break;
4480 
4482  Opc = AArch64::FMLSv1i64_indexed;
4483  RC = &AArch64::FPR64RegClass;
4484  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4486  break;
4487 
4490  RC = &AArch64::FPR64RegClass;
4492  Opc = AArch64::FMLSv2i32_indexed;
4493  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4495  } else {
4496  Opc = AArch64::FMLSv2f32;
4497  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4499  }
4500  break;
4501 
4504  RC = &AArch64::FPR128RegClass;
4506  Opc = AArch64::FMLSv2i64_indexed;
4507  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4509  } else {
4510  Opc = AArch64::FMLSv2f64;
4511  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4513  }
4514  break;
4515 
4518  RC = &AArch64::FPR128RegClass;
4520  Opc = AArch64::FMLSv4i32_indexed;
4521  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4523  } else {
4524  Opc = AArch64::FMLSv4f32;
4525  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
4527  }
4528  break;
4531  RC = &AArch64::FPR64RegClass;
4532  unsigned NewVR = MRI.createVirtualRegister(RC);
4533  MachineInstrBuilder MIB1 =
4534  BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR)
4535  .add(Root.getOperand(2));
4536  InsInstrs.push_back(MIB1);
4537  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4539  Opc = AArch64::FMLAv2i32_indexed;
4540  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4541  FMAInstKind::Indexed, &NewVR);
4542  } else {
4543  Opc = AArch64::FMLAv2f32;
4544  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4545  FMAInstKind::Accumulator, &NewVR);
4546  }
4547  break;
4548  }
4551  RC = &AArch64::FPR128RegClass;
4552  unsigned NewVR = MRI.createVirtualRegister(RC);
4553  MachineInstrBuilder MIB1 =
4554  BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR)
4555  .add(Root.getOperand(2));
4556  InsInstrs.push_back(MIB1);
4557  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4559  Opc = AArch64::FMLAv4i32_indexed;
4560  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4561  FMAInstKind::Indexed, &NewVR);
4562  } else {
4563  Opc = AArch64::FMLAv4f32;
4564  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4565  FMAInstKind::Accumulator, &NewVR);
4566  }
4567  break;
4568  }
4571  RC = &AArch64::FPR128RegClass;
4572  unsigned NewVR = MRI.createVirtualRegister(RC);
4573  MachineInstrBuilder MIB1 =
4574  BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR)
4575  .add(Root.getOperand(2));
4576  InsInstrs.push_back(MIB1);
4577  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
4579  Opc = AArch64::FMLAv2i64_indexed;
4580  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4581  FMAInstKind::Indexed, &NewVR);
4582  } else {
4583  Opc = AArch64::FMLAv2f64;
4584  MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
4585  FMAInstKind::Accumulator, &NewVR);
4586  }
4587  break;
4588  }
4589  } // end switch (Pattern)
4590  // Record MUL and ADD/SUB for deletion
4591  DelInstrs.push_back(MUL);
4592  DelInstrs.push_back(&Root);
4593 }
4594 
4595 /// Replace csinc-branch sequence by simple conditional branch
4596 ///
4597 /// Examples:
4598 /// 1. \code
4599 /// csinc w9, wzr, wzr, <condition code>
4600 /// tbnz w9, #0, 0x44
4601 /// \endcode
4602 /// to
4603 /// \code
4604 /// b.<inverted condition code>
4605 /// \endcode
4606 ///
4607 /// 2. \code
4608 /// csinc w9, wzr, wzr, <condition code>
4609 /// tbz w9, #0, 0x44
4610 /// \endcode
4611 /// to
4612 /// \code
4613 /// b.<condition code>
4614 /// \endcode
4615 ///
4616 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the
4617 /// compare's constant operand is power of 2.
4618 ///
4619 /// Examples:
4620 /// \code
4621 /// and w8, w8, #0x400
4622 /// cbnz w8, L1
4623 /// \endcode
4624 /// to
4625 /// \code
4626 /// tbnz w8, #10, L1
4627 /// \endcode
4628 ///
4629 /// \param MI Conditional Branch
4630 /// \return True when the simple conditional branch is generated
4631 ///
4633  bool IsNegativeBranch = false;
4634  bool IsTestAndBranch = false;
4635  unsigned TargetBBInMI = 0;
4636  switch (MI.getOpcode()) {
4637  default:
4638  llvm_unreachable("Unknown branch instruction?");
4639  case AArch64::Bcc:
4640  return false;
4641  case AArch64::CBZW:
4642  case AArch64::CBZX:
4643  TargetBBInMI = 1;
4644  break;
4645  case AArch64::CBNZW:
4646  case AArch64::CBNZX:
4647  TargetBBInMI = 1;
4648  IsNegativeBranch = true;
4649  break;
4650  case AArch64::TBZW:
4651  case AArch64::TBZX:
4652  TargetBBInMI = 2;
4653  IsTestAndBranch = true;
4654  break;
4655  case AArch64::TBNZW:
4656  case AArch64::TBNZX:
4657  TargetBBInMI = 2;
4658  IsNegativeBranch = true;
4659  IsTestAndBranch = true;
4660  break;
4661  }
4662  // So we increment a zero register and test for bits other
4663  // than bit 0? Conservatively bail out in case the verifier
4664  // missed this case.
4665  if (IsTestAndBranch && MI.getOperand(1).getImm())
4666  return false;
4667 
4668  // Find Definition.
4669  assert(MI.getParent() && "Incomplete machine instruction\n");
4670  MachineBasicBlock *MBB = MI.getParent();
4671  MachineFunction *MF = MBB->getParent();
4672  MachineRegisterInfo *MRI = &MF->getRegInfo();
4673  unsigned VReg = MI.getOperand(0).getReg();
4674  if (!TargetRegisterInfo::isVirtualRegister(VReg))
4675  return false;
4676 
4677  MachineInstr *DefMI = MRI->getVRegDef(VReg);
4678 
4679  // Look through COPY instructions to find definition.
4680  while (DefMI->isCopy()) {
4681  unsigned CopyVReg = DefMI->getOperand(1).getReg();
4682  if (!MRI->hasOneNonDBGUse(CopyVReg))
4683  return false;
4684  if (!MRI->hasOneDef(CopyVReg))
4685  return false;
4686  DefMI = MRI->getVRegDef(CopyVReg);
4687  }
4688 
4689  switch (DefMI->getOpcode()) {
4690  default:
4691  return false;
4692  // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
4693  case AArch64::ANDWri:
4694  case AArch64::ANDXri: {
4695  if (IsTestAndBranch)
4696  return false;
4697  if (DefMI->getParent() != MBB)
4698  return false;
4699  if (!MRI->hasOneNonDBGUse(VReg))
4700  return false;
4701 
4702  bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
4703  uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
4704  DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
4705  if (!isPowerOf2_64(Mask))
4706  return false;
4707 
4708  MachineOperand &MO = DefMI->getOperand(1);
4709  unsigned NewReg = MO.getReg();
4710  if (!TargetRegisterInfo::isVirtualRegister(NewReg))
4711  return false;
4712 
4713  assert(!MRI->def_empty(NewReg) && "Register must be defined.");
4714 
4715  MachineBasicBlock &RefToMBB = *MBB;
4716  MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
4717  DebugLoc DL = MI.getDebugLoc();
4718  unsigned Imm = Log2_64(Mask);
4719  unsigned Opc = (Imm < 32)
4720  ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
4721  : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
4722  MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
4723  .addReg(NewReg)
4724  .addImm(Imm)
4725  .addMBB(TBB);
4726  // Register lives on to the TBZ/TBNZ now.
4727  MO.setIsKill(false);
4728 
4729  // For immediates smaller than 32, we need to use the 32-bit
4730  // variant (W) in all cases. Indeed, the 64-bit variant does not
4731  // allow encoding them.
4732  // Therefore, if the input register is 64-bit, we need to take the
4733  // 32-bit sub-register.
4734  if (!Is32Bit && Imm < 32)
4735  NewMI->getOperand(0).setSubReg(AArch64::sub_32);
4736  MI.eraseFromParent();
4737  return true;
4738  }
4739  // Look for CSINC
4740  case AArch64::CSINCWr:
4741  case AArch64::CSINCXr: {
4742  if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
4743  DefMI->getOperand(2).getReg() == AArch64::WZR) &&
4744  !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
4745  DefMI->getOperand(2).getReg() == AArch64::XZR))
4746  return false;
4747 
4748  if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
4749  return false;
4750 
4751  AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
4752  // Convert only when the condition code is not modified between
4753  // the CSINC and the branch. The CC may be used by other
4754  // instructions in between.
4756  return false;
4757  MachineBasicBlock &RefToMBB = *MBB;
4758  MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
4759  DebugLoc DL = MI.getDebugLoc();
4760  if (IsNegativeBranch)
4761  CC = AArch64CC::getInvertedCondCode(CC);
4762  BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
4763  MI.eraseFromParent();
4764  return true;
4765  }
4766  }
4767 }
4768 
4769 std::pair<unsigned, unsigned>
4771  const unsigned Mask = AArch64II::MO_FRAGMENT;
4772  return std::make_pair(TF & Mask, TF & ~Mask);
4773 }
4774 
4777  using namespace AArch64II;
4778 
4779  static const std::pair<unsigned, const char *> TargetFlags[] = {
4780  {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
4781  {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
4782  {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
4783  {MO_HI12, "aarch64-hi12"}};
4784  return makeArrayRef(TargetFlags);
4785 }
4786 
4789  using namespace AArch64II;
4790 
4791  static const std::pair<unsigned, const char *> TargetFlags[] = {
4792  {MO_COFFSTUB, "aarch64-coffstub"},
4793  {MO_GOT, "aarch64-got"}, {MO_NC, "aarch64-nc"},
4794  {MO_S, "aarch64-s"}, {MO_TLS, "aarch64-tls"},
4795  {MO_DLLIMPORT, "aarch64-dllimport"}};
4796  return makeArrayRef(TargetFlags);
4797 }
4798 
4801  static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
4802  {{MOSuppressPair, "aarch64-suppress-pair"},
4803  {MOStridedAccess, "aarch64-strided-access"}};
4804  return makeArrayRef(TargetFlags);
4805 }
4806 
4807 /// Constants defining how certain sequences should be outlined.
4808 /// This encompasses how an outlined function should be called, and what kind of
4809 /// frame should be emitted for that outlined function.
4810 ///
4811 /// \p MachineOutlinerDefault implies that the function should be called with
4812 /// a save and restore of LR to the stack.
4813 ///
4814 /// That is,
4815 ///
4816 /// I1 Save LR OUTLINED_FUNCTION:
4817 /// I2 --> BL OUTLINED_FUNCTION I1
4818 /// I3 Restore LR I2
4819 /// I3
4820 /// RET
4821 ///
4822 /// * Call construction overhead: 3 (save + BL + restore)
4823 /// * Frame construction overhead: 1 (ret)
4824 /// * Requires stack fixups? Yes
4825 ///
4826 /// \p MachineOutlinerTailCall implies that the function is being created from
4827 /// a sequence of instructions ending in a return.
4828 ///
4829 /// That is,
4830 ///
4831 /// I1 OUTLINED_FUNCTION:
4832 /// I2 --> B OUTLINED_FUNCTION I1
4833 /// RET I2
4834 /// RET
4835 ///
4836 /// * Call construction overhead: 1 (B)
4837 /// * Frame construction overhead: 0 (Return included in sequence)
4838 /// * Requires stack fixups? No
4839 ///
4840 /// \p MachineOutlinerNoLRSave implies that the function should be called using
4841 /// a BL instruction, but doesn't require LR to be saved and restored. This
4842 /// happens when LR is known to be dead.
4843 ///
4844 /// That is,
4845 ///
4846 /// I1 OUTLINED_FUNCTION:
4847 /// I2 --> BL OUTLINED_FUNCTION I1
4848 /// I3 I2
4849 /// I3
4850 /// RET
4851 ///
4852 /// * Call construction overhead: 1 (BL)
4853 /// * Frame construction overhead: 1 (RET)
4854 /// * Requires stack fixups? No
4855 ///
4856 /// \p MachineOutlinerThunk implies that the function is being created from
4857 /// a sequence of instructions ending in a call. The outlined function is
4858 /// called with a BL instruction, and the outlined function tail-calls the
4859 /// original call destination.
4860 ///
4861 /// That is,
4862 ///
4863 /// I1 OUTLINED_FUNCTION:
4864 /// I2 --> BL OUTLINED_FUNCTION I1
4865 /// BL f I2
4866 /// B f
4867 /// * Call construction overhead: 1 (BL)
4868 /// * Frame construction overhead: 0
4869 /// * Requires stack fixups? No
4870 ///
4871 /// \p MachineOutlinerRegSave implies that the function should be called with a
4872 /// save and restore of LR to an available register. This allows us to avoid
4873 /// stack fixups. Note that this outlining variant is compatible with the
4874 /// NoLRSave case.
4875 ///
4876 /// That is,
4877 ///
4878 /// I1 Save LR OUTLINED_FUNCTION:
4879 /// I2 --> BL OUTLINED_FUNCTION I1
4880 /// I3 Restore LR I2
4881 /// I3
4882 /// RET
4883 ///
4884 /// * Call construction overhead: 3 (save + BL + restore)
4885 /// * Frame construction overhead: 1 (ret)
4886 /// * Requires stack fixups? No
4887 enum MachineOutlinerClass {
4888  MachineOutlinerDefault, /// Emit a save, restore, call, and return.
4889  MachineOutlinerTailCall, /// Only emit a branch.
4890  MachineOutlinerNoLRSave, /// Emit a call and return.
4891  MachineOutlinerThunk, /// Emit a call and tail-call.
4892  MachineOutlinerRegSave /// Same as default, but save to a register.
4893 };
4894 
4895 enum MachineOutlinerMBBFlags {
4896  LRUnavailableSomewhere = 0x2,
4897  HasCalls = 0x4,
4898  UnsafeRegsDead = 0x8
4899 };
4900 
4901 unsigned
4902 AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
4903  assert(C.LRUWasSet && "LRU wasn't set?");
4904  MachineFunction *MF = C.getMF();
4905  const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
4906  MF->getSubtarget().getRegisterInfo());
4907 
4908  // Check if there is an available register across the sequence that we can
4909  // use.
4910  for (unsigned Reg : AArch64::GPR64RegClass) {
4911  if (!ARI->isReservedReg(*MF, Reg) &&
4912  Reg != AArch64::LR && // LR is not reserved, but don't use it.
4913  Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
4914  Reg != AArch64::X17 && // Ditto for X17.
4916  return Reg;
4917  }
4918 
4919  // No suitable register. Return 0.
4920  return 0u;
4921 }
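// For example (descriptive note, not from the original source): if x20 is
// neither reserved nor used across the candidate, it is returned here and the
// outliner can save LR with a register-to-register move instead of spilling it
// to the stack (the MachineOutlinerRegSave variant described above).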
4922 
4925  std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
4926  outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
4927  unsigned SequenceSize =
4928  std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
4929  [this](unsigned Sum, const MachineInstr &MI) {
4930  return Sum + getInstSizeInBytes(MI);
4931  });
4932 
4933  // Properties about candidate MBBs that hold for all of them.
4934  unsigned FlagsSetInAll = 0xF;
4935 
4936  // Compute liveness information for each candidate, and set FlagsSetInAll.
4937  const TargetRegisterInfo &TRI = getRegisterInfo();
4938  std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
4939  [&FlagsSetInAll](outliner::Candidate &C) {
4940  FlagsSetInAll &= C.Flags;
4941  });
4942 
4943  // According to the AArch64 Procedure Call Standard, the following are
4944  // undefined on entry/exit from a function call:
4945  //
4946  // * Registers x16, x17, (and thus w16, w17)
4947  // * Condition codes (and thus the NZCV register)
4948  //
4949  // Because of this, we can't outline any sequence of instructions
4950  // where one of these registers is live into/across it. Thus, we
4951  // need to delete those candidates.
4954  auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
4955  // If the unsafe registers in this block are all dead, then we don't need
4956  // to compute liveness here.
4957  if (C.Flags & UnsafeRegsDead)
4958  return false;
4959  C.initLRU(TRI);
4960  LiveRegUnits LRU = C.LRU;
4961  return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
4962  !LRU.available(AArch64::NZCV));
4963  };
4964 
4965  // Are there any candidates where those registers are live?
4966  if (!(FlagsSetInAll & UnsafeRegsDead)) {
4967  // Erase every candidate that violates the restrictions above. (It could be
4968  // true that we have viable candidates, so it's not worth bailing out in
4969  // the case that, say, 1 out of 20 candidates violates the restrictions.)
4970  RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
4971  RepeatedSequenceLocs.end(),
4972  CantGuaranteeValueAcrossCall),
4973  RepeatedSequenceLocs.end());
4974 
4975  // If the sequence doesn't have enough candidates left, then we're done.
4976  if (RepeatedSequenceLocs.size() < 2)
4977  return outliner::OutlinedFunction();
4978  }
4979 
4980  // At this point, we have only "safe" candidates to outline. Figure out
4981  // frame + call instruction information.
4982 
4983  unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();
4984 
4985  // Helper lambda which sets call information for every candidate.
4986  auto SetCandidateCallInfo =
4987  [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
4988  for (outliner::Candidate &C : RepeatedSequenceLocs)
4989  C.setCallInfo(CallID, NumBytesForCall);
4990  };
4991 
4992  unsigned FrameID = MachineOutlinerDefault;
4993  unsigned NumBytesToCreateFrame = 4;
4994 
4995  bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
4996  return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
4997  });
4998 
4999  // Returns true if an instruction is safe to fix up, false otherwise.
5000  auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
5001  if (MI.isCall())
5002  return true;
5003 
5004  if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
5005  !MI.readsRegister(AArch64::SP, &TRI))
5006  return true;
5007 
5008  // Any modification of SP will break our code to save/restore LR.
5009  // FIXME: We could handle some instructions which add a constant
5010  // offset to SP, with a bit more work.
5011  if (MI.modifiesRegister(AArch64::SP, &TRI))
5012  return false;
5013 
5014  // At this point, we have a stack instruction that we might need to
5015  // fix up. We'll handle it if it's a load or store.
5016  if (MI.mayLoadOrStore()) {
5017  MachineOperand *Base; // Filled with the base operand of MI.
5018  int64_t Offset; // Filled with the offset of MI.
5019 
5020  // Does it allow us to offset the base operand and is the base the
5021  // register SP?
5022  if (!getMemOperandWithOffset(MI, Base, Offset, &TRI) || !Base->isReg() ||
5023  Base->getReg() != AArch64::SP)
5024  return false;
5025 
5026  // Find the minimum/maximum offset for this instruction and check
5027  // if fixing it up would be in range.
5028  int64_t MinOffset,
5029  MaxOffset; // Unscaled offsets for the instruction.
5030  unsigned Scale; // The scale to multiply the offsets by.
5031  unsigned DummyWidth;
5032  getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
5033 
5034  Offset += 16; // Update the offset to what it would be if we outlined.
5035  if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale)
5036  return false;
5037 
5038  // It's in range, so we can outline it.
5039  return true;
5040  }
5041 
5042  // FIXME: Add handling for instructions like "add x0, sp, #8".
5043 
5044  // We can't fix it up, so don't outline it.
5045  return false;
5046  };
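// Illustrative example for the check above (not from the original source):
//   ldr x0, [sp, #8]
// inside an outlined body would become an access at sp+24 once LR has been
// pushed, so it is only safe to outline if 24 is still within the
// instruction's scaled-immediate range; otherwise the candidate is rejected.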
5047 
5048  // True if it's possible to fix up each stack instruction in this sequence.
5049  // Important for frames/call variants that modify the stack.
5050  bool AllStackInstrsSafe = std::all_of(
5051  FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);
5052 
5053  // If the last instruction in any candidate is a terminator, then we should
5054  // tail call all of the candidates.
5055  if (RepeatedSequenceLocs[0].back()->isTerminator()) {
5056  FrameID = MachineOutlinerTailCall;
5057  NumBytesToCreateFrame = 0;
5058  SetCandidateCallInfo(MachineOutlinerTailCall, 4);
5059  }
5060 
5061  else if (LastInstrOpcode == AArch64::BL ||
5062  (LastInstrOpcode == AArch64::BLR && !HasBTI)) {
5063  // FIXME: Do we need to check if the code after this uses the value of LR?
5064  FrameID = MachineOutlinerThunk;
5065  NumBytesToCreateFrame = 0;
5066  SetCandidateCallInfo(MachineOutlinerThunk, 4);
5067  }
5068 
5069  else {
5070  // We need to decide how to emit calls + frames. We can always emit the same
5071  // frame if we don't need to save to the stack. If we have to save to the
5072  // stack, then we need a different frame.
5073  unsigned NumBytesNoStackCalls = 0;
5074  std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
5075 
5076  for (outliner::Candidate &C : RepeatedSequenceLocs) {
5077  C.initLRU(TRI);
5078 
5079  // Is LR available? If so, we don't need a save.
5080  if (C.LRU.available(AArch64::LR)) {
5081  NumBytesNoStackCalls += 4;
5083  CandidatesWithoutStackFixups.push_back(C);
5084  }
5085 
5086  // Is an unused register available? If so, we won't modify the stack, so
5087  // we can outline with the same frame type as those that don't save LR.
5088  else if (findRegisterToSaveLRTo(C)) {
5089  NumBytesNoStackCalls += 12;
5091  CandidatesWithoutStackFixups.push_back(C);
5092  }
5093 
5094  // Is SP used in the sequence at all? If not, we don't have to modify
5095  // the stack, so we are guaranteed to get the same frame.
5096  else if (C.UsedInSequence.available(AArch64::SP)) {
5097  NumBytesNoStackCalls += 12;
5099  CandidatesWithoutStackFixups.push_back(C);
5100  }
5101 
5102  // If we outline this, we need to modify the stack. Pretend we don't
5103  // outline this by saving all of its bytes.
5104  else {
5105  NumBytesNoStackCalls += SequenceSize;
5106  }
5107  }
5108 
5109  // If there are no places where we have to save LR, then note that we
5110  // don't have to update the stack. Otherwise, give every candidate the
5111  // default call type, as long as it's safe to do so.
5112  if (!AllStackInstrsSafe ||
5113  NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
5114  RepeatedSequenceLocs = CandidatesWithoutStackFixups;
5115  FrameID = MachineOutlinerNoLRSave;
5116  } else {
5117  SetCandidateCallInfo(MachineOutlinerDefault, 12);
5118  }
5119 
5120  // If we dropped all of the candidates, bail out here.
5121  if (RepeatedSequenceLocs.size() < 2) {
5122  RepeatedSequenceLocs.clear();
5123  return outliner::OutlinedFunction();
5124  }
5125  }
5126 
5127  // Does every candidate's MBB contain a call? If so, then we might have a call
5128  // in the range.
5129  if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
5130  // Check if the range contains a call. These require a save + restore of the
5131  // link register.
5132  bool ModStackToSaveLR = false;
5133  if (std::any_of(FirstCand.front(), FirstCand.back(),
5134  [](const MachineInstr &MI) { return MI.isCall(); }))
5135  ModStackToSaveLR = true;
5136 
5137  // Handle the last instruction separately. If this is a tail call, then the
5138  // last instruction is a call. We don't want to save + restore in this case.
5139  // However, it could be possible that the last instruction is a call without
5140  // it being valid to tail call this sequence. We should consider this as
5141  // well.
5142  else if (FrameID != MachineOutlinerThunk &&
5143  FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall())
5144  ModStackToSaveLR = true;
5145 
5146  if (ModStackToSaveLR) {
5147  // We can't fix up the stack. Bail out.
5148  if (!AllStackInstrsSafe) {
5149  RepeatedSequenceLocs.clear();
5150  return outliner::OutlinedFunction();
5151  }
5152 
5153  // Save + restore LR.
5154  NumBytesToCreateFrame += 8;
5155  }
5156  }
5157 
5158  return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
5159  NumBytesToCreateFrame, FrameID);
5160 }
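// Rough cost picture used above (descriptive summary, not from the original
// source): tail-call and thunk candidates cost 4 bytes per call site with no
// extra frame; relying on a dead LR costs 4 bytes per call site and saving LR
// to a register or avoiding SP costs 12, each with a 4-byte frame; the default
// variant costs 12 bytes per call site, and any variant whose outlined body
// itself contains a call pays 8 more bytes in the frame to save and restore LR
// around it.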
5161 
5163  MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
5164  const Function &F = MF.getFunction();
5165 
5166  // Can F be deduplicated by the linker? If it can, don't outline from it.
5167  if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
5168  return false;
5169 
5170  // Don't outline from functions with section markings; the program could
5171  // expect that all the code is in the named section.
5172  // FIXME: Allow outlining from multiple functions with the same section
5173  // marking.
5174  if (F.hasSection())
5175  return false;
5176 
5177  // Outlining from functions with redzones is unsafe since the outliner may
5178  // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
5179  // outline from it.
5181  if (!AFI || AFI->hasRedZone().getValueOr(true))
5182  return false;
5183 
5184  // It's safe to outline from MF.
5185  return true;
5186 }
5187 
5189  unsigned &Flags) const {
5190  // Check if LR is available through all of the MBB. If it's not, then set
5191  // a flag.
5193  "Suitable Machine Function for outlining must track liveness");
5195 
5196  std::for_each(MBB.rbegin(), MBB.rend(),
5197  [&LRU](MachineInstr &MI) { LRU.accumulate(MI); });
5198 
5199  // Check if each of the unsafe registers are available...
5200  bool W16AvailableInBlock = LRU.available(AArch64::W16);
5201  bool W17AvailableInBlock = LRU.available(AArch64::W17);
5202  bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV);
5203 
5204  // If all of these are dead (and not live out), we know we don't have to check
5205  // them later.
5206  if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock)
5207  Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead;
5208 
5209  // Now, add the live outs to the set.
5210  LRU.addLiveOuts(MBB);
5211 
5212  // If any of these registers is available in the MBB, but also a live out of
5213  // the block, then we know outlining is unsafe.
5214  if (W16AvailableInBlock && !LRU.available(AArch64::W16))
5215  return false;
5216  if (W17AvailableInBlock && !LRU.available(AArch64::W17))
5217  return false;
5218  if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV))
5219  return false;
5220 
5221  // Check if there's a call inside this MachineBasicBlock. If there is, then
5222  // set a flag.
5223  if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
5224  Flags |= MachineOutlinerMBBFlags::HasCalls;
5225 
5226  MachineFunction *MF = MBB.getParent();
5227 
5228  // In the event that we outline, we may have to save LR. If there is an
5229  // available register in the MBB, then we'll always save LR there. Check if
5230  // this is true.
5231  bool CanSaveLR = false;
5232  const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
5233  MF->getSubtarget().getRegisterInfo());
5234 
5235  // Check if there is an available register across the sequence that we can
5236  // use.
5237  for (unsigned Reg : AArch64::GPR64RegClass) {
5238  if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR &&
5239  Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) {
5240  CanSaveLR = true;
5241  break;
5242  }
5243  }
5244 
5245  // Check if we have a register we can save LR to, and if LR was used
5246  // somewhere. If both of those things are true, then we need to evaluate the
5247  // safety of outlining stack instructions later.
5248  if (!CanSaveLR && !LRU.available(AArch64::LR))
5249  Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
5250 
5251  return true;
5252 }
5253 
5256  unsigned Flags) const {
5257  MachineInstr &MI = *MIT;
5258  MachineBasicBlock *MBB = MI.getParent();
5259  MachineFunction *MF = MBB->getParent();
5260  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
5261 
5262  // Don't outline LOHs.
5263  if (FuncInfo->getLOHRelated().count(&MI))
5265 
5266  // Don't allow debug values to impact outlining type.
5267  if (MI.isDebugInstr() || MI.isIndirectDebugValue())
5269 
5270  // At this point, KILL instructions don't really tell us much so we can go
5271  // ahead and skip over them.
5272  if (MI.isKill())
5274 
5275  // Is this a terminator for a basic block?
5276  if (MI.isTerminator()) {
5277 
5278  // Is this the end of a function?
5279  if (MI.getParent()->succ_empty())
5281 
5282  // It's not, so don't outline it.
5284  }
5285 
5286  // Make sure none of the operands are un-outlinable.
5287  for (const MachineOperand &MOP : MI.operands()) {
5288  if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
5289  MOP.isTargetIndex())
5291 
5292  // If it uses LR or W30 explicitly, then don't touch it.
5293  if (MOP.isReg() && !MOP.isImplicit() &&
5294  (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
5296  }
5297 
5298  // Special cases for instructions that can always be outlined, but will fail
5299  // the later tests; e.g., ADRPs, which are PC-relative and use LR, but can
5300  // always be outlined because they don't require a *specific* value in LR.
5301  if (MI.getOpcode() == AArch64::ADRP)
5303 
5304  // If MI is a call we might be able to outline it. We don't want to outline
5305  // any calls that rely on the position of items on the stack. When we outline
5306  // something containing a call, we have to emit a save and restore of LR in
5307  // the outlined function. Currently, this always happens by saving LR to the
5308  // stack. Thus, if we outline, say, half the parameters for a function call
5309  // plus the call, then we'll break the callee's expectations for the layout
5310  // of the stack.
5311  //
5312  // FIXME: Allow calls to functions which construct a stack frame, as long
5313  // as they don't access arguments on the stack.
5314  // FIXME: Figure out some way to analyze functions defined in other modules.
5315  // We should be able to compute the memory usage based on the IR calling
5316  // convention, even if we can't see the definition.
5317  if (MI.isCall()) {
5318  // Get the function associated with the call. Look at each operand and find
5319  // the one that represents the callee and get its name.
5320  const Function *Callee = nullptr;
5321  for (const MachineOperand &MOP : MI.operands()) {
5322  if (MOP.isGlobal()) {
5323  Callee = dyn_cast<Function>(MOP.getGlobal());
5324  break;
5325  }
5326  }
5327 
5328  // Never outline calls to mcount. There isn't any rule that would require
5329  // this, but the Linux kernel's "ftrace" feature depends on it.
5330  if (Callee && Callee->getName() == "\01_mcount")
5332 
5333  // If we don't know anything about the callee, assume it depends on the
5334  // stack layout of the caller. In that case, it's only legal to outline
5335  // as a tail-call. Whitelist the call instructions we know about so we
5336  // don't get unexpected results with call pseudo-instructions.
5337  auto UnknownCallOutlineType = outliner::InstrType::Illegal;
5338  if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL)
5339  UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
5340 
5341  if (!Callee)
5342  return UnknownCallOutlineType;
5343 
5344  // We have a function we have information about. Check if it's something
5345  // we can safely outline.
5346  MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
5347 
5348  // We don't know what's going on with the callee at all. Don't touch it.
5349  if (!CalleeMF)
5350  return UnknownCallOutlineType;
5351 
5352  // Check if we know anything about the callee saves on the function. If we
5353  // don't, then don't touch it, since that implies that we haven't
5354  // computed anything about its stack frame yet.
5355  MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
5356  if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
5357  MFI.getNumObjects() > 0)
5358  return UnknownCallOutlineType;
5359 
5360  // At this point, we can say that CalleeMF ought to not pass anything on the
5361  // stack. Therefore, we can outline it.
5363  }
5364 
5365  // Don't outline positions.
5366  if (MI.isPosition())
5368 
5369  // Don't touch the link register or W30.
5370  if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
5371  MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
5373 
5374  // Don't outline BTI instructions, because that will prevent the outlining
5375  // site from being indirectly callable.
5376  if (MI.getOpcode() == AArch64::HINT) {
5377  int64_t Imm = MI.getOperand(0).getImm();
5378  if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
5380  }
5381 
5383 }
5384 
5385 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
5386  for (MachineInstr &MI : MBB) {
5388  unsigned Width;
5389  int64_t Offset;
5390 
5391  // Is this a load or store with an immediate offset with SP as the base?
5392  if (!MI.mayLoadOrStore() ||
5393  !getMemOperandWithOffsetWidth(MI, Base, Offset, Width, &RI) ||
5394  (Base->isReg() && Base->getReg() != AArch64::SP))
5395  continue;
5396 
5397  // It is, so we have to fix it up.
5398  unsigned Scale;
5399  int64_t Dummy1, Dummy2;
5400 
5401  MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
5402  assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
5403  getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
5404  assert(Scale != 0 && "Unexpected opcode!");
5405 
5406  // We've pushed the return address to the stack, so add 16 to the offset.
5407  // This is safe, since we already checked if it would overflow when we
5408  // checked if this instruction was legal to outline.
5409  int64_t NewImm = (Offset + 16) / Scale;
5410  StackOffsetOperand.setImm(NewImm);
5411  }
5412 }
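// Worked example (illustrative, not from the original source): an outlined
//   ldr x0, [sp, #8]        ; scaled immediate 1, Scale == 8
// is rewritten with NewImm == (8 + 16) / 8 == 3, i.e.
//   ldr x0, [sp, #24]
// so that it still addresses the caller's slot above the saved LR.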
5413 
5416  const outliner::OutlinedFunction &OF) const {
5417  // For thunk outlining, rewrite the last instruction from a call to a
5418  // tail-call.
5420  MachineInstr *Call = &*--MBB.instr_end();
5421  unsigned TailOpcode;
5422  if (Call->getOpcode() == AArch64::BL) {
5423  TailOpcode = AArch64::TCRETURNdi;
5424  } else {
5425  assert(Call->getOpcode() == AArch64::BLR);
5426  TailOpcode = AArch64::TCRETURNriALL;
5427  }
5428  MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
5429  .add(Call->getOperand(0))
5430  .addImm(0);
5431  MBB.insert(MBB.end(), TC);
5432  Call->eraseFromParent();
5433  }
5434 
5435  // Is there a call in the outlined range?
5436  auto IsNonTailCall = [](MachineInstr &MI) {
5437  return MI.isCall() && !MI.isReturn();
5438  };
5439  if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) {
5440  // Fix up the instructions in the range, since we're going to modify the
5441  // stack.
5443  "Can only fix up stack references once");
5444  fixupPostOutline(MBB);
5445 
5446  // LR has to be a live in so that we can save it.
5447  MBB.addLiveIn(AArch64::LR);
5448 
5450  MachineBasicBlock::iterator Et = MBB.end();
5451 
5454  Et = std::prev(MBB.end());
5455 
5456  // Insert a save before the outlined region
5457  MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
5458  .addReg(AArch64::SP, RegState::Define)
5459  .addReg(AArch64::LR)
5460  .addReg(AArch64::SP)
5461  .addImm(-16);
5462  It = MBB.insert(It, STRXpre);
5463 
5464  const TargetSubtargetInfo &STI = MF.getSubtarget();
5465  const MCRegisterInfo *MRI = STI.getRegisterInfo();
5466  unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
5467 
5468  // Add a CFI saying the stack was moved 16 B down.
5469  int64_t StackPosEntry =
5471  BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
5472  .addCFIIndex(StackPosEntry)
5474 
5475  // Add a CFI saying that the LR that we want to find is now 16 B higher than
5476  // before.
5477  int64_t LRPosEntry =
5478  MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, 16));
5479  BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
5480  .addCFIIndex(LRPosEntry)
5482 
5483  // Insert a restore before the terminator for the function.
5484  MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
5485  .addReg(AArch64::SP, RegState::Define)
5486  .addReg(AArch64::LR, RegState::Define)
5487  .addReg(AArch64::SP)
5488  .addImm(16);
5489  Et = MBB.insert(Et, LDRXpost);
5490  }
5491 
5492  // If this is a tail call outlined function, then there's already a return.
5495  return;
5496 
5497  // It's not a tail call, so we have to insert the return ourselves.
5498  MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
5499  .addReg(AArch64::LR, RegState::Undef);
5500  MBB.insert(MBB.end(), ret);
5501 
5502  // Did we have to modify the stack by saving the link register?
5504  return;
5505 
5506  // We modified the stack.
5507  // Walk over the basic block and fix up all the stack accesses.
5508  fixupPostOutline(MBB);
5509 }
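// Sketch of the default frame built above (descriptive, not from the original
// source):
//   str x30, [sp, #-16]!    ; STRXpre: spill LR and move SP down 16 bytes
//   ...outlined body containing interior call(s)...
//   ldr x30, [sp], #16      ; LDRXpost: restore LR and SP
//   ret
// Tail-call outlined functions already end in a return, so no extra ret is
// inserted, and thunks end in a tail branch to the original callee instead.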
5510 
5513  MachineFunction &MF, const outliner::Candidate &C) const {
5514 
5515  // Are we tail calling?
5517  // If yes, then we can just branch to the label.
5518  It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
5519  .addGlobalAddress(M.getNamedValue(MF.getName()))
5520  .addImm(0));
5521  return It;
5522  }
5523 
5524  // Are we saving the link register?
5527  // No, so just insert the call.
5528  It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
5529  .addGlobalAddress(M.getNamedValue(MF.getName())));
5530  return It;
5531  }
5532 
5533  // We want to return the spot where we inserted the call.
5535 
5536  // Instructions for saving and restoring LR around the call instruction we're
5537  // going to insert.
5538  MachineInstr *Save;
5539  MachineInstr *Restore;
5540  // Can we save to a register?
5542  // FIXME: This logic should be sunk into a target-specific interface so that
5543  // we don't have to recompute the register.
5544  unsigned Reg = findRegisterToSaveLRTo(C);
5545