LLVM 23.0.0git
AArch64InstrInfo.cpp
Go to the documentation of this file.
1//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the AArch64 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64InstrInfo.h"
14#include "AArch64ExpandImm.h"
16#include "AArch64PointerAuth.h"
17#include "AArch64Subtarget.h"
21#include "llvm/ADT/ArrayRef.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/DebugLoc.h"
45#include "llvm/IR/GlobalValue.h"
46#include "llvm/IR/Module.h"
47#include "llvm/MC/MCAsmInfo.h"
48#include "llvm/MC/MCInst.h"
50#include "llvm/MC/MCInstrDesc.h"
55#include "llvm/Support/LEB128.h"
59#include <cassert>
60#include <cstdint>
61#include <iterator>
62#include <utility>
63
64using namespace llvm;
65
66#define GET_INSTRINFO_CTOR_DTOR
67#include "AArch64GenInstrInfo.inc"
68
69#define DEBUG_TYPE "AArch64InstrInfo"
70
// Pass statistics (visible with -stats). The zero-cycle (ZC) counters track
// COPYs that were expanded to the subtarget's zero-latency register-move /
// zeroing forms rather than ordinary copies.
STATISTIC(NumCopyInstrs, "Number of COPY instructions expanded");
STATISTIC(NumZCRegMoveInstrsGPR, "Number of zero-cycle GPR register move "
                                 "instructions expanded from canonical COPY");
STATISTIC(NumZCRegMoveInstrsFPR, "Number of zero-cycle FPR register move "
                                 "instructions expanded from canonical COPY");
STATISTIC(NumZCZeroingInstrsGPR, "Number of zero-cycle GPR zeroing "
                                 "instructions expanded from canonical COPY");
// NumZCZeroingInstrsFPR is counted at AArch64AsmPrinter
79
81 CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9),
82 cl::desc("Restrict range of CB instructions (DEBUG)"));
83
85 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
86 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
87
89 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
90 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
91
93 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
94 cl::desc("Restrict range of Bcc instructions (DEBUG)"));
95
97 BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
98 cl::desc("Restrict range of B instructions (DEBUG)"));
99
101 "aarch64-search-limit", cl::Hidden, cl::init(2048),
102 cl::desc("Restrict range of instructions to search for the "
103 "machine-combiner gather pattern optimization"));
104
106 : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN,
107 AArch64::ADJCALLSTACKUP, AArch64::CATCHRET),
108 RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}
109
110/// GetInstSize - Return the number of bytes of code the specified
111/// instruction may be. This returns the maximum number of bytes.
113 const MachineBasicBlock &MBB = *MI.getParent();
114 const MachineFunction *MF = MBB.getParent();
115 const Function &F = MF->getFunction();
116 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
117
118 {
119 auto Op = MI.getOpcode();
120 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
121 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
122 }
123
124 // Meta-instructions emit no code.
125 if (MI.isMetaInstruction())
126 return 0;
127
128 // FIXME: We currently only handle pseudoinstructions that don't get expanded
129 // before the assembly printer.
130 unsigned NumBytes = 0;
131 const MCInstrDesc &Desc = MI.getDesc();
132
133 if (!MI.isBundle() && isTailCallReturnInst(MI)) {
134 NumBytes = Desc.getSize() ? Desc.getSize() : 4;
135
136 const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
137 if (!MFI->shouldSignReturnAddress(*MF))
138 return NumBytes;
139
140 const auto &STI = MF->getSubtarget<AArch64Subtarget>();
141 auto Method = STI.getAuthenticatedLRCheckMethod(*MF);
142 NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
143 return NumBytes;
144 }
145
146 // Size should be preferably set in
147 // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
148 // Specific cases handle instructions of variable sizes
149 switch (Desc.getOpcode()) {
150 default:
151 if (Desc.getSize())
152 return Desc.getSize();
153
154 // Anything not explicitly designated otherwise (i.e. pseudo-instructions
155 // with fixed constant size but not specified in .td file) is a normal
156 // 4-byte insn.
157 NumBytes = 4;
158 break;
159 case TargetOpcode::STACKMAP:
160 // The upper bound for a stackmap intrinsic is the full length of its shadow
161 NumBytes = StackMapOpers(&MI).getNumPatchBytes();
162 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
163 break;
164 case TargetOpcode::PATCHPOINT:
165 // The size of the patchpoint intrinsic is the number of bytes requested
166 NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
167 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
168 break;
169 case TargetOpcode::STATEPOINT:
170 NumBytes = StatepointOpers(&MI).getNumPatchBytes();
171 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
172 // No patch bytes means a normal call inst is emitted
173 if (NumBytes == 0)
174 NumBytes = 4;
175 break;
176 case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
177 // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
178 // instructions are expanded to the specified number of NOPs. Otherwise,
179 // they are expanded to 36-byte XRay sleds.
180 NumBytes =
181 F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
182 break;
183 case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
184 case TargetOpcode::PATCHABLE_TAIL_CALL:
185 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
186 // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
187 NumBytes = 36;
188 break;
189 case TargetOpcode::PATCHABLE_EVENT_CALL:
190 // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
191 NumBytes = 24;
192 break;
193
194 case AArch64::SPACE:
195 NumBytes = MI.getOperand(1).getImm();
196 break;
197 case TargetOpcode::BUNDLE:
198 NumBytes = getInstBundleLength(MI);
199 break;
200 }
201
202 return NumBytes;
203}
204
205unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
206 unsigned Size = 0;
208 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
209 while (++I != E && I->isInsideBundle()) {
210 assert(!I->isBundle() && "No nested bundle!");
212 }
213 return Size;
214}
215
218 // Block ends with fall-through condbranch.
219 switch (LastInst->getOpcode()) {
220 default:
221 llvm_unreachable("Unknown branch instruction?");
222 case AArch64::Bcc:
223 Target = LastInst->getOperand(1).getMBB();
224 Cond.push_back(LastInst->getOperand(0));
225 break;
226 case AArch64::CBZW:
227 case AArch64::CBZX:
228 case AArch64::CBNZW:
229 case AArch64::CBNZX:
230 Target = LastInst->getOperand(1).getMBB();
231 Cond.push_back(MachineOperand::CreateImm(-1));
232 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
233 Cond.push_back(LastInst->getOperand(0));
234 break;
235 case AArch64::TBZW:
236 case AArch64::TBZX:
237 case AArch64::TBNZW:
238 case AArch64::TBNZX:
239 Target = LastInst->getOperand(2).getMBB();
240 Cond.push_back(MachineOperand::CreateImm(-1));
241 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
242 Cond.push_back(LastInst->getOperand(0));
243 Cond.push_back(LastInst->getOperand(1));
244 break;
245 case AArch64::CBWPri:
246 case AArch64::CBXPri:
247 case AArch64::CBWPrr:
248 case AArch64::CBXPrr:
249 Target = LastInst->getOperand(3).getMBB();
250 Cond.push_back(MachineOperand::CreateImm(-1));
251 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
252 Cond.push_back(LastInst->getOperand(0));
253 Cond.push_back(LastInst->getOperand(1));
254 Cond.push_back(LastInst->getOperand(2));
255 break;
256 case AArch64::CBBAssertExt:
257 case AArch64::CBHAssertExt:
258 Target = LastInst->getOperand(3).getMBB();
259 Cond.push_back(MachineOperand::CreateImm(-1)); // -1
260 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); // Opc
261 Cond.push_back(LastInst->getOperand(0)); // Cond
262 Cond.push_back(LastInst->getOperand(1)); // Op0
263 Cond.push_back(LastInst->getOperand(2)); // Op1
264 Cond.push_back(LastInst->getOperand(4)); // Ext0
265 Cond.push_back(LastInst->getOperand(5)); // Ext1
266 break;
267 }
268}
269
270static unsigned getBranchDisplacementBits(unsigned Opc) {
271 switch (Opc) {
272 default:
273 llvm_unreachable("unexpected opcode!");
274 case AArch64::B:
275 return BDisplacementBits;
276 case AArch64::TBNZW:
277 case AArch64::TBZW:
278 case AArch64::TBNZX:
279 case AArch64::TBZX:
280 return TBZDisplacementBits;
281 case AArch64::CBNZW:
282 case AArch64::CBZW:
283 case AArch64::CBNZX:
284 case AArch64::CBZX:
285 return CBZDisplacementBits;
286 case AArch64::Bcc:
287 return BCCDisplacementBits;
288 case AArch64::CBWPri:
289 case AArch64::CBXPri:
290 case AArch64::CBBAssertExt:
291 case AArch64::CBHAssertExt:
292 case AArch64::CBWPrr:
293 case AArch64::CBXPrr:
294 return CBDisplacementBits;
295 }
296}
297
299 int64_t BrOffset) const {
300 unsigned Bits = getBranchDisplacementBits(BranchOp);
301 assert(Bits >= 3 && "max branch displacement must be enough to jump"
302 "over conditional branch expansion");
303 return isIntN(Bits, BrOffset / 4);
304}
305
308 switch (MI.getOpcode()) {
309 default:
310 llvm_unreachable("unexpected opcode!");
311 case AArch64::B:
312 return MI.getOperand(0).getMBB();
313 case AArch64::TBZW:
314 case AArch64::TBNZW:
315 case AArch64::TBZX:
316 case AArch64::TBNZX:
317 return MI.getOperand(2).getMBB();
318 case AArch64::CBZW:
319 case AArch64::CBNZW:
320 case AArch64::CBZX:
321 case AArch64::CBNZX:
322 case AArch64::Bcc:
323 return MI.getOperand(1).getMBB();
324 case AArch64::CBWPri:
325 case AArch64::CBXPri:
326 case AArch64::CBBAssertExt:
327 case AArch64::CBHAssertExt:
328 case AArch64::CBWPrr:
329 case AArch64::CBXPrr:
330 return MI.getOperand(3).getMBB();
331 }
332}
333
335 MachineBasicBlock &NewDestBB,
336 MachineBasicBlock &RestoreBB,
337 const DebugLoc &DL,
338 int64_t BrOffset,
339 RegScavenger *RS) const {
340 assert(RS && "RegScavenger required for long branching");
341 assert(MBB.empty() &&
342 "new block should be inserted for expanding unconditional branch");
343 assert(MBB.pred_size() == 1);
344 assert(RestoreBB.empty() &&
345 "restore block should be inserted for restoring clobbered registers");
346
347 auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
348 // Offsets outside of the signed 33-bit range are not supported for ADRP +
349 // ADD.
350 if (!isInt<33>(BrOffset))
352 "Branch offsets outside of the signed 33-bit range not supported");
353
354 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
355 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
356 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
357 .addReg(Reg)
358 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
359 .addImm(0);
360 BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
361 };
362
363 RS->enterBasicBlockEnd(MBB);
364 // If X16 is unused, we can rely on the linker to insert a range extension
365 // thunk if NewDestBB is out of range of a single B instruction.
366 constexpr Register Reg = AArch64::X16;
367 if (!RS->isRegUsed(Reg)) {
368 insertUnconditionalBranch(MBB, &NewDestBB, DL);
369 RS->setRegUsed(Reg);
370 return;
371 }
372
373 // If there's a free register and it's worth inflating the code size,
374 // manually insert the indirect branch.
375 Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
376 if (Scavenged != AArch64::NoRegister &&
377 MBB.getSectionID() == MBBSectionID::ColdSectionID) {
378 buildIndirectBranch(Scavenged, NewDestBB);
379 RS->setRegUsed(Scavenged);
380 return;
381 }
382
383 // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
384 // with red zones.
385 AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
386 if (!AFI || AFI->hasRedZone().value_or(true))
388 "Unable to insert indirect branch inside function that has red zone");
389
390 // Otherwise, spill X16 and defer range extension to the linker.
391 BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
392 .addReg(AArch64::SP, RegState::Define)
393 .addReg(Reg)
394 .addReg(AArch64::SP)
395 .addImm(-16);
396
397 BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);
398
399 BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
400 .addReg(AArch64::SP, RegState::Define)
402 .addReg(AArch64::SP)
403 .addImm(16);
404}
405
406// Branch analysis.
409 MachineBasicBlock *&FBB,
411 bool AllowModify) const {
412 // If the block has no terminators, it just falls into the block after it.
413 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
414 if (I == MBB.end())
415 return false;
416
417 // Skip over SpeculationBarrierEndBB terminators
418 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
419 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
420 --I;
421 }
422
423 if (!isUnpredicatedTerminator(*I))
424 return false;
425
426 // Get the last instruction in the block.
427 MachineInstr *LastInst = &*I;
428
429 // If there is only one terminator instruction, process it.
430 unsigned LastOpc = LastInst->getOpcode();
431 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
432 if (isUncondBranchOpcode(LastOpc)) {
433 TBB = LastInst->getOperand(0).getMBB();
434 return false;
435 }
436 if (isCondBranchOpcode(LastOpc)) {
437 // Block ends with fall-through condbranch.
438 parseCondBranch(LastInst, TBB, Cond);
439 return false;
440 }
441 return true; // Can't handle indirect branch.
442 }
443
444 // Get the instruction before it if it is a terminator.
445 MachineInstr *SecondLastInst = &*I;
446 unsigned SecondLastOpc = SecondLastInst->getOpcode();
447
448 // If AllowModify is true and the block ends with two or more unconditional
449 // branches, delete all but the first unconditional branch.
450 if (AllowModify && isUncondBranchOpcode(LastOpc)) {
451 while (isUncondBranchOpcode(SecondLastOpc)) {
452 LastInst->eraseFromParent();
453 LastInst = SecondLastInst;
454 LastOpc = LastInst->getOpcode();
455 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
456 // Return now the only terminator is an unconditional branch.
457 TBB = LastInst->getOperand(0).getMBB();
458 return false;
459 }
460 SecondLastInst = &*I;
461 SecondLastOpc = SecondLastInst->getOpcode();
462 }
463 }
464
465 // If we're allowed to modify and the block ends in a unconditional branch
466 // which could simply fallthrough, remove the branch. (Note: This case only
467 // matters when we can't understand the whole sequence, otherwise it's also
468 // handled by BranchFolding.cpp.)
469 if (AllowModify && isUncondBranchOpcode(LastOpc) &&
470 MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
471 LastInst->eraseFromParent();
472 LastInst = SecondLastInst;
473 LastOpc = LastInst->getOpcode();
474 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
475 assert(!isUncondBranchOpcode(LastOpc) &&
476 "unreachable unconditional branches removed above");
477
478 if (isCondBranchOpcode(LastOpc)) {
479 // Block ends with fall-through condbranch.
480 parseCondBranch(LastInst, TBB, Cond);
481 return false;
482 }
483 return true; // Can't handle indirect branch.
484 }
485 SecondLastInst = &*I;
486 SecondLastOpc = SecondLastInst->getOpcode();
487 }
488
489 // If there are three terminators, we don't know what sort of block this is.
490 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
491 return true;
492
493 // If the block ends with a B and a Bcc, handle it.
494 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
495 parseCondBranch(SecondLastInst, TBB, Cond);
496 FBB = LastInst->getOperand(0).getMBB();
497 return false;
498 }
499
500 // If the block ends with two unconditional branches, handle it. The second
501 // one is not executed, so remove it.
502 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
503 TBB = SecondLastInst->getOperand(0).getMBB();
504 I = LastInst;
505 if (AllowModify)
506 I->eraseFromParent();
507 return false;
508 }
509
510 // ...likewise if it ends with an indirect branch followed by an unconditional
511 // branch.
512 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
513 I = LastInst;
514 if (AllowModify)
515 I->eraseFromParent();
516 return true;
517 }
518
519 // Otherwise, can't handle this.
520 return true;
521}
522
524 MachineBranchPredicate &MBP,
525 bool AllowModify) const {
526 // Use analyzeBranch to validate the branch pattern.
527 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
529 if (analyzeBranch(MBB, TBB, FBB, Cond, AllowModify))
530 return true;
531
532 // analyzeBranch returns success with empty Cond for unconditional branches.
533 if (Cond.empty())
534 return true;
535
536 MBP.TrueDest = TBB;
537 assert(MBP.TrueDest && "expected!");
538 MBP.FalseDest = FBB ? FBB : MBB.getNextNode();
539
540 MBP.ConditionDef = nullptr;
541 MBP.SingleUseCondition = false;
542
543 // Find the conditional branch. After analyzeBranch succeeds with non-empty
544 // Cond, there's exactly one conditional branch - either last (fallthrough)
545 // or second-to-last (followed by unconditional B).
546 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
547 if (I == MBB.end())
548 return true;
549
550 if (isUncondBranchOpcode(I->getOpcode())) {
551 if (I == MBB.begin())
552 return true;
553 --I;
554 }
555
556 MachineInstr *CondBranch = &*I;
557 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
558
559 switch (CondBranch->getOpcode()) {
560 default:
561 return true;
562
563 case AArch64::Bcc:
564 // Bcc takes the NZCV flag as the operand to branch on, walk up the
565 // instruction stream to find the last instruction to define NZCV.
567 if (MI.modifiesRegister(AArch64::NZCV, /*TRI=*/nullptr)) {
568 MBP.ConditionDef = &MI;
569 break;
570 }
571 }
572 return false;
573
574 case AArch64::CBZW:
575 case AArch64::CBZX:
576 case AArch64::CBNZW:
577 case AArch64::CBNZX: {
578 MBP.LHS = CondBranch->getOperand(0);
579 MBP.RHS = MachineOperand::CreateImm(0);
580 unsigned Opc = CondBranch->getOpcode();
581 MBP.Predicate = (Opc == AArch64::CBNZX || Opc == AArch64::CBNZW)
582 ? MachineBranchPredicate::PRED_NE
583 : MachineBranchPredicate::PRED_EQ;
584 Register CondReg = MBP.LHS.getReg();
585 if (CondReg.isVirtual())
586 MBP.ConditionDef = MRI.getVRegDef(CondReg);
587 return false;
588 }
589
590 case AArch64::TBZW:
591 case AArch64::TBZX:
592 case AArch64::TBNZW:
593 case AArch64::TBNZX: {
594 Register CondReg = CondBranch->getOperand(0).getReg();
595 if (CondReg.isVirtual())
596 MBP.ConditionDef = MRI.getVRegDef(CondReg);
597 return false;
598 }
599 }
600}
601
604 if (Cond[0].getImm() != -1) {
605 // Regular Bcc
606 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
608 } else {
609 // Folded compare-and-branch
610 switch (Cond[1].getImm()) {
611 default:
612 llvm_unreachable("Unknown conditional branch!");
613 case AArch64::CBZW:
614 Cond[1].setImm(AArch64::CBNZW);
615 break;
616 case AArch64::CBNZW:
617 Cond[1].setImm(AArch64::CBZW);
618 break;
619 case AArch64::CBZX:
620 Cond[1].setImm(AArch64::CBNZX);
621 break;
622 case AArch64::CBNZX:
623 Cond[1].setImm(AArch64::CBZX);
624 break;
625 case AArch64::TBZW:
626 Cond[1].setImm(AArch64::TBNZW);
627 break;
628 case AArch64::TBNZW:
629 Cond[1].setImm(AArch64::TBZW);
630 break;
631 case AArch64::TBZX:
632 Cond[1].setImm(AArch64::TBNZX);
633 break;
634 case AArch64::TBNZX:
635 Cond[1].setImm(AArch64::TBZX);
636 break;
637
638 // Cond is { -1, Opcode, CC, Op0, Op1, ... }
639 case AArch64::CBWPri:
640 case AArch64::CBXPri:
641 case AArch64::CBBAssertExt:
642 case AArch64::CBHAssertExt:
643 case AArch64::CBWPrr:
644 case AArch64::CBXPrr: {
645 // Pseudos using standard 4bit Arm condition codes
647 static_cast<AArch64CC::CondCode>(Cond[2].getImm());
649 }
650 }
651 }
652
653 return false;
654}
655
657 int *BytesRemoved) const {
658 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
659 if (I == MBB.end())
660 return 0;
661
662 if (!isUncondBranchOpcode(I->getOpcode()) &&
663 !isCondBranchOpcode(I->getOpcode()))
664 return 0;
665
666 // Remove the branch.
667 I->eraseFromParent();
668
669 I = MBB.end();
670
671 if (I == MBB.begin()) {
672 if (BytesRemoved)
673 *BytesRemoved = 4;
674 return 1;
675 }
676 --I;
677 if (!isCondBranchOpcode(I->getOpcode())) {
678 if (BytesRemoved)
679 *BytesRemoved = 4;
680 return 1;
681 }
682
683 // Remove the branch.
684 I->eraseFromParent();
685 if (BytesRemoved)
686 *BytesRemoved = 8;
687
688 return 2;
689}
690
691void AArch64InstrInfo::instantiateCondBranch(
694 if (Cond[0].getImm() != -1) {
695 // Regular Bcc
696 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
697 } else {
698 // Folded compare-and-branch
699 // Note that we use addOperand instead of addReg to keep the flags.
700
701 // cbz, cbnz
702 const MachineInstrBuilder MIB =
703 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
704
705 // tbz/tbnz
706 if (Cond.size() > 3)
707 MIB.add(Cond[3]);
708
709 // cb
710 if (Cond.size() > 4)
711 MIB.add(Cond[4]);
712
713 MIB.addMBB(TBB);
714
715 // cb[b,h]
716 if (Cond.size() > 5) {
717 MIB.addImm(Cond[5].getImm());
718 MIB.addImm(Cond[6].getImm());
719 }
720 }
721}
722
725 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
726 // Shouldn't be a fall through.
727 assert(TBB && "insertBranch must not be told to insert a fallthrough");
728
729 if (!FBB) {
730 if (Cond.empty()) // Unconditional branch?
731 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
732 else
733 instantiateCondBranch(MBB, DL, TBB, Cond);
734
735 if (BytesAdded)
736 *BytesAdded = 4;
737
738 return 1;
739 }
740
741 // Two-way conditional branch.
742 instantiateCondBranch(MBB, DL, TBB, Cond);
743 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
744
745 if (BytesAdded)
746 *BytesAdded = 8;
747
748 return 2;
749}
750
// NOTE(review): the opening line carrying this helper's name was lost in
// extraction (only the trailing parameter is visible); confirm against
// upstream. It scans the terminators of *MBB and folds compare-and-branch
// instructions whose tested register is WZR/XZR: an always-taken CBZ/TBZ
// becomes an unconditional B (other successor edges are removed, and any
// instructions after the branch are erased), while a never-taken CBNZ/TBNZ
// is deleted along with its successor edge. Returns true if a change was
// made, false otherwise.
                          const TargetInstrInfo &TII) {
  for (MachineInstr &MI : MBB->terminators()) {
    unsigned Opc = MI.getOpcode();
    switch (Opc) {
    case AArch64::CBZW:
    case AArch64::CBZX:
    case AArch64::TBZW:
    case AArch64::TBZX:
      // CBZ/TBZ with WZR/XZR -> unconditional B
      if (MI.getOperand(0).getReg() == AArch64::WZR ||
          MI.getOperand(0).getReg() == AArch64::XZR) {
        DEBUG_WITH_TYPE("optimizeTerminators",
                        dbgs() << "Removing always taken branch: " << MI);
        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
        // Copy the successor list first: removeSuccessor invalidates the
        // range we would otherwise be iterating.
        SmallVector<MachineBasicBlock *> Succs(MBB->successors());
        for (auto *S : Succs)
          if (S != Target)
            MBB->removeSuccessor(S);
        DebugLoc DL = MI.getDebugLoc();
        // Erase everything after MI before replacing it with a plain B.
        while (MBB->rbegin() != &MI)
          MBB->rbegin()->eraseFromParent();
        MI.eraseFromParent();
        BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
        return true;
      }
      break;
    case AArch64::CBNZW:
    case AArch64::CBNZX:
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
      if (MI.getOperand(0).getReg() == AArch64::WZR ||
          MI.getOperand(0).getReg() == AArch64::XZR) {
        DEBUG_WITH_TYPE("optimizeTerminators",
                        dbgs() << "Removing never taken branch: " << MI);
        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
        MI.getParent()->removeSuccessor(Target);
        MI.eraseFromParent();
        return true;
      }
      break;
    }
  }
  return false;
}
797
798// Find the original register that VReg is copied from.
799static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
800 while (Register::isVirtualRegister(VReg)) {
801 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
802 if (!DefMI->isFullCopy())
803 return VReg;
804 VReg = DefMI->getOperand(1).getReg();
805 }
806 return VReg;
807}
808
809// Determine if VReg is defined by an instruction that can be folded into a
810// csel instruction. If so, return the folded opcode, and the replacement
811// register.
812static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
813 unsigned *NewReg = nullptr) {
814 VReg = removeCopies(MRI, VReg);
816 return 0;
817
818 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
819 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
820 unsigned Opc = 0;
821 unsigned SrcReg = 0;
822 switch (DefMI->getOpcode()) {
823 case AArch64::SUBREG_TO_REG:
824 // Check for the following way to define an 64-bit immediate:
825 // %0:gpr32 = MOVi32imm 1
826 // %1:gpr64 = SUBREG_TO_REG %0:gpr32, %subreg.sub_32
827 if (!DefMI->getOperand(1).isReg())
828 return 0;
829 if (!DefMI->getOperand(2).isImm() ||
830 DefMI->getOperand(2).getImm() != AArch64::sub_32)
831 return 0;
832 DefMI = MRI.getVRegDef(DefMI->getOperand(1).getReg());
833 if (DefMI->getOpcode() != AArch64::MOVi32imm)
834 return 0;
835 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
836 return 0;
837 assert(Is64Bit);
838 SrcReg = AArch64::XZR;
839 Opc = AArch64::CSINCXr;
840 break;
841
842 case AArch64::MOVi32imm:
843 case AArch64::MOVi64imm:
844 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
845 return 0;
846 SrcReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
847 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
848 break;
849
850 case AArch64::ADDSXri:
851 case AArch64::ADDSWri:
852 // if NZCV is used, do not fold.
853 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
854 true) == -1)
855 return 0;
856 // fall-through to ADDXri and ADDWri.
857 [[fallthrough]];
858 case AArch64::ADDXri:
859 case AArch64::ADDWri:
860 // add x, 1 -> csinc.
861 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
862 DefMI->getOperand(3).getImm() != 0)
863 return 0;
864 SrcReg = DefMI->getOperand(1).getReg();
865 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
866 break;
867
868 case AArch64::ORNXrr:
869 case AArch64::ORNWrr: {
870 // not x -> csinv, represented as orn dst, xzr, src.
871 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
872 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
873 return 0;
874 SrcReg = DefMI->getOperand(2).getReg();
875 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
876 break;
877 }
878
879 case AArch64::SUBSXrr:
880 case AArch64::SUBSWrr:
881 // if NZCV is used, do not fold.
882 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
883 true) == -1)
884 return 0;
885 // fall-through to SUBXrr and SUBWrr.
886 [[fallthrough]];
887 case AArch64::SUBXrr:
888 case AArch64::SUBWrr: {
889 // neg x -> csneg, represented as sub dst, xzr, src.
890 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
891 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
892 return 0;
893 SrcReg = DefMI->getOperand(2).getReg();
894 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
895 break;
896 }
897 default:
898 return 0;
899 }
900 assert(Opc && SrcReg && "Missing parameters");
901
902 if (NewReg)
903 *NewReg = SrcReg;
904 return Opc;
905}
906
909 Register DstReg, Register TrueReg,
910 Register FalseReg, int &CondCycles,
911 int &TrueCycles,
912 int &FalseCycles) const {
913 // Check register classes.
914 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
915 const TargetRegisterClass *RC =
916 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
917 if (!RC)
918 return false;
919
920 // Also need to check the dest regclass, in case we're trying to optimize
921 // something like:
922 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
923 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
924 return false;
925
926 // Expanding cbz/tbz requires an extra cycle of latency on the condition.
927 unsigned ExtraCondLat = Cond.size() != 1;
928
929 // GPRs are handled by csel.
930 // FIXME: Fold in x+1, -x, and ~x when applicable.
931 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
932 AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
933 // Single-cycle csel, csinc, csinv, and csneg.
934 CondCycles = 1 + ExtraCondLat;
935 TrueCycles = FalseCycles = 1;
936 if (canFoldIntoCSel(MRI, TrueReg))
937 TrueCycles = 0;
938 else if (canFoldIntoCSel(MRI, FalseReg))
939 FalseCycles = 0;
940 return true;
941 }
942
943 // Scalar floating point is handled by fcsel.
944 // FIXME: Form fabs, fmin, and fmax when applicable.
945 if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
946 AArch64::FPR32RegClass.hasSubClassEq(RC)) {
947 CondCycles = 5 + ExtraCondLat;
948 TrueCycles = FalseCycles = 2;
949 return true;
950 }
951
952 // Can't do vectors.
953 return false;
954}
955
958 const DebugLoc &DL, Register DstReg,
960 Register TrueReg, Register FalseReg) const {
961 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
962
963 // Parse the condition code, see parseCondBranch() above.
965 switch (Cond.size()) {
966 default:
967 llvm_unreachable("Unknown condition opcode in Cond");
968 case 1: // b.cc
970 break;
971 case 3: { // cbz/cbnz
972 // We must insert a compare against 0.
973 bool Is64Bit;
974 switch (Cond[1].getImm()) {
975 default:
976 llvm_unreachable("Unknown branch opcode in Cond");
977 case AArch64::CBZW:
978 Is64Bit = false;
979 CC = AArch64CC::EQ;
980 break;
981 case AArch64::CBZX:
982 Is64Bit = true;
983 CC = AArch64CC::EQ;
984 break;
985 case AArch64::CBNZW:
986 Is64Bit = false;
987 CC = AArch64CC::NE;
988 break;
989 case AArch64::CBNZX:
990 Is64Bit = true;
991 CC = AArch64CC::NE;
992 break;
993 }
994 Register SrcReg = Cond[2].getReg();
995 if (Is64Bit) {
996 // cmp reg, #0 is actually subs xzr, reg, #0.
997 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
998 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
999 .addReg(SrcReg)
1000 .addImm(0)
1001 .addImm(0);
1002 } else {
1003 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
1004 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
1005 .addReg(SrcReg)
1006 .addImm(0)
1007 .addImm(0);
1008 }
1009 break;
1010 }
1011 case 4: { // tbz/tbnz
1012 // We must insert a tst instruction.
1013 switch (Cond[1].getImm()) {
1014 default:
1015 llvm_unreachable("Unknown branch opcode in Cond");
1016 case AArch64::TBZW:
1017 case AArch64::TBZX:
1018 CC = AArch64CC::EQ;
1019 break;
1020 case AArch64::TBNZW:
1021 case AArch64::TBNZX:
1022 CC = AArch64CC::NE;
1023 break;
1024 }
1025 // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
1026 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
1027 BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
1028 .addReg(Cond[2].getReg())
1029 .addImm(
1031 else
1032 BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
1033 .addReg(Cond[2].getReg())
1034 .addImm(
1036 break;
1037 }
1038 case 5: { // cb
1039 // We must insert a cmp, that is a subs
1040 // 0 1 2 3 4
1041 // Cond is { -1, Opcode, CC, Op0, Op1 }
1042
1043 unsigned SubsOpc, SubsDestReg;
1044 bool IsImm = false;
1045 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1046 switch (Cond[1].getImm()) {
1047 default:
1048 llvm_unreachable("Unknown branch opcode in Cond");
1049 case AArch64::CBWPri:
1050 SubsOpc = AArch64::SUBSWri;
1051 SubsDestReg = AArch64::WZR;
1052 IsImm = true;
1053 break;
1054 case AArch64::CBXPri:
1055 SubsOpc = AArch64::SUBSXri;
1056 SubsDestReg = AArch64::XZR;
1057 IsImm = true;
1058 break;
1059 case AArch64::CBWPrr:
1060 SubsOpc = AArch64::SUBSWrr;
1061 SubsDestReg = AArch64::WZR;
1062 IsImm = false;
1063 break;
1064 case AArch64::CBXPrr:
1065 SubsOpc = AArch64::SUBSXrr;
1066 SubsDestReg = AArch64::XZR;
1067 IsImm = false;
1068 break;
1069 }
1070
1071 if (IsImm)
1072 BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
1073 .addReg(Cond[3].getReg())
1074 .addImm(Cond[4].getImm())
1075 .addImm(0);
1076 else
1077 BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
1078 .addReg(Cond[3].getReg())
1079 .addReg(Cond[4].getReg());
1080 } break;
1081 case 7: { // cb[b,h]
1082 // We must insert a cmp, that is a subs, but also zero- or sign-extensions
1083 // that have been folded. For the first operand we codegen an explicit
1084 // extension, for the second operand we fold the extension into cmp.
1085 // 0 1 2 3 4 5 6
1086 // Cond is { -1, Opcode, CC, Op0, Op1, Ext0, Ext1 }
1087
1088 // We need a new register for the now explicitly extended register
1089 Register Reg = Cond[4].getReg();
1091 unsigned ExtOpc;
1092 unsigned ExtBits;
1093 AArch64_AM::ShiftExtendType ExtendType =
1095 switch (ExtendType) {
1096 default:
1097 llvm_unreachable("Unknown shift-extend for CB instruction");
1098 case AArch64_AM::SXTB:
1099 assert(
1100 Cond[1].getImm() == AArch64::CBBAssertExt &&
1101 "Unexpected compare-and-branch instruction for SXTB shift-extend");
1102 ExtOpc = AArch64::SBFMWri;
1103 ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
1104 break;
1105 case AArch64_AM::SXTH:
1106 assert(
1107 Cond[1].getImm() == AArch64::CBHAssertExt &&
1108 "Unexpected compare-and-branch instruction for SXTH shift-extend");
1109 ExtOpc = AArch64::SBFMWri;
1110 ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
1111 break;
1112 case AArch64_AM::UXTB:
1113 assert(
1114 Cond[1].getImm() == AArch64::CBBAssertExt &&
1115 "Unexpected compare-and-branch instruction for UXTB shift-extend");
1116 ExtOpc = AArch64::ANDWri;
1117 ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
1118 break;
1119 case AArch64_AM::UXTH:
1120 assert(
1121 Cond[1].getImm() == AArch64::CBHAssertExt &&
1122 "Unexpected compare-and-branch instruction for UXTH shift-extend");
1123 ExtOpc = AArch64::ANDWri;
1124 ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
1125 break;
1126 }
1127
1128 // Build the explicit extension of the first operand
1129 Reg = MRI.createVirtualRegister(&AArch64::GPR32spRegClass);
1131 BuildMI(MBB, I, DL, get(ExtOpc), Reg).addReg(Cond[4].getReg());
1132 if (ExtOpc != AArch64::ANDWri)
1133 MBBI.addImm(0);
1134 MBBI.addImm(ExtBits);
1135 }
1136
1137 // Now, subs with an extended second operand
1139 AArch64_AM::ShiftExtendType ExtendType =
1141 MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
1142 MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
1143 BuildMI(MBB, I, DL, get(AArch64::SUBSWrx), AArch64::WZR)
1144 .addReg(Cond[3].getReg())
1145 .addReg(Reg)
1146 .addImm(AArch64_AM::getArithExtendImm(ExtendType, 0));
1147 } // If no extension is needed, just a regular subs
1148 else {
1149 MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
1150 MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
1151 BuildMI(MBB, I, DL, get(AArch64::SUBSWrr), AArch64::WZR)
1152 .addReg(Cond[3].getReg())
1153 .addReg(Reg);
1154 }
1155
1156 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1157 } break;
1158 }
1159
1160 unsigned Opc = 0;
1161 const TargetRegisterClass *RC = nullptr;
1162 bool TryFold = false;
1163 if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
1164 RC = &AArch64::GPR64RegClass;
1165 Opc = AArch64::CSELXr;
1166 TryFold = true;
1167 } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
1168 RC = &AArch64::GPR32RegClass;
1169 Opc = AArch64::CSELWr;
1170 TryFold = true;
1171 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
1172 RC = &AArch64::FPR64RegClass;
1173 Opc = AArch64::FCSELDrrr;
1174 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
1175 RC = &AArch64::FPR32RegClass;
1176 Opc = AArch64::FCSELSrrr;
1177 }
1178 assert(RC && "Unsupported regclass");
1179
1180 // Try folding simple instructions into the csel.
1181 if (TryFold) {
1182 unsigned NewReg = 0;
1183 unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewReg);
1184 if (FoldedOpc) {
1185 // The folded opcodes csinc, csinc and csneg apply the operation to
1186 // FalseReg, so we need to invert the condition.
1188 TrueReg = FalseReg;
1189 } else
1190 FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewReg);
1191
1192 // Fold the operation. Leave any dead instructions for DCE to clean up.
1193 if (FoldedOpc) {
1194 FalseReg = NewReg;
1195 Opc = FoldedOpc;
1196 // Extend the live range of NewReg.
1197 MRI.clearKillFlags(NewReg);
1198 }
1199 }
1200
1201 // Pull all virtual register into the appropriate class.
1202 MRI.constrainRegClass(TrueReg, RC);
1203 // FalseReg might be WZR or XZR if the folded operand is a literal 1.
1204 assert(
1205 (FalseReg.isVirtual() || FalseReg == AArch64::WZR ||
1206 FalseReg == AArch64::XZR) &&
1207 "FalseReg was folded into a non-virtual register other than WZR or XZR");
1208 if (FalseReg.isVirtual())
1209 MRI.constrainRegClass(FalseReg, RC);
1210
1211 // Insert the csel.
1212 BuildMI(MBB, I, DL, get(Opc), DstReg)
1213 .addReg(TrueReg)
1214 .addReg(FalseReg)
1215 .addImm(CC);
1216}
1217
1218// Return true if Imm can be loaded into a register by a "cheap" sequence of
1219// instructions. For now, "cheap" means at most two instructions.
1220static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
1221 if (BitSize == 32)
1222 return true;
1223
1224 assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
1225 uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
1227 AArch64_IMM::expandMOVImm(Imm, BitSize, Is);
1228
1229 return Is.size() <= 2;
1230}
1231
1232// Check if a COPY instruction is cheap.
1233static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) {
1234 assert(MI.isCopy() && "Expected COPY instruction");
1235 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
1236
1237 // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64,
1238 // typically requiring an FMOV instruction with a 2-6 cycle latency.
1239 auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * {
1240 if (Reg.isVirtual())
1241 return MRI.getRegClass(Reg);
1242 if (Reg.isPhysical())
1243 return RI.getMinimalPhysRegClass(Reg);
1244 return nullptr;
1245 };
1246 const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg());
1247 const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg());
1248 if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC))
1249 return false;
1250
1251 return MI.isAsCheapAsAMove();
1252}
1253
1254// FIXME: this implementation should be micro-architecture dependent, so a
1255// micro-architecture target hook should be introduced here in future.
1257 if (Subtarget.hasExynosCheapAsMoveHandling()) {
1258 if (isExynosCheapAsMove(MI))
1259 return true;
1260 return MI.isAsCheapAsAMove();
1261 }
1262
1263 switch (MI.getOpcode()) {
1264 default:
1265 return MI.isAsCheapAsAMove();
1266
1267 case TargetOpcode::COPY:
1268 return isCheapCopy(MI, RI);
1269
1270 case AArch64::ADDWrs:
1271 case AArch64::ADDXrs:
1272 case AArch64::SUBWrs:
1273 case AArch64::SUBXrs:
1274 return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;
1275
1276 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
1277 // ORRXri, it is as cheap as MOV.
1278 // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
1279 case AArch64::MOVi32imm:
1280 return isCheapImmediate(MI, 32);
1281 case AArch64::MOVi64imm:
1282 return isCheapImmediate(MI, 64);
1283 }
1284}
1285
// Return true if the shift/extend baked into \p MI is cheap enough on Falkor
// that the instruction runs as fast as its unshifted/unextended form; the
// exact shift/extend limits accepted are enumerated per opcode group below.
bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;

  // Shifted-register adds: fast when unshifted, or for LSL of at most 5.
  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::ADDSWrs:
  case AArch64::ADDSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    if (ShiftVal == 0)
      return true;
    return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
  }

  // Extended-register adds: fast only for zero-extends with a left shift of
  // at most 4.
  case AArch64::ADDWrx:
  case AArch64::ADDXrx:
  case AArch64::ADDXrx64:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::ADDSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) <= 4;
    }
  }

  // 32-bit shifted subs: fast when unshifted or for ASR #31 (sign bit).
  case AArch64::SUBWrs:
  case AArch64::SUBSWrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
  }

  // 64-bit shifted subs: fast when unshifted or for ASR #63 (sign bit).
  case AArch64::SUBXrs:
  case AArch64::SUBSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
  }

  // Extended-register subs: fast only for zero-extends with no shift.
  case AArch64::SUBWrx:
  case AArch64::SUBXrx:
  case AArch64::SUBXrx64:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrx:
  case AArch64::SUBSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) == 0;
    }
  }

  // Register-offset loads/stores/prefetches: operand 3 holds the "signed"
  // flag of the offset extend; only the unsigned forms are fast.
  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::PRFMroW:
  case AArch64::PRFMroX:
  case AArch64::STRBBroW:
  case AArch64::STRBBroX:
  case AArch64::STRBroW:
  case AArch64::STRBroX:
  case AArch64::STRDroW:
  case AArch64::STRDroX:
  case AArch64::STRHHroW:
  case AArch64::STRHHroX:
  case AArch64::STRHroW:
  case AArch64::STRHroX:
  case AArch64::STRQroW:
  case AArch64::STRQroX:
  case AArch64::STRSroW:
  case AArch64::STRSroX:
  case AArch64::STRWroW:
  case AArch64::STRWroX:
  case AArch64::STRXroW:
  case AArch64::STRXroX: {
    unsigned IsSigned = MI.getOperand(3).getImm();
    return !IsSigned;
  }
  }
}
1407
// Return true if \p MI is one of the SEH_* pseudo instructions that model
// Windows structured-exception-handling (SEH) unwind codes.
bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  // Every SEH_* pseudo counts; anything else does not.
  case AArch64::SEH_StackAlloc:
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveFPLR_X:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveReg_X:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveRegP_X:
  case AArch64::SEH_SaveFReg:
  case AArch64::SEH_SaveFReg_X:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFRegP_X:
  case AArch64::SEH_SetFP:
  case AArch64::SEH_AddFP:
  case AArch64::SEH_Nop:
  case AArch64::SEH_PrologEnd:
  case AArch64::SEH_EpilogStart:
  case AArch64::SEH_EpilogEnd:
  case AArch64::SEH_PACSignLR:
  case AArch64::SEH_SaveAnyRegI:
  case AArch64::SEH_SaveAnyRegIP:
  case AArch64::SEH_SaveAnyRegQP:
  case AArch64::SEH_SaveAnyRegQPX:
  case AArch64::SEH_AllocZ:
  case AArch64::SEH_SaveZReg:
  case AArch64::SEH_SavePReg:
    return true;
  }
}
1441
1443 Register &SrcReg, Register &DstReg,
1444 unsigned &SubIdx) const {
1445 switch (MI.getOpcode()) {
1446 default:
1447 return false;
1448 case AArch64::SBFMXri: // aka sxtw
1449 case AArch64::UBFMXri: // aka uxtw
1450 // Check for the 32 -> 64 bit extension case, these instructions can do
1451 // much more.
1452 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1453 return false;
1454 // This is a signed or unsigned 32 -> 64 bit extension.
1455 SrcReg = MI.getOperand(1).getReg();
1456 DstReg = MI.getOperand(0).getReg();
1457 SubIdx = AArch64::sub_32;
1458 return true;
1459 }
1460}
1461
1463 const MachineInstr &MIa, const MachineInstr &MIb) const {
1465 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1466 int64_t OffsetA = 0, OffsetB = 0;
1467 TypeSize WidthA(0, false), WidthB(0, false);
1468 bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1469
1470 assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1471 assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1472
1475 return false;
1476
1477 // Retrieve the base, offset from the base and width. Width
1478 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
1479 // base are identical, and the offset of a lower memory access +
1480 // the width doesn't overlap the offset of a higher memory access,
1481 // then the memory accesses are different.
1482 // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1483 // are assumed to have the same scale (vscale).
1484 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1485 WidthA, TRI) &&
1486 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1487 WidthB, TRI)) {
1488 if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1489 OffsetAIsScalable == OffsetBIsScalable) {
1490 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1491 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1492 TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1493 if (LowWidth.isScalable() == OffsetAIsScalable &&
1494 LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1495 return true;
1496 }
1497 }
1498 return false;
1499}
1500
1502 const MachineBasicBlock *MBB,
1503 const MachineFunction &MF) const {
1505 return true;
1506
1507 // Do not move an instruction that can be recognized as a branch target.
1508 if (hasBTISemantics(MI))
1509 return true;
1510
1511 switch (MI.getOpcode()) {
1512 case AArch64::HINT:
1513 // CSDB hints are scheduling barriers.
1514 if (MI.getOperand(0).getImm() == 0x14)
1515 return true;
1516 break;
1517 case AArch64::DSB:
1518 case AArch64::ISB:
1519 // DSB and ISB also are scheduling barriers.
1520 return true;
1521 case AArch64::MSRpstatesvcrImm1:
1522 // SMSTART and SMSTOP are also scheduling barriers.
1523 return true;
1524 default:;
1525 }
1526 if (isSEHInstruction(MI))
1527 return true;
1528 auto Next = std::next(MI.getIterator());
1529 return Next != MBB->end() && Next->isCFIInstruction();
1530}
1531
1532/// analyzeCompare - For a comparison instruction, return the source registers
1533/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1534/// Return true if the comparison instruction can be analyzed.
1536 Register &SrcReg2, int64_t &CmpMask,
1537 int64_t &CmpValue) const {
1538 // The first operand can be a frame index where we'd normally expect a
1539 // register.
1540 // FIXME: Pass subregisters out of analyzeCompare
1541 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1542 if (!MI.getOperand(1).isReg() || MI.getOperand(1).getSubReg())
1543 return false;
1544
1545 switch (MI.getOpcode()) {
1546 default:
1547 break;
1548 case AArch64::PTEST_PP:
1549 case AArch64::PTEST_PP_ANY:
1550 case AArch64::PTEST_PP_FIRST:
1551 SrcReg = MI.getOperand(0).getReg();
1552 SrcReg2 = MI.getOperand(1).getReg();
1553 if (MI.getOperand(2).getSubReg())
1554 return false;
1555
1556 // Not sure about the mask and value for now...
1557 CmpMask = ~0;
1558 CmpValue = 0;
1559 return true;
1560 case AArch64::SUBSWrr:
1561 case AArch64::SUBSWrs:
1562 case AArch64::SUBSWrx:
1563 case AArch64::SUBSXrr:
1564 case AArch64::SUBSXrs:
1565 case AArch64::SUBSXrx:
1566 case AArch64::ADDSWrr:
1567 case AArch64::ADDSWrs:
1568 case AArch64::ADDSWrx:
1569 case AArch64::ADDSXrr:
1570 case AArch64::ADDSXrs:
1571 case AArch64::ADDSXrx:
1572 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1573 SrcReg = MI.getOperand(1).getReg();
1574 SrcReg2 = MI.getOperand(2).getReg();
1575
1576 // FIXME: Pass subregisters out of analyzeCompare
1577 if (MI.getOperand(2).getSubReg())
1578 return false;
1579
1580 CmpMask = ~0;
1581 CmpValue = 0;
1582 return true;
1583 case AArch64::SUBSWri:
1584 case AArch64::ADDSWri:
1585 case AArch64::SUBSXri:
1586 case AArch64::ADDSXri:
1587 SrcReg = MI.getOperand(1).getReg();
1588 SrcReg2 = 0;
1589 CmpMask = ~0;
1590 CmpValue = MI.getOperand(2).getImm();
1591 return true;
1592 case AArch64::ANDSWri:
1593 case AArch64::ANDSXri:
1594 // ANDS does not use the same encoding scheme as the others xxxS
1595 // instructions.
1596 SrcReg = MI.getOperand(1).getReg();
1597 SrcReg2 = 0;
1598 CmpMask = ~0;
1600 MI.getOperand(2).getImm(),
1601 MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1602 return true;
1603 }
1604
1605 return false;
1606}
1607
1609 MachineBasicBlock *MBB = Instr.getParent();
1610 assert(MBB && "Can't get MachineBasicBlock here");
1611 MachineFunction *MF = MBB->getParent();
1612 assert(MF && "Can't get MachineFunction here");
1616
1617 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1618 ++OpIdx) {
1619 MachineOperand &MO = Instr.getOperand(OpIdx);
1620 const TargetRegisterClass *OpRegCstraints =
1621 Instr.getRegClassConstraint(OpIdx, TII, TRI);
1622
1623 // If there's no constraint, there's nothing to do.
1624 if (!OpRegCstraints)
1625 continue;
1626 // If the operand is a frame index, there's nothing to do here.
1627 // A frame index operand will resolve correctly during PEI.
1628 if (MO.isFI())
1629 continue;
1630
1631 assert(MO.isReg() &&
1632 "Operand has register constraints without being a register!");
1633
1634 Register Reg = MO.getReg();
1635 if (Reg.isPhysical()) {
1636 if (!OpRegCstraints->contains(Reg))
1637 return false;
1638 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1639 !MRI->constrainRegClass(Reg, OpRegCstraints))
1640 return false;
1641 }
1642
1643 return true;
1644}
1645
1646/// Return the opcode that does not set flags when possible - otherwise
1647/// return the original opcode. The caller is responsible to do the actual
1648/// substitution and legality checking.
1650 // Don't convert all compare instructions, because for some the zero register
1651 // encoding becomes the sp register.
1652 bool MIDefinesZeroReg = false;
1653 if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1654 MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
1655 MIDefinesZeroReg = true;
1656
1657 switch (MI.getOpcode()) {
1658 default:
1659 return MI.getOpcode();
1660 case AArch64::ADDSWrr:
1661 return AArch64::ADDWrr;
1662 case AArch64::ADDSWri:
1663 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1664 case AArch64::ADDSWrs:
1665 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1666 case AArch64::ADDSWrx:
1667 return AArch64::ADDWrx;
1668 case AArch64::ADDSXrr:
1669 return AArch64::ADDXrr;
1670 case AArch64::ADDSXri:
1671 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1672 case AArch64::ADDSXrs:
1673 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1674 case AArch64::ADDSXrx:
1675 return AArch64::ADDXrx;
1676 case AArch64::SUBSWrr:
1677 return AArch64::SUBWrr;
1678 case AArch64::SUBSWri:
1679 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1680 case AArch64::SUBSWrs:
1681 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1682 case AArch64::SUBSWrx:
1683 return AArch64::SUBWrx;
1684 case AArch64::SUBSXrr:
1685 return AArch64::SUBXrr;
1686 case AArch64::SUBSXri:
1687 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1688 case AArch64::SUBSXrs:
1689 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1690 case AArch64::SUBSXrx:
1691 return AArch64::SUBXrx;
1692 }
1693}
1694
// Bitmask selecting which kinds of NZCV accesses to look for; the values are
// chosen so that AK_All == (AK_Write | AK_Read).
enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1696
1697/// True when condition flags are accessed (either by writing or reading)
1698/// on the instruction trace starting at From and ending at To.
1699///
1700/// Note: If From and To are from different blocks it's assumed CC are accessed
1701/// on the path.
1704 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1705 // Early exit if To is at the beginning of the BB.
1706 if (To == To->getParent()->begin())
1707 return true;
1708
1709 // Check whether the instructions are in the same basic block
1710 // If not, assume the condition flags might get modified somewhere.
1711 if (To->getParent() != From->getParent())
1712 return true;
1713
1714 // From must be above To.
1715 assert(std::any_of(
1716 ++To.getReverse(), To->getParent()->rend(),
1717 [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1718
1719 // We iterate backward starting at \p To until we hit \p From.
1720 for (const MachineInstr &Instr :
1722 if (((AccessToCheck & AK_Write) &&
1723 Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1724 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1725 return true;
1726 }
1727 return false;
1728}
1729
/// Decide whether the PTEST \p PTest (whose governing predicate is defined by
/// \p Mask and whose tested predicate is defined by \p Pred) is redundant.
/// Returns the opcode \p Pred should use when the PTEST can be removed — this
/// may be a flag-setting variant of \p Pred's opcode — or std::nullopt when
/// the PTEST must be kept.
std::optional<unsigned>
AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
                                      MachineInstr *Pred,
                                      const MachineRegisterInfo *MRI) const {
  unsigned MaskOpcode = Mask->getOpcode();
  unsigned PredOpcode = Pred->getOpcode();
  bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
  bool PredIsWhileLike = isWhileOpcode(PredOpcode);

  if (PredIsWhileLike) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
    // instruction and the condition is "any" since WHILcc does an implicit
    // PTEST(ALL, PG) check and PG is always a subset of ALL.
    if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
      return PredOpcode;

    // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
    // redundant since WHILE performs an implicit PTEST with an all active
    // mask.
    if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
        getElementSizeForOpcode(MaskOpcode) ==
            getElementSizeForOpcode(PredOpcode))
      return PredOpcode;

    // For PTEST_FIRST(PTRUE_ALL, WHILE), the PTEST_FIRST is redundant since
    // WHILEcc performs an implicit PTEST with an all active mask, setting
    // the N flag as the PTEST_FIRST would.
    if (PTest->getOpcode() == AArch64::PTEST_PP_FIRST &&
        isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31)
      return PredOpcode;

    return {};
  }

  if (PredIsPTestLike) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
    // instruction that sets the flags as PTEST would and the condition is
    // "any" since PG is always a subset of the governing predicate of the
    // ptest-like instruction.
    if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
      return PredOpcode;

    auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());

    // If the PTEST like instruction's general predicate is not `Mask`, attempt
    // to look through a copy and try again. This is because some instructions
    // take a predicate whose register class is a subset of its result class.
    if (Mask != PTestLikeMask && PTestLikeMask->isFullCopy() &&
        PTestLikeMask->getOperand(1).getReg().isVirtual())
      PTestLikeMask =
          MRI->getUniqueVRegDef(PTestLikeMask->getOperand(1).getReg());

    // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
    // the element size matches and either the PTEST_LIKE instruction uses
    // the same all active mask or the condition is "any".
    if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
        getElementSizeForOpcode(MaskOpcode) ==
            getElementSizeForOpcode(PredOpcode)) {
      if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
        return PredOpcode;
    }

    // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
    // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
    // on 8-bit predicates like the PTEST. Otherwise, for instructions like
    // compare that also support 16/32/64-bit predicates, the implicit PTEST
    // performed by the compare could consider fewer lanes for these element
    // sizes.
    //
    // For example, consider
    //
    //   ptrue p0.b                    ; P0=1111-1111-1111-1111
    //   index z0.s, #0, #1            ; Z0=<0,1,2,3>
    //   index z1.s, #1, #1            ; Z1=<1,2,3,4>
    //   cmphi p1.s, p0/z, z1.s, z0.s  ; P1=0001-0001-0001-0001
    //                                 ;       ^ last active
    //   ptest p0, p1.b                ; P1=0001-0001-0001-0001
    //                                 ;     ^ last active
    //
    // where the compare generates a canonical all active 32-bit predicate
    // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
    // active flag, whereas the PTEST instruction with the same mask doesn't.
    // For PTEST_ANY this doesn't apply as the flags in this case would be
    // identical regardless of element size.
    uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
    if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
                                  PTest->getOpcode() == AArch64::PTEST_PP_ANY))
      return PredOpcode;

    return {};
  }

  // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
  // opcode so the PTEST becomes redundant.
  switch (PredOpcode) {
  case AArch64::AND_PPzPP:
  case AArch64::BIC_PPzPP:
  case AArch64::EOR_PPzPP:
  case AArch64::NAND_PPzPP:
  case AArch64::NOR_PPzPP:
  case AArch64::ORN_PPzPP:
  case AArch64::ORR_PPzPP:
  case AArch64::BRKA_PPzP:
  case AArch64::BRKPA_PPzPP:
  case AArch64::BRKB_PPzP:
  case AArch64::BRKPB_PPzPP:
  case AArch64::RDFFR_PPz: {
    // Check to see if our mask is the same. If not the resulting flag bits
    // may be different and we can't remove the ptest.
    auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
    if (Mask != PredMask)
      return {};
    break;
  }
  case AArch64::BRKN_PPzP: {
    // BRKN uses an all active implicit mask to set flags unlike the other
    // flag-setting instructions.
    // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
    if ((MaskOpcode != AArch64::PTRUE_B) ||
        (Mask->getOperand(1).getImm() != 31))
      return {};
    break;
  }
  case AArch64::PTRUE_B:
    // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
    break;
  default:
    // Bail out if we don't recognize the input
    return {};
  }

  return convertToFlagSettingOpc(PredOpcode);
}
1863
1864/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1865/// operation which could set the flags in an identical manner
1866bool AArch64InstrInfo::optimizePTestInstr(
1867 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1868 const MachineRegisterInfo *MRI) const {
1869 auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1870 auto *Pred = MRI->getUniqueVRegDef(PredReg);
1871
1872 if (Pred->isCopy() && PTest->getOpcode() == AArch64::PTEST_PP_FIRST) {
1873 // Instructions which return a multi-vector (e.g. WHILECC_x2) require copies
1874 // before the branch to extract each subregister.
1875 auto Op = Pred->getOperand(1);
1876 if (Op.isReg() && Op.getReg().isVirtual() &&
1877 Op.getSubReg() == AArch64::psub0)
1878 Pred = MRI->getUniqueVRegDef(Op.getReg());
1879 }
1880
1881 unsigned PredOpcode = Pred->getOpcode();
1882 auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1883 if (!NewOp)
1884 return false;
1885
1886 const TargetRegisterInfo *TRI = &getRegisterInfo();
1887
1888 // If another instruction between Pred and PTest accesses flags, don't remove
1889 // the ptest or update the earlier instruction to modify them.
1890 if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1891 return false;
1892
1893 // If we pass all the checks, it's safe to remove the PTEST and use the flags
1894 // as they are prior to PTEST. Sometimes this requires the tested PTEST
1895 // operand to be replaced with an equivalent instruction that also sets the
1896 // flags.
1897 PTest->eraseFromParent();
1898 if (*NewOp != PredOpcode) {
1899 Pred->setDesc(get(*NewOp));
1900 bool succeeded = UpdateOperandRegClass(*Pred);
1901 (void)succeeded;
1902 assert(succeeded && "Operands have incompatible register classes!");
1903 Pred->addRegisterDefined(AArch64::NZCV, TRI);
1904 }
1905
1906 // Ensure that the flags def is live.
1907 if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1908 unsigned i = 0, e = Pred->getNumOperands();
1909 for (; i != e; ++i) {
1910 MachineOperand &MO = Pred->getOperand(i);
1911 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1912 MO.setIsDead(false);
1913 break;
1914 }
1915 }
1916 }
1917 return true;
1918}
1919
1920/// Try to optimize a compare instruction. A compare instruction is an
1921/// instruction which produces AArch64::NZCV. It can be truly compare
1922/// instruction
1923/// when there are no uses of its destination register.
1924///
1925/// The following steps are tried in order:
1926/// 1. Convert CmpInstr into an unconditional version.
1927/// 2. Remove CmpInstr if above there is an instruction producing a needed
1928/// condition code or an instruction which can be converted into such an
1929/// instruction.
1930/// Only comparison with zero is supported.
1932 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1933 int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1934 assert(CmpInstr.getParent());
1935 assert(MRI);
1936
1937 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1938 int DeadNZCVIdx =
1939 CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
1940 if (DeadNZCVIdx != -1) {
1941 if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1942 CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
1943 CmpInstr.eraseFromParent();
1944 return true;
1945 }
1946 unsigned Opc = CmpInstr.getOpcode();
1947 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1948 if (NewOpc == Opc)
1949 return false;
1950 const MCInstrDesc &MCID = get(NewOpc);
1951 CmpInstr.setDesc(MCID);
1952 CmpInstr.removeOperand(DeadNZCVIdx);
1953 bool succeeded = UpdateOperandRegClass(CmpInstr);
1954 (void)succeeded;
1955 assert(succeeded && "Some operands reg class are incompatible!");
1956 return true;
1957 }
1958
1959 if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1960 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY ||
1961 CmpInstr.getOpcode() == AArch64::PTEST_PP_FIRST)
1962 return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
1963
1964 if (SrcReg2 != 0)
1965 return false;
1966
1967 // CmpInstr is a Compare instruction if destination register is not used.
1968 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1969 return false;
1970
1971 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
1972 return true;
1973 return (CmpValue == 0 || CmpValue == 1) &&
1974 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
1975}
1976
/// Get opcode of S version of Instr.
/// If Instr is S version its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
/// or we are not interested in it.
static unsigned sForm(MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return AArch64::INSTRUCTION_LIST_END;

  // Already flag-setting: the opcode is its own S form.
  case AArch64::ADDSWrr:
  case AArch64::ADDSWri:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXri:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::SUBSWrr:
  case AArch64::SUBSWri:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXri:
  case AArch64::SUBSXrx:
  case AArch64::ANDSWri:
  case AArch64::ANDSWrr:
  case AArch64::ANDSWrs:
  case AArch64::ANDSXri:
  case AArch64::ANDSXrr:
  case AArch64::ANDSXrs:
  case AArch64::BICSWrr:
  case AArch64::BICSXrr:
  case AArch64::BICSWrs:
  case AArch64::BICSXrs:
    return Instr.getOpcode();

  // Map each non-flag-setting opcode to its flag-setting (S) counterpart.
  case AArch64::ADDWrr:
    return AArch64::ADDSWrr;
  case AArch64::ADDWri:
    return AArch64::ADDSWri;
  case AArch64::ADDXrr:
    return AArch64::ADDSXrr;
  case AArch64::ADDXri:
    return AArch64::ADDSXri;
  case AArch64::ADDWrx:
    return AArch64::ADDSWrx;
  case AArch64::ADDXrx:
    return AArch64::ADDSXrx;
  case AArch64::ADCWr:
    return AArch64::ADCSWr;
  case AArch64::ADCXr:
    return AArch64::ADCSXr;
  case AArch64::SUBWrr:
    return AArch64::SUBSWrr;
  case AArch64::SUBWri:
    return AArch64::SUBSWri;
  case AArch64::SUBXrr:
    return AArch64::SUBSXrr;
  case AArch64::SUBXri:
    return AArch64::SUBSXri;
  case AArch64::SUBWrx:
    return AArch64::SUBSWrx;
  case AArch64::SUBXrx:
    return AArch64::SUBSXrx;
  case AArch64::SBCWr:
    return AArch64::SBCSWr;
  case AArch64::SBCXr:
    return AArch64::SBCSXr;
  case AArch64::ANDWri:
    return AArch64::ANDSWri;
  case AArch64::ANDXri:
    return AArch64::ANDSXri;
  case AArch64::ANDWrr:
    return AArch64::ANDSWrr;
  case AArch64::ANDWrs:
    return AArch64::ANDSWrs;
  case AArch64::ANDXrr:
    return AArch64::ANDSXrr;
  case AArch64::ANDXrs:
    return AArch64::ANDSXrs;
  case AArch64::BICWrr:
    return AArch64::BICSWrr;
  case AArch64::BICXrr:
    return AArch64::BICSXrr;
  case AArch64::BICWrs:
    return AArch64::BICSWrs;
  case AArch64::BICXrs:
    return AArch64::BICSXrs;
  }
}
2064
2065/// Check if AArch64::NZCV should be alive in successors of MBB.
2067 for (auto *BB : MBB->successors())
2068 if (BB->isLiveIn(AArch64::NZCV))
2069 return true;
2070 return false;
2071}
2072
2073/// \returns The condition code operand index for \p Instr if it is a branch
2074/// or select and -1 otherwise.
2075static int
2077 switch (Instr.getOpcode()) {
2078 default:
2079 return -1;
2080
2081 case AArch64::Bcc: {
2082 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2083 assert(Idx >= 2);
2084 return Idx - 2;
2085 }
2086
2087 case AArch64::CSINVWr:
2088 case AArch64::CSINVXr:
2089 case AArch64::CSINCWr:
2090 case AArch64::CSINCXr:
2091 case AArch64::CSELWr:
2092 case AArch64::CSELXr:
2093 case AArch64::CSNEGWr:
2094 case AArch64::CSNEGXr:
2095 case AArch64::FCSELSrrr:
2096 case AArch64::FCSELDrrr: {
2097 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2098 assert(Idx >= 1);
2099 return Idx - 1;
2100 }
2101 }
2102}
2103
2104/// Find a condition code used by the instruction.
2105/// Returns AArch64CC::Invalid if either the instruction does not use condition
2106/// codes or we don't optimize CmpInstr in the presence of such instructions.
2109 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
2110 Instr.getOperand(CCIdx).getImm())
2112}
2113
2116 UsedNZCV UsedFlags;
2117 switch (CC) {
2118 default:
2119 break;
2120
2121 case AArch64CC::EQ: // Z set
2122 case AArch64CC::NE: // Z clear
2123 UsedFlags.Z = true;
2124 break;
2125
2126 case AArch64CC::HI: // Z clear and C set
2127 case AArch64CC::LS: // Z set or C clear
2128 UsedFlags.Z = true;
2129 [[fallthrough]];
2130 case AArch64CC::HS: // C set
2131 case AArch64CC::LO: // C clear
2132 UsedFlags.C = true;
2133 break;
2134
2135 case AArch64CC::MI: // N set
2136 case AArch64CC::PL: // N clear
2137 UsedFlags.N = true;
2138 break;
2139
2140 case AArch64CC::VS: // V set
2141 case AArch64CC::VC: // V clear
2142 UsedFlags.V = true;
2143 break;
2144
2145 case AArch64CC::GT: // Z clear, N and V the same
2146 case AArch64CC::LE: // Z set, N and V differ
2147 UsedFlags.Z = true;
2148 [[fallthrough]];
2149 case AArch64CC::GE: // N and V the same
2150 case AArch64CC::LT: // N and V differ
2151 UsedFlags.N = true;
2152 UsedFlags.V = true;
2153 break;
2154 }
2155 return UsedFlags;
2156}
2157
2158/// \returns Conditions flags used after \p CmpInstr in its MachineBB if NZCV
2159/// flags are not alive in successors of the same \p CmpInstr and \p MI parent.
2160/// \returns std::nullopt otherwise.
2161///
2162/// Collect instructions using that flags in \p CCUseInstrs if provided.
2163std::optional<UsedNZCV>
2165 const TargetRegisterInfo &TRI,
2166 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
2167 MachineBasicBlock *CmpParent = CmpInstr.getParent();
2168 if (MI.getParent() != CmpParent)
2169 return std::nullopt;
2170
2171 if (areCFlagsAliveInSuccessors(CmpParent))
2172 return std::nullopt;
2173
2174 UsedNZCV NZCVUsedAfterCmp;
2176 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
2177 if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
2179 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
2180 return std::nullopt;
2181 NZCVUsedAfterCmp |= getUsedNZCV(CC);
2182 if (CCUseInstrs)
2183 CCUseInstrs->push_back(&Instr);
2184 }
2185 if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
2186 break;
2187 }
2188 return NZCVUsedAfterCmp;
2189}
2190
2191static bool isADDSRegImm(unsigned Opcode) {
2192 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
2193}
2194
2195static bool isSUBSRegImm(unsigned Opcode) {
2196 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
2197}
2198
2200 unsigned Opc = sForm(MI);
2201 switch (Opc) {
2202 case AArch64::ANDSWri:
2203 case AArch64::ANDSWrr:
2204 case AArch64::ANDSWrs:
2205 case AArch64::ANDSXri:
2206 case AArch64::ANDSXrr:
2207 case AArch64::ANDSXrs:
2208 case AArch64::BICSWrr:
2209 case AArch64::BICSXrr:
2210 case AArch64::BICSWrs:
2211 case AArch64::BICSXrs:
2212 return true;
2213 default:
2214 return false;
2215 }
2216}
2217
2218/// Check if CmpInstr can be substituted by MI.
2219///
2220/// CmpInstr can be substituted:
2221/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2222/// - and, MI and CmpInstr are from the same MachineBB
2223/// - and, condition flags are not alive in successors of the CmpInstr parent
2224/// - and, if MI opcode is the S form there must be no defs of flags between
2225/// MI and CmpInstr
2226/// or if MI opcode is not the S form there must be neither defs of flags
2227/// nor uses of flags between MI and CmpInstr.
2228/// - and, if C/V flags are not used after CmpInstr
2229/// or if N flag is used but MI produces poison value if signed overflow
2230/// occurs.
2232 const TargetRegisterInfo &TRI) {
2233 // NOTE this assertion guarantees that MI.getOpcode() is add or subtraction
2234 // that may or may not set flags.
2235 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
2236
2237 const unsigned CmpOpcode = CmpInstr.getOpcode();
2238 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
2239 return false;
2240
2241 assert((CmpInstr.getOperand(2).isImm() &&
2242 CmpInstr.getOperand(2).getImm() == 0) &&
2243 "Caller guarantees that CmpInstr compares with constant 0");
2244
2245 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
2246 if (!NZVCUsed || NZVCUsed->C)
2247 return false;
2248
2249 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
2250 // '%vreg = add ...' or '%vreg = sub ...'.
2251 // Condition flag V is used to indicate signed overflow.
2252 // 1) MI and CmpInstr set N and V to the same value.
2253 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
2254 // signed overflow occurs, so CmpInstr could still be simplified away.
2255 // Note that Ands and Bics instructions always clear the V flag.
2256 if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI))
2257 return false;
2258
2259 AccessKind AccessToCheck = AK_Write;
2260 if (sForm(MI) != MI.getOpcode())
2261 AccessToCheck = AK_All;
2262 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
2263}
2264
2265/// Substitute an instruction comparing to zero with another instruction
2266/// which produces needed condition flags.
2267///
2268/// Return true on success.
2269bool AArch64InstrInfo::substituteCmpToZero(
2270 MachineInstr &CmpInstr, unsigned SrcReg,
2271 const MachineRegisterInfo &MRI) const {
2272 // Get the unique definition of SrcReg.
2273 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2274 if (!MI)
2275 return false;
2276
2277 const TargetRegisterInfo &TRI = getRegisterInfo();
2278
2279 unsigned NewOpc = sForm(*MI);
2280 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
2281 return false;
2282
2283 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
2284 return false;
2285
2286 // Update the instruction to set NZCV.
2287 MI->setDesc(get(NewOpc));
2288 CmpInstr.eraseFromParent();
2290 (void)succeeded;
2291 assert(succeeded && "Some operands reg class are incompatible!");
2292 MI->addRegisterDefined(AArch64::NZCV, &TRI);
2293 return true;
2294}
2295
2296/// \returns True if \p CmpInstr can be removed.
2297///
2298/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
2299/// codes used in \p CCUseInstrs must be inverted.
2301 int CmpValue, const TargetRegisterInfo &TRI,
2303 bool &IsInvertCC) {
2304 assert((CmpValue == 0 || CmpValue == 1) &&
2305 "Only comparisons to 0 or 1 considered for removal!");
2306
2307 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
2308 unsigned MIOpc = MI.getOpcode();
2309 if (MIOpc == AArch64::CSINCWr) {
2310 if (MI.getOperand(1).getReg() != AArch64::WZR ||
2311 MI.getOperand(2).getReg() != AArch64::WZR)
2312 return false;
2313 } else if (MIOpc == AArch64::CSINCXr) {
2314 if (MI.getOperand(1).getReg() != AArch64::XZR ||
2315 MI.getOperand(2).getReg() != AArch64::XZR)
2316 return false;
2317 } else {
2318 return false;
2319 }
2321 if (MICC == AArch64CC::Invalid)
2322 return false;
2323
2324 // NZCV needs to be defined
2325 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
2326 return false;
2327
2328 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
2329 const unsigned CmpOpcode = CmpInstr.getOpcode();
2330 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
2331 if (CmpValue && !IsSubsRegImm)
2332 return false;
2333 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
2334 return false;
2335
2336 // MI conditions allowed: eq, ne, mi, pl
2337 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
2338 if (MIUsedNZCV.C || MIUsedNZCV.V)
2339 return false;
2340
2341 std::optional<UsedNZCV> NZCVUsedAfterCmp =
2342 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
2343 // Condition flags are not used in CmpInstr basic block successors and only
2344 // Z or N flags allowed to be used after CmpInstr within its basic block
2345 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
2346 return false;
2347 // Z or N flag used after CmpInstr must correspond to the flag used in MI
2348 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
2349 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
2350 return false;
2351 // If CmpInstr is comparison to zero MI conditions are limited to eq, ne
2352 if (MIUsedNZCV.N && !CmpValue)
2353 return false;
2354
2355 // There must be no defs of flags between MI and CmpInstr
2356 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
2357 return false;
2358
2359 // Condition code is inverted in the following cases:
2360 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2361 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
2362 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
2363 (!CmpValue && MICC == AArch64CC::NE);
2364 return true;
2365}
2366
2367/// Remove comparison in csinc-cmp sequence
2368///
2369/// Examples:
2370/// 1. \code
2371/// csinc w9, wzr, wzr, ne
2372/// cmp w9, #0
2373/// b.eq
2374/// \endcode
2375/// to
2376/// \code
2377/// csinc w9, wzr, wzr, ne
2378/// b.ne
2379/// \endcode
2380///
2381/// 2. \code
2382/// csinc x2, xzr, xzr, mi
2383/// cmp x2, #1
2384/// b.pl
2385/// \endcode
2386/// to
2387/// \code
2388/// csinc x2, xzr, xzr, mi
2389/// b.pl
2390/// \endcode
2391///
2392/// \param CmpInstr comparison instruction
2393/// \return True when comparison removed
2394bool AArch64InstrInfo::removeCmpToZeroOrOne(
2395 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
2396 const MachineRegisterInfo &MRI) const {
2397 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2398 if (!MI)
2399 return false;
2400 const TargetRegisterInfo &TRI = getRegisterInfo();
2401 SmallVector<MachineInstr *, 4> CCUseInstrs;
2402 bool IsInvertCC = false;
2403 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
2404 IsInvertCC))
2405 return false;
2406 // Make transformation
2407 CmpInstr.eraseFromParent();
2408 if (IsInvertCC) {
2409 // Invert condition codes in CmpInstr CC users
2410 for (MachineInstr *CCUseInstr : CCUseInstrs) {
2411 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
2412 assert(Idx >= 0 && "Unexpected instruction using CC.");
2413 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
2415 static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
2416 CCOperand.setImm(CCUse);
2417 }
2418 }
2419 return true;
2420}
2421
2422bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2423 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
2424 MI.getOpcode() != AArch64::CATCHRET)
2425 return false;
2426
2427 MachineBasicBlock &MBB = *MI.getParent();
2428 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
2429 auto TRI = Subtarget.getRegisterInfo();
2430 DebugLoc DL = MI.getDebugLoc();
2431
2432 if (MI.getOpcode() == AArch64::CATCHRET) {
2433 // Skip to the first instruction before the epilog.
2434 const TargetInstrInfo *TII =
2436 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
2438 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
2439 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
2440 FirstEpilogSEH != MBB.begin())
2441 FirstEpilogSEH = std::prev(FirstEpilogSEH);
2442 if (FirstEpilogSEH != MBB.begin())
2443 FirstEpilogSEH = std::next(FirstEpilogSEH);
2444 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
2445 .addReg(AArch64::X0, RegState::Define)
2446 .addMBB(TargetMBB);
2447 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
2448 .addReg(AArch64::X0, RegState::Define)
2449 .addReg(AArch64::X0)
2450 .addMBB(TargetMBB)
2451 .addImm(0);
2452 TargetMBB->setMachineBlockAddressTaken();
2453 return true;
2454 }
2455
2456 Register Reg = MI.getOperand(0).getReg();
2458 if (M.getStackProtectorGuard() == "sysreg") {
2459 const AArch64SysReg::SysReg *SrcReg =
2460 AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
2461 if (!SrcReg)
2462 report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
2463
2464 // mrs xN, sysreg
2465 BuildMI(MBB, MI, DL, get(AArch64::MRS))
2467 .addImm(SrcReg->Encoding);
2468 int Offset = M.getStackProtectorGuardOffset();
2469 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2470 // ldr xN, [xN, #offset]
2471 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2472 .addDef(Reg)
2474 .addImm(Offset / 8);
2475 } else if (Offset >= -256 && Offset <= 255) {
2476 // ldur xN, [xN, #offset]
2477 BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2478 .addDef(Reg)
2480 .addImm(Offset);
2481 } else if (Offset >= -4095 && Offset <= 4095) {
2482 if (Offset > 0) {
2483 // add xN, xN, #offset
2484 BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2485 .addDef(Reg)
2487 .addImm(Offset)
2488 .addImm(0);
2489 } else {
2490 // sub xN, xN, #offset
2491 BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2492 .addDef(Reg)
2494 .addImm(-Offset)
2495 .addImm(0);
2496 }
2497 // ldr xN, [xN]
2498 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2499 .addDef(Reg)
2501 .addImm(0);
2502 } else {
2503 // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2504 // than 23760.
2505 // It might be nice to use AArch64::MOVi32imm here, which would get
2506 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2507 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2508 // AArch64FrameLowering might help us find such a scratch register
2509 // though. If we failed to find a scratch register, we could emit a
2510 // stream of add instructions to build up the immediate. Or, we could try
2511 // to insert a AArch64::MOVi32imm before register allocation so that we
2512 // didn't need to scavenge for a scratch register.
2513 report_fatal_error("Unable to encode Stack Protector Guard Offset");
2514 }
2515 MBB.erase(MI);
2516 return true;
2517 }
2518
2519 const GlobalValue *GV =
2520 cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2521 const TargetMachine &TM = MBB.getParent()->getTarget();
2522 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2523 const unsigned char MO_NC = AArch64II::MO_NC;
2524
2525 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2526 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2527 .addGlobalAddress(GV, 0, OpFlags);
2528 if (Subtarget.isTargetILP32()) {
2529 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2530 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2531 .addDef(Reg32, RegState::Dead)
2533 .addImm(0)
2534 .addMemOperand(*MI.memoperands_begin())
2536 } else {
2537 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2539 .addImm(0)
2540 .addMemOperand(*MI.memoperands_begin());
2541 }
2542 } else if (TM.getCodeModel() == CodeModel::Large) {
2543 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2544 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2545 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2546 .addImm(0);
2547 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2549 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2550 .addImm(16);
2551 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2553 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2554 .addImm(32);
2555 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2558 .addImm(48);
2559 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2561 .addImm(0)
2562 .addMemOperand(*MI.memoperands_begin());
2563 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2564 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2565 .addGlobalAddress(GV, 0, OpFlags);
2566 } else {
2567 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2568 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2569 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2570 if (Subtarget.isTargetILP32()) {
2571 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2572 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2573 .addDef(Reg32, RegState::Dead)
2575 .addGlobalAddress(GV, 0, LoFlags)
2576 .addMemOperand(*MI.memoperands_begin())
2578 } else {
2579 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2581 .addGlobalAddress(GV, 0, LoFlags)
2582 .addMemOperand(*MI.memoperands_begin());
2583 }
2584 }
2585
2586 MBB.erase(MI);
2587
2588 return true;
2589}
2590
2591// Return true if this instruction simply sets its single destination register
2592// to zero. This is equivalent to a register rename of the zero-register.
2594 switch (MI.getOpcode()) {
2595 default:
2596 break;
2597 case AArch64::MOVZWi:
2598 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2599 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2600 assert(MI.getDesc().getNumOperands() == 3 &&
2601 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2602 return true;
2603 }
2604 break;
2605 case AArch64::ANDWri: // and Rd, Rzr, #imm
2606 return MI.getOperand(1).getReg() == AArch64::WZR;
2607 case AArch64::ANDXri:
2608 return MI.getOperand(1).getReg() == AArch64::XZR;
2609 case TargetOpcode::COPY:
2610 return MI.getOperand(1).getReg() == AArch64::WZR;
2611 }
2612 return false;
2613}
2614
2615// Return true if this instruction simply renames a general register without
2616// modifying bits.
2618 switch (MI.getOpcode()) {
2619 default:
2620 break;
2621 case TargetOpcode::COPY: {
2622 // GPR32 copies will by lowered to ORRXrs
2623 Register DstReg = MI.getOperand(0).getReg();
2624 return (AArch64::GPR32RegClass.contains(DstReg) ||
2625 AArch64::GPR64RegClass.contains(DstReg));
2626 }
2627 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2628 if (MI.getOperand(1).getReg() == AArch64::XZR) {
2629 assert(MI.getDesc().getNumOperands() == 4 &&
2630 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2631 return true;
2632 }
2633 break;
2634 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2635 if (MI.getOperand(2).getImm() == 0) {
2636 assert(MI.getDesc().getNumOperands() == 4 &&
2637 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2638 return true;
2639 }
2640 break;
2641 }
2642 return false;
2643}
2644
2645// Return true if this instruction simply renames a general register without
2646// modifying bits.
2648 switch (MI.getOpcode()) {
2649 default:
2650 break;
2651 case TargetOpcode::COPY: {
2652 Register DstReg = MI.getOperand(0).getReg();
2653 return AArch64::FPR128RegClass.contains(DstReg);
2654 }
2655 case AArch64::ORRv16i8:
2656 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2657 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2658 "invalid ORRv16i8 operands");
2659 return true;
2660 }
2661 break;
2662 }
2663 return false;
2664}
2665
2666static bool isFrameLoadOpcode(int Opcode) {
2667 switch (Opcode) {
2668 default:
2669 return false;
2670 case AArch64::LDRWui:
2671 case AArch64::LDRXui:
2672 case AArch64::LDRBui:
2673 case AArch64::LDRHui:
2674 case AArch64::LDRSui:
2675 case AArch64::LDRDui:
2676 case AArch64::LDRQui:
2677 case AArch64::LDR_PXI:
2678 return true;
2679 }
2680}
2681
2683 int &FrameIndex) const {
2684 if (!isFrameLoadOpcode(MI.getOpcode()))
2685 return Register();
2686
2687 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2688 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2689 FrameIndex = MI.getOperand(1).getIndex();
2690 return MI.getOperand(0).getReg();
2691 }
2692 return Register();
2693}
2694
2695static bool isFrameStoreOpcode(int Opcode) {
2696 switch (Opcode) {
2697 default:
2698 return false;
2699 case AArch64::STRWui:
2700 case AArch64::STRXui:
2701 case AArch64::STRBui:
2702 case AArch64::STRHui:
2703 case AArch64::STRSui:
2704 case AArch64::STRDui:
2705 case AArch64::STRQui:
2706 case AArch64::STR_PXI:
2707 return true;
2708 }
2709}
2710
2712 int &FrameIndex) const {
2713 if (!isFrameStoreOpcode(MI.getOpcode()))
2714 return Register();
2715
2716 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2717 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2718 FrameIndex = MI.getOperand(1).getIndex();
2719 return MI.getOperand(0).getReg();
2720 }
2721 return Register();
2722}
2723
2725 int &FrameIndex) const {
2726 if (!isFrameStoreOpcode(MI.getOpcode()))
2727 return Register();
2728
2729 if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
2730 return Reg;
2731
2733 if (hasStoreToStackSlot(MI, Accesses)) {
2734 if (Accesses.size() > 1)
2735 return Register();
2736
2737 FrameIndex =
2738 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2739 ->getFrameIndex();
2740 return MI.getOperand(0).getReg();
2741 }
2742 return Register();
2743}
2744
2746 int &FrameIndex) const {
2747 if (!isFrameLoadOpcode(MI.getOpcode()))
2748 return Register();
2749
2750 if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
2751 return Reg;
2752
2754 if (hasLoadFromStackSlot(MI, Accesses)) {
2755 if (Accesses.size() > 1)
2756 return Register();
2757
2758 FrameIndex =
2759 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2760 ->getFrameIndex();
2761 return MI.getOperand(0).getReg();
2762 }
2763 return Register();
2764}
2765
2766/// Check all MachineMemOperands for a hint to suppress pairing.
2768 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2769 return MMO->getFlags() & MOSuppressPair;
2770 });
2771}
2772
2773/// Set a flag on the first MachineMemOperand to suppress pairing.
2775 if (MI.memoperands_empty())
2776 return;
2777 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2778}
2779
2780/// Check all MachineMemOperands for a hint that the load/store is strided.
2782 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2783 return MMO->getFlags() & MOStridedAccess;
2784 });
2785}
2786
2788 switch (Opc) {
2789 default:
2790 return false;
2791 case AArch64::STURSi:
2792 case AArch64::STRSpre:
2793 case AArch64::STURDi:
2794 case AArch64::STRDpre:
2795 case AArch64::STURQi:
2796 case AArch64::STRQpre:
2797 case AArch64::STURBBi:
2798 case AArch64::STURHHi:
2799 case AArch64::STURWi:
2800 case AArch64::STRWpre:
2801 case AArch64::STURXi:
2802 case AArch64::STRXpre:
2803 case AArch64::LDURSi:
2804 case AArch64::LDRSpre:
2805 case AArch64::LDURDi:
2806 case AArch64::LDRDpre:
2807 case AArch64::LDURQi:
2808 case AArch64::LDRQpre:
2809 case AArch64::LDURWi:
2810 case AArch64::LDRWpre:
2811 case AArch64::LDURXi:
2812 case AArch64::LDRXpre:
2813 case AArch64::LDRSWpre:
2814 case AArch64::LDURSWi:
2815 case AArch64::LDURHHi:
2816 case AArch64::LDURBBi:
2817 case AArch64::LDURSBWi:
2818 case AArch64::LDURSHWi:
2819 return true;
2820 }
2821}
2822
/// Map a scaled-offset load/store opcode to its unscaled-offset counterpart
/// (e.g. LDRXui -> LDURXi). Returns std::nullopt for opcodes with no such
/// mapping here.
std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
  switch (Opc) {
  default: return {};
  case AArch64::PRFMui: return AArch64::PRFUMi;
  case AArch64::LDRXui: return AArch64::LDURXi;
  case AArch64::LDRWui: return AArch64::LDURWi;
  case AArch64::LDRBui: return AArch64::LDURBi;
  case AArch64::LDRHui: return AArch64::LDURHi;
  case AArch64::LDRSui: return AArch64::LDURSi;
  case AArch64::LDRDui: return AArch64::LDURDi;
  case AArch64::LDRQui: return AArch64::LDURQi;
  case AArch64::LDRBBui: return AArch64::LDURBBi;
  case AArch64::LDRHHui: return AArch64::LDURHHi;
  case AArch64::LDRSBXui: return AArch64::LDURSBXi;
  case AArch64::LDRSBWui: return AArch64::LDURSBWi;
  case AArch64::LDRSHXui: return AArch64::LDURSHXi;
  case AArch64::LDRSHWui: return AArch64::LDURSHWi;
  case AArch64::LDRSWui: return AArch64::LDURSWi;
  case AArch64::STRXui: return AArch64::STURXi;
  case AArch64::STRWui: return AArch64::STURWi;
  case AArch64::STRBui: return AArch64::STURBi;
  case AArch64::STRHui: return AArch64::STURHi;
  case AArch64::STRSui: return AArch64::STURSi;
  case AArch64::STRDui: return AArch64::STURDi;
  case AArch64::STRQui: return AArch64::STURQi;
  case AArch64::STRBBui: return AArch64::STURBBi;
  case AArch64::STRHHui: return AArch64::STURHHi;
  }
}
2852
2854 switch (Opc) {
2855 default:
2856 llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx");
2857 case AArch64::ADDG:
2858 case AArch64::LDAPURBi:
2859 case AArch64::LDAPURHi:
2860 case AArch64::LDAPURi:
2861 case AArch64::LDAPURSBWi:
2862 case AArch64::LDAPURSBXi:
2863 case AArch64::LDAPURSHWi:
2864 case AArch64::LDAPURSHXi:
2865 case AArch64::LDAPURSWi:
2866 case AArch64::LDAPURXi:
2867 case AArch64::LDR_PPXI:
2868 case AArch64::LDR_PXI:
2869 case AArch64::LDR_ZXI:
2870 case AArch64::LDR_ZZXI:
2871 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
2872 case AArch64::LDR_ZZZXI:
2873 case AArch64::LDR_ZZZZXI:
2874 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
2875 case AArch64::LDRBBui:
2876 case AArch64::LDRBui:
2877 case AArch64::LDRDui:
2878 case AArch64::LDRHHui:
2879 case AArch64::LDRHui:
2880 case AArch64::LDRQui:
2881 case AArch64::LDRSBWui:
2882 case AArch64::LDRSBXui:
2883 case AArch64::LDRSHWui:
2884 case AArch64::LDRSHXui:
2885 case AArch64::LDRSui:
2886 case AArch64::LDRSWui:
2887 case AArch64::LDRWui:
2888 case AArch64::LDRXui:
2889 case AArch64::LDURBBi:
2890 case AArch64::LDURBi:
2891 case AArch64::LDURDi:
2892 case AArch64::LDURHHi:
2893 case AArch64::LDURHi:
2894 case AArch64::LDURQi:
2895 case AArch64::LDURSBWi:
2896 case AArch64::LDURSBXi:
2897 case AArch64::LDURSHWi:
2898 case AArch64::LDURSHXi:
2899 case AArch64::LDURSi:
2900 case AArch64::LDURSWi:
2901 case AArch64::LDURWi:
2902 case AArch64::LDURXi:
2903 case AArch64::PRFMui:
2904 case AArch64::PRFUMi:
2905 case AArch64::ST2Gi:
2906 case AArch64::STGi:
2907 case AArch64::STLURBi:
2908 case AArch64::STLURHi:
2909 case AArch64::STLURWi:
2910 case AArch64::STLURXi:
2911 case AArch64::StoreSwiftAsyncContext:
2912 case AArch64::STR_PPXI:
2913 case AArch64::STR_PXI:
2914 case AArch64::STR_ZXI:
2915 case AArch64::STR_ZZXI:
2916 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
2917 case AArch64::STR_ZZZXI:
2918 case AArch64::STR_ZZZZXI:
2919 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
2920 case AArch64::STRBBui:
2921 case AArch64::STRBui:
2922 case AArch64::STRDui:
2923 case AArch64::STRHHui:
2924 case AArch64::STRHui:
2925 case AArch64::STRQui:
2926 case AArch64::STRSui:
2927 case AArch64::STRWui:
2928 case AArch64::STRXui:
2929 case AArch64::STURBBi:
2930 case AArch64::STURBi:
2931 case AArch64::STURDi:
2932 case AArch64::STURHHi:
2933 case AArch64::STURHi:
2934 case AArch64::STURQi:
2935 case AArch64::STURSi:
2936 case AArch64::STURWi:
2937 case AArch64::STURXi:
2938 case AArch64::STZ2Gi:
2939 case AArch64::STZGi:
2940 case AArch64::TAGPstack:
2941 return 2;
2942 case AArch64::LD1B_D_IMM:
2943 case AArch64::LD1B_H_IMM:
2944 case AArch64::LD1B_IMM:
2945 case AArch64::LD1B_S_IMM:
2946 case AArch64::LD1D_IMM:
2947 case AArch64::LD1H_D_IMM:
2948 case AArch64::LD1H_IMM:
2949 case AArch64::LD1H_S_IMM:
2950 case AArch64::LD1RB_D_IMM:
2951 case AArch64::LD1RB_H_IMM:
2952 case AArch64::LD1RB_IMM:
2953 case AArch64::LD1RB_S_IMM:
2954 case AArch64::LD1RD_IMM:
2955 case AArch64::LD1RH_D_IMM:
2956 case AArch64::LD1RH_IMM:
2957 case AArch64::LD1RH_S_IMM:
2958 case AArch64::LD1RSB_D_IMM:
2959 case AArch64::LD1RSB_H_IMM:
2960 case AArch64::LD1RSB_S_IMM:
2961 case AArch64::LD1RSH_D_IMM:
2962 case AArch64::LD1RSH_S_IMM:
2963 case AArch64::LD1RSW_IMM:
2964 case AArch64::LD1RW_D_IMM:
2965 case AArch64::LD1RW_IMM:
2966 case AArch64::LD1SB_D_IMM:
2967 case AArch64::LD1SB_H_IMM:
2968 case AArch64::LD1SB_S_IMM:
2969 case AArch64::LD1SH_D_IMM:
2970 case AArch64::LD1SH_S_IMM:
2971 case AArch64::LD1SW_D_IMM:
2972 case AArch64::LD1W_D_IMM:
2973 case AArch64::LD1W_IMM:
2974 case AArch64::LD2B_IMM:
2975 case AArch64::LD2D_IMM:
2976 case AArch64::LD2H_IMM:
2977 case AArch64::LD2W_IMM:
2978 case AArch64::LD3B_IMM:
2979 case AArch64::LD3D_IMM:
2980 case AArch64::LD3H_IMM:
2981 case AArch64::LD3W_IMM:
2982 case AArch64::LD4B_IMM:
2983 case AArch64::LD4D_IMM:
2984 case AArch64::LD4H_IMM:
2985 case AArch64::LD4W_IMM:
2986 case AArch64::LDG:
2987 case AArch64::LDNF1B_D_IMM:
2988 case AArch64::LDNF1B_H_IMM:
2989 case AArch64::LDNF1B_IMM:
2990 case AArch64::LDNF1B_S_IMM:
2991 case AArch64::LDNF1D_IMM:
2992 case AArch64::LDNF1H_D_IMM:
2993 case AArch64::LDNF1H_IMM:
2994 case AArch64::LDNF1H_S_IMM:
2995 case AArch64::LDNF1SB_D_IMM:
2996 case AArch64::LDNF1SB_H_IMM:
2997 case AArch64::LDNF1SB_S_IMM:
2998 case AArch64::LDNF1SH_D_IMM:
2999 case AArch64::LDNF1SH_S_IMM:
3000 case AArch64::LDNF1SW_D_IMM:
3001 case AArch64::LDNF1W_D_IMM:
3002 case AArch64::LDNF1W_IMM:
3003 case AArch64::LDNPDi:
3004 case AArch64::LDNPQi:
3005 case AArch64::LDNPSi:
3006 case AArch64::LDNPWi:
3007 case AArch64::LDNPXi:
3008 case AArch64::LDNT1B_ZRI:
3009 case AArch64::LDNT1D_ZRI:
3010 case AArch64::LDNT1H_ZRI:
3011 case AArch64::LDNT1W_ZRI:
3012 case AArch64::LDPDi:
3013 case AArch64::LDPQi:
3014 case AArch64::LDPSi:
3015 case AArch64::LDPWi:
3016 case AArch64::LDPXi:
3017 case AArch64::LDRBBpost:
3018 case AArch64::LDRBBpre:
3019 case AArch64::LDRBpost:
3020 case AArch64::LDRBpre:
3021 case AArch64::LDRDpost:
3022 case AArch64::LDRDpre:
3023 case AArch64::LDRHHpost:
3024 case AArch64::LDRHHpre:
3025 case AArch64::LDRHpost:
3026 case AArch64::LDRHpre:
3027 case AArch64::LDRQpost:
3028 case AArch64::LDRQpre:
3029 case AArch64::LDRSpost:
3030 case AArch64::LDRSpre:
3031 case AArch64::LDRWpost:
3032 case AArch64::LDRWpre:
3033 case AArch64::LDRXpost:
3034 case AArch64::LDRXpre:
3035 case AArch64::ST1B_D_IMM:
3036 case AArch64::ST1B_H_IMM:
3037 case AArch64::ST1B_IMM:
3038 case AArch64::ST1B_S_IMM:
3039 case AArch64::ST1D_IMM:
3040 case AArch64::ST1H_D_IMM:
3041 case AArch64::ST1H_IMM:
3042 case AArch64::ST1H_S_IMM:
3043 case AArch64::ST1W_D_IMM:
3044 case AArch64::ST1W_IMM:
3045 case AArch64::ST2B_IMM:
3046 case AArch64::ST2D_IMM:
3047 case AArch64::ST2H_IMM:
3048 case AArch64::ST2W_IMM:
3049 case AArch64::ST3B_IMM:
3050 case AArch64::ST3D_IMM:
3051 case AArch64::ST3H_IMM:
3052 case AArch64::ST3W_IMM:
3053 case AArch64::ST4B_IMM:
3054 case AArch64::ST4D_IMM:
3055 case AArch64::ST4H_IMM:
3056 case AArch64::ST4W_IMM:
3057 case AArch64::STGPi:
3058 case AArch64::STGPreIndex:
3059 case AArch64::STZGPreIndex:
3060 case AArch64::ST2GPreIndex:
3061 case AArch64::STZ2GPreIndex:
3062 case AArch64::STGPostIndex:
3063 case AArch64::STZGPostIndex:
3064 case AArch64::ST2GPostIndex:
3065 case AArch64::STZ2GPostIndex:
3066 case AArch64::STNPDi:
3067 case AArch64::STNPQi:
3068 case AArch64::STNPSi:
3069 case AArch64::STNPWi:
3070 case AArch64::STNPXi:
3071 case AArch64::STNT1B_ZRI:
3072 case AArch64::STNT1D_ZRI:
3073 case AArch64::STNT1H_ZRI:
3074 case AArch64::STNT1W_ZRI:
3075 case AArch64::STPDi:
3076 case AArch64::STPQi:
3077 case AArch64::STPSi:
3078 case AArch64::STPWi:
3079 case AArch64::STPXi:
3080 case AArch64::STRBBpost:
3081 case AArch64::STRBBpre:
3082 case AArch64::STRBpost:
3083 case AArch64::STRBpre:
3084 case AArch64::STRDpost:
3085 case AArch64::STRDpre:
3086 case AArch64::STRHHpost:
3087 case AArch64::STRHHpre:
3088 case AArch64::STRHpost:
3089 case AArch64::STRHpre:
3090 case AArch64::STRQpost:
3091 case AArch64::STRQpre:
3092 case AArch64::STRSpost:
3093 case AArch64::STRSpre:
3094 case AArch64::STRWpost:
3095 case AArch64::STRWpre:
3096 case AArch64::STRXpost:
3097 case AArch64::STRXpre:
3098 return 3;
3099 case AArch64::LDPDpost:
3100 case AArch64::LDPDpre:
3101 case AArch64::LDPQpost:
3102 case AArch64::LDPQpre:
3103 case AArch64::LDPSpost:
3104 case AArch64::LDPSpre:
3105 case AArch64::LDPWpost:
3106 case AArch64::LDPWpre:
3107 case AArch64::LDPXpost:
3108 case AArch64::LDPXpre:
3109 case AArch64::STGPpre:
3110 case AArch64::STGPpost:
3111 case AArch64::STPDpost:
3112 case AArch64::STPDpre:
3113 case AArch64::STPQpost:
3114 case AArch64::STPQpre:
3115 case AArch64::STPSpost:
3116 case AArch64::STPSpre:
3117 case AArch64::STPWpost:
3118 case AArch64::STPWpre:
3119 case AArch64::STPXpost:
3120 case AArch64::STPXpre:
3121 return 4;
3122 }
3123}
3124
3126 switch (MI.getOpcode()) {
3127 default:
3128 return false;
3129 // Scaled instructions.
3130 case AArch64::STRSui:
3131 case AArch64::STRDui:
3132 case AArch64::STRQui:
3133 case AArch64::STRXui:
3134 case AArch64::STRWui:
3135 case AArch64::LDRSui:
3136 case AArch64::LDRDui:
3137 case AArch64::LDRQui:
3138 case AArch64::LDRXui:
3139 case AArch64::LDRWui:
3140 case AArch64::LDRSWui:
3141 // Unscaled instructions.
3142 case AArch64::STURSi:
3143 case AArch64::STRSpre:
3144 case AArch64::STURDi:
3145 case AArch64::STRDpre:
3146 case AArch64::STURQi:
3147 case AArch64::STRQpre:
3148 case AArch64::STURWi:
3149 case AArch64::STRWpre:
3150 case AArch64::STURXi:
3151 case AArch64::STRXpre:
3152 case AArch64::LDURSi:
3153 case AArch64::LDRSpre:
3154 case AArch64::LDURDi:
3155 case AArch64::LDRDpre:
3156 case AArch64::LDURQi:
3157 case AArch64::LDRQpre:
3158 case AArch64::LDURWi:
3159 case AArch64::LDRWpre:
3160 case AArch64::LDURXi:
3161 case AArch64::LDRXpre:
3162 case AArch64::LDURSWi:
3163 case AArch64::LDRSWpre:
3164 // SVE instructions.
3165 case AArch64::LDR_ZXI:
3166 case AArch64::STR_ZXI:
3167 return true;
3168 }
3169}
3170
3172 switch (MI.getOpcode()) {
3173 default:
3174 assert((!MI.isCall() || !MI.isReturn()) &&
3175 "Unexpected instruction - was a new tail call opcode introduced?");
3176 return false;
3177 case AArch64::TCRETURNdi:
3178 case AArch64::TCRETURNri:
3179 case AArch64::TCRETURNrix16x17:
3180 case AArch64::TCRETURNrix17:
3181 case AArch64::TCRETURNrinotx16:
3182 case AArch64::TCRETURNriALL:
3183 case AArch64::AUTH_TCRETURN:
3184 case AArch64::AUTH_TCRETURN_BTI:
3185 return true;
3186 }
3187}
3188
3190 switch (Opc) {
3191 default:
3192 llvm_unreachable("Opcode has no flag setting equivalent!");
3193 // 32-bit cases:
3194 case AArch64::ADDWri:
3195 return AArch64::ADDSWri;
3196 case AArch64::ADDWrr:
3197 return AArch64::ADDSWrr;
3198 case AArch64::ADDWrs:
3199 return AArch64::ADDSWrs;
3200 case AArch64::ADDWrx:
3201 return AArch64::ADDSWrx;
3202 case AArch64::ANDWri:
3203 return AArch64::ANDSWri;
3204 case AArch64::ANDWrr:
3205 return AArch64::ANDSWrr;
3206 case AArch64::ANDWrs:
3207 return AArch64::ANDSWrs;
3208 case AArch64::BICWrr:
3209 return AArch64::BICSWrr;
3210 case AArch64::BICWrs:
3211 return AArch64::BICSWrs;
3212 case AArch64::SUBWri:
3213 return AArch64::SUBSWri;
3214 case AArch64::SUBWrr:
3215 return AArch64::SUBSWrr;
3216 case AArch64::SUBWrs:
3217 return AArch64::SUBSWrs;
3218 case AArch64::SUBWrx:
3219 return AArch64::SUBSWrx;
3220 // 64-bit cases:
3221 case AArch64::ADDXri:
3222 return AArch64::ADDSXri;
3223 case AArch64::ADDXrr:
3224 return AArch64::ADDSXrr;
3225 case AArch64::ADDXrs:
3226 return AArch64::ADDSXrs;
3227 case AArch64::ADDXrx:
3228 return AArch64::ADDSXrx;
3229 case AArch64::ANDXri:
3230 return AArch64::ANDSXri;
3231 case AArch64::ANDXrr:
3232 return AArch64::ANDSXrr;
3233 case AArch64::ANDXrs:
3234 return AArch64::ANDSXrs;
3235 case AArch64::BICXrr:
3236 return AArch64::BICSXrr;
3237 case AArch64::BICXrs:
3238 return AArch64::BICSXrs;
3239 case AArch64::SUBXri:
3240 return AArch64::SUBSXri;
3241 case AArch64::SUBXrr:
3242 return AArch64::SUBSXrr;
3243 case AArch64::SUBXrs:
3244 return AArch64::SUBSXrs;
3245 case AArch64::SUBXrx:
3246 return AArch64::SUBSXrx;
3247 // SVE instructions:
3248 case AArch64::AND_PPzPP:
3249 return AArch64::ANDS_PPzPP;
3250 case AArch64::BIC_PPzPP:
3251 return AArch64::BICS_PPzPP;
3252 case AArch64::EOR_PPzPP:
3253 return AArch64::EORS_PPzPP;
3254 case AArch64::NAND_PPzPP:
3255 return AArch64::NANDS_PPzPP;
3256 case AArch64::NOR_PPzPP:
3257 return AArch64::NORS_PPzPP;
3258 case AArch64::ORN_PPzPP:
3259 return AArch64::ORNS_PPzPP;
3260 case AArch64::ORR_PPzPP:
3261 return AArch64::ORRS_PPzPP;
3262 case AArch64::BRKA_PPzP:
3263 return AArch64::BRKAS_PPzP;
3264 case AArch64::BRKPA_PPzPP:
3265 return AArch64::BRKPAS_PPzPP;
3266 case AArch64::BRKB_PPzP:
3267 return AArch64::BRKBS_PPzP;
3268 case AArch64::BRKPB_PPzPP:
3269 return AArch64::BRKPBS_PPzPP;
3270 case AArch64::BRKN_PPzP:
3271 return AArch64::BRKNS_PPzP;
3272 case AArch64::RDFFR_PPz:
3273 return AArch64::RDFFRS_PPz;
3274 case AArch64::PTRUE_B:
3275 return AArch64::PTRUES_B;
3276 }
3277}
3278
3279// Is this a candidate for ld/st merging or pairing? For example, we don't
3280// touch volatiles or load/stores that have a hint to avoid pair formation.
3282
3283 bool IsPreLdSt = isPreLdSt(MI);
3284
3285 // If this is a volatile load/store, don't mess with it.
3286 if (MI.hasOrderedMemoryRef())
3287 return false;
3288
3289 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
3290 // For Pre-inc LD/ST, the operand is shifted by one.
3291 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
3292 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
3293 "Expected a reg or frame index operand.");
3294
3295 // For Pre-indexed addressing quadword instructions, the third operand is the
3296 // immediate value.
3297 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
3298
3299 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
3300 return false;
3301
3302 // Can't merge/pair if the instruction modifies the base register.
3303 // e.g., ldr x0, [x0]
3304 // This case will never occur with an FI base.
3305 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
3306 // STR<S,D,Q,W,X>pre, it can be merged.
3307 // For example:
3308 // ldr q0, [x11, #32]!
3309 // ldr q1, [x11, #16]
3310 // to
3311 // ldp q0, q1, [x11, #32]!
3312 if (MI.getOperand(1).isReg() && !IsPreLdSt) {
3313 Register BaseReg = MI.getOperand(1).getReg();
3315 if (MI.modifiesRegister(BaseReg, TRI))
3316 return false;
3317 }
3318
3319 // Pairing SVE fills/spills is only valid for little-endian targets that
3320 // implement VLS 128.
3321 switch (MI.getOpcode()) {
3322 default:
3323 break;
3324 case AArch64::LDR_ZXI:
3325 case AArch64::STR_ZXI:
3326 if (!Subtarget.isLittleEndian() ||
3327 Subtarget.getSVEVectorSizeInBits() != 128)
3328 return false;
3329 }
3330
3331 // Check if this load/store has a hint to avoid pair formation.
3332 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
3334 return false;
3335
3336 // Do not pair any callee-save store/reload instructions in the
3337 // prologue/epilogue if the CFI information encoded the operations as separate
3338 // instructions, as that will cause the size of the actual prologue to mismatch
3339 // with the prologue size recorded in the Windows CFI.
3340 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
3341 bool NeedsWinCFI = MAI->usesWindowsCFI() &&
3342 MI.getMF()->getFunction().needsUnwindTableEntry();
3343 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
3345 return false;
3346
3347 // On some CPUs quad load/store pairs are slower than two single load/stores.
3348 if (Subtarget.isPaired128Slow()) {
3349 switch (MI.getOpcode()) {
3350 default:
3351 break;
3352 case AArch64::LDURQi:
3353 case AArch64::STURQi:
3354 case AArch64::LDRQui:
3355 case AArch64::STRQui:
3356 return false;
3357 }
3358 }
3359
3360 return true;
3361}
3362
3365 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
3366 const TargetRegisterInfo *TRI) const {
3367 if (!LdSt.mayLoadOrStore())
3368 return false;
3369
3370 const MachineOperand *BaseOp;
3371 TypeSize WidthN(0, false);
3372 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
3373 WidthN, TRI))
3374 return false;
3375 // The maximum vscale is 16 under AArch64, return the maximal extent for the
3376 // vector.
3377 Width = LocationSize::precise(WidthN);
3378 BaseOps.push_back(BaseOp);
3379 return true;
3380}
3381
3382std::optional<ExtAddrMode>
3384 const TargetRegisterInfo *TRI) const {
3385 const MachineOperand *Base; // Filled with the base operand of MI.
3386 int64_t Offset; // Filled with the offset of MI.
3387 bool OffsetIsScalable;
3388 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
3389 return std::nullopt;
3390
3391 if (!Base->isReg())
3392 return std::nullopt;
3393 ExtAddrMode AM;
3394 AM.BaseReg = Base->getReg();
3395 AM.Displacement = Offset;
3396 AM.ScaledReg = 0;
3397 AM.Scale = 0;
3398 return AM;
3399}
3400
3402 Register Reg,
3403 const MachineInstr &AddrI,
3404 ExtAddrMode &AM) const {
3405 // Filter out instructions into which we cannot fold.
3406 unsigned NumBytes;
3407 int64_t OffsetScale = 1;
3408 switch (MemI.getOpcode()) {
3409 default:
3410 return false;
3411
3412 case AArch64::LDURQi:
3413 case AArch64::STURQi:
3414 NumBytes = 16;
3415 break;
3416
3417 case AArch64::LDURDi:
3418 case AArch64::STURDi:
3419 case AArch64::LDURXi:
3420 case AArch64::STURXi:
3421 NumBytes = 8;
3422 break;
3423
3424 case AArch64::LDURWi:
3425 case AArch64::LDURSWi:
3426 case AArch64::STURWi:
3427 NumBytes = 4;
3428 break;
3429
3430 case AArch64::LDURHi:
3431 case AArch64::STURHi:
3432 case AArch64::LDURHHi:
3433 case AArch64::STURHHi:
3434 case AArch64::LDURSHXi:
3435 case AArch64::LDURSHWi:
3436 NumBytes = 2;
3437 break;
3438
3439 case AArch64::LDRBroX:
3440 case AArch64::LDRBBroX:
3441 case AArch64::LDRSBXroX:
3442 case AArch64::LDRSBWroX:
3443 case AArch64::STRBroX:
3444 case AArch64::STRBBroX:
3445 case AArch64::LDURBi:
3446 case AArch64::LDURBBi:
3447 case AArch64::LDURSBXi:
3448 case AArch64::LDURSBWi:
3449 case AArch64::STURBi:
3450 case AArch64::STURBBi:
3451 case AArch64::LDRBui:
3452 case AArch64::LDRBBui:
3453 case AArch64::LDRSBXui:
3454 case AArch64::LDRSBWui:
3455 case AArch64::STRBui:
3456 case AArch64::STRBBui:
3457 NumBytes = 1;
3458 break;
3459
3460 case AArch64::LDRQroX:
3461 case AArch64::STRQroX:
3462 case AArch64::LDRQui:
3463 case AArch64::STRQui:
3464 NumBytes = 16;
3465 OffsetScale = 16;
3466 break;
3467
3468 case AArch64::LDRDroX:
3469 case AArch64::STRDroX:
3470 case AArch64::LDRXroX:
3471 case AArch64::STRXroX:
3472 case AArch64::LDRDui:
3473 case AArch64::STRDui:
3474 case AArch64::LDRXui:
3475 case AArch64::STRXui:
3476 NumBytes = 8;
3477 OffsetScale = 8;
3478 break;
3479
3480 case AArch64::LDRWroX:
3481 case AArch64::LDRSWroX:
3482 case AArch64::STRWroX:
3483 case AArch64::LDRWui:
3484 case AArch64::LDRSWui:
3485 case AArch64::STRWui:
3486 NumBytes = 4;
3487 OffsetScale = 4;
3488 break;
3489
3490 case AArch64::LDRHroX:
3491 case AArch64::STRHroX:
3492 case AArch64::LDRHHroX:
3493 case AArch64::STRHHroX:
3494 case AArch64::LDRSHXroX:
3495 case AArch64::LDRSHWroX:
3496 case AArch64::LDRHui:
3497 case AArch64::STRHui:
3498 case AArch64::LDRHHui:
3499 case AArch64::STRHHui:
3500 case AArch64::LDRSHXui:
3501 case AArch64::LDRSHWui:
3502 NumBytes = 2;
3503 OffsetScale = 2;
3504 break;
3505 }
3506
3507 // Check the fold operand is not the loaded/stored value.
3508 const MachineOperand &BaseRegOp = MemI.getOperand(0);
3509 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
3510 return false;
3511
3512 // Handle memory instructions with a [Reg, Reg] addressing mode.
3513 if (MemI.getOperand(2).isReg()) {
3514 // Bail if the addressing mode already includes extension of the offset
3515 // register.
3516 if (MemI.getOperand(3).getImm())
3517 return false;
3518
3519 // Check if we actually have a scaled offset.
3520 if (MemI.getOperand(4).getImm() == 0)
3521 OffsetScale = 1;
3522
3523 // If the address instructions is folded into the base register, then the
3524 // addressing mode must not have a scale. Then we can swap the base and the
3525 // scaled registers.
3526 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
3527 return false;
3528
3529 switch (AddrI.getOpcode()) {
3530 default:
3531 return false;
3532
3533 case AArch64::SBFMXri:
3534 // sxtw Xa, Wm
3535 // ldr Xd, [Xn, Xa, lsl #N]
3536 // ->
3537 // ldr Xd, [Xn, Wm, sxtw #N]
3538 if (AddrI.getOperand(2).getImm() != 0 ||
3539 AddrI.getOperand(3).getImm() != 31)
3540 return false;
3541
3542 AM.BaseReg = MemI.getOperand(1).getReg();
3543 if (AM.BaseReg == Reg)
3544 AM.BaseReg = MemI.getOperand(2).getReg();
3545 AM.ScaledReg = AddrI.getOperand(1).getReg();
3546 AM.Scale = OffsetScale;
3547 AM.Displacement = 0;
3549 return true;
3550
3551 case TargetOpcode::SUBREG_TO_REG: {
3552 // mov Wa, Wm
3553 // ldr Xd, [Xn, Xa, lsl #N]
3554 // ->
3555 // ldr Xd, [Xn, Wm, uxtw #N]
3556
3557 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
3558 if (AddrI.getOperand(2).getImm() != AArch64::sub_32)
3559 return false;
3560
3561 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
3562 Register OffsetReg = AddrI.getOperand(1).getReg();
3563 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
3564 return false;
3565
3566 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
3567 if (DefMI.getOpcode() != AArch64::ORRWrs ||
3568 DefMI.getOperand(1).getReg() != AArch64::WZR ||
3569 DefMI.getOperand(3).getImm() != 0)
3570 return false;
3571
3572 AM.BaseReg = MemI.getOperand(1).getReg();
3573 if (AM.BaseReg == Reg)
3574 AM.BaseReg = MemI.getOperand(2).getReg();
3575 AM.ScaledReg = DefMI.getOperand(2).getReg();
3576 AM.Scale = OffsetScale;
3577 AM.Displacement = 0;
3579 return true;
3580 }
3581 }
3582 }
3583
3584 // Handle memory instructions with a [Reg, #Imm] addressing mode.
3585
3586 // Check we are not breaking a potential conversion to an LDP.
3587 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
3588 int64_t NewOffset) -> bool {
3589 int64_t MinOffset, MaxOffset;
3590 switch (NumBytes) {
3591 default:
3592 return true;
3593 case 4:
3594 MinOffset = -256;
3595 MaxOffset = 252;
3596 break;
3597 case 8:
3598 MinOffset = -512;
3599 MaxOffset = 504;
3600 break;
3601 case 16:
3602 MinOffset = -1024;
3603 MaxOffset = 1008;
3604 break;
3605 }
3606 return OldOffset < MinOffset || OldOffset > MaxOffset ||
3607 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
3608 };
3609 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
3610 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
3611 int64_t NewOffset = OldOffset + Disp;
3612 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
3613 return false;
3614 // If the old offset would fit into an LDP, but the new offset wouldn't,
3615 // bail out.
3616 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
3617 return false;
3618 AM.BaseReg = AddrI.getOperand(1).getReg();
3619 AM.ScaledReg = 0;
3620 AM.Scale = 0;
3621 AM.Displacement = NewOffset;
3623 return true;
3624 };
3625
3626 auto canFoldAddRegIntoAddrMode =
3627 [&](int64_t Scale,
3629 if (MemI.getOperand(2).getImm() != 0)
3630 return false;
3631 if ((unsigned)Scale != Scale)
3632 return false;
3633 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
3634 return false;
3635 AM.BaseReg = AddrI.getOperand(1).getReg();
3636 AM.ScaledReg = AddrI.getOperand(2).getReg();
3637 AM.Scale = Scale;
3638 AM.Displacement = 0;
3639 AM.Form = Form;
3640 return true;
3641 };
3642
3643 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
3644 unsigned Opcode = MemI.getOpcode();
3645 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
3646 Subtarget.isSTRQroSlow();
3647 };
3648
3649 int64_t Disp = 0;
3650 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
3651 switch (AddrI.getOpcode()) {
3652 default:
3653 return false;
3654
3655 case AArch64::ADDXri:
3656 // add Xa, Xn, #N
3657 // ldr Xd, [Xa, #M]
3658 // ->
3659 // ldr Xd, [Xn, #N'+M]
3660 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3661 return canFoldAddSubImmIntoAddrMode(Disp);
3662
3663 case AArch64::SUBXri:
3664 // sub Xa, Xn, #N
3665 // ldr Xd, [Xa, #M]
3666 // ->
3667 // ldr Xd, [Xn, #N'+M]
3668 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3669 return canFoldAddSubImmIntoAddrMode(-Disp);
3670
3671 case AArch64::ADDXrs: {
3672 // add Xa, Xn, Xm, lsl #N
3673 // ldr Xd, [Xa]
3674 // ->
3675 // ldr Xd, [Xn, Xm, lsl #N]
3676
3677 // Don't fold the add if the result would be slower, unless optimising for
3678 // size.
3679 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3681 return false;
3682 Shift = AArch64_AM::getShiftValue(Shift);
3683 if (!OptSize) {
3684 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3685 return false;
3686 if (avoidSlowSTRQ(MemI))
3687 return false;
3688 }
3689 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3690 }
3691
3692 case AArch64::ADDXrr:
3693 // add Xa, Xn, Xm
3694 // ldr Xd, [Xa]
3695 // ->
3696 // ldr Xd, [Xn, Xm, lsl #0]
3697
3698 // Don't fold the add if the result would be slower, unless optimising for
3699 // size.
3700 if (!OptSize && avoidSlowSTRQ(MemI))
3701 return false;
3702 return canFoldAddRegIntoAddrMode(1);
3703
3704 case AArch64::ADDXrx:
3705 // add Xa, Xn, Wm, {s,u}xtw #N
3706 // ldr Xd, [Xa]
3707 // ->
3708 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3709
3710 // Don't fold the add if the result would be slower, unless optimising for
3711 // size.
3712 if (!OptSize && avoidSlowSTRQ(MemI))
3713 return false;
3714
3715 // Can fold only sign-/zero-extend of a word.
3716 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3718 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3719 return false;
3720
3721 return canFoldAddRegIntoAddrMode(
3722 1ULL << AArch64_AM::getArithShiftValue(Imm),
3725 }
3726}
3727
// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
// return the opcode of an instruction performing the same operation, but using
// the [Reg, Reg] addressing mode.
// Both the scaled (LDR*ui/STR*ui) and unscaled (LDUR*i/STUR*i) immediate forms
// map to the same register-offset (ro) opcode. The default is unreachable:
// callers only pass opcodes accepted by the address-folding code above.
static unsigned regOffsetOpcode(unsigned Opcode) {
  switch (Opcode) {
  default:
    llvm_unreachable("Address folding not implemented for instruction");

  case AArch64::LDURQi:
  case AArch64::LDRQui:
    return AArch64::LDRQroX;
  case AArch64::STURQi:
  case AArch64::STRQui:
    return AArch64::STRQroX;
  case AArch64::LDURDi:
  case AArch64::LDRDui:
    return AArch64::LDRDroX;
  case AArch64::STURDi:
  case AArch64::STRDui:
    return AArch64::STRDroX;
  case AArch64::LDURXi:
  case AArch64::LDRXui:
    return AArch64::LDRXroX;
  case AArch64::STURXi:
  case AArch64::STRXui:
    return AArch64::STRXroX;
  case AArch64::LDURWi:
  case AArch64::LDRWui:
    return AArch64::LDRWroX;
  case AArch64::LDURSWi:
  case AArch64::LDRSWui:
    return AArch64::LDRSWroX;
  case AArch64::STURWi:
  case AArch64::STRWui:
    return AArch64::STRWroX;
  case AArch64::LDURHi:
  case AArch64::LDRHui:
    return AArch64::LDRHroX;
  case AArch64::STURHi:
  case AArch64::STRHui:
    return AArch64::STRHroX;
  case AArch64::LDURHHi:
  case AArch64::LDRHHui:
    return AArch64::LDRHHroX;
  case AArch64::STURHHi:
  case AArch64::STRHHui:
    return AArch64::STRHHroX;
  case AArch64::LDURSHXi:
  case AArch64::LDRSHXui:
    return AArch64::LDRSHXroX;
  case AArch64::LDURSHWi:
  case AArch64::LDRSHWui:
    return AArch64::LDRSHWroX;
  case AArch64::LDURBi:
  case AArch64::LDRBui:
    return AArch64::LDRBroX;
  case AArch64::LDURBBi:
  case AArch64::LDRBBui:
    return AArch64::LDRBBroX;
  case AArch64::LDURSBXi:
  case AArch64::LDRSBXui:
    return AArch64::LDRSBXroX;
  case AArch64::LDURSBWi:
  case AArch64::LDRSBWui:
    return AArch64::LDRSBWroX;
  case AArch64::STURBi:
  case AArch64::STRBui:
    return AArch64::STRBroX;
  case AArch64::STURBBi:
  case AArch64::STRBBui:
    return AArch64::STRBBroX;
  }
}
3801
3802// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3803// the opcode of an instruction performing the same operation, but using the
3804// [Reg, #Imm] addressing mode with scaled offset.
3805unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3806 switch (Opcode) {
3807 default:
3808 llvm_unreachable("Address folding not implemented for instruction");
3809
3810 case AArch64::LDURQi:
3811 Scale = 16;
3812 return AArch64::LDRQui;
3813 case AArch64::STURQi:
3814 Scale = 16;
3815 return AArch64::STRQui;
3816 case AArch64::LDURDi:
3817 Scale = 8;
3818 return AArch64::LDRDui;
3819 case AArch64::STURDi:
3820 Scale = 8;
3821 return AArch64::STRDui;
3822 case AArch64::LDURXi:
3823 Scale = 8;
3824 return AArch64::LDRXui;
3825 case AArch64::STURXi:
3826 Scale = 8;
3827 return AArch64::STRXui;
3828 case AArch64::LDURWi:
3829 Scale = 4;
3830 return AArch64::LDRWui;
3831 case AArch64::LDURSWi:
3832 Scale = 4;
3833 return AArch64::LDRSWui;
3834 case AArch64::STURWi:
3835 Scale = 4;
3836 return AArch64::STRWui;
3837 case AArch64::LDURHi:
3838 Scale = 2;
3839 return AArch64::LDRHui;
3840 case AArch64::STURHi:
3841 Scale = 2;
3842 return AArch64::STRHui;
3843 case AArch64::LDURHHi:
3844 Scale = 2;
3845 return AArch64::LDRHHui;
3846 case AArch64::STURHHi:
3847 Scale = 2;
3848 return AArch64::STRHHui;
3849 case AArch64::LDURSHXi:
3850 Scale = 2;
3851 return AArch64::LDRSHXui;
3852 case AArch64::LDURSHWi:
3853 Scale = 2;
3854 return AArch64::LDRSHWui;
3855 case AArch64::LDURBi:
3856 Scale = 1;
3857 return AArch64::LDRBui;
3858 case AArch64::LDURBBi:
3859 Scale = 1;
3860 return AArch64::LDRBBui;
3861 case AArch64::LDURSBXi:
3862 Scale = 1;
3863 return AArch64::LDRSBXui;
3864 case AArch64::LDURSBWi:
3865 Scale = 1;
3866 return AArch64::LDRSBWui;
3867 case AArch64::STURBi:
3868 Scale = 1;
3869 return AArch64::STRBui;
3870 case AArch64::STURBBi:
3871 Scale = 1;
3872 return AArch64::STRBBui;
3873 case AArch64::LDRQui:
3874 case AArch64::STRQui:
3875 Scale = 16;
3876 return Opcode;
3877 case AArch64::LDRDui:
3878 case AArch64::STRDui:
3879 case AArch64::LDRXui:
3880 case AArch64::STRXui:
3881 Scale = 8;
3882 return Opcode;
3883 case AArch64::LDRWui:
3884 case AArch64::LDRSWui:
3885 case AArch64::STRWui:
3886 Scale = 4;
3887 return Opcode;
3888 case AArch64::LDRHui:
3889 case AArch64::STRHui:
3890 case AArch64::LDRHHui:
3891 case AArch64::STRHHui:
3892 case AArch64::LDRSHXui:
3893 case AArch64::LDRSHWui:
3894 Scale = 2;
3895 return Opcode;
3896 case AArch64::LDRBui:
3897 case AArch64::LDRBBui:
3898 case AArch64::LDRSBXui:
3899 case AArch64::LDRSBWui:
3900 case AArch64::STRBui:
3901 case AArch64::STRBBui:
3902 Scale = 1;
3903 return Opcode;
3904 }
3905}
3906
3907// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3908// the opcode of an instruction performing the same operation, but using the
3909// [Reg, #Imm] addressing mode with unscaled offset.
3910unsigned unscaledOffsetOpcode(unsigned Opcode) {
3911 switch (Opcode) {
3912 default:
3913 llvm_unreachable("Address folding not implemented for instruction");
3914
3915 case AArch64::LDURQi:
3916 case AArch64::STURQi:
3917 case AArch64::LDURDi:
3918 case AArch64::STURDi:
3919 case AArch64::LDURXi:
3920 case AArch64::STURXi:
3921 case AArch64::LDURWi:
3922 case AArch64::LDURSWi:
3923 case AArch64::STURWi:
3924 case AArch64::LDURHi:
3925 case AArch64::STURHi:
3926 case AArch64::LDURHHi:
3927 case AArch64::STURHHi:
3928 case AArch64::LDURSHXi:
3929 case AArch64::LDURSHWi:
3930 case AArch64::LDURBi:
3931 case AArch64::STURBi:
3932 case AArch64::LDURBBi:
3933 case AArch64::STURBBi:
3934 case AArch64::LDURSBWi:
3935 case AArch64::LDURSBXi:
3936 return Opcode;
3937 case AArch64::LDRQui:
3938 return AArch64::LDURQi;
3939 case AArch64::STRQui:
3940 return AArch64::STURQi;
3941 case AArch64::LDRDui:
3942 return AArch64::LDURDi;
3943 case AArch64::STRDui:
3944 return AArch64::STURDi;
3945 case AArch64::LDRXui:
3946 return AArch64::LDURXi;
3947 case AArch64::STRXui:
3948 return AArch64::STURXi;
3949 case AArch64::LDRWui:
3950 return AArch64::LDURWi;
3951 case AArch64::LDRSWui:
3952 return AArch64::LDURSWi;
3953 case AArch64::STRWui:
3954 return AArch64::STURWi;
3955 case AArch64::LDRHui:
3956 return AArch64::LDURHi;
3957 case AArch64::STRHui:
3958 return AArch64::STURHi;
3959 case AArch64::LDRHHui:
3960 return AArch64::LDURHHi;
3961 case AArch64::STRHHui:
3962 return AArch64::STURHHi;
3963 case AArch64::LDRSHXui:
3964 return AArch64::LDURSHXi;
3965 case AArch64::LDRSHWui:
3966 return AArch64::LDURSHWi;
3967 case AArch64::LDRBBui:
3968 return AArch64::LDURBBi;
3969 case AArch64::LDRBui:
3970 return AArch64::LDURBi;
3971 case AArch64::STRBBui:
3972 return AArch64::STURBBi;
3973 case AArch64::STRBui:
3974 return AArch64::STURBi;
3975 case AArch64::LDRSBWui:
3976 return AArch64::LDURSBWi;
3977 case AArch64::LDRSBXui:
3978 return AArch64::LDURSBXi;
3979 }
3980}
3981
// Given the opcode of a memory load/store instruction, return the opcode of an
// instruction performing the same operation, but using
// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
// offset register.
// Accepts register-offset (roX), unscaled-immediate (LDUR*/STUR*), and
// scaled-immediate (LDR*ui/STR*ui) forms; all three map to the same roW
// opcode. The default is unreachable: callers only pass opcodes accepted by
// the address-folding code above.
static unsigned offsetExtendOpcode(unsigned Opcode) {
  switch (Opcode) {
  default:
    llvm_unreachable("Address folding not implemented for instruction");

  case AArch64::LDRQroX:
  case AArch64::LDURQi:
  case AArch64::LDRQui:
    return AArch64::LDRQroW;
  case AArch64::STRQroX:
  case AArch64::STURQi:
  case AArch64::STRQui:
    return AArch64::STRQroW;
  case AArch64::LDRDroX:
  case AArch64::LDURDi:
  case AArch64::LDRDui:
    return AArch64::LDRDroW;
  case AArch64::STRDroX:
  case AArch64::STURDi:
  case AArch64::STRDui:
    return AArch64::STRDroW;
  case AArch64::LDRXroX:
  case AArch64::LDURXi:
  case AArch64::LDRXui:
    return AArch64::LDRXroW;
  case AArch64::STRXroX:
  case AArch64::STURXi:
  case AArch64::STRXui:
    return AArch64::STRXroW;
  case AArch64::LDRWroX:
  case AArch64::LDURWi:
  case AArch64::LDRWui:
    return AArch64::LDRWroW;
  case AArch64::LDRSWroX:
  case AArch64::LDURSWi:
  case AArch64::LDRSWui:
    return AArch64::LDRSWroW;
  case AArch64::STRWroX:
  case AArch64::STURWi:
  case AArch64::STRWui:
    return AArch64::STRWroW;
  case AArch64::LDRHroX:
  case AArch64::LDURHi:
  case AArch64::LDRHui:
    return AArch64::LDRHroW;
  case AArch64::STRHroX:
  case AArch64::STURHi:
  case AArch64::STRHui:
    return AArch64::STRHroW;
  case AArch64::LDRHHroX:
  case AArch64::LDURHHi:
  case AArch64::LDRHHui:
    return AArch64::LDRHHroW;
  case AArch64::STRHHroX:
  case AArch64::STURHHi:
  case AArch64::STRHHui:
    return AArch64::STRHHroW;
  case AArch64::LDRSHXroX:
  case AArch64::LDURSHXi:
  case AArch64::LDRSHXui:
    return AArch64::LDRSHXroW;
  case AArch64::LDRSHWroX:
  case AArch64::LDURSHWi:
  case AArch64::LDRSHWui:
    return AArch64::LDRSHWroW;
  case AArch64::LDRBroX:
  case AArch64::LDURBi:
  case AArch64::LDRBui:
    return AArch64::LDRBroW;
  case AArch64::LDRBBroX:
  case AArch64::LDURBBi:
  case AArch64::LDRBBui:
    return AArch64::LDRBBroW;
  case AArch64::LDRSBXroX:
  case AArch64::LDURSBXi:
  case AArch64::LDRSBXui:
    return AArch64::LDRSBXroW;
  case AArch64::LDRSBWroX:
  case AArch64::LDURSBWi:
  case AArch64::LDRSBWui:
    return AArch64::LDRSBWroW;
  case AArch64::STRBroX:
  case AArch64::STURBi:
  case AArch64::STRBui:
    return AArch64::STRBroW;
  case AArch64::STRBBroX:
  case AArch64::STURBBi:
  case AArch64::STRBBui:
    return AArch64::STRBBroW;
  }
}
4077
4079 const ExtAddrMode &AM) const {
4080
4081 const DebugLoc &DL = MemI.getDebugLoc();
4082 MachineBasicBlock &MBB = *MemI.getParent();
4084
4086 if (AM.ScaledReg) {
4087 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
4088 unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
4089 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4090 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
4091 .addReg(MemI.getOperand(0).getReg(),
4092 getDefRegState(MemI.mayLoad()))
4093 .addReg(AM.BaseReg)
4094 .addReg(AM.ScaledReg)
4095 .addImm(0)
4096 .addImm(AM.Scale > 1)
4097 .setMemRefs(MemI.memoperands())
4098 .setMIFlags(MemI.getFlags());
4099 return B.getInstr();
4100 }
4101
4102 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
4103 "Addressing mode not supported for folding");
4104
4105 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
4106 unsigned Scale = 1;
4107 unsigned Opcode = MemI.getOpcode();
4108 if (isInt<9>(AM.Displacement))
4109 Opcode = unscaledOffsetOpcode(Opcode);
4110 else
4111 Opcode = scaledOffsetOpcode(Opcode, Scale);
4112
4113 auto B =
4114 BuildMI(MBB, MemI, DL, get(Opcode))
4115 .addReg(MemI.getOperand(0).getReg(), getDefRegState(MemI.mayLoad()))
4116 .addReg(AM.BaseReg)
4117 .addImm(AM.Displacement / Scale)
4118 .setMemRefs(MemI.memoperands())
4119 .setMIFlags(MemI.getFlags());
4120 return B.getInstr();
4121 }
4122
4125 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
4126 assert(AM.ScaledReg && !AM.Displacement &&
4127 "Address offset can be a register or an immediate, but not both");
4128 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
4129 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4130 // Make sure the offset register is in the correct register class.
4131 Register OffsetReg = AM.ScaledReg;
4132 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
4133 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
4134 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
4135 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
4136 .addReg(AM.ScaledReg, {}, AArch64::sub_32);
4137 }
4138 auto B =
4139 BuildMI(MBB, MemI, DL, get(Opcode))
4140 .addReg(MemI.getOperand(0).getReg(), getDefRegState(MemI.mayLoad()))
4141 .addReg(AM.BaseReg)
4142 .addReg(OffsetReg)
4144 .addImm(AM.Scale != 1)
4145 .setMemRefs(MemI.memoperands())
4146 .setMIFlags(MemI.getFlags());
4147
4148 return B.getInstr();
4149 }
4150
4152 "Function must not be called with an addressing mode it can't handle");
4153}
4154
/// Return true if \p Opcode is a post-index load/store instruction. Such an
/// instruction accesses memory at base+0 and only afterwards updates the base
/// register by the immediate/register increment, so for address analysis its
/// effective memory offset is 0.
static bool isPostIndexLdStOpcode(unsigned Opcode) {
  switch (Opcode) {
  default:
    return false;
  case AArch64::LD1Fourv16b_POST:
  case AArch64::LD1Fourv1d_POST:
  case AArch64::LD1Fourv2d_POST:
  case AArch64::LD1Fourv2s_POST:
  case AArch64::LD1Fourv4h_POST:
  case AArch64::LD1Fourv4s_POST:
  case AArch64::LD1Fourv8b_POST:
  case AArch64::LD1Fourv8h_POST:
  case AArch64::LD1Onev16b_POST:
  case AArch64::LD1Onev1d_POST:
  case AArch64::LD1Onev2d_POST:
  case AArch64::LD1Onev2s_POST:
  case AArch64::LD1Onev4h_POST:
  case AArch64::LD1Onev4s_POST:
  case AArch64::LD1Onev8b_POST:
  case AArch64::LD1Onev8h_POST:
  case AArch64::LD1Rv16b_POST:
  case AArch64::LD1Rv1d_POST:
  case AArch64::LD1Rv2d_POST:
  case AArch64::LD1Rv2s_POST:
  case AArch64::LD1Rv4h_POST:
  case AArch64::LD1Rv4s_POST:
  case AArch64::LD1Rv8b_POST:
  case AArch64::LD1Rv8h_POST:
  case AArch64::LD1Threev16b_POST:
  case AArch64::LD1Threev1d_POST:
  case AArch64::LD1Threev2d_POST:
  case AArch64::LD1Threev2s_POST:
  case AArch64::LD1Threev4h_POST:
  case AArch64::LD1Threev4s_POST:
  case AArch64::LD1Threev8b_POST:
  case AArch64::LD1Threev8h_POST:
  case AArch64::LD1Twov16b_POST:
  case AArch64::LD1Twov1d_POST:
  case AArch64::LD1Twov2d_POST:
  case AArch64::LD1Twov2s_POST:
  case AArch64::LD1Twov4h_POST:
  case AArch64::LD1Twov4s_POST:
  case AArch64::LD1Twov8b_POST:
  case AArch64::LD1Twov8h_POST:
  case AArch64::LD1i16_POST:
  case AArch64::LD1i32_POST:
  case AArch64::LD1i64_POST:
  case AArch64::LD1i8_POST:
  case AArch64::LD2Rv16b_POST:
  case AArch64::LD2Rv1d_POST:
  case AArch64::LD2Rv2d_POST:
  case AArch64::LD2Rv2s_POST:
  case AArch64::LD2Rv4h_POST:
  case AArch64::LD2Rv4s_POST:
  case AArch64::LD2Rv8b_POST:
  case AArch64::LD2Rv8h_POST:
  case AArch64::LD2Twov16b_POST:
  case AArch64::LD2Twov2d_POST:
  case AArch64::LD2Twov2s_POST:
  case AArch64::LD2Twov4h_POST:
  case AArch64::LD2Twov4s_POST:
  case AArch64::LD2Twov8b_POST:
  case AArch64::LD2Twov8h_POST:
  case AArch64::LD2i16_POST:
  case AArch64::LD2i32_POST:
  case AArch64::LD2i64_POST:
  case AArch64::LD2i8_POST:
  case AArch64::LD3Rv16b_POST:
  case AArch64::LD3Rv1d_POST:
  case AArch64::LD3Rv2d_POST:
  case AArch64::LD3Rv2s_POST:
  case AArch64::LD3Rv4h_POST:
  case AArch64::LD3Rv4s_POST:
  case AArch64::LD3Rv8b_POST:
  case AArch64::LD3Rv8h_POST:
  case AArch64::LD3Threev16b_POST:
  case AArch64::LD3Threev2d_POST:
  case AArch64::LD3Threev2s_POST:
  case AArch64::LD3Threev4h_POST:
  case AArch64::LD3Threev4s_POST:
  case AArch64::LD3Threev8b_POST:
  case AArch64::LD3Threev8h_POST:
  case AArch64::LD3i16_POST:
  case AArch64::LD3i32_POST:
  case AArch64::LD3i64_POST:
  case AArch64::LD3i8_POST:
  case AArch64::LD4Fourv16b_POST:
  case AArch64::LD4Fourv2d_POST:
  case AArch64::LD4Fourv2s_POST:
  case AArch64::LD4Fourv4h_POST:
  case AArch64::LD4Fourv4s_POST:
  case AArch64::LD4Fourv8b_POST:
  case AArch64::LD4Fourv8h_POST:
  case AArch64::LD4Rv16b_POST:
  case AArch64::LD4Rv1d_POST:
  case AArch64::LD4Rv2d_POST:
  case AArch64::LD4Rv2s_POST:
  case AArch64::LD4Rv4h_POST:
  case AArch64::LD4Rv4s_POST:
  case AArch64::LD4Rv8b_POST:
  case AArch64::LD4Rv8h_POST:
  case AArch64::LD4i16_POST:
  case AArch64::LD4i32_POST:
  case AArch64::LD4i64_POST:
  case AArch64::LD4i8_POST:
  case AArch64::LDAPRWpost:
  case AArch64::LDAPRXpost:
  case AArch64::LDIAPPWpost:
  case AArch64::LDIAPPXpost:
  case AArch64::LDPDpost:
  case AArch64::LDPQpost:
  case AArch64::LDPSWpost:
  case AArch64::LDPSpost:
  case AArch64::LDPWpost:
  case AArch64::LDPXpost:
  case AArch64::LDRBBpost:
  case AArch64::LDRBpost:
  case AArch64::LDRDpost:
  case AArch64::LDRHHpost:
  case AArch64::LDRHpost:
  case AArch64::LDRQpost:
  case AArch64::LDRSBWpost:
  case AArch64::LDRSBXpost:
  case AArch64::LDRSHWpost:
  case AArch64::LDRSHXpost:
  case AArch64::LDRSWpost:
  case AArch64::LDRSpost:
  case AArch64::LDRWpost:
  case AArch64::LDRXpost:
  case AArch64::ST1Fourv16b_POST:
  case AArch64::ST1Fourv1d_POST:
  case AArch64::ST1Fourv2d_POST:
  case AArch64::ST1Fourv2s_POST:
  case AArch64::ST1Fourv4h_POST:
  case AArch64::ST1Fourv4s_POST:
  case AArch64::ST1Fourv8b_POST:
  case AArch64::ST1Fourv8h_POST:
  case AArch64::ST1Onev16b_POST:
  case AArch64::ST1Onev1d_POST:
  case AArch64::ST1Onev2d_POST:
  case AArch64::ST1Onev2s_POST:
  case AArch64::ST1Onev4h_POST:
  case AArch64::ST1Onev4s_POST:
  case AArch64::ST1Onev8b_POST:
  case AArch64::ST1Onev8h_POST:
  case AArch64::ST1Threev16b_POST:
  case AArch64::ST1Threev1d_POST:
  case AArch64::ST1Threev2d_POST:
  case AArch64::ST1Threev2s_POST:
  case AArch64::ST1Threev4h_POST:
  case AArch64::ST1Threev4s_POST:
  case AArch64::ST1Threev8b_POST:
  case AArch64::ST1Threev8h_POST:
  case AArch64::ST1Twov16b_POST:
  case AArch64::ST1Twov1d_POST:
  case AArch64::ST1Twov2d_POST:
  case AArch64::ST1Twov2s_POST:
  case AArch64::ST1Twov4h_POST:
  case AArch64::ST1Twov4s_POST:
  case AArch64::ST1Twov8b_POST:
  case AArch64::ST1Twov8h_POST:
  case AArch64::ST1i16_POST:
  case AArch64::ST1i32_POST:
  case AArch64::ST1i64_POST:
  case AArch64::ST1i8_POST:
  case AArch64::ST2GPostIndex:
  case AArch64::ST2Twov16b_POST:
  case AArch64::ST2Twov2d_POST:
  case AArch64::ST2Twov2s_POST:
  case AArch64::ST2Twov4h_POST:
  case AArch64::ST2Twov4s_POST:
  case AArch64::ST2Twov8b_POST:
  case AArch64::ST2Twov8h_POST:
  case AArch64::ST2i16_POST:
  case AArch64::ST2i32_POST:
  case AArch64::ST2i64_POST:
  case AArch64::ST2i8_POST:
  case AArch64::ST3Threev16b_POST:
  case AArch64::ST3Threev2d_POST:
  case AArch64::ST3Threev2s_POST:
  case AArch64::ST3Threev4h_POST:
  case AArch64::ST3Threev4s_POST:
  case AArch64::ST3Threev8b_POST:
  case AArch64::ST3Threev8h_POST:
  case AArch64::ST3i16_POST:
  case AArch64::ST3i32_POST:
  case AArch64::ST3i64_POST:
  case AArch64::ST3i8_POST:
  case AArch64::ST4Fourv16b_POST:
  case AArch64::ST4Fourv2d_POST:
  case AArch64::ST4Fourv2s_POST:
  case AArch64::ST4Fourv4h_POST:
  case AArch64::ST4Fourv4s_POST:
  case AArch64::ST4Fourv8b_POST:
  case AArch64::ST4Fourv8h_POST:
  case AArch64::ST4i16_POST:
  case AArch64::ST4i32_POST:
  case AArch64::ST4i64_POST:
  case AArch64::ST4i8_POST:
  case AArch64::STGPostIndex:
  case AArch64::STGPpost:
  case AArch64::STPDpost:
  case AArch64::STPQpost:
  case AArch64::STPSpost:
  case AArch64::STPWpost:
  case AArch64::STPXpost:
  case AArch64::STRBBpost:
  case AArch64::STRBpost:
  case AArch64::STRDpost:
  case AArch64::STRHHpost:
  case AArch64::STRHpost:
  case AArch64::STRQpost:
  case AArch64::STRSpost:
  case AArch64::STRWpost:
  case AArch64::STRXpost:
  case AArch64::STZ2GPostIndex:
  case AArch64::STZGPostIndex:
    return true;
  }
}
4377
    const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
    bool &OffsetIsScalable, TypeSize &Width,
    const TargetRegisterInfo *TRI) const {
  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
  // Handle only loads/stores with base register followed by immediate offset.
  if (LdSt.getNumExplicitOperands() == 3) {
    // Non-paired instruction (e.g., ldr x1, [x0, #8]).
    if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
        !LdSt.getOperand(2).isImm())
      return false;
  } else if (LdSt.getNumExplicitOperands() == 4) {
    // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
    if (!LdSt.getOperand(1).isReg() ||
        (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
        !LdSt.getOperand(3).isImm())
      return false;
  } else
    return false;

  // Get the scaling factor for the instruction and set the width for the
  // instruction.
  TypeSize Scale(0U, false);
  int64_t Dummy1, Dummy2;

  // If this returns false, then it's an instruction we don't want to handle.
  if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
    return false;

  // Compute the offset. Offset is calculated as the immediate operand
  // multiplied by the scaling factor. Unscaled instructions have scaling factor
  // set to 1. Postindex are a special case which have an offset of 0.
  if (isPostIndexLdStOpcode(LdSt.getOpcode())) {
    // Post-index: memory is accessed at base+0; the immediate only updates
    // the base afterwards.
    BaseOp = &LdSt.getOperand(2);
    Offset = 0;
  } else if (LdSt.getNumExplicitOperands() == 3) {
    BaseOp = &LdSt.getOperand(1);
    Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
  } else {
    assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
    BaseOp = &LdSt.getOperand(2);
    Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
  }
  OffsetIsScalable = Scale.isScalable();

  // Only report success when the base is something callers can reason about:
  // a register or a frame index.
  return BaseOp->isReg() || BaseOp->isFI();
}
4425
  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
  // The immediate offset is always the last explicit operand of these
  // load/store forms.
  MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
  assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
  return OfsOp;
}
4433
/// Describe the addressing properties of memory instruction \p Opcode.
/// \param Scale     Byte multiplier applied to the instruction's immediate
///                  offset operand (1 for unscaled/pre/post-index forms).
/// \param Width     Number of bytes (fixed or scalable) accessed in memory.
/// \param MinOffset Smallest legal immediate, in units of \p Scale.
/// \param MaxOffset Largest legal immediate, in units of \p Scale.
/// \returns false (with all outputs zeroed) for opcodes this helper does not
/// model; true otherwise.
bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
                                    TypeSize &Width, int64_t &MinOffset,
                                    int64_t &MaxOffset) {
  switch (Opcode) {
  // Not a memory operation or something we want to handle.
  default:
    Scale = TypeSize::getFixed(0);
    Width = TypeSize::getFixed(0);
    MinOffset = MaxOffset = 0;
    return false;
  // LDR / STR
  case AArch64::LDRQui:
  case AArch64::STRQui:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(16);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  case AArch64::LDRXui:
  case AArch64::LDRDui:
  case AArch64::STRXui:
  case AArch64::STRDui:
  case AArch64::PRFMui:
    Scale = TypeSize::getFixed(8);
    Width = TypeSize::getFixed(8);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  case AArch64::LDRWui:
  case AArch64::LDRSui:
  case AArch64::LDRSWui:
  case AArch64::STRWui:
  case AArch64::STRSui:
    Scale = TypeSize::getFixed(4);
    Width = TypeSize::getFixed(4);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  case AArch64::LDRHui:
  case AArch64::LDRHHui:
  case AArch64::LDRSHWui:
  case AArch64::LDRSHXui:
  case AArch64::STRHui:
  case AArch64::STRHHui:
    Scale = TypeSize::getFixed(2);
    Width = TypeSize::getFixed(2);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  case AArch64::LDRBui:
  case AArch64::LDRBBui:
  case AArch64::LDRSBWui:
  case AArch64::LDRSBXui:
  case AArch64::STRBui:
  case AArch64::STRBBui:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(1);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  // post/pre inc: the writeback immediate is an unscaled 9-bit signed value,
  // hence Scale = 1 and range [-256, 255].
  case AArch64::STRQpre:
  case AArch64::LDRQpost:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(16);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDRDpost:
  case AArch64::LDRDpre:
  case AArch64::LDRXpost:
  case AArch64::LDRXpre:
  case AArch64::STRDpost:
  case AArch64::STRDpre:
  case AArch64::STRXpost:
  case AArch64::STRXpre:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(8);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::STRWpost:
  case AArch64::STRWpre:
  case AArch64::LDRWpost:
  case AArch64::LDRWpre:
  case AArch64::STRSpost:
  case AArch64::STRSpre:
  case AArch64::LDRSpost:
  case AArch64::LDRSpre:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(4);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDRHpost:
  case AArch64::LDRHpre:
  case AArch64::STRHpost:
  case AArch64::STRHpre:
  case AArch64::LDRHHpost:
  case AArch64::LDRHHpre:
  case AArch64::STRHHpost:
  case AArch64::STRHHpre:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(2);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDRBpost:
  case AArch64::LDRBpre:
  case AArch64::STRBpost:
  case AArch64::STRBpre:
  case AArch64::LDRBBpost:
  case AArch64::LDRBBpre:
  case AArch64::STRBBpost:
  case AArch64::STRBBpre:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(1);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  // Unscaled
  case AArch64::LDURQi:
  case AArch64::STURQi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(16);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDURXi:
  case AArch64::LDURDi:
  case AArch64::LDAPURXi:
  case AArch64::STURXi:
  case AArch64::STURDi:
  case AArch64::STLURXi:
  case AArch64::PRFUMi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(8);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDURWi:
  case AArch64::LDURSi:
  case AArch64::LDURSWi:
  case AArch64::LDAPURi:
  case AArch64::LDAPURSWi:
  case AArch64::STURWi:
  case AArch64::STURSi:
  case AArch64::STLURWi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(4);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDURHi:
  case AArch64::LDURHHi:
  case AArch64::LDURSHXi:
  case AArch64::LDURSHWi:
  case AArch64::LDAPURHi:
  case AArch64::LDAPURSHWi:
  case AArch64::LDAPURSHXi:
  case AArch64::STURHi:
  case AArch64::STURHHi:
  case AArch64::STLURHi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(2);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDURBi:
  case AArch64::LDURBBi:
  case AArch64::LDURSBXi:
  case AArch64::LDURSBWi:
  case AArch64::LDAPURBi:
  case AArch64::LDAPURSBWi:
  case AArch64::LDAPURSBXi:
  case AArch64::STURBi:
  case AArch64::STURBBi:
  case AArch64::STLURBi:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(1);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  // LDP / STP (including pre/post inc)
  case AArch64::LDPQi:
  case AArch64::LDNPQi:
  case AArch64::STPQi:
  case AArch64::STNPQi:
  case AArch64::LDPQpost:
  case AArch64::LDPQpre:
  case AArch64::STPQpost:
  case AArch64::STPQpre:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(16 * 2);
    MinOffset = -64;
    MaxOffset = 63;
    break;
  case AArch64::LDPXi:
  case AArch64::LDPDi:
  case AArch64::LDNPXi:
  case AArch64::LDNPDi:
  case AArch64::STPXi:
  case AArch64::STPDi:
  case AArch64::STNPXi:
  case AArch64::STNPDi:
  case AArch64::LDPDpost:
  case AArch64::LDPDpre:
  case AArch64::LDPXpost:
  case AArch64::LDPXpre:
  case AArch64::STPDpost:
  case AArch64::STPDpre:
  case AArch64::STPXpost:
  case AArch64::STPXpre:
    Scale = TypeSize::getFixed(8);
    Width = TypeSize::getFixed(8 * 2);
    MinOffset = -64;
    MaxOffset = 63;
    break;
  case AArch64::LDPWi:
  case AArch64::LDPSi:
  case AArch64::LDNPWi:
  case AArch64::LDNPSi:
  case AArch64::STPWi:
  case AArch64::STPSi:
  case AArch64::STNPWi:
  case AArch64::STNPSi:
  case AArch64::LDPSpost:
  case AArch64::LDPSpre:
  case AArch64::LDPWpost:
  case AArch64::LDPWpre:
  case AArch64::STPSpost:
  case AArch64::STPSpre:
  case AArch64::STPWpost:
  case AArch64::STPWpre:
    Scale = TypeSize::getFixed(4);
    Width = TypeSize::getFixed(4 * 2);
    MinOffset = -64;
    MaxOffset = 63;
    break;
  case AArch64::StoreSwiftAsyncContext:
    // Store is an STRXui, but there might be an ADDXri in the expansion too.
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(8);
    MinOffset = 0;
    MaxOffset = 4095;
    break;
  case AArch64::ADDG:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(0);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::TAGPstack:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(0);
    // TAGP with a negative offset turns into SUBP, which has a maximum offset
    // of 63 (not 64!).
    MinOffset = -63;
    MaxOffset = 63;
    break;
  case AArch64::LDG:
  case AArch64::STGi:
  case AArch64::STGPreIndex:
  case AArch64::STGPostIndex:
  case AArch64::STZGi:
  case AArch64::STZGPreIndex:
  case AArch64::STZGPostIndex:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(16);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  // SVE
  case AArch64::STR_ZZZZXI:
  case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
  case AArch64::LDR_ZZZZXI:
  case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16 * 4);
    MinOffset = -256;
    MaxOffset = 252;
    break;
  case AArch64::STR_ZZZXI:
  case AArch64::LDR_ZZZXI:
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16 * 3);
    MinOffset = -256;
    MaxOffset = 253;
    break;
  case AArch64::STR_ZZXI:
  case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
  case AArch64::LDR_ZZXI:
  case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16 * 2);
    MinOffset = -256;
    MaxOffset = 254;
    break;
  case AArch64::LDR_PXI:
  case AArch64::STR_PXI:
    Scale = TypeSize::getScalable(2);
    Width = TypeSize::getScalable(2);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LDR_PPXI:
  case AArch64::STR_PPXI:
    Scale = TypeSize::getScalable(2);
    Width = TypeSize::getScalable(2 * 2);
    MinOffset = -256;
    MaxOffset = 254;
    break;
  case AArch64::LDR_ZXI:
  case AArch64::STR_ZXI:
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::LD1B_IMM:
  case AArch64::LD1H_IMM:
  case AArch64::LD1W_IMM:
  case AArch64::LD1D_IMM:
  case AArch64::LDNT1B_ZRI:
  case AArch64::LDNT1H_ZRI:
  case AArch64::LDNT1W_ZRI:
  case AArch64::LDNT1D_ZRI:
  case AArch64::ST1B_IMM:
  case AArch64::ST1H_IMM:
  case AArch64::ST1W_IMM:
  case AArch64::ST1D_IMM:
  case AArch64::STNT1B_ZRI:
  case AArch64::STNT1H_ZRI:
  case AArch64::STNT1W_ZRI:
  case AArch64::STNT1D_ZRI:
  case AArch64::LDNF1B_IMM:
  case AArch64::LDNF1H_IMM:
  case AArch64::LDNF1W_IMM:
  case AArch64::LDNF1D_IMM:
    // A full vectors worth of data
    // Width = mbytes * elements
    Scale = TypeSize::getScalable(16);
    Width = TypeSize::getScalable(16);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD2B_IMM:
  case AArch64::LD2H_IMM:
  case AArch64::LD2W_IMM:
  case AArch64::LD2D_IMM:
  case AArch64::ST2B_IMM:
  case AArch64::ST2H_IMM:
  case AArch64::ST2W_IMM:
  case AArch64::ST2D_IMM:
    Scale = TypeSize::getScalable(32);
    Width = TypeSize::getScalable(16 * 2);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD3B_IMM:
  case AArch64::LD3H_IMM:
  case AArch64::LD3W_IMM:
  case AArch64::LD3D_IMM:
  case AArch64::ST3B_IMM:
  case AArch64::ST3H_IMM:
  case AArch64::ST3W_IMM:
  case AArch64::ST3D_IMM:
    Scale = TypeSize::getScalable(48);
    Width = TypeSize::getScalable(16 * 3);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD4B_IMM:
  case AArch64::LD4H_IMM:
  case AArch64::LD4W_IMM:
  case AArch64::LD4D_IMM:
  case AArch64::ST4B_IMM:
  case AArch64::ST4H_IMM:
  case AArch64::ST4W_IMM:
  case AArch64::ST4D_IMM:
    Scale = TypeSize::getScalable(64);
    Width = TypeSize::getScalable(16 * 4);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD1B_H_IMM:
  case AArch64::LD1SB_H_IMM:
  case AArch64::LD1H_S_IMM:
  case AArch64::LD1SH_S_IMM:
  case AArch64::LD1W_D_IMM:
  case AArch64::LD1SW_D_IMM:
  case AArch64::ST1B_H_IMM:
  case AArch64::ST1H_S_IMM:
  case AArch64::ST1W_D_IMM:
  case AArch64::LDNF1B_H_IMM:
  case AArch64::LDNF1SB_H_IMM:
  case AArch64::LDNF1H_S_IMM:
  case AArch64::LDNF1SH_S_IMM:
  case AArch64::LDNF1W_D_IMM:
  case AArch64::LDNF1SW_D_IMM:
    // A half vector worth of data
    // Width = mbytes * elements
    Scale = TypeSize::getScalable(8);
    Width = TypeSize::getScalable(8);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD1B_S_IMM:
  case AArch64::LD1SB_S_IMM:
  case AArch64::LD1H_D_IMM:
  case AArch64::LD1SH_D_IMM:
  case AArch64::ST1B_S_IMM:
  case AArch64::ST1H_D_IMM:
  case AArch64::LDNF1B_S_IMM:
  case AArch64::LDNF1SB_S_IMM:
  case AArch64::LDNF1H_D_IMM:
  case AArch64::LDNF1SH_D_IMM:
    // A quarter vector worth of data
    // Width = mbytes * elements
    Scale = TypeSize::getScalable(4);
    Width = TypeSize::getScalable(4);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD1B_D_IMM:
  case AArch64::LD1SB_D_IMM:
  case AArch64::ST1B_D_IMM:
  case AArch64::LDNF1B_D_IMM:
  case AArch64::LDNF1SB_D_IMM:
    // A eighth vector worth of data
    // Width = mbytes * elements
    Scale = TypeSize::getScalable(2);
    Width = TypeSize::getScalable(2);
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::ST2Gi:
  case AArch64::ST2GPreIndex:
  case AArch64::ST2GPostIndex:
  case AArch64::STZ2Gi:
  case AArch64::STZ2GPreIndex:
  case AArch64::STZ2GPostIndex:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(32);
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::STGPi:
  case AArch64::STGPpost:
  case AArch64::STGPpre:
    Scale = TypeSize::getFixed(16);
    Width = TypeSize::getFixed(16);
    MinOffset = -64;
    MaxOffset = 63;
    break;
  case AArch64::LD1RB_IMM:
  case AArch64::LD1RB_H_IMM:
  case AArch64::LD1RB_S_IMM:
  case AArch64::LD1RB_D_IMM:
  case AArch64::LD1RSB_H_IMM:
  case AArch64::LD1RSB_S_IMM:
  case AArch64::LD1RSB_D_IMM:
    Scale = TypeSize::getFixed(1);
    Width = TypeSize::getFixed(1);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::LD1RH_IMM:
  case AArch64::LD1RH_S_IMM:
  case AArch64::LD1RH_D_IMM:
  case AArch64::LD1RSH_S_IMM:
  case AArch64::LD1RSH_D_IMM:
    Scale = TypeSize::getFixed(2);
    Width = TypeSize::getFixed(2);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::LD1RW_IMM:
  case AArch64::LD1RW_D_IMM:
  case AArch64::LD1RSW_IMM:
    Scale = TypeSize::getFixed(4);
    Width = TypeSize::getFixed(4);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::LD1RD_IMM:
    Scale = TypeSize::getFixed(8);
    Width = TypeSize::getFixed(8);
    MinOffset = 0;
    MaxOffset = 63;
    break;
  }

  return true;
}
4929
// Scaling factor for unscaled load or store: the number of bytes each data
// register reads or writes (equivalently, the byte granule used by the
// matching scaled-immediate form). Asserts on opcodes with no known scale.
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has unknown scale!");
  case AArch64::LDRBui:
  case AArch64::LDRBBui:
  case AArch64::LDURBBi:
  case AArch64::LDRSBWui:
  case AArch64::LDURSBWi:
  case AArch64::STRBui:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
    return 1;
  case AArch64::LDRHui:
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::LDRSHWui:
  case AArch64::LDURSHWi:
  case AArch64::STRHui:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return 2;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSpre:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRSWpre:
  case AArch64::LDRWpre:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRSpre:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRWpre:
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::STPSi:
  case AArch64::STPWi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRDpre:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::LDRXpre:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRDpre:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::STRXpre:
  case AArch64::LDPDi:
  case AArch64::LDPXi:
  case AArch64::STPDi:
  case AArch64::STPXi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRQpre:
  case AArch64::LDPQi:
  case AArch64::LDRQpre:
  case AArch64::STPQi:
  case AArch64::STGi:
  case AArch64::STZGi:
  case AArch64::ST2Gi:
  case AArch64::STZ2Gi:
  case AArch64::STGPi:
    return 16;
  }
}
5007
  switch (MI.getOpcode()) {
  default:
    return false;
  // Pre-indexed scalar load opcodes.
  case AArch64::LDRWpre:
  case AArch64::LDRXpre:
  case AArch64::LDRSWpre:
  case AArch64::LDRSpre:
  case AArch64::LDRDpre:
  case AArch64::LDRQpre:
    return true;
  }
}
5021
  switch (MI.getOpcode()) {
  default:
    return false;
  // Pre-indexed scalar store opcodes.
  case AArch64::STRWpre:
  case AArch64::STRXpre:
  case AArch64::STRSpre:
  case AArch64::STRDpre:
  case AArch64::STRQpre:
    return true;
  }
}
5034
  // A pre-indexed memory access is either a pre-indexed load or a
  // pre-indexed store.
  return isPreLd(MI) || isPreSt(MI);
}
5038
  switch (MI.getOpcode()) {
  default:
    return false;
  // Paired load/store opcodes (two data registers: ldp/stp/stgp).
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
  case AArch64::STGPi:
    return true;
  }
}
5058
  assert(MI.mayLoadOrStore() && "Load or store instruction expected");
  // Select the operand index holding the base register; operand 1 is the
  // common (single data register, non-writeback) case.
  unsigned Idx =
                                                                          : 1;
  return MI.getOperand(Idx);
}
5066
const MachineOperand &
  assert(MI.mayLoadOrStore() && "Load or store instruction expected");
  // Select the operand index holding the immediate offset; operand 2 is the
  // common (single data register, non-writeback) case.
  unsigned Idx =
                                                                          : 2;
  return MI.getOperand(Idx);
}
5075
const MachineOperand &
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode");
  // Register-offset loads: the shift/extend amount is explicit operand 4.
  case AArch64::LDRBroX:
  case AArch64::LDRBBroX:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSBWroX:
  case AArch64::LDRHroX:
  case AArch64::LDRHHroX:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSHWroX:
  case AArch64::LDRWroX:
  case AArch64::LDRSroX:
  case AArch64::LDRSWroX:
  case AArch64::LDRDroX:
  case AArch64::LDRXroX:
  case AArch64::LDRQroX:
    return MI.getOperand(4);
  }
}
5098
                                                  Register Reg) {
  // Look up Reg's register class in the owning function's
  // MachineRegisterInfo; returns null when the instruction is not inserted
  // in a function yet, or when no class is recorded for Reg.
  if (MI.getParent() == nullptr)
    return nullptr;
  const MachineFunction *MF = MI.getParent()->getParent();
  return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
}
5106
  // True when any operand is a 16-bit FP register: a physical FPR16, or a
  // virtual register classed FPR16/FPR16_lo.
  auto IsHFPR = [&](const MachineOperand &Op) {
    if (!Op.isReg())
      return false;
    auto Reg = Op.getReg();
    if (Reg.isPhysical())
      return AArch64::FPR16RegClass.contains(Reg);
    const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
    return TRC == &AArch64::FPR16RegClass ||
           TRC == &AArch64::FPR16_loRegClass;
  };
  return llvm::any_of(MI.operands(), IsHFPR);
}
5120
  // True when any operand is a 128-bit FP/vector register: a physical
  // FPR128, or a virtual register classed FPR128/FPR128_lo.
  auto IsQFPR = [&](const MachineOperand &Op) {
    if (!Op.isReg())
      return false;
    auto Reg = Op.getReg();
    if (Reg.isPhysical())
      return AArch64::FPR128RegClass.contains(Reg);
    const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
    return TRC == &AArch64::FPR128RegClass ||
           TRC == &AArch64::FPR128_loRegClass;
  };
  return llvm::any_of(MI.operands(), IsQFPR);
}
5134
  switch (MI.getOpcode()) {
  case AArch64::BRK:
  case AArch64::HLT:
  case AArch64::PACIASP:
  case AArch64::PACIBSP:
    // Implicit BTI behavior.
    return true;
  case AArch64::PAUTH_PROLOGUE:
    // PAUTH_PROLOGUE expands to PACI(A|B)SP.
    return true;
  case AArch64::HINT: {
    unsigned Imm = MI.getOperand(0).getImm();
    // Explicit BTI instruction (BTI / BTI c / BTI j / BTI jc are encoded as
    // HINT #32/#34/#36/#38).
    if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
      return true;
    // PACI(A|B)SP instructions (HINT #25/#27 aliases).
    if (Imm == 25 || Imm == 27)
      return true;
    return false;
  }
  default:
    return false;
  }
}
5160
  if (Reg == 0)
    return false;
  assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
  // Any physical FP/SIMD register qualifies, from the 8-bit B registers up
  // to the 128-bit Q registers.
  return AArch64::FPR128RegClass.contains(Reg) ||
         AArch64::FPR64RegClass.contains(Reg) ||
         AArch64::FPR32RegClass.contains(Reg) ||
         AArch64::FPR16RegClass.contains(Reg) ||
         AArch64::FPR8RegClass.contains(Reg);
}
5171
  // True when any operand is an FP/SIMD register of any width: a physical
  // FPR, or a virtual register classed as one of the FPR classes.
  auto IsFPR = [&](const MachineOperand &Op) {
    if (!Op.isReg())
      return false;
    auto Reg = Op.getReg();
    if (Reg.isPhysical())
      return isFpOrNEON(Reg);

    const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
    return TRC == &AArch64::FPR128RegClass ||
           TRC == &AArch64::FPR128_loRegClass ||
           TRC == &AArch64::FPR64RegClass ||
           TRC == &AArch64::FPR64_loRegClass ||
           TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
           TRC == &AArch64::FPR8RegClass;
  };
  return llvm::any_of(MI.operands(), IsFPR);
}
5190
// Scale the unscaled offsets. Returns false if the unscaled offset can't be
// scaled (i.e. the byte offset isn't a multiple of the access size); in that
// case Offset is left unmodified.
static bool scaleOffset(unsigned Opc, int64_t &Offset) {

  // If the byte-offset isn't a multiple of the stride, we can't scale this
  // offset.
  if (Offset % Scale != 0)
    return false;

  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  Offset /= Scale;
  return true;
}
5206
5207static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
5208 if (FirstOpc == SecondOpc)
5209 return true;
5210 // We can also pair sign-ext and zero-ext instructions.
5211 switch (FirstOpc) {
5212 default:
5213 return false;
5214 case AArch64::STRSui:
5215 case AArch64::STURSi:
5216 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
5217 case AArch64::STRDui:
5218 case AArch64::STURDi:
5219 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
5220 case AArch64::STRQui:
5221 case AArch64::STURQi:
5222 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
5223 case AArch64::STRWui:
5224 case AArch64::STURWi:
5225 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
5226 case AArch64::STRXui:
5227 case AArch64::STURXi:
5228 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
5229 case AArch64::LDRSui:
5230 case AArch64::LDURSi:
5231 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
5232 case AArch64::LDRDui:
5233 case AArch64::LDURDi:
5234 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
5235 case AArch64::LDRQui:
5236 case AArch64::LDURQi:
5237 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
5238 case AArch64::LDRWui:
5239 case AArch64::LDURWi:
5240 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
5241 case AArch64::LDRSWui:
5242 case AArch64::LDURSWi:
5243 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
5244 case AArch64::LDRXui:
5245 case AArch64::LDURXi:
5246 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
5247 }
5248 // These instructions can't be paired based on their opcodes.
5249 return false;
5250}
5251
5252static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
5253 int64_t Offset1, unsigned Opcode1, int FI2,
5254 int64_t Offset2, unsigned Opcode2) {
5255 // Accesses through fixed stack object frame indices may access a different
5256 // fixed stack slot. Check that the object offsets + offsets match.
5257 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
5258 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
5259 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
5260 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
5261 // Convert to scaled object offsets.
5262 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
5263 if (ObjectOffset1 % Scale1 != 0)
5264 return false;
5265 ObjectOffset1 /= Scale1;
5266 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
5267 if (ObjectOffset2 % Scale2 != 0)
5268 return false;
5269 ObjectOffset2 /= Scale2;
5270 ObjectOffset1 += Offset1;
5271 ObjectOffset2 += Offset2;
5272 return ObjectOffset1 + 1 == ObjectOffset2;
5273 }
5274
5275 return FI1 == FI2;
5276}
5277
/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getMemOperandWithOffset returns true.
/// Returns true when the scheduler should keep the two memory operations
/// adjacent so that a later pass can fuse them into a paired access.
    ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
    bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
    int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
    unsigned NumBytes) const {
  assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
  const MachineOperand &BaseOp1 = *BaseOps1.front();
  const MachineOperand &BaseOp2 = *BaseOps2.front();
  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
  const MachineInstr &SecondLdSt = *BaseOp2.getParent();
  // Both bases must be of the same kind (both registers or both FIs).
  if (BaseOp1.getType() != BaseOp2.getType())
    return false;

  assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
         "Only base registers and frame indices are supported.");

  // Check for both base regs and base FI.
  if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
    return false;

  // Only cluster up to a single pair.
  if (ClusterSize > 2)
    return false;

  if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
    return false;

  // Can we pair these instructions based on their opcodes?
  unsigned FirstOpc = FirstLdSt.getOpcode();
  unsigned SecondOpc = SecondLdSt.getOpcode();
  if (!canPairLdStOpc(FirstOpc, SecondOpc))
    return false;

  // Can't merge volatiles or load/stores that have a hint to avoid pair
  // formation, for example.
  if (!isCandidateToMergeOrPair(FirstLdSt) ||
      !isCandidateToMergeOrPair(SecondLdSt))
    return false;

  // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
  // Normalize unscaled opcodes to element-sized offsets so both offsets are
  // expressed in the same units.
  int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
  if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
    return false;

  int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
  if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
    return false;

  // Pairwise instructions have a 7-bit signed offset field.
  if (Offset1 > 63 || Offset1 < -64)
    return false;

  // The caller should already have ordered First/SecondLdSt by offset.
  // Note: except for non-equal frame index bases
  if (BaseOp1.isFI()) {
    assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
           "Caller should have ordered offsets.");

    const MachineFrameInfo &MFI =
        FirstLdSt.getParent()->getParent()->getFrameInfo();
    return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
                           BaseOp2.getIndex(), Offset2, SecondOpc);
  }

  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");

  // Cluster only when the two accesses are exactly adjacent (scaled units).
  return Offset1 + 1 == Offset2;
}
5349
                                        // Helper: append Reg (optionally one
                                        // of its sub-registers) to MIB with
                                        // the given register state.
                                        MCRegister Reg, unsigned SubIdx,
                                        RegState State,
                                        const TargetRegisterInfo *TRI) {
  // No sub-register index: add the register itself.
  if (!SubIdx)
    return MIB.addReg(Reg, State);

  // Physical registers resolve the sub-register now via TRI; otherwise the
  // sub-register index is recorded on the operand.
  if (Reg.isPhysical())
    return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
  return MIB.addReg(Reg, State, SubIdx);
}
5361
// Returns true if copying a register tuple forwards (lowest sub-register
// first) would overwrite a source sub-register before it has been read.
// Register encodings wrap modulo 32, hence the 0x1f mask.
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
                                        unsigned NumRegs) {
  // Positive distance from SrcReg up to DestReg, modulo 32.
  unsigned Distance = (DestReg - SrcReg) & 0x1f;
  return Distance < NumRegs;
}
5368
                                        const DebugLoc &DL, MCRegister DestReg,
                                        MCRegister SrcReg, bool KillSrc,
                                        unsigned Opcode,
                                        ArrayRef<unsigned> Indices) const {
  assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
  unsigned NumRegs = Indices.size();

  // Default to copying the tuple forwards (lowest sub-register first); if a
  // forward copy would clobber source sub-registers that are still to be
  // read, walk the tuple backwards instead.
  int SubReg = 0, End = NumRegs, Incr = 1;
  if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
    SubReg = NumRegs - 1;
    End = -1;
    Incr = -1;
  }

  // Emit one Opcode per sub-register: Dest_i = Opcode Src_i, Src_i (an
  // ORR-style self-move), with the kill flag only on the final source use.
  for (; SubReg != End; SubReg += Incr) {
    const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
    AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
    AddSubReg(MIB, SrcReg, Indices[SubReg], {}, TRI);
    AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
  }
}
5395
                                       const DebugLoc &DL, MCRegister DestReg,
                                       MCRegister SrcReg, bool KillSrc,
                                       unsigned Opcode, unsigned ZeroReg,
                                       llvm::ArrayRef<unsigned> Indices) const {
  unsigned NumRegs = Indices.size();

#ifndef NDEBUG
  // GPR tuples are aligned to the tuple size, so source and destination can
  // never partially overlap and a simple forward copy is always safe.
  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
  assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
         "GPR reg sequences should not be able to overlap");
#endif

  // Emit "Opcode Dest_i, ZeroReg, Src_i, #0" (an ORR-with-zero register
  // move) for each sub-register of the tuple, in forward order.
  for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
    const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
    AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
    MIB.addReg(ZeroReg);
    AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
    MIB.addImm(0);
  }
}
5420
/// Returns true if the instruction at I is in a streaming call site region,
/// within a single basic block.
/// A "call site streaming region" starts after smstart and ends at smstop
/// around a call to a streaming function. This walks backward from I and
/// reports whether the nearest preceding SM mode change was a start.
  MachineFunction &MF = *MBB.getParent();
  // Early-out: nothing to look for unless this function ever changes
  // streaming mode. NOTE(review): AFI is obtained from MF; its declaration
  // is not visible in this excerpt.
  if (!AFI->hasStreamingModeChanges())
    return false;
  // Walk backwards to find smstart/smstop
  for (MachineInstr &MI : reverse(make_range(MBB.begin(), I))) {
    unsigned Opc = MI.getOpcode();
    if (Opc == AArch64::MSRpstatesvcrImm1 || Opc == AArch64::MSRpstatePseudo) {
      // Check if this is SM change (not ZA)
      int64_t PState = MI.getOperand(0).getImm();
      if (PState == AArch64SVCR::SVCRSM || PState == AArch64SVCR::SVCRSMZA) {
        // Operand 1 is 1 for start, 0 for stop
        return MI.getOperand(1).getImm() == 1;
      }
    }
  }
  // No SM smstart/smstop found before I in this block.
  return false;
}
5445
/// Returns true if in a streaming call site region without SME-FA64.
/// Used below to decide when NEON copy expansions must be replaced with
/// SVE- or stack-based sequences.
static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget,
  return !Subtarget.hasSMEFA64() && isInStreamingCallSiteRegion(MBB, I);
}
5452
                                   const DebugLoc &DL, Register DestReg,
                                   Register SrcReg, bool KillSrc,
                                   bool RenamableDest,
                                   bool RenamableSrc) const {
  ++NumCopyInstrs;
  // GPR32 <-> GPR32 copies (possibly involving WSP).
  if (AArch64::GPR32spRegClass.contains(DestReg) &&
      AArch64::GPR32spRegClass.contains(SrcReg)) {
    if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
      // If either operand is WSP, expand to ADD #0.
      if (Subtarget.hasZeroCycleRegMoveGPR64() &&
          !Subtarget.hasZeroCycleRegMoveGPR32()) {
        // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
        MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
                                                     &AArch64::GPR64spRegClass);
        MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
                                                    &AArch64::GPR64spRegClass);
        // This instruction is reading and writing X registers. This may upset
        // the register scavenger and machine verifier, so we need to indicate
        // that we are reading an undefined value from SrcRegX, but a proper
        // value from SrcReg.
        BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
            .addReg(SrcRegX, RegState::Undef)
            .addImm(0)
            .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
        ++NumZCRegMoveInstrsGPR;
      } else {
        BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc))
            .addImm(0)
        if (Subtarget.hasZeroCycleRegMoveGPR32())
          ++NumZCRegMoveInstrsGPR;
      }
    } else if (Subtarget.hasZeroCycleRegMoveGPR64() &&
               !Subtarget.hasZeroCycleRegMoveGPR32()) {
      // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
      MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
                                                   &AArch64::GPR64spRegClass);
      assert(DestRegX.isValid() && "Destination super-reg not valid");
      MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
                                                  &AArch64::GPR64spRegClass);
      assert(SrcRegX.isValid() && "Source super-reg not valid");
      // This instruction is reading and writing X registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegX, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
          .addReg(AArch64::XZR)
          .addReg(SrcRegX, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
      ++NumZCRegMoveInstrsGPR;
    } else {
      // Otherwise, expand to ORR WZR.
      BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
          .addReg(AArch64::WZR)
          .addReg(SrcReg, getKillRegState(KillSrc));
      if (Subtarget.hasZeroCycleRegMoveGPR32())
        ++NumZCRegMoveInstrsGPR;
    }
    return;
  }

  // GPR32 zeroing
  if (AArch64::GPR32spRegClass.contains(DestReg) && SrcReg == AArch64::WZR) {
    if (Subtarget.hasZeroCycleZeroingGPR64() &&
        !Subtarget.hasZeroCycleZeroingGPR32()) {
      // Zero the 64-bit super-register when only 64-bit zeroing is free.
      MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
                                                   &AArch64::GPR64spRegClass);
      assert(DestRegX.isValid() && "Destination super-reg not valid");
      BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestRegX)
          .addImm(0)
      ++NumZCZeroingInstrsGPR;
    } else if (Subtarget.hasZeroCycleZeroingGPR32()) {
      BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
          .addImm(0)
      ++NumZCZeroingInstrsGPR;
    } else {
      // No zero-cycle zeroing: fall back to ORR with WZR.
      BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
          .addReg(AArch64::WZR)
          .addReg(AArch64::WZR);
    }
    return;
  }

  // GPR64 <-> GPR64 copies (possibly involving SP).
  if (AArch64::GPR64spRegClass.contains(DestReg) &&
      AArch64::GPR64spRegClass.contains(SrcReg)) {
    if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
      // If either operand is SP, expand to ADD #0.
      BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addImm(0)
      if (Subtarget.hasZeroCycleRegMoveGPR64())
        ++NumZCRegMoveInstrsGPR;
    } else {
      // Otherwise, expand to ORR XZR.
      BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
          .addReg(AArch64::XZR)
          .addReg(SrcReg, getKillRegState(KillSrc));
      if (Subtarget.hasZeroCycleRegMoveGPR64())
        ++NumZCRegMoveInstrsGPR;
    }
    return;
  }

  // GPR64 zeroing
  if (AArch64::GPR64spRegClass.contains(DestReg) && SrcReg == AArch64::XZR) {
    if (Subtarget.hasZeroCycleZeroingGPR64()) {
      BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
          .addImm(0)
      ++NumZCZeroingInstrsGPR;
    } else {
      BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
          .addReg(AArch64::XZR)
          .addReg(AArch64::XZR);
    }
    return;
  }

  // Copy a Predicate register by ORRing with itself.
  if (AArch64::PPRRegClass.contains(DestReg) &&
      AArch64::PPRRegClass.contains(SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
        .addReg(SrcReg) // Pg
        .addReg(SrcReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  // Copy a predicate-as-counter register by ORRing with itself as if it
  // were a regular predicate (mask) register.
  bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
  bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
  if (DestIsPNR || SrcIsPNR) {
    // Map a PN register onto the P register with the same index.
    auto ToPPR = [](MCRegister R) -> MCRegister {
      return (R - AArch64::PN0) + AArch64::P0;
    };
    MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg.asMCReg();
    MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg.asMCReg();

    if (PPRSrcReg != PPRDestReg) {
      auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
                       .addReg(PPRSrcReg) // Pg
                       .addReg(PPRSrcReg)
                       .addReg(PPRSrcReg, getKillRegState(KillSrc));
      if (DestIsPNR)
        NewMI.addDef(DestReg, RegState::Implicit);
    }
    return;
  }

  // Copy a Z register by ORRing with itself.
  if (AArch64::ZPRRegClass.contains(DestReg) &&
      AArch64::ZPRRegClass.contains(SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
        .addReg(SrcReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  // Copy a Z register pair by copying the individual sub-registers.
  if ((AArch64::ZPR2RegClass.contains(DestReg) ||
       AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
      (AArch64::ZPR2RegClass.contains(SrcReg) ||
       AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  // Copy a Z register triple by copying the individual sub-registers.
  if (AArch64::ZPR3RegClass.contains(DestReg) &&
      AArch64::ZPR3RegClass.contains(SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
                                       AArch64::zsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  // Copy a Z register quad by copying the individual sub-registers.
  if ((AArch64::ZPR4RegClass.contains(DestReg) ||
       AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
      (AArch64::ZPR4RegClass.contains(SrcReg) ||
       AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
                                       AArch64::zsub2, AArch64::zsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  // Copy a DDDD register quad by copying the individual sub-registers.
  if (AArch64::DDDDRegClass.contains(DestReg) &&
      AArch64::DDDDRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
                                       AArch64::dsub2, AArch64::dsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a DDD register triple by copying the individual sub-registers.
  if (AArch64::DDDRegClass.contains(DestReg) &&
      AArch64::DDDRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
                                       AArch64::dsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a DD register pair by copying the individual sub-registers.
  if (AArch64::DDRegClass.contains(DestReg) &&
      AArch64::DDRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a QQQQ register quad by copying the individual sub-registers.
  if (AArch64::QQQQRegClass.contains(DestReg) &&
      AArch64::QQQQRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
                                       AArch64::qsub2, AArch64::qsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
                     Indices);
    return;
  }

  // Copy a QQQ register triple by copying the individual sub-registers.
  if (AArch64::QQQRegClass.contains(DestReg) &&
      AArch64::QQQRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
                                       AArch64::qsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
                     Indices);
    return;
  }

  // Copy a QQ register pair by copying the individual sub-registers.
  if (AArch64::QQRegClass.contains(DestReg) &&
      AArch64::QQRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
                     Indices);
    return;
  }

  // Copy an X register sequential pair (e.g. for CASP) via per-element ORR.
  if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
      AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
    copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
                    AArch64::XZR, Indices);
    return;
  }

  // Copy a W register sequential pair via per-element ORR.
  if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
      AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
    copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
                    AArch64::WZR, Indices);
    return;
  }

  if (AArch64::FPR128RegClass.contains(DestReg) &&
      AArch64::FPR128RegClass.contains(SrcReg)) {
    // In streaming regions, NEON is illegal but streaming-SVE is available.
    // Use SVE for copies if we're in a streaming region and SME is available.
    // With +sme-fa64, NEON is legal in streaming mode so we can use it.
    if ((Subtarget.isSVEorStreamingSVEAvailable() &&
         !Subtarget.isNeonAvailable()) ||
        mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      // Copy via the overlapping Z register (Qn is the low 128 bits of Zn).
      BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
          .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
          .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
          .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
    } else if (Subtarget.isNeonAvailable()) {
      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
          .addReg(SrcReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
      if (Subtarget.hasZeroCycleRegMoveFPR128())
        ++NumZCRegMoveInstrsFPR;
    } else {
      // Neither NEON nor SVE is usable: bounce the value through the stack
      // with a pre-indexed store / post-indexed reload at SP-16.
      BuildMI(MBB, I, DL, get(AArch64::STRQpre))
          .addReg(AArch64::SP, RegState::Define)
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addReg(AArch64::SP)
          .addImm(-16);
      BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
          .addReg(AArch64::SP, RegState::Define)
          .addReg(DestReg, RegState::Define)
          .addReg(AArch64::SP)
          .addImm(16);
    }
    return;
  }

  if (AArch64::FPR64RegClass.contains(DestReg) &&
      AArch64::FPR64RegClass.contains(SrcReg)) {
    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
        !Subtarget.hasZeroCycleRegMoveFPR64() &&
        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
        !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
                                                   &AArch64::FPR128RegClass);
      MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
                                                  &AArch64::FPR128RegClass);
      // This instruction is reading and writing Q registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegQ, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
      ++NumZCRegMoveInstrsFPR;
    } else {
      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
      if (Subtarget.hasZeroCycleRegMoveFPR64())
        ++NumZCRegMoveInstrsFPR;
    }
    return;
  }

  if (AArch64::FPR32RegClass.contains(DestReg) &&
      AArch64::FPR32RegClass.contains(SrcReg)) {
    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
        !Subtarget.hasZeroCycleRegMoveFPR64() &&
        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
        !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
                                                   &AArch64::FPR128RegClass);
      MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
                                                  &AArch64::FPR128RegClass);
      // This instruction is reading and writing Q registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegQ, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
      ++NumZCRegMoveInstrsFPR;
    } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
               !Subtarget.hasZeroCycleRegMoveFPR32()) {
      MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
                                                   &AArch64::FPR64RegClass);
      MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
                                                  &AArch64::FPR64RegClass);
      // This instruction is reading and writing D registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegD, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
          .addReg(SrcRegD, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
      ++NumZCRegMoveInstrsFPR;
    } else {
      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
      if (Subtarget.hasZeroCycleRegMoveFPR32())
        ++NumZCRegMoveInstrsFPR;
    }
    return;
  }

  if (AArch64::FPR16RegClass.contains(DestReg) &&
      AArch64::FPR16RegClass.contains(SrcReg)) {
    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
        !Subtarget.hasZeroCycleRegMoveFPR64() &&
        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
        !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
                                                   &AArch64::FPR128RegClass);
      MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
                                                  &AArch64::FPR128RegClass);
      // This instruction is reading and writing Q registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegQ, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
    } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
               !Subtarget.hasZeroCycleRegMoveFPR32()) {
      MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
                                                   &AArch64::FPR64RegClass);
      MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
                                                  &AArch64::FPR64RegClass);
      // This instruction is reading and writing D registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegD, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
          .addReg(SrcRegD, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
    } else {
      // No H-register move: widen to the containing S registers and FMOV.
      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
                                       &AArch64::FPR32RegClass);
      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
                                      &AArch64::FPR32RegClass);
      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
    }
    return;
  }

  if (AArch64::FPR8RegClass.contains(DestReg) &&
      AArch64::FPR8RegClass.contains(SrcReg)) {
    if (Subtarget.hasZeroCycleRegMoveFPR128() &&
        !Subtarget.hasZeroCycleRegMoveFPR64() &&
        !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
        !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
      MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
                                                   &AArch64::FPR128RegClass);
      MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
                                                  &AArch64::FPR128RegClass);
      // This instruction is reading and writing Q registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegQ, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcRegQ, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
    } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
               !Subtarget.hasZeroCycleRegMoveFPR32()) {
      MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
                                                   &AArch64::FPR64RegClass);
      MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
                                                  &AArch64::FPR64RegClass);
      // This instruction is reading and writing D registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegD, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
          .addReg(SrcRegD, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
    } else {
      // No B-register move: widen to the containing S registers and FMOV.
      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
                                       &AArch64::FPR32RegClass);
      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
                                      &AArch64::FPR32RegClass);
      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
    }
    return;
  }

  // Copies between GPR64 and FPR64.
  if (AArch64::FPR64RegClass.contains(DestReg) &&
      AArch64::GPR64RegClass.contains(SrcReg)) {
    if (AArch64::XZR == SrcReg) {
      BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg);
    } else {
      BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
    }
    return;
  }
  if (AArch64::GPR64RegClass.contains(DestReg) &&
      AArch64::FPR64RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }
  // Copies between GPR32 and FPR32.
  if (AArch64::FPR32RegClass.contains(DestReg) &&
      AArch64::GPR32RegClass.contains(SrcReg)) {
    if (AArch64::WZR == SrcReg) {
      BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg);
    } else {
      BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
    }
    return;
  }
  if (AArch64::GPR32RegClass.contains(DestReg) &&
      AArch64::FPR32RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  // Copy into the NZCV flags via MSR.
  if (DestReg == AArch64::NZCV) {
    assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
    BuildMI(MBB, I, DL, get(AArch64::MSR))
        .addImm(AArch64SysReg::NZCV)
        .addReg(SrcReg, getKillRegState(KillSrc))
        .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
    return;
  }

  // Copy out of the NZCV flags via MRS.
  if (SrcReg == AArch64::NZCV) {
    assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
    BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
        .addImm(AArch64SysReg::NZCV)
        .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
    return;
  }

#ifndef NDEBUG
  errs() << RI.getRegAsmName(DestReg) << " = COPY " << RI.getRegAsmName(SrcReg)
         << "\n";
#endif
  llvm_unreachable("unimplemented reg-to-reg copy");
}
5981
                                     // Helper: emit a paired store (MCID,
                                     // e.g. STPXi/STPWi) of the two halves
                                     // SubIdx0/SubIdx1 of SrcReg to frame
                                     // index FI.
                                     MachineBasicBlock::iterator InsertBefore,
                                     const MCInstrDesc &MCID,
                                     Register SrcReg, bool IsKill,
                                     unsigned SubIdx0, unsigned SubIdx1, int FI,
                                     MachineMemOperand *MMO) {
  Register SrcReg0 = SrcReg;
  Register SrcReg1 = SrcReg;
  // For a physical register, resolve the two halves to concrete registers
  // now; for a virtual register, keep the sub-indices on the operands.
  if (SrcReg.isPhysical()) {
    SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
    SubIdx0 = 0;
    SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
    SubIdx1 = 0;
  }
  // Store both halves at offset 0 of the frame slot.
  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
      .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
      .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
      .addFrameIndex(FI)
      .addImm(0)
      .addMemOperand(MMO);
}
6004
// Spill SrcReg (killed if isKill) of register class RC into stack slot FI.
// The store opcode is selected from the class's spill size: GPR/FPR classes
// use scaled STR*ui forms, sequential GPR pair classes are delegated to
// storeRegPairToStackSlot as STP, NEON D/Q tuple classes use ST1 pseudos
// (which take no immediate offset operand), and SVE Z/P classes use STR_*
// pseudos.
// NOTE(review): this capture has elided source lines (e.g. the
// getMachineMemOperand call after line 6013 and the StackID assignments in
// the SVE arms) -- verify against the upstream file before editing.
6007 Register SrcReg, bool isKill, int FI,
6008 const TargetRegisterClass *RC,
6009 Register VReg,
6010 MachineInstr::MIFlag Flags) const {
6011 MachineFunction &MF = *MBB.getParent();
6012 MachineFrameInfo &MFI = MF.getFrameInfo();
6013
6015 MachineMemOperand *MMO =
6017 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
6018 unsigned Opc = 0;
6019 bool Offset = true;
6021 unsigned StackID = TargetStackID::Default;
// Dispatch on the spill size (in bytes) of the register class.
6022 switch (RI.getSpillSize(*RC)) {
6023 case 1:
6024 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6025 Opc = AArch64::STRBui;
6026 break;
6027 case 2: {
6028 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6029 Opc = AArch64::STRHui;
6030 else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
6031 AArch64::PPRRegClass.hasSubClassEq(RC)) {
6032 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6033 "Unexpected register store without SVE store instructions");
6034 Opc = AArch64::STR_PXI;
6036 }
6037 break;
6038 }
6039 case 4:
6040 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6041 Opc = AArch64::STRWui;
// Virtual regs are constrained away from WSP; physical WSP cannot be stored
// with STRWui.
6042 if (SrcReg.isVirtual())
6043 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
6044 else
6045 assert(SrcReg != AArch64::WSP);
6046 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6047 Opc = AArch64::STRSui;
6048 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6049 Opc = AArch64::STR_PPXI;
6051 }
6052 break;
6053 case 8:
6054 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6055 Opc = AArch64::STRXui;
6056 if (SrcReg.isVirtual())
6057 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
6058 else
6059 assert(SrcReg != AArch64::SP);
6060 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6061 Opc = AArch64::STRDui;
6062 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
// 32-bit sequential pairs spill as a single STP of the two subregisters.
6064 get(AArch64::STPWi), SrcReg, isKill,
6065 AArch64::sube32, AArch64::subo32, FI, MMO);
6066 return;
6067 }
6068 break;
6069 case 16:
6070 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6071 Opc = AArch64::STRQui;
6072 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6073 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6074 Opc = AArch64::ST1Twov1d;
6075 Offset = false;
6076 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
6078 get(AArch64::STPXi), SrcReg, isKill,
6079 AArch64::sube64, AArch64::subo64, FI, MMO);
6080 return;
6081 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6082 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6083 "Unexpected register store without SVE store instructions");
6084 Opc = AArch64::STR_ZXI;
6086 }
6087 break;
6088 case 24:
6089 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6090 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6091 Opc = AArch64::ST1Threev1d;
6092 Offset = false;
6093 }
6094 break;
6095 case 32:
6096 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6097 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6098 Opc = AArch64::ST1Fourv1d;
6099 Offset = false;
6100 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6101 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6102 Opc = AArch64::ST1Twov2d;
6103 Offset = false;
6104 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6105 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6106 "Unexpected register store without SVE store instructions");
6107 Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
6109 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6110 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6111 "Unexpected register store without SVE store instructions");
6112 Opc = AArch64::STR_ZZXI;
6114 }
6115 break;
6116 case 48:
6117 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6118 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6119 Opc = AArch64::ST1Threev2d;
6120 Offset = false;
6121 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6122 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6123 "Unexpected register store without SVE store instructions");
6124 Opc = AArch64::STR_ZZZXI;
6126 }
6127 break;
6128 case 64:
6129 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6130 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6131 Opc = AArch64::ST1Fourv2d;
6132 Offset = false;
6133 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6134 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6135 "Unexpected register store without SVE store instructions");
6136 Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
6138 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6139 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6140 "Unexpected register store without SVE store instructions");
6141 Opc = AArch64::STR_ZZZZXI;
6143 }
6144 break;
6145 }
6146 assert(Opc && "Unknown register class");
6147 MFI.setStackID(FI, StackID);
6148
// Emit the store; ST1 variants carry no immediate offset operand (Offset is
// false for them).
6150 .addReg(SrcReg, getKillRegState(isKill))
6151 .addFrameIndex(FI);
6152
6153 if (Offset)
6154 MI.addImm(0);
6155 if (PNRReg.isValid())
6156 MI.addDef(PNRReg, RegState::Implicit);
6157 MI.addMemOperand(MMO);
6158}
6159
// Reload a sequential register pair from stack slot FI using an LDP-style
// instruction described by MCID. For a physical DestReg the two halves are
// materialized as concrete subregisters (SubIdx0/SubIdx1 are consumed and
// reset to 0); for a virtual register the subregister indices are kept on
// the operands and the defs are marked undef since only parts of the
// register are written.
// NOTE(review): the opening line of this static helper's signature was
// elided from this capture.
6162 MachineBasicBlock::iterator InsertBefore,
6163 const MCInstrDesc &MCID,
6164 Register DestReg, unsigned SubIdx0,
6165 unsigned SubIdx1, int FI,
6166 MachineMemOperand *MMO) {
6167 Register DestReg0 = DestReg;
6168 Register DestReg1 = DestReg;
6169 bool IsUndef = true;
6170 if (DestReg.isPhysical()) {
6171 DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
6172 SubIdx0 = 0;
6173 DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
6174 SubIdx1 = 0;
6175 IsUndef = false;
6176 }
6177 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
6178 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
6179 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
6180 .addFrameIndex(FI)
6181 .addImm(0)
6182 .addMemOperand(MMO);
6183}
6184
// Reload DestReg of register class RC from stack slot FI. Mirrors
// storeRegToStackSlot: opcode selection is by spill size, GPR pair classes
// delegate to loadRegPairFromStackSlot as LDP, NEON tuples use LD1 pseudos
// (no immediate offset), and SVE Z/P classes use LDR_* pseudos. For PNR
// predicate-as-counter registers the load is done via LDR_PXI and PNRReg is
// recorded so the real destination can be attached as an implicit def.
// NOTE(review): several source lines are elided in this capture (e.g. the
// getMachineMemOperand call and the SVE StackID assignments) -- verify
// against the upstream file before editing.
6187 Register DestReg, int FI,
6188 const TargetRegisterClass *RC,
6189 Register VReg, unsigned SubReg,
6190 MachineInstr::MIFlag Flags) const {
6191 MachineFunction &MF = *MBB.getParent();
6192 MachineFrameInfo &MFI = MF.getFrameInfo();
6194 MachineMemOperand *MMO =
6196 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
6197
6198 unsigned Opc = 0;
6199 bool Offset = true;
6200 unsigned StackID = TargetStackID::Default;
// Dispatch on the spill size (in bytes) of the register class.
6202 switch (TRI.getSpillSize(*RC)) {
6203 case 1:
6204 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6205 Opc = AArch64::LDRBui;
6206 break;
6207 case 2: {
6208 bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
6209 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6210 Opc = AArch64::LDRHui;
6211 else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
6212 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6213 "Unexpected register load without SVE load instructions");
6214 if (IsPNR)
6215 PNRReg = DestReg;
6216 Opc = AArch64::LDR_PXI;
6218 }
6219 break;
6220 }
6221 case 4:
6222 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6223 Opc = AArch64::LDRWui;
6224 if (DestReg.isVirtual())
6225 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
6226 else
6227 assert(DestReg != AArch64::WSP);
6228 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6229 Opc = AArch64::LDRSui;
6230 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6231 Opc = AArch64::LDR_PPXI;
6233 }
6234 break;
6235 case 8:
6236 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6237 Opc = AArch64::LDRXui;
6238 if (DestReg.isVirtual())
6239 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
6240 else
6241 assert(DestReg != AArch64::SP);
6242 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6243 Opc = AArch64::LDRDui;
6244 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
// 32-bit sequential pairs reload as a single LDP of the two subregisters.
6246 get(AArch64::LDPWi), DestReg, AArch64::sube32,
6247 AArch64::subo32, FI, MMO);
6248 return;
6249 }
6250 break;
6251 case 16:
6252 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6253 Opc = AArch64::LDRQui;
6254 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6255 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6256 Opc = AArch64::LD1Twov1d;
6257 Offset = false;
6258 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
6260 get(AArch64::LDPXi), DestReg, AArch64::sube64,
6261 AArch64::subo64, FI, MMO);
6262 return;
6263 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6264 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6265 "Unexpected register load without SVE load instructions");
6266 Opc = AArch64::LDR_ZXI;
6268 }
6269 break;
6270 case 24:
6271 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6272 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6273 Opc = AArch64::LD1Threev1d;
6274 Offset = false;
6275 }
6276 break;
6277 case 32:
6278 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6279 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6280 Opc = AArch64::LD1Fourv1d;
6281 Offset = false;
6282 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6283 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6284 Opc = AArch64::LD1Twov2d;
6285 Offset = false;
6286 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6287 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6288 "Unexpected register load without SVE load instructions");
6289 Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
6291 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6292 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6293 "Unexpected register load without SVE load instructions");
6294 Opc = AArch64::LDR_ZZXI;
6296 }
6297 break;
6298 case 48:
6299 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6300 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6301 Opc = AArch64::LD1Threev2d;
6302 Offset = false;
6303 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6304 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6305 "Unexpected register load without SVE load instructions");
6306 Opc = AArch64::LDR_ZZZXI;
6308 }
6309 break;
6310 case 64:
6311 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6312 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6313 Opc = AArch64::LD1Fourv2d;
6314 Offset = false;
6315 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6316 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6317 "Unexpected register load without SVE load instructions");
6318 Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
6320 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6321 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6322 "Unexpected register load without SVE load instructions");
6323 Opc = AArch64::LDR_ZZZZXI;
6325 }
6326 break;
6327 }
6328
6329 assert(Opc && "Unknown register class");
6330 MFI.setStackID(FI, StackID);
6331
// Emit the load; LD1 variants carry no immediate offset operand.
6333 .addReg(DestReg, getDefRegState(true))
6334 .addFrameIndex(FI);
6335 if (Offset)
6336 MI.addImm(0);
6337 if (PNRReg.isValid() && !PNRReg.isVirtual())
6338 MI.addDef(PNRReg, RegState::Implicit);
6339 MI.addMemOperand(MMO);
6340}
6341
// Returns true if any non-debug instruction strictly between DefMI and UseMI
// (DefMI exclusive, UseMI exclusive) reads or writes the NZCV condition
// flags. Used to decide whether flag-producing instructions can be moved or
// combined across the range.
// NOTE(review): the first line of this function's signature was elided from
// this capture.
6343 const MachineInstr &UseMI,
6344 const TargetRegisterInfo *TRI) {
6345 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
6346 UseMI.getIterator()),
6347 [TRI](const MachineInstr &I) {
6348 return I.modifiesRegister(AArch64::NZCV, TRI) ||
6349 I.readsRegister(AArch64::NZCV, TRI);
6350 });
6351}
6352
6353void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6354 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
6355 // The smallest scalable element supported by scaled SVE addressing
6356 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6357 // byte offset must always be a multiple of 2.
6358 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6359
6360 // VGSized offsets are divided by '2', because the VG register is the
6361 // the number of 64bit granules as opposed to 128bit vector chunks,
6362 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
6363 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
6364 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
6365 ByteSized = Offset.getFixed();
6366 VGSized = Offset.getScalable() / 2;
6367}
6368
6369/// Returns the offset in parts to which this frame offset can be
6370/// decomposed for the purpose of describing a frame offset.
6371/// For non-scalable offsets this is simply its byte size.
6372void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6373 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
6374 int64_t &NumDataVectors) {
6375 // The smallest scalable element supported by scaled SVE addressing
6376 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6377 // byte offset must always be a multiple of 2.
6378 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6379
6380 NumBytes = Offset.getFixed();
6381 NumDataVectors = 0;
6382 NumPredicateVectors = Offset.getScalable() / 2;
6383 // This method is used to get the offsets to adjust the frame offset.
6384 // If the function requires ADDPL to be used and needs more than two ADDPL
6385 // instructions, part of the offset is folded into NumDataVectors so that it
6386 // uses ADDVL for part of it, reducing the number of ADDPL instructions.
6387 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
6388 NumPredicateVectors > 62) {
6389 NumDataVectors = NumPredicateVectors / 8;
6390 NumPredicateVectors -= NumDataVectors * 8;
6391 }
6392}
6393
// Appends "Constant Operation" to Expr as a compact DWARF expression:
// values in [0, 31] use a single DW_OP_lit<n> byte; a negative constant with
// DW_OP_plus is rewritten as DW_OP_lit<|n|> DW_OP_minus when |n| <= 31;
// otherwise DW_OP_consts with an SLEB128 payload is emitted, followed by the
// (possibly rewritten) operation byte.
// NOTE(review): this capture elides the helper's signature line and the
// SLEB128 append after DW_OP_consts -- verify against upstream.
6394 // Convenience function to create a DWARF expression for: Constant `Operation`.
6395 // This helper emits compact sequences for common cases. For example, for`-15
6396 // DW_OP_plus`, this helper would create DW_OP_lit15 DW_OP_minus.
6399 if (Operation == dwarf::DW_OP_plus && Constant < 0 && -Constant <= 31) {
6400 // -Constant (1 to 31)
6401 Expr.push_back(dwarf::DW_OP_lit0 - Constant);
6402 Operation = dwarf::DW_OP_minus;
6403 } else if (Constant >= 0 && Constant <= 31) {
6404 // Literal value 0 to 31
6405 Expr.push_back(dwarf::DW_OP_lit0 + Constant);
6406 } else {
6407 // Signed constant
6408 Expr.push_back(dwarf::DW_OP_consts);
6410 }
6411 return Expr.push_back(Operation);
6412}
6413
// Appends a DWARF expression that pushes the value of register RegNum:
// DW_OP_bregx <reg> 0 (base register with zero offset).
// NOTE(review): the line encoding RegNum between the opcode and the trailing
// 0 was elided from this capture -- presumably a ULEB128 append; verify
// against upstream.
6414 // Convenience function to create a DWARF expression for a register.
6415 static void appendReadRegExpr(SmallVectorImpl<char> &Expr, unsigned RegNum) {
6416 Expr.push_back((char)dwarf::DW_OP_bregx);
6418 Expr.push_back(0);
6419 }
6420
// Appends a DWARF expression that loads a 64-bit value from
// CFA + OffsetFromDefCFA: duplicates the CFA assumed to be on top of the
// DWARF stack, adds the offset, then dereferences.
// NOTE(review): the first line of this helper's signature was elided from
// this capture.
6421 // Convenience function to create a DWARF expression for loading a register from
6422 // a CFA offset.
6424 int64_t OffsetFromDefCFA) {
6425 // This assumes the top of the DWARF stack contains the CFA.
6426 Expr.push_back(dwarf::DW_OP_dup);
6427 // Add the offset to the register.
6428 appendConstantExpr(Expr, OffsetFromDefCFA, dwarf::DW_OP_plus);
6429 // Dereference the address (loads a 64 bit value)..
6430 Expr.push_back(dwarf::DW_OP_deref);
6431}
6432
6433// Convenience function to create a comment for
6434// (+/-) NumBytes (* RegScale)?
6435static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment,
6436 StringRef RegScale = {}) {
6437 if (NumBytes) {
6438 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
6439 if (!RegScale.empty())
6440 Comment << ' ' << RegScale;
6441 }
6442}
6443
// Builds a DW_CFA_def_cfa_expression CFI escape defining the CFA as
// Reg + NumBytes + VG * NumVGScaledBytes, with a matching human-readable
// comment for asm output. The scalable part is only emitted when present.
// NOTE(review): the function's signature line was elided from this capture.
6444 // Creates an MCCFIInstruction:
6445 // { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
6447 unsigned Reg,
6448 const StackOffset &Offset) {
6449 int64_t NumBytes, NumVGScaledBytes;
6450 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
6451 NumVGScaledBytes);
6452 std::string CommentBuffer;
6453 llvm::raw_string_ostream Comment(CommentBuffer);
6454
// Start the comment with a friendly name for the base register.
6455 if (Reg == AArch64::SP)
6456 Comment << "sp";
6457 else if (Reg == AArch64::FP)
6458 Comment << "fp";
6459 else
6460 Comment << printReg(Reg, &TRI);
6461
6462 // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes)
6463 SmallString<64> Expr;
6464 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
// DW_OP_breg0..DW_OP_breg31 are single-byte opcodes, hence the bound.
6465 assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
6466 // Reg + NumBytes
6467 Expr.push_back(dwarf::DW_OP_breg0 + DwarfReg);
6468 appendLEB128<LEB128Sign::Signed>(Expr, NumBytes);
6469 appendOffsetComment(NumBytes, Comment);
6470 if (NumVGScaledBytes) {
6471 // + VG * NumVGScaledBytes
6472 appendOffsetComment(NumVGScaledBytes, Comment, "* VG");
6473 appendReadRegExpr(Expr, TRI.getDwarfRegNum(AArch64::VG, true));
6474 appendConstantExpr(Expr, NumVGScaledBytes, dwarf::DW_OP_mul);
6475 Expr.push_back(dwarf::DW_OP_plus);
6476 }
6477
6478 // Wrap this into DW_CFA_def_cfa.
6479 SmallString<64> DefCfaExpr;
6480 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
6481 appendLEB128<LEB128Sign::Unsigned>(DefCfaExpr, Expr.size());
6482 DefCfaExpr.append(Expr.str());
6483 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
6484 Comment.str());
6485}
6486
// Creates the cheapest CFI instruction that defines the CFA as Reg + Offset:
// a full DWARF expression when the offset has a scalable component, a plain
// cfiDefCfaOffset when only the offset changes (same frame register and the
// last adjustment was not scalable), otherwise cfiDefCfa with the register.
// NOTE(review): the function's signature line was elided from this capture.
6488 unsigned FrameReg, unsigned Reg,
6489 const StackOffset &Offset,
6490 bool LastAdjustmentWasScalable) {
6491 if (Offset.getScalable())
6492 return createDefCFAExpression(TRI, Reg, Offset);
6493
6494 if (FrameReg == Reg && !LastAdjustmentWasScalable)
6495 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
6496
6497 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6498 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
6499}
6500
// Creates a CFI instruction describing where Reg is saved relative to the
// CFA. Fixed-only offsets use DW_CFA_offset; offsets with a scalable part
// are emitted as a DW_CFA_expression computing
// CFA + VG * NumVGScaledBytes + NumBytes. When the incoming VG was itself
// spilled (IncomingVGOffsetFromDefCFA is set), its saved value is loaded
// from the CFA instead of reading the live VG register.
// NOTE(review): the leading signature lines were elided from this capture.
6503 const StackOffset &OffsetFromDefCFA,
6504 std::optional<int64_t> IncomingVGOffsetFromDefCFA) {
6505 int64_t NumBytes, NumVGScaledBytes;
6506 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6507 OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
6508
6509 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6510
6511 // Non-scalable offsets can use DW_CFA_offset directly.
6512 if (!NumVGScaledBytes)
6513 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
6514
6515 std::string CommentBuffer;
6516 llvm::raw_string_ostream Comment(CommentBuffer);
6517 Comment << printReg(Reg, &TRI) << " @ cfa";
6518
6519 // Build up expression (CFA + VG * NumVGScaledBytes + NumBytes)
6520 assert(NumVGScaledBytes && "Expected scalable offset");
6521 SmallString<64> OffsetExpr;
6522 // + VG * NumVGScaledBytes
6523 StringRef VGRegScale;
6524 if (IncomingVGOffsetFromDefCFA) {
6525 appendLoadRegExpr(OffsetExpr, *IncomingVGOffsetFromDefCFA);
6526 VGRegScale = "* IncomingVG";
6527 } else {
6528 appendReadRegExpr(OffsetExpr, TRI.getDwarfRegNum(AArch64::VG, true));
6529 VGRegScale = "* VG";
6530 }
6531 appendConstantExpr(OffsetExpr, NumVGScaledBytes, dwarf::DW_OP_mul);
6532 appendOffsetComment(NumVGScaledBytes, Comment, VGRegScale);
6533 OffsetExpr.push_back(dwarf::DW_OP_plus);
6534 if (NumBytes) {
6535 // + NumBytes
6536 appendOffsetComment(NumBytes, Comment);
6537 appendConstantExpr(OffsetExpr, NumBytes, dwarf::DW_OP_plus);
6538 }
6539
6540 // Wrap this into DW_CFA_expression
6541 SmallString<64> CfaExpr;
6542 CfaExpr.push_back(dwarf::DW_CFA_expression);
6543 appendLEB128<LEB128Sign::Unsigned>(CfaExpr, DwarfReg);
6544 appendLEB128<LEB128Sign::Unsigned>(CfaExpr, OffsetExpr.size());
6545 CfaExpr.append(OffsetExpr.str());
6546
6547 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
6548 Comment.str());
6549}
6550
// Emits DestReg = SrcReg + Offset for a single explicit opcode, splitting
// the offset into chunks that fit the opcode's immediate encoding (ADD/SUB
// Xri: 12 bits with optional LSL #12; ADDVL/ADDPL and streaming variants:
// [-32, 31]). Optionally emits CFA CFI and Windows SEH directives alongside
// each chunk.
// NOTE(review): this capture elides the signature's opening lines and the
// shifted-immediate encoding argument at line 6629 -- verify against
// upstream.
6551 // Helper function to emit a frame offset adjustment from a given
6552 // pointer (SrcReg), stored into DestReg. This function is explicit
6553 // in that it requires the opcode.
6556 const DebugLoc &DL, unsigned DestReg,
6557 unsigned SrcReg, int64_t Offset, unsigned Opc,
6558 const TargetInstrInfo *TII,
6559 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
6560 bool *HasWinCFI, bool EmitCFAOffset,
6561 StackOffset CFAOffset, unsigned FrameReg) {
6562 int Sign = 1;
6563 unsigned MaxEncoding, ShiftSize;
// Immediate-range parameters per opcode family.
6564 switch (Opc) {
6565 case AArch64::ADDXri:
6566 case AArch64::ADDSXri:
6567 case AArch64::SUBXri:
6568 case AArch64::SUBSXri:
6569 MaxEncoding = 0xfff;
6570 ShiftSize = 12;
6571 break;
6572 case AArch64::ADDVL_XXI:
6573 case AArch64::ADDPL_XXI:
6574 case AArch64::ADDSVL_XXI:
6575 case AArch64::ADDSPL_XXI:
6576 MaxEncoding = 31;
6577 ShiftSize = 0;
// These opcodes take a signed immediate; negate and track the sign so the
// chunking loop below can work on a positive Offset.
6578 if (Offset < 0) {
6579 MaxEncoding = 32;
6580 Sign = -1;
6581 Offset = -Offset;
6582 }
6583 break;
6584 default:
6585 llvm_unreachable("Unsupported opcode");
6586 }
6587
6588 // `Offset` can be in bytes or in "scalable bytes".
6589 int VScale = 1;
6590 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
6591 VScale = 16;
6592 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
6593 VScale = 2;
6594
6595 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
6596 // scratch register. If DestReg is a virtual register, use it as the
6597 // scratch register; otherwise, create a new virtual register (to be
6598 // replaced by the scavenger at the end of PEI). That case can be optimized
6599 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
6600 // register can be loaded with offset%8 and the add/sub can use an extending
6601 // instruction with LSL#3.
6602 // Currently the function handles any offsets but generates a poor sequence
6603 // of code.
6604 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
6605
6606 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
6607 Register TmpReg = DestReg;
// XZR cannot hold intermediate results; use a fresh virtual scratch.
6608 if (TmpReg == AArch64::XZR)
6609 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
6610 &AArch64::GPR64RegClass);
// Peel off the largest encodable chunk per iteration until Offset is zero.
6611 do {
6612 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
6613 unsigned LocalShiftSize = 0;
6614 if (ThisVal > MaxEncoding) {
6615 ThisVal = ThisVal >> ShiftSize;
6616 LocalShiftSize = ShiftSize;
6617 }
6618 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
6619 "Encoding cannot handle value that big");
6620
6621 Offset -= ThisVal << LocalShiftSize;
// Write the final chunk directly into the real destination register.
6622 if (Offset == 0)
6623 TmpReg = DestReg;
6624 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
6625 .addReg(SrcReg)
6626 .addImm(Sign * (int)ThisVal);
6627 if (ShiftSize)
6628 MBI = MBI.addImm(
6630 MBI = MBI.setMIFlag(Flag);
6631
// Track the running CFA delta implied by this chunk.
6632 auto Change =
6633 VScale == 1
6634 ? StackOffset::getFixed(ThisVal << LocalShiftSize)
6635 : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
6636 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
6637 CFAOffset += Change;
6638 else
6639 CFAOffset -= Change;
6640 if (EmitCFAOffset && DestReg == TmpReg) {
6641 MachineFunction &MF = *MBB.getParent();
6642 const TargetSubtargetInfo &STI = MF.getSubtarget();
6643 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
6644
6645 unsigned CFIIndex = MF.addFrameInst(
6646 createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
6647 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
6648 .addCFIIndex(CFIIndex)
6649 .setMIFlags(Flag);
6650 }
6651
// Windows unwind info: choose the SEH directive matching this adjustment.
6652 if (NeedsWinCFI) {
6653 int Imm = (int)(ThisVal << LocalShiftSize);
6654 if (VScale != 1 && DestReg == AArch64::SP) {
6655 if (HasWinCFI)
6656 *HasWinCFI = true;
6657 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AllocZ))
6658 .addImm(ThisVal)
6659 .setMIFlag(Flag);
6660 } else if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
6661 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
6662 assert(VScale == 1 && "Expected non-scalable operation");
6663 if (HasWinCFI)
6664 *HasWinCFI = true;
6665 if (Imm == 0)
6666 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
6667 else
6668 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
6669 .addImm(Imm)
6670 .setMIFlag(Flag);
6671 assert(Offset == 0 && "Expected remaining offset to be zero to "
6672 "emit a single SEH directive");
6673 } else if (DestReg == AArch64::SP) {
6674 assert(VScale == 1 && "Expected non-scalable operation");
6675 if (HasWinCFI)
6676 *HasWinCFI = true;
6677 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
6678 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
6679 .addImm(Imm)
6680 .setMIFlag(Flag);
6681 }
6682 }
6683
// Subsequent chunks accumulate onto what we just produced.
6684 SrcReg = TmpReg;
6685 } while (Offset);
6686}
6687
// Emits DestReg = SrcReg + Offset, decomposing the StackOffset into a fixed
// byte part (ADD/SUB Xri), whole SVE data vectors (ADDVL/ADDSVL) and
// predicate vectors (ADDPL/ADDSPL), delegating each part to
// emitFrameOffsetAdj. If SetNZCV is requested together with a scalable
// component, the flags are instead produced by a trailing ADDS #0 since the
// scalable opcodes do not set flags.
// NOTE(review): the signature's opening lines and the Offset/TII parameter
// line were elided from this capture.
6690 unsigned DestReg, unsigned SrcReg,
6692 MachineInstr::MIFlag Flag, bool SetNZCV,
6693 bool NeedsWinCFI, bool *HasWinCFI,
6694 bool EmitCFAOffset, StackOffset CFAOffset,
6695 unsigned FrameReg) {
6696 // If a function is marked as arm_locally_streaming, then the runtime value of
6697 // vscale in the prologue/epilogue is different the runtime value of vscale
6698 // in the function's body. To avoid having to consider multiple vscales,
6699 // we can use `addsvl` to allocate any scalable stack-slots, which under
6700 // most circumstances will be only locals, not callee-save slots.
6701 const Function &F = MBB.getParent()->getFunction();
6702 bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
6703
6704 int64_t Bytes, NumPredicateVectors, NumDataVectors;
6705 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6706 Offset, Bytes, NumPredicateVectors, NumDataVectors);
6707
6708 // Insert ADDSXri for scalable offset at the end.
6709 bool NeedsFinalDefNZCV = SetNZCV && (NumPredicateVectors || NumDataVectors);
6710 if (NeedsFinalDefNZCV)
6711 SetNZCV = false;
6712
6713 // First emit non-scalable frame offsets, or a simple 'mov'.
6714 if (Bytes || (!Offset && SrcReg != DestReg)) {
6715 assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
6716 "SP increment/decrement not 8-byte aligned");
6717 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
6718 if (Bytes < 0) {
6719 Bytes = -Bytes;
6720 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
6721 }
6722 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
6723 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6724 FrameReg);
// Keep the running CFA offset in sync for the scalable parts below.
6725 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
6726 ? StackOffset::getFixed(-Bytes)
6727 : StackOffset::getFixed(Bytes);
6728 SrcReg = DestReg;
6729 FrameReg = DestReg;
6730 }
6731
6732 assert(!(NeedsWinCFI && NumPredicateVectors) &&
6733 "WinCFI can't allocate fractions of an SVE data vector");
6734
6735 if (NumDataVectors) {
6736 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
6737 UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, TII,
6738 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6739 FrameReg);
6740 CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
6741 SrcReg = DestReg;
6742 }
6743
6744 if (NumPredicateVectors) {
6745 assert(DestReg != AArch64::SP && "Unaligned access to SP");
6746 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
6747 UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, TII,
6748 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6749 FrameReg);
6750 }
6751
// ADDS DestReg, DestReg, #0 defines NZCV from the final value.
6752 if (NeedsFinalDefNZCV)
6753 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDSXri), DestReg)
6754 .addReg(DestReg)
6755 .addImm(0)
6756 .addImm(0);
6757}
6758
// Attempts to fold a spill or fill of FrameIndex directly into MI when MI is
// a COPY, emitting a store/load of the appropriate width instead of a
// COPY + separate memory access. Returns the new memory instruction, or
// nullptr when no folding is possible. Also constrains GPR64all virtual
// registers copied from/to SP so they cannot later be spilled as SP.
// NOTE(review): the signature's opening lines and one line near the Ops
// handling (original line 6818) were elided from this capture.
6761 MachineBasicBlock::iterator InsertPt, int FrameIndex,
6762 LiveIntervals *LIS, VirtRegMap *VRM) const {
6763 // This is a bit of a hack. Consider this instruction:
6764 //
6765 // %0 = COPY %sp; GPR64all:%0
6766 //
6767 // We explicitly chose GPR64all for the virtual register so such a copy might
6768 // be eliminated by RegisterCoalescer. However, that may not be possible, and
6769 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
6770 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
6771 //
6772 // To prevent that, we are going to constrain the %0 register class here.
6773 if (MI.isFullCopy()) {
6774 Register DstReg = MI.getOperand(0).getReg();
6775 Register SrcReg = MI.getOperand(1).getReg();
6776 if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
6777 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
6778 return nullptr;
6779 }
6780 if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
6781 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
6782 return nullptr;
6783 }
6784 // Nothing can folded with copy from/to NZCV.
6785 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
6786 return nullptr;
6787 }
6788
6789 // Handle the case where a copy is being spilled or filled but the source
6790 // and destination register class don't match. For example:
6791 //
6792 // %0 = COPY %xzr; GPR64common:%0
6793 //
6794 // In this case we can still safely fold away the COPY and generate the
6795 // following spill code:
6796 //
6797 // STRXui %xzr, %stack.0
6798 //
6799 // This also eliminates spilled cross register class COPYs (e.g. between x and
6800 // d regs) of the same size. For example:
6801 //
6802 // %0 = COPY %1; GPR64:%0, FPR64:%1
6803 //
6804 // will be filled as
6805 //
6806 // LDRDui %0, fi<#0>
6807 //
6808 // instead of
6809 //
6810 // LDRXui %Temp, fi<#0>
6811 // %0 = FMOV %Temp
6812 //
6813 if (MI.isCopy() && Ops.size() == 1 &&
6814 // Make sure we're only folding the explicit COPY defs/uses.
6815 (Ops[0] == 0 || Ops[0] == 1)) {
// Operand 0 is the def (spill), operand 1 is the use (fill).
6816 bool IsSpill = Ops[0] == 0;
6817 bool IsFill = !IsSpill;
6819 const MachineRegisterInfo &MRI = MF.getRegInfo();
6820 MachineBasicBlock &MBB = *MI.getParent();
6821 const MachineOperand &DstMO = MI.getOperand(0);
6822 const MachineOperand &SrcMO = MI.getOperand(1);
6823 Register DstReg = DstMO.getReg();
6824 Register SrcReg = SrcMO.getReg();
6825 // This is slightly expensive to compute for physical regs since
6826 // getMinimalPhysRegClass is slow.
6827 auto getRegClass = [&](unsigned Reg) {
6828 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
6829 : TRI.getMinimalPhysRegClass(Reg);
6830 };
6831
// Same-size full-register copy: spill/fill directly in the other class.
6832 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
6833 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
6834 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
6835 "Mismatched register size in non subreg COPY");
6836 if (IsSpill)
6837 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
6838 getRegClass(SrcReg), Register());
6839 else
6840 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
6841 getRegClass(DstReg), Register());
6842 return &*--InsertPt;
6843 }
6844
6845 // Handle cases like spilling def of:
6846 //
6847 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
6848 //
6849 // where the physical register source can be widened and stored to the full
6850 // virtual reg destination stack slot, in this case producing:
6851 //
6852 // STRXui %xzr, %stack.0
6853 //
6854 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
6855 TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
6856 assert(SrcMO.getSubReg() == 0 &&
6857 "Unexpected subreg on physical register");
6858 storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
6859 FrameIndex, &AArch64::GPR64RegClass, Register());
6860 return &*--InsertPt;
6861 }
6862
6863 // Handle cases like filling use of:
6864 //
6865 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
6866 //
6867 // where we can load the full virtual reg source stack slot, into the subreg
6868 // destination, in this case producing:
6869 //
6870 // LDRWui %0:sub_32<def,read-undef>, %stack.0
6871 //
6872 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
6873 const TargetRegisterClass *FillRC = nullptr;
6874 switch (DstMO.getSubReg()) {
6875 default:
6876 break;
6877 case AArch64::sub_32:
6878 if (AArch64::GPR64RegClass.hasSubClassEq(getRegClass(DstReg)))
6879 FillRC = &AArch64::GPR32RegClass;
6880 break;
6881 case AArch64::ssub:
6882 FillRC = &AArch64::FPR32RegClass;
6883 break;
6884 case AArch64::dsub:
6885 FillRC = &AArch64::FPR64RegClass;
6886 break;
6887 }
6888
6889 if (FillRC) {
6890 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
6891 TRI.getRegSizeInBits(*FillRC) &&
6892 "Mismatched regclass size on folded subreg COPY");
6893 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC,
6894 Register());
// Re-target the load at the subregister the COPY was defining.
6895 MachineInstr &LoadMI = *--InsertPt;
6896 MachineOperand &LoadDst = LoadMI.getOperand(0);
6897 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
6898 LoadDst.setSubReg(DstMO.getSubReg());
6899 LoadDst.setIsUndef();
6900 return &LoadMI;
6901 }
6902 }
6903 }
6904
6905 // Cannot fold.
6906 return nullptr;
6907}
6908
6910 StackOffset &SOffset,
6911 bool *OutUseUnscaledOp,
6912 unsigned *OutUnscaledOp,
6913 int64_t *EmittableOffset) {
6914 // Set output values in case of early exit.
6915 if (EmittableOffset)
6916 *EmittableOffset = 0;
6917 if (OutUseUnscaledOp)
6918 *OutUseUnscaledOp = false;
6919 if (OutUnscaledOp)
6920 *OutUnscaledOp = 0;
6921
6922 // Exit early for structured vector spills/fills as they can't take an
6923 // immediate offset.
6924 switch (MI.getOpcode()) {
6925 default:
6926 break;
6927 case AArch64::LD1Rv1d:
6928 case AArch64::LD1Rv2s:
6929 case AArch64::LD1Rv2d:
6930 case AArch64::LD1Rv4h:
6931 case AArch64::LD1Rv4s:
6932 case AArch64::LD1Rv8b:
6933 case AArch64::LD1Rv8h:
6934 case AArch64::LD1Rv16b:
6935 case AArch64::LD1Twov2d:
6936 case AArch64::LD1Threev2d:
6937 case AArch64::LD1Fourv2d:
6938 case AArch64::LD1Twov1d:
6939 case AArch64::LD1Threev1d:
6940 case AArch64::LD1Fourv1d:
6941 case AArch64::ST1Twov2d:
6942 case AArch64::ST1Threev2d:
6943 case AArch64::ST1Fourv2d:
6944 case AArch64::ST1Twov1d:
6945 case AArch64::ST1Threev1d:
6946 case AArch64::ST1Fourv1d:
6947 case AArch64::ST1i8:
6948 case AArch64::ST1i16:
6949 case AArch64::ST1i32:
6950 case AArch64::ST1i64:
6951 case AArch64::IRG:
6952 case AArch64::IRGstack:
6953 case AArch64::STGloop:
6954 case AArch64::STZGloop:
6956 }
6957
6958 // Get the min/max offset and the scale.
6959 TypeSize ScaleValue(0U, false), Width(0U, false);
6960 int64_t MinOff, MaxOff;
6961 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
6962 MaxOff))
6963 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
6964
6965 // Construct the complete offset.
6966 bool IsMulVL = ScaleValue.isScalable();
6967 unsigned Scale = ScaleValue.getKnownMinValue();
6968 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
6969
6970 const MachineOperand &ImmOpnd =
6971 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
6972 Offset += ImmOpnd.getImm() * Scale;
6973
6974 // If the offset doesn't match the scale, we rewrite the instruction to
6975 // use the unscaled instruction instead. Likewise, if we have a negative
6976 // offset and there is an unscaled op to use.
6977 std::optional<unsigned> UnscaledOp =
6979 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
6980 if (useUnscaledOp &&
6981 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
6982 MaxOff))
6983 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
6984
6985 Scale = ScaleValue.getKnownMinValue();
6986 assert(IsMulVL == ScaleValue.isScalable() &&
6987 "Unscaled opcode has different value for scalable");
6988
6989 int64_t Remainder = Offset % Scale;
6990 assert(!(Remainder && useUnscaledOp) &&
6991 "Cannot have remainder when using unscaled op");
6992
6993 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
6994 int64_t NewOffset = Offset / Scale;
6995 if (MinOff <= NewOffset && NewOffset <= MaxOff)
6996 Offset = Remainder;
6997 else {
6998 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
6999 Offset = Offset - (NewOffset * Scale);
7000 }
7001
7002 if (EmittableOffset)
7003 *EmittableOffset = NewOffset;
7004 if (OutUseUnscaledOp)
7005 *OutUseUnscaledOp = useUnscaledOp;
7006 if (OutUnscaledOp && UnscaledOp)
7007 *OutUnscaledOp = *UnscaledOp;
7008
7009 if (IsMulVL)
7010 SOffset = StackOffset::get(SOffset.getFixed(), Offset);
7011 else
7012 SOffset = StackOffset::get(Offset, SOffset.getScalable());
7014 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
7015}
7016
7018 unsigned FrameReg, StackOffset &Offset,
7019 const AArch64InstrInfo *TII) {
7020 unsigned Opcode = MI.getOpcode();
7021 unsigned ImmIdx = FrameRegIdx + 1;
7022
7023 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
7024 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
7025 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
7026 MI.getOperand(0).getReg(), FrameReg, Offset, TII,
7027 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
7028 MI.eraseFromParent();
7029 Offset = StackOffset();
7030 return true;
7031 }
7032
7033 int64_t NewOffset;
7034 unsigned UnscaledOp;
7035 bool UseUnscaledOp;
7036 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
7037 &UnscaledOp, &NewOffset);
7040 // Replace the FrameIndex with FrameReg.
7041 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
7042 if (UseUnscaledOp)
7043 MI.setDesc(TII->get(UnscaledOp));
7044
7045 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
7046 return !Offset;
7047 }
7048
7049 return false;
7050}
7051
7057
7058MCInst AArch64InstrInfo::getNop() const { return MCInstBuilder(AArch64::NOP); }
7059
7060// AArch64 supports MachineCombiner.
7061bool AArch64InstrInfo::useMachineCombiner() const { return true; }
7062
7063// True when Opc sets flag
7064static bool isCombineInstrSettingFlag(unsigned Opc) {
7065 switch (Opc) {
7066 case AArch64::ADDSWrr:
7067 case AArch64::ADDSWri:
7068 case AArch64::ADDSXrr:
7069 case AArch64::ADDSXri:
7070 case AArch64::SUBSWrr:
7071 case AArch64::SUBSXrr:
7072 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7073 case AArch64::SUBSWri:
7074 case AArch64::SUBSXri:
7075 return true;
7076 default:
7077 break;
7078 }
7079 return false;
7080}
7081
7082// 32b Opcodes that can be combined with a MUL
7083static bool isCombineInstrCandidate32(unsigned Opc) {
7084 switch (Opc) {
7085 case AArch64::ADDWrr:
7086 case AArch64::ADDWri:
7087 case AArch64::SUBWrr:
7088 case AArch64::ADDSWrr:
7089 case AArch64::ADDSWri:
7090 case AArch64::SUBSWrr:
7091 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7092 case AArch64::SUBWri:
7093 case AArch64::SUBSWri:
7094 return true;
7095 default:
7096 break;
7097 }
7098 return false;
7099}
7100
7101// 64b Opcodes that can be combined with a MUL
7102static bool isCombineInstrCandidate64(unsigned Opc) {
7103 switch (Opc) {
7104 case AArch64::ADDXrr:
7105 case AArch64::ADDXri:
7106 case AArch64::SUBXrr:
7107 case AArch64::ADDSXrr:
7108 case AArch64::ADDSXri:
7109 case AArch64::SUBSXrr:
7110 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7111 case AArch64::SUBXri:
7112 case AArch64::SUBSXri:
7113 case AArch64::ADDv8i8:
7114 case AArch64::ADDv16i8:
7115 case AArch64::ADDv4i16:
7116 case AArch64::ADDv8i16:
7117 case AArch64::ADDv2i32:
7118 case AArch64::ADDv4i32:
7119 case AArch64::SUBv8i8:
7120 case AArch64::SUBv16i8:
7121 case AArch64::SUBv4i16:
7122 case AArch64::SUBv8i16:
7123 case AArch64::SUBv2i32:
7124 case AArch64::SUBv4i32:
7125 return true;
7126 default:
7127 break;
7128 }
7129 return false;
7130}
7131
7132// FP Opcodes that can be combined with a FMUL.
7133static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
7134 switch (Inst.getOpcode()) {
7135 default:
7136 break;
7137 case AArch64::FADDHrr:
7138 case AArch64::FADDSrr:
7139 case AArch64::FADDDrr:
7140 case AArch64::FADDv4f16:
7141 case AArch64::FADDv8f16:
7142 case AArch64::FADDv2f32:
7143 case AArch64::FADDv2f64:
7144 case AArch64::FADDv4f32:
7145 case AArch64::FSUBHrr:
7146 case AArch64::FSUBSrr:
7147 case AArch64::FSUBDrr:
7148 case AArch64::FSUBv4f16:
7149 case AArch64::FSUBv8f16:
7150 case AArch64::FSUBv2f32:
7151 case AArch64::FSUBv2f64:
7152 case AArch64::FSUBv4f32:
7154 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
7155 // the target options or if FADD/FSUB has the contract fast-math flag.
7156 return Options.AllowFPOpFusion == FPOpFusion::Fast ||
7158 }
7159 return false;
7160}
7161
7162// Opcodes that can be combined with a MUL
7166
7167//
7168// Utility routine that checks if \param MO is defined by an
7169// \param CombineOpc instruction in the basic block \param MBB
7171 unsigned CombineOpc, unsigned ZeroReg = 0,
7172 bool CheckZeroReg = false) {
7173 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7174 MachineInstr *MI = nullptr;
7175
7176 if (MO.isReg() && MO.getReg().isVirtual())
7177 MI = MRI.getUniqueVRegDef(MO.getReg());
7178 // And it needs to be in the trace (otherwise, it won't have a depth).
7179 if (!MI || MI->getParent() != &MBB || MI->getOpcode() != CombineOpc)
7180 return false;
7181 // Must only used by the user we combine with.
7182 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
7183 return false;
7184
7185 if (CheckZeroReg) {
7186 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
7187 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
7188 MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
7189 // The third input reg must be zero.
7190 if (MI->getOperand(3).getReg() != ZeroReg)
7191 return false;
7192 }
7193
7194 if (isCombineInstrSettingFlag(CombineOpc) &&
7195 MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
7196 return false;
7197
7198 return true;
7199}
7200
7201//
7202// Is \param MO defined by an integer multiply and can be combined?
7204 unsigned MulOpc, unsigned ZeroReg) {
7205 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
7206}
7207
7208//
7209// Is \param MO defined by a floating-point multiply and can be combined?
7211 unsigned MulOpc) {
7212 return canCombine(MBB, MO, MulOpc);
7213}
7214
7215// TODO: There are many more machine instruction opcodes to match:
7216// 1. Other data types (integer, vectors)
7217// 2. Other math / logic operations (xor, or)
7218// 3. Other forms of the same operation (intrinsics and other variants)
7219bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
7220 bool Invert) const {
7221 if (Invert)
7222 return false;
7223 switch (Inst.getOpcode()) {
7224 // == Floating-point types ==
7225 // -- Floating-point instructions --
7226 case AArch64::FADDHrr:
7227 case AArch64::FADDSrr:
7228 case AArch64::FADDDrr:
7229 case AArch64::FMULHrr:
7230 case AArch64::FMULSrr:
7231 case AArch64::FMULDrr:
7232 case AArch64::FMULX16:
7233 case AArch64::FMULX32:
7234 case AArch64::FMULX64:
7235 // -- Advanced SIMD instructions --
7236 case AArch64::FADDv4f16:
7237 case AArch64::FADDv8f16:
7238 case AArch64::FADDv2f32:
7239 case AArch64::FADDv4f32:
7240 case AArch64::FADDv2f64:
7241 case AArch64::FMULv4f16:
7242 case AArch64::FMULv8f16:
7243 case AArch64::FMULv2f32:
7244 case AArch64::FMULv4f32:
7245 case AArch64::FMULv2f64:
7246 case AArch64::FMULXv4f16:
7247 case AArch64::FMULXv8f16:
7248 case AArch64::FMULXv2f32:
7249 case AArch64::FMULXv4f32:
7250 case AArch64::FMULXv2f64:
7251 // -- SVE instructions --
7252 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
7253 // in the SVE instruction set (though there are predicated ones).
7254 case AArch64::FADD_ZZZ_H:
7255 case AArch64::FADD_ZZZ_S:
7256 case AArch64::FADD_ZZZ_D:
7257 case AArch64::FMUL_ZZZ_H:
7258 case AArch64::FMUL_ZZZ_S:
7259 case AArch64::FMUL_ZZZ_D:
7262
7263 // == Integer types ==
7264 // -- Base instructions --
7265 // Opcodes MULWrr and MULXrr don't exist because
7266 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
7267 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
7268 // The machine-combiner does not support three-source-operands machine
7269 // instruction. So we cannot reassociate MULs.
7270 case AArch64::ADDWrr:
7271 case AArch64::ADDXrr:
7272 case AArch64::ANDWrr:
7273 case AArch64::ANDXrr:
7274 case AArch64::ORRWrr:
7275 case AArch64::ORRXrr:
7276 case AArch64::EORWrr:
7277 case AArch64::EORXrr:
7278 case AArch64::EONWrr:
7279 case AArch64::EONXrr:
7280 // -- Advanced SIMD instructions --
7281 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
7282 // in the Advanced SIMD instruction set.
7283 case AArch64::ADDv8i8:
7284 case AArch64::ADDv16i8:
7285 case AArch64::ADDv4i16:
7286 case AArch64::ADDv8i16:
7287 case AArch64::ADDv2i32:
7288 case AArch64::ADDv4i32:
7289 case AArch64::ADDv1i64:
7290 case AArch64::ADDv2i64:
7291 case AArch64::MULv8i8:
7292 case AArch64::MULv16i8:
7293 case AArch64::MULv4i16:
7294 case AArch64::MULv8i16:
7295 case AArch64::MULv2i32:
7296 case AArch64::MULv4i32:
7297 case AArch64::ANDv8i8:
7298 case AArch64::ANDv16i8:
7299 case AArch64::ORRv8i8:
7300 case AArch64::ORRv16i8:
7301 case AArch64::EORv8i8:
7302 case AArch64::EORv16i8:
7303 // -- SVE instructions --
7304 case AArch64::ADD_ZZZ_B:
7305 case AArch64::ADD_ZZZ_H:
7306 case AArch64::ADD_ZZZ_S:
7307 case AArch64::ADD_ZZZ_D:
7308 case AArch64::MUL_ZZZ_B:
7309 case AArch64::MUL_ZZZ_H:
7310 case AArch64::MUL_ZZZ_S:
7311 case AArch64::MUL_ZZZ_D:
7312 case AArch64::AND_ZZZ:
7313 case AArch64::ORR_ZZZ:
7314 case AArch64::EOR_ZZZ:
7315 return true;
7316
7317 default:
7318 return false;
7319 }
7320}
7321
7322/// Find instructions that can be turned into madd.
7324 SmallVectorImpl<unsigned> &Patterns) {
7325 unsigned Opc = Root.getOpcode();
7326 MachineBasicBlock &MBB = *Root.getParent();
7327 bool Found = false;
7328
7330 return false;
7332 int Cmp_NZCV =
7333 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
7334 // When NZCV is live bail out.
7335 if (Cmp_NZCV == -1)
7336 return false;
7337 unsigned NewOpc = convertToNonFlagSettingOpc(Root);
7338 // When opcode can't change bail out.
7339 // CHECKME: do we miss any cases for opcode conversion?
7340 if (NewOpc == Opc)
7341 return false;
7342 Opc = NewOpc;
7343 }
7344
7345 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
7346 unsigned Pattern) {
7347 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
7348 Patterns.push_back(Pattern);
7349 Found = true;
7350 }
7351 };
7352
7353 auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
7354 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
7355 Patterns.push_back(Pattern);
7356 Found = true;
7357 }
7358 };
7359
7361
7362 switch (Opc) {
7363 default:
7364 break;
7365 case AArch64::ADDWrr:
7366 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7367 "ADDWrr does not have register operands");
7368 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
7369 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
7370 break;
7371 case AArch64::ADDXrr:
7372 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
7373 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
7374 break;
7375 case AArch64::SUBWrr:
7376 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
7377 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
7378 break;
7379 case AArch64::SUBXrr:
7380 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
7381 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
7382 break;
7383 case AArch64::ADDWri:
7384 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
7385 break;
7386 case AArch64::ADDXri:
7387 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
7388 break;
7389 case AArch64::SUBWri:
7390 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
7391 break;
7392 case AArch64::SUBXri:
7393 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
7394 break;
7395 case AArch64::ADDv8i8:
7396 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
7397 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
7398 break;
7399 case AArch64::ADDv16i8:
7400 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
7401 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
7402 break;
7403 case AArch64::ADDv4i16:
7404 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
7405 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
7406 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
7407 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
7408 break;
7409 case AArch64::ADDv8i16:
7410 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
7411 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
7412 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
7413 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
7414 break;
7415 case AArch64::ADDv2i32:
7416 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
7417 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
7418 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
7419 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
7420 break;
7421 case AArch64::ADDv4i32:
7422 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
7423 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
7424 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
7425 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
7426 break;
7427 case AArch64::SUBv8i8:
7428 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
7429 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
7430 break;
7431 case AArch64::SUBv16i8:
7432 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
7433 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
7434 break;
7435 case AArch64::SUBv4i16:
7436 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
7437 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
7438 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
7439 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
7440 break;
7441 case AArch64::SUBv8i16:
7442 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
7443 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
7444 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
7445 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
7446 break;
7447 case AArch64::SUBv2i32:
7448 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
7449 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
7450 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
7451 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
7452 break;
7453 case AArch64::SUBv4i32:
7454 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
7455 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
7456 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
7457 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
7458 break;
7459 }
7460 return Found;
7461}
7462
7463bool AArch64InstrInfo::isAccumulationOpcode(unsigned Opcode) const {
7464 switch (Opcode) {
7465 default:
7466 break;
7467 case AArch64::UABALB_ZZZ_D:
7468 case AArch64::UABALB_ZZZ_H:
7469 case AArch64::UABALB_ZZZ_S:
7470 case AArch64::UABALT_ZZZ_D:
7471 case AArch64::UABALT_ZZZ_H:
7472 case AArch64::UABALT_ZZZ_S:
7473 case AArch64::SABALB_ZZZ_D:
7474 case AArch64::SABALB_ZZZ_S:
7475 case AArch64::SABALB_ZZZ_H:
7476 case AArch64::SABALT_ZZZ_D:
7477 case AArch64::SABALT_ZZZ_S:
7478 case AArch64::SABALT_ZZZ_H:
7479 case AArch64::UABALv16i8_v8i16:
7480 case AArch64::UABALv2i32_v2i64:
7481 case AArch64::UABALv4i16_v4i32:
7482 case AArch64::UABALv4i32_v2i64:
7483 case AArch64::UABALv8i16_v4i32:
7484 case AArch64::UABALv8i8_v8i16:
7485 case AArch64::UABAv16i8:
7486 case AArch64::UABAv2i32:
7487 case AArch64::UABAv4i16:
7488 case AArch64::UABAv4i32:
7489 case AArch64::UABAv8i16:
7490 case AArch64::UABAv8i8:
7491 case AArch64::SABALv16i8_v8i16:
7492 case AArch64::SABALv2i32_v2i64:
7493 case AArch64::SABALv4i16_v4i32:
7494 case AArch64::SABALv4i32_v2i64:
7495 case AArch64::SABALv8i16_v4i32:
7496 case AArch64::SABALv8i8_v8i16:
7497 case AArch64::SABAv16i8:
7498 case AArch64::SABAv2i32:
7499 case AArch64::SABAv4i16:
7500 case AArch64::SABAv4i32:
7501 case AArch64::SABAv8i16:
7502 case AArch64::SABAv8i8:
7503 return true;
7504 }
7505
7506 return false;
7507}
7508
7509unsigned AArch64InstrInfo::getAccumulationStartOpcode(
7510 unsigned AccumulationOpcode) const {
7511 switch (AccumulationOpcode) {
7512 default:
7513 llvm_unreachable("Unsupported accumulation Opcode!");
7514 case AArch64::UABALB_ZZZ_D:
7515 return AArch64::UABDLB_ZZZ_D;
7516 case AArch64::UABALB_ZZZ_H:
7517 return AArch64::UABDLB_ZZZ_H;
7518 case AArch64::UABALB_ZZZ_S:
7519 return AArch64::UABDLB_ZZZ_S;
7520 case AArch64::UABALT_ZZZ_D:
7521 return AArch64::UABDLT_ZZZ_D;
7522 case AArch64::UABALT_ZZZ_H:
7523 return AArch64::UABDLT_ZZZ_H;
7524 case AArch64::UABALT_ZZZ_S:
7525 return AArch64::UABDLT_ZZZ_S;
7526 case AArch64::UABALv16i8_v8i16:
7527 return AArch64::UABDLv16i8_v8i16;
7528 case AArch64::UABALv2i32_v2i64:
7529 return AArch64::UABDLv2i32_v2i64;
7530 case AArch64::UABALv4i16_v4i32:
7531 return AArch64::UABDLv4i16_v4i32;
7532 case AArch64::UABALv4i32_v2i64:
7533 return AArch64::UABDLv4i32_v2i64;
7534 case AArch64::UABALv8i16_v4i32:
7535 return AArch64::UABDLv8i16_v4i32;
7536 case AArch64::UABALv8i8_v8i16:
7537 return AArch64::UABDLv8i8_v8i16;
7538 case AArch64::UABAv16i8:
7539 return AArch64::UABDv16i8;
7540 case AArch64::UABAv2i32:
7541 return AArch64::UABDv2i32;
7542 case AArch64::UABAv4i16:
7543 return AArch64::UABDv4i16;
7544 case AArch64::UABAv4i32:
7545 return AArch64::UABDv4i32;
7546 case AArch64::UABAv8i16:
7547 return AArch64::UABDv8i16;
7548 case AArch64::UABAv8i8:
7549 return AArch64::UABDv8i8;
7550 case AArch64::SABALB_ZZZ_D:
7551 return AArch64::SABDLB_ZZZ_D;
7552 case AArch64::SABALB_ZZZ_S:
7553 return AArch64::SABDLB_ZZZ_S;
7554 case AArch64::SABALB_ZZZ_H:
7555 return AArch64::SABDLB_ZZZ_H;
7556 case AArch64::SABALT_ZZZ_D:
7557 return AArch64::SABDLT_ZZZ_D;
7558 case AArch64::SABALT_ZZZ_S:
7559 return AArch64::SABDLT_ZZZ_S;
7560 case AArch64::SABALT_ZZZ_H:
7561 return AArch64::SABDLT_ZZZ_H;
7562 case AArch64::SABALv16i8_v8i16:
7563 return AArch64::SABDLv16i8_v8i16;
7564 case AArch64::SABALv2i32_v2i64:
7565 return AArch64::SABDLv2i32_v2i64;
7566 case AArch64::SABALv4i16_v4i32:
7567 return AArch64::SABDLv4i16_v4i32;
7568 case AArch64::SABALv4i32_v2i64:
7569 return AArch64::SABDLv4i32_v2i64;
7570 case AArch64::SABALv8i16_v4i32:
7571 return AArch64::SABDLv8i16_v4i32;
7572 case AArch64::SABALv8i8_v8i16:
7573 return AArch64::SABDLv8i8_v8i16;
7574 case AArch64::SABAv16i8:
7575 return AArch64::SABDv16i8;
7576 case AArch64::SABAv2i32:
7577 return AArch64::SABAv2i32;
7578 case AArch64::SABAv4i16:
7579 return AArch64::SABDv4i16;
7580 case AArch64::SABAv4i32:
7581 return AArch64::SABDv4i32;
7582 case AArch64::SABAv8i16:
7583 return AArch64::SABDv8i16;
7584 case AArch64::SABAv8i8:
7585 return AArch64::SABDv8i8;
7586 }
7587}
7588
7589/// Floating-Point Support
7590
7591/// Find instructions that can be turned into madd.
7593 SmallVectorImpl<unsigned> &Patterns) {
7594
7595 if (!isCombineInstrCandidateFP(Root))
7596 return false;
7597
7598 MachineBasicBlock &MBB = *Root.getParent();
7599 bool Found = false;
7600
7601 auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
7602 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
7603 Patterns.push_back(Pattern);
7604 return true;
7605 }
7606 return false;
7607 };
7608
7610
7611 switch (Root.getOpcode()) {
7612 default:
7613 assert(false && "Unsupported FP instruction in combiner\n");
7614 break;
7615 case AArch64::FADDHrr:
7616 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7617 "FADDHrr does not have register operands");
7618
7619 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
7620 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
7621 break;
7622 case AArch64::FADDSrr:
7623 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7624 "FADDSrr does not have register operands");
7625
7626 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
7627 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
7628
7629 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
7630 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
7631 break;
7632 case AArch64::FADDDrr:
7633 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
7634 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
7635
7636 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
7637 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
7638 break;
7639 case AArch64::FADDv4f16:
7640 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
7641 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
7642
7643 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
7644 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
7645 break;
7646 case AArch64::FADDv8f16:
7647 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
7648 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
7649
7650 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
7651 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
7652 break;
7653 case AArch64::FADDv2f32:
7654 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
7655 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
7656
7657 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
7658 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
7659 break;
7660 case AArch64::FADDv2f64:
7661 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
7662 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
7663
7664 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
7665 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
7666 break;
7667 case AArch64::FADDv4f32:
7668 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
7669 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
7670
7671 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
7672 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
7673 break;
7674 case AArch64::FSUBHrr:
7675 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
7676 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
7677 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
7678 break;
7679 case AArch64::FSUBSrr:
7680 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
7681
7682 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
7683 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
7684
7685 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
7686 break;
7687 case AArch64::FSUBDrr:
7688 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
7689
7690 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
7691 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
7692
7693 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
7694 break;
7695 case AArch64::FSUBv4f16:
7696 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
7697 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
7698
7699 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
7700 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
7701 break;
7702 case AArch64::FSUBv8f16:
7703 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
7704 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
7705
7706 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
7707 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
7708 break;
7709 case AArch64::FSUBv2f32:
7710 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
7711 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
7712
7713 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
7714 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
7715 break;
7716 case AArch64::FSUBv2f64:
7717 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
7718 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
7719
7720 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
7721 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
7722 break;
7723 case AArch64::FSUBv4f32:
7724 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
7725 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
7726
7727 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
7728 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
7729 break;
7730 }
7731 return Found;
7732}
7733
7735 SmallVectorImpl<unsigned> &Patterns) {
7736 MachineBasicBlock &MBB = *Root.getParent();
7737 bool Found = false;
7738
7739 auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
7740 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7741 MachineOperand &MO = Root.getOperand(Operand);
7742 MachineInstr *MI = nullptr;
7743 if (MO.isReg() && MO.getReg().isVirtual())
7744 MI = MRI.getUniqueVRegDef(MO.getReg());
7745 // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
7746 if (MI && MI->getOpcode() == TargetOpcode::COPY &&
7747 MI->getOperand(1).getReg().isVirtual())
7748 MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
7749 if (MI && MI->getOpcode() == Opcode) {
7750 Patterns.push_back(Pattern);
7751 return true;
7752 }
7753 return false;
7754 };
7755
7757
7758 switch (Root.getOpcode()) {
7759 default:
7760 return false;
7761 case AArch64::FMULv2f32:
7762 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
7763 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
7764 break;
7765 case AArch64::FMULv2f64:
7766 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
7767 Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
7768 break;
7769 case AArch64::FMULv4f16:
7770 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
7771 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
7772 break;
7773 case AArch64::FMULv4f32:
7774 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
7775 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
7776 break;
7777 case AArch64::FMULv8f16:
7778 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
7779 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
7780 break;
7781 }
7782
7783 return Found;
7784}
7785
7787 SmallVectorImpl<unsigned> &Patterns) {
7788 unsigned Opc = Root.getOpcode();
7789 MachineBasicBlock &MBB = *Root.getParent();
7790 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7791
7792 auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
7793 MachineOperand &MO = Root.getOperand(1);
7794 MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());
7795 if (MI != nullptr && (MI->getOpcode() == Opcode) &&
7796 MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
7800 MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
7801 Patterns.push_back(Pattern);
7802 return true;
7803 }
7804 return false;
7805 };
7806
7807 switch (Opc) {
7808 default:
7809 break;
7810 case AArch64::FNEGDr:
7811 return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
7812 case AArch64::FNEGSr:
7813 return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
7814 }
7815
7816 return false;
7817}
7818
7819/// Return true when a code sequence can improve throughput. It
7820/// should be called only for instructions in loops.
7821/// \param Pattern - combiner pattern
// NOTE(review): this definition is truncated in the extracted text — the
// signature line (original line 7822) and the entire list of combiner
// pattern case labels (original lines 7826-7930) are missing; only the
// switch skeleton survives. Presumably this is
// AArch64InstrInfo::isThroughputPattern(unsigned Pattern) — restore the
// full body from upstream before relying on this text.
7823 switch (Pattern) {
7824 default:
7825 break;
7931 return true;
7932 } // end switch (Pattern)
7933 return false;
7934}
7935
7936/// Find other MI combine patterns.
7938 SmallVectorImpl<unsigned> &Patterns) {
7939 // A - (B + C) ==> (A - B) - C or (A - C) - B
7940 unsigned Opc = Root.getOpcode();
7941 MachineBasicBlock &MBB = *Root.getParent();
7942
7943 switch (Opc) {
7944 case AArch64::SUBWrr:
7945 case AArch64::SUBSWrr:
7946 case AArch64::SUBXrr:
7947 case AArch64::SUBSXrr:
7948 // Found candidate root.
7949 break;
7950 default:
7951 return false;
7952 }
7953
7955 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
7956 -1)
7957 return false;
7958
7959 if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
7960 canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
7961 canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
7962 canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
7965 return true;
7966 }
7967
7968 return false;
7969}
7970
7971/// Check if the given instruction forms a gather load pattern that can be
7972/// optimized for better Memory-Level Parallelism (MLP). This function
7973/// identifies chains of NEON lane load instructions that load data from
7974/// different memory addresses into individual lanes of a 128-bit vector
7975/// register, then attempts to split the pattern into parallel loads to break
7976/// the serial dependency between instructions.
7977///
7978/// Pattern Matched:
7979/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
7980/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
7981///
7982/// Transformed Into:
7983/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
7984/// to combine the results, enabling better memory-level parallelism.
7985///
7986/// Supported Element Types:
7987/// - 32-bit elements (LD1i32, 4 lanes total)
7988/// - 16-bit elements (LD1i16, 8 lanes total)
7989/// - 8-bit elements (LD1i8, 16 lanes total)
7991 SmallVectorImpl<unsigned> &Patterns,
7992 unsigned LoadLaneOpCode, unsigned NumLanes) {
7993 const MachineFunction *MF = Root.getMF();
7994
7995 // Early exit if optimizing for size.
7996 if (MF->getFunction().hasMinSize())
7997 return false;
7998
7999 const MachineRegisterInfo &MRI = MF->getRegInfo();
8001
8002 // The root of the pattern must load into the last lane of the vector.
8003 if (Root.getOperand(2).getImm() != NumLanes - 1)
8004 return false;
8005
8006 // Check that we have load into all lanes except lane 0.
8007 // For each load we also want to check that:
8008 // 1. It has a single non-debug use (since we will be replacing the virtual
8009 // register)
8010 // 2. That the addressing mode only uses a single pointer operand
8011 auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
8012 auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
8013 SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
8015 while (!RemainingLanes.empty() && CurrInstr &&
8016 CurrInstr->getOpcode() == LoadLaneOpCode &&
8017 MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
8018 CurrInstr->getNumOperands() == 4) {
8019 RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
8020 LoadInstrs.push_back(CurrInstr);
8021 CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
8022 }
8023
8024 // Check that we have found a match for lanes N-1.. 1.
8025 if (!RemainingLanes.empty())
8026 return false;
8027
8028 // Match the SUBREG_TO_REG sequence.
8029 if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
8030 return false;
8031
8032 // Verify that the subreg to reg loads an integer into the first lane.
8033 auto Lane0LoadReg = CurrInstr->getOperand(1).getReg();
8034 unsigned SingleLaneSizeInBits = 128 / NumLanes;
8035 if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
8036 return false;
8037
8038 // Verify that it also has a single non debug use.
8039 if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
8040 return false;
8041
8042 LoadInstrs.push_back(MRI.getUniqueVRegDef(Lane0LoadReg));
8043
8044 // If there is any chance of aliasing, do not apply the pattern.
8045 // Walk backward through the MBB starting from Root.
8046 // Exit early if we've encountered all load instructions or hit the search
8047 // limit.
8048 auto MBBItr = Root.getIterator();
8049 unsigned RemainingSteps = GatherOptSearchLimit;
8050 SmallPtrSet<const MachineInstr *, 16> RemainingLoadInstrs;
8051 RemainingLoadInstrs.insert(LoadInstrs.begin(), LoadInstrs.end());
8052 const MachineBasicBlock *MBB = Root.getParent();
8053
8054 for (; MBBItr != MBB->begin() && RemainingSteps > 0 &&
8055 !RemainingLoadInstrs.empty();
8056 --MBBItr, --RemainingSteps) {
8057 const MachineInstr &CurrInstr = *MBBItr;
8058
8059 // Remove this instruction from remaining loads if it's one we're tracking.
8060 RemainingLoadInstrs.erase(&CurrInstr);
8061
8062 // Check for potential aliasing with any of the load instructions to
8063 // optimize.
8064 if (CurrInstr.isLoadFoldBarrier())
8065 return false;
8066 }
8067
8068 // If we hit the search limit without finding all load instructions,
8069 // don't match the pattern.
8070 if (RemainingSteps == 0 && !RemainingLoadInstrs.empty())
8071 return false;
8072
8073 switch (NumLanes) {
8074 case 4:
8076 break;
8077 case 8:
8079 break;
8080 case 16:
8082 break;
8083 default:
8084 llvm_unreachable("Got bad number of lanes for gather pattern.");
8085 }
8086
8087 return true;
8088}
8089
8090/// Search for patterns of LD instructions we can optimize.
8092 SmallVectorImpl<unsigned> &Patterns) {
8093
8094 // The pattern searches for loads into single lanes.
8095 switch (Root.getOpcode()) {
8096 case AArch64::LD1i32:
8097 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 4);
8098 case AArch64::LD1i16:
8099 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 8);
8100 case AArch64::LD1i8:
8101 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 16);
8102 default:
8103 return false;
8104 }
8105}
8106
8107/// Generate optimized instruction sequence for gather load patterns to improve
8108/// Memory-Level Parallelism (MLP). This function transforms a chain of
8109/// sequential NEON lane loads into parallel vector loads that can execute
8110/// concurrently.
8111static void
8115 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8116 unsigned Pattern, unsigned NumLanes) {
8117 MachineFunction &MF = *Root.getParent()->getParent();
8120
8121 // Gather the initial load instructions to build the pattern.
8122 SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
8123 MachineInstr *CurrInstr = &Root;
8124 for (unsigned i = 0; i < NumLanes - 1; ++i) {
8125 LoadToLaneInstrs.push_back(CurrInstr);
8126 CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
8127 }
8128
8129 // Sort the load instructions according to the lane.
8130 llvm::sort(LoadToLaneInstrs,
8131 [](const MachineInstr *A, const MachineInstr *B) {
8132 return A->getOperand(2).getImm() > B->getOperand(2).getImm();
8133 });
8134
8135 MachineInstr *SubregToReg = CurrInstr;
8136 LoadToLaneInstrs.push_back(
8137 MRI.getUniqueVRegDef(SubregToReg->getOperand(1).getReg()));
8138 auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
8139
8140 const TargetRegisterClass *FPR128RegClass =
8141 MRI.getRegClass(Root.getOperand(0).getReg());
8142
8143 // Helper lambda to create a LD1 instruction.
8144 auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr,
8145 Register SrcRegister, unsigned Lane,
8146 Register OffsetRegister,
8147 bool OffsetRegisterKillState) {
8148 auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
8149 MachineInstrBuilder LoadIndexIntoRegister =
8150 BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
8151 NewRegister)
8152 .addReg(SrcRegister)
8153 .addImm(Lane)
8154 .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState));
8155 InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
8156 InsInstrs.push_back(LoadIndexIntoRegister);
8157 return NewRegister;
8158 };
8159
8160 // Helper to create load instruction based on the NumLanes in the NEON
8161 // register we are rewriting.
8162 auto CreateLDRInstruction = [&](unsigned NumLanes, Register DestReg,
8163 Register OffsetReg,
8164 bool KillState) -> MachineInstrBuilder {
8165 unsigned Opcode;
8166 switch (NumLanes) {
8167 case 4:
8168 Opcode = AArch64::LDRSui;
8169 break;
8170 case 8:
8171 Opcode = AArch64::LDRHui;
8172 break;
8173 case 16:
8174 Opcode = AArch64::LDRBui;
8175 break;
8176 default:
8178 "Got unsupported number of lanes in machine-combiner gather pattern");
8179 }
8180 // Immediate offset load
8181 return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
8182 .addReg(OffsetReg)
8183 .addImm(0);
8184 };
8185
8186 // Load the remaining lanes into register 0.
8187 auto LanesToLoadToReg0 =
8188 llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
8189 LoadToLaneInstrsAscending.begin() + NumLanes / 2);
8190 Register PrevReg = SubregToReg->getOperand(0).getReg();
8191 for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
8192 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
8193 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8194 OffsetRegOperand.getReg(),
8195 OffsetRegOperand.isKill());
8196 DelInstrs.push_back(LoadInstr);
8197 }
8198 Register LastLoadReg0 = PrevReg;
8199
8200 // First load into register 1. Perform an integer load to zero out the upper
8201 // lanes in a single instruction.
8202 MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin();
8203 MachineInstr *OriginalSplitLoad =
8204 *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
8205 Register DestRegForMiddleIndex = MRI.createVirtualRegister(
8206 MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
8207
8208 const MachineOperand &OriginalSplitToLoadOffsetOperand =
8209 OriginalSplitLoad->getOperand(3);
8210 MachineInstrBuilder MiddleIndexLoadInstr =
8211 CreateLDRInstruction(NumLanes, DestRegForMiddleIndex,
8212 OriginalSplitToLoadOffsetOperand.getReg(),
8213 OriginalSplitToLoadOffsetOperand.isKill());
8214
8215 InstrIdxForVirtReg.insert(
8216 std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
8217 InsInstrs.push_back(MiddleIndexLoadInstr);
8218 DelInstrs.push_back(OriginalSplitLoad);
8219
8220 // Subreg To Reg instruction for register 1.
8221 Register DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
8222 unsigned SubregType;
8223 switch (NumLanes) {
8224 case 4:
8225 SubregType = AArch64::ssub;
8226 break;
8227 case 8:
8228 SubregType = AArch64::hsub;
8229 break;
8230 case 16:
8231 SubregType = AArch64::bsub;
8232 break;
8233 default:
8235 "Got invalid NumLanes for machine-combiner gather pattern");
8236 }
8237
8238 auto SubRegToRegInstr =
8239 BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
8240 DestRegForSubregToReg)
8241 .addReg(DestRegForMiddleIndex, getKillRegState(true))
8242 .addImm(SubregType);
8243 InstrIdxForVirtReg.insert(
8244 std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
8245 InsInstrs.push_back(SubRegToRegInstr);
8246
8247 // Load remaining lanes into register 1.
8248 auto LanesToLoadToReg1 =
8249 llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
8250 LoadToLaneInstrsAscending.end());
8251 PrevReg = SubRegToRegInstr->getOperand(0).getReg();
8252 for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
8253 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
8254 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8255 OffsetRegOperand.getReg(),
8256 OffsetRegOperand.isKill());
8257
8258 // Do not add the last reg to DelInstrs - it will be removed later.
8259 if (Index == NumLanes / 2 - 2) {
8260 break;
8261 }
8262 DelInstrs.push_back(LoadInstr);
8263 }
8264 Register LastLoadReg1 = PrevReg;
8265
8266 // Create the final zip instruction to combine the results.
8267 MachineInstrBuilder ZipInstr =
8268 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
8269 Root.getOperand(0).getReg())
8270 .addReg(LastLoadReg0)
8271 .addReg(LastLoadReg1);
8272 InsInstrs.push_back(ZipInstr);
8273}
8274
8288
8289/// Return true when there is potentially a faster code sequence for an
8290/// instruction chain ending in \p Root. All potential patterns are listed in
8291/// the \p Pattern vector. Pattern should be sorted in priority order since the
8292/// pattern evaluator stops checking as soon as it finds a faster sequence.
8293
8294bool AArch64InstrInfo::getMachineCombinerPatterns(
8295 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
8296 bool DoRegPressureReduce) const {
8297 // Integer patterns
8298 if (getMaddPatterns(Root, Patterns))
8299 return true;
8300 // Floating point patterns
8301 if (getFMULPatterns(Root, Patterns))
8302 return true;
8303 if (getFMAPatterns(Root, Patterns))
8304 return true;
8305 if (getFNEGPatterns(Root, Patterns))
8306 return true;
8307
8308 // Other patterns
8309 if (getMiscPatterns(Root, Patterns))
8310 return true;
8311
8312 // Load patterns
8313 if (getLoadPatterns(Root, Patterns))
8314 return true;
8315
8316 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
8317 DoRegPressureReduce);
8318}
8319
8321/// genFusedMultiply - Generate fused multiply instructions.
8322/// This function supports both integer and floating point instructions.
8323/// A typical example:
8324/// F|MUL I=A,B,0
8325/// F|ADD R,I,C
8326/// ==> F|MADD R,A,B,C
8327/// \param MF Containing MachineFunction
8328/// \param MRI Register information
8329/// \param TII Target information
8330/// \param Root is the F|ADD instruction
8331/// \param [out] InsInstrs is a vector of machine instructions and will
8332/// contain the generated madd instruction
8333/// \param IdxMulOpd is index of operand in Root that is the result of
8334/// the F|MUL. In the example above IdxMulOpd is 1.
8335/// \param MaddOpc the opcode of the f|madd instruction
8336/// \param RC Register class of operands
8337/// \param kind Kind of fma instruction (addressing mode) to be generated
8338/// \param ReplacedAddend is the result register from the instruction
8339/// replacing the non-combined operand, if any.
8340static MachineInstr *
8342 const TargetInstrInfo *TII, MachineInstr &Root,
8343 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
8344 unsigned MaddOpc, const TargetRegisterClass *RC,
8346 const Register *ReplacedAddend = nullptr) {
8347 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8348
8349 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
8350 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
8351 Register ResultReg = Root.getOperand(0).getReg();
8352 Register SrcReg0 = MUL->getOperand(1).getReg();
8353 bool Src0IsKill = MUL->getOperand(1).isKill();
8354 Register SrcReg1 = MUL->getOperand(2).getReg();
8355 bool Src1IsKill = MUL->getOperand(2).isKill();
8356
8357 Register SrcReg2;
8358 bool Src2IsKill;
8359 if (ReplacedAddend) {
8360 // If we just generated a new addend, we must be its only use.
8361 SrcReg2 = *ReplacedAddend;
8362 Src2IsKill = true;
8363 } else {
8364 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
8365 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
8366 }
8367
8368 if (ResultReg.isVirtual())
8369 MRI.constrainRegClass(ResultReg, RC);
8370 if (SrcReg0.isVirtual())
8371 MRI.constrainRegClass(SrcReg0, RC);
8372 if (SrcReg1.isVirtual())
8373 MRI.constrainRegClass(SrcReg1, RC);
8374 if (SrcReg2.isVirtual())
8375 MRI.constrainRegClass(SrcReg2, RC);
8376
8378 if (kind == FMAInstKind::Default)
8379 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8380 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8381 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8382 .addReg(SrcReg2, getKillRegState(Src2IsKill));
8383 else if (kind == FMAInstKind::Indexed)
8384 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8385 .addReg(SrcReg2, getKillRegState(Src2IsKill))
8386 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8387 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8388 .addImm(MUL->getOperand(3).getImm());
8389 else if (kind == FMAInstKind::Accumulator)
8390 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8391 .addReg(SrcReg2, getKillRegState(Src2IsKill))
8392 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8393 .addReg(SrcReg1, getKillRegState(Src1IsKill));
8394 else
8395 assert(false && "Invalid FMA instruction kind \n");
8396 // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL)
8397 InsInstrs.push_back(MIB);
8398 return MUL;
8399}
8400
8401static MachineInstr *
8403 const TargetInstrInfo *TII, MachineInstr &Root,
8405 MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
8406
8407 unsigned Opc = 0;
8408 const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
8409 if (AArch64::FPR32RegClass.hasSubClassEq(RC))
8410 Opc = AArch64::FNMADDSrrr;
8411 else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
8412 Opc = AArch64::FNMADDDrrr;
8413 else
8414 return nullptr;
8415
8416 Register ResultReg = Root.getOperand(0).getReg();
8417 Register SrcReg0 = MAD->getOperand(1).getReg();
8418 Register SrcReg1 = MAD->getOperand(2).getReg();
8419 Register SrcReg2 = MAD->getOperand(3).getReg();
8420 bool Src0IsKill = MAD->getOperand(1).isKill();
8421 bool Src1IsKill = MAD->getOperand(2).isKill();
8422 bool Src2IsKill = MAD->getOperand(3).isKill();
8423 if (ResultReg.isVirtual())
8424 MRI.constrainRegClass(ResultReg, RC);
8425 if (SrcReg0.isVirtual())
8426 MRI.constrainRegClass(SrcReg0, RC);
8427 if (SrcReg1.isVirtual())
8428 MRI.constrainRegClass(SrcReg1, RC);
8429 if (SrcReg2.isVirtual())
8430 MRI.constrainRegClass(SrcReg2, RC);
8431
8433 BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
8434 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8435 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8436 .addReg(SrcReg2, getKillRegState(Src2IsKill));
8437 InsInstrs.push_back(MIB);
8438
8439 return MAD;
8440}
8441
8442/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
8443static MachineInstr *
8446 unsigned IdxDupOp, unsigned MulOpc,
8448 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
8449 "Invalid index of FMUL operand");
8450
8451 MachineFunction &MF = *Root.getMF();
8453
8454 MachineInstr *Dup =
8455 MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
8456
8457 if (Dup->getOpcode() == TargetOpcode::COPY)
8458 Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());
8459
8460 Register DupSrcReg = Dup->getOperand(1).getReg();
8461 MRI.clearKillFlags(DupSrcReg);
8462 MRI.constrainRegClass(DupSrcReg, RC);
8463
8464 unsigned DupSrcLane = Dup->getOperand(2).getImm();
8465
8466 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
8467 MachineOperand &MulOp = Root.getOperand(IdxMulOp);
8468
8469 Register ResultReg = Root.getOperand(0).getReg();
8470
8472 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
8473 .add(MulOp)
8474 .addReg(DupSrcReg)
8475 .addImm(DupSrcLane);
8476
8477 InsInstrs.push_back(MIB);
8478 return &Root;
8479}
8480
8481/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
8482/// instructions.
8483///
8484/// \see genFusedMultiply
8488 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8489 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8491}
8492
8493/// genNeg - Helper to generate an intermediate negation of the second operand
8494/// of Root
8496 const TargetInstrInfo *TII, MachineInstr &Root,
8498 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8499 unsigned MnegOpc, const TargetRegisterClass *RC) {
8500 Register NewVR = MRI.createVirtualRegister(RC);
8502 BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
8503 .add(Root.getOperand(2));
8504 InsInstrs.push_back(MIB);
8505
8506 assert(InstrIdxForVirtReg.empty());
8507 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8508
8509 return NewVR;
8510}
8511
8512/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
8513/// instructions with an additional negation of the accumulator
8517 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8518 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8519 assert(IdxMulOpd == 1);
8520
8521 Register NewVR =
8522 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8523 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8524 FMAInstKind::Accumulator, &NewVR);
8525}
8526
8527/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
8528/// instructions.
8529///
8530/// \see genFusedMultiply
8534 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8535 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8537}
8538
8539/// genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate
8540/// instructions with an additional negation of the accumulator
8544 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8545 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8546 assert(IdxMulOpd == 1);
8547
8548 Register NewVR =
8549 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8550
8551 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8552 FMAInstKind::Indexed, &NewVR);
8553}
8554
8555/// genMaddR - Generate madd instruction and combine mul and add using
8556/// an extra virtual register
8557/// Example - an ADD intermediate needs to be stored in a register:
8558/// MUL I=A,B,0
8559/// ADD R,I,Imm
8560/// ==> ORR V, ZR, Imm
8561/// ==> MADD R,A,B,V
8562/// \param MF Containing MachineFunction
8563/// \param MRI Register information
8564/// \param TII Target information
8565/// \param Root is the ADD instruction
8566/// \param [out] InsInstrs is a vector of machine instructions and will
8567/// contain the generated madd instruction
8568/// \param IdxMulOpd is index of operand in Root that is the result of
8569/// the MUL. In the example above IdxMulOpd is 1.
8570/// \param MaddOpc the opcode of the madd instruction
8571/// \param VR is a virtual register that holds the value of an ADD operand
8572/// (V in the example above).
8573/// \param RC Register class of operands
8575 const TargetInstrInfo *TII, MachineInstr &Root,
8577 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
8578 const TargetRegisterClass *RC) {
8579 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8580
8581 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
8582 Register ResultReg = Root.getOperand(0).getReg();
8583 Register SrcReg0 = MUL->getOperand(1).getReg();
8584 bool Src0IsKill = MUL->getOperand(1).isKill();
8585 Register SrcReg1 = MUL->getOperand(2).getReg();
8586 bool Src1IsKill = MUL->getOperand(2).isKill();
8587
8588 if (ResultReg.isVirtual())
8589 MRI.constrainRegClass(ResultReg, RC);
8590 if (SrcReg0.isVirtual())
8591 MRI.constrainRegClass(SrcReg0, RC);
8592 if (SrcReg1.isVirtual())
8593 MRI.constrainRegClass(SrcReg1, RC);
8595 MRI.constrainRegClass(VR, RC);
8596
8598 BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8599 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8600 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8601 .addReg(VR);
8602 // Insert the MADD
8603 InsInstrs.push_back(MIB);
8604 return MUL;
8605}
8606
8607/// Do the following transformation
8608/// A - (B + C) ==> (A - B) - C
8609/// A - (B + C) ==> (A - C) - B
8611 const TargetInstrInfo *TII, MachineInstr &Root,
8614 unsigned IdxOpd1,
8615 DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
8616 assert(IdxOpd1 == 1 || IdxOpd1 == 2);
8617 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
8618 MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
8619
8620 Register ResultReg = Root.getOperand(0).getReg();
8621 Register RegA = Root.getOperand(1).getReg();
8622 bool RegAIsKill = Root.getOperand(1).isKill();
8623 Register RegB = AddMI->getOperand(IdxOpd1).getReg();
8624 bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
8625 Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
8626 bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
8627 Register NewVR =
8628 MRI.createVirtualRegister(MRI.getRegClass(Root.getOperand(2).getReg()));
8629
8630 unsigned Opcode = Root.getOpcode();
8631 if (Opcode == AArch64::SUBSWrr)
8632 Opcode = AArch64::SUBWrr;
8633 else if (Opcode == AArch64::SUBSXrr)
8634 Opcode = AArch64::SUBXrr;
8635 else
8636 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
8637 "Unexpected instruction opcode.");
8638
8639 uint32_t Flags = Root.mergeFlagsWith(*AddMI);
8640 Flags &= ~MachineInstr::NoSWrap;
8641 Flags &= ~MachineInstr::NoUWrap;
8642
8643 MachineInstrBuilder MIB1 =
8644 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
8645 .addReg(RegA, getKillRegState(RegAIsKill))
8646 .addReg(RegB, getKillRegState(RegBIsKill))
8647 .setMIFlags(Flags);
8648 MachineInstrBuilder MIB2 =
8649 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
8650 .addReg(NewVR, getKillRegState(true))
8651 .addReg(RegC, getKillRegState(RegCIsKill))
8652 .setMIFlags(Flags);
8653
8654 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8655 InsInstrs.push_back(MIB1);
8656 InsInstrs.push_back(MIB2);
8657 DelInstrs.push_back(AddMI);
8658 DelInstrs.push_back(&Root);
8659}
8660
8661unsigned AArch64InstrInfo::getReduceOpcodeForAccumulator(
8662 unsigned int AccumulatorOpCode) const {
8663 switch (AccumulatorOpCode) {
8664 case AArch64::UABALB_ZZZ_D:
8665 case AArch64::SABALB_ZZZ_D:
8666 case AArch64::UABALT_ZZZ_D:
8667 case AArch64::SABALT_ZZZ_D:
8668 return AArch64::ADD_ZZZ_D;
8669 case AArch64::UABALB_ZZZ_H:
8670 case AArch64::SABALB_ZZZ_H:
8671 case AArch64::UABALT_ZZZ_H:
8672 case AArch64::SABALT_ZZZ_H:
8673 return AArch64::ADD_ZZZ_H;
8674 case AArch64::UABALB_ZZZ_S:
8675 case AArch64::SABALB_ZZZ_S:
8676 case AArch64::UABALT_ZZZ_S:
8677 case AArch64::SABALT_ZZZ_S:
8678 return AArch64::ADD_ZZZ_S;
8679 case AArch64::UABALv16i8_v8i16:
8680 case AArch64::SABALv8i8_v8i16:
8681 case AArch64::SABAv8i16:
8682 case AArch64::UABAv8i16:
8683 return AArch64::ADDv8i16;
8684 case AArch64::SABALv2i32_v2i64:
8685 case AArch64::UABALv2i32_v2i64:
8686 case AArch64::SABALv4i32_v2i64:
8687 return AArch64::ADDv2i64;
8688 case AArch64::UABALv4i16_v4i32:
8689 case AArch64::SABALv4i16_v4i32:
8690 case AArch64::SABALv8i16_v4i32:
8691 case AArch64::SABAv4i32:
8692 case AArch64::UABAv4i32:
8693 return AArch64::ADDv4i32;
8694 case AArch64::UABALv4i32_v2i64:
8695 return AArch64::ADDv2i64;
8696 case AArch64::UABALv8i16_v4i32:
8697 return AArch64::ADDv4i32;
8698 case AArch64::UABALv8i8_v8i16:
8699 case AArch64::SABALv16i8_v8i16:
8700 return AArch64::ADDv8i16;
8701 case AArch64::UABAv16i8:
8702 case AArch64::SABAv16i8:
8703 return AArch64::ADDv16i8;
8704 case AArch64::UABAv4i16:
8705 case AArch64::SABAv4i16:
8706 return AArch64::ADDv4i16;
8707 case AArch64::UABAv2i32:
8708 case AArch64::SABAv2i32:
8709 return AArch64::ADDv2i32;
8710 case AArch64::UABAv8i8:
8711 case AArch64::SABAv8i8:
8712 return AArch64::ADDv8i8;
8713 default:
8714 llvm_unreachable("Unknown accumulator opcode");
8715 }
8716}
8717
8718/// When getMachineCombinerPatterns() finds potential patterns,
8719/// this function generates the instructions that could replace the
8720/// original code sequence
8721void AArch64InstrInfo::genAlternativeCodeSequence(
8722 MachineInstr &Root, unsigned Pattern,
8725 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
8726 MachineBasicBlock &MBB = *Root.getParent();
8727 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8728 MachineFunction &MF = *MBB.getParent();
8729 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8730
8731 MachineInstr *MUL = nullptr;
8732 const TargetRegisterClass *RC;
8733 unsigned Opc;
8734 switch (Pattern) {
8735 default:
8736 // Reassociate instructions.
8737 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
8738 DelInstrs, InstrIdxForVirtReg);
8739 return;
8741 // A - (B + C)
8742 // ==> (A - B) - C
8743 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
8744 InstrIdxForVirtReg);
8745 return;
8747 // A - (B + C)
8748 // ==> (A - C) - B
8749 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
8750 InstrIdxForVirtReg);
8751 return;
8754 // MUL I=A,B,0
8755 // ADD R,I,C
8756 // ==> MADD R,A,B,C
8757 // --- Create(MADD);
8759 Opc = AArch64::MADDWrrr;
8760 RC = &AArch64::GPR32RegClass;
8761 } else {
8762 Opc = AArch64::MADDXrrr;
8763 RC = &AArch64::GPR64RegClass;
8764 }
8765 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8766 break;
8769 // MUL I=A,B,0
8770 // ADD R,C,I
8771 // ==> MADD R,A,B,C
8772 // --- Create(MADD);
8774 Opc = AArch64::MADDWrrr;
8775 RC = &AArch64::GPR32RegClass;
8776 } else {
8777 Opc = AArch64::MADDXrrr;
8778 RC = &AArch64::GPR64RegClass;
8779 }
8780 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8781 break;
8786 // MUL I=A,B,0
8787 // ADD/SUB R,I,Imm
8788 // ==> MOV V, Imm/-Imm
8789 // ==> MADD R,A,B,V
8790 // --- Create(MADD);
8791 const TargetRegisterClass *RC;
8792 unsigned BitSize, MovImm;
8795 MovImm = AArch64::MOVi32imm;
8796 RC = &AArch64::GPR32spRegClass;
8797 BitSize = 32;
8798 Opc = AArch64::MADDWrrr;
8799 RC = &AArch64::GPR32RegClass;
8800 } else {
8801 MovImm = AArch64::MOVi64imm;
8802 RC = &AArch64::GPR64spRegClass;
8803 BitSize = 64;
8804 Opc = AArch64::MADDXrrr;
8805 RC = &AArch64::GPR64RegClass;
8806 }
8807 Register NewVR = MRI.createVirtualRegister(RC);
8808 uint64_t Imm = Root.getOperand(2).getImm();
8809
8810 if (Root.getOperand(3).isImm()) {
8811 unsigned Val = Root.getOperand(3).getImm();
8812 Imm = Imm << Val;
8813 }
8814 bool IsSub = Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1 ||
8816 uint64_t UImm = SignExtend64(IsSub ? -Imm : Imm, BitSize);
8817 // Check that the immediate can be composed via a single instruction.
8819 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
8820 if (Insn.size() != 1)
8821 return;
8822 MachineInstrBuilder MIB1 =
8823 BuildMI(MF, MIMetadata(Root), TII->get(MovImm), NewVR)
8824 .addImm(IsSub ? -Imm : Imm);
8825 InsInstrs.push_back(MIB1);
8826 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8827 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
8828 break;
8829 }
8832 // MUL I=A,B,0
8833 // SUB R,I, C
8834 // ==> SUB V, 0, C
8835 // ==> MADD R,A,B,V // = -C + A*B
8836 // --- Create(MADD);
8837 const TargetRegisterClass *SubRC;
8838 unsigned SubOpc, ZeroReg;
8840 SubOpc = AArch64::SUBWrr;
8841 SubRC = &AArch64::GPR32spRegClass;
8842 ZeroReg = AArch64::WZR;
8843 Opc = AArch64::MADDWrrr;
8844 RC = &AArch64::GPR32RegClass;
8845 } else {
8846 SubOpc = AArch64::SUBXrr;
8847 SubRC = &AArch64::GPR64spRegClass;
8848 ZeroReg = AArch64::XZR;
8849 Opc = AArch64::MADDXrrr;
8850 RC = &AArch64::GPR64RegClass;
8851 }
8852 Register NewVR = MRI.createVirtualRegister(SubRC);
8853 // SUB NewVR, 0, C
8854 MachineInstrBuilder MIB1 =
8855 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
8856 .addReg(ZeroReg)
8857 .add(Root.getOperand(2));
8858 InsInstrs.push_back(MIB1);
8859 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8860 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
8861 break;
8862 }
8865 // MUL I=A,B,0
8866 // SUB R,C,I
8867 // ==> MSUB R,A,B,C (computes C - A*B)
8868 // --- Create(MSUB);
8870 Opc = AArch64::MSUBWrrr;
8871 RC = &AArch64::GPR32RegClass;
8872 } else {
8873 Opc = AArch64::MSUBXrrr;
8874 RC = &AArch64::GPR64RegClass;
8875 }
8876 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8877 break;
8879 Opc = AArch64::MLAv8i8;
8880 RC = &AArch64::FPR64RegClass;
8881 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8882 break;
8884 Opc = AArch64::MLAv8i8;
8885 RC = &AArch64::FPR64RegClass;
8886 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8887 break;
8889 Opc = AArch64::MLAv16i8;
8890 RC = &AArch64::FPR128RegClass;
8891 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8892 break;
8894 Opc = AArch64::MLAv16i8;
8895 RC = &AArch64::FPR128RegClass;
8896 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8897 break;
8899 Opc = AArch64::MLAv4i16;
8900 RC = &AArch64::FPR64RegClass;
8901 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8902 break;
8904 Opc = AArch64::MLAv4i16;
8905 RC = &AArch64::FPR64RegClass;
8906 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8907 break;
8909 Opc = AArch64::MLAv8i16;
8910 RC = &AArch64::FPR128RegClass;
8911 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8912 break;
8914 Opc = AArch64::MLAv8i16;
8915 RC = &AArch64::FPR128RegClass;
8916 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8917 break;
8919 Opc = AArch64::MLAv2i32;
8920 RC = &AArch64::FPR64RegClass;
8921 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8922 break;
8924 Opc = AArch64::MLAv2i32;
8925 RC = &AArch64::FPR64RegClass;
8926 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8927 break;
8929 Opc = AArch64::MLAv4i32;
8930 RC = &AArch64::FPR128RegClass;
8931 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8932 break;
8934 Opc = AArch64::MLAv4i32;
8935 RC = &AArch64::FPR128RegClass;
8936 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8937 break;
8938
8940 Opc = AArch64::MLAv8i8;
8941 RC = &AArch64::FPR64RegClass;
8942 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8943 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
8944 RC);
8945 break;
8947 Opc = AArch64::MLSv8i8;
8948 RC = &AArch64::FPR64RegClass;
8949 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8950 break;
8952 Opc = AArch64::MLAv16i8;
8953 RC = &AArch64::FPR128RegClass;
8954 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8955 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
8956 RC);
8957 break;
8959 Opc = AArch64::MLSv16i8;
8960 RC = &AArch64::FPR128RegClass;
8961 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8962 break;
8964 Opc = AArch64::MLAv4i16;
8965 RC = &AArch64::FPR64RegClass;
8966 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8967 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
8968 RC);
8969 break;
8971 Opc = AArch64::MLSv4i16;
8972 RC = &AArch64::FPR64RegClass;
8973 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8974 break;
8976 Opc = AArch64::MLAv8i16;
8977 RC = &AArch64::FPR128RegClass;
8978 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8979 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
8980 RC);
8981 break;
8983 Opc = AArch64::MLSv8i16;
8984 RC = &AArch64::FPR128RegClass;
8985 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8986 break;
8988 Opc = AArch64::MLAv2i32;
8989 RC = &AArch64::FPR64RegClass;
8990 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8991 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
8992 RC);
8993 break;
8995 Opc = AArch64::MLSv2i32;
8996 RC = &AArch64::FPR64RegClass;
8997 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8998 break;
9000 Opc = AArch64::MLAv4i32;
9001 RC = &AArch64::FPR128RegClass;
9002 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9003 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
9004 RC);
9005 break;
9007 Opc = AArch64::MLSv4i32;
9008 RC = &AArch64::FPR128RegClass;
9009 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9010 break;
9011
9013 Opc = AArch64::MLAv4i16_indexed;
9014 RC = &AArch64::FPR64RegClass;
9015 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9016 break;
9018 Opc = AArch64::MLAv4i16_indexed;
9019 RC = &AArch64::FPR64RegClass;
9020 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9021 break;
9023 Opc = AArch64::MLAv8i16_indexed;
9024 RC = &AArch64::FPR128RegClass;
9025 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9026 break;
9028 Opc = AArch64::MLAv8i16_indexed;
9029 RC = &AArch64::FPR128RegClass;
9030 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9031 break;
9033 Opc = AArch64::MLAv2i32_indexed;
9034 RC = &AArch64::FPR64RegClass;
9035 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9036 break;
9038 Opc = AArch64::MLAv2i32_indexed;
9039 RC = &AArch64::FPR64RegClass;
9040 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9041 break;
9043 Opc = AArch64::MLAv4i32_indexed;
9044 RC = &AArch64::FPR128RegClass;
9045 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9046 break;
9048 Opc = AArch64::MLAv4i32_indexed;
9049 RC = &AArch64::FPR128RegClass;
9050 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9051 break;
9052
9054 Opc = AArch64::MLAv4i16_indexed;
9055 RC = &AArch64::FPR64RegClass;
9056 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9057 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
9058 RC);
9059 break;
9061 Opc = AArch64::MLSv4i16_indexed;
9062 RC = &AArch64::FPR64RegClass;
9063 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9064 break;
9066 Opc = AArch64::MLAv8i16_indexed;
9067 RC = &AArch64::FPR128RegClass;
9068 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9069 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
9070 RC);
9071 break;
9073 Opc = AArch64::MLSv8i16_indexed;
9074 RC = &AArch64::FPR128RegClass;
9075 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9076 break;
9078 Opc = AArch64::MLAv2i32_indexed;
9079 RC = &AArch64::FPR64RegClass;
9080 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9081 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
9082 RC);
9083 break;
9085 Opc = AArch64::MLSv2i32_indexed;
9086 RC = &AArch64::FPR64RegClass;
9087 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9088 break;
9090 Opc = AArch64::MLAv4i32_indexed;
9091 RC = &AArch64::FPR128RegClass;
9092 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9093 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
9094 RC);
9095 break;
9097 Opc = AArch64::MLSv4i32_indexed;
9098 RC = &AArch64::FPR128RegClass;
9099 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9100 break;
9101
9102 // Floating Point Support
9104 Opc = AArch64::FMADDHrrr;
9105 RC = &AArch64::FPR16RegClass;
9106 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9107 break;
9109 Opc = AArch64::FMADDSrrr;
9110 RC = &AArch64::FPR32RegClass;
9111 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9112 break;
9114 Opc = AArch64::FMADDDrrr;
9115 RC = &AArch64::FPR64RegClass;
9116 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9117 break;
9118
9120 Opc = AArch64::FMADDHrrr;
9121 RC = &AArch64::FPR16RegClass;
9122 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9123 break;
9125 Opc = AArch64::FMADDSrrr;
9126 RC = &AArch64::FPR32RegClass;
9127 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9128 break;
9130 Opc = AArch64::FMADDDrrr;
9131 RC = &AArch64::FPR64RegClass;
9132 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9133 break;
9134
9136 Opc = AArch64::FMLAv1i32_indexed;
9137 RC = &AArch64::FPR32RegClass;
9138 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9140 break;
9142 Opc = AArch64::FMLAv1i32_indexed;
9143 RC = &AArch64::FPR32RegClass;
9144 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9146 break;
9147
9149 Opc = AArch64::FMLAv1i64_indexed;
9150 RC = &AArch64::FPR64RegClass;
9151 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9153 break;
9155 Opc = AArch64::FMLAv1i64_indexed;
9156 RC = &AArch64::FPR64RegClass;
9157 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9159 break;
9160
9162 RC = &AArch64::FPR64RegClass;
9163 Opc = AArch64::FMLAv4i16_indexed;
9164 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9166 break;
9168 RC = &AArch64::FPR64RegClass;
9169 Opc = AArch64::FMLAv4f16;
9170 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9172 break;
9174 RC = &AArch64::FPR64RegClass;
9175 Opc = AArch64::FMLAv4i16_indexed;
9176 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9178 break;
9180 RC = &AArch64::FPR64RegClass;
9181 Opc = AArch64::FMLAv4f16;
9182 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9184 break;
9185
9188 RC = &AArch64::FPR64RegClass;
9190 Opc = AArch64::FMLAv2i32_indexed;
9191 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9193 } else {
9194 Opc = AArch64::FMLAv2f32;
9195 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9197 }
9198 break;
9201 RC = &AArch64::FPR64RegClass;
9203 Opc = AArch64::FMLAv2i32_indexed;
9204 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9206 } else {
9207 Opc = AArch64::FMLAv2f32;
9208 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9210 }
9211 break;
9212
9214 RC = &AArch64::FPR128RegClass;
9215 Opc = AArch64::FMLAv8i16_indexed;
9216 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9218 break;
9220 RC = &AArch64::FPR128RegClass;
9221 Opc = AArch64::FMLAv8f16;
9222 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9224 break;
9226 RC = &AArch64::FPR128RegClass;
9227 Opc = AArch64::FMLAv8i16_indexed;
9228 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9230 break;
9232 RC = &AArch64::FPR128RegClass;
9233 Opc = AArch64::FMLAv8f16;
9234 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9236 break;
9237
9240 RC = &AArch64::FPR128RegClass;
9242 Opc = AArch64::FMLAv2i64_indexed;
9243 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9245 } else {
9246 Opc = AArch64::FMLAv2f64;
9247 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9249 }
9250 break;
9253 RC = &AArch64::FPR128RegClass;
9255 Opc = AArch64::FMLAv2i64_indexed;
9256 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9258 } else {
9259 Opc = AArch64::FMLAv2f64;
9260 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9262 }
9263 break;
9264
9267 RC = &AArch64::FPR128RegClass;
9269 Opc = AArch64::FMLAv4i32_indexed;
9270 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9272 } else {
9273 Opc = AArch64::FMLAv4f32;
9274 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9276 }
9277 break;
9278
9281 RC = &AArch64::FPR128RegClass;
9283 Opc = AArch64::FMLAv4i32_indexed;
9284 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9286 } else {
9287 Opc = AArch64::FMLAv4f32;
9288 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9290 }
9291 break;
9292
9294 Opc = AArch64::FNMSUBHrrr;
9295 RC = &AArch64::FPR16RegClass;
9296 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9297 break;
9299 Opc = AArch64::FNMSUBSrrr;
9300 RC = &AArch64::FPR32RegClass;
9301 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9302 break;
9304 Opc = AArch64::FNMSUBDrrr;
9305 RC = &AArch64::FPR64RegClass;
9306 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9307 break;
9308
9310 Opc = AArch64::FNMADDHrrr;
9311 RC = &AArch64::FPR16RegClass;
9312 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9313 break;
9315 Opc = AArch64::FNMADDSrrr;
9316 RC = &AArch64::FPR32RegClass;
9317 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9318 break;
9320 Opc = AArch64::FNMADDDrrr;
9321 RC = &AArch64::FPR64RegClass;
9322 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9323 break;
9324
9326 Opc = AArch64::FMSUBHrrr;
9327 RC = &AArch64::FPR16RegClass;
9328 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9329 break;
9331 Opc = AArch64::FMSUBSrrr;
9332 RC = &AArch64::FPR32RegClass;
9333 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9334 break;
9336 Opc = AArch64::FMSUBDrrr;
9337 RC = &AArch64::FPR64RegClass;
9338 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9339 break;
9340
9342 Opc = AArch64::FMLSv1i32_indexed;
9343 RC = &AArch64::FPR32RegClass;
9344 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9346 break;
9347
9349 Opc = AArch64::FMLSv1i64_indexed;
9350 RC = &AArch64::FPR64RegClass;
9351 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9353 break;
9354
9357 RC = &AArch64::FPR64RegClass;
9358 Register NewVR = MRI.createVirtualRegister(RC);
9359 MachineInstrBuilder MIB1 =
9360 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
9361 .add(Root.getOperand(2));
9362 InsInstrs.push_back(MIB1);
9363 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9365 Opc = AArch64::FMLAv4f16;
9366 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9367 FMAInstKind::Accumulator, &NewVR);
9368 } else {
9369 Opc = AArch64::FMLAv4i16_indexed;
9370 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9371 FMAInstKind::Indexed, &NewVR);
9372 }
9373 break;
9374 }
9376 RC = &AArch64::FPR64RegClass;
9377 Opc = AArch64::FMLSv4f16;
9378 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9380 break;
9382 RC = &AArch64::FPR64RegClass;
9383 Opc = AArch64::FMLSv4i16_indexed;
9384 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9386 break;
9387
9390 RC = &AArch64::FPR64RegClass;
9392 Opc = AArch64::FMLSv2i32_indexed;
9393 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9395 } else {
9396 Opc = AArch64::FMLSv2f32;
9397 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9399 }
9400 break;
9401
9404 RC = &AArch64::FPR128RegClass;
9405 Register NewVR = MRI.createVirtualRegister(RC);
9406 MachineInstrBuilder MIB1 =
9407 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
9408 .add(Root.getOperand(2));
9409 InsInstrs.push_back(MIB1);
9410 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9412 Opc = AArch64::FMLAv8f16;
9413 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9414 FMAInstKind::Accumulator, &NewVR);
9415 } else {
9416 Opc = AArch64::FMLAv8i16_indexed;
9417 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9418 FMAInstKind::Indexed, &NewVR);
9419 }
9420 break;
9421 }
9423 RC = &AArch64::FPR128RegClass;
9424 Opc = AArch64::FMLSv8f16;
9425 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9427 break;
9429 RC = &AArch64::FPR128RegClass;
9430 Opc = AArch64::FMLSv8i16_indexed;
9431 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9433 break;
9434
9437 RC = &AArch64::FPR128RegClass;
9439 Opc = AArch64::FMLSv2i64_indexed;
9440 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9442 } else {
9443 Opc = AArch64::FMLSv2f64;
9444 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9446 }
9447 break;
9448
9451 RC = &AArch64::FPR128RegClass;
9453 Opc = AArch64::FMLSv4i32_indexed;
9454 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9456 } else {
9457 Opc = AArch64::FMLSv4f32;
9458 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9460 }
9461 break;
9464 RC = &AArch64::FPR64RegClass;
9465 Register NewVR = MRI.createVirtualRegister(RC);
9466 MachineInstrBuilder MIB1 =
9467 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
9468 .add(Root.getOperand(2));
9469 InsInstrs.push_back(MIB1);
9470 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9472 Opc = AArch64::FMLAv2i32_indexed;
9473 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9474 FMAInstKind::Indexed, &NewVR);
9475 } else {
9476 Opc = AArch64::FMLAv2f32;
9477 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9478 FMAInstKind::Accumulator, &NewVR);
9479 }
9480 break;
9481 }
9484 RC = &AArch64::FPR128RegClass;
9485 Register NewVR = MRI.createVirtualRegister(RC);
9486 MachineInstrBuilder MIB1 =
9487 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
9488 .add(Root.getOperand(2));
9489 InsInstrs.push_back(MIB1);
9490 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9492 Opc = AArch64::FMLAv4i32_indexed;
9493 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9494 FMAInstKind::Indexed, &NewVR);
9495 } else {
9496 Opc = AArch64::FMLAv4f32;
9497 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9498 FMAInstKind::Accumulator, &NewVR);
9499 }
9500 break;
9501 }
9504 RC = &AArch64::FPR128RegClass;
9505 Register NewVR = MRI.createVirtualRegister(RC);
9506 MachineInstrBuilder MIB1 =
9507 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
9508 .add(Root.getOperand(2));
9509 InsInstrs.push_back(MIB1);
9510 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9512 Opc = AArch64::FMLAv2i64_indexed;
9513 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9514 FMAInstKind::Indexed, &NewVR);
9515 } else {
9516 Opc = AArch64::FMLAv2f64;
9517 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9518 FMAInstKind::Accumulator, &NewVR);
9519 }
9520 break;
9521 }
9524 unsigned IdxDupOp =
9526 : 2;
9527 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
9528 &AArch64::FPR128RegClass, MRI);
9529 break;
9530 }
9533 unsigned IdxDupOp =
9535 : 2;
9536 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
9537 &AArch64::FPR128RegClass, MRI);
9538 break;
9539 }
9542 unsigned IdxDupOp =
9544 : 2;
9545 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
9546 &AArch64::FPR128_loRegClass, MRI);
9547 break;
9548 }
9551 unsigned IdxDupOp =
9553 : 2;
9554 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
9555 &AArch64::FPR128RegClass, MRI);
9556 break;
9557 }
9560 unsigned IdxDupOp =
9562 : 2;
9563 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
9564 &AArch64::FPR128_loRegClass, MRI);
9565 break;
9566 }
9568 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
9569 break;
9570 }
9572 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9573 Pattern, 4);
9574 break;
9575 }
9577 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9578 Pattern, 8);
9579 break;
9580 }
9582 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9583 Pattern, 16);
9584 break;
9585 }
9586
9587 } // end switch (Pattern)
9588 // Record MUL and ADD/SUB for deletion
9589 if (MUL)
9590 DelInstrs.push_back(MUL);
9591 DelInstrs.push_back(&Root);
9592
9593 // Set the flags on the inserted instructions to be the merged flags of the
9594 // instructions that we have combined.
9595 uint32_t Flags = Root.getFlags();
9596 if (MUL)
9597 Flags = Root.mergeFlagsWith(*MUL);
9598 for (auto *MI : InsInstrs)
9599 MI->setFlags(Flags);
9600}
9601
9602 /// Replace csinc-branch sequence by simple conditional branch
9603///
9604/// Examples:
9605/// 1. \code
9606/// csinc w9, wzr, wzr, <condition code>
9607/// tbnz w9, #0, 0x44
9608/// \endcode
9609/// to
9610/// \code
9611/// b.<inverted condition code>
9612/// \endcode
9613///
9614/// 2. \code
9615/// csinc w9, wzr, wzr, <condition code>
9616/// tbz w9, #0, 0x44
9617/// \endcode
9618/// to
9619/// \code
9620/// b.<condition code>
9621/// \endcode
9622///
9623/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
9624/// compare's constant operand is power of 2.
9625///
9626/// Examples:
9627/// \code
9628/// and w8, w8, #0x400
9629/// cbnz w8, L1
9630/// \endcode
9631/// to
9632/// \code
9633/// tbnz w8, #10, L1
9634/// \endcode
9635///
9636/// \param MI Conditional Branch
9637/// \return True when the simple conditional branch is generated
9638///
9640 bool IsNegativeBranch = false;
9641 bool IsTestAndBranch = false;
9642 unsigned TargetBBInMI = 0;
9643 switch (MI.getOpcode()) {
9644 default:
9645 llvm_unreachable("Unknown branch instruction?");
9646 case AArch64::Bcc:
9647 case AArch64::CBWPri:
9648 case AArch64::CBXPri:
9649 case AArch64::CBBAssertExt:
9650 case AArch64::CBHAssertExt:
9651 case AArch64::CBWPrr:
9652 case AArch64::CBXPrr:
9653 return false;
9654 case AArch64::CBZW:
9655 case AArch64::CBZX:
9656 TargetBBInMI = 1;
9657 break;
9658 case AArch64::CBNZW:
9659 case AArch64::CBNZX:
9660 TargetBBInMI = 1;
9661 IsNegativeBranch = true;
9662 break;
9663 case AArch64::TBZW:
9664 case AArch64::TBZX:
9665 TargetBBInMI = 2;
9666 IsTestAndBranch = true;
9667 break;
9668 case AArch64::TBNZW:
9669 case AArch64::TBNZX:
9670 TargetBBInMI = 2;
9671 IsNegativeBranch = true;
9672 IsTestAndBranch = true;
9673 break;
9674 }
9675 // So we increment a zero register and test for bits other
9676 // than bit 0? Conservatively bail out in case the verifier
9677 // missed this case.
9678 if (IsTestAndBranch && MI.getOperand(1).getImm())
9679 return false;
9680
9681 // Find Definition.
9682 assert(MI.getParent() && "Incomplete machine instruction\n");
9683 MachineBasicBlock *MBB = MI.getParent();
9684 MachineFunction *MF = MBB->getParent();
9686 Register VReg = MI.getOperand(0).getReg();
9687 if (!VReg.isVirtual())
9688 return false;
9689
9690 MachineInstr *DefMI = MRI->getVRegDef(VReg);
9691
9692 // Look through COPY instructions to find definition.
9693 while (DefMI->isCopy()) {
9694 Register CopyVReg = DefMI->getOperand(1).getReg();
9695 if (!MRI->hasOneNonDBGUse(CopyVReg))
9696 return false;
9697 if (!MRI->hasOneDef(CopyVReg))
9698 return false;
9699 DefMI = MRI->getVRegDef(CopyVReg);
9700 }
9701
9702 switch (DefMI->getOpcode()) {
9703 default:
9704 return false;
9705 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
9706 case AArch64::ANDWri:
9707 case AArch64::ANDXri: {
9708 if (IsTestAndBranch)
9709 return false;
9710 if (DefMI->getParent() != MBB)
9711 return false;
9712 if (!MRI->hasOneNonDBGUse(VReg))
9713 return false;
9714
9715 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
9717 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
9718 if (!isPowerOf2_64(Mask))
9719 return false;
9720
9721 MachineOperand &MO = DefMI->getOperand(1);
9722 Register NewReg = MO.getReg();
9723 if (!NewReg.isVirtual())
9724 return false;
9725
9726 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
9727
9728 MachineBasicBlock &RefToMBB = *MBB;
9729 MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
9730 DebugLoc DL = MI.getDebugLoc();
9731 unsigned Imm = Log2_64(Mask);
9732 unsigned Opc = (Imm < 32)
9733 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
9734 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
9735 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
9736 .addReg(NewReg)
9737 .addImm(Imm)
9738 .addMBB(TBB);
9739 // Register lives on to the CBZ now.
9740 MO.setIsKill(false);
9741
9742 // For immediate smaller than 32, we need to use the 32-bit
9743 // variant (W) in all cases. Indeed the 64-bit variant does not
9744 // allow to encode them.
9745 // Therefore, if the input register is 64-bit, we need to take the
9746 // 32-bit sub-part.
9747 if (!Is32Bit && Imm < 32)
9748 NewMI->getOperand(0).setSubReg(AArch64::sub_32);
9749 MI.eraseFromParent();
9750 return true;
9751 }
9752 // Look for CSINC
9753 case AArch64::CSINCWr:
9754 case AArch64::CSINCXr: {
9755 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
9756 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
9757 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
9758 DefMI->getOperand(2).getReg() == AArch64::XZR))
9759 return false;
9760
9761 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
9762 true) != -1)
9763 return false;
9764
9765 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
9766 // Convert only when the condition code is not modified between
9767 // the CSINC and the branch. The CC may be used by other
9768 // instructions in between.
9770 return false;
9771 MachineBasicBlock &RefToMBB = *MBB;
9772 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
9773 DebugLoc DL = MI.getDebugLoc();
9774 if (IsNegativeBranch)
9776 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
9777 MI.eraseFromParent();
9778 return true;
9779 }
9780 }
9781}
9782
9783std::pair<unsigned, unsigned>
9784AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9785 const unsigned Mask = AArch64II::MO_FRAGMENT;
9786 return std::make_pair(TF & Mask, TF & ~Mask);
9787}
9788
9790AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9791 using namespace AArch64II;
9792
9793 static const std::pair<unsigned, const char *> TargetFlags[] = {
9794 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
9795 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
9796 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
9797 {MO_HI12, "aarch64-hi12"}};
9798 return ArrayRef(TargetFlags);
9799}
9800
9802AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
9803 using namespace AArch64II;
9804
9805 static const std::pair<unsigned, const char *> TargetFlags[] = {
9806 {MO_COFFSTUB, "aarch64-coffstub"},
9807 {MO_GOT, "aarch64-got"},
9808 {MO_NC, "aarch64-nc"},
9809 {MO_S, "aarch64-s"},
9810 {MO_TLS, "aarch64-tls"},
9811 {MO_DLLIMPORT, "aarch64-dllimport"},
9812 {MO_PREL, "aarch64-prel"},
9813 {MO_TAGGED, "aarch64-tagged"},
9814 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
9815 };
9816 return ArrayRef(TargetFlags);
9817}
9818
9820AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9821 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9822 {{MOSuppressPair, "aarch64-suppress-pair"},
9823 {MOStridedAccess, "aarch64-strided-access"}};
9824 return ArrayRef(TargetFlags);
9825}
9826
9827/// Constants defining how certain sequences should be outlined.
9828/// This encompasses how an outlined function should be called, and what kind of
9829/// frame should be emitted for that outlined function.
9830///
9831/// \p MachineOutlinerDefault implies that the function should be called with
9832/// a save and restore of LR to the stack.
9833///
9834/// That is,
9835///
9836/// I1 Save LR OUTLINED_FUNCTION:
9837/// I2 --> BL OUTLINED_FUNCTION I1
9838/// I3 Restore LR I2
9839/// I3
9840/// RET
9841///
9842/// * Call construction overhead: 3 (save + BL + restore)
9843/// * Frame construction overhead: 1 (ret)
9844/// * Requires stack fixups? Yes
9845///
9846/// \p MachineOutlinerTailCall implies that the function is being created from
9847/// a sequence of instructions ending in a return.
9848///
9849/// That is,
9850///
9851/// I1 OUTLINED_FUNCTION:
9852/// I2 --> B OUTLINED_FUNCTION I1
9853/// RET I2
9854/// RET
9855///
9856/// * Call construction overhead: 1 (B)
9857/// * Frame construction overhead: 0 (Return included in sequence)
9858/// * Requires stack fixups? No
9859///
9860/// \p MachineOutlinerNoLRSave implies that the function should be called using
9861/// a BL instruction, but doesn't require LR to be saved and restored. This
9862/// happens when LR is known to be dead.
9863///
9864/// That is,
9865///
9866/// I1 OUTLINED_FUNCTION:
9867/// I2 --> BL OUTLINED_FUNCTION I1
9868/// I3 I2
9869/// I3
9870/// RET
9871///
9872/// * Call construction overhead: 1 (BL)
9873/// * Frame construction overhead: 1 (RET)
9874/// * Requires stack fixups? No
9875///
9876/// \p MachineOutlinerThunk implies that the function is being created from
9877/// a sequence of instructions ending in a call. The outlined function is
9878/// called with a BL instruction, and the outlined function tail-calls the
9879/// original call destination.
9880///
9881/// That is,
9882///
9883/// I1 OUTLINED_FUNCTION:
9884/// I2 --> BL OUTLINED_FUNCTION I1
9885/// BL f I2
9886/// B f
9887/// * Call construction overhead: 1 (BL)
9888/// * Frame construction overhead: 0
9889/// * Requires stack fixups? No
9890///
9891/// \p MachineOutlinerRegSave implies that the function should be called with a
9892/// save and restore of LR to an available register. This allows us to avoid
9893/// stack fixups. Note that this outlining variant is compatible with the
9894/// NoLRSave case.
9895///
9896/// That is,
9897///
9898/// I1 Save LR OUTLINED_FUNCTION:
9899/// I2 --> BL OUTLINED_FUNCTION I1
9900/// I3 Restore LR I2
9901/// I3
9902/// RET
9903///
9904/// * Call construction overhead: 3 (save + BL + restore)
9905/// * Frame construction overhead: 1 (ret)
9906/// * Requires stack fixups? No
9908 MachineOutlinerDefault, /// Emit a save, restore, call, and return.
9909 MachineOutlinerTailCall, /// Only emit a branch.
9910 MachineOutlinerNoLRSave, /// Emit a call and return.
9911 MachineOutlinerThunk, /// Emit a call and tail-call.
9912 MachineOutlinerRegSave /// Same as default, but save to a register.
9913};
9914
9920
9922AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
9923 MachineFunction *MF = C.getMF();
9924 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
9925 const AArch64RegisterInfo *ARI =
9926 static_cast<const AArch64RegisterInfo *>(&TRI);
9927 // Check if there is an available register across the sequence that we can
9928 // use.
9929 for (unsigned Reg : AArch64::GPR64RegClass) {
9930 if (!ARI->isReservedReg(*MF, Reg) &&
9931 Reg != AArch64::LR && // LR is not reserved, but don't use it.
9932 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
9933 Reg != AArch64::X17 && // Ditto for X17.
9934 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
9935 C.isAvailableInsideSeq(Reg, TRI))
9936 return Reg;
9937 }
9938 return Register();
9939}
9940
9941static bool
9943 const outliner::Candidate &b) {
9944 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
9945 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
9946
9947 return MFIa->getSignReturnAddressCondition() ==
9949}
9950
9951static bool
9953 const outliner::Candidate &b) {
9954 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
9955 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
9956
9957 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
9958}
9959
9961 const outliner::Candidate &b) {
9962 const AArch64Subtarget &SubtargetA =
9964 const AArch64Subtarget &SubtargetB =
9965 b.getMF()->getSubtarget<AArch64Subtarget>();
9966 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
9967}
9968
9969std::optional<std::unique_ptr<outliner::OutlinedFunction>>
9970AArch64InstrInfo::getOutliningCandidateInfo(
9971 const MachineModuleInfo &MMI,
9972 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
9973 unsigned MinRepeats) const {
9974 unsigned SequenceSize = 0;
9975 for (auto &MI : RepeatedSequenceLocs[0])
9976 SequenceSize += getInstSizeInBytes(MI);
9977
9978 unsigned NumBytesToCreateFrame = 0;
9979
9980 // Avoid splitting ADRP ADD/LDR pair into outlined functions.
9981 // These instructions are fused together by the scheduler.
9982 // Any candidate where ADRP is the last instruction should be rejected
9983 // as that will lead to splitting ADRP pair.
9984 MachineInstr &LastMI = RepeatedSequenceLocs[0].back();
9985 MachineInstr &FirstMI = RepeatedSequenceLocs[0].front();
9986 if (LastMI.getOpcode() == AArch64::ADRP &&
9987 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_PAGE) != 0 &&
9988 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_GOT) != 0) {
9989 return std::nullopt;
9990 }
9991
9992 // Similarly any candidate where the first instruction is ADD/LDR with a
9993 // page offset should be rejected to avoid ADRP splitting.
9994 if ((FirstMI.getOpcode() == AArch64::ADDXri ||
9995 FirstMI.getOpcode() == AArch64::LDRXui) &&
9996 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_PAGEOFF) != 0 &&
9997 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) != 0) {
9998 return std::nullopt;
9999 }
10000
10001 // We only allow outlining for functions having exactly matching return
10002 // address signing attributes, i.e., all share the same value for the
10003 // attribute "sign-return-address" and all share the same type of key they
10004 // are signed with.
10005 // Additionally we require all functions to simultaneously either support
10006 // v8.3a features or not. Otherwise an outlined function could get signed
10007 // using dedicated v8.3 instructions and a call from a function that doesn't
10008 // support v8.3 instructions would therefore be invalid.
10009 if (std::adjacent_find(
10010 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
10011 [](const outliner::Candidate &a, const outliner::Candidate &b) {
10012 // Return true if a and b are non-equal w.r.t. return address
10013 // signing or support of v8.3a features
10014 if (outliningCandidatesSigningScopeConsensus(a, b) &&
10015 outliningCandidatesSigningKeyConsensus(a, b) &&
10016 outliningCandidatesV8_3OpsConsensus(a, b)) {
10017 return false;
10018 }
10019 return true;
10020 }) != RepeatedSequenceLocs.end()) {
10021 return std::nullopt;
10022 }
10023
10024 // Since at this point all candidates agree on their return address signing
10025 // picking just one is fine. If the candidate functions potentially sign their
10026 // return addresses, the outlined function should do the same. Note that in
10027 // the case of "sign-return-address"="non-leaf" this is an assumption: It is
10028 // not certainly true that the outlined function will have to sign its return
10029 // address but this decision is made later, when the decision to outline
10030 // has already been made.
10031 // The same holds for the number of additional instructions we need: On
10032 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
10033 // necessary. However, at this point we don't know if the outlined function
10034 // will have a RET instruction so we assume the worst.
10035 const TargetRegisterInfo &TRI = getRegisterInfo();
10036 // Performing a tail call may require extra checks when PAuth is enabled.
10037 // If PAuth is disabled, set it to zero for uniformity.
10038 unsigned NumBytesToCheckLRInTCEpilogue = 0;
10039 const auto RASignCondition = RepeatedSequenceLocs[0]
10040 .getMF()
10041 ->getInfo<AArch64FunctionInfo>()
10042 ->getSignReturnAddressCondition();
10043 if (RASignCondition != SignReturnAddress::None) {
10044 // One PAC and one AUT instructions
10045 NumBytesToCreateFrame += 8;
10046
10047 // PAuth is enabled - set extra tail call cost, if any.
10048 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
10049 *RepeatedSequenceLocs[0].getMF());
10050 NumBytesToCheckLRInTCEpilogue =
10052 // Checking the authenticated LR value may significantly impact
10053 // SequenceSize, so account for it for more precise results.
10054 if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
10055 SequenceSize += NumBytesToCheckLRInTCEpilogue;
10056
10057 // We have to check if sp modifying instructions would get outlined.
10058 // If so we only allow outlining if sp is unchanged overall, so matching
10059 // sub and add instructions are okay to outline, all other sp modifications
10060 // are not
10061 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
10062 int SPValue = 0;
10063 for (auto &MI : C) {
10064 if (MI.modifiesRegister(AArch64::SP, &TRI)) {
10065 switch (MI.getOpcode()) {
10066 case AArch64::ADDXri:
10067 case AArch64::ADDWri:
10068 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10069 assert(MI.getOperand(2).isImm() &&
10070 "Expected operand to be immediate");
10071 assert(MI.getOperand(1).isReg() &&
10072 "Expected operand to be a register");
10073 // Check if the add just increments sp. If so, we search for
10074 // matching sub instructions that decrement sp. If not, the
10075 // modification is illegal
10076 if (MI.getOperand(1).getReg() == AArch64::SP)
10077 SPValue += MI.getOperand(2).getImm();
10078 else
10079 return true;
10080 break;
10081 case AArch64::SUBXri:
10082 case AArch64::SUBWri:
10083 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10084 assert(MI.getOperand(2).isImm() &&
10085 "Expected operand to be immediate");
10086 assert(MI.getOperand(1).isReg() &&
10087 "Expected operand to be a register");
10088 // Check if the sub just decrements sp. If so, we search for
10089 // matching add instructions that increment sp. If not, the
10090 // modification is illegal
10091 if (MI.getOperand(1).getReg() == AArch64::SP)
10092 SPValue -= MI.getOperand(2).getImm();
10093 else
10094 return true;
10095 break;
10096 default:
10097 return true;
10098 }
10099 }
10100 }
10101 if (SPValue)
10102 return true;
10103 return false;
10104 };
10105 // Remove candidates with illegal stack modifying instructions
10106 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
10107
10108 // If the sequence doesn't have enough candidates left, then we're done.
10109 if (RepeatedSequenceLocs.size() < MinRepeats)
10110 return std::nullopt;
10111 }
10112
10113 // Properties about candidate MBBs that hold for all of them.
10114 unsigned FlagsSetInAll = 0xF;
10115
10116 // Compute liveness information for each candidate, and set FlagsSetInAll.
10117 for (outliner::Candidate &C : RepeatedSequenceLocs)
10118 FlagsSetInAll &= C.Flags;
10119
10120 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
10121
10122 // Helper lambda which sets call information for every candidate.
10123 auto SetCandidateCallInfo =
10124 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
10125 for (outliner::Candidate &C : RepeatedSequenceLocs)
10126 C.setCallInfo(CallID, NumBytesForCall);
10127 };
10128
10129 unsigned FrameID = MachineOutlinerDefault;
10130 NumBytesToCreateFrame += 4;
10131
10132 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
10133 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
10134 });
10135
10136 // We check to see if CFI Instructions are present, and if they are
10137 // we find the number of CFI Instructions in the candidates.
10138 unsigned CFICount = 0;
10139 for (auto &I : RepeatedSequenceLocs[0]) {
10140 if (I.isCFIInstruction())
10141 CFICount++;
10142 }
10143
10144 // We compare the number of found CFI Instructions to the number of CFI
10145 // instructions in the parent function for each candidate. We must check this
10146 // since if we outline one of the CFI instructions in a function, we have to
10147 // outline them all for correctness. If we do not, the address offsets will be
10148 // incorrect between the two sections of the program.
10149 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10150 std::vector<MCCFIInstruction> CFIInstructions =
10151 C.getMF()->getFrameInstructions();
10152
10153 if (CFICount > 0 && CFICount != CFIInstructions.size())
10154 return std::nullopt;
10155 }
10156
10157 // Returns true if an instructions is safe to fix up, false otherwise.
10158 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
10159 if (MI.isCall())
10160 return true;
10161
10162 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
10163 !MI.readsRegister(AArch64::SP, &TRI))
10164 return true;
10165
10166 // Any modification of SP will break our code to save/restore LR.
10167 // FIXME: We could handle some instructions which add a constant
10168 // offset to SP, with a bit more work.
10169 if (MI.modifiesRegister(AArch64::SP, &TRI))
10170 return false;
10171
10172 // At this point, we have a stack instruction that we might need to
10173 // fix up. We'll handle it if it's a load or store.
10174 if (MI.mayLoadOrStore()) {
10175 const MachineOperand *Base; // Filled with the base operand of MI.
10176 int64_t Offset; // Filled with the offset of MI.
10177 bool OffsetIsScalable;
10178
10179 // Does it allow us to offset the base operand and is the base the
10180 // register SP?
10181 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
10182 !Base->isReg() || Base->getReg() != AArch64::SP)
10183 return false;
10184
      // Fix-up code below assumes byte offsets.
10186 if (OffsetIsScalable)
10187 return false;
10188
10189 // Find the minimum/maximum offset for this instruction and check
10190 // if fixing it up would be in range.
10191 int64_t MinOffset,
10192 MaxOffset; // Unscaled offsets for the instruction.
10193 // The scale to multiply the offsets by.
10194 TypeSize Scale(0U, false), DummyWidth(0U, false);
10195 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
10196
10197 Offset += 16; // Update the offset to what it would be if we outlined.
10198 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
10199 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
10200 return false;
10201
10202 // It's in range, so we can outline it.
10203 return true;
10204 }
10205
10206 // FIXME: Add handling for instructions like "add x0, sp, #8".
10207
10208 // We can't fix it up, so don't outline it.
10209 return false;
10210 };
10211
10212 // True if it's possible to fix up each stack instruction in this sequence.
10213 // Important for frames/call variants that modify the stack.
10214 bool AllStackInstrsSafe =
10215 llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);
10216
10217 // If the last instruction in any candidate is a terminator, then we should
10218 // tail call all of the candidates.
10219 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10220 FrameID = MachineOutlinerTailCall;
10221 NumBytesToCreateFrame = 0;
10222 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
10223 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
10224 }
10225
10226 else if (LastInstrOpcode == AArch64::BL ||
10227 ((LastInstrOpcode == AArch64::BLR ||
10228 LastInstrOpcode == AArch64::BLRNoIP) &&
10229 !HasBTI)) {
10230 // FIXME: Do we need to check if the code after this uses the value of LR?
10231 FrameID = MachineOutlinerThunk;
10232 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
10233 SetCandidateCallInfo(MachineOutlinerThunk, 4);
10234 }
10235
10236 else {
10237 // We need to decide how to emit calls + frames. We can always emit the same
10238 // frame if we don't need to save to the stack. If we have to save to the
10239 // stack, then we need a different frame.
10240 unsigned NumBytesNoStackCalls = 0;
10241 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
10242
10243 // Check if we have to save LR.
10244 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10245 bool LRAvailable =
10247 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
10248 : true;
10249 // If we have a noreturn caller, then we're going to be conservative and
10250 // say that we have to save LR. If we don't have a ret at the end of the
10251 // block, then we can't reason about liveness accurately.
10252 //
10253 // FIXME: We can probably do better than always disabling this in
10254 // noreturn functions by fixing up the liveness info.
10255 bool IsNoReturn =
10256 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
10257
10258 // Is LR available? If so, we don't need a save.
10259 if (LRAvailable && !IsNoReturn) {
10260 NumBytesNoStackCalls += 4;
10261 C.setCallInfo(MachineOutlinerNoLRSave, 4);
10262 CandidatesWithoutStackFixups.push_back(C);
10263 }
10264
10265 // Is an unused register available? If so, we won't modify the stack, so
10266 // we can outline with the same frame type as those that don't save LR.
10267 else if (findRegisterToSaveLRTo(C)) {
10268 NumBytesNoStackCalls += 12;
10269 C.setCallInfo(MachineOutlinerRegSave, 12);
10270 CandidatesWithoutStackFixups.push_back(C);
10271 }
10272
10273 // Is SP used in the sequence at all? If not, we don't have to modify
10274 // the stack, so we are guaranteed to get the same frame.
10275 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
10276 NumBytesNoStackCalls += 12;
10277 C.setCallInfo(MachineOutlinerDefault, 12);
10278 CandidatesWithoutStackFixups.push_back(C);
10279 }
10280
10281 // If we outline this, we need to modify the stack. Pretend we don't
10282 // outline this by saving all of its bytes.
10283 else {
10284 NumBytesNoStackCalls += SequenceSize;
10285 }
10286 }
10287
10288 // If there are no places where we have to save LR, then note that we
10289 // don't have to update the stack. Otherwise, give every candidate the
10290 // default call type, as long as it's safe to do so.
10291 if (!AllStackInstrsSafe ||
10292 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
10293 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
10294 FrameID = MachineOutlinerNoLRSave;
10295 if (RepeatedSequenceLocs.size() < MinRepeats)
10296 return std::nullopt;
10297 } else {
10298 SetCandidateCallInfo(MachineOutlinerDefault, 12);
10299
10300 // Bugzilla ID: 46767
10301 // TODO: Check if fixing up the stack more than once is safe so we can
10302 // outline these.
10303 //
10304 // An outline resulting in a caller that requires stack fixups at the
10305 // callsite to a callee that also requires stack fixups can happen when
10306 // there are no available registers at the candidate callsite for a
10307 // candidate that itself also has calls.
10308 //
10309 // In other words if function_containing_sequence in the following pseudo
10310 // assembly requires that we save LR at the point of the call, but there
10311 // are no available registers: in this case we save using SP and as a
10312 // result the SP offsets requires stack fixups by multiples of 16.
10313 //
10314 // function_containing_sequence:
10315 // ...
10316 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10317 // call OUTLINED_FUNCTION_N
10318 // restore LR from SP
10319 // ...
10320 //
10321 // OUTLINED_FUNCTION_N:
10322 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10323 // ...
10324 // bl foo
10325 // restore LR from SP
10326 // ret
10327 //
10328 // Because the code to handle more than one stack fixup does not
10329 // currently have the proper checks for legality, these cases will assert
10330 // in the AArch64 MachineOutliner. This is because the code to do this
10331 // needs more hardening, testing, better checks that generated code is
10332 // legal, etc and because it is only verified to handle a single pass of
10333 // stack fixup.
10334 //
10335 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
10336 // these cases until they are known to be handled. Bugzilla 46767 is
10337 // referenced in comments at the assert site.
10338 //
10339 // To avoid asserting (or generating non-legal code on noassert builds)
10340 // we remove all candidates which would need more than one stack fixup by
10341 // pruning the cases where the candidate has calls while also having no
10342 // available LR and having no available general purpose registers to copy
10343 // LR to (ie one extra stack save/restore).
10344 //
10345 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10346 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
10347 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
10348 return (llvm::any_of(C, IsCall)) &&
10349 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
10350 !findRegisterToSaveLRTo(C));
10351 });
10352 }
10353 }
10354
10355 // If we dropped all of the candidates, bail out here.
10356 if (RepeatedSequenceLocs.size() < MinRepeats)
10357 return std::nullopt;
10358 }
10359
10360 // Does every candidate's MBB contain a call? If so, then we might have a call
10361 // in the range.
10362 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10363 // Check if the range contains a call. These require a save + restore of the
10364 // link register.
10365 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
10366 bool ModStackToSaveLR = false;
10367 if (any_of(drop_end(FirstCand),
10368 [](const MachineInstr &MI) { return MI.isCall(); }))
10369 ModStackToSaveLR = true;
10370
10371 // Handle the last instruction separately. If this is a tail call, then the
10372 // last instruction is a call. We don't want to save + restore in this case.
10373 // However, it could be possible that the last instruction is a call without
10374 // it being valid to tail call this sequence. We should consider this as
10375 // well.
10376 else if (FrameID != MachineOutlinerThunk &&
10377 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
10378 ModStackToSaveLR = true;
10379
10380 if (ModStackToSaveLR) {
10381 // We can't fix up the stack. Bail out.
10382 if (!AllStackInstrsSafe)
10383 return std::nullopt;
10384
10385 // Save + restore LR.
10386 NumBytesToCreateFrame += 8;
10387 }
10388 }
10389
10390 // If we have CFI instructions, we can only outline if the outlined section
10391 // can be a tail call
10392 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
10393 return std::nullopt;
10394
10395 return std::make_unique<outliner::OutlinedFunction>(
10396 RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID);
10397}
10398
10399void AArch64InstrInfo::mergeOutliningCandidateAttributes(
10400 Function &F, std::vector<outliner::Candidate> &Candidates) const {
10401 // If a bunch of candidates reach this point they must agree on their return
10402 // address signing. It is therefore enough to just consider the signing
10403 // behaviour of one of them
10404 const auto &CFn = Candidates.front().getMF()->getFunction();
10405
10406 if (CFn.hasFnAttribute("ptrauth-returns"))
10407 F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
10408 if (CFn.hasFnAttribute("ptrauth-auth-traps"))
10409 F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
10410 // Since all candidates belong to the same module, just copy the
10411 // function-level attributes of an arbitrary function.
10412 if (CFn.hasFnAttribute("sign-return-address"))
10413 F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
10414 if (CFn.hasFnAttribute("sign-return-address-key"))
10415 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
10416
10417 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
10418}
10419
// Returns true if it is safe for the MachineOutliner to extract code from MF.
// Rejects functions the linker may deduplicate, functions pinned to a named
// section, functions that (possibly) have a red zone, and functions with
// streaming-mode changes, since outlining may rewrite their stack accesses
// or break layout assumptions.
bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
    MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
  const Function &F = MF.getFunction();

  // Can F be deduplicated by the linker? If it can, don't outline from it.
  if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
    return false;

  // Don't outline from functions with section markings; the program could
  // expect that all the code is in the named section.
  // FIXME: Allow outlining from multiple functions with the same section
  // marking.
  if (F.hasSection())
    return false;

  // Outlining from functions with redzones is unsafe since the outliner may
  // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
  // outline from it.
  // Note: value_or(true) deliberately treats "unknown" the same as "has a
  // red zone", i.e. conservatively refuses to outline.
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  if (!AFI || AFI->hasRedZone().value_or(true))
    return false;

  // FIXME: Determine whether it is safe to outline from functions which contain
  // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
  // outlined together and ensure it is safe to outline with async unwind info,
  // required for saving & restoring VG around calls.
  if (AFI->hasStreamingModeChanges())
    return false;

  // FIXME: Teach the outliner to generate/handle Windows unwind info.
  // NOTE(review): the guarding `if` for the following return appears to be
  // missing from this excerpt (likely a Windows-CFI check) — confirm against
  // the full source.
    return false;

  // It's safe to outline from MF.
  return true;
}
10456
// Partition MBB into the sub-ranges that are legal to outline from and return
// them in top-down order. Ranges are chosen so that x16/x17 and NZCV are dead
// throughout (see the PCS comment below). Flags is updated with whole-block
// properties as a side effect.
// NOTE(review): several lines of this definition appear to be missing from
// this excerpt (the SmallVector return-type line, the head of the tracksLiveness
// assert, and two Flags-updating statements). The visible code is preserved
// byte-for-byte; confirm against the full source before further edits.
AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
                                      unsigned &Flags) const {
         "Must track liveness!");
      std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
      Ranges;
  // According to the AArch64 Procedure Call Standard, the following are
  // undefined on entry/exit from a function call:
  //
  // * Registers x16, x17, (and thus w16, w17)
  // * Condition codes (and thus the NZCV register)
  //
  // If any of these registers are used inside or live across an outlined
  // function, then they may be modified later, either by the compiler or
  // some other tool (like the linker).
  //
  // To avoid outlining in these situations, partition each block into ranges
  // where these registers are dead. We will only outline from those ranges.
  LiveRegUnits LRU(getRegisterInfo());
  // True while none of the PCS-unsafe registers (w16/w17/NZCV) is live.
  auto AreAllUnsafeRegsDead = [&LRU]() {
    return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
           LRU.available(AArch64::NZCV);
  };

  // We need to know if LR is live across an outlining boundary later on in
  // order to decide how we'll create the outlined call, frame, etc.
  //
  // It's pretty expensive to check this for *every candidate* within a block.
  // That's some potentially n^2 behaviour, since in the worst case, we'd need
  // to compute liveness from the end of the block for O(n) candidates within
  // the block.
  //
  // So, to improve the average case, let's keep track of liveness from the end
  // of the block to the beginning of *every outlinable range*. If we know that
  // LR is available in every range we could outline from, then we know that
  // we don't need to check liveness for any candidate within that range.
  bool LRAvailableEverywhere = true;
  // Compute liveness bottom-up.
  LRU.addLiveOuts(MBB);
  // Update flags that require info about the entire MBB.
  auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
    if (MI.isCall() && !MI.isTerminator())
      // NOTE(review): the body of this `if` (a Flags update, presumably
      // marking the block as containing calls) is missing from this excerpt.
  };
  // Range: [RangeBegin, RangeEnd)
  MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
  unsigned RangeLen;
  // Reset the current range to the single instruction at NewBegin.
  auto CreateNewRangeStartingAt =
      [&RangeBegin, &RangeEnd,
       &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
        RangeBegin = NewBegin;
        RangeEnd = std::next(RangeBegin);
        RangeLen = 0;
      };
  auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
    // At least one unsafe register is not dead. We do not want to outline at
    // this point. If it is long enough to outline from and does not cross a
    // bundle boundary, save the range [RangeBegin, RangeEnd).
    if (RangeLen <= 1)
      return;
    if (!RangeBegin.isEnd() && RangeBegin->isBundledWithPred())
      return;
    if (!RangeEnd.isEnd() && RangeEnd->isBundledWithPred())
      return;
    Ranges.emplace_back(RangeBegin, RangeEnd);
  };
  // Find the first point where all unsafe registers are dead.
  // FIND: <safe instr> <-- end of first potential range
  // SKIP: <unsafe def>
  // SKIP: ... everything between ...
  // SKIP: <unsafe use>
  auto FirstPossibleEndPt = MBB.instr_rbegin();
  for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
    LRU.stepBackward(*FirstPossibleEndPt);
    // Update flags that impact how we outline across the entire block,
    // regardless of safety.
    UpdateWholeMBBFlags(*FirstPossibleEndPt);
    if (AreAllUnsafeRegsDead())
      break;
  }
  // If we exhausted the entire block, we have no safe ranges to outline.
  if (FirstPossibleEndPt == MBB.instr_rend())
    return Ranges;
  // Current range.
  CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
  // StartPt points to the first place where all unsafe registers
  // are dead (if there is any such point). Begin partitioning the MBB into
  // ranges.
  for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
    LRU.stepBackward(MI);
    UpdateWholeMBBFlags(MI);
    if (!AreAllUnsafeRegsDead()) {
      // An unsafe register became live: close the current range (if it is
      // worth saving) and start a fresh one at this instruction.
      SaveRangeIfNonEmpty();
      CreateNewRangeStartingAt(MI.getIterator());
      continue;
    }
    LRAvailableEverywhere &= LRU.available(AArch64::LR);
    // Still safe: extend the current range upward (we iterate bottom-up).
    RangeBegin = MI.getIterator();
    ++RangeLen;
  }
  // Above loop misses the last (or only) range. If we are still safe, then
  // let's save the range.
  if (AreAllUnsafeRegsDead())
    SaveRangeIfNonEmpty();
  if (Ranges.empty())
    return Ranges;
  // We found the ranges bottom-up. Mapping expects the top-down. Reverse
  // the order.
  std::reverse(Ranges.begin(), Ranges.end());
  // If there is at least one outlinable range where LR is unavailable
  // somewhere, remember that.
  if (!LRAvailableEverywhere)
    // NOTE(review): the body of this `if` (a Flags update recording that LR
    // is unavailable somewhere) is missing from this excerpt.
  return Ranges;
}
10574
// Classify one instruction for the MachineOutliner: may it appear inside an
// outlined sequence, only terminate one, or not be outlined at all.
// NOTE(review): this excerpt is missing the return-type/parameter lines of
// the signature and roughly ten `return outliner::InstrType::...;` statements
// (after the signing-opcode switch, the CFI/terminator/operand checks, the
// ADRP/mcount/callee checks, and the W30/BTI checks). Visible code is
// preserved byte-for-byte; confirm against the full source.
AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
                                       unsigned Flags) const {
  MachineInstr &MI = *MIT;

  // Don't outline anything used for return address signing. The outlined
  // function will get signed later if needed
  switch (MI.getOpcode()) {
  case AArch64::PACM:
  case AArch64::PACIASP:
  case AArch64::PACIBSP:
  case AArch64::PACIASPPC:
  case AArch64::PACIBSPPC:
  case AArch64::AUTIASP:
  case AArch64::AUTIBSP:
  case AArch64::AUTIASPPCi:
  case AArch64::AUTIASPPCr:
  case AArch64::AUTIBSPPCi:
  case AArch64::AUTIBSPPCr:
  case AArch64::RETAA:
  case AArch64::RETAB:
  case AArch64::RETAASPPCi:
  case AArch64::RETAASPPCr:
  case AArch64::RETABSPPCi:
  case AArch64::RETABSPPCr:
  case AArch64::EMITBKEY:
  case AArch64::PAUTH_PROLOGUE:
  case AArch64::PAUTH_EPILOGUE:
    // NOTE(review): return statement missing here in this excerpt.
  }

  // We can only outline these if we will tail call the outlined function, or
  // fix up the CFI offsets. Currently, CFI instructions are outlined only if
  // in a tail call.
  //
  // FIXME: If the proper fixups for the offset are implemented, this should be
  // possible.
  if (MI.isCFIInstruction())
    // NOTE(review): return statement missing here in this excerpt.

  // Is this a terminator for a basic block?
  if (MI.isTerminator())
    // TargetInstrInfo::getOutliningType has already filtered out anything
    // that would break this, so we can allow it here.
    // NOTE(review): return statement missing here in this excerpt.

  // Make sure none of the operands are un-outlinable.
  for (const MachineOperand &MOP : MI.operands()) {
    // A check preventing CFI indices was here before, but only CFI
    // instructions should have those.
    assert(!MOP.isCFIIndex());

    // If it uses LR or W30 explicitly, then don't touch it.
    if (MOP.isReg() && !MOP.isImplicit() &&
        (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
      // NOTE(review): return statement missing here in this excerpt.
  }

  // Special cases for instructions that can always be outlined, but will fail
  // the later tests. e.g, ADRPs, which are PC-relative use LR, but can always
  // be outlined because they don't require a *specific* value to be in LR.
  if (MI.getOpcode() == AArch64::ADRP)
    // NOTE(review): return statement missing here in this excerpt.

  // If MI is a call we might be able to outline it. We don't want to outline
  // any calls that rely on the position of items on the stack. When we outline
  // something containing a call, we have to emit a save and restore of LR in
  // the outlined function. Currently, this always happens by saving LR to the
  // stack. Thus, if we outline, say, half the parameters for a function call
  // plus the call, then we'll break the callee's expectations for the layout
  // of the stack.
  //
  // FIXME: Allow calls to functions which construct a stack frame, as long
  // as they don't access arguments on the stack.
  // FIXME: Figure out some way to analyze functions defined in other modules.
  // We should be able to compute the memory usage based on the IR calling
  // convention, even if we can't see the definition.
  if (MI.isCall()) {
    // Get the function associated with the call. Look at each operand and find
    // the one that represents the callee and get its name.
    const Function *Callee = nullptr;
    for (const MachineOperand &MOP : MI.operands()) {
      if (MOP.isGlobal()) {
        Callee = dyn_cast<Function>(MOP.getGlobal());
        break;
      }
    }

    // Never outline calls to mcount. There isn't any rule that would require
    // this, but the Linux kernel's "ftrace" feature depends on it.
    if (Callee && Callee->getName() == "\01_mcount")
      // NOTE(review): return statement missing here in this excerpt.

    // If we don't know anything about the callee, assume it depends on the
    // stack layout of the caller. In that case, it's only legal to outline
    // as a tail-call. Explicitly list the call instructions we know about so we
    // don't get unexpected results with call pseudo-instructions.
    auto UnknownCallOutlineType = outliner::InstrType::Illegal;
    if (MI.getOpcode() == AArch64::BLR ||
        MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
      UnknownCallOutlineType = outliner::InstrType::LegalTerminator;

    if (!Callee)
      return UnknownCallOutlineType;

    // We have a function we have information about. Check it if it's something
    // can safely outline.
    MachineFunction *CalleeMF = MMI.getMachineFunction(*Callee);

    // We don't know what's going on with the callee at all. Don't touch it.
    if (!CalleeMF)
      return UnknownCallOutlineType;

    // Check if we know anything about the callee saves on the function. If we
    // don't, then don't touch it, since that implies that we haven't
    // computed anything about its stack frame yet.
    MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
    if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
        MFI.getNumObjects() > 0)
      return UnknownCallOutlineType;

    // At this point, we can say that CalleeMF ought to not pass anything on the
    // stack. Therefore, we can outline it.
    // NOTE(review): return statement missing here in this excerpt.
  }

  // Don't touch the link register or W30.
  if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
      MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
    // NOTE(review): return statement missing here in this excerpt.

  // Don't outline BTI instructions, because that will prevent the outlining
  // site from being indirectly callable.
  if (hasBTISemantics(MI))
    // NOTE(review): return statement missing here in this excerpt.

  // NOTE(review): final return statement missing here in this excerpt.
}
10714
10715void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
10716 for (MachineInstr &MI : MBB) {
10717 const MachineOperand *Base;
10718 TypeSize Width(0, false);
10719 int64_t Offset;
10720 bool OffsetIsScalable;
10721
10722 // Is this a load or store with an immediate offset with SP as the base?
10723 if (!MI.mayLoadOrStore() ||
10724 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
10725 &RI) ||
10726 (Base->isReg() && Base->getReg() != AArch64::SP))
10727 continue;
10728
10729 // It is, so we have to fix it up.
10730 TypeSize Scale(0U, false);
10731 int64_t Dummy1, Dummy2;
10732
10733 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
10734 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
10735 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
10736 assert(Scale != 0 && "Unexpected opcode!");
10737 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
10738
10739 // We've pushed the return address to the stack, so add 16 to the offset.
10740 // This is safe, since we already checked if it would overflow when we
10741 // checked if this instruction was legal to outline.
10742 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
10743 StackOffsetOperand.setImm(NewImm);
10744 }
10745}
10746
                                 const AArch64InstrInfo *TII,
                                 bool ShouldSignReturnAddr) {
  // Helper for buildOutlinedFrame: if requested, wrap the outlined body in a
  // pointer-auth prologue/epilogue pseudo pair so the return address is
  // signed on entry and authenticated before returning.
  // NOTE(review): the first line of this static helper's signature and the
  // trailing builder lines of the two BuildMI statements appear to be missing
  // from this excerpt — confirm against the full source.
  if (!ShouldSignReturnAddr)
    return;

  // PAUTH_PROLOGUE at the very start, PAUTH_EPILOGUE just before the
  // terminator, so the whole body executes between sign and authenticate.
  BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
  BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(),
          TII->get(AArch64::PAUTH_EPILOGUE))
}
10759
// Build the frame for an outlined function: rewrite thunk tails into tail
// calls, save/restore LR around interior calls (with CFI where DWARF unwind
// info is needed), insert the final RET when not tail-calling, and sign the
// return address when required.
// NOTE(review): this excerpt is missing the first parameter line of the
// signature, the `Call->eraseFromParent()` after the thunk rewrite, and the
// declarations of the `It`/`Et` insertion iterators. Visible code is
// preserved byte-for-byte; confirm against the full source.
void AArch64InstrInfo::buildOutlinedFrame(
    const outliner::OutlinedFunction &OF) const {

  AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();

  if (OF.FrameConstructionID == MachineOutlinerTailCall)
    FI->setOutliningStyle("Tail Call");
  else if (OF.FrameConstructionID == MachineOutlinerThunk) {
    // For thunk outlining, rewrite the last instruction from a call to a
    // tail-call.
    MachineInstr *Call = &*--MBB.instr_end();
    unsigned TailOpcode;
    if (Call->getOpcode() == AArch64::BL) {
      TailOpcode = AArch64::TCRETURNdi;
    } else {
      assert(Call->getOpcode() == AArch64::BLR ||
             Call->getOpcode() == AArch64::BLRNoIP);
      TailOpcode = AArch64::TCRETURNriALL;
    }
    // Reuse the original call's callee operand for the tail-call return.
    MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
                           .add(Call->getOperand(0))
                           .addImm(0);
    MBB.insert(MBB.end(), TC);
    // NOTE(review): removal of the original call instruction appears to be
    // missing from this excerpt.

    FI->setOutliningStyle("Thunk");
  }

  bool IsLeafFunction = true;

  // Is there a call in the outlined range?
  auto IsNonTailCall = [](const MachineInstr &MI) {
    return MI.isCall() && !MI.isReturn();
  };

  if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
    // Fix up the instructions in the range, since we're going to modify the
    // stack.

    // Bugzilla ID: 46767
    // TODO: Check if fixing up twice is safe so we can outline these.
    assert(OF.FrameConstructionID != MachineOutlinerDefault &&
           "Can only fix up stack references once");
    fixupPostOutline(MBB);

    IsLeafFunction = false;

    // LR has to be a live in so that we can save it.
    if (!MBB.isLiveIn(AArch64::LR))
      MBB.addLiveIn(AArch64::LR);

    // NOTE(review): the declarations of the save/restore insertion iterators
    // (`It`/`Et`) appear to be missing from this excerpt.

    if (OF.FrameConstructionID == MachineOutlinerTailCall ||
        OF.FrameConstructionID == MachineOutlinerThunk)
      Et = std::prev(MBB.end());

    // Insert a save before the outlined region
    MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
                                .addReg(AArch64::SP, RegState::Define)
                                .addReg(AArch64::LR)
                                .addReg(AArch64::SP)
                                .addImm(-16);
    It = MBB.insert(It, STRXpre);

    if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
      CFIInstBuilder CFIBuilder(MBB, It, MachineInstr::FrameSetup);

      // Add a CFI saying the stack was moved 16 B down.
      CFIBuilder.buildDefCFAOffset(16);

      // Add a CFI saying that the LR that we want to find is now 16 B higher
      // than before.
      CFIBuilder.buildOffset(AArch64::LR, -16);
    }

    // Insert a restore before the terminator for the function.
    MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                                 .addReg(AArch64::SP, RegState::Define)
                                 .addReg(AArch64::LR, RegState::Define)
                                 .addReg(AArch64::SP)
                                 .addImm(16);
    Et = MBB.insert(Et, LDRXpost);
  }

  auto RASignCondition = FI->getSignReturnAddressCondition();
  bool ShouldSignReturnAddr = AArch64FunctionInfo::shouldSignReturnAddress(
      RASignCondition, !IsLeafFunction);

  // If this is a tail call outlined function, then there's already a return.
  if (OF.FrameConstructionID == MachineOutlinerTailCall ||
      OF.FrameConstructionID == MachineOutlinerThunk) {
    signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
    return;
  }

  // It's not a tail call, so we have to insert the return ourselves.

  // LR has to be a live in so that we can return to it.
  if (!MBB.isLiveIn(AArch64::LR))
    MBB.addLiveIn(AArch64::LR);

  MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
                          .addReg(AArch64::LR);
  MBB.insert(MBB.end(), ret);

  signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);

  FI->setOutliningStyle("Function");

  // Did we have to modify the stack by saving the link register?
  if (OF.FrameConstructionID != MachineOutlinerDefault)
    return;

  // We modified the stack.
  // Walk over the basic block and fix up all the stack accesses.
  fixupPostOutline(MBB);
}
10880
// Insert a call to the outlined function (named after MF) at It in MBB,
// choosing one of three strategies based on the candidate's
// CallConstructionID: a tail call, a plain BL, or a BL with LR saved and
// restored around it. Returns an iterator to the inserted call.
// NOTE(review): the parameter-list lines (10882-10883) were lost in this
// extract; presumably (Module &M, MachineBasicBlock &MBB,
// MachineBasicBlock::iterator &It, MachineFunction &MF,
// outliner::Candidate &C) — confirm against upstream.
10881MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
10884
10885 // Are we tail calling?
10886 if (C.CallConstructionID == MachineOutlinerTailCall) {
10887 // If yes, then we can just branch to the label.
10888 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
10889 .addGlobalAddress(M.getNamedValue(MF.getName()))
10890 .addImm(0));
10891 return It;
10892 }
10893
10894 // Are we saving the link register?
10895 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
10896 C.CallConstructionID == MachineOutlinerThunk) {
10897 // No, so just insert the call.
10898 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
10899 .addGlobalAddress(M.getNamedValue(MF.getName())));
10900 return It;
10901 }
10902
10903 // We want to return the spot where we inserted the call.
// NOTE(review): line 10904 was lost in this extract; presumably it declares
// CallPt (MachineBasicBlock::iterator), assigned below — confirm upstream.
10905
10906 // Instructions for saving and restoring LR around the call instruction we're
10907 // going to insert.
10908 MachineInstr *Save;
10909 MachineInstr *Restore;
10910 // Can we save to a register?
10911 if (C.CallConstructionID == MachineOutlinerRegSave) {
10912 // FIXME: This logic should be sunk into a target-specific interface so that
10913 // we don't have to recompute the register.
10914 Register Reg = findRegisterToSaveLRTo(C);
10915 assert(Reg && "No callee-saved register available?");
10916
10917 // LR has to be a live in so that we can save it.
10918 if (!MBB.isLiveIn(AArch64::LR))
10919 MBB.addLiveIn(AArch64::LR);
10920
10921 // Save and restore LR from Reg.
// ORRXrs Reg, XZR, LR, #0 is the canonical "MOV Reg, LR" encoding.
10922 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
10923 .addReg(AArch64::XZR)
10924 .addReg(AArch64::LR)
10925 .addImm(0);
10926 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
10927 .addReg(AArch64::XZR)
10928 .addReg(Reg)
10929 .addImm(0);
10930 } else {
10931 // We have the default case. Save and restore from SP.
// STR LR, [SP, #-16]! / LDR LR, [SP], #16 — keeps SP 16-byte aligned.
10932 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
10933 .addReg(AArch64::SP, RegState::Define)
10934 .addReg(AArch64::LR)
10935 .addReg(AArch64::SP)
10936 .addImm(-16);
10937 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
10938 .addReg(AArch64::SP, RegState::Define)
10939 .addReg(AArch64::LR, RegState::Define)
10940 .addReg(AArch64::SP)
10941 .addImm(16);
10942 }
10943
// Emit the sequence: Save, BL, Restore — returning the BL's position.
10944 It = MBB.insert(It, Save);
10945 It++;
10946
10947 // Insert the call.
10948 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
10949 .addGlobalAddress(M.getNamedValue(MF.getName())));
10950 CallPt = It;
10951 It++;
10952
10953 It = MBB.insert(It, Restore);
10954 return CallPt;
10955}
10956
10957bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
10958 MachineFunction &MF) const {
10959 return MF.getFunction().hasMinSize();
10960}
10961
// Emit instruction(s) that set \p Reg to zero, choosing an encoding based on
// the register class and the available vector extensions.
// NOTE(review): the line declaring the insertion-point parameter (10963,
// presumably `MachineBasicBlock::iterator Iter,` given its use below) was
// lost in this extract — confirm against upstream. AllowSideEffects is not
// consulted in the visible body.
10962void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
10964 DebugLoc &DL,
10965 bool AllowSideEffects) const {
10966 const MachineFunction &MF = *MBB.getParent();
10967 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
10968 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
10969
// GPR: MOVZ Reg, #0. FPR/vector: prefer an SVE DUP #0, then a NEON MOVI #0.
10970 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
10971 BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
10972 } else if (STI.isSVEorStreamingSVEAvailable()) {
10973 BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
10974 .addImm(0)
10975 .addImm(0);
10976 } else if (STI.isNeonAvailable()) {
10977 BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
10978 .addImm(0);
10979 } else {
10980 // This is a streaming-compatible function without SVE. We don't have full
10981 // Neon (just FPRs), so we can at most use the first 64-bit sub-register.
10982 // So given `movi v..` would be illegal use `fmov d..` instead.
10983 assert(STI.hasNEON() && "Expected to have NEON.");
10984 Register Reg64 = TRI.getSubReg(Reg, AArch64::dsub);
10985 BuildMI(MBB, Iter, DL, get(AArch64::FMOVD0), Reg64);
10986 }
10987}
10988
// Recognize ORR-with-zero-register forms as register moves (the assembler
// alias for MOV) and return their destination/source operand pair.
// NOTE(review): the signature line (10990) was lost in this extract;
// presumably `AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI)
// const {` — confirm against upstream.
10989std::optional<DestSourcePair>
10991
10992 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
10993 // and zero immediate operands used as an alias for mov instruction.
10994 if (((MI.getOpcode() == AArch64::ORRWrs &&
10995 MI.getOperand(1).getReg() == AArch64::WZR &&
10996 MI.getOperand(3).getImm() == 0x0) ||
10997 (MI.getOpcode() == AArch64::ORRWrr &&
10998 MI.getOperand(1).getReg() == AArch64::WZR)) &&
10999 // Check that the w->w move is not a zero-extending w->x mov.
11000 (!MI.getOperand(0).getReg().isVirtual() ||
11001 MI.getOperand(0).getSubReg() == 0) &&
// For a physical W destination, reject the move if the instruction also
// defines (e.g. implicitly) the corresponding X register — that would be a
// zero-extending move rather than a plain copy.
11002 (!MI.getOperand(0).getReg().isPhysical() ||
11003 MI.findRegisterDefOperandIdx(getXRegFromWReg(MI.getOperand(0).getReg()),
11004 /*TRI=*/nullptr) == -1))
11005 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11006
11007 if (MI.getOpcode() == AArch64::ORRXrs &&
11008 MI.getOperand(1).getReg() == AArch64::XZR &&
11009 MI.getOperand(3).getImm() == 0x0)
11010 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11011
11012 return std::nullopt;
11013}
11014
// Like the strict copy check above, this recognizes ORRW{rs,rr} with a WZR
// source as a move — but without the w->x zero-extension guard, so
// zero-extending W moves are also reported as copy-like.
// NOTE(review): the signature line (11016) was lost in this extract;
// presumably `AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI)
// const {` — confirm against upstream.
11015std::optional<DestSourcePair>
11017 if ((MI.getOpcode() == AArch64::ORRWrs &&
11018 MI.getOperand(1).getReg() == AArch64::WZR &&
11019 MI.getOperand(3).getImm() == 0x0) ||
11020 (MI.getOpcode() == AArch64::ORRWrr &&
11021 MI.getOperand(1).getReg() == AArch64::WZR))
11022 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11023 return std::nullopt;
11024}
11025
11026std::optional<RegImmPair>
11027AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
11028 int Sign = 1;
11029 int64_t Offset = 0;
11030
11031 // TODO: Handle cases where Reg is a super- or sub-register of the
11032 // destination register.
11033 const MachineOperand &Op0 = MI.getOperand(0);
11034 if (!Op0.isReg() || Reg != Op0.getReg())
11035 return std::nullopt;
11036
11037 switch (MI.getOpcode()) {
11038 default:
11039 return std::nullopt;
11040 case AArch64::SUBWri:
11041 case AArch64::SUBXri:
11042 case AArch64::SUBSWri:
11043 case AArch64::SUBSXri:
11044 Sign *= -1;
11045 [[fallthrough]];
11046 case AArch64::ADDSWri:
11047 case AArch64::ADDSXri:
11048 case AArch64::ADDWri:
11049 case AArch64::ADDXri: {
11050 // TODO: Third operand can be global address (usually some string).
11051 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
11052 !MI.getOperand(2).isImm())
11053 return std::nullopt;
11054 int Shift = MI.getOperand(3).getImm();
11055 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
11056 Offset = Sign * (MI.getOperand(2).getImm() << Shift);
11057 }
11058 }
11059 return RegImmPair{MI.getOperand(1).getReg(), Offset};
11060}
11061
11062/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
11063/// the destination register then, if possible, describe the value in terms of
11064/// the source register.
// NOTE(review): the signature line (11066) was lost in this extract;
// presumably `describeORRLoadedValue(const MachineInstr &MI,
// Register DescribedReg,` given the caller below passes (MI, Reg, this,
// TRI) — confirm against upstream.
11065static std::optional<ParamLoadedValue>
11067 const TargetInstrInfo *TII,
11068 const TargetRegisterInfo *TRI) {
// The copy-LIKE query is used deliberately, so zero-extending ORRWrs moves
// are included as well as strict copies.
11069 auto DestSrc = TII->isCopyLikeInstr(MI);
11070 if (!DestSrc)
11071 return std::nullopt;
11072
11073 Register DestReg = DestSrc->Destination->getReg();
11074 Register SrcReg = DestSrc->Source->getReg();
11075
11076 if (!DestReg.isValid() || !SrcReg.isValid())
11077 return std::nullopt;
11078
// Empty DIExpression: the described value is the source register itself.
11079 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
11080
11081 // If the described register is the destination, just return the source.
11082 if (DestReg == DescribedReg)
11083 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
11084
11085 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
11086 if (MI.getOpcode() == AArch64::ORRWrs &&
11087 TRI->isSuperRegister(DestReg, DescribedReg))
11088 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
11089
11090 // We may need to describe the lower part of a ORRXrs move.
11091 if (MI.getOpcode() == AArch64::ORRXrs &&
11092 TRI->isSubRegister(DestReg, DescribedReg)) {
11093 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
11094 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
11095 }
11096
11097 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
11098 "Unhandled ORR[XW]rs copy case");
11099
11100 return std::nullopt;
11101}
11102
11103bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
11104 // Functions cannot be split to different sections on AArch64 if they have
11105 // a red zone. This is because relaxing a cross-section branch may require
11106 // incrementing the stack pointer to spill a register, which would overwrite
11107 // the red zone.
// Note: hasRedZone() is an optional; value_or(true) conservatively treats an
// unknown red-zone status as "has a red zone" and refuses the split.
11108 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
11109 return false;
11110
// NOTE(review): line 11111 was lost in this extract; presumably it defers to
// `return TargetInstrInfo::isFunctionSafeToSplit(MF);` — confirm upstream.
11112}
11113
11114bool AArch64InstrInfo::isMBBSafeToSplitToCold(
11115 const MachineBasicBlock &MBB) const {
11116 // Asm Goto blocks can contain conditional branches to goto labels, which can
11117 // get moved out of range of the branch instruction.
11118 auto isAsmGoto = [](const MachineInstr &MI) {
11119 return MI.getOpcode() == AArch64::INLINEASM_BR;
11120 };
11121 if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
11122 return false;
11123
11124 // Because jump tables are label-relative instead of table-relative, they all
11125 // must be in the same section or relocation fixup handling will fail.
11126
11127 // Check if MBB is a jump table target
11128 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
11129 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
11130 return llvm::is_contained(JTE.MBBs, &MBB);
11131 };
11132 if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
11133 return false;
11134
11135 // Check if MBB contains a jump table lookup
11136 for (const MachineInstr &MI : MBB) {
11137 switch (MI.getOpcode()) {
11138 case TargetOpcode::G_BRJT:
11139 case AArch64::JumpTableDest32:
11140 case AArch64::JumpTableDest16:
11141 case AArch64::JumpTableDest8:
11142 return false;
11143 default:
11144 continue;
11145 }
11146 }
11147
11148 // MBB isn't a special case, so it's safe to be split to the cold section.
11149 return true;
11150}
11151
// Describe the value loaded into \p Reg by \p MI for debug-info parameter
// tracking: MOVZ immediates are folded to a constant; ORR register moves are
// described via describeORRLoadedValue above.
11152std::optional<ParamLoadedValue>
11153AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
11154 Register Reg) const {
11155 const MachineFunction *MF = MI.getMF();
11156 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
11157 switch (MI.getOpcode()) {
11158 case AArch64::MOVZWi:
11159 case AArch64::MOVZXi: {
11160 // MOVZWi may be used for producing zero-extended 32-bit immediates in
11161 // 64-bit parameters, so we need to consider super-registers.
11162 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
11163 return std::nullopt;
11164
11165 if (!MI.getOperand(1).isImm())
11166 return std::nullopt;
// The loaded constant is the 16-bit immediate shifted into position.
11167 int64_t Immediate = MI.getOperand(1).getImm();
11168 int Shift = MI.getOperand(2).getImm();
11169 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
11170 nullptr);
11171 }
11172 case AArch64::ORRWrs:
11173 case AArch64::ORRXrs:
11174 return describeORRLoadedValue(MI, Reg, this, TRI);
11175 }
11176
// NOTE(review): line 11177 was lost in this extract; presumably it falls
// back to `return TargetInstrInfo::describeLoadedValue(MI, Reg);` — confirm
// against upstream.
11178}
11179
11180bool AArch64InstrInfo::isExtendLikelyToBeFolded(
11181 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
11182 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
11183 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
11184 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
11185
11186 // Anyexts are nops.
11187 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
11188 return true;
11189
11190 Register DefReg = ExtMI.getOperand(0).getReg();
11191 if (!MRI.hasOneNonDBGUse(DefReg))
11192 return false;
11193
11194 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
11195 // addressing mode.
11196 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
11197 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
11198}
11199
11200uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
11201 return get(Opc).TSFlags & AArch64::ElementSizeMask;
11202}
11203
11204bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
11205 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
11206}
11207
11208bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
11209 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
11210}
11211
11212unsigned int
11213AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
11214 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
11215}
11216
11217bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
11218 unsigned Scale) const {
11219 if (Offset && Scale)
11220 return false;
11221
11222 // Check Reg + Imm
11223 if (!Scale) {
11224 // 9-bit signed offset
11225 if (isInt<9>(Offset))
11226 return true;
11227
11228 // 12-bit unsigned offset
11229 unsigned Shift = Log2_64(NumBytes);
11230 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
11231 // Must be a multiple of NumBytes (NumBytes is a power of 2)
11232 (Offset >> Shift) << Shift == Offset)
11233 return true;
11234 return false;
11235 }
11236
11237 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
11238 return Scale == 1 || (Scale > 0 && Scale == NumBytes);
11239}
11240
// Pick the indirect-call opcode: BLRNoIP when SLS BLR hardening is enabled
// for this function, plain BLR otherwise.
// NOTE(review): the signature line (11241) was lost in this extract;
// presumably `unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {`
// — confirm against upstream.
11242 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
11243 return AArch64::BLRNoIP;
11244 else
11245 return AArch64::BLR;
11246}
11247
// Emit a stack-probing loop that lowers SP down to TargetReg in ProbeSize
// steps, loading from [SP] after each step so every intervening region is
// touched, then sets SP = TargetReg and probes once more. Returns an
// iterator at the start of the exit block.
// NOTE(review): the first signature lines (11248-11249) were lost in this
// extract; presumably `MachineBasicBlock::iterator
// AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,` —
// confirm against upstream.
11250 Register TargetReg, bool FrameSetup) const {
11251 assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
11252
11253 MachineBasicBlock &MBB = *MBBI->getParent();
11254 MachineFunction &MF = *MBB.getParent();
11255 const AArch64InstrInfo *TII =
11256 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
11257 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
11258 DebugLoc DL = MBB.findDebugLoc(MBBI);
11259
// Split the control flow into: MBB -> LoopTest <-> LoopBody, LoopTest -> Exit.
11260 MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
11261 MachineBasicBlock *LoopTestMBB =
11262 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11263 MF.insert(MBBInsertPoint, LoopTestMBB);
11264 MachineBasicBlock *LoopBodyMBB =
11265 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11266 MF.insert(MBBInsertPoint, LoopBodyMBB);
11267 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11268 MF.insert(MBBInsertPoint, ExitMBB);
// NOTE(review): the initializer line (11270) was lost in this extract;
// presumably it selects FrameSetup ? MachineInstr::FrameSetup :
// MachineInstr::NoFlags — confirm against upstream.
11269 MachineInstr::MIFlag Flags =
11271
11272 // LoopTest:
11273 // SUB SP, SP, #ProbeSize
11274 emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
11275 AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
11276
11277 // CMP SP, TargetReg
11278 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
11279 AArch64::XZR)
11280 .addReg(AArch64::SP)
11281 .addReg(TargetReg)
// NOTE(review): line 11282 (an extend/shift immediate operand) was lost in
// this extract — confirm against upstream.
11283 .setMIFlags(Flags);
11284
11285 // B.<Cond> LoopExit
11286 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
// NOTE(review): line 11287 (the condition-code operand) was lost in this
// extract — confirm against upstream.
11288 .addMBB(ExitMBB)
11289 .setMIFlags(Flags);
11290
11291 // LDR XZR, [SP]
11292 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::LDRXui))
11293 .addDef(AArch64::XZR)
11294 .addReg(AArch64::SP)
11295 .addImm(0)
// NOTE(review): lines 11296-11298 (presumably an .addMemOperand(...) with a
// volatile machine mem operand) were lost in this extract — confirm upstream.
11299 Align(8)))
11300 .setMIFlags(Flags);
11301
11302 // B loop
11303 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
11304 .addMBB(LoopTestMBB)
11305 .setMIFlags(Flags);
11306
11307 // LoopExit:
11308 // MOV SP, TargetReg
11309 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
11310 .addReg(TargetReg)
11311 .addImm(0)
// NOTE(review): line 11312 (presumably a shift immediate operand) was lost
// in this extract — confirm against upstream.
11313 .setMIFlags(Flags);
11314
11315 // LDR XZR, [SP]
11316 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
11317 .addReg(AArch64::XZR, RegState::Define)
11318 .addReg(AArch64::SP)
11319 .addImm(0)
11320 .setMIFlags(Flags);
11321
// Move the rest of the original block after the probe sequence.
11322 ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
// NOTE(review): line 11323 (presumably
// `ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);`) was lost in this
// extract — confirm against upstream.
11324
11325 LoopTestMBB->addSuccessor(ExitMBB);
11326 LoopTestMBB->addSuccessor(LoopBodyMBB);
11327 LoopBodyMBB->addSuccessor(LoopTestMBB);
11328 MBB.addSuccessor(LoopTestMBB);
11329
11330 // Update liveins.
11331 if (MF.getRegInfo().reservedRegsFrozen())
11332 fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});
11333
11334 return ExitMBB->begin();
11335}
11336
11337namespace {
/// Software-pipelining loop info for a single-block AArch64 loop: records the
/// compare/branch pair that controls the loop and the instruction that
/// updates the induction variable, so extra-iteration conditions can be
/// synthesized by the pipeliner.
// NOTE(review): member declaration lines 11342 and 11362 were lost in this
// extract; the constructor's init list below implies they declare MRI
// (initialized from MF->getRegInfo()) and Cond (a MachineOperand container
// copied from the ctor argument) — confirm against upstream. Line 11369,
// the final constructor parameter (the incoming Cond), is also missing.
11338class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
11339 MachineFunction *MF;
11340 const TargetInstrInfo *TII;
11341 const TargetRegisterInfo *TRI;
11343
11344 /// The block of the loop
11345 MachineBasicBlock *LoopBB;
11346 /// The conditional branch of the loop
11347 MachineInstr *CondBranch;
11348 /// The compare instruction for loop control
11349 MachineInstr *Comp;
11350 /// The number of the operand of the loop counter value in Comp
11351 unsigned CompCounterOprNum;
11352 /// The instruction that updates the loop counter value
11353 MachineInstr *Update;
11354 /// The number of the operand of the loop counter value in Update
11355 unsigned UpdateCounterOprNum;
11356 /// The initial value of the loop counter
11357 Register Init;
11358 /// True iff Update is a predecessor of Comp
11359 bool IsUpdatePriorComp;
11360
11361 /// The normalized condition used by createTripCountGreaterCondition()
11363
11364public:
11365 AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
11366 MachineInstr *Comp, unsigned CompCounterOprNum,
11367 MachineInstr *Update, unsigned UpdateCounterOprNum,
11368 Register Init, bool IsUpdatePriorComp,
11370 : MF(Comp->getParent()->getParent()),
11371 TII(MF->getSubtarget().getInstrInfo()),
11372 TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
11373 LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
11374 CompCounterOprNum(CompCounterOprNum), Update(Update),
11375 UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
11376 IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
11377
11378 bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
11379 // Make the instructions for loop control be placed in stage 0.
11380 // The predecessors of Comp are considered by the caller.
11381 return MI == Comp;
11382 }
11383
11384 std::optional<bool> createTripCountGreaterCondition(
11385 int TC, MachineBasicBlock &MBB,
11386 SmallVectorImpl<MachineOperand> &CondParam) override {
11387 // A branch instruction will be inserted as "if (Cond) goto epilogue".
11388 // Cond is normalized for such use.
11389 // The predecessors of the branch are assumed to have already been inserted.
11390 CondParam = Cond;
11391 return {};
11392 }
11393
11394 void createRemainingIterationsGreaterCondition(
11395 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11396 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
11397
11398 void setPreheader(MachineBasicBlock *NewPreheader) override {}
11399
11400 void adjustTripCount(int TripCountAdjust) override {}
11401
11402 bool isMVEExpanderSupported() override { return true; }
11403};
11404} // namespace
11405
11406/// Clone an instruction from MI. The register of ReplaceOprNum-th operand
11407/// is replaced by ReplaceReg. The output register is newly created.
11408/// The other operands are unchanged from MI.
11409static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
11410 Register ReplaceReg, MachineBasicBlock &MBB,
11411 MachineBasicBlock::iterator InsertTo) {
11412 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
11413 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
11414 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
11415 Register Result = 0;
11416 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
11417 if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
11418 Result = MRI.createVirtualRegister(
11419 MRI.getRegClass(NewMI->getOperand(0).getReg()));
11420 NewMI->getOperand(I).setReg(Result);
11421 } else if (I == ReplaceOprNum) {
11422 MRI.constrainRegClass(ReplaceReg, TII->getRegClass(NewMI->getDesc(), I));
11423 NewMI->getOperand(I).setReg(ReplaceReg);
11424 }
11425 }
11426 MBB.insert(InsertTo, NewMI);
11427 return Result;
11428}
11429
// Build NZCV-setting code that tests whether at least TC more iterations
// remain, by cloning the loop's Update/Comp instructions TC+1 times and
// accumulating the exit conditions via CSINC (see example below). The
// parameter lines (11431-11432) were lost in this extract; the override
// declaration in the class shows them as (int TC, MachineBasicBlock &MBB,
// SmallVectorImpl<MachineOperand> &Cond,
// DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts).
11430void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
11433 // Create and accumulate conditions for next TC iterations.
11434 // Example:
11435 // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
11436 // # iteration of the kernel
11437 //
11438 // # insert the following instructions
11439 // cond = CSINCXr 0, 0, C, implicit $nzcv
11440 // counter = ADDXri counter, 1 # clone from this->Update
11441 // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
11442 // cond = CSINCXr cond, cond, C, implicit $nzcv
11443 // ... (repeat TC times)
11444 // SUBSXri cond, 0, implicit-def $nzcv
11445
11446 assert(CondBranch->getOpcode() == AArch64::Bcc);
11447 // CondCode to exit the loop
// NOTE(review): line 11448 (the left-hand side of this initialization,
// presumably `AArch64CC::CondCode CC =`) was lost in this extract — confirm
// against upstream.
11449 (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
// If the branch jumps back into the loop, invert the code so CC always means
// "exit". NOTE(review): line 11451 (the inversion statement) was lost in
// this extract — confirm against upstream.
11450 if (CondBranch->getOperand(1).getMBB() == LoopBB)
11452
11453 // Accumulate conditions to exit the loop
11454 Register AccCond = AArch64::XZR;
11455
11456 // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
// NOTE(review): lines 11458 (second lambda parameter) and 11464 (the
// condition-code operand of the CSINC) were lost in this extract — confirm
// against upstream.
11457 auto AccumulateCond = [&](Register CurCond,
11459 Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
11460 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
11461 .addReg(NewCond, RegState::Define)
11462 .addReg(CurCond)
11463 .addReg(CurCond)
11465 return NewCond;
11466 };
11467
11468 if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
11469 // The Update and Comp for I==0 already exist in MBB
11470 // (MBB is an unrolled kernel)
11471 Register Counter;
11472 for (int I = 0; I <= TC; ++I) {
11473 Register NextCounter;
11474 if (I != 0)
11475 NextCounter =
11476 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11477
11478 AccCond = AccumulateCond(AccCond, CC);
11479
11480 if (I != TC) {
11481 if (I == 0) {
11482 if (Update != Comp && IsUpdatePriorComp) {
11483 Counter =
11484 LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11485 NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
11486 MBB.end());
11487 } else {
11488 // can use already calculated value
11489 NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
11490 }
11491 } else if (Update != Comp) {
11492 NextCounter =
11493 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11494 }
11495 }
11496 Counter = NextCounter;
11497 }
11498 } else {
11499 Register Counter;
11500 if (LastStage0Insts.empty()) {
11501 // use initial counter value (testing if the trip count is sufficient to
11502 // be executed by pipelined code)
11503 Counter = Init;
11504 if (IsUpdatePriorComp)
11505 Counter =
11506 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11507 } else {
11508 // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
11509 Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11510 }
11511
11512 for (int I = 0; I <= TC; ++I) {
11513 Register NextCounter;
11514 NextCounter =
11515 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11516 AccCond = AccumulateCond(AccCond, CC);
11517 if (I != TC && Update != Comp)
11518 NextCounter =
11519 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11520 Counter = NextCounter;
11521 }
11522 }
11523
11524 // If AccCond == 0, the remainder is greater than TC.
11525 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
11526 .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
11527 .addReg(AccCond)
11528 .addImm(0)
11529 .addImm(0);
// NOTE(review): line 11531 (presumably pushing the resulting condition
// operand(s) into Cond after the clear) was lost in this extract — confirm
// against upstream.
11530 Cond.clear();
11532}
11533
11534static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
11535 Register &RegMBB, Register &RegOther) {
11536 assert(Phi.getNumOperands() == 5);
11537 if (Phi.getOperand(2).getMBB() == MBB) {
11538 RegMBB = Phi.getOperand(1).getReg();
11539 RegOther = Phi.getOperand(3).getReg();
11540 } else {
11541 assert(Phi.getOperand(4).getMBB() == MBB);
11542 RegMBB = Phi.getOperand(3).getReg();
11543 RegOther = Phi.getOperand(1).getReg();
11544 }
11545}
11546
// True if the virtual register's defining instruction lives outside BB
// (i.e. the value is loop-invariant w.r.t. BB); physical registers are
// conservatively reported as not defined outside.
// NOTE(review): the signature line (11547) was lost in this extract;
// presumably `static bool isDefinedOutside(Register Reg, const
// MachineBasicBlock *BB) {` given the call sites pass (reg, LoopBB) —
// confirm against upstream.
11548 if (!Reg.isVirtual())
11549 return false;
11550 const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11551 return MRI.getVRegDef(Reg)->getParent() != BB;
11552}
11553
11554/// If Reg is an induction variable, return true and set some parameters
// On success: UpdateInst is the single add/sub that advances the counter,
// UpdateCounterOprNum is the index of its counter operand, InitReg is the
// PHI's out-of-loop input, and IsUpdatePriorComp tells whether the update
// executes before the compare's operand is produced.
11555static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
11556 MachineInstr *&UpdateInst,
11557 unsigned &UpdateCounterOprNum, Register &InitReg,
11558 bool &IsUpdatePriorComp) {
11559 // Example:
11560 //
11561 // Preheader:
11562 // InitReg = ...
11563 // LoopBB:
11564 // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
11565 // Reg = COPY Reg0 ; COPY is ignored.
11566 // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
11567 // ; Reg is the value calculated in the previous
11568 // ; iteration, so IsUpdatePriorComp == false.
11569
11570 if (LoopBB->pred_size() != 2)
11571 return false;
11572 if (!Reg.isVirtual())
11573 return false;
11574 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
11575 UpdateInst = nullptr;
11576 UpdateCounterOprNum = 0;
11577 InitReg = 0;
11578 IsUpdatePriorComp = true;
// Walk the def chain starting at Reg, through copies and the loop PHI,
// until we come back around to Reg — the cycle must contain exactly one
// PHI and exactly one add/sub (the counter update).
11579 Register CurReg = Reg;
11580 while (true) {
11581 MachineInstr *Def = MRI.getVRegDef(CurReg);
11582 if (Def->getParent() != LoopBB)
11583 return false;
11584 if (Def->isCopy()) {
11585 // Ignore copy instructions unless they contain subregisters
11586 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
11587 return false;
11588 CurReg = Def->getOperand(1).getReg();
11589 } else if (Def->isPHI()) {
// Only one PHI is allowed on the cycle. If it is reached before the
// update, Reg carries the previous iteration's value.
11590 if (InitReg != 0)
11591 return false;
11592 if (!UpdateInst)
11593 IsUpdatePriorComp = false;
11594 extractPhiReg(*Def, LoopBB, CurReg, InitReg);
11595 } else {
11596 if (UpdateInst)
11597 return false;
11598 switch (Def->getOpcode()) {
// Immediate-form updates: the counter is always operand 1.
11599 case AArch64::ADDSXri:
11600 case AArch64::ADDSWri:
11601 case AArch64::SUBSXri:
11602 case AArch64::SUBSWri:
11603 case AArch64::ADDXri:
11604 case AArch64::ADDWri:
11605 case AArch64::SUBXri:
11606 case AArch64::SUBWri:
11607 UpdateInst = Def;
11608 UpdateCounterOprNum = 1;
11609 break;
// Register-form updates: the counter is whichever source operand is
// defined inside the loop (the other must be loop-invariant).
11610 case AArch64::ADDSXrr:
11611 case AArch64::ADDSWrr:
11612 case AArch64::SUBSXrr:
11613 case AArch64::SUBSWrr:
11614 case AArch64::ADDXrr:
11615 case AArch64::ADDWrr:
11616 case AArch64::SUBXrr:
11617 case AArch64::SUBWrr:
11618 UpdateInst = Def;
11619 if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
11620 UpdateCounterOprNum = 1;
11621 else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
11622 UpdateCounterOprNum = 2;
11623 else
11624 return false;
11625 break;
11626 default:
11627 return false;
11628 }
11629 CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
11630 }
11631
11632 if (!CurReg.isVirtual())
11633 return false;
11634 if (Reg == CurReg)
11635 break;
11636 }
11637
// The cycle must include the counter update instruction.
11638 if (!UpdateInst)
11639 return false;
11640
11641 return true;
11642}
11643
// Analyze a candidate single-block loop for software pipelining and, if it
// matches the supported shape, return the PipelinerLoopInfo describing it.
// NOTE(review): the signature line (11645) was lost in this extract;
// presumably `AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock
// *LoopBB) const {` — confirm against upstream.
11644std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
11646 // Accept loops that meet the following conditions
11647 // * The conditional branch is BCC
11648 // * The compare instruction is ADDS/SUBS/WHILEXX
11649 // * One operand of the compare is an induction variable and the other is a
11650 // loop invariant value
11651 // * The induction variable is incremented/decremented by a single instruction
11652 // * Does not contain CALL or instructions which have unmodeled side effects
11653
11654 for (MachineInstr &MI : *LoopBB)
11655 if (MI.isCall() || MI.hasUnmodeledSideEffects())
11656 // This instruction may use NZCV, which interferes with the instruction to
11657 // be inserted for loop control.
11658 return nullptr;
11659
11660 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
// NOTE(review): line 11661 (presumably declaring the Cond operand vector
// passed to analyzeBranch and forwarded to the loop info below) was lost in
// this extract — confirm against upstream.
11662 if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
11663 return nullptr;
11664
11665 // Infinite loops are not supported
11666 if (TBB == LoopBB && FBB == LoopBB)
11667 return nullptr;
11668
11669 // Must be conditional branch
11670 if (TBB != LoopBB && FBB == nullptr)
11671 return nullptr;
11672
11673 assert((TBB == LoopBB || FBB == LoopBB) &&
11674 "The Loop must be a single-basic-block loop");
11675
11676 MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
11678
11679 if (CondBranch->getOpcode() != AArch64::Bcc)
11680 return nullptr;
11681
11682 // Normalization for createTripCountGreaterCondition()
// NOTE(review): line 11684 (presumably the statement that inverts Cond when
// the taken branch stays in the loop) was lost in this extract — confirm
// against upstream.
11683 if (TBB == LoopBB)
11685
// Find the last NZCV-defining instruction: it must be the loop-control
// compare, and one of its operands must be loop-invariant.
11686 MachineInstr *Comp = nullptr;
11687 unsigned CompCounterOprNum = 0;
11688 for (MachineInstr &MI : reverse(*LoopBB)) {
11689 if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
11690 // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
11691 // operands is a loop invariant value
11692
11693 switch (MI.getOpcode()) {
11694 case AArch64::SUBSXri:
11695 case AArch64::SUBSWri:
11696 case AArch64::ADDSXri:
11697 case AArch64::ADDSWri:
11698 Comp = &MI;
11699 CompCounterOprNum = 1;
11700 break;
11701 case AArch64::ADDSWrr:
11702 case AArch64::ADDSXrr:
11703 case AArch64::SUBSWrr:
11704 case AArch64::SUBSXrr:
11705 Comp = &MI;
11706 break;
11707 default:
11708 if (isWhileOpcode(MI.getOpcode())) {
11709 Comp = &MI;
11710 break;
11711 }
11712 return nullptr;
11713 }
11714
// For register-register compares, pick the counter operand as the one NOT
// defined outside the loop.
11715 if (CompCounterOprNum == 0) {
11716 if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
11717 CompCounterOprNum = 2;
11718 else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
11719 CompCounterOprNum = 1;
11720 else
11721 return nullptr;
11722 }
11723 break;
11724 }
11725 }
11726 if (!Comp)
11727 return nullptr;
11728
11729 MachineInstr *Update = nullptr;
11730 Register Init;
11731 bool IsUpdatePriorComp;
11732 unsigned UpdateCounterOprNum;
11733 if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
11734 Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
11735 return nullptr;
11736
11737 return std::make_unique<AArch64PipelinerLoopInfo>(
11738 LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
11739 Init, IsUpdatePriorComp, Cond);
11740}
11741
11742/// verifyInstruction - Perform target specific instruction verification.
// Returns false (with ErrInfo set) on the first violation found; true when
// the instruction passes all checks.
11743bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI,
11744 StringRef &ErrInfo) const {
11745 // Verify that immediate offsets on load/store instructions are within range.
11746 // Stack objects with an FI operand are excluded as they can be fixed up
11747 // during PEI.
11748 TypeSize Scale(0U, false), Width(0U, false);
11749 int64_t MinOffset, MaxOffset;
11750 if (getMemOpInfo(MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) {
11751 unsigned ImmIdx = getLoadStoreImmIdx(MI.getOpcode());
11752 if (MI.getOperand(ImmIdx).isImm() && !MI.getOperand(ImmIdx - 1).isFI()) {
11753 int64_t Imm = MI.getOperand(ImmIdx).getImm();
11754 if (Imm < MinOffset || Imm > MaxOffset) {
11755 ErrInfo = "Unexpected immediate on load/store instruction";
11756 return false;
11757 }
11758 }
11759 }
11760
// Validate per-operand constraints declared in the instruction description.
11761 const MCInstrDesc &MCID = MI.getDesc();
11762 for (unsigned Op = 0; Op < MCID.getNumOperands(); Op++) {
11763 const MachineOperand &MO = MI.getOperand(Op);
11764 switch (MCID.operands()[Op].OperandType) {
// NOTE(review): the case label on line 11765 was lost in this extract;
// from the error string it is presumably
// `case AArch64::OPERAND_IMPLICIT_IMM_0:` — confirm against upstream.
11766 if (!MO.isImm() || MO.getImm() != 0) {
11767 ErrInfo = "OPERAND_IMPLICIT_IMM_0 should be 0";
11768 return false;
11769 }
11770 break;
// NOTE(review): lines 11771 and 11773 were lost in this extract; from the
// error string they are presumably `case AArch64::OPERAND_SHIFT_MSL:` and a
// check that the shift type is MSL — confirm against upstream.
11772 if (!MO.isImm() ||
11774 (AArch64_AM::getShiftValue(MO.getImm()) != 8 &&
11775 AArch64_AM::getShiftValue(MO.getImm()) != 16)) {
11776 ErrInfo = "OPERAND_SHIFT_MSL should be msl shift of 8 or 16";
11777 return false;
11778 }
11779 break;
11780 default:
11781 break;
11782 }
11783 }
11784 return true;
11785}
11786
11787#define GET_INSTRINFO_HELPERS
11788#define GET_INSTRMAP_INFO
11789#include "AArch64GenInstrInfo.inc"
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, unsigned NumRegs)
static cl::opt< unsigned > BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of Bcc instructions (DEBUG)"))
static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned MnegOpc, const TargetRegisterClass *RC)
genNeg - Helper to generate an intermediate negation of the second operand of Root
static bool isFrameStoreOpcode(int Opcode)
static cl::opt< unsigned > GatherOptSearchLimit("aarch64-search-limit", cl::Hidden, cl::init(2048), cl::desc("Restrict range of instructions to search for the " "machine-combiner gather pattern optimization"))
static bool getMaddPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find instructions that can be turned into madd.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr)
Find a condition code used by the instruction.
static MachineInstr * genFusedMultiplyAcc(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyAcc - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genFusedMultiplyAccNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate64(unsigned Opc)
static bool isFrameLoadOpcode(int Opcode)
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg)
static bool areCFlagsAccessedBetweenInstrs(MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, const TargetRegisterInfo *TRI, const AccessKind AccessToCheck=AK_All)
True when condition flags are accessed (either by writing or reading) on the instruction trace starti...
static bool getFMAPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Floating-Point Support.
static bool isADDSRegImm(unsigned Opcode)
static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI)
static bool isANDOpcode(MachineInstr &MI)
static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment, StringRef RegScale={})
static unsigned sForm(MachineInstr &Instr)
Get opcode of S version of Instr.
static bool isCombineInstrSettingFlag(unsigned Opc)
static bool getFNEGPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB, MachineInstr *&UpdateInst, unsigned &UpdateCounterOprNum, Register &InitReg, bool &IsUpdatePriorComp)
If Reg is an induction variable, return true and set some parameters.
static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc)
static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget, MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
Returns true if in a streaming call site region without SME-FA64.
static int findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr)
static bool isPostIndexLdStOpcode(unsigned Opcode)
Return true if the opcode is a post-index ld/st instruction, which really loads from base+0.
static unsigned getBranchDisplacementBits(unsigned Opc)
static cl::opt< unsigned > CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9), cl::desc("Restrict range of CB instructions (DEBUG)"))
static std::optional< ParamLoadedValue > describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
If the given ORR instruction is a copy, and DescribedReg overlaps with the destination register then,...
static bool getFMULPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static void appendReadRegExpr(SmallVectorImpl< char > &Expr, unsigned RegNum)
static MachineInstr * genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, const TargetRegisterClass *RC)
genMaddR - Generate madd instruction and combine mul and add using an extra virtual register Example ...
static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, Register ReplaceReg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertTo)
Clone an instruction from MI.
static bool scaleOffset(unsigned Opc, int64_t &Offset)
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc)
unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale)
static MachineInstr * genFusedMultiplyIdx(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyIdx - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genIndexedMultiply(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxDupOp, unsigned MulOpc, const TargetRegisterClass *RC, MachineRegisterInfo &MRI)
Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
static bool isSUBSRegImm(unsigned Opcode)
static bool UpdateOperandRegClass(MachineInstr &Instr)
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
static bool isInStreamingCallSiteRegion(MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
Returns true if the instruction at I is in a streaming call site region, within a single basic block.
static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, int CmpValue, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > &CCUseInstrs, bool &IsInvertCC)
unsigned unscaledOffsetOpcode(unsigned Opcode)
static bool getLoadPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Search for patterns of LD instructions we can optimize.
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI)
Check if CmpInstr can be substituted by MI.
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC)
static bool isCombineInstrCandidateFP(const MachineInstr &Inst)
static void appendLoadRegExpr(SmallVectorImpl< char > &Expr, int64_t OffsetFromDefCFA)
static void appendConstantExpr(SmallVectorImpl< char > &Expr, int64_t Constant, dwarf::LocationAtom Operation)
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI)
Return the opcode that does not set flags when possible - otherwise return the original opcode.
static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool isCombineInstrCandidate32(unsigned Opc)
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, SmallVectorImpl< MachineOperand > &Cond)
static unsigned offsetExtendOpcode(unsigned Opcode)
MachineOutlinerMBBFlags
@ LRUnavailableSomewhere
@ UnsafeRegsDead
static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register DestReg, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
static void generateGatherLanePattern(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned Pattern, unsigned NumLanes)
Generate optimized instruction sequence for gather load patterns to improve Memory-Level Parallelism ...
static bool getMiscPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find other MI combine patterns.
static bool outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static const MachineInstrBuilder & AddSubReg(const MachineInstrBuilder &MIB, MCRegister Reg, unsigned SubIdx, RegState State, const TargetRegisterInfo *TRI)
static bool outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, int64_t Offset1, unsigned Opcode1, int FI2, int64_t Offset2, unsigned Opcode2)
static cl::opt< unsigned > TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"))
static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB, Register &RegMBB, Register &RegOther)
static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, unsigned Reg, const StackOffset &Offset)
static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB)
static MachineInstr * genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC, FMAInstKind kind=FMAInstKind::Default, const Register *ReplacedAddend=nullptr)
genFusedMultiply - Generate fused multiply instructions.
static bool getGatherLanePattern(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, unsigned LoadLaneOpCode, unsigned NumLanes)
Check if the given instruction forms a gather load pattern that can be optimized for better Memory-Le...
static MachineInstr * genFusedMultiplyIdxNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate(unsigned Opc)
static unsigned regOffsetOpcode(unsigned Opcode)
MachineOutlinerClass
Constants defining how certain sequences should be outlined.
@ MachineOutlinerTailCall
Only emit a branch.
@ MachineOutlinerRegSave
Same as default, but save to a register.
@ MachineOutlinerNoLRSave
Emit a call and return.
@ MachineOutlinerThunk
Emit a call and tail-call.
@ MachineOutlinerDefault
Emit a save, restore, call, and return.
static cl::opt< unsigned > BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26), cl::desc("Restrict range of B instructions (DEBUG)"))
static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB)
Check if AArch64::NZCV should be alive in successors of MBB.
static void emitFrameOffsetAdj(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, int64_t Offset, unsigned Opc, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFAOffset, StackOffset CFAOffset, unsigned FrameReg)
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize)
static cl::opt< unsigned > CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"))
static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, unsigned IdxOpd1, DenseMap< Register, unsigned > &InstrIdxForVirtReg)
Do the following transformation A - (B + C) ==> (A - B) - C A - (B + C) ==> (A - C) - B.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, unsigned *NewReg=nullptr)
static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, const AArch64InstrInfo *TII, bool ShouldSignReturnAddr)
static MachineInstr * genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs)
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc, unsigned ZeroReg)
static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register SrcReg, bool IsKill, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
DXIL Forward Handle Accesses
@ Default
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Machine Check Debug Module
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
PowerPC Reduce CR logical Operation
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition Debug.h:72
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, unsigned CombineOpc=0)
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
SignReturnAddress getSignReturnAddressCondition() const
void setOutliningStyle(const std::string &Style)
std::optional< bool > hasRedZone() const
static bool shouldSignReturnAddress(SignReturnAddress Condition, bool IsLRSpilled)
static bool isHForm(const MachineInstr &MI)
Returns whether the instruction is in H form (16 bit operands)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool hasBTISemantics(const MachineInstr &MI)
Returns whether the instruction can be compatible with non-zero BTYPE.
static bool isQForm(const MachineInstr &MI)
Returns whether the instruction is in Q form (128 bit operands)
static bool getMemOpInfo(unsigned Opcode, TypeSize &Scale, TypeSize &Width, int64_t &MinOffset, int64_t &MaxOffset)
Returns true if opcode Opc is a memory operation.
static bool isTailCallReturnInst(const MachineInstr &MI)
Returns true if MI is one of the TCRETURN* instructions.
static bool isFPRCopy(const MachineInstr &MI)
Does this instruction rename an FPR without modifying bits?
MachineInstr * emitLdStWithAddr(MachineInstr &MemI, const ExtAddrMode &AM) const override
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies value from one register to an...
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
GetInstSize - Return the number of bytes of code the specified instruction may be.
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
static bool isGPRCopy(const MachineInstr &MI)
Does this instruction rename a GPR without modifying bits?
static unsigned convertToFlagSettingOpc(unsigned Opc)
Return the opcode that set flags when possible.
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
bool canInsertSelect(const MachineBasicBlock &, ArrayRef< MachineOperand > Cond, Register, Register, Register, int &, int &, int &) const override
Register isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
static const MachineOperand & getLdStOffsetOp(const MachineInstr &MI)
Returns the immediate offset operator of a load/store.
bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, unsigned &SubIdx) const override
static std::optional< unsigned > getUnscaledLdSt(unsigned Opc)
Returns the unscaled load/store for the scaled load/store opcode, if there is a corresponding unscale...
static bool hasUnscaledLdStOffset(unsigned Opc)
Return true if it has an unscaled load/store offset.
static const MachineOperand & getLdStAmountOp(const MachineInstr &MI)
Returns the shift amount operator of a load/store.
static bool isPreLdSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load/store.
std::optional< ExtAddrMode > getAddrModeFromMemoryOp(const MachineInstr &MemI, const TargetRegisterInfo *TRI) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &MI, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const override
bool analyzeBranchPredicate(MachineBasicBlock &MBB, MachineBranchPredicate &MBP, bool AllowModify) const override
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isPairableLdStInst(const MachineInstr &MI)
Return true if pairing the given load or store may be paired with another.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
const AArch64RegisterInfo & getRegisterInfo() const
getRegisterInfo - TargetInstrInfo is a superset of MRegister info.
static bool isPreSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed store.
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
AArch64InstrInfo(const AArch64Subtarget &STI)
static bool isPairedLdSt(const MachineInstr &MI)
Returns whether the instruction is a paired load/store.
bool getMemOperandWithOffsetWidth(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, TypeSize &Width, const TargetRegisterInfo *TRI) const
If OffsetIsScalable is set to 'true', the offset is scaled by vscale.
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isStridedAccess(const MachineInstr &MI)
Return true if the given load or store is a strided memory access.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
Detect opportunities for ldp/stp formation.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
bool isThroughputPattern(unsigned Pattern) const override
Return true when a code sequence can improve throughput.
MachineOperand & getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const
Return the immediate offset of the base register in a load/store LdSt.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg, const MachineInstr &AddrI, ExtAddrMode &AM) const override
static bool isLdStPairSuppressed(const MachineInstr &MI)
Return true if pairing the given load or store is hinted to be unprofitable.
Register isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
MachineBasicBlock::iterator probedStackAlloc(MachineBasicBlock::iterator MBBI, Register TargetReg, bool FrameSetup) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
optimizeCompareInstr - Convert the instruction supplying the argument to the comparison into one that...
static unsigned getLoadStoreImmIdx(unsigned Opc)
Returns the index for the immediate for a given instruction.
static bool isGPRZero(const MachineInstr &MI)
Does this instruction set its full destination register to zero?
void copyGPRRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, unsigned ZeroReg, llvm::ArrayRef< unsigned > Indices) const
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
analyzeCompare - For a comparison instruction, return the source registers in SrcReg and SrcReg2,...
CombinerObjective getCombinerObjective(unsigned Pattern) const override
static bool isFpOrNEON(Register Reg)
Returns whether the physical register is FP or NEON.
bool isAsCheapAsAMove(const MachineInstr &MI) const override
std::optional< DestSourcePair > isCopyLikeInstrImpl(const MachineInstr &MI) const override
static void suppressLdStPair(MachineInstr &MI)
Hint that pairing the given load or store is unprofitable.
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static bool isPreLd(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load.
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef< unsigned > Indices) const
bool optimizeCondBranch(MachineInstr &MI) const override
Replace csincr-branch sequence by simple conditional branch.
static int getMemScale(unsigned Opc)
Scaling factor for (scaled or unscaled) load or store.
bool isCandidateToMergeOrPair(const MachineInstr &MI) const
Return true if this is a load/store that can be potentially paired/merged.
MCInst getNop() const override
static const MachineOperand & getLdStBaseOp(const MachineInstr &MI)
Returns the base register operator of a load/store.
bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
This is an important base class in LLVM.
Definition Constant.h:43
A debug info location.
Definition DebugLoc.h:123
bool empty() const
Definition DenseMap.h:109
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:714
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:711
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
static LocationSize precise(uint64_t Value)
This class is intended to be used as a base class for asm properties and features specific to the tar...
Definition MCAsmInfo.h:64
bool usesWindowsCFI() const
Definition MCAsmInfo.h:655
static MCCFIInstruction cfiDefCfa(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa defines a rule for computing CFA as: take address from Register and add Offset to it.
Definition MCDwarf.h:576
static MCCFIInstruction createOffset(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_offset Previous value of Register is saved at offset Offset from CFA.
Definition MCDwarf.h:618
static MCCFIInstruction cfiDefCfaOffset(MCSymbol *L, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa_offset modifies a rule for computing CFA.
Definition MCDwarf.h:591
static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals, SMLoc Loc={}, StringRef Comment="")
.cfi_escape Allows the user to add arbitrary bytes to the unwind info.
Definition MCDwarf.h:688
Instances of this class represent a single low-level machine instruction.
Definition MCInst.h:188
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isValid() const
Definition MCRegister.h:84
static constexpr unsigned NoRegister
Definition MCRegister.h:60
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
Set of metadata that should be preserved when using BuildMI().
bool isInlineAsmBrIndirectTarget() const
Returns true if this is the indirect dest of an INLINEASM_BR.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
reverse_instr_iterator instr_rbegin()
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
reverse_instr_iterator instr_rend()
Instructions::iterator instr_iterator
Instructions::const_iterator const_instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
void setStackID(int ObjectIdx, uint8_t ID)
bool isCalleeSavedInfoValid() const
Has the callee saved info been calculated yet?
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
unsigned getNumObjects() const
Return the number of objects.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
unsigned addFrameInst(const MCCFIInstruction &Inst)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addCFIIndex(unsigned CFIIndex) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
reverse_iterator getReverse() const
Get a reverse iterator to the same node.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
bool isCall(QueryType Type=AnyInBundle) const
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
LLVM_ABI uint32_t mergeFlagsWith(const MachineInstr &Other) const
Return the MIFlags which represent both MachineInstrs.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool registerDefIsDead(Register Reg, const TargetRegisterInfo *TRI) const
Returns true if the register is dead in this machine instruction.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI bool isLoadFoldBarrier() const
Returns true if it is illegal to fold a load across this instruction.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void addRegisterDefined(Register Reg, const TargetRegisterInfo *RegInfo=nullptr)
We have determined MI defines a register.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
A description of a memory reference used in the backend.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
This class contains meta information specific to a module.
LLVM_ABI MachineFunction * getMachineFunction(const Function &F) const
Returns the MachineFunction associated to IR function F if there is one, otherwise nullptr.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineBasicBlock * getMBB() const
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool tracksLiveness() const
tracksLiveness - Returns true when tracking register liveness accurately.
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
MI-level patchpoint operands.
Definition StackMaps.h:77
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given patchpoint should emit.
Definition StackMaps.h:105
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static constexpr bool isVirtualRegister(unsigned Reg)
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:66
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents a location in source code.
Definition SMLoc.h:22
bool erase(PtrType Ptr)
Remove pointer from the set.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
bool empty() const
Definition SmallSet.h:169
bool erase(const T &V)
Definition SmallSet.h:200
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
MI-level stackmap operands.
Definition StackMaps.h:36
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given stackmap should emit.
Definition StackMaps.h:51
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
int64_t getFixed() const
Returns the fixed component of the stack.
Definition TypeSize.h:46
int64_t getScalable() const
Returns the scalable component of the stack.
Definition TypeSize.h:49
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:41
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
MI-level Statepoint operands.
Definition StackMaps.h:159
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given statepoint should emit.
Definition StackMaps.h:208
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Object returned by analyzeLoopForPipelining.
TargetInstrInfo - Interface to description of machine instruction set.
virtual void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstIdxForVirtReg) const
When getMachineCombinerPatterns() finds patterns, this function generates the instructions that could...
virtual std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const
Produce the expression describing the MI loading a value into the physical register Reg.
virtual bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
virtual bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const
Test if the given instruction should be considered a scheduling boundary.
virtual CombinerObjective getCombinerObjective(unsigned Pattern) const
Return the objective of a combiner pattern.
virtual bool isFunctionSafeToSplit(const MachineFunction &MF) const
Return true if the function is a viable candidate for machine function splitting.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
Value * getOperand(unsigned i) const
Definition User.h:207
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
self_iterator getIterator()
Definition ilist_node.h:123
A raw_ostream that writes to an std::string.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_S
MO_S - Indicates that the bits of the symbol operand represented by MO_G0 etc are signed.
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_PREL
MO_PREL - Indicates that the bits of the symbol operand represented by MO_G0 etc are PC relative.
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_ARM64EC_CALLMANGLE
MO_ARM64EC_CALLMANGLE - Operand refers to the Arm64EC-mangled version of a symbol,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_TAGGED
MO_TAGGED - With MO_PAGE, indicates that the page includes a memory tag in bits 56-63.
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
unsigned getCheckerSizeInBytes(AuthCheckMethod Method)
Returns the number of bytes added by checkAuthenticatedRegister.
static uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize)
decodeLogicalImmediate - Decode a logical immediate value in the form "N:immr:imms" (where the immr a...
static unsigned getShiftValue(unsigned Imm)
getShiftValue - Extract the shift value.
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static unsigned getArithShiftValue(unsigned Imm)
getArithShiftValue - get the arithmetic shift value.
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static AArch64_AM::ShiftExtendType getExtendType(unsigned Imm)
getExtendType - Extract the extend type for operands of arithmetic ops.
static AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm)
static AArch64_AM::ShiftExtendType getShiftType(unsigned Imm)
getShiftType - Extract the shift type.
static unsigned getShifterImm(AArch64_AM::ShiftExtendType ST, unsigned Imm)
getShifterImm - Encode the shift type and amount: imm: 6-bit shift amount shifter: 000 ==> lsl 001 ==...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
static const uint64_t InstrFlagIsWhile
static const uint64_t InstrFlagIsPTestLike
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
constexpr double e
InstrType
Represents how an instruction should be mapped by the outliner.
LLVM_ABI Instruction & back() const
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
static bool isCondBranchOpcode(int Opc)
MCCFIInstruction createDefCFA(const TargetRegisterInfo &TRI, unsigned FrameReg, unsigned Reg, const StackOffset &Offset, bool LastAdjustmentWasScalable=true)
static bool isPTrueOpcode(unsigned Opc)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool succeeded(LogicalResult Result)
Utility function that returns true if the provided LogicalResult corresponds to a success value.
int isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &Offset, bool *OutUseUnscaledOp=nullptr, unsigned *OutUnscaledOp=nullptr, int64_t *EmittableOffset=nullptr)
Check if the Offset is a valid frame offset for MI.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Renamable
Register that may be renamed.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
static bool isIndirectBranchOpcode(int Opc)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
unsigned getBLRCallOpcode(const MachineFunction &MF)
Return opcode to be used for indirect calls.
@ AArch64FrameOffsetIsLegal
Offset is legal.
@ AArch64FrameOffsetCanUpdate
Offset can apply, at least partly.
@ AArch64FrameOffsetCannotUpdate
Offset cannot apply.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Op::Description Desc
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
static bool isSEHInstruction(const MachineInstr &MI)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
AArch64MachineCombinerPattern
@ MULSUBv8i16_OP2
@ FMULv4i16_indexed_OP1
@ FMLSv1i32_indexed_OP2
@ MULSUBv2i32_indexed_OP1
@ FMLAv2i32_indexed_OP2
@ MULADDv4i16_indexed_OP2
@ FMLAv1i64_indexed_OP1
@ MULSUBv16i8_OP1
@ FMLAv8i16_indexed_OP2
@ FMULv2i32_indexed_OP1
@ MULSUBv8i16_indexed_OP2
@ FMLAv1i64_indexed_OP2
@ MULSUBv4i16_indexed_OP2
@ FMLAv1i32_indexed_OP1
@ FMLAv2i64_indexed_OP2
@ FMLSv8i16_indexed_OP1
@ MULSUBv2i32_OP1
@ FMULv4i16_indexed_OP2
@ MULSUBv4i32_indexed_OP2
@ FMULv2i64_indexed_OP2
@ FMLAv4i32_indexed_OP1
@ MULADDv4i16_OP2
@ FMULv8i16_indexed_OP2
@ MULSUBv4i16_OP1
@ MULADDv4i32_OP2
@ MULADDv2i32_OP2
@ MULADDv16i8_OP2
@ FMLSv4i16_indexed_OP1
@ MULADDv16i8_OP1
@ FMLAv2i64_indexed_OP1
@ FMLAv1i32_indexed_OP2
@ FMLSv2i64_indexed_OP2
@ MULADDv2i32_OP1
@ MULADDv4i32_OP1
@ MULADDv2i32_indexed_OP1
@ MULSUBv16i8_OP2
@ MULADDv4i32_indexed_OP1
@ MULADDv2i32_indexed_OP2
@ FMLAv4i16_indexed_OP2
@ MULSUBv8i16_OP1
@ FMULv2i32_indexed_OP2
@ FMLSv2i32_indexed_OP2
@ FMLSv4i32_indexed_OP1
@ FMULv2i64_indexed_OP1
@ MULSUBv4i16_OP2
@ FMLSv4i16_indexed_OP2
@ FMLAv2i32_indexed_OP1
@ FMLSv2i32_indexed_OP1
@ FMLAv8i16_indexed_OP1
@ MULSUBv4i16_indexed_OP1
@ FMLSv4i32_indexed_OP2
@ MULADDv4i32_indexed_OP2
@ MULSUBv4i32_OP2
@ MULSUBv8i16_indexed_OP1
@ MULADDv8i16_OP2
@ MULSUBv2i32_indexed_OP2
@ FMULv4i32_indexed_OP2
@ FMLSv2i64_indexed_OP1
@ MULADDv4i16_OP1
@ FMLAv4i32_indexed_OP2
@ MULADDv8i16_indexed_OP1
@ FMULv4i32_indexed_OP1
@ FMLAv4i16_indexed_OP1
@ FMULv8i16_indexed_OP1
@ MULADDv8i16_OP1
@ MULSUBv4i32_indexed_OP1
@ MULSUBv4i32_OP1
@ FMLSv8i16_indexed_OP2
@ MULADDv8i16_indexed_OP2
@ MULSUBv2i32_OP2
@ FMLSv1i64_indexed_OP2
@ MULADDv4i16_indexed_OP1
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag=MachineInstr::NoFlags, bool SetNZCV=false, bool NeedsWinCFI=false, bool *HasWinCFI=nullptr, bool EmitCFAOffset=false, StackOffset InitialOffset={}, unsigned FrameReg=AArch64::SP)
emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg plus Offset.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr RegState getDefRegState(bool B)
CombinerObjective
The combiner's goal may differ based on which pattern it is attempting to optimize.
std::optional< UsedNZCV > examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > *CCUseInstrs=nullptr)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
static MCRegister getXRegFromWReg(MCRegister Reg)
MCCFIInstruction createCFAOffset(const TargetRegisterInfo &MRI, unsigned Reg, const StackOffset &OffsetFromDefCFA, std::optional< int64_t > IncomingVGOffsetFromDefCFA)
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
static bool isUncondBranchOpcode(int Opc)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2192
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, StackOffset &Offset, const AArch64InstrInfo *TII)
rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the FP.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
static const MachineMemOperand::Flags MOSuppressPair
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
void appendLEB128(SmallVectorImpl< U > &Buffer, T Value)
Definition LEB128.h:236
bool optimizeTerminators(MachineBasicBlock *MBB, const TargetInstrInfo &TII)
std::pair< MachineOperand, DIExpression * > ParamLoadedValue
bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, const MachineInstr &UseMI, const TargetRegisterInfo *TRI)
Return true if there is an instruction /after/ DefMI and before UseMI which either reads or clobbers ...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
static const MachineMemOperand::Flags MOStridedAccess
constexpr RegState getUndefRegState(bool B)
void fullyRecomputeLiveIns(ArrayRef< MachineBasicBlock * > MBBs)
Convenience function for recomputing live-in's for a set of MBBs until the computation converges.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to describe addressing mode similar to ExtAddrMode in CodeGenPrepare.
LLVM_ABI static const MBBSectionID ColdSectionID
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
An individual sequence of instructions to be replaced with a call to an outlined function.
MachineFunction * getMF() const
The information necessary to create an outlined function for some class of candidate.