LLVM 23.0.0git
AArch64InstrInfo.cpp
Go to the documentation of this file.
1//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the AArch64 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64InstrInfo.h"
14#include "AArch64ExpandImm.h"
16#include "AArch64PointerAuth.h"
17#include "AArch64Subtarget.h"
22#include "llvm/ADT/ArrayRef.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
45#include "llvm/IR/DebugLoc.h"
46#include "llvm/IR/GlobalValue.h"
47#include "llvm/IR/Module.h"
48#include "llvm/MC/MCAsmInfo.h"
49#include "llvm/MC/MCInst.h"
51#include "llvm/MC/MCInstrDesc.h"
56#include "llvm/Support/LEB128.h"
60#include <cassert>
61#include <cstdint>
62#include <iterator>
63#include <utility>
64
65using namespace llvm;
66
67#define GET_INSTRINFO_CTOR_DTOR
68#include "AArch64GenInstrInfo.inc"
69
70#define DEBUG_TYPE "AArch64InstrInfo"
71
72STATISTIC(NumCopyInstrs, "Number of COPY instructions expanded");
73STATISTIC(NumZCRegMoveInstrsGPR, "Number of zero-cycle GPR register move "
74 "instructions expanded from canonical COPY");
75STATISTIC(NumZCRegMoveInstrsFPR, "Number of zero-cycle FPR register move "
76 "instructions expanded from canonical COPY");
77STATISTIC(NumZCZeroingInstrsGPR, "Number of zero-cycle GPR zeroing "
78 "instructions expanded from canonical COPY");
79// NumZCZeroingInstrsFPR is counted at AArch64AsmPrinter
80
82 CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9),
83 cl::desc("Restrict range of CB instructions (DEBUG)"));
84
86 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
87 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
88
90 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
91 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
92
94 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
95 cl::desc("Restrict range of Bcc instructions (DEBUG)"));
96
98 BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
99 cl::desc("Restrict range of B instructions (DEBUG)"));
100
102 "aarch64-search-limit", cl::Hidden, cl::init(2048),
103 cl::desc("Restrict range of instructions to search for the "
104 "machine-combiner gather pattern optimization"));
105
107 : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN,
108 AArch64::ADJCALLSTACKUP, AArch64::CATCHRET),
109 RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}
110
111/// Return the maximum number of bytes of code the specified instruction may be
112/// after LFI rewriting. If the instruction is not rewritten, std::nullopt is
113/// returned (use default sizing).
114///
115/// NOTE: the size estimates here must be kept in sync with the rewrites in
116/// AArch64MCLFIRewriter.cpp. Sizes may be overestimates of the rewritten
117/// instruction sequences.
118static std::optional<unsigned> getLFIInstSizeInBytes(const MachineInstr &MI) {
119 switch (MI.getOpcode()) {
120 case AArch64::SVC:
121 // SVC expands to 4 instructions.
122 return 16;
123 case AArch64::BR:
124 case AArch64::BLR:
125 // Indirect branches/calls expand to 2 instructions (guard + br/blr).
126 return 8;
127 case AArch64::RET:
128 // RET through LR is not rewritten, but RET through another register
129 // expands to 2 instructions (guard + ret).
130 if (MI.getOperand(0).getReg() != AArch64::LR)
131 return 8;
132 return 4;
133 case AArch64::SYSxt:
134 // VA-based DC/IC ops (op1=3, Cn=7, op2=1) expand to 2 instructions.
135 if (MI.getOperand(0).getImm() == 3 && MI.getOperand(1).getImm() == 7 &&
136 MI.getOperand(3).getImm() == 1)
137 return 8;
138 return std::nullopt;
139 default:
140 break;
141 }
142
143 // Detect instructions that explicitly define SP or LR.
144 bool ModifiesLR = false;
145 bool ModifiesSP = false;
146 for (const MachineOperand &MO : MI.defs()) {
147 if (!MO.isReg())
148 continue;
149 if (MO.getReg() == AArch64::LR)
150 ModifiesLR = true;
151 else if (MO.getReg() == AArch64::SP)
152 ModifiesSP = true;
153 }
154
155 // Memory accesses expand to a base-register guard plus the rewritten access
156 // (8 bytes), with an extra base-register update for pre/post-index forms (12
157 // bytes total). If the access also defines LR, an LR mask is appended (+4
158 // bytes). Depending on additional optimizations that the rewriter performs,
159 // this may be an overestimate.
160 if (MI.mayLoadOrStore()) {
161 unsigned Size = isLFIPrePostMemAccess(MI.getOpcode()) ? 12 : 8;
162 if (ModifiesLR)
163 Size += 4;
164 return Size;
165 }
166
167 // Non memory operations that modify LR or SP expand to 2 instructions.
168 if (ModifiesSP || ModifiesLR)
169 return 8;
170
171 // Default case: instructions that don't cause expansion.
172 // - TP accesses in LFI are a single load/store, so no expansion.
173 // - All remaining instructions are not rewritten.
174 return std::nullopt;
175}
176
177/// GetInstSize - Return the number of bytes of code the specified
178/// instruction may be. This returns the maximum number of bytes.
180 const MachineBasicBlock &MBB = *MI.getParent();
181 const MachineFunction *MF = MBB.getParent();
182 const Function &F = MF->getFunction();
183 const MCAsmInfo &MAI = MF->getTarget().getMCAsmInfo();
184
185 {
186 auto Op = MI.getOpcode();
187 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
188 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), MAI);
189 }
190
191 // Meta-instructions emit no code.
192 if (MI.isMetaInstruction())
193 return 0;
194
195 // FIXME: We currently only handle pseudoinstructions that don't get expanded
196 // before the assembly printer.
197 unsigned NumBytes = 0;
198 const MCInstrDesc &Desc = MI.getDesc();
199
200 // LFI rewriter expansions that supersede normal sizing.
201 const auto &STI = MF->getSubtarget<AArch64Subtarget>();
202 if (STI.isLFI())
203 if (auto Size = getLFIInstSizeInBytes(MI))
204 return *Size;
205
206 if (!MI.isBundle() && isTailCallReturnInst(MI)) {
207 NumBytes = Desc.getSize() ? Desc.getSize() : 4;
208
209 const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
210 if (!MFI->shouldSignReturnAddress(*MF))
211 return NumBytes;
212
213 auto Method = STI.getAuthenticatedLRCheckMethod(*MF);
214 NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
215 return NumBytes;
216 }
217
218 // Size should be preferably set in
219 // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
220 // Specific cases handle instructions of variable sizes
221 switch (Desc.getOpcode()) {
222 default:
223 if (Desc.getSize())
224 return Desc.getSize();
225
226 // Anything not explicitly designated otherwise (i.e. pseudo-instructions
227 // with fixed constant size but not specified in .td file) is a normal
228 // 4-byte insn.
229 NumBytes = 4;
230 break;
231 case TargetOpcode::STACKMAP:
232 // The upper bound for a stackmap intrinsic is the full length of its shadow
233 NumBytes = StackMapOpers(&MI).getNumPatchBytes();
234 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
235 break;
236 case TargetOpcode::PATCHPOINT:
237 // The size of the patchpoint intrinsic is the number of bytes requested
238 NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
239 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
240 break;
241 case TargetOpcode::STATEPOINT:
242 NumBytes = StatepointOpers(&MI).getNumPatchBytes();
243 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
244 // No patch bytes means a normal call inst is emitted
245 if (NumBytes == 0)
246 NumBytes = 4;
247 break;
248 case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
249 // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
250 // instructions are expanded to the specified number of NOPs. Otherwise,
251 // they are expanded to 36-byte XRay sleds.
252 NumBytes =
253 F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
254 break;
255 case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
256 case TargetOpcode::PATCHABLE_TAIL_CALL:
257 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
258 // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
259 NumBytes = 36;
260 break;
261 case TargetOpcode::PATCHABLE_EVENT_CALL:
262 // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
263 NumBytes = 24;
264 break;
265
266 case AArch64::SPACE:
267 NumBytes = MI.getOperand(1).getImm();
268 break;
269 case AArch64::MOVaddr:
270 case AArch64::MOVaddrJT:
271 case AArch64::MOVaddrCP:
272 case AArch64::MOVaddrBA:
273 case AArch64::MOVaddrTLS:
274 case AArch64::MOVaddrEXT: {
275 // Use the same logic as the pseudo expansion to count instructions.
278 MI.getOperand(1).getTargetFlags(),
279 Subtarget.isTargetMachO(), Insn);
280 NumBytes = Insn.size() * 4;
281 break;
282 }
283
284 case AArch64::MOVi32imm:
285 case AArch64::MOVi64imm: {
286 // Use the same logic as the pseudo expansion to count instructions.
287 unsigned BitSize = Desc.getOpcode() == AArch64::MOVi32imm ? 32 : 64;
289 AArch64_IMM::expandMOVImm(MI.getOperand(1).getImm(), BitSize, Insn);
290 NumBytes = Insn.size() * 4;
291 break;
292 }
293
294 case TargetOpcode::BUNDLE:
295 NumBytes = getInstBundleSize(MI);
296 break;
297 }
298
299 return NumBytes;
300}
301
304 // Block ends with fall-through condbranch.
305 switch (LastInst->getOpcode()) {
306 default:
307 llvm_unreachable("Unknown branch instruction?");
308 case AArch64::Bcc:
309 Target = LastInst->getOperand(1).getMBB();
310 Cond.push_back(LastInst->getOperand(0));
311 break;
312 case AArch64::CBZW:
313 case AArch64::CBZX:
314 case AArch64::CBNZW:
315 case AArch64::CBNZX:
316 Target = LastInst->getOperand(1).getMBB();
317 Cond.push_back(MachineOperand::CreateImm(-1));
318 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
319 Cond.push_back(LastInst->getOperand(0));
320 break;
321 case AArch64::TBZW:
322 case AArch64::TBZX:
323 case AArch64::TBNZW:
324 case AArch64::TBNZX:
325 Target = LastInst->getOperand(2).getMBB();
326 Cond.push_back(MachineOperand::CreateImm(-1));
327 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
328 Cond.push_back(LastInst->getOperand(0));
329 Cond.push_back(LastInst->getOperand(1));
330 break;
331 case AArch64::CBWPri:
332 case AArch64::CBXPri:
333 case AArch64::CBWPrr:
334 case AArch64::CBXPrr:
335 Target = LastInst->getOperand(3).getMBB();
336 Cond.push_back(MachineOperand::CreateImm(-1));
337 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
338 Cond.push_back(LastInst->getOperand(0));
339 Cond.push_back(LastInst->getOperand(1));
340 Cond.push_back(LastInst->getOperand(2));
341 break;
342 case AArch64::CBBAssertExt:
343 case AArch64::CBHAssertExt:
344 Target = LastInst->getOperand(3).getMBB();
345 Cond.push_back(MachineOperand::CreateImm(-1)); // -1
346 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); // Opc
347 Cond.push_back(LastInst->getOperand(0)); // Cond
348 Cond.push_back(LastInst->getOperand(1)); // Op0
349 Cond.push_back(LastInst->getOperand(2)); // Op1
350 Cond.push_back(LastInst->getOperand(4)); // Ext0
351 Cond.push_back(LastInst->getOperand(5)); // Ext1
352 break;
353 }
354}
355
356static unsigned getBranchDisplacementBits(unsigned Opc) {
357 switch (Opc) {
358 default:
359 llvm_unreachable("unexpected opcode!");
360 case AArch64::B:
361 return BDisplacementBits;
362 case AArch64::TBNZW:
363 case AArch64::TBZW:
364 case AArch64::TBNZX:
365 case AArch64::TBZX:
366 return TBZDisplacementBits;
367 case AArch64::CBNZW:
368 case AArch64::CBZW:
369 case AArch64::CBNZX:
370 case AArch64::CBZX:
371 return CBZDisplacementBits;
372 case AArch64::Bcc:
373 return BCCDisplacementBits;
374 case AArch64::CBWPri:
375 case AArch64::CBXPri:
376 case AArch64::CBBAssertExt:
377 case AArch64::CBHAssertExt:
378 case AArch64::CBWPrr:
379 case AArch64::CBXPrr:
380 return CBDisplacementBits;
381 }
382}
383
385 int64_t BrOffset) const {
386 unsigned Bits = getBranchDisplacementBits(BranchOp);
387 assert(Bits >= 3 && "max branch displacement must be enough to jump"
388 "over conditional branch expansion");
389 return isIntN(Bits, BrOffset / 4);
390}
391
394 switch (MI.getOpcode()) {
395 default:
396 llvm_unreachable("unexpected opcode!");
397 case AArch64::B:
398 return MI.getOperand(0).getMBB();
399 case AArch64::TBZW:
400 case AArch64::TBNZW:
401 case AArch64::TBZX:
402 case AArch64::TBNZX:
403 return MI.getOperand(2).getMBB();
404 case AArch64::CBZW:
405 case AArch64::CBNZW:
406 case AArch64::CBZX:
407 case AArch64::CBNZX:
408 case AArch64::Bcc:
409 return MI.getOperand(1).getMBB();
410 case AArch64::CBWPri:
411 case AArch64::CBXPri:
412 case AArch64::CBBAssertExt:
413 case AArch64::CBHAssertExt:
414 case AArch64::CBWPrr:
415 case AArch64::CBXPrr:
416 return MI.getOperand(3).getMBB();
417 }
418}
419
421 MachineBasicBlock &NewDestBB,
422 MachineBasicBlock &RestoreBB,
423 const DebugLoc &DL,
424 int64_t BrOffset,
425 RegScavenger *RS) const {
426 assert(RS && "RegScavenger required for long branching");
427 assert(MBB.empty() &&
428 "new block should be inserted for expanding unconditional branch");
429 assert(MBB.pred_size() == 1);
430 assert(RestoreBB.empty() &&
431 "restore block should be inserted for restoring clobbered registers");
432
433 auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
434 // Offsets outside of the signed 33-bit range are not supported for ADRP +
435 // ADD.
436 if (!isInt<33>(BrOffset))
438 "Branch offsets outside of the signed 33-bit range not supported");
439
440 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
441 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
442 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
443 .addReg(Reg)
444 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
445 .addImm(0);
446 BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
447 };
448
449 RS->enterBasicBlockEnd(MBB);
450 // If X16 is unused, we can rely on the linker to insert a range extension
451 // thunk if NewDestBB is out of range of a single B instruction.
452 constexpr Register Reg = AArch64::X16;
453 if (!RS->isRegUsed(Reg)) {
454 insertUnconditionalBranch(MBB, &NewDestBB, DL);
455 RS->setRegUsed(Reg);
456 return;
457 }
458
459 // In a cold block without BTI, insert the indirect branch if a register is
460 // free. Skip this if BTI is enabled to avoid inserting a BTI at the target,
461 // prioritizing a dynamic cost in cold code over a static cost in hot code.
462 AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
463 bool HasBTI = AFI && AFI->branchTargetEnforcement();
464 if (MBB.getSectionID() == MBBSectionID::ColdSectionID && !HasBTI) {
465 Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
466 if (Scavenged != AArch64::NoRegister) {
467 buildIndirectBranch(Scavenged, NewDestBB);
468 RS->setRegUsed(Scavenged);
469 return;
470 }
471 }
472
473 // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
474 // with red zones.
475 if (!AFI || AFI->hasRedZone().value_or(true))
477 "Unable to insert indirect branch inside function that has red zone");
478
479 // Otherwise, spill X16 and defer range extension to the linker.
480 BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
481 .addReg(AArch64::SP, RegState::Define)
482 .addReg(Reg)
483 .addReg(AArch64::SP)
484 .addImm(-16);
485
486 BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);
487
488 BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
489 .addReg(AArch64::SP, RegState::Define)
491 .addReg(AArch64::SP)
492 .addImm(16);
493}
494
495// Branch analysis.
498 MachineBasicBlock *&FBB,
500 bool AllowModify) const {
501 // If the block has no terminators, it just falls into the block after it.
502 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
503 if (I == MBB.end())
504 return false;
505
506 // Skip over SpeculationBarrierEndBB terminators
507 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
508 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
509 --I;
510 }
511
512 if (!isUnpredicatedTerminator(*I))
513 return false;
514
515 // Get the last instruction in the block.
516 MachineInstr *LastInst = &*I;
517
518 // If there is only one terminator instruction, process it.
519 unsigned LastOpc = LastInst->getOpcode();
520 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
521 if (isUncondBranchOpcode(LastOpc)) {
522 TBB = LastInst->getOperand(0).getMBB();
523 return false;
524 }
525 if (isCondBranchOpcode(LastOpc)) {
526 // Block ends with fall-through condbranch.
527 parseCondBranch(LastInst, TBB, Cond);
528 return false;
529 }
530 return true; // Can't handle indirect branch.
531 }
532
533 // Get the instruction before it if it is a terminator.
534 MachineInstr *SecondLastInst = &*I;
535 unsigned SecondLastOpc = SecondLastInst->getOpcode();
536
537 // If AllowModify is true and the block ends with two or more unconditional
538 // branches, delete all but the first unconditional branch.
539 if (AllowModify && isUncondBranchOpcode(LastOpc)) {
540 while (isUncondBranchOpcode(SecondLastOpc)) {
541 LastInst->eraseFromParent();
542 LastInst = SecondLastInst;
543 LastOpc = LastInst->getOpcode();
544 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
545 // Return now the only terminator is an unconditional branch.
546 TBB = LastInst->getOperand(0).getMBB();
547 return false;
548 }
549 SecondLastInst = &*I;
550 SecondLastOpc = SecondLastInst->getOpcode();
551 }
552 }
553
554 // If we're allowed to modify and the block ends in an unconditional branch
555 // which could simply fallthrough, remove the branch. (Note: This case only
556 // matters when we can't understand the whole sequence, otherwise it's also
557 // handled by BranchFolding.cpp.)
558 if (AllowModify && isUncondBranchOpcode(LastOpc) &&
559 MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
560 LastInst->eraseFromParent();
561 LastInst = SecondLastInst;
562 LastOpc = LastInst->getOpcode();
563 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
564 assert(!isUncondBranchOpcode(LastOpc) &&
565 "unreachable unconditional branches removed above");
566
567 if (isCondBranchOpcode(LastOpc)) {
568 // Block ends with fall-through condbranch.
569 parseCondBranch(LastInst, TBB, Cond);
570 return false;
571 }
572 return true; // Can't handle indirect branch.
573 }
574 SecondLastInst = &*I;
575 SecondLastOpc = SecondLastInst->getOpcode();
576 }
577
578 // If there are three terminators, we don't know what sort of block this is.
579 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
580 return true;
581
582 // If the block ends with a B and a Bcc, handle it.
583 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
584 parseCondBranch(SecondLastInst, TBB, Cond);
585 FBB = LastInst->getOperand(0).getMBB();
586 return false;
587 }
588
589 // If the block ends with two unconditional branches, handle it. The second
590 // one is not executed, so remove it.
591 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
592 TBB = SecondLastInst->getOperand(0).getMBB();
593 I = LastInst;
594 if (AllowModify)
595 I->eraseFromParent();
596 return false;
597 }
598
599 // ...likewise if it ends with an indirect branch followed by an unconditional
600 // branch.
601 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
602 I = LastInst;
603 if (AllowModify)
604 I->eraseFromParent();
605 return true;
606 }
607
608 // Otherwise, can't handle this.
609 return true;
610}
611
613 MachineBranchPredicate &MBP,
614 bool AllowModify) const {
615 // Use analyzeBranch to validate the branch pattern.
616 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
618 if (analyzeBranch(MBB, TBB, FBB, Cond, AllowModify))
619 return true;
620
621 // analyzeBranch returns success with empty Cond for unconditional branches.
622 if (Cond.empty())
623 return true;
624
625 MBP.TrueDest = TBB;
626 assert(MBP.TrueDest && "expected!");
627 MBP.FalseDest = FBB ? FBB : MBB.getNextNode();
628
629 MBP.ConditionDef = nullptr;
630 MBP.SingleUseCondition = false;
631
632 // Find the conditional branch. After analyzeBranch succeeds with non-empty
633 // Cond, there's exactly one conditional branch - either last (fallthrough)
634 // or second-to-last (followed by unconditional B).
635 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
636 if (I == MBB.end())
637 return true;
638
639 if (isUncondBranchOpcode(I->getOpcode())) {
640 if (I == MBB.begin())
641 return true;
642 --I;
643 }
644
645 MachineInstr *CondBranch = &*I;
646 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
647
648 switch (CondBranch->getOpcode()) {
649 default:
650 return true;
651
652 case AArch64::Bcc:
653 // Bcc takes the NZCV flag as the operand to branch on, walk up the
654 // instruction stream to find the last instruction to define NZCV.
656 if (MI.modifiesRegister(AArch64::NZCV, /*TRI=*/nullptr)) {
657 MBP.ConditionDef = &MI;
658 break;
659 }
660 }
661 return false;
662
663 case AArch64::CBZW:
664 case AArch64::CBZX:
665 case AArch64::CBNZW:
666 case AArch64::CBNZX: {
667 MBP.LHS = CondBranch->getOperand(0);
668 MBP.RHS = MachineOperand::CreateImm(0);
669 unsigned Opc = CondBranch->getOpcode();
670 MBP.Predicate = (Opc == AArch64::CBNZX || Opc == AArch64::CBNZW)
671 ? MachineBranchPredicate::PRED_NE
672 : MachineBranchPredicate::PRED_EQ;
673 Register CondReg = MBP.LHS.getReg();
674 if (CondReg.isVirtual())
675 MBP.ConditionDef = MRI.getVRegDef(CondReg);
676 return false;
677 }
678
679 case AArch64::TBZW:
680 case AArch64::TBZX:
681 case AArch64::TBNZW:
682 case AArch64::TBNZX: {
683 Register CondReg = CondBranch->getOperand(0).getReg();
684 if (CondReg.isVirtual())
685 MBP.ConditionDef = MRI.getVRegDef(CondReg);
686 return false;
687 }
688 }
689}
690
693 if (Cond[0].getImm() != -1) {
694 // Regular Bcc
695 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
697 } else {
698 // Folded compare-and-branch
699 switch (Cond[1].getImm()) {
700 default:
701 llvm_unreachable("Unknown conditional branch!");
702 case AArch64::CBZW:
703 Cond[1].setImm(AArch64::CBNZW);
704 break;
705 case AArch64::CBNZW:
706 Cond[1].setImm(AArch64::CBZW);
707 break;
708 case AArch64::CBZX:
709 Cond[1].setImm(AArch64::CBNZX);
710 break;
711 case AArch64::CBNZX:
712 Cond[1].setImm(AArch64::CBZX);
713 break;
714 case AArch64::TBZW:
715 Cond[1].setImm(AArch64::TBNZW);
716 break;
717 case AArch64::TBNZW:
718 Cond[1].setImm(AArch64::TBZW);
719 break;
720 case AArch64::TBZX:
721 Cond[1].setImm(AArch64::TBNZX);
722 break;
723 case AArch64::TBNZX:
724 Cond[1].setImm(AArch64::TBZX);
725 break;
726
727 // Cond is { -1, Opcode, CC, Op0, Op1, ... }
728 case AArch64::CBWPri:
729 case AArch64::CBXPri:
730 case AArch64::CBBAssertExt:
731 case AArch64::CBHAssertExt:
732 case AArch64::CBWPrr:
733 case AArch64::CBXPrr: {
734 // Pseudos using standard 4bit Arm condition codes
736 static_cast<AArch64CC::CondCode>(Cond[2].getImm());
738 }
739 }
740 }
741
742 return false;
743}
744
746 int *BytesRemoved) const {
747 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
748 if (I == MBB.end())
749 return 0;
750
751 if (!isUncondBranchOpcode(I->getOpcode()) &&
752 !isCondBranchOpcode(I->getOpcode()))
753 return 0;
754
755 // Remove the branch.
756 I->eraseFromParent();
757
758 I = MBB.end();
759
760 if (I == MBB.begin()) {
761 if (BytesRemoved)
762 *BytesRemoved = 4;
763 return 1;
764 }
765 --I;
766 if (!isCondBranchOpcode(I->getOpcode())) {
767 if (BytesRemoved)
768 *BytesRemoved = 4;
769 return 1;
770 }
771
772 // Remove the branch.
773 I->eraseFromParent();
774 if (BytesRemoved)
775 *BytesRemoved = 8;
776
777 return 2;
778}
779
780void AArch64InstrInfo::instantiateCondBranch(
783 if (Cond[0].getImm() != -1) {
784 // Regular Bcc
785 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
786 } else {
787 // Folded compare-and-branch
788 // Note that we use addOperand instead of addReg to keep the flags.
789
790 // cbz, cbnz
791 const MachineInstrBuilder MIB =
792 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
793
794 // tbz/tbnz
795 if (Cond.size() > 3)
796 MIB.add(Cond[3]);
797
798 // cb
799 if (Cond.size() > 4)
800 MIB.add(Cond[4]);
801
802 MIB.addMBB(TBB);
803
804 // cb[b,h]
805 if (Cond.size() > 5) {
806 MIB.addImm(Cond[5].getImm());
807 MIB.addImm(Cond[6].getImm());
808 }
809 }
810}
811
814 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
815 // Shouldn't be a fall through.
816 assert(TBB && "insertBranch must not be told to insert a fallthrough");
817
818 if (!FBB) {
819 if (Cond.empty()) // Unconditional branch?
820 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
821 else
822 instantiateCondBranch(MBB, DL, TBB, Cond);
823
824 if (BytesAdded)
825 *BytesAdded = 4;
826
827 return 1;
828 }
829
830 // Two-way conditional branch.
831 instantiateCondBranch(MBB, DL, TBB, Cond);
832 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
833
834 if (BytesAdded)
835 *BytesAdded = 8;
836
837 return 2;
838}
839
841 const TargetInstrInfo &TII) {
842 for (MachineInstr &MI : MBB->terminators()) {
843 unsigned Opc = MI.getOpcode();
844 switch (Opc) {
845 case AArch64::CBZW:
846 case AArch64::CBZX:
847 case AArch64::TBZW:
848 case AArch64::TBZX:
849 // CBZ/TBZ with WZR/XZR -> unconditional B
850 if (MI.getOperand(0).getReg() == AArch64::WZR ||
851 MI.getOperand(0).getReg() == AArch64::XZR) {
852 DEBUG_WITH_TYPE("optimizeTerminators",
853 dbgs() << "Removing always taken branch: " << MI);
854 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
855 SmallVector<MachineBasicBlock *> Succs(MBB->successors());
856 for (auto *S : Succs)
857 if (S != Target)
858 MBB->removeSuccessor(S);
859 DebugLoc DL = MI.getDebugLoc();
860 while (MBB->rbegin() != &MI)
861 MBB->rbegin()->eraseFromParent();
862 MI.eraseFromParent();
863 BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
864 return true;
865 }
866 break;
867 case AArch64::CBNZW:
868 case AArch64::CBNZX:
869 case AArch64::TBNZW:
870 case AArch64::TBNZX:
871 // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
872 if (MI.getOperand(0).getReg() == AArch64::WZR ||
873 MI.getOperand(0).getReg() == AArch64::XZR) {
874 DEBUG_WITH_TYPE("optimizeTerminators",
875 dbgs() << "Removing never taken branch: " << MI);
876 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
877 MI.getParent()->removeSuccessor(Target);
878 MI.eraseFromParent();
879 return true;
880 }
881 break;
882 }
883 }
884 return false;
885}
886
887// Find the original register that VReg is copied from.
888static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
889 while (Register::isVirtualRegister(VReg)) {
890 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
891 if (!DefMI->isFullCopy())
892 return VReg;
893 VReg = DefMI->getOperand(1).getReg();
894 }
895 return VReg;
896}
897
898// Determine if VReg is defined by an instruction that can be folded into a
899// csel instruction. If so, return the folded opcode, and the replacement
900// register.
901static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
902 unsigned *NewReg = nullptr) {
903 VReg = removeCopies(MRI, VReg);
905 return 0;
906
907 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
908 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
909 unsigned Opc = 0;
910 unsigned SrcReg = 0;
911 switch (DefMI->getOpcode()) {
912 case AArch64::SUBREG_TO_REG:
913 // Check for the following way to define a 64-bit immediate:
914 // %0:gpr32 = MOVi32imm 1
915 // %1:gpr64 = SUBREG_TO_REG %0:gpr32, %subreg.sub_32
916 if (!DefMI->getOperand(1).isReg())
917 return 0;
918 if (!DefMI->getOperand(2).isImm() ||
919 DefMI->getOperand(2).getImm() != AArch64::sub_32)
920 return 0;
921 DefMI = MRI.getVRegDef(DefMI->getOperand(1).getReg());
922 if (DefMI->getOpcode() != AArch64::MOVi32imm)
923 return 0;
924 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
925 return 0;
926 assert(Is64Bit);
927 SrcReg = AArch64::XZR;
928 Opc = AArch64::CSINCXr;
929 break;
930
931 case AArch64::MOVi32imm:
932 case AArch64::MOVi64imm:
933 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
934 return 0;
935 SrcReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
936 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
937 break;
938
939 case AArch64::ADDSXri:
940 case AArch64::ADDSWri:
941 // if NZCV is used, do not fold.
942 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
943 true) == -1)
944 return 0;
945 // fall-through to ADDXri and ADDWri.
946 [[fallthrough]];
947 case AArch64::ADDXri:
948 case AArch64::ADDWri:
949 // add x, 1 -> csinc.
950 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
951 DefMI->getOperand(3).getImm() != 0)
952 return 0;
953 SrcReg = DefMI->getOperand(1).getReg();
954 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
955 break;
956
957 case AArch64::ORNXrr:
958 case AArch64::ORNWrr: {
959 // not x -> csinv, represented as orn dst, xzr, src.
960 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
961 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
962 return 0;
963 SrcReg = DefMI->getOperand(2).getReg();
964 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
965 break;
966 }
967
968 case AArch64::SUBSXrr:
969 case AArch64::SUBSWrr:
970 // if NZCV is used, do not fold.
971 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
972 true) == -1)
973 return 0;
974 // fall-through to SUBXrr and SUBWrr.
975 [[fallthrough]];
976 case AArch64::SUBXrr:
977 case AArch64::SUBWrr: {
978 // neg x -> csneg, represented as sub dst, xzr, src.
979 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
980 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
981 return 0;
982 SrcReg = DefMI->getOperand(2).getReg();
983 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
984 break;
985 }
986 default:
987 return 0;
988 }
989 assert(Opc && SrcReg && "Missing parameters");
990
991 if (NewReg)
992 *NewReg = SrcReg;
993 return Opc;
994}
995
998 Register DstReg, Register TrueReg,
999 Register FalseReg, int &CondCycles,
1000 int &TrueCycles,
1001 int &FalseCycles) const {
1002 // Check register classes.
1003 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1004 const TargetRegisterClass *RC =
1005 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
1006 if (!RC)
1007 return false;
1008
1009 // Also need to check the dest regclass, in case we're trying to optimize
1010 // something like:
1011 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
1012 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
1013 return false;
1014
1015 // Expanding cbz/tbz requires an extra cycle of latency on the condition.
1016 unsigned ExtraCondLat = Cond.size() != 1;
1017
1018 // GPRs are handled by csel.
1019 // FIXME: Fold in x+1, -x, and ~x when applicable.
1020 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
1021 AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
1022 // Single-cycle csel, csinc, csinv, and csneg.
1023 CondCycles = 1 + ExtraCondLat;
1024 TrueCycles = FalseCycles = 1;
1025 if (canFoldIntoCSel(MRI, TrueReg))
1026 TrueCycles = 0;
1027 else if (canFoldIntoCSel(MRI, FalseReg))
1028 FalseCycles = 0;
1029 return true;
1030 }
1031
1032 // Scalar floating point is handled by fcsel.
1033 // FIXME: Form fabs, fmin, and fmax when applicable.
1034 if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
1035 AArch64::FPR32RegClass.hasSubClassEq(RC)) {
1036 CondCycles = 5 + ExtraCondLat;
1037 TrueCycles = FalseCycles = 2;
1038 return true;
1039 }
1040
1041 // Can't do vectors.
1042 return false;
1043}
1044
1047 const DebugLoc &DL, Register DstReg,
1049 Register TrueReg, Register FalseReg) const {
1050 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1051
1052 // Parse the condition code, see parseCondBranch() above.
1054 switch (Cond.size()) {
1055 default:
1056 llvm_unreachable("Unknown condition opcode in Cond");
1057 case 1: // b.cc
1058 CC = AArch64CC::CondCode(Cond[0].getImm());
1059 break;
1060 case 3: { // cbz/cbnz
1061 // We must insert a compare against 0.
1062 bool Is64Bit;
1063 switch (Cond[1].getImm()) {
1064 default:
1065 llvm_unreachable("Unknown branch opcode in Cond");
1066 case AArch64::CBZW:
1067 Is64Bit = false;
1068 CC = AArch64CC::EQ;
1069 break;
1070 case AArch64::CBZX:
1071 Is64Bit = true;
1072 CC = AArch64CC::EQ;
1073 break;
1074 case AArch64::CBNZW:
1075 Is64Bit = false;
1076 CC = AArch64CC::NE;
1077 break;
1078 case AArch64::CBNZX:
1079 Is64Bit = true;
1080 CC = AArch64CC::NE;
1081 break;
1082 }
1083 Register SrcReg = Cond[2].getReg();
1084 if (Is64Bit) {
1085 // cmp reg, #0 is actually subs xzr, reg, #0.
1086 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
1087 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
1088 .addReg(SrcReg)
1089 .addImm(0)
1090 .addImm(0);
1091 } else {
1092 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
1093 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
1094 .addReg(SrcReg)
1095 .addImm(0)
1096 .addImm(0);
1097 }
1098 break;
1099 }
1100 case 4: { // tbz/tbnz
1101 // We must insert a tst instruction.
1102 switch (Cond[1].getImm()) {
1103 default:
1104 llvm_unreachable("Unknown branch opcode in Cond");
1105 case AArch64::TBZW:
1106 case AArch64::TBZX:
1107 CC = AArch64CC::EQ;
1108 break;
1109 case AArch64::TBNZW:
1110 case AArch64::TBNZX:
1111 CC = AArch64CC::NE;
1112 break;
1113 }
1114 // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
1115 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
1116 BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
1117 .addReg(Cond[2].getReg())
1118 .addImm(
1120 else
1121 BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
1122 .addReg(Cond[2].getReg())
1123 .addImm(
1125 break;
1126 }
1127 case 5: { // cb
1128 // We must insert a cmp, that is a subs
1129 // 0 1 2 3 4
1130 // Cond is { -1, Opcode, CC, Op0, Op1 }
1131
1132 unsigned SubsOpc, SubsDestReg;
1133 bool IsImm = false;
1134 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1135 switch (Cond[1].getImm()) {
1136 default:
1137 llvm_unreachable("Unknown branch opcode in Cond");
1138 case AArch64::CBWPri:
1139 SubsOpc = AArch64::SUBSWri;
1140 SubsDestReg = AArch64::WZR;
1141 IsImm = true;
1142 break;
1143 case AArch64::CBXPri:
1144 SubsOpc = AArch64::SUBSXri;
1145 SubsDestReg = AArch64::XZR;
1146 IsImm = true;
1147 break;
1148 case AArch64::CBWPrr:
1149 SubsOpc = AArch64::SUBSWrr;
1150 SubsDestReg = AArch64::WZR;
1151 IsImm = false;
1152 break;
1153 case AArch64::CBXPrr:
1154 SubsOpc = AArch64::SUBSXrr;
1155 SubsDestReg = AArch64::XZR;
1156 IsImm = false;
1157 break;
1158 }
1159
1160 if (IsImm)
1161 BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
1162 .addReg(Cond[3].getReg())
1163 .addImm(Cond[4].getImm())
1164 .addImm(0);
1165 else
1166 BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
1167 .addReg(Cond[3].getReg())
1168 .addReg(Cond[4].getReg());
1169 } break;
1170 case 7: { // cb[b,h]
1171 // We must insert a cmp, that is a subs, but also zero- or sign-extensions
1172 // that have been folded. For the first operand we codegen an explicit
1173 // extension, for the second operand we fold the extension into cmp.
1174 // 0 1 2 3 4 5 6
1175 // Cond is { -1, Opcode, CC, Op0, Op1, Ext0, Ext1 }
1176
1177 // We need a new register for the now explicitly extended register
1178 Register Reg = Cond[4].getReg();
1180 unsigned ExtOpc;
1181 unsigned ExtBits;
1182 AArch64_AM::ShiftExtendType ExtendType =
1184 switch (ExtendType) {
1185 default:
1186 llvm_unreachable("Unknown shift-extend for CB instruction");
1187 case AArch64_AM::SXTB:
1188 assert(
1189 Cond[1].getImm() == AArch64::CBBAssertExt &&
1190 "Unexpected compare-and-branch instruction for SXTB shift-extend");
1191 ExtOpc = AArch64::SBFMWri;
1192 ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
1193 break;
1194 case AArch64_AM::SXTH:
1195 assert(
1196 Cond[1].getImm() == AArch64::CBHAssertExt &&
1197 "Unexpected compare-and-branch instruction for SXTH shift-extend");
1198 ExtOpc = AArch64::SBFMWri;
1199 ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
1200 break;
1201 case AArch64_AM::UXTB:
1202 assert(
1203 Cond[1].getImm() == AArch64::CBBAssertExt &&
1204 "Unexpected compare-and-branch instruction for UXTB shift-extend");
1205 ExtOpc = AArch64::ANDWri;
1206 ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
1207 break;
1208 case AArch64_AM::UXTH:
1209 assert(
1210 Cond[1].getImm() == AArch64::CBHAssertExt &&
1211 "Unexpected compare-and-branch instruction for UXTH shift-extend");
1212 ExtOpc = AArch64::ANDWri;
1213 ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
1214 break;
1215 }
1216
1217 // Build the explicit extension of the first operand
1218 Reg = MRI.createVirtualRegister(&AArch64::GPR32spRegClass);
1220 BuildMI(MBB, I, DL, get(ExtOpc), Reg).addReg(Cond[4].getReg());
1221 if (ExtOpc != AArch64::ANDWri)
1222 MBBI.addImm(0);
1223 MBBI.addImm(ExtBits);
1224 }
1225
1226 // Now, subs with an extended second operand
1228 AArch64_AM::ShiftExtendType ExtendType =
1230 MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
1231 MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
1232 BuildMI(MBB, I, DL, get(AArch64::SUBSWrx), AArch64::WZR)
1233 .addReg(Cond[3].getReg())
1234 .addReg(Reg)
1235 .addImm(AArch64_AM::getArithExtendImm(ExtendType, 0));
1236 } // If no extension is needed, just a regular subs
1237 else {
1238 MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
1239 MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
1240 BuildMI(MBB, I, DL, get(AArch64::SUBSWrr), AArch64::WZR)
1241 .addReg(Cond[3].getReg())
1242 .addReg(Reg);
1243 }
1244
1245 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1246 } break;
1247 }
1248
1249 unsigned Opc = 0;
1250 const TargetRegisterClass *RC = nullptr;
1251 bool TryFold = false;
1252 if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
1253 RC = &AArch64::GPR64RegClass;
1254 Opc = AArch64::CSELXr;
1255 TryFold = true;
1256 } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
1257 RC = &AArch64::GPR32RegClass;
1258 Opc = AArch64::CSELWr;
1259 TryFold = true;
1260 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
1261 RC = &AArch64::FPR64RegClass;
1262 Opc = AArch64::FCSELDrrr;
1263 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
1264 RC = &AArch64::FPR32RegClass;
1265 Opc = AArch64::FCSELSrrr;
1266 }
1267 assert(RC && "Unsupported regclass");
1268
1269 // Try folding simple instructions into the csel.
1270 if (TryFold) {
1271 unsigned NewReg = 0;
1272 unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewReg);
1273 if (FoldedOpc) {
1274 // The folded opcodes csinc, csinv and csneg apply the operation to
1275 // FalseReg, so we need to invert the condition.
1277 TrueReg = FalseReg;
1278 } else
1279 FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewReg);
1280
1281 // Fold the operation. Leave any dead instructions for DCE to clean up.
1282 if (FoldedOpc) {
1283 FalseReg = NewReg;
1284 Opc = FoldedOpc;
1285 // Extend the live range of NewReg.
1286 MRI.clearKillFlags(NewReg);
1287 }
1288 }
1289
1290 // Pull all virtual register into the appropriate class.
1291 MRI.constrainRegClass(TrueReg, RC);
1292 // FalseReg might be WZR or XZR if the folded operand is a literal 1.
1293 assert(
1294 (FalseReg.isVirtual() || FalseReg == AArch64::WZR ||
1295 FalseReg == AArch64::XZR) &&
1296 "FalseReg was folded into a non-virtual register other than WZR or XZR");
1297 if (FalseReg.isVirtual())
1298 MRI.constrainRegClass(FalseReg, RC);
1299
1300 // Insert the csel.
1301 BuildMI(MBB, I, DL, get(Opc), DstReg)
1302 .addReg(TrueReg)
1303 .addReg(FalseReg)
1304 .addImm(CC);
1305}
1306
1307// Return true if Imm can be loaded into a register by a "cheap" sequence of
1308// instructions. For now, "cheap" means at most two instructions.
1309static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
1310 if (BitSize == 32)
1311 return true;
1312
1313 assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
1314 uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
1316 AArch64_IMM::expandMOVImm(Imm, BitSize, Is);
1317
1318 return Is.size() <= 2;
1319}
1320
1321// Check if a COPY instruction is cheap.
1322static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) {
1323 assert(MI.isCopy() && "Expected COPY instruction");
1324 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
1325
1326 // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64,
1327 // typically requiring an FMOV instruction with a 2-6 cycle latency.
1328 auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * {
1329 if (Reg.isVirtual())
1330 return MRI.getRegClass(Reg);
1331 if (Reg.isPhysical())
1332 return RI.getMinimalPhysRegClass(Reg);
1333 return nullptr;
1334 };
1335 const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg());
1336 const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg());
1337 if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC))
1338 return false;
1339
1340 return MI.isAsCheapAsAMove();
1341}
1342
1343// FIXME: this implementation should be micro-architecture dependent, so a
1344// micro-architecture target hook should be introduced here in future.
1346 if (Subtarget.hasExynosCheapAsMoveHandling()) {
1347 if (isExynosCheapAsMove(MI))
1348 return true;
1349 return MI.isAsCheapAsAMove();
1350 }
1351
1352 switch (MI.getOpcode()) {
1353 default:
1354 return MI.isAsCheapAsAMove();
1355
1356 case TargetOpcode::COPY:
1357 return isCheapCopy(MI, RI);
1358
1359 case AArch64::ADDWrs:
1360 case AArch64::ADDXrs:
1361 case AArch64::SUBWrs:
1362 case AArch64::SUBXrs:
1363 return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;
1364
1365 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
1366 // ORRXri, it is as cheap as MOV.
1367 // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
1368 case AArch64::MOVi32imm:
1369 return isCheapImmediate(MI, 32);
1370 case AArch64::MOVi64imm:
1371 return isCheapImmediate(MI, 64);
1372 }
1373}
1374
1375bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
1376 switch (MI.getOpcode()) {
1377 default:
1378 return false;
1379
1380 case AArch64::ADDWrs:
1381 case AArch64::ADDXrs:
1382 case AArch64::ADDSWrs:
1383 case AArch64::ADDSXrs: {
1384 unsigned Imm = MI.getOperand(3).getImm();
1385 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1386 if (ShiftVal == 0)
1387 return true;
1388 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
1389 }
1390
1391 case AArch64::ADDWrx:
1392 case AArch64::ADDXrx:
1393 case AArch64::ADDXrx64:
1394 case AArch64::ADDSWrx:
1395 case AArch64::ADDSXrx:
1396 case AArch64::ADDSXrx64: {
1397 unsigned Imm = MI.getOperand(3).getImm();
1398 switch (AArch64_AM::getArithExtendType(Imm)) {
1399 default:
1400 return false;
1401 case AArch64_AM::UXTB:
1402 case AArch64_AM::UXTH:
1403 case AArch64_AM::UXTW:
1404 case AArch64_AM::UXTX:
1405 return AArch64_AM::getArithShiftValue(Imm) <= 4;
1406 }
1407 }
1408
1409 case AArch64::SUBWrs:
1410 case AArch64::SUBSWrs: {
1411 unsigned Imm = MI.getOperand(3).getImm();
1412 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1413 return ShiftVal == 0 ||
1414 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
1415 }
1416
1417 case AArch64::SUBXrs:
1418 case AArch64::SUBSXrs: {
1419 unsigned Imm = MI.getOperand(3).getImm();
1420 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1421 return ShiftVal == 0 ||
1422 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
1423 }
1424
1425 case AArch64::SUBWrx:
1426 case AArch64::SUBXrx:
1427 case AArch64::SUBXrx64:
1428 case AArch64::SUBSWrx:
1429 case AArch64::SUBSXrx:
1430 case AArch64::SUBSXrx64: {
1431 unsigned Imm = MI.getOperand(3).getImm();
1432 switch (AArch64_AM::getArithExtendType(Imm)) {
1433 default:
1434 return false;
1435 case AArch64_AM::UXTB:
1436 case AArch64_AM::UXTH:
1437 case AArch64_AM::UXTW:
1438 case AArch64_AM::UXTX:
1439 return AArch64_AM::getArithShiftValue(Imm) == 0;
1440 }
1441 }
1442
1443 case AArch64::LDRBBroW:
1444 case AArch64::LDRBBroX:
1445 case AArch64::LDRBroW:
1446 case AArch64::LDRBroX:
1447 case AArch64::LDRDroW:
1448 case AArch64::LDRDroX:
1449 case AArch64::LDRHHroW:
1450 case AArch64::LDRHHroX:
1451 case AArch64::LDRHroW:
1452 case AArch64::LDRHroX:
1453 case AArch64::LDRQroW:
1454 case AArch64::LDRQroX:
1455 case AArch64::LDRSBWroW:
1456 case AArch64::LDRSBWroX:
1457 case AArch64::LDRSBXroW:
1458 case AArch64::LDRSBXroX:
1459 case AArch64::LDRSHWroW:
1460 case AArch64::LDRSHWroX:
1461 case AArch64::LDRSHXroW:
1462 case AArch64::LDRSHXroX:
1463 case AArch64::LDRSWroW:
1464 case AArch64::LDRSWroX:
1465 case AArch64::LDRSroW:
1466 case AArch64::LDRSroX:
1467 case AArch64::LDRWroW:
1468 case AArch64::LDRWroX:
1469 case AArch64::LDRXroW:
1470 case AArch64::LDRXroX:
1471 case AArch64::PRFMroW:
1472 case AArch64::PRFMroX:
1473 case AArch64::STRBBroW:
1474 case AArch64::STRBBroX:
1475 case AArch64::STRBroW:
1476 case AArch64::STRBroX:
1477 case AArch64::STRDroW:
1478 case AArch64::STRDroX:
1479 case AArch64::STRHHroW:
1480 case AArch64::STRHHroX:
1481 case AArch64::STRHroW:
1482 case AArch64::STRHroX:
1483 case AArch64::STRQroW:
1484 case AArch64::STRQroX:
1485 case AArch64::STRSroW:
1486 case AArch64::STRSroX:
1487 case AArch64::STRWroW:
1488 case AArch64::STRWroX:
1489 case AArch64::STRXroW:
1490 case AArch64::STRXroX: {
1491 unsigned IsSigned = MI.getOperand(3).getImm();
1492 return !IsSigned;
1493 }
1494 }
1495}
1496
1497bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1498 unsigned Opc = MI.getOpcode();
1499 switch (Opc) {
1500 default:
1501 return false;
1502 case AArch64::SEH_StackAlloc:
1503 case AArch64::SEH_SaveFPLR:
1504 case AArch64::SEH_SaveFPLR_X:
1505 case AArch64::SEH_SaveReg:
1506 case AArch64::SEH_SaveReg_X:
1507 case AArch64::SEH_SaveRegP:
1508 case AArch64::SEH_SaveRegP_X:
1509 case AArch64::SEH_SaveFReg:
1510 case AArch64::SEH_SaveFReg_X:
1511 case AArch64::SEH_SaveFRegP:
1512 case AArch64::SEH_SaveFRegP_X:
1513 case AArch64::SEH_SetFP:
1514 case AArch64::SEH_AddFP:
1515 case AArch64::SEH_Nop:
1516 case AArch64::SEH_PrologEnd:
1517 case AArch64::SEH_EpilogStart:
1518 case AArch64::SEH_EpilogEnd:
1519 case AArch64::SEH_PACSignLR:
1520 case AArch64::SEH_SaveAnyRegI:
1521 case AArch64::SEH_SaveAnyRegIP:
1522 case AArch64::SEH_SaveAnyRegQP:
1523 case AArch64::SEH_SaveAnyRegQPX:
1524 case AArch64::SEH_AllocZ:
1525 case AArch64::SEH_SaveZReg:
1526 case AArch64::SEH_SavePReg:
1527 return true;
1528 }
1529}
1530
1532 Register &SrcReg, Register &DstReg,
1533 unsigned &SubIdx) const {
1534 switch (MI.getOpcode()) {
1535 default:
1536 return false;
1537 case AArch64::SBFMXri: // aka sxtw
1538 case AArch64::UBFMXri: // aka uxtw
1539 // Check for the 32 -> 64 bit extension case, these instructions can do
1540 // much more.
1541 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1542 return false;
1543 // This is a signed or unsigned 32 -> 64 bit extension.
1544 SrcReg = MI.getOperand(1).getReg();
1545 DstReg = MI.getOperand(0).getReg();
1546 SubIdx = AArch64::sub_32;
1547 return true;
1548 }
1549}
1550
1552 const MachineInstr &MIa, const MachineInstr &MIb) const {
1554 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1555 int64_t OffsetA = 0, OffsetB = 0;
1556 TypeSize WidthA(0, false), WidthB(0, false);
1557 bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1558
1559 assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1560 assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1561
1564 return false;
1565
1566 // Retrieve the base, offset from the base and width. Width
1567 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
1568 // base are identical, and the offset of a lower memory access +
1569 // the width doesn't overlap the offset of a higher memory access,
1570 // then the memory accesses are different.
1571 // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1572 // are assumed to have the same scale (vscale).
1573 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1574 WidthA, TRI) &&
1575 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1576 WidthB, TRI)) {
1577 if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1578 OffsetAIsScalable == OffsetBIsScalable) {
1579 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1580 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1581 TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1582 if (LowWidth.isScalable() == OffsetAIsScalable &&
1583 LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1584 return true;
1585 }
1586 }
1587 return false;
1588}
1589
1591 const MachineBasicBlock *MBB,
1592 const MachineFunction &MF) const {
1594 return true;
1595
1596 // Do not move an instruction that can be recognized as a branch target.
1597 if (hasBTISemantics(MI))
1598 return true;
1599
1600 switch (MI.getOpcode()) {
1601 case AArch64::HINT:
1602 // CSDB hints are scheduling barriers.
1603 if (MI.getOperand(0).getImm() == 0x14)
1604 return true;
1605 break;
1606 case AArch64::DSB:
1607 case AArch64::ISB:
1608 // DSB and ISB also are scheduling barriers.
1609 return true;
1610 case AArch64::MSRpstatesvcrImm1:
1611 // SMSTART and SMSTOP are also scheduling barriers.
1612 return true;
1613 default:;
1614 }
1615 if (isSEHInstruction(MI))
1616 return true;
1617 auto Next = std::next(MI.getIterator());
1618 return Next != MBB->end() && Next->isCFIInstruction();
1619}
1620
1621/// analyzeCompare - For a comparison instruction, return the source registers
1622/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1623/// Return true if the comparison instruction can be analyzed.
1625 Register &SrcReg2, int64_t &CmpMask,
1626 int64_t &CmpValue) const {
1627 // The first operand can be a frame index where we'd normally expect a
1628 // register.
1629 // FIXME: Pass subregisters out of analyzeCompare
1630 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1631 if (!MI.getOperand(1).isReg() || MI.getOperand(1).getSubReg())
1632 return false;
1633
1634 switch (MI.getOpcode()) {
1635 default:
1636 break;
1637 case AArch64::PTEST_PP:
1638 case AArch64::PTEST_PP_ANY:
1639 case AArch64::PTEST_PP_FIRST:
1640 SrcReg = MI.getOperand(0).getReg();
1641 SrcReg2 = MI.getOperand(1).getReg();
1642 if (MI.getOperand(2).getSubReg())
1643 return false;
1644
1645 // Not sure about the mask and value for now...
1646 CmpMask = ~0;
1647 CmpValue = 0;
1648 return true;
1649 case AArch64::SUBSWrr:
1650 case AArch64::SUBSWrs:
1651 case AArch64::SUBSWrx:
1652 case AArch64::SUBSXrr:
1653 case AArch64::SUBSXrs:
1654 case AArch64::SUBSXrx:
1655 case AArch64::ADDSWrr:
1656 case AArch64::ADDSWrs:
1657 case AArch64::ADDSWrx:
1658 case AArch64::ADDSXrr:
1659 case AArch64::ADDSXrs:
1660 case AArch64::ADDSXrx:
1661 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1662 SrcReg = MI.getOperand(1).getReg();
1663 SrcReg2 = MI.getOperand(2).getReg();
1664
1665 // FIXME: Pass subregisters out of analyzeCompare
1666 if (MI.getOperand(2).getSubReg())
1667 return false;
1668
1669 CmpMask = ~0;
1670 CmpValue = 0;
1671 return true;
1672 case AArch64::SUBSWri:
1673 case AArch64::ADDSWri:
1674 case AArch64::SUBSXri:
1675 case AArch64::ADDSXri:
1676 SrcReg = MI.getOperand(1).getReg();
1677 SrcReg2 = 0;
1678 CmpMask = ~0;
1679 CmpValue = MI.getOperand(2).getImm();
1680 return true;
1681 case AArch64::ANDSWri:
1682 case AArch64::ANDSXri:
1683 // ANDS does not use the same encoding scheme as the other xxxS
1684 // instructions.
1685 SrcReg = MI.getOperand(1).getReg();
1686 SrcReg2 = 0;
1687 CmpMask = ~0;
1689 MI.getOperand(2).getImm(),
1690 MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1691 return true;
1692 }
1693
1694 return false;
1695}
1696
1698 MachineBasicBlock *MBB = Instr.getParent();
1699 assert(MBB && "Can't get MachineBasicBlock here");
1700 MachineFunction *MF = MBB->getParent();
1701 assert(MF && "Can't get MachineFunction here");
1704 MachineRegisterInfo *MRI = &MF->getRegInfo();
1705
1706 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1707 ++OpIdx) {
1708 MachineOperand &MO = Instr.getOperand(OpIdx);
1709 const TargetRegisterClass *OpRegCstraints =
1710 Instr.getRegClassConstraint(OpIdx, TII, TRI);
1711
1712 // If there's no constraint, there's nothing to do.
1713 if (!OpRegCstraints)
1714 continue;
1715 // If the operand is a frame index, there's nothing to do here.
1716 // A frame index operand will resolve correctly during PEI.
1717 if (MO.isFI())
1718 continue;
1719
1720 assert(MO.isReg() &&
1721 "Operand has register constraints without being a register!");
1722
1723 Register Reg = MO.getReg();
1724 if (Reg.isPhysical()) {
1725 if (!OpRegCstraints->contains(Reg))
1726 return false;
1727 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1728 !MRI->constrainRegClass(Reg, OpRegCstraints))
1729 return false;
1730 }
1731
1732 return true;
1733}
1734
1735/// Return the opcode that does not set flags when possible - otherwise
1736/// return the original opcode. The caller is responsible to do the actual
1737/// substitution and legality checking.
1739 // Don't convert all compare instructions, because for some the zero register
1740 // encoding becomes the sp register.
1741 bool MIDefinesZeroReg = false;
1742 if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1743 MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
1744 MIDefinesZeroReg = true;
1745
1746 switch (MI.getOpcode()) {
1747 default:
1748 return MI.getOpcode();
1749 case AArch64::ADDSWrr:
1750 return AArch64::ADDWrr;
1751 case AArch64::ADDSWri:
1752 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1753 case AArch64::ADDSWrs:
1754 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1755 case AArch64::ADDSWrx:
1756 return AArch64::ADDWrx;
1757 case AArch64::ADDSXrr:
1758 return AArch64::ADDXrr;
1759 case AArch64::ADDSXri:
1760 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1761 case AArch64::ADDSXrs:
1762 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1763 case AArch64::ADDSXrx:
1764 return AArch64::ADDXrx;
1765 case AArch64::SUBSWrr:
1766 return AArch64::SUBWrr;
1767 case AArch64::SUBSWri:
1768 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1769 case AArch64::SUBSWrs:
1770 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1771 case AArch64::SUBSWrx:
1772 return AArch64::SUBWrx;
1773 case AArch64::SUBSXrr:
1774 return AArch64::SUBXrr;
1775 case AArch64::SUBSXri:
1776 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1777 case AArch64::SUBSXrs:
1778 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1779 case AArch64::SUBSXrx:
1780 return AArch64::SUBXrx;
1781 }
1782}
1783
/// Bitmask selecting which kinds of access a query should look for.
/// The values are tested with bitwise AND, so AK_All matches both kinds.
enum AccessKind {
  AK_Write = 0x01,            ///< Look for writes.
  AK_Read = 0x10,             ///< Look for reads.
  AK_All = AK_Write | AK_Read ///< Look for reads and writes (0x11).
};
1785
1786/// True when condition flags are accessed (either by writing or reading)
1787/// on the instruction trace starting at From and ending at To.
1788///
1789/// Note: If From and To are from different blocks it's assumed CC are accessed
1790/// on the path.
1793 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1794 // Early exit if To is at the beginning of the BB.
1795 if (To == To->getParent()->begin())
1796 return true;
1797
1798 // Check whether the instructions are in the same basic block
1799 // If not, assume the condition flags might get modified somewhere.
1800 if (To->getParent() != From->getParent())
1801 return true;
1802
1803 // From must be above To.
1804 assert(std::any_of(
1805 ++To.getReverse(), To->getParent()->rend(),
1806 [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1807
1808 // We iterate backward starting at \p To until we hit \p From.
1809 for (const MachineInstr &Instr :
1811 if (((AccessToCheck & AK_Write) &&
1812 Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1813 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1814 return true;
1815 }
1816 return false;
1817}
1818
1819std::optional<unsigned>
1820AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
1821 MachineInstr *Pred,
1822 const MachineRegisterInfo *MRI) const {
1823 unsigned MaskOpcode = Mask->getOpcode();
1824 unsigned PredOpcode = Pred->getOpcode();
1825 bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1826 bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1827
1828 if (PredIsWhileLike) {
1829 // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
1830 // instruction and the condition is "any" since WHILcc does an implicit
1831 // PTEST(ALL, PG) check and PG is always a subset of ALL.
1832 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1833 return PredOpcode;
1834
1835 // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1836 // redundant since WHILE performs an implicit PTEST with an all active
1837 // mask.
1838 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1839 getElementSizeForOpcode(MaskOpcode) ==
1840 getElementSizeForOpcode(PredOpcode))
1841 return PredOpcode;
1842
1843 // For PTEST_FIRST(PTRUE_ALL, WHILE), the PTEST_FIRST is redundant since
1844 // WHILEcc performs an implicit PTEST with an all active mask, setting
1845 // the N flag as the PTEST_FIRST would.
1846 if (PTest->getOpcode() == AArch64::PTEST_PP_FIRST &&
1847 isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31)
1848 return PredOpcode;
1849
1850 return {};
1851 }
1852
1853 if (PredIsPTestLike) {
1854 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1855 // instruction that sets the flags as PTEST would and the condition is
1856 // "any" since PG is always a subset of the governing predicate of the
1857 // ptest-like instruction.
1858 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1859 return PredOpcode;
1860
1861 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1862
1863 // If the PTEST like instruction's general predicate is not `Mask`, attempt
1864 // to look through a copy and try again. This is because some instructions
1865 // take a predicate whose register class is a subset of its result class.
1866 if (Mask != PTestLikeMask && PTestLikeMask->isFullCopy() &&
1867 PTestLikeMask->getOperand(1).getReg().isVirtual())
1868 PTestLikeMask =
1869 MRI->getUniqueVRegDef(PTestLikeMask->getOperand(1).getReg());
1870
1871 // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
1872 // the element size matches and either the PTEST_LIKE instruction uses
1873 // the same all active mask or the condition is "any".
1874 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1875 getElementSizeForOpcode(MaskOpcode) ==
1876 getElementSizeForOpcode(PredOpcode)) {
1877 if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1878 return PredOpcode;
1879 }
1880
1881 // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1882 // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1883 // on 8-bit predicates like the PTEST. Otherwise, for instructions like
1884 // compare that also support 16/32/64-bit predicates, the implicit PTEST
1885 // performed by the compare could consider fewer lanes for these element
1886 // sizes.
1887 //
1888 // For example, consider
1889 //
1890 // ptrue p0.b ; P0=1111-1111-1111-1111
1891 // index z0.s, #0, #1 ; Z0=<0,1,2,3>
1892 // index z1.s, #1, #1 ; Z1=<1,2,3,4>
1893 // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001
1894 // ; ^ last active
1895 // ptest p0, p1.b ; P1=0001-0001-0001-0001
1896 // ; ^ last active
1897 //
1898 // where the compare generates a canonical all active 32-bit predicate
1899 // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1900 // active flag, whereas the PTEST instruction with the same mask doesn't.
1901 // For PTEST_ANY this doesn't apply as the flags in this case would be
1902 // identical regardless of element size.
1903 uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1904 if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
1905 PTest->getOpcode() == AArch64::PTEST_PP_ANY))
1906 return PredOpcode;
1907
1908 return {};
1909 }
1910
1911 // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1912 // opcode so the PTEST becomes redundant.
1913 switch (PredOpcode) {
1914 case AArch64::AND_PPzPP:
1915 case AArch64::BIC_PPzPP:
1916 case AArch64::EOR_PPzPP:
1917 case AArch64::NAND_PPzPP:
1918 case AArch64::NOR_PPzPP:
1919 case AArch64::ORN_PPzPP:
1920 case AArch64::ORR_PPzPP:
1921 case AArch64::BRKA_PPzP:
1922 case AArch64::BRKPA_PPzPP:
1923 case AArch64::BRKB_PPzP:
1924 case AArch64::BRKPB_PPzPP:
1925 case AArch64::RDFFR_PPz: {
1926 // Check to see if our mask is the same. If not the resulting flag bits
1927 // may be different and we can't remove the ptest.
1928 auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1929 if (Mask != PredMask)
1930 return {};
1931 break;
1932 }
1933 case AArch64::BRKN_PPzP: {
1934 // BRKN uses an all active implicit mask to set flags unlike the other
1935 // flag-setting instructions.
1936 // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1937 if ((MaskOpcode != AArch64::PTRUE_B) ||
1938 (Mask->getOperand(1).getImm() != 31))
1939 return {};
1940 break;
1941 }
1942 case AArch64::PTRUE_B:
1943 // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1944 break;
1945 default:
1946 // Bail out if we don't recognize the input
1947 return {};
1948 }
1949
1950 return convertToFlagSettingOpc(PredOpcode);
1951}
1952
1953/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1954/// operation which could set the flags in an identical manner
1955bool AArch64InstrInfo::optimizePTestInstr(
1956 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1957 const MachineRegisterInfo *MRI) const {
1958 auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1959 auto *Pred = MRI->getUniqueVRegDef(PredReg);
1960
1961 if (Pred->isCopy() && PTest->getOpcode() == AArch64::PTEST_PP_FIRST) {
1962 // Instructions which return a multi-vector (e.g. WHILECC_x2) require copies
1963 // before the branch to extract each subregister.
1964 auto Op = Pred->getOperand(1);
1965 if (Op.isReg() && Op.getReg().isVirtual() &&
1966 Op.getSubReg() == AArch64::psub0)
1967 Pred = MRI->getUniqueVRegDef(Op.getReg());
1968 }
1969
1970 unsigned PredOpcode = Pred->getOpcode();
1971 auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1972 if (!NewOp)
1973 return false;
1974
1975 const TargetRegisterInfo *TRI = &getRegisterInfo();
1976
1977 // If another instruction between Pred and PTest accesses flags, don't remove
1978 // the ptest or update the earlier instruction to modify them.
1979 if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1980 return false;
1981
1982 // If we pass all the checks, it's safe to remove the PTEST and use the flags
1983 // as they are prior to PTEST. Sometimes this requires the tested PTEST
1984 // operand to be replaced with an equivalent instruction that also sets the
1985 // flags.
1986 PTest->eraseFromParent();
1987 if (*NewOp != PredOpcode) {
1988 Pred->setDesc(get(*NewOp));
1989 bool succeeded = UpdateOperandRegClass(*Pred);
1990 (void)succeeded;
1991 assert(succeeded && "Operands have incompatible register classes!");
1992 Pred->addRegisterDefined(AArch64::NZCV, TRI);
1993 }
1994
1995 // Ensure that the flags def is live.
1996 if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1997 unsigned i = 0, e = Pred->getNumOperands();
1998 for (; i != e; ++i) {
1999 MachineOperand &MO = Pred->getOperand(i);
2000 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
2001 MO.setIsDead(false);
2002 break;
2003 }
2004 }
2005 }
2006 return true;
2007}
2008
2009/// Try to optimize a compare instruction. A compare instruction is an
2010/// instruction which produces AArch64::NZCV. It can be truly compare
2011/// instruction
2012/// when there are no uses of its destination register.
2013///
2014/// The following steps are tried in order:
2015/// 1. Convert CmpInstr into an unconditional version.
2016/// 2. Remove CmpInstr if above there is an instruction producing a needed
2017/// condition code or an instruction which can be converted into such an
2018/// instruction.
2019/// Only comparison with zero is supported.
2021 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
2022 int64_t CmpValue, const MachineRegisterInfo *MRI) const {
2023 assert(CmpInstr.getParent());
2024 assert(MRI);
2025
2026 // Replace SUBSWrr with SUBWrr if NZCV is not used.
2027 int DeadNZCVIdx =
2028 CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
2029 if (DeadNZCVIdx != -1) {
2030 if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
2031 CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
2032 CmpInstr.eraseFromParent();
2033 return true;
2034 }
2035 unsigned Opc = CmpInstr.getOpcode();
2036 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
2037 if (NewOpc == Opc)
2038 return false;
2039 const MCInstrDesc &MCID = get(NewOpc);
2040 CmpInstr.setDesc(MCID);
2041 CmpInstr.removeOperand(DeadNZCVIdx);
2042 bool succeeded = UpdateOperandRegClass(CmpInstr);
2043 (void)succeeded;
2044 assert(succeeded && "Some operands reg class are incompatible!");
2045 return true;
2046 }
2047
2048 if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
2049 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY ||
2050 CmpInstr.getOpcode() == AArch64::PTEST_PP_FIRST)
2051 return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
2052
2053 if (SrcReg2 != 0)
2054 return false;
2055
2056 // CmpInstr is a Compare instruction if destination register is not used.
2057 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
2058 return false;
2059
2060 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
2061 return true;
2062 return (CmpValue == 0 || CmpValue == 1) &&
2063 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
2064}
2065
2066/// Get opcode of S version of Instr.
2067/// If Instr is S version its opcode is returned.
2068/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
2069/// or we are not interested in it.
2070static unsigned sForm(MachineInstr &Instr) {
2071 switch (Instr.getOpcode()) {
2072 default:
2073 return AArch64::INSTRUCTION_LIST_END;
2074
2075 case AArch64::ADDSWrr:
2076 case AArch64::ADDSWri:
2077 case AArch64::ADDSXrr:
2078 case AArch64::ADDSXri:
2079 case AArch64::ADDSWrx:
2080 case AArch64::ADDSXrx:
2081 case AArch64::ADDSWrs:
2082 case AArch64::ADDSXrs:
2083 case AArch64::SUBSWrr:
2084 case AArch64::SUBSWri:
2085 case AArch64::SUBSWrx:
2086 case AArch64::SUBSWrs:
2087 case AArch64::SUBSXrr:
2088 case AArch64::SUBSXri:
2089 case AArch64::SUBSXrx:
2090 case AArch64::SUBSXrs:
2091 case AArch64::ANDSWri:
2092 case AArch64::ANDSWrr:
2093 case AArch64::ANDSWrs:
2094 case AArch64::ANDSXri:
2095 case AArch64::ANDSXrr:
2096 case AArch64::ANDSXrs:
2097 case AArch64::BICSWrr:
2098 case AArch64::BICSXrr:
2099 case AArch64::BICSWrs:
2100 case AArch64::BICSXrs:
2101 case AArch64::ADCSWr:
2102 case AArch64::ADCSXr:
2103 case AArch64::SBCSWr:
2104 case AArch64::SBCSXr:
2105 return Instr.getOpcode();
2106
2107 case AArch64::ADDWrr:
2108 return AArch64::ADDSWrr;
2109 case AArch64::ADDWri:
2110 return AArch64::ADDSWri;
2111 case AArch64::ADDXrr:
2112 return AArch64::ADDSXrr;
2113 case AArch64::ADDXri:
2114 return AArch64::ADDSXri;
2115 case AArch64::ADDWrx:
2116 return AArch64::ADDSWrx;
2117 case AArch64::ADDXrx:
2118 return AArch64::ADDSXrx;
2119 case AArch64::ADDWrs:
2120 return AArch64::ADDSWrs;
2121 case AArch64::ADDXrs:
2122 return AArch64::ADDSXrs;
2123 case AArch64::ADCWr:
2124 return AArch64::ADCSWr;
2125 case AArch64::ADCXr:
2126 return AArch64::ADCSXr;
2127 case AArch64::SUBWrr:
2128 return AArch64::SUBSWrr;
2129 case AArch64::SUBWri:
2130 return AArch64::SUBSWri;
2131 case AArch64::SUBXrr:
2132 return AArch64::SUBSXrr;
2133 case AArch64::SUBXri:
2134 return AArch64::SUBSXri;
2135 case AArch64::SUBWrx:
2136 return AArch64::SUBSWrx;
2137 case AArch64::SUBXrx:
2138 return AArch64::SUBSXrx;
2139 case AArch64::SUBWrs:
2140 return AArch64::SUBSWrs;
2141 case AArch64::SUBXrs:
2142 return AArch64::SUBSXrs;
2143 case AArch64::SBCWr:
2144 return AArch64::SBCSWr;
2145 case AArch64::SBCXr:
2146 return AArch64::SBCSXr;
2147 case AArch64::ANDWri:
2148 return AArch64::ANDSWri;
2149 case AArch64::ANDXri:
2150 return AArch64::ANDSXri;
2151 case AArch64::ANDWrr:
2152 return AArch64::ANDSWrr;
2153 case AArch64::ANDWrs:
2154 return AArch64::ANDSWrs;
2155 case AArch64::ANDXrr:
2156 return AArch64::ANDSXrr;
2157 case AArch64::ANDXrs:
2158 return AArch64::ANDSXrs;
2159 case AArch64::BICWrr:
2160 return AArch64::BICSWrr;
2161 case AArch64::BICXrr:
2162 return AArch64::BICSXrr;
2163 case AArch64::BICWrs:
2164 return AArch64::BICSWrs;
2165 case AArch64::BICXrs:
2166 return AArch64::BICSXrs;
2167 }
2168}
2169
2170/// Check if AArch64::NZCV should be alive in successors of MBB.
2172 for (auto *BB : MBB->successors())
2173 if (BB->isLiveIn(AArch64::NZCV))
2174 return true;
2175 return false;
2176}
2177
2178/// \returns The condition code operand index for \p Instr if it is a branch
2179/// or select and -1 otherwise.
2180int AArch64InstrInfo::findCondCodeUseOperandIdxForBranchOrSelect(
2181 const MachineInstr &Instr) {
2182 switch (Instr.getOpcode()) {
2183 default:
2184 return -1;
2185
2186 case AArch64::Bcc: {
2187 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2188 assert(Idx >= 2);
2189 return Idx - 2;
2190 }
2191
2192 case AArch64::CSINVWr:
2193 case AArch64::CSINVXr:
2194 case AArch64::CSINCWr:
2195 case AArch64::CSINCXr:
2196 case AArch64::CSELWr:
2197 case AArch64::CSELXr:
2198 case AArch64::CSNEGWr:
2199 case AArch64::CSNEGXr:
2200 case AArch64::FCSELSrrr:
2201 case AArch64::FCSELDrrr: {
2202 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2203 assert(Idx >= 1);
2204 return Idx - 1;
2205 }
2206 }
2207}
2208
2209/// Find a condition code used by the instruction.
2210/// Returns AArch64CC::Invalid if either the instruction does not use condition
2211/// codes or we don't optimize CmpInstr in the presence of such instructions.
2213 int CCIdx =
2214 AArch64InstrInfo::findCondCodeUseOperandIdxForBranchOrSelect(Instr);
2215 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
2216 Instr.getOperand(CCIdx).getImm())
2218}
2219
2222 UsedNZCV UsedFlags;
2223 switch (CC) {
2224 default:
2225 break;
2226
2227 case AArch64CC::EQ: // Z set
2228 case AArch64CC::NE: // Z clear
2229 UsedFlags.Z = true;
2230 break;
2231
2232 case AArch64CC::HI: // Z clear and C set
2233 case AArch64CC::LS: // Z set or C clear
2234 UsedFlags.Z = true;
2235 [[fallthrough]];
2236 case AArch64CC::HS: // C set
2237 case AArch64CC::LO: // C clear
2238 UsedFlags.C = true;
2239 break;
2240
2241 case AArch64CC::MI: // N set
2242 case AArch64CC::PL: // N clear
2243 UsedFlags.N = true;
2244 break;
2245
2246 case AArch64CC::VS: // V set
2247 case AArch64CC::VC: // V clear
2248 UsedFlags.V = true;
2249 break;
2250
2251 case AArch64CC::GT: // Z clear, N and V the same
2252 case AArch64CC::LE: // Z set, N and V differ
2253 UsedFlags.Z = true;
2254 [[fallthrough]];
2255 case AArch64CC::GE: // N and V the same
2256 case AArch64CC::LT: // N and V differ
2257 UsedFlags.N = true;
2258 UsedFlags.V = true;
2259 break;
2260 }
2261 return UsedFlags;
2262}
2263
2264/// \returns Conditions flags used after \p CmpInstr in its MachineBB if NZCV
2265/// flags are not alive in successors of the same \p CmpInstr and \p MI parent.
2266/// \returns std::nullopt otherwise.
2267///
2268/// Collect instructions using that flags in \p CCUseInstrs if provided.
2269std::optional<UsedNZCV>
2271 const TargetRegisterInfo &TRI,
2272 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
2273 MachineBasicBlock *CmpParent = CmpInstr.getParent();
2274 if (MI.getParent() != CmpParent)
2275 return std::nullopt;
2276
2277 if (areCFlagsAliveInSuccessors(CmpParent))
2278 return std::nullopt;
2279
2280 UsedNZCV NZCVUsedAfterCmp;
2282 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
2283 if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
2285 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
2286 return std::nullopt;
2287 NZCVUsedAfterCmp |= getUsedNZCV(CC);
2288 if (CCUseInstrs)
2289 CCUseInstrs->push_back(&Instr);
2290 }
2291 if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
2292 break;
2293 }
2294 return NZCVUsedAfterCmp;
2295}
2296
2297static bool isADDSRegImm(unsigned Opcode) {
2298 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
2299}
2300
2301static bool isSUBSRegImm(unsigned Opcode) {
2302 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
2303}
2304
2306 unsigned Opc = sForm(MI);
2307 switch (Opc) {
2308 case AArch64::ANDSWri:
2309 case AArch64::ANDSWrr:
2310 case AArch64::ANDSWrs:
2311 case AArch64::ANDSXri:
2312 case AArch64::ANDSXrr:
2313 case AArch64::ANDSXrs:
2314 case AArch64::BICSWrr:
2315 case AArch64::BICSXrr:
2316 case AArch64::BICSWrs:
2317 case AArch64::BICSXrs:
2318 return true;
2319 default:
2320 return false;
2321 }
2322}
2323
2324/// Check if CmpInstr can be substituted by MI.
2325///
2326/// CmpInstr can be substituted:
2327/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2328/// - and, MI and CmpInstr are from the same MachineBB
2329/// - and, condition flags are not alive in successors of the CmpInstr parent
2330/// - and, if MI opcode is the S form there must be no defs of flags between
2331/// MI and CmpInstr
2332/// or if MI opcode is not the S form there must be neither defs of flags
2333/// nor uses of flags between MI and CmpInstr.
2334/// - and, C is not used after CmpInstr; CmpInstr's C is from adds/subs #0 on
2335/// SrcReg and can differ from MI (e.g. carry out of ADCS/SBCS).
2336/// - and, V is not used after CmpInstr unless MI is AND/BIC (V cleared) or MI
2337/// has NoSWrap (overflow is poison and the fold is still safe).
2339 const TargetRegisterInfo &TRI) {
2340 // MI is an opcode sForm maps (add/sub/adc/sbc/and/bic and their S forms).
2341 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
2342
2343 const unsigned CmpOpcode = CmpInstr.getOpcode();
2344 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
2345 return false;
2346
2347 assert((CmpInstr.getOperand(2).isImm() &&
2348 CmpInstr.getOperand(2).getImm() == 0) &&
2349 "Caller guarantees that CmpInstr compares with constant 0");
2350
2351 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
2352 if (!NZVCUsed || NZVCUsed->C)
2353 return false;
2354
2355 // CmpInstr is ADDS/SUBS with immediate 0 on SrcReg (compare SrcReg to zero).
2356 // After the fold, users see NZCV from MI (or its S form), not from CmpInstr.
2357 // N/Z match CmpInstr for the value in SrcReg; C/V need not match in general
2358 // (e.g. ADCS vs adds #0), so we require C unused after CmpInstr and gate V
2359 // as below. NoSWrap makes signed overflow poison; AND/BIC clear V.
2360 if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI))
2361 return false;
2362
2363 AccessKind AccessToCheck = AK_Write;
2364 if (sForm(MI) != MI.getOpcode())
2365 AccessToCheck = AK_All;
2366 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
2367}
2368
2369/// Substitute an instruction comparing to zero with another instruction
2370/// which produces needed condition flags.
2371///
2372/// Return true on success.
2373bool AArch64InstrInfo::substituteCmpToZero(
2374 MachineInstr &CmpInstr, unsigned SrcReg,
2375 const MachineRegisterInfo &MRI) const {
2376 // Get the unique definition of SrcReg.
2377 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2378 if (!MI)
2379 return false;
2380
2381 const TargetRegisterInfo &TRI = getRegisterInfo();
2382
2383 unsigned NewOpc = sForm(*MI);
2384 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
2385 return false;
2386
2387 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
2388 return false;
2389
2390 // Update the instruction to set NZCV.
2391 MI->setDesc(get(NewOpc));
2392 CmpInstr.eraseFromParent();
2394 (void)succeeded;
2395 assert(succeeded && "Some operands reg class are incompatible!");
2396 MI->addRegisterDefined(AArch64::NZCV, &TRI);
2397 return true;
2398}
2399
2400/// \returns True if \p CmpInstr can be removed.
2401///
2402/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
2403/// codes used in \p CCUseInstrs must be inverted.
2405 int CmpValue, const TargetRegisterInfo &TRI,
2407 bool &IsInvertCC) {
2408 assert((CmpValue == 0 || CmpValue == 1) &&
2409 "Only comparisons to 0 or 1 considered for removal!");
2410
2411 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
2412 unsigned MIOpc = MI.getOpcode();
2413 if (MIOpc == AArch64::CSINCWr) {
2414 if (MI.getOperand(1).getReg() != AArch64::WZR ||
2415 MI.getOperand(2).getReg() != AArch64::WZR)
2416 return false;
2417 } else if (MIOpc == AArch64::CSINCXr) {
2418 if (MI.getOperand(1).getReg() != AArch64::XZR ||
2419 MI.getOperand(2).getReg() != AArch64::XZR)
2420 return false;
2421 } else {
2422 return false;
2423 }
2425 if (MICC == AArch64CC::Invalid)
2426 return false;
2427
2428 // NZCV needs to be defined
2429 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
2430 return false;
2431
2432 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
2433 const unsigned CmpOpcode = CmpInstr.getOpcode();
2434 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
2435 if (CmpValue && !IsSubsRegImm)
2436 return false;
2437 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
2438 return false;
2439
2440 // MI conditions allowed: eq, ne, mi, pl
2441 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
2442 if (MIUsedNZCV.C || MIUsedNZCV.V)
2443 return false;
2444
2445 std::optional<UsedNZCV> NZCVUsedAfterCmp =
2446 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
2447 // Condition flags are not used in CmpInstr basic block successors and only
2448 // Z or N flags allowed to be used after CmpInstr within its basic block
2449 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
2450 return false;
2451 // Z or N flag used after CmpInstr must correspond to the flag used in MI
2452 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
2453 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
2454 return false;
2455 // If CmpInstr is comparison to zero MI conditions are limited to eq, ne
2456 if (MIUsedNZCV.N && !CmpValue)
2457 return false;
2458
2459 // There must be no defs of flags between MI and CmpInstr
2460 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
2461 return false;
2462
2463 // Condition code is inverted in the following cases:
2464 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2465 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
2466 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
2467 (!CmpValue && MICC == AArch64CC::NE);
2468 return true;
2469}
2470
2471/// Remove comparison in csinc-cmp sequence
2472///
2473/// Examples:
2474/// 1. \code
2475/// csinc w9, wzr, wzr, ne
2476/// cmp w9, #0
2477/// b.eq
2478/// \endcode
2479/// to
2480/// \code
2481/// csinc w9, wzr, wzr, ne
2482/// b.ne
2483/// \endcode
2484///
2485/// 2. \code
2486/// csinc x2, xzr, xzr, mi
2487/// cmp x2, #1
2488/// b.pl
2489/// \endcode
2490/// to
2491/// \code
2492/// csinc x2, xzr, xzr, mi
2493/// b.pl
2494/// \endcode
2495///
2496/// \param CmpInstr comparison instruction
2497/// \return True when comparison removed
2498bool AArch64InstrInfo::removeCmpToZeroOrOne(
2499 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
2500 const MachineRegisterInfo &MRI) const {
2501 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2502 if (!MI)
2503 return false;
2504 const TargetRegisterInfo &TRI = getRegisterInfo();
2505 SmallVector<MachineInstr *, 4> CCUseInstrs;
2506 bool IsInvertCC = false;
2507 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
2508 IsInvertCC))
2509 return false;
2510 // Make transformation
2511 CmpInstr.eraseFromParent();
2512 if (IsInvertCC) {
2513 // Invert condition codes in CmpInstr CC users
2514 for (MachineInstr *CCUseInstr : CCUseInstrs) {
2515 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
2516 assert(Idx >= 0 && "Unexpected instruction using CC.");
2517 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
2519 static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
2520 CCOperand.setImm(CCUse);
2521 }
2522 }
2523 return true;
2524}
2525
2526bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2527 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
2528 MI.getOpcode() != AArch64::CATCHRET)
2529 return false;
2530
2531 MachineBasicBlock &MBB = *MI.getParent();
2532 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
2533 auto TRI = Subtarget.getRegisterInfo();
2534 DebugLoc DL = MI.getDebugLoc();
2535
2536 if (MI.getOpcode() == AArch64::CATCHRET) {
2537 // Skip to the first instruction before the epilog.
2538 const TargetInstrInfo *TII =
2540 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
2542 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
2543 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
2544 FirstEpilogSEH != MBB.begin())
2545 FirstEpilogSEH = std::prev(FirstEpilogSEH);
2546 if (FirstEpilogSEH != MBB.begin())
2547 FirstEpilogSEH = std::next(FirstEpilogSEH);
2548 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
2549 .addReg(AArch64::X0, RegState::Define)
2550 .addMBB(TargetMBB);
2551 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
2552 .addReg(AArch64::X0, RegState::Define)
2553 .addReg(AArch64::X0)
2554 .addMBB(TargetMBB)
2555 .addImm(0);
2556 TargetMBB->setMachineBlockAddressTaken();
2557 return true;
2558 }
2559
2560 Register Reg = MI.getOperand(0).getReg();
2562 if (M.getStackProtectorGuard() == "sysreg") {
2563 const AArch64SysReg::SysReg *SrcReg =
2564 AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
2565 if (!SrcReg)
2566 report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
2567
2568 // mrs xN, sysreg
2569 BuildMI(MBB, MI, DL, get(AArch64::MRS))
2571 .addImm(SrcReg->Encoding);
2572 int Offset = M.getStackProtectorGuardOffset();
2573 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2574 // ldr xN, [xN, #offset]
2575 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2576 .addDef(Reg)
2578 .addImm(Offset / 8);
2579 } else if (Offset >= -256 && Offset <= 255) {
2580 // ldur xN, [xN, #offset]
2581 BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2582 .addDef(Reg)
2584 .addImm(Offset);
2585 } else if (Offset >= -4095 && Offset <= 4095) {
2586 if (Offset > 0) {
2587 // add xN, xN, #offset
2588 BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2589 .addDef(Reg)
2591 .addImm(Offset)
2592 .addImm(0);
2593 } else {
2594 // sub xN, xN, #offset
2595 BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2596 .addDef(Reg)
2598 .addImm(-Offset)
2599 .addImm(0);
2600 }
2601 // ldr xN, [xN]
2602 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2603 .addDef(Reg)
2605 .addImm(0);
2606 } else {
2607 // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2608 // than 23760.
2609 // It might be nice to use AArch64::MOVi32imm here, which would get
2610 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2611 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2612 // AArch64FrameLowering might help us find such a scratch register
2613 // though. If we failed to find a scratch register, we could emit a
2614 // stream of add instructions to build up the immediate. Or, we could try
2615 // to insert a AArch64::MOVi32imm before register allocation so that we
2616 // didn't need to scavenge for a scratch register.
2617 report_fatal_error("Unable to encode Stack Protector Guard Offset");
2618 }
2619 MBB.erase(MI);
2620 return true;
2621 }
2622
2623 const GlobalValue *GV =
2624 cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2625 const TargetMachine &TM = MBB.getParent()->getTarget();
2626 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2627 const unsigned char MO_NC = AArch64II::MO_NC;
2628
2629 unsigned GuardWidth = M.getStackProtectorGuardValueWidth().value_or(
2630 Subtarget.isTargetILP32() ? 4 : 8);
2631 if (GuardWidth != 4 && GuardWidth != 8)
2632 report_fatal_error("Unsupported stack protector value width");
2633 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2634 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2635 .addGlobalAddress(GV, 0, OpFlags);
2636 if (GuardWidth == 4) {
2637 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2638 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2639 .addDef(Reg32, RegState::Dead)
2641 .addImm(0)
2642 .addMemOperand(*MI.memoperands_begin())
2644 } else {
2645 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2647 .addImm(0)
2648 .addMemOperand(*MI.memoperands_begin());
2649 }
2650 } else if (TM.getCodeModel() == CodeModel::Large) {
2651 if (GuardWidth == 4)
2652 report_fatal_error("Large code model with 4-byte stack protector not yet "
2653 "supported");
2654 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2655 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2656 .addImm(0);
2657 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2659 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2660 .addImm(16);
2661 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2663 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2664 .addImm(32);
2665 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2668 .addImm(48);
2669 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2671 .addImm(0)
2672 .addMemOperand(*MI.memoperands_begin());
2673 } else {
2674 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2675 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2676 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2677 if (GuardWidth == 4) {
2678 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2679 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2680 .addDef(Reg32, RegState::Dead)
2682 .addGlobalAddress(GV, 0, LoFlags)
2683 .addMemOperand(*MI.memoperands_begin())
2685 } else {
2686 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2688 .addGlobalAddress(GV, 0, LoFlags)
2689 .addMemOperand(*MI.memoperands_begin());
2690 }
2691 }
2692
2693 MBB.erase(MI);
2694
2695 return true;
2696}
2697
2698// Return true if this instruction simply sets its single destination register
2699// to zero. This is equivalent to a register rename of the zero-register.
2701 switch (MI.getOpcode()) {
2702 default:
2703 break;
2704 case AArch64::MOVZWi:
2705 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2706 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2707 assert(MI.getDesc().getNumOperands() == 3 &&
2708 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2709 return true;
2710 }
2711 break;
2712 case AArch64::ANDWri: // and Rd, Rzr, #imm
2713 return MI.getOperand(1).getReg() == AArch64::WZR;
2714 case AArch64::ANDXri:
2715 return MI.getOperand(1).getReg() == AArch64::XZR;
2716 case TargetOpcode::COPY:
2717 return MI.getOperand(1).getReg() == AArch64::WZR;
2718 }
2719 return false;
2720}
2721
2722// Return true if this instruction simply renames a general register without
2723// modifying bits.
2725 switch (MI.getOpcode()) {
2726 default:
2727 break;
2728 case TargetOpcode::COPY: {
2729 // GPR32 copies will by lowered to ORRXrs
2730 Register DstReg = MI.getOperand(0).getReg();
2731 return (AArch64::GPR32RegClass.contains(DstReg) ||
2732 AArch64::GPR64RegClass.contains(DstReg));
2733 }
2734 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2735 if (MI.getOperand(1).getReg() == AArch64::XZR) {
2736 assert(MI.getDesc().getNumOperands() == 4 &&
2737 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2738 return true;
2739 }
2740 break;
2741 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2742 if (MI.getOperand(2).getImm() == 0) {
2743 assert(MI.getDesc().getNumOperands() == 4 &&
2744 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2745 return true;
2746 }
2747 break;
2748 }
2749 return false;
2750}
2751
2752// Return true if this instruction simply renames a general register without
2753// modifying bits.
2755 switch (MI.getOpcode()) {
2756 default:
2757 break;
2758 case TargetOpcode::COPY: {
2759 Register DstReg = MI.getOperand(0).getReg();
2760 return AArch64::FPR128RegClass.contains(DstReg);
2761 }
2762 case AArch64::ORRv16i8:
2763 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2764 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2765 "invalid ORRv16i8 operands");
2766 return true;
2767 }
2768 break;
2769 }
2770 return false;
2771}
2772
2773static bool isFrameLoadOpcode(int Opcode) {
2774 switch (Opcode) {
2775 default:
2776 return false;
2777 case AArch64::LDRWui:
2778 case AArch64::LDRXui:
2779 case AArch64::LDRBui:
2780 case AArch64::LDRHui:
2781 case AArch64::LDRSui:
2782 case AArch64::LDRDui:
2783 case AArch64::LDRQui:
2784 case AArch64::LDR_PXI:
2785 return true;
2786 }
2787}
2788
2790 int &FrameIndex) const {
2791 if (!isFrameLoadOpcode(MI.getOpcode()))
2792 return Register();
2793
2794 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2795 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2796 FrameIndex = MI.getOperand(1).getIndex();
2797 return MI.getOperand(0).getReg();
2798 }
2799 return Register();
2800}
2801
2802static bool isFrameStoreOpcode(int Opcode) {
2803 switch (Opcode) {
2804 default:
2805 return false;
2806 case AArch64::STRWui:
2807 case AArch64::STRXui:
2808 case AArch64::STRBui:
2809 case AArch64::STRHui:
2810 case AArch64::STRSui:
2811 case AArch64::STRDui:
2812 case AArch64::STRQui:
2813 case AArch64::STR_PXI:
2814 return true;
2815 }
2816}
2817
2819 int &FrameIndex) const {
2820 if (!isFrameStoreOpcode(MI.getOpcode()))
2821 return Register();
2822
2823 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2824 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2825 FrameIndex = MI.getOperand(1).getIndex();
2826 return MI.getOperand(0).getReg();
2827 }
2828 return Register();
2829}
2830
2832 int &FrameIndex) const {
2833 if (!isFrameStoreOpcode(MI.getOpcode()))
2834 return Register();
2835
2836 if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
2837 return Reg;
2838
2840 if (hasStoreToStackSlot(MI, Accesses)) {
2841 if (Accesses.size() > 1)
2842 return Register();
2843
2844 FrameIndex =
2845 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2846 ->getFrameIndex();
2847 return MI.getOperand(0).getReg();
2848 }
2849 return Register();
2850}
2851
2853 int &FrameIndex) const {
2854 if (!isFrameLoadOpcode(MI.getOpcode()))
2855 return Register();
2856
2857 if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
2858 return Reg;
2859
2861 if (hasLoadFromStackSlot(MI, Accesses)) {
2862 if (Accesses.size() > 1)
2863 return Register();
2864
2865 FrameIndex =
2866 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2867 ->getFrameIndex();
2868 return MI.getOperand(0).getReg();
2869 }
2870 return Register();
2871}
2872
2873/// Check all MachineMemOperands for a hint to suppress pairing.
2875 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2876 return MMO->getFlags() & MOSuppressPair;
2877 });
2878}
2879
2880/// Set a flag on the first MachineMemOperand to suppress pairing.
2882 if (MI.memoperands_empty())
2883 return;
2884 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2885}
2886
2887/// Check all MachineMemOperands for a hint that the load/store is strided.
2889 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2890 return MMO->getFlags() & MOStridedAccess;
2891 });
2892}
2893
2895 switch (Opc) {
2896 default:
2897 return false;
2898 case AArch64::STURSi:
2899 case AArch64::STRSpre:
2900 case AArch64::STURDi:
2901 case AArch64::STRDpre:
2902 case AArch64::STURQi:
2903 case AArch64::STRQpre:
2904 case AArch64::STURBBi:
2905 case AArch64::STURHHi:
2906 case AArch64::STURWi:
2907 case AArch64::STRWpre:
2908 case AArch64::STURXi:
2909 case AArch64::STRXpre:
2910 case AArch64::LDURSi:
2911 case AArch64::LDRSpre:
2912 case AArch64::LDURDi:
2913 case AArch64::LDRDpre:
2914 case AArch64::LDURQi:
2915 case AArch64::LDRQpre:
2916 case AArch64::LDURWi:
2917 case AArch64::LDRWpre:
2918 case AArch64::LDURXi:
2919 case AArch64::LDRXpre:
2920 case AArch64::LDRSWpre:
2921 case AArch64::LDURSWi:
2922 case AArch64::LDURHHi:
2923 case AArch64::LDURBBi:
2924 case AArch64::LDURSBWi:
2925 case AArch64::LDURSHWi:
2926 return true;
2927 }
2928}
2929
2930std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2931 switch (Opc) {
2932 default: return {};
2933 case AArch64::PRFMui: return AArch64::PRFUMi;
2934 case AArch64::LDRXui: return AArch64::LDURXi;
2935 case AArch64::LDRWui: return AArch64::LDURWi;
2936 case AArch64::LDRBui: return AArch64::LDURBi;
2937 case AArch64::LDRHui: return AArch64::LDURHi;
2938 case AArch64::LDRSui: return AArch64::LDURSi;
2939 case AArch64::LDRDui: return AArch64::LDURDi;
2940 case AArch64::LDRQui: return AArch64::LDURQi;
2941 case AArch64::LDRBBui: return AArch64::LDURBBi;
2942 case AArch64::LDRHHui: return AArch64::LDURHHi;
2943 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2944 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2945 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2946 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2947 case AArch64::LDRSWui: return AArch64::LDURSWi;
2948 case AArch64::STRXui: return AArch64::STURXi;
2949 case AArch64::STRWui: return AArch64::STURWi;
2950 case AArch64::STRBui: return AArch64::STURBi;
2951 case AArch64::STRHui: return AArch64::STURHi;
2952 case AArch64::STRSui: return AArch64::STURSi;
2953 case AArch64::STRDui: return AArch64::STURDi;
2954 case AArch64::STRQui: return AArch64::STURQi;
2955 case AArch64::STRBBui: return AArch64::STURBBi;
2956 case AArch64::STRHHui: return AArch64::STURHHi;
2957 }
2958}
2959
// NOTE(review): the declaration line (listing line 2960) is absent from this
// extract; the unreachable message below names the function
// getLoadStoreImmIdx and the body switches on an opcode value `Opc`.
//
// Maps each listed opcode to the constant 2, 3 or 4. Presumably this is the
// operand index of the immediate offset of that instruction (going by the
// function name) -- confirm against the declaration. Any opcode that is not
// listed reaches llvm_unreachable.
2961 switch (Opc) {
2962 default:
2963 llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx");
2964 case AArch64::ADDG:
2965 case AArch64::LDAPURBi:
2966 case AArch64::LDAPURHi:
2967 case AArch64::LDAPURi:
2968 case AArch64::LDAPURSBWi:
2969 case AArch64::LDAPURSBXi:
2970 case AArch64::LDAPURSHWi:
2971 case AArch64::LDAPURSHXi:
2972 case AArch64::LDAPURSWi:
2973 case AArch64::LDAPURXi:
2974 case AArch64::LDR_PPXI:
2975 case AArch64::LDR_PXI:
2976 case AArch64::LDR_ZXI:
2977 case AArch64::LDR_ZZXI:
2978 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
2979 case AArch64::LDR_ZZZXI:
2980 case AArch64::LDR_ZZZZXI:
2981 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
2982 case AArch64::LDRBBui:
2983 case AArch64::LDRBui:
2984 case AArch64::LDRDui:
2985 case AArch64::LDRHHui:
2986 case AArch64::LDRHui:
2987 case AArch64::LDRQui:
2988 case AArch64::LDRSBWui:
2989 case AArch64::LDRSBXui:
2990 case AArch64::LDRSHWui:
2991 case AArch64::LDRSHXui:
2992 case AArch64::LDRSui:
2993 case AArch64::LDRSWui:
2994 case AArch64::LDRWui:
2995 case AArch64::LDRXui:
2996 case AArch64::LDURBBi:
2997 case AArch64::LDURBi:
2998 case AArch64::LDURDi:
2999 case AArch64::LDURHHi:
3000 case AArch64::LDURHi:
3001 case AArch64::LDURQi:
3002 case AArch64::LDURSBWi:
3003 case AArch64::LDURSBXi:
3004 case AArch64::LDURSHWi:
3005 case AArch64::LDURSHXi:
3006 case AArch64::LDURSi:
3007 case AArch64::LDURSWi:
3008 case AArch64::LDURWi:
3009 case AArch64::LDURXi:
3010 case AArch64::PRFMui:
3011 case AArch64::PRFUMi:
3012 case AArch64::ST2Gi:
3013 case AArch64::STGi:
3014 case AArch64::STLURBi:
3015 case AArch64::STLURHi:
3016 case AArch64::STLURWi:
3017 case AArch64::STLURXi:
3018 case AArch64::StoreSwiftAsyncContext:
3019 case AArch64::STR_PPXI:
3020 case AArch64::STR_PXI:
3021 case AArch64::STR_ZXI:
3022 case AArch64::STR_ZZXI:
3023 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
3024 case AArch64::STR_ZZZXI:
3025 case AArch64::STR_ZZZZXI:
3026 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
3027 case AArch64::STRBBui:
3028 case AArch64::STRBui:
3029 case AArch64::STRDui:
3030 case AArch64::STRHHui:
3031 case AArch64::STRHui:
3032 case AArch64::STRQui:
3033 case AArch64::STRSui:
3034 case AArch64::STRWui:
3035 case AArch64::STRXui:
3036 case AArch64::STURBBi:
3037 case AArch64::STURBi:
3038 case AArch64::STURDi:
3039 case AArch64::STURHHi:
3040 case AArch64::STURHi:
3041 case AArch64::STURQi:
3042 case AArch64::STURSi:
3043 case AArch64::STURWi:
3044 case AArch64::STURXi:
3045 case AArch64::STZ2Gi:
3046 case AArch64::STZGi:
3047 case AArch64::TAGPstack:
// Every opcode listed since the default label yields 2.
3048 return 2;
3049 case AArch64::LD1B_D_IMM:
3050 case AArch64::LD1B_H_IMM:
3051 case AArch64::LD1B_IMM:
3052 case AArch64::LD1B_S_IMM:
3053 case AArch64::LD1D_IMM:
3054 case AArch64::LD1H_D_IMM:
3055 case AArch64::LD1H_IMM:
3056 case AArch64::LD1H_S_IMM:
3057 case AArch64::LD1RB_D_IMM:
3058 case AArch64::LD1RB_H_IMM:
3059 case AArch64::LD1RB_IMM:
3060 case AArch64::LD1RB_S_IMM:
3061 case AArch64::LD1RD_IMM:
3062 case AArch64::LD1RH_D_IMM:
3063 case AArch64::LD1RH_IMM:
3064 case AArch64::LD1RH_S_IMM:
3065 case AArch64::LD1RSB_D_IMM:
3066 case AArch64::LD1RSB_H_IMM:
3067 case AArch64::LD1RSB_S_IMM:
3068 case AArch64::LD1RSH_D_IMM:
3069 case AArch64::LD1RSH_S_IMM:
3070 case AArch64::LD1RSW_IMM:
3071 case AArch64::LD1RW_D_IMM:
3072 case AArch64::LD1RW_IMM:
3073 case AArch64::LD1SB_D_IMM:
3074 case AArch64::LD1SB_H_IMM:
3075 case AArch64::LD1SB_S_IMM:
3076 case AArch64::LD1SH_D_IMM:
3077 case AArch64::LD1SH_S_IMM:
3078 case AArch64::LD1SW_D_IMM:
3079 case AArch64::LD1W_D_IMM:
3080 case AArch64::LD1W_IMM:
3081 case AArch64::LD2B_IMM:
3082 case AArch64::LD2D_IMM:
3083 case AArch64::LD2H_IMM:
3084 case AArch64::LD2W_IMM:
3085 case AArch64::LD3B_IMM:
3086 case AArch64::LD3D_IMM:
3087 case AArch64::LD3H_IMM:
3088 case AArch64::LD3W_IMM:
3089 case AArch64::LD4B_IMM:
3090 case AArch64::LD4D_IMM:
3091 case AArch64::LD4H_IMM:
3092 case AArch64::LD4W_IMM:
3093 case AArch64::LDG:
3094 case AArch64::LDNF1B_D_IMM:
3095 case AArch64::LDNF1B_H_IMM:
3096 case AArch64::LDNF1B_IMM:
3097 case AArch64::LDNF1B_S_IMM:
3098 case AArch64::LDNF1D_IMM:
3099 case AArch64::LDNF1H_D_IMM:
3100 case AArch64::LDNF1H_IMM:
3101 case AArch64::LDNF1H_S_IMM:
3102 case AArch64::LDNF1SB_D_IMM:
3103 case AArch64::LDNF1SB_H_IMM:
3104 case AArch64::LDNF1SB_S_IMM:
3105 case AArch64::LDNF1SH_D_IMM:
3106 case AArch64::LDNF1SH_S_IMM:
3107 case AArch64::LDNF1SW_D_IMM:
3108 case AArch64::LDNF1W_D_IMM:
3109 case AArch64::LDNF1W_IMM:
3110 case AArch64::LDNPDi:
3111 case AArch64::LDNPQi:
3112 case AArch64::LDNPSi:
3113 case AArch64::LDNPWi:
3114 case AArch64::LDNPXi:
3115 case AArch64::LDNT1B_ZRI:
3116 case AArch64::LDNT1D_ZRI:
3117 case AArch64::LDNT1H_ZRI:
3118 case AArch64::LDNT1W_ZRI:
3119 case AArch64::LDPDi:
3120 case AArch64::LDPQi:
3121 case AArch64::LDPSi:
3122 case AArch64::LDPWi:
3123 case AArch64::LDPXi:
3124 case AArch64::LDRBBpost:
3125 case AArch64::LDRBBpre:
3126 case AArch64::LDRBpost:
3127 case AArch64::LDRBpre:
3128 case AArch64::LDRDpost:
3129 case AArch64::LDRDpre:
3130 case AArch64::LDRHHpost:
3131 case AArch64::LDRHHpre:
3132 case AArch64::LDRHpost:
3133 case AArch64::LDRHpre:
3134 case AArch64::LDRQpost:
3135 case AArch64::LDRQpre:
3136 case AArch64::LDRSpost:
3137 case AArch64::LDRSpre:
3138 case AArch64::LDRWpost:
3139 case AArch64::LDRWpre:
3140 case AArch64::LDRXpost:
3141 case AArch64::LDRXpre:
3142 case AArch64::ST1B_D_IMM:
3143 case AArch64::ST1B_H_IMM:
3144 case AArch64::ST1B_IMM:
3145 case AArch64::ST1B_S_IMM:
3146 case AArch64::ST1D_IMM:
3147 case AArch64::ST1H_D_IMM:
3148 case AArch64::ST1H_IMM:
3149 case AArch64::ST1H_S_IMM:
3150 case AArch64::ST1W_D_IMM:
3151 case AArch64::ST1W_IMM:
3152 case AArch64::ST2B_IMM:
3153 case AArch64::ST2D_IMM:
3154 case AArch64::ST2H_IMM:
3155 case AArch64::ST2W_IMM:
3156 case AArch64::ST3B_IMM:
3157 case AArch64::ST3D_IMM:
3158 case AArch64::ST3H_IMM:
3159 case AArch64::ST3W_IMM:
3160 case AArch64::ST4B_IMM:
3161 case AArch64::ST4D_IMM:
3162 case AArch64::ST4H_IMM:
3163 case AArch64::ST4W_IMM:
3164 case AArch64::STGPi:
3165 case AArch64::STGPreIndex:
3166 case AArch64::STZGPreIndex:
3167 case AArch64::ST2GPreIndex:
3168 case AArch64::STZ2GPreIndex:
3169 case AArch64::STGPostIndex:
3170 case AArch64::STZGPostIndex:
3171 case AArch64::ST2GPostIndex:
3172 case AArch64::STZ2GPostIndex:
3173 case AArch64::STNPDi:
3174 case AArch64::STNPQi:
3175 case AArch64::STNPSi:
3176 case AArch64::STNPWi:
3177 case AArch64::STNPXi:
3178 case AArch64::STNT1B_ZRI:
3179 case AArch64::STNT1D_ZRI:
3180 case AArch64::STNT1H_ZRI:
3181 case AArch64::STNT1W_ZRI:
3182 case AArch64::STPDi:
3183 case AArch64::STPQi:
3184 case AArch64::STPSi:
3185 case AArch64::STPWi:
3186 case AArch64::STPXi:
3187 case AArch64::STRBBpost:
3188 case AArch64::STRBBpre:
3189 case AArch64::STRBpost:
3190 case AArch64::STRBpre:
3191 case AArch64::STRDpost:
3192 case AArch64::STRDpre:
3193 case AArch64::STRHHpost:
3194 case AArch64::STRHHpre:
3195 case AArch64::STRHpost:
3196 case AArch64::STRHpre:
3197 case AArch64::STRQpost:
3198 case AArch64::STRQpre:
3199 case AArch64::STRSpost:
3200 case AArch64::STRSpre:
3201 case AArch64::STRWpost:
3202 case AArch64::STRWpre:
3203 case AArch64::STRXpost:
3204 case AArch64::STRXpre:
3205 case AArch64::LD1B_2Z_IMM:
3206 case AArch64::LD1B_2Z_STRIDED_IMM:
3207 case AArch64::LD1H_2Z_IMM:
3208 case AArch64::LD1H_2Z_STRIDED_IMM:
3209 case AArch64::LD1W_2Z_IMM:
3210 case AArch64::LD1W_2Z_STRIDED_IMM:
3211 case AArch64::LD1D_2Z_IMM:
3212 case AArch64::LD1D_2Z_STRIDED_IMM:
3213 case AArch64::LD1B_4Z_IMM:
3214 case AArch64::LD1B_4Z_STRIDED_IMM:
3215 case AArch64::LD1H_4Z_IMM:
3216 case AArch64::LD1H_4Z_STRIDED_IMM:
3217 case AArch64::LD1W_4Z_IMM:
3218 case AArch64::LD1W_4Z_STRIDED_IMM:
3219 case AArch64::LD1D_4Z_IMM:
3220 case AArch64::LD1D_4Z_STRIDED_IMM:
3221 case AArch64::LD1B_2Z_IMM_PSEUDO:
3222 case AArch64::LD1H_2Z_IMM_PSEUDO:
3223 case AArch64::LD1W_2Z_IMM_PSEUDO:
3224 case AArch64::LD1D_2Z_IMM_PSEUDO:
3225 case AArch64::LD1B_4Z_IMM_PSEUDO:
3226 case AArch64::LD1H_4Z_IMM_PSEUDO:
3227 case AArch64::LD1W_4Z_IMM_PSEUDO:
3228 case AArch64::LD1D_4Z_IMM_PSEUDO:
3229 case AArch64::ST1B_2Z_IMM:
3230 case AArch64::ST1B_2Z_STRIDED_IMM:
3231 case AArch64::ST1H_2Z_IMM:
3232 case AArch64::ST1H_2Z_STRIDED_IMM:
3233 case AArch64::ST1W_2Z_IMM:
3234 case AArch64::ST1W_2Z_STRIDED_IMM:
3235 case AArch64::ST1D_2Z_IMM:
3236 case AArch64::ST1D_2Z_STRIDED_IMM:
3237 case AArch64::LDNT1B_2Z_IMM_PSEUDO:
3238 case AArch64::LDNT1B_2Z_IMM:
3239 case AArch64::LDNT1B_2Z_STRIDED_IMM:
3240 case AArch64::LDNT1H_2Z_IMM_PSEUDO:
3241 case AArch64::LDNT1H_2Z_IMM:
3242 case AArch64::LDNT1H_2Z_STRIDED_IMM:
3243 case AArch64::LDNT1W_2Z_IMM_PSEUDO:
3244 case AArch64::LDNT1W_2Z_IMM:
3245 case AArch64::LDNT1W_2Z_STRIDED_IMM:
3246 case AArch64::LDNT1D_2Z_IMM_PSEUDO:
3247 case AArch64::LDNT1D_2Z_IMM:
3248 case AArch64::LDNT1D_2Z_STRIDED_IMM:
3249 case AArch64::STNT1B_2Z_IMM:
3250 case AArch64::STNT1B_2Z_STRIDED_IMM:
3251 case AArch64::STNT1H_2Z_IMM:
3252 case AArch64::STNT1H_2Z_STRIDED_IMM:
3253 case AArch64::STNT1W_2Z_IMM:
3254 case AArch64::STNT1W_2Z_STRIDED_IMM:
3255 case AArch64::STNT1D_2Z_IMM:
3256 case AArch64::STNT1D_2Z_STRIDED_IMM:
3257 case AArch64::ST1B_4Z_IMM:
3258 case AArch64::ST1B_4Z_STRIDED_IMM:
3259 case AArch64::ST1H_4Z_IMM:
3260 case AArch64::ST1H_4Z_STRIDED_IMM:
3261 case AArch64::ST1W_4Z_IMM:
3262 case AArch64::ST1W_4Z_STRIDED_IMM:
3263 case AArch64::ST1D_4Z_IMM:
3264 case AArch64::ST1D_4Z_STRIDED_IMM:
3265 case AArch64::LDNT1B_4Z_IMM_PSEUDO:
3266 case AArch64::LDNT1B_4Z_IMM:
3267 case AArch64::LDNT1B_4Z_STRIDED_IMM:
3268 case AArch64::LDNT1H_4Z_IMM_PSEUDO:
3269 case AArch64::LDNT1H_4Z_IMM:
3270 case AArch64::LDNT1H_4Z_STRIDED_IMM:
3271 case AArch64::LDNT1W_4Z_IMM_PSEUDO:
3272 case AArch64::LDNT1W_4Z_IMM:
3273 case AArch64::LDNT1W_4Z_STRIDED_IMM:
3274 case AArch64::LDNT1D_4Z_IMM_PSEUDO:
3275 case AArch64::LDNT1D_4Z_IMM:
3276 case AArch64::LDNT1D_4Z_STRIDED_IMM:
3277 case AArch64::STNT1B_4Z_IMM:
3278 case AArch64::STNT1B_4Z_STRIDED_IMM:
3279 case AArch64::STNT1H_4Z_IMM:
3280 case AArch64::STNT1H_4Z_STRIDED_IMM:
3281 case AArch64::STNT1W_4Z_IMM:
3282 case AArch64::STNT1W_4Z_STRIDED_IMM:
3283 case AArch64::STNT1D_4Z_IMM:
3284 case AArch64::STNT1D_4Z_STRIDED_IMM:
// Every opcode listed since the previous return yields 3.
3285 return 3;
3286 case AArch64::LDPDpost:
3287 case AArch64::LDPDpre:
3288 case AArch64::LDPQpost:
3289 case AArch64::LDPQpre:
3290 case AArch64::LDPSpost:
3291 case AArch64::LDPSpre:
3292 case AArch64::LDPWpost:
3293 case AArch64::LDPWpre:
3294 case AArch64::LDPXpost:
3295 case AArch64::LDPXpre:
3296 case AArch64::STGPpre:
3297 case AArch64::STGPpost:
3298 case AArch64::STPDpost:
3299 case AArch64::STPDpre:
3300 case AArch64::STPQpost:
3301 case AArch64::STPQpre:
3302 case AArch64::STPSpost:
3303 case AArch64::STPSpre:
3304 case AArch64::STPWpost:
3305 case AArch64::STPWpre:
3306 case AArch64::STPXpost:
3307 case AArch64::STPXpre:
// The pre/post-indexed pair opcodes listed since the previous return yield 4.
3308 return 4;
3309 }
3310}
3311
// NOTE(review): the declaration line (listing line 3312) is absent from this
// extract, so the function name and signature are not visible here;
// presumably this is the pairable load/store predicate -- confirm upstream.
//
// Opcode predicate on `MI`: returns true for exactly the scaled, unscaled,
// pre-indexed and SVE (LDR_ZXI / STR_ZXI) load/store opcodes listed below,
// and false for every other opcode.
3313 switch (MI.getOpcode()) {
3314 default:
3315 return false;
3316 // Scaled instructions.
3317 case AArch64::STRSui:
3318 case AArch64::STRDui:
3319 case AArch64::STRQui:
3320 case AArch64::STRXui:
3321 case AArch64::STRWui:
3322 case AArch64::LDRSui:
3323 case AArch64::LDRDui:
3324 case AArch64::LDRQui:
3325 case AArch64::LDRXui:
3326 case AArch64::LDRWui:
3327 case AArch64::LDRSWui:
3328 // Unscaled instructions.
// (This group also interleaves the pre-indexed LDR*pre / STR*pre opcodes.)
3329 case AArch64::STURSi:
3330 case AArch64::STRSpre:
3331 case AArch64::STURDi:
3332 case AArch64::STRDpre:
3333 case AArch64::STURQi:
3334 case AArch64::STRQpre:
3335 case AArch64::STURWi:
3336 case AArch64::STRWpre:
3337 case AArch64::STURXi:
3338 case AArch64::STRXpre:
3339 case AArch64::LDURSi:
3340 case AArch64::LDRSpre:
3341 case AArch64::LDURDi:
3342 case AArch64::LDRDpre:
3343 case AArch64::LDURQi:
3344 case AArch64::LDRQpre:
3345 case AArch64::LDURWi:
3346 case AArch64::LDRWpre:
3347 case AArch64::LDURXi:
3348 case AArch64::LDRXpre:
3349 case AArch64::LDURSWi:
3350 case AArch64::LDRSWpre:
3351 // SVE instructions.
3352 case AArch64::LDR_ZXI:
3353 case AArch64::STR_ZXI:
3354 return true;
3355 }
3356}
3357
// NOTE(review): the declaration line (listing line 3358) is absent from this
// extract; presumably this is the tail-call-return predicate -- confirm
// upstream.
//
// Returns true when the opcode of `MI` is one of the TCRETURN* or
// AUTH_TCRETURN* opcodes listed below, and false otherwise.
3359 switch (MI.getOpcode()) {
3360 default:
// Sanity check for unlisted opcodes: an instruction that is flagged as both
// a call and a return trips the assert, whose message suggests a tail-call
// opcode that was not added to the list below.
3361 assert((!MI.isCall() || !MI.isReturn()) &&
3362 "Unexpected instruction - was a new tail call opcode introduced?");
3363 return false;
3364 case AArch64::TCRETURNdi:
3365 case AArch64::TCRETURNri:
3366 case AArch64::TCRETURNrix16x17:
3367 case AArch64::TCRETURNrix17:
3368 case AArch64::TCRETURNrinotx16:
3369 case AArch64::TCRETURNriALL:
3370 case AArch64::AUTH_TCRETURN:
3371 case AArch64::AUTH_TCRETURN_BTI:
3372 return true;
3373 }
3374}
3375
// NOTE(review): the declaration line (listing line 3376) is absent from this
// extract, so the function name and signature are not visible here.
//
// Maps the opcode `Opc` to its flag-setting counterpart: in every case below
// the returned opcode is the same mnemonic with an added S (ADD -> ADDS,
// AND_PPzPP -> ANDS_PPzPP, ...). An opcode without an entry reaches
// llvm_unreachable, so callers must only pass the opcodes listed here.
3377 switch (Opc) {
3378 default:
3379 llvm_unreachable("Opcode has no flag setting equivalent!");
3380 // 32-bit cases:
3381 case AArch64::ADDWri:
3382 return AArch64::ADDSWri;
3383 case AArch64::ADDWrr:
3384 return AArch64::ADDSWrr;
3385 case AArch64::ADDWrs:
3386 return AArch64::ADDSWrs;
3387 case AArch64::ADDWrx:
3388 return AArch64::ADDSWrx;
3389 case AArch64::ANDWri:
3390 return AArch64::ANDSWri;
3391 case AArch64::ANDWrr:
3392 return AArch64::ANDSWrr;
3393 case AArch64::ANDWrs:
3394 return AArch64::ANDSWrs;
3395 case AArch64::BICWrr:
3396 return AArch64::BICSWrr;
3397 case AArch64::BICWrs:
3398 return AArch64::BICSWrs;
3399 case AArch64::SUBWri:
3400 return AArch64::SUBSWri;
3401 case AArch64::SUBWrr:
3402 return AArch64::SUBSWrr;
3403 case AArch64::SUBWrs:
3404 return AArch64::SUBSWrs;
3405 case AArch64::SUBWrx:
3406 return AArch64::SUBSWrx;
3407 // 64-bit cases:
3408 case AArch64::ADDXri:
3409 return AArch64::ADDSXri;
3410 case AArch64::ADDXrr:
3411 return AArch64::ADDSXrr;
3412 case AArch64::ADDXrs:
3413 return AArch64::ADDSXrs;
3414 case AArch64::ADDXrx:
3415 return AArch64::ADDSXrx;
3416 case AArch64::ANDXri:
3417 return AArch64::ANDSXri;
3418 case AArch64::ANDXrr:
3419 return AArch64::ANDSXrr;
3420 case AArch64::ANDXrs:
3421 return AArch64::ANDSXrs;
3422 case AArch64::BICXrr:
3423 return AArch64::BICSXrr;
3424 case AArch64::BICXrs:
3425 return AArch64::BICSXrs;
3426 case AArch64::SUBXri:
3427 return AArch64::SUBSXri;
3428 case AArch64::SUBXrr:
3429 return AArch64::SUBSXrr;
3430 case AArch64::SUBXrs:
3431 return AArch64::SUBSXrs;
3432 case AArch64::SUBXrx:
3433 return AArch64::SUBSXrx;
3434 // SVE instructions:
3435 case AArch64::AND_PPzPP:
3436 return AArch64::ANDS_PPzPP;
3437 case AArch64::BIC_PPzPP:
3438 return AArch64::BICS_PPzPP;
3439 case AArch64::EOR_PPzPP:
3440 return AArch64::EORS_PPzPP;
3441 case AArch64::NAND_PPzPP:
3442 return AArch64::NANDS_PPzPP;
3443 case AArch64::NOR_PPzPP:
3444 return AArch64::NORS_PPzPP;
3445 case AArch64::ORN_PPzPP:
3446 return AArch64::ORNS_PPzPP;
3447 case AArch64::ORR_PPzPP:
3448 return AArch64::ORRS_PPzPP;
3449 case AArch64::BRKA_PPzP:
3450 return AArch64::BRKAS_PPzP;
3451 case AArch64::BRKPA_PPzPP:
3452 return AArch64::BRKPAS_PPzPP;
3453 case AArch64::BRKB_PPzP:
3454 return AArch64::BRKBS_PPzP;
3455 case AArch64::BRKPB_PPzPP:
3456 return AArch64::BRKPBS_PPzPP;
3457 case AArch64::BRKN_PPzP:
3458 return AArch64::BRKNS_PPzP;
3459 case AArch64::RDFFR_PPz:
3460 return AArch64::RDFFRS_PPz;
3461 case AArch64::PTRUE_B:
3462 return AArch64::PTRUES_B;
3463 }
3464}
3465
3466// Is this a candidate for ld/st merging or pairing? For example, we don't
3467// touch volatiles or load/stores that have a hint to avoid pair formation.
// NOTE(review): listing lines 3468, 3501, 3520 and 3531 are absent from this
// extract. As a result the declaration, the definition of `TRI`, the condition
// guarding the pair-suppression `return false`, and the tail of the
// NeedsWinCFI condition are not visible in this block.
//
// Returns false as soon as any of the checks below rejects `MI`, and true if
// all of them pass.
3469
3470 bool IsPreLdSt = isPreLdSt(MI);
3471
3472 // If this is a volatile load/store, don't mess with it.
3473 if (MI.hasOrderedMemoryRef())
3474 return false;
3475
3476 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
3477 // For Pre-inc LD/ST, the operand is shifted by one.
3478 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
3479 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
3480 "Expected a reg or frame index operand.");
3481
3482 // For Pre-indexed addressing quadword instructions, the third operand is the
3483 // immediate value.
3484 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
3485
// Reject when neither operand 2 (normal form) nor operand 3 (pre-indexed
// form) is an immediate.
3486 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
3487 return false;
3488
3489 // Can't merge/pair if the instruction modifies the base register.
3490 // e.g., ldr x0, [x0]
3491 // This case will never occur with an FI base.
3492 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
3493 // STR<S,D,Q,W,X>pre, it can be merged.
3494 // For example:
3495 // ldr q0, [x11, #32]!
3496 // ldr q1, [x11, #16]
3497 // to
3498 // ldp q0, q1, [x11, #32]!
3499 if (MI.getOperand(1).isReg() && !IsPreLdSt) {
3500 Register BaseReg = MI.getOperand(1).getReg();
// NOTE(review): `TRI` is used below but its definition (listing line 3501)
// is missing from this extract.
3502 if (MI.modifiesRegister(BaseReg, TRI))
3503 return false;
3504 }
3505
3506 // Pairing SVE fills/spills is only valid for little-endian targets that
3507 // implement VLS 128.
3508 switch (MI.getOpcode()) {
3509 default:
3510 break;
3511 case AArch64::LDR_ZXI:
3512 case AArch64::STR_ZXI:
3513 if (!Subtarget.isLittleEndian() ||
3514 Subtarget.getSVEVectorSizeInBits() != 128)
3515 return false;
// No break is needed here: this is the last label, so control simply leaves
// the switch when the check above passes.
3516 }
3517
3518 // Check if this load/store has a hint to avoid pair formation.
3519 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
// NOTE(review): the `if` condition for the return below (listing line 3520)
// is missing from this extract.
3521 return false;
3522
3523 // Do not pair any callee-save store/reload instructions in the
3524 // prologue/epilogue if the CFI information encoded the operations as separate
3525 // instructions, as that will cause the size of the actual prologue to mismatch
3526 // with the prologue size recorded in the Windows CFI.
3527 const MCAsmInfo &MAI = MI.getMF()->getTarget().getMCAsmInfo();
3528 bool NeedsWinCFI =
3529 MAI.usesWindowsCFI() && MI.getMF()->getFunction().needsUnwindTableEntry();
// NOTE(review): the second half of this condition (listing line 3531) is
// missing from this extract.
3530 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
3532 return false;
3533
3534 // On some CPUs quad load/store pairs are slower than two single load/stores.
3535 if (Subtarget.isPaired128Slow()) {
3536 switch (MI.getOpcode()) {
3537 default:
3538 break;
3539 case AArch64::LDURQi:
3540 case AArch64::STURQi:
3541 case AArch64::LDRQui:
3542 case AArch64::STRQui:
3543 return false;
3544 }
3545 }
3546
3547 return true;
3548}
3549
// NOTE(review): the first lines of this declaration (listing lines 3550-3551)
// are absent from this extract; the body also uses `LdSt` and `BaseOps`, which
// are presumably the missing leading parameters -- confirm upstream.
//
// Wrapper around getMemOperandWithOffsetWidth: returns false when `LdSt` does
// not load or store or when that helper fails; otherwise fills `Offset` and
// `OffsetIsScalable` through the helper, stores the width in `Width`, appends
// the single base operand to `BaseOps`, and returns true.
3552 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
3553 const TargetRegisterInfo *TRI) const {
3554 if (!LdSt.mayLoadOrStore())
3555 return false;
3556
3557 const MachineOperand *BaseOp;
3558 TypeSize WidthN(0, false);
3559 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
3560 WidthN, TRI))
3561 return false;
// NOTE(review): the comment below talks about a maximal extent, but the code
// stores LocationSize::precise(WidthN) -- confirm which one is intended.
3562 // The maximum vscale is 16 under AArch64, return the maximal extent for the
3563 // vector.
3564 Width = LocationSize::precise(WidthN);
3565 BaseOps.push_back(BaseOp);
3566 return true;
3567}
3568
// NOTE(review): the line carrying the function name and first parameter
// (listing line 3570) is absent from this extract; the body refers to that
// parameter as `MemI`.
//
// Describes the address of `MemI` as an ExtAddrMode made of a base register
// plus a constant displacement (ScaledReg and Scale are both set to 0).
// Returns std::nullopt when getMemOperandWithOffset fails or when the base
// operand is not a register.
3569std::optional<ExtAddrMode>
3571 const TargetRegisterInfo *TRI) const {
3572 const MachineOperand *Base; // Filled with the base operand of MI.
3573 int64_t Offset; // Filled with the offset of MI.
// NOTE(review): OffsetIsScalable is filled in by the call below but is not
// consulted afterwards in this function.
3574 bool OffsetIsScalable;
3575 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
3576 return std::nullopt;
3577
3578 if (!Base->isReg())
3579 return std::nullopt;
3580 ExtAddrMode AM;
3581 AM.BaseReg = Base->getReg();
3582 AM.Displacement = Offset;
3583 AM.ScaledReg = 0;
3584 AM.Scale = 0;
3585 return AM;
3586}
3587
// NOTE(review): several lines of this function are absent from this extract:
// the start of the declaration (listing line 3588, which presumably declares
// `MemI`), and listing lines 3735, 3765, 3809, 3815, 3867, 3904 and 3910-3911
// inside the body. Notes below mark each gap.
//
// Decides whether the address computation `AddrI`, which defines `Reg`, can
// be folded into the addressing mode of the memory instruction `MemI`. On
// success the folded addressing mode is written to `AM` and true is returned;
// otherwise false is returned.
3589 Register Reg,
3590 const MachineInstr &AddrI,
3591 ExtAddrMode &AM) const {
3592 // Filter out instructions into which we cannot fold.
// NumBytes is the access size per opcode; OffsetScale is the factor applied
// to the immediate operand (it stays 1 for the unscaled LDUR/STUR forms).
3593 unsigned NumBytes;
3594 int64_t OffsetScale = 1;
3595 switch (MemI.getOpcode()) {
3596 default:
3597 return false;
3598
3599 case AArch64::LDURQi:
3600 case AArch64::STURQi:
3601 NumBytes = 16;
3602 break;
3603
3604 case AArch64::LDURDi:
3605 case AArch64::STURDi:
3606 case AArch64::LDURXi:
3607 case AArch64::STURXi:
3608 NumBytes = 8;
3609 break;
3610
3611 case AArch64::LDURWi:
3612 case AArch64::LDURSWi:
3613 case AArch64::STURWi:
3614 NumBytes = 4;
3615 break;
3616
3617 case AArch64::LDURHi:
3618 case AArch64::STURHi:
3619 case AArch64::LDURHHi:
3620 case AArch64::STURHHi:
3621 case AArch64::LDURSHXi:
3622 case AArch64::LDURSHWi:
3623 NumBytes = 2;
3624 break;
3625
3626 case AArch64::LDRBroX:
3627 case AArch64::LDRBBroX:
3628 case AArch64::LDRSBXroX:
3629 case AArch64::LDRSBWroX:
3630 case AArch64::STRBroX:
3631 case AArch64::STRBBroX:
3632 case AArch64::LDURBi:
3633 case AArch64::LDURBBi:
3634 case AArch64::LDURSBXi:
3635 case AArch64::LDURSBWi:
3636 case AArch64::STURBi:
3637 case AArch64::STURBBi:
3638 case AArch64::LDRBui:
3639 case AArch64::LDRBBui:
3640 case AArch64::LDRSBXui:
3641 case AArch64::LDRSBWui:
3642 case AArch64::STRBui:
3643 case AArch64::STRBBui:
3644 NumBytes = 1;
3645 break;
3646
3647 case AArch64::LDRQroX:
3648 case AArch64::STRQroX:
3649 case AArch64::LDRQui:
3650 case AArch64::STRQui:
3651 NumBytes = 16;
3652 OffsetScale = 16;
3653 break;
3654
3655 case AArch64::LDRDroX:
3656 case AArch64::STRDroX:
3657 case AArch64::LDRXroX:
3658 case AArch64::STRXroX:
3659 case AArch64::LDRDui:
3660 case AArch64::STRDui:
3661 case AArch64::LDRXui:
3662 case AArch64::STRXui:
3663 NumBytes = 8;
3664 OffsetScale = 8;
3665 break;
3666
3667 case AArch64::LDRWroX:
3668 case AArch64::LDRSWroX:
3669 case AArch64::STRWroX:
3670 case AArch64::LDRWui:
3671 case AArch64::LDRSWui:
3672 case AArch64::STRWui:
3673 NumBytes = 4;
3674 OffsetScale = 4;
3675 break;
3676
3677 case AArch64::LDRHroX:
3678 case AArch64::STRHroX:
3679 case AArch64::LDRHHroX:
3680 case AArch64::STRHHroX:
3681 case AArch64::LDRSHXroX:
3682 case AArch64::LDRSHWroX:
3683 case AArch64::LDRHui:
3684 case AArch64::STRHui:
3685 case AArch64::LDRHHui:
3686 case AArch64::STRHHui:
3687 case AArch64::LDRSHXui:
3688 case AArch64::LDRSHWui:
3689 NumBytes = 2;
3690 OffsetScale = 2;
3691 break;
3692 }
3693
3694 // Check the fold operand is not the loaded/stored value.
// NOTE(review): despite its name, BaseRegOp is operand 0, which the comment
// above describes as the loaded/stored value rather than the base register.
3695 const MachineOperand &BaseRegOp = MemI.getOperand(0);
3696 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
3697 return false;
3698
3699 // Handle memory instructions with a [Reg, Reg] addressing mode.
3700 if (MemI.getOperand(2).isReg()) {
3701 // Bail if the addressing mode already includes extension of the offset
3702 // register.
3703 if (MemI.getOperand(3).getImm())
3704 return false;
3705
3706 // Check if we actually have a scaled offset.
3707 if (MemI.getOperand(4).getImm() == 0)
3708 OffsetScale = 1;
3709
3710 // If the address instructions is folded into the base register, then the
3711 // addressing mode must not have a scale. Then we can swap the base and the
3712 // scaled registers.
3713 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
3714 return false;
3715
3716 switch (AddrI.getOpcode()) {
3717 default:
3718 return false;
3719
3720 case AArch64::SBFMXri:
3721 // sxtw Xa, Wm
3722 // ldr Xd, [Xn, Xa, lsl #N]
3723 // ->
3724 // ldr Xd, [Xn, Wm, sxtw #N]
// Only an SBFM with immediates 0 and 31 is accepted (the sxtw form shown in
// the sketch above).
3725 if (AddrI.getOperand(2).getImm() != 0 ||
3726 AddrI.getOperand(3).getImm() != 31)
3727 return false;
3728
// If Reg was the base operand, the other register operand becomes the base.
3729 AM.BaseReg = MemI.getOperand(1).getReg();
3730 if (AM.BaseReg == Reg)
3731 AM.BaseReg = MemI.getOperand(2).getReg();
3732 AM.ScaledReg = AddrI.getOperand(1).getReg();
3733 AM.Scale = OffsetScale;
3734 AM.Displacement = 0;
// NOTE(review): listing line 3735 is missing here; the visible code of this
// path never assigns AM.Form.
3736 return true;
3737
3738 case TargetOpcode::SUBREG_TO_REG: {
3739 // mov Wa, Wm
3740 // ldr Xd, [Xn, Xa, lsl #N]
3741 // ->
3742 // ldr Xd, [Xn, Wm, uxtw #N]
3743
3744 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
3745 if (AddrI.getOperand(2).getImm() != AArch64::sub_32)
3746 return false;
3747
3748 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
3749 Register OffsetReg = AddrI.getOperand(1).getReg();
3750 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
3751 return false;
3752
// The defining instruction must be an ORRWrs whose first source is WZR and
// whose shift immediate is 0.
3753 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
3754 if (DefMI.getOpcode() != AArch64::ORRWrs ||
3755 DefMI.getOperand(1).getReg() != AArch64::WZR ||
3756 DefMI.getOperand(3).getImm() != 0)
3757 return false;
3758
3759 AM.BaseReg = MemI.getOperand(1).getReg();
3760 if (AM.BaseReg == Reg)
3761 AM.BaseReg = MemI.getOperand(2).getReg();
3762 AM.ScaledReg = DefMI.getOperand(2).getReg();
3763 AM.Scale = OffsetScale;
3764 AM.Displacement = 0;
// NOTE(review): listing line 3765 is missing here; the visible code of this
// path never assigns AM.Form.
3766 return true;
3767 }
3768 }
3769 }
3770
3771 // Handle memory instructions with a [Reg, #Imm] addressing mode.
3772
3773 // Check we are not breaking a potential conversion to an LDP.
// Accepts the change when the old offset was already outside the
// [MinOffset, MaxOffset] window for this access size, or when the new offset
// stays inside it. Access sizes other than 4, 8 and 16 are always accepted.
3774 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
3775 int64_t NewOffset) -> bool {
3776 int64_t MinOffset, MaxOffset;
3777 switch (NumBytes) {
3778 default:
3779 return true;
3780 case 4:
3781 MinOffset = -256;
3782 MaxOffset = 252;
3783 break;
3784 case 8:
3785 MinOffset = -512;
3786 MaxOffset = 504;
3787 break;
3788 case 16:
3789 MinOffset = -1024;
3790 MaxOffset = 1008;
3791 break;
3792 }
3793 return OldOffset < MinOffset || OldOffset > MaxOffset ||
3794 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
3795 };
// Folds a constant displacement Disp into the existing immediate offset; on
// success AM describes a base register plus NewOffset with no scaled register.
3796 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
3797 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
3798 int64_t NewOffset = OldOffset + Disp;
3799 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
3800 return false;
3801 // If the old offset would fit into an LDP, but the new offset wouldn't,
3802 // bail out.
3803 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
3804 return false;
3805 AM.BaseReg = AddrI.getOperand(1).getReg();
3806 AM.ScaledReg = 0;
3807 AM.Scale = 0;
3808 AM.Displacement = NewOffset;
// NOTE(review): listing line 3809 is missing here.
3810 return true;
3811 };
3812
// Folds a register (optionally scaled) into the addressing mode; only
// possible when the existing immediate offset is 0.
3813 auto canFoldAddRegIntoAddrMode =
3814 [&](int64_t Scale,
// NOTE(review): the rest of the lambda parameter list (listing line 3815) is
// missing; the body below uses a `Form` value that is presumably declared
// there.
3816 if (MemI.getOperand(2).getImm() != 0)
3817 return false;
// Reject scales that do not survive a round trip through unsigned.
3818 if ((unsigned)Scale != Scale)
3819 return false;
3820 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
3821 return false;
3822 AM.BaseReg = AddrI.getOperand(1).getReg();
3823 AM.ScaledReg = AddrI.getOperand(2).getReg();
3824 AM.Scale = Scale;
3825 AM.Displacement = 0;
3826 AM.Form = Form;
3827 return true;
3828 };
3829
// True for the 16-byte store opcodes STURQi / STRQui when the subtarget
// reports isSTRQroSlow().
3830 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
3831 unsigned Opcode = MemI.getOpcode();
3832 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
3833 Subtarget.isSTRQroSlow();
3834 };
3835
3836 int64_t Disp = 0;
3837 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
3838 switch (AddrI.getOpcode()) {
3839 default:
3840 return false;
3841
3842 case AArch64::ADDXri:
3843 // add Xa, Xn, #N
3844 // ldr Xd, [Xa, #M]
3845 // ->
3846 // ldr Xd, [Xn, #N'+M]
3847 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3848 return canFoldAddSubImmIntoAddrMode(Disp);
3849
3850 case AArch64::SUBXri:
3851 // sub Xa, Xn, #N
3852 // ldr Xd, [Xa, #M]
3853 // ->
3854 // ldr Xd, [Xn, #N'+M]
// NOTE(review): the displacement is negated below, so the folded offset is M
// minus the shifted N, not the sum shown in the sketch above.
3855 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3856 return canFoldAddSubImmIntoAddrMode(-Disp);
3857
3858 case AArch64::ADDXrs: {
3859 // add Xa, Xn, Xm, lsl #N
3860 // ldr Xd, [Xa]
3861 // ->
3862 // ldr Xd, [Xn, Xm, lsl #N]
3863
3864 // Don't fold the add if the result would be slower, unless optimising for
3865 // size.
3866 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
// NOTE(review): the `if` condition for the return below (listing line 3867)
// is missing from this extract.
3868 return false;
3869 Shift = AArch64_AM::getShiftValue(Shift);
3870 if (!OptSize) {
3871 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3872 return false;
3873 if (avoidSlowSTRQ(MemI))
3874 return false;
3875 }
3876 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3877 }
3878
3879 case AArch64::ADDXrr:
3880 // add Xa, Xn, Xm
3881 // ldr Xd, [Xa]
3882 // ->
3883 // ldr Xd, [Xn, Xm, lsl #0]
3884
3885 // Don't fold the add if the result would be slower, unless optimising for
3886 // size.
3887 if (!OptSize && avoidSlowSTRQ(MemI))
3888 return false;
3889 return canFoldAddRegIntoAddrMode(1);
3890
3891 case AArch64::ADDXrx:
3892 // add Xa, Xn, Wm, {s,u}xtw #N
3893 // ldr Xd, [Xa]
3894 // ->
3895 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3896
3897 // Don't fold the add if the result would be slower, unless optimising for
3898 // size.
3899 if (!OptSize && avoidSlowSTRQ(MemI))
3900 return false;
3901
3902 // Can fold only sign-/zero-extend of a word.
3903 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
// NOTE(review): the definition of `Extend` (listing line 3904) is missing
// from this extract.
3905 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3906 return false;
3907
3908 return canFoldAddRegIntoAddrMode(
3909 1ULL << AArch64_AM::getArithShiftValue(Imm),
// NOTE(review): the remaining call arguments (listing lines 3910-3911) are
// missing from this extract.
3912 }
3913}
3914
3915// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3916// return the opcode of an instruction performing the same operation, but using
3917// the [Reg, Reg] addressing mode.
3918static unsigned regOffsetOpcode(unsigned Opcode) {
3919 switch (Opcode) {
3920 default:
3921 llvm_unreachable("Address folding not implemented for instruction");
3922
3923 case AArch64::LDURQi:
3924 case AArch64::LDRQui:
3925 return AArch64::LDRQroX;
3926 case AArch64::STURQi:
3927 case AArch64::STRQui:
3928 return AArch64::STRQroX;
3929 case AArch64::LDURDi:
3930 case AArch64::LDRDui:
3931 return AArch64::LDRDroX;
3932 case AArch64::STURDi:
3933 case AArch64::STRDui:
3934 return AArch64::STRDroX;
3935 case AArch64::LDURXi:
3936 case AArch64::LDRXui:
3937 return AArch64::LDRXroX;
3938 case AArch64::STURXi:
3939 case AArch64::STRXui:
3940 return AArch64::STRXroX;
3941 case AArch64::LDURWi:
3942 case AArch64::LDRWui:
3943 return AArch64::LDRWroX;
3944 case AArch64::LDURSWi:
3945 case AArch64::LDRSWui:
3946 return AArch64::LDRSWroX;
3947 case AArch64::STURWi:
3948 case AArch64::STRWui:
3949 return AArch64::STRWroX;
3950 case AArch64::LDURHi:
3951 case AArch64::LDRHui:
3952 return AArch64::LDRHroX;
3953 case AArch64::STURHi:
3954 case AArch64::STRHui:
3955 return AArch64::STRHroX;
3956 case AArch64::LDURHHi:
3957 case AArch64::LDRHHui:
3958 return AArch64::LDRHHroX;
3959 case AArch64::STURHHi:
3960 case AArch64::STRHHui:
3961 return AArch64::STRHHroX;
3962 case AArch64::LDURSHXi:
3963 case AArch64::LDRSHXui:
3964 return AArch64::LDRSHXroX;
3965 case AArch64::LDURSHWi:
3966 case AArch64::LDRSHWui:
3967 return AArch64::LDRSHWroX;
3968 case AArch64::LDURBi:
3969 case AArch64::LDRBui:
3970 return AArch64::LDRBroX;
3971 case AArch64::LDURBBi:
3972 case AArch64::LDRBBui:
3973 return AArch64::LDRBBroX;
3974 case AArch64::LDURSBXi:
3975 case AArch64::LDRSBXui:
3976 return AArch64::LDRSBXroX;
3977 case AArch64::LDURSBWi:
3978 case AArch64::LDRSBWui:
3979 return AArch64::LDRSBWroX;
3980 case AArch64::STURBi:
3981 case AArch64::STRBui:
3982 return AArch64::STRBroX;
3983 case AArch64::STURBBi:
3984 case AArch64::STRBBui:
3985 return AArch64::STRBBroX;
3986 }
3987}
3988
3989// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3990// the opcode of an instruction performing the same operation, but using the
3991// [Reg, #Imm] addressing mode with scaled offset.
3992unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3993 switch (Opcode) {
3994 default:
3995 llvm_unreachable("Address folding not implemented for instruction");
3996
3997 case AArch64::LDURQi:
3998 Scale = 16;
3999 return AArch64::LDRQui;
4000 case AArch64::STURQi:
4001 Scale = 16;
4002 return AArch64::STRQui;
4003 case AArch64::LDURDi:
4004 Scale = 8;
4005 return AArch64::LDRDui;
4006 case AArch64::STURDi:
4007 Scale = 8;
4008 return AArch64::STRDui;
4009 case AArch64::LDURXi:
4010 Scale = 8;
4011 return AArch64::LDRXui;
4012 case AArch64::STURXi:
4013 Scale = 8;
4014 return AArch64::STRXui;
4015 case AArch64::LDURWi:
4016 Scale = 4;
4017 return AArch64::LDRWui;
4018 case AArch64::LDURSWi:
4019 Scale = 4;
4020 return AArch64::LDRSWui;
4021 case AArch64::STURWi:
4022 Scale = 4;
4023 return AArch64::STRWui;
4024 case AArch64::LDURHi:
4025 Scale = 2;
4026 return AArch64::LDRHui;
4027 case AArch64::STURHi:
4028 Scale = 2;
4029 return AArch64::STRHui;
4030 case AArch64::LDURHHi:
4031 Scale = 2;
4032 return AArch64::LDRHHui;
4033 case AArch64::STURHHi:
4034 Scale = 2;
4035 return AArch64::STRHHui;
4036 case AArch64::LDURSHXi:
4037 Scale = 2;
4038 return AArch64::LDRSHXui;
4039 case AArch64::LDURSHWi:
4040 Scale = 2;
4041 return AArch64::LDRSHWui;
4042 case AArch64::LDURBi:
4043 Scale = 1;
4044 return AArch64::LDRBui;
4045 case AArch64::LDURBBi:
4046 Scale = 1;
4047 return AArch64::LDRBBui;
4048 case AArch64::LDURSBXi:
4049 Scale = 1;
4050 return AArch64::LDRSBXui;
4051 case AArch64::LDURSBWi:
4052 Scale = 1;
4053 return AArch64::LDRSBWui;
4054 case AArch64::STURBi:
4055 Scale = 1;
4056 return AArch64::STRBui;
4057 case AArch64::STURBBi:
4058 Scale = 1;
4059 return AArch64::STRBBui;
4060 case AArch64::LDRQui:
4061 case AArch64::STRQui:
4062 Scale = 16;
4063 return Opcode;
4064 case AArch64::LDRDui:
4065 case AArch64::STRDui:
4066 case AArch64::LDRXui:
4067 case AArch64::STRXui:
4068 Scale = 8;
4069 return Opcode;
4070 case AArch64::LDRWui:
4071 case AArch64::LDRSWui:
4072 case AArch64::STRWui:
4073 Scale = 4;
4074 return Opcode;
4075 case AArch64::LDRHui:
4076 case AArch64::STRHui:
4077 case AArch64::LDRHHui:
4078 case AArch64::STRHHui:
4079 case AArch64::LDRSHXui:
4080 case AArch64::LDRSHWui:
4081 Scale = 2;
4082 return Opcode;
4083 case AArch64::LDRBui:
4084 case AArch64::LDRBBui:
4085 case AArch64::LDRSBXui:
4086 case AArch64::LDRSBWui:
4087 case AArch64::STRBui:
4088 case AArch64::STRBBui:
4089 Scale = 1;
4090 return Opcode;
4091 }
4092}
4093
4094// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
4095// the opcode of an instruction performing the same operation, but using the
4096// [Reg, #Imm] addressing mode with unscaled offset.
4097unsigned unscaledOffsetOpcode(unsigned Opcode) {
4098 switch (Opcode) {
4099 default:
4100 llvm_unreachable("Address folding not implemented for instruction");
4101
4102 case AArch64::LDURQi:
4103 case AArch64::STURQi:
4104 case AArch64::LDURDi:
4105 case AArch64::STURDi:
4106 case AArch64::LDURXi:
4107 case AArch64::STURXi:
4108 case AArch64::LDURWi:
4109 case AArch64::LDURSWi:
4110 case AArch64::STURWi:
4111 case AArch64::LDURHi:
4112 case AArch64::STURHi:
4113 case AArch64::LDURHHi:
4114 case AArch64::STURHHi:
4115 case AArch64::LDURSHXi:
4116 case AArch64::LDURSHWi:
4117 case AArch64::LDURBi:
4118 case AArch64::STURBi:
4119 case AArch64::LDURBBi:
4120 case AArch64::STURBBi:
4121 case AArch64::LDURSBWi:
4122 case AArch64::LDURSBXi:
4123 return Opcode;
4124 case AArch64::LDRQui:
4125 return AArch64::LDURQi;
4126 case AArch64::STRQui:
4127 return AArch64::STURQi;
4128 case AArch64::LDRDui:
4129 return AArch64::LDURDi;
4130 case AArch64::STRDui:
4131 return AArch64::STURDi;
4132 case AArch64::LDRXui:
4133 return AArch64::LDURXi;
4134 case AArch64::STRXui:
4135 return AArch64::STURXi;
4136 case AArch64::LDRWui:
4137 return AArch64::LDURWi;
4138 case AArch64::LDRSWui:
4139 return AArch64::LDURSWi;
4140 case AArch64::STRWui:
4141 return AArch64::STURWi;
4142 case AArch64::LDRHui:
4143 return AArch64::LDURHi;
4144 case AArch64::STRHui:
4145 return AArch64::STURHi;
4146 case AArch64::LDRHHui:
4147 return AArch64::LDURHHi;
4148 case AArch64::STRHHui:
4149 return AArch64::STURHHi;
4150 case AArch64::LDRSHXui:
4151 return AArch64::LDURSHXi;
4152 case AArch64::LDRSHWui:
4153 return AArch64::LDURSHWi;
4154 case AArch64::LDRBBui:
4155 return AArch64::LDURBBi;
4156 case AArch64::LDRBui:
4157 return AArch64::LDURBi;
4158 case AArch64::STRBBui:
4159 return AArch64::STURBBi;
4160 case AArch64::STRBui:
4161 return AArch64::STURBi;
4162 case AArch64::LDRSBWui:
4163 return AArch64::LDURSBWi;
4164 case AArch64::LDRSBXui:
4165 return AArch64::LDURSBXi;
4166 }
4167}
4168
4169// Given the opcode of a memory load/store instruction, return the opcode of an
4170// instruction performing the same operation, but using
4171// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
4172// offset register.
4173static unsigned offsetExtendOpcode(unsigned Opcode) {
4174 switch (Opcode) {
4175 default:
4176 llvm_unreachable("Address folding not implemented for instruction");
4177
4178 case AArch64::LDRQroX:
4179 case AArch64::LDURQi:
4180 case AArch64::LDRQui:
4181 return AArch64::LDRQroW;
4182 case AArch64::STRQroX:
4183 case AArch64::STURQi:
4184 case AArch64::STRQui:
4185 return AArch64::STRQroW;
4186 case AArch64::LDRDroX:
4187 case AArch64::LDURDi:
4188 case AArch64::LDRDui:
4189 return AArch64::LDRDroW;
4190 case AArch64::STRDroX:
4191 case AArch64::STURDi:
4192 case AArch64::STRDui:
4193 return AArch64::STRDroW;
4194 case AArch64::LDRXroX:
4195 case AArch64::LDURXi:
4196 case AArch64::LDRXui:
4197 return AArch64::LDRXroW;
4198 case AArch64::STRXroX:
4199 case AArch64::STURXi:
4200 case AArch64::STRXui:
4201 return AArch64::STRXroW;
4202 case AArch64::LDRWroX:
4203 case AArch64::LDURWi:
4204 case AArch64::LDRWui:
4205 return AArch64::LDRWroW;
4206 case AArch64::LDRSWroX:
4207 case AArch64::LDURSWi:
4208 case AArch64::LDRSWui:
4209 return AArch64::LDRSWroW;
4210 case AArch64::STRWroX:
4211 case AArch64::STURWi:
4212 case AArch64::STRWui:
4213 return AArch64::STRWroW;
4214 case AArch64::LDRHroX:
4215 case AArch64::LDURHi:
4216 case AArch64::LDRHui:
4217 return AArch64::LDRHroW;
4218 case AArch64::STRHroX:
4219 case AArch64::STURHi:
4220 case AArch64::STRHui:
4221 return AArch64::STRHroW;
4222 case AArch64::LDRHHroX:
4223 case AArch64::LDURHHi:
4224 case AArch64::LDRHHui:
4225 return AArch64::LDRHHroW;
4226 case AArch64::STRHHroX:
4227 case AArch64::STURHHi:
4228 case AArch64::STRHHui:
4229 return AArch64::STRHHroW;
4230 case AArch64::LDRSHXroX:
4231 case AArch64::LDURSHXi:
4232 case AArch64::LDRSHXui:
4233 return AArch64::LDRSHXroW;
4234 case AArch64::LDRSHWroX:
4235 case AArch64::LDURSHWi:
4236 case AArch64::LDRSHWui:
4237 return AArch64::LDRSHWroW;
4238 case AArch64::LDRBroX:
4239 case AArch64::LDURBi:
4240 case AArch64::LDRBui:
4241 return AArch64::LDRBroW;
4242 case AArch64::LDRBBroX:
4243 case AArch64::LDURBBi:
4244 case AArch64::LDRBBui:
4245 return AArch64::LDRBBroW;
4246 case AArch64::LDRSBXroX:
4247 case AArch64::LDURSBXi:
4248 case AArch64::LDRSBXui:
4249 return AArch64::LDRSBXroW;
4250 case AArch64::LDRSBWroX:
4251 case AArch64::LDURSBWi:
4252 case AArch64::LDRSBWui:
4253 return AArch64::LDRSBWroW;
4254 case AArch64::STRBroX:
4255 case AArch64::STURBi:
4256 case AArch64::STRBui:
4257 return AArch64::STRBroW;
4258 case AArch64::STRBBroX:
4259 case AArch64::STURBBi:
4260 case AArch64::STRBBui:
4261 return AArch64::STRBBroW;
4262 }
4263}
4264
4266 const ExtAddrMode &AM) const {
// Builds a replacement for the load/store MemI whose address operands are
// taken from the addressing mode AM, and returns the newly built instruction.
// Every variant below is created with BuildMI at MemI's position in its
// parent block and inherits MemI's operand 0, memory operands and MI flags.
// NOTE(review): the first line of this signature (return type, function name
// and the declaration of MemI) is not present in this excerpt.
4267
4268 const DebugLoc &DL = MemI.getDebugLoc();
4269 MachineBasicBlock &MBB = *MemI.getParent();
4270 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
4271
// NOTE(review): the statement that opens the scope closed after the second
// `return B.getInstr();` below is missing from this excerpt; presumably it is
// a check on the form of AM -- confirm against the full file.
4273 if (AM.ScaledReg) {
4274 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
4275 unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
4276 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
// Operands: value register (a def when MemI is a load), base register, offset
// register, then two immediates. Presumably these are the extend flag (0) and
// the shift flag (set when AM.Scale > 1) -- confirm against the instruction
// definition.
4277 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
4278 .addReg(MemI.getOperand(0).getReg(),
4279 getDefRegState(MemI.mayLoad()))
4280 .addReg(AM.BaseReg)
4281 .addReg(AM.ScaledReg)
4282 .addImm(0)
4283 .addImm(AM.Scale > 1)
4284 .setMemRefs(MemI.memoperands())
4285 .setMIFlags(MemI.getFlags());
4286 return B.getInstr();
4287 }
4288
4289 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
4290 "Addressing mode not supported for folding");
4291
4292 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
// A displacement that fits a signed 9-bit immediate uses the unscaled opcode
// and Scale stays 1. Otherwise the scaled opcode is used and
// scaledOffsetOpcode updates Scale, so the immediate emitted below is
// Displacement / Scale.
4293 unsigned Scale = 1;
4294 unsigned Opcode = MemI.getOpcode();
4295 if (isInt<9>(AM.Displacement))
4296 Opcode = unscaledOffsetOpcode(Opcode);
4297 else
4298 Opcode = scaledOffsetOpcode(Opcode, Scale);
4299
4300 auto B =
4301 BuildMI(MBB, MemI, DL, get(Opcode))
4302 .addReg(MemI.getOperand(0).getReg(), getDefRegState(MemI.mayLoad()))
4303 .addReg(AM.BaseReg)
4304 .addImm(AM.Displacement / Scale)
4305 .setMemRefs(MemI.memoperands())
4306 .setMIFlags(MemI.getFlags());
4307 return B.getInstr();
4308 }
4309
// NOTE(review): the condition guarding the block below (closed by the `}`
// after the final `return B.getInstr();`) is missing from this excerpt.
4312 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
4313 assert(AM.ScaledReg && !AM.Displacement &&
4314 "Address offset can be a register or an immediate, but not both");
4315 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
4316 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4317 // Make sure the offset register is in the correct register class.
// When the offset register's class has GPR64 as a super-class-or-equal, a
// fresh GPR32 virtual register is created and defined by a COPY of the
// sub_32 subregister of AM.ScaledReg; that copy is used as the offset.
4318 Register OffsetReg = AM.ScaledReg;
4319 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
4320 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
4321 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
4322 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
4323 .addReg(AM.ScaledReg, {}, AArch64::sub_32);
4324 }
// NOTE(review): one builder call between `.addReg(OffsetReg)` and
// `.addImm(AM.Scale != 1)` is missing from this excerpt.
4325 auto B =
4326 BuildMI(MBB, MemI, DL, get(Opcode))
4327 .addReg(MemI.getOperand(0).getReg(), getDefRegState(MemI.mayLoad()))
4328 .addReg(AM.BaseReg)
4329 .addReg(OffsetReg)
4331 .addImm(AM.Scale != 1)
4332 .setMemRefs(MemI.memoperands())
4333 .setMIFlags(MemI.getFlags());
4334
4335 return B.getInstr();
4336 }
4337
// NOTE(review): the call that takes the string below as its argument is
// missing from this excerpt.
4339 "Function must not be called with an addressing mode it can't handle");
4340}
4341
4342/// Return true if the opcode is a post-index ld/st instruction, which really
4343/// loads from base+0.
4344static bool isPostIndexLdStOpcode(unsigned Opcode) {
4345 switch (Opcode) {
4346 default:
4347 return false;
4348 case AArch64::LD1Fourv16b_POST:
4349 case AArch64::LD1Fourv1d_POST:
4350 case AArch64::LD1Fourv2d_POST:
4351 case AArch64::LD1Fourv2s_POST:
4352 case AArch64::LD1Fourv4h_POST:
4353 case AArch64::LD1Fourv4s_POST:
4354 case AArch64::LD1Fourv8b_POST:
4355 case AArch64::LD1Fourv8h_POST:
4356 case AArch64::LD1Onev16b_POST:
4357 case AArch64::LD1Onev1d_POST:
4358 case AArch64::LD1Onev2d_POST:
4359 case AArch64::LD1Onev2s_POST:
4360 case AArch64::LD1Onev4h_POST:
4361 case AArch64::LD1Onev4s_POST:
4362 case AArch64::LD1Onev8b_POST:
4363 case AArch64::LD1Onev8h_POST:
4364 case AArch64::LD1Rv16b_POST:
4365 case AArch64::LD1Rv1d_POST:
4366 case AArch64::LD1Rv2d_POST:
4367 case AArch64::LD1Rv2s_POST:
4368 case AArch64::LD1Rv4h_POST:
4369 case AArch64::LD1Rv4s_POST:
4370 case AArch64::LD1Rv8b_POST:
4371 case AArch64::LD1Rv8h_POST:
4372 case AArch64::LD1Threev16b_POST:
4373 case AArch64::LD1Threev1d_POST:
4374 case AArch64::LD1Threev2d_POST:
4375 case AArch64::LD1Threev2s_POST:
4376 case AArch64::LD1Threev4h_POST:
4377 case AArch64::LD1Threev4s_POST:
4378 case AArch64::LD1Threev8b_POST:
4379 case AArch64::LD1Threev8h_POST:
4380 case AArch64::LD1Twov16b_POST:
4381 case AArch64::LD1Twov1d_POST:
4382 case AArch64::LD1Twov2d_POST:
4383 case AArch64::LD1Twov2s_POST:
4384 case AArch64::LD1Twov4h_POST:
4385 case AArch64::LD1Twov4s_POST:
4386 case AArch64::LD1Twov8b_POST:
4387 case AArch64::LD1Twov8h_POST:
4388 case AArch64::LD1i16_POST:
4389 case AArch64::LD1i32_POST:
4390 case AArch64::LD1i64_POST:
4391 case AArch64::LD1i8_POST:
4392 case AArch64::LD2Rv16b_POST:
4393 case AArch64::LD2Rv1d_POST:
4394 case AArch64::LD2Rv2d_POST:
4395 case AArch64::LD2Rv2s_POST:
4396 case AArch64::LD2Rv4h_POST:
4397 case AArch64::LD2Rv4s_POST:
4398 case AArch64::LD2Rv8b_POST:
4399 case AArch64::LD2Rv8h_POST:
4400 case AArch64::LD2Twov16b_POST:
4401 case AArch64::LD2Twov2d_POST:
4402 case AArch64::LD2Twov2s_POST:
4403 case AArch64::LD2Twov4h_POST:
4404 case AArch64::LD2Twov4s_POST:
4405 case AArch64::LD2Twov8b_POST:
4406 case AArch64::LD2Twov8h_POST:
4407 case AArch64::LD2i16_POST:
4408 case AArch64::LD2i32_POST:
4409 case AArch64::LD2i64_POST:
4410 case AArch64::LD2i8_POST:
4411 case AArch64::LD3Rv16b_POST:
4412 case AArch64::LD3Rv1d_POST:
4413 case AArch64::LD3Rv2d_POST:
4414 case AArch64::LD3Rv2s_POST:
4415 case AArch64::LD3Rv4h_POST:
4416 case AArch64::LD3Rv4s_POST:
4417 case AArch64::LD3Rv8b_POST:
4418 case AArch64::LD3Rv8h_POST:
4419 case AArch64::LD3Threev16b_POST:
4420 case AArch64::LD3Threev2d_POST:
4421 case AArch64::LD3Threev2s_POST:
4422 case AArch64::LD3Threev4h_POST:
4423 case AArch64::LD3Threev4s_POST:
4424 case AArch64::LD3Threev8b_POST:
4425 case AArch64::LD3Threev8h_POST:
4426 case AArch64::LD3i16_POST:
4427 case AArch64::LD3i32_POST:
4428 case AArch64::LD3i64_POST:
4429 case AArch64::LD3i8_POST:
4430 case AArch64::LD4Fourv16b_POST:
4431 case AArch64::LD4Fourv2d_POST:
4432 case AArch64::LD4Fourv2s_POST:
4433 case AArch64::LD4Fourv4h_POST:
4434 case AArch64::LD4Fourv4s_POST:
4435 case AArch64::LD4Fourv8b_POST:
4436 case AArch64::LD4Fourv8h_POST:
4437 case AArch64::LD4Rv16b_POST:
4438 case AArch64::LD4Rv1d_POST:
4439 case AArch64::LD4Rv2d_POST:
4440 case AArch64::LD4Rv2s_POST:
4441 case AArch64::LD4Rv4h_POST:
4442 case AArch64::LD4Rv4s_POST:
4443 case AArch64::LD4Rv8b_POST:
4444 case AArch64::LD4Rv8h_POST:
4445 case AArch64::LD4i16_POST:
4446 case AArch64::LD4i32_POST:
4447 case AArch64::LD4i64_POST:
4448 case AArch64::LD4i8_POST:
4449 case AArch64::LDAPRWpost:
4450 case AArch64::LDAPRXpost:
4451 case AArch64::LDIAPPWpost:
4452 case AArch64::LDIAPPXpost:
4453 case AArch64::LDPDpost:
4454 case AArch64::LDPQpost:
4455 case AArch64::LDPSWpost:
4456 case AArch64::LDPSpost:
4457 case AArch64::LDPWpost:
4458 case AArch64::LDPXpost:
4459 case AArch64::LDRBBpost:
4460 case AArch64::LDRBpost:
4461 case AArch64::LDRDpost:
4462 case AArch64::LDRHHpost:
4463 case AArch64::LDRHpost:
4464 case AArch64::LDRQpost:
4465 case AArch64::LDRSBWpost:
4466 case AArch64::LDRSBXpost:
4467 case AArch64::LDRSHWpost:
4468 case AArch64::LDRSHXpost:
4469 case AArch64::LDRSWpost:
4470 case AArch64::LDRSpost:
4471 case AArch64::LDRWpost:
4472 case AArch64::LDRXpost:
4473 case AArch64::ST1Fourv16b_POST:
4474 case AArch64::ST1Fourv1d_POST:
4475 case AArch64::ST1Fourv2d_POST:
4476 case AArch64::ST1Fourv2s_POST:
4477 case AArch64::ST1Fourv4h_POST:
4478 case AArch64::ST1Fourv4s_POST:
4479 case AArch64::ST1Fourv8b_POST:
4480 case AArch64::ST1Fourv8h_POST:
4481 case AArch64::ST1Onev16b_POST:
4482 case AArch64::ST1Onev1d_POST:
4483 case AArch64::ST1Onev2d_POST:
4484 case AArch64::ST1Onev2s_POST:
4485 case AArch64::ST1Onev4h_POST:
4486 case AArch64::ST1Onev4s_POST:
4487 case AArch64::ST1Onev8b_POST:
4488 case AArch64::ST1Onev8h_POST:
4489 case AArch64::ST1Threev16b_POST:
4490 case AArch64::ST1Threev1d_POST:
4491 case AArch64::ST1Threev2d_POST:
4492 case AArch64::ST1Threev2s_POST:
4493 case AArch64::ST1Threev4h_POST:
4494 case AArch64::ST1Threev4s_POST:
4495 case AArch64::ST1Threev8b_POST:
4496 case AArch64::ST1Threev8h_POST:
4497 case AArch64::ST1Twov16b_POST:
4498 case AArch64::ST1Twov1d_POST:
4499 case AArch64::ST1Twov2d_POST:
4500 case AArch64::ST1Twov2s_POST:
4501 case AArch64::ST1Twov4h_POST:
4502 case AArch64::ST1Twov4s_POST:
4503 case AArch64::ST1Twov8b_POST:
4504 case AArch64::ST1Twov8h_POST:
4505 case AArch64::ST1i16_POST:
4506 case AArch64::ST1i32_POST:
4507 case AArch64::ST1i64_POST:
4508 case AArch64::ST1i8_POST:
4509 case AArch64::ST2GPostIndex:
4510 case AArch64::ST2Twov16b_POST:
4511 case AArch64::ST2Twov2d_POST:
4512 case AArch64::ST2Twov2s_POST:
4513 case AArch64::ST2Twov4h_POST:
4514 case AArch64::ST2Twov4s_POST:
4515 case AArch64::ST2Twov8b_POST:
4516 case AArch64::ST2Twov8h_POST:
4517 case AArch64::ST2i16_POST:
4518 case AArch64::ST2i32_POST:
4519 case AArch64::ST2i64_POST:
4520 case AArch64::ST2i8_POST:
4521 case AArch64::ST3Threev16b_POST:
4522 case AArch64::ST3Threev2d_POST:
4523 case AArch64::ST3Threev2s_POST:
4524 case AArch64::ST3Threev4h_POST:
4525 case AArch64::ST3Threev4s_POST:
4526 case AArch64::ST3Threev8b_POST:
4527 case AArch64::ST3Threev8h_POST:
4528 case AArch64::ST3i16_POST:
4529 case AArch64::ST3i32_POST:
4530 case AArch64::ST3i64_POST:
4531 case AArch64::ST3i8_POST:
4532 case AArch64::ST4Fourv16b_POST:
4533 case AArch64::ST4Fourv2d_POST:
4534 case AArch64::ST4Fourv2s_POST:
4535 case AArch64::ST4Fourv4h_POST:
4536 case AArch64::ST4Fourv4s_POST:
4537 case AArch64::ST4Fourv8b_POST:
4538 case AArch64::ST4Fourv8h_POST:
4539 case AArch64::ST4i16_POST:
4540 case AArch64::ST4i32_POST:
4541 case AArch64::ST4i64_POST:
4542 case AArch64::ST4i8_POST:
4543 case AArch64::STGPostIndex:
4544 case AArch64::STGPpost:
4545 case AArch64::STPDpost:
4546 case AArch64::STPQpost:
4547 case AArch64::STPSpost:
4548 case AArch64::STPWpost:
4549 case AArch64::STPXpost:
4550 case AArch64::STRBBpost:
4551 case AArch64::STRBpost:
4552 case AArch64::STRDpost:
4553 case AArch64::STRHHpost:
4554 case AArch64::STRHpost:
4555 case AArch64::STRQpost:
4556 case AArch64::STRSpost:
4557 case AArch64::STRWpost:
4558 case AArch64::STRXpost:
4559 case AArch64::STZ2GPostIndex:
4560 case AArch64::STZGPostIndex:
4561 return true;
4562 }
4563}
4564
4566 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
4567 bool &OffsetIsScalable, TypeSize &Width,
4568 const TargetRegisterInfo *TRI) const {
// Decomposes the load/store LdSt into a base operand plus an immediate offset.
// On success BaseOp points at the base operand (register or frame index),
// Offset holds the immediate multiplied by the opcode's scale (0 for
// post-index opcodes), OffsetIsScalable reports whether that scale is
// scalable, and Width is the value getMemOpInfo reports for the opcode.
// Returns false for operand layouts or opcodes that are not handled. TRI is
// not used in the body shown here.
// NOTE(review): the first line of this signature (return type and function
// name) is not present in this excerpt.
4569 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4570 // Handle only loads/stores with base register followed by immediate offset.
4571 if (LdSt.getNumExplicitOperands() == 3) {
4572 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
4573 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
4574 !LdSt.getOperand(2).isImm())
4575 return false;
4576 } else if (LdSt.getNumExplicitOperands() == 4) {
4577 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
// NOTE(review): pre/post-indexed single-register opcodes presumably also
// reach this branch with four explicit operands -- confirm against the
// instruction definitions.
4578 if (!LdSt.getOperand(1).isReg() ||
4579 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
4580 !LdSt.getOperand(3).isImm())
4581 return false;
4582 } else
4583 return false;
4584
4585 // Get the scaling factor for the instruction and set the width for the
4586 // instruction.
4587 TypeSize Scale(0U, false);
// Dummy1/Dummy2 receive the min/max offset outputs, which are not needed here.
4588 int64_t Dummy1, Dummy2;
4589
4590 // If this returns false, then it's an instruction we don't want to handle.
4591 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
4592 return false;
4593
4594 // Compute the offset. Offset is calculated as the immediate operand
4595 // multiplied by the scaling factor. Unscaled instructions have scaling factor
4596 // set to 1. Postindex are a special case which have an offset of 0.
// The post-index branch takes operand 2 as the base regardless of the operand
// count. NOTE(review): this assumes every post-index opcode accepted by
// getMemOpInfo has four explicit operands -- confirm.
4597 if (isPostIndexLdStOpcode(LdSt.getOpcode())) {
4598 BaseOp = &LdSt.getOperand(2);
4599 Offset = 0;
4600 } else if (LdSt.getNumExplicitOperands() == 3) {
4601 BaseOp = &LdSt.getOperand(1);
4602 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
4603 } else {
4604 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
4605 BaseOp = &LdSt.getOperand(2);
4606 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
4607 }
4608 OffsetIsScalable = Scale.isScalable();
4609
// Succeed only when the chosen base operand is a register or a frame index.
4610 return BaseOp->isReg() || BaseOp->isFI();
4611}
4612
// Returns a reference to the last explicit operand of the load/store LdSt,
// which is asserted to be an immediate (the offset operand).
// NOTE(review): the signature of this function is not present in this
// excerpt; only its body is visible.
4615 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4616 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
4617 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
4618 return OfsOp;
4619}
4620
4621bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
4622 TypeSize &Width, int64_t &MinOffset,
4623 int64_t &MaxOffset) {
4624 switch (Opcode) {
4625 // Not a memory operation or something we want to handle.
4626 default:
4627 Scale = Width = TypeSize::getFixed(0);
4628 MinOffset = MaxOffset = 0;
4629 return false;
4630 // LDR / STR
4631 case AArch64::LDRQui:
4632 case AArch64::STRQui:
4633 Scale = Width = TypeSize::getFixed(16);
4634 MinOffset = 0;
4635 MaxOffset = 4095;
4636 break;
4637 case AArch64::LDRXui:
4638 case AArch64::LDRDui:
4639 case AArch64::STRXui:
4640 case AArch64::STRDui:
4641 case AArch64::PRFMui:
4642 Scale = Width = TypeSize::getFixed(8);
4643 MinOffset = 0;
4644 MaxOffset = 4095;
4645 break;
4646 case AArch64::LDRWui:
4647 case AArch64::LDRSui:
4648 case AArch64::LDRSWui:
4649 case AArch64::STRWui:
4650 case AArch64::STRSui:
4651 Scale = Width = TypeSize::getFixed(4);
4652 MinOffset = 0;
4653 MaxOffset = 4095;
4654 break;
4655 case AArch64::LDRHui:
4656 case AArch64::LDRHHui:
4657 case AArch64::LDRSHWui:
4658 case AArch64::LDRSHXui:
4659 case AArch64::STRHui:
4660 case AArch64::STRHHui:
4661 Scale = Width = TypeSize::getFixed(2);
4662 MinOffset = 0;
4663 MaxOffset = 4095;
4664 break;
4665 case AArch64::LDRBui:
4666 case AArch64::LDRBBui:
4667 case AArch64::LDRSBWui:
4668 case AArch64::LDRSBXui:
4669 case AArch64::STRBui:
4670 case AArch64::STRBBui:
4671 Scale = Width = TypeSize::getFixed(1);
4672 MinOffset = 0;
4673 MaxOffset = 4095;
4674 break;
4675 // post/pre inc
4676 case AArch64::STRQpre:
4677 case AArch64::LDRQpost:
4678 Scale = TypeSize::getFixed(1);
4679 Width = TypeSize::getFixed(16);
4680 MinOffset = -256;
4681 MaxOffset = 255;
4682 break;
4683 case AArch64::LDRDpost:
4684 case AArch64::LDRDpre:
4685 case AArch64::LDRXpost:
4686 case AArch64::LDRXpre:
4687 case AArch64::STRDpost:
4688 case AArch64::STRDpre:
4689 case AArch64::STRXpost:
4690 case AArch64::STRXpre:
4691 Scale = TypeSize::getFixed(1);
4692 Width = TypeSize::getFixed(8);
4693 MinOffset = -256;
4694 MaxOffset = 255;
4695 break;
4696 case AArch64::STRWpost:
4697 case AArch64::STRWpre:
4698 case AArch64::LDRWpost:
4699 case AArch64::LDRWpre:
4700 case AArch64::STRSpost:
4701 case AArch64::STRSpre:
4702 case AArch64::LDRSpost:
4703 case AArch64::LDRSpre:
4704 Scale = TypeSize::getFixed(1);
4705 Width = TypeSize::getFixed(4);
4706 MinOffset = -256;
4707 MaxOffset = 255;
4708 break;
4709 case AArch64::LDRHpost:
4710 case AArch64::LDRHpre:
4711 case AArch64::STRHpost:
4712 case AArch64::STRHpre:
4713 case AArch64::LDRHHpost:
4714 case AArch64::LDRHHpre:
4715 case AArch64::STRHHpost:
4716 case AArch64::STRHHpre:
4717 Scale = TypeSize::getFixed(1);
4718 Width = TypeSize::getFixed(2);
4719 MinOffset = -256;
4720 MaxOffset = 255;
4721 break;
4722 case AArch64::LDRBpost:
4723 case AArch64::LDRBpre:
4724 case AArch64::STRBpost:
4725 case AArch64::STRBpre:
4726 case AArch64::LDRBBpost:
4727 case AArch64::LDRBBpre:
4728 case AArch64::STRBBpost:
4729 case AArch64::STRBBpre:
4730 Scale = Width = TypeSize::getFixed(1);
4731 MinOffset = -256;
4732 MaxOffset = 255;
4733 break;
4734 // Unscaled
4735 case AArch64::LDURQi:
4736 case AArch64::STURQi:
4737 Scale = TypeSize::getFixed(1);
4738 Width = TypeSize::getFixed(16);
4739 MinOffset = -256;
4740 MaxOffset = 255;
4741 break;
4742 case AArch64::LDURXi:
4743 case AArch64::LDURDi:
4744 case AArch64::LDAPURXi:
4745 case AArch64::STURXi:
4746 case AArch64::STURDi:
4747 case AArch64::STLURXi:
4748 case AArch64::PRFUMi:
4749 Scale = TypeSize::getFixed(1);
4750 Width = TypeSize::getFixed(8);
4751 MinOffset = -256;
4752 MaxOffset = 255;
4753 break;
4754 case AArch64::LDURWi:
4755 case AArch64::LDURSi:
4756 case AArch64::LDURSWi:
4757 case AArch64::LDAPURi:
4758 case AArch64::LDAPURSWi:
4759 case AArch64::STURWi:
4760 case AArch64::STURSi:
4761 case AArch64::STLURWi:
4762 Scale = TypeSize::getFixed(1);
4763 Width = TypeSize::getFixed(4);
4764 MinOffset = -256;
4765 MaxOffset = 255;
4766 break;
4767 case AArch64::LDURHi:
4768 case AArch64::LDURHHi:
4769 case AArch64::LDURSHXi:
4770 case AArch64::LDURSHWi:
4771 case AArch64::LDAPURHi:
4772 case AArch64::LDAPURSHWi:
4773 case AArch64::LDAPURSHXi:
4774 case AArch64::STURHi:
4775 case AArch64::STURHHi:
4776 case AArch64::STLURHi:
4777 Scale = TypeSize::getFixed(1);
4778 Width = TypeSize::getFixed(2);
4779 MinOffset = -256;
4780 MaxOffset = 255;
4781 break;
4782 case AArch64::LDURBi:
4783 case AArch64::LDURBBi:
4784 case AArch64::LDURSBXi:
4785 case AArch64::LDURSBWi:
4786 case AArch64::LDAPURBi:
4787 case AArch64::LDAPURSBWi:
4788 case AArch64::LDAPURSBXi:
4789 case AArch64::STURBi:
4790 case AArch64::STURBBi:
4791 case AArch64::STLURBi:
4792 Scale = Width = TypeSize::getFixed(1);
4793 MinOffset = -256;
4794 MaxOffset = 255;
4795 break;
4796 // LDP / STP (including pre/post inc)
4797 case AArch64::LDPQi:
4798 case AArch64::LDNPQi:
4799 case AArch64::STPQi:
4800 case AArch64::STNPQi:
4801 case AArch64::LDPQpost:
4802 case AArch64::LDPQpre:
4803 case AArch64::STPQpost:
4804 case AArch64::STPQpre:
4805 Scale = TypeSize::getFixed(16);
4806 Width = TypeSize::getFixed(16 * 2);
4807 MinOffset = -64;
4808 MaxOffset = 63;
4809 break;
4810 case AArch64::LDPXi:
4811 case AArch64::LDPDi:
4812 case AArch64::LDNPXi:
4813 case AArch64::LDNPDi:
4814 case AArch64::STPXi:
4815 case AArch64::STPDi:
4816 case AArch64::STNPXi:
4817 case AArch64::STNPDi:
4818 case AArch64::LDPDpost:
4819 case AArch64::LDPDpre:
4820 case AArch64::LDPXpost:
4821 case AArch64::LDPXpre:
4822 case AArch64::STPDpost:
4823 case AArch64::STPDpre:
4824 case AArch64::STPXpost:
4825 case AArch64::STPXpre:
4826 Scale = TypeSize::getFixed(8);
4827 Width = TypeSize::getFixed(8 * 2);
4828 MinOffset = -64;
4829 MaxOffset = 63;
4830 break;
4831 case AArch64::LDPWi:
4832 case AArch64::LDPSi:
4833 case AArch64::LDNPWi:
4834 case AArch64::LDNPSi:
4835 case AArch64::STPWi:
4836 case AArch64::STPSi:
4837 case AArch64::STNPWi:
4838 case AArch64::STNPSi:
4839 case AArch64::LDPSpost:
4840 case AArch64::LDPSpre:
4841 case AArch64::LDPWpost:
4842 case AArch64::LDPWpre:
4843 case AArch64::STPSpost:
4844 case AArch64::STPSpre:
4845 case AArch64::STPWpost:
4846 case AArch64::STPWpre:
4847 Scale = TypeSize::getFixed(4);
4848 Width = TypeSize::getFixed(4 * 2);
4849 MinOffset = -64;
4850 MaxOffset = 63;
4851 break;
4852 case AArch64::StoreSwiftAsyncContext:
4853 // Store is an STRXui, but there might be an ADDXri in the expansion too.
4854 Scale = TypeSize::getFixed(1);
4855 Width = TypeSize::getFixed(8);
4856 MinOffset = 0;
4857 MaxOffset = 4095;
4858 break;
4859 case AArch64::ADDG:
4860 Scale = TypeSize::getFixed(16);
4861 Width = TypeSize::getFixed(0);
4862 MinOffset = 0;
4863 MaxOffset = 63;
4864 break;
4865 case AArch64::TAGPstack:
4866 Scale = TypeSize::getFixed(16);
4867 Width = TypeSize::getFixed(0);
4868 // TAGP with a negative offset turns into SUBP, which has a maximum offset
4869 // of 63 (not 64!).
4870 MinOffset = -63;
4871 MaxOffset = 63;
4872 break;
4873 case AArch64::LDG:
4874 case AArch64::STGi:
4875 case AArch64::STGPreIndex:
4876 case AArch64::STGPostIndex:
4877 case AArch64::STZGi:
4878 case AArch64::STZGPreIndex:
4879 case AArch64::STZGPostIndex:
4880 Scale = Width = TypeSize::getFixed(16);
4881 MinOffset = -256;
4882 MaxOffset = 255;
4883 break;
4884 // SVE
4885 case AArch64::STR_ZZZZXI:
4886 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
4887 case AArch64::LDR_ZZZZXI:
4888 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
4889 Scale = TypeSize::getScalable(16);
4890 Width = TypeSize::getScalable(16 * 4);
4891 MinOffset = -256;
4892 MaxOffset = 252;
4893 break;
4894 case AArch64::STR_ZZZXI:
4895 case AArch64::LDR_ZZZXI:
4896 Scale = TypeSize::getScalable(16);
4897 Width = TypeSize::getScalable(16 * 3);
4898 MinOffset = -256;
4899 MaxOffset = 253;
4900 break;
4901 case AArch64::STR_ZZXI:
4902 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
4903 case AArch64::LDR_ZZXI:
4904 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
4905 Scale = TypeSize::getScalable(16);
4906 Width = TypeSize::getScalable(16 * 2);
4907 MinOffset = -256;
4908 MaxOffset = 254;
4909 break;
4910 case AArch64::LDR_PXI:
4911 case AArch64::STR_PXI:
4912 Scale = Width = TypeSize::getScalable(2);
4913 MinOffset = -256;
4914 MaxOffset = 255;
4915 break;
4916 case AArch64::LDR_PPXI:
4917 case AArch64::STR_PPXI:
4918 Scale = TypeSize::getScalable(2);
4919 Width = TypeSize::getScalable(2 * 2);
4920 MinOffset = -256;
4921 MaxOffset = 254;
4922 break;
4923 case AArch64::LDR_ZXI:
4924 case AArch64::STR_ZXI:
4925 Scale = Width = TypeSize::getScalable(16);
4926 MinOffset = -256;
4927 MaxOffset = 255;
4928 break;
4929 case AArch64::LD1B_IMM:
4930 case AArch64::LD1H_IMM:
4931 case AArch64::LD1W_IMM:
4932 case AArch64::LD1D_IMM:
4933 case AArch64::LDNT1B_ZRI:
4934 case AArch64::LDNT1H_ZRI:
4935 case AArch64::LDNT1W_ZRI:
4936 case AArch64::LDNT1D_ZRI:
4937 case AArch64::ST1B_IMM:
4938 case AArch64::ST1H_IMM:
4939 case AArch64::ST1W_IMM:
4940 case AArch64::ST1D_IMM:
4941 case AArch64::STNT1B_ZRI:
4942 case AArch64::STNT1H_ZRI:
4943 case AArch64::STNT1W_ZRI:
4944 case AArch64::STNT1D_ZRI:
4945 case AArch64::LDNF1B_IMM:
4946 case AArch64::LDNF1H_IMM:
4947 case AArch64::LDNF1W_IMM:
4948 case AArch64::LDNF1D_IMM:
4949 // A full vectors worth of data
4950 // Width = mbytes * elements
4951 Scale = Width = TypeSize::getScalable(16);
4952 MinOffset = -8;
4953 MaxOffset = 7;
4954 break;
4955 case AArch64::LD2B_IMM:
4956 case AArch64::LD2H_IMM:
4957 case AArch64::LD2W_IMM:
4958 case AArch64::LD2D_IMM:
4959 case AArch64::ST2B_IMM:
4960 case AArch64::ST2H_IMM:
4961 case AArch64::ST2W_IMM:
4962 case AArch64::ST2D_IMM:
4963 case AArch64::LD1B_2Z_IMM:
4964 case AArch64::LD1B_2Z_STRIDED_IMM:
4965 case AArch64::LD1H_2Z_IMM:
4966 case AArch64::LD1H_2Z_STRIDED_IMM:
4967 case AArch64::LD1W_2Z_IMM:
4968 case AArch64::LD1W_2Z_STRIDED_IMM:
4969 case AArch64::LD1D_2Z_IMM:
4970 case AArch64::LD1D_2Z_STRIDED_IMM:
4971 case AArch64::LD1B_2Z_IMM_PSEUDO:
4972 case AArch64::LD1H_2Z_IMM_PSEUDO:
4973 case AArch64::LD1W_2Z_IMM_PSEUDO:
4974 case AArch64::LD1D_2Z_IMM_PSEUDO:
4975 case AArch64::ST1B_2Z_IMM:
4976 case AArch64::ST1B_2Z_STRIDED_IMM:
4977 case AArch64::ST1H_2Z_IMM:
4978 case AArch64::ST1H_2Z_STRIDED_IMM:
4979 case AArch64::ST1W_2Z_IMM:
4980 case AArch64::ST1W_2Z_STRIDED_IMM:
4981 case AArch64::ST1D_2Z_IMM:
4982 case AArch64::ST1D_2Z_STRIDED_IMM:
4983 case AArch64::LDNT1B_2Z_IMM_PSEUDO:
4984 case AArch64::LDNT1B_2Z_IMM:
4985 case AArch64::LDNT1B_2Z_STRIDED_IMM:
4986 case AArch64::LDNT1H_2Z_IMM_PSEUDO:
4987 case AArch64::LDNT1H_2Z_IMM:
4988 case AArch64::LDNT1H_2Z_STRIDED_IMM:
4989 case AArch64::LDNT1W_2Z_IMM_PSEUDO:
4990 case AArch64::LDNT1W_2Z_IMM:
4991 case AArch64::LDNT1W_2Z_STRIDED_IMM:
4992 case AArch64::LDNT1D_2Z_IMM_PSEUDO:
4993 case AArch64::LDNT1D_2Z_IMM:
4994 case AArch64::LDNT1D_2Z_STRIDED_IMM:
4995 case AArch64::STNT1B_2Z_IMM:
4996 case AArch64::STNT1B_2Z_STRIDED_IMM:
4997 case AArch64::STNT1H_2Z_IMM:
4998 case AArch64::STNT1H_2Z_STRIDED_IMM:
4999 case AArch64::STNT1W_2Z_IMM:
5000 case AArch64::STNT1W_2Z_STRIDED_IMM:
5001 case AArch64::STNT1D_2Z_IMM:
5002 case AArch64::STNT1D_2Z_STRIDED_IMM:
5003 Scale = Width = TypeSize::getScalable(16 * 2);
5004 MinOffset = -8;
5005 MaxOffset = 7;
5006 break;
5007 case AArch64::LD3B_IMM:
5008 case AArch64::LD3H_IMM:
5009 case AArch64::LD3W_IMM:
5010 case AArch64::LD3D_IMM:
5011 case AArch64::ST3B_IMM:
5012 case AArch64::ST3H_IMM:
5013 case AArch64::ST3W_IMM:
5014 case AArch64::ST3D_IMM:
5015 Scale = Width = TypeSize::getScalable(16 * 3);
5016 MinOffset = -8;
5017 MaxOffset = 7;
5018 break;
5019 case AArch64::LD4B_IMM:
5020 case AArch64::LD4H_IMM:
5021 case AArch64::LD4W_IMM:
5022 case AArch64::LD4D_IMM:
5023 case AArch64::ST4B_IMM:
5024 case AArch64::ST4H_IMM:
5025 case AArch64::ST4W_IMM:
5026 case AArch64::ST4D_IMM:
5027 case AArch64::LD1B_4Z_IMM:
5028 case AArch64::LD1B_4Z_STRIDED_IMM:
5029 case AArch64::LD1H_4Z_IMM:
5030 case AArch64::LD1H_4Z_STRIDED_IMM:
5031 case AArch64::LD1W_4Z_IMM:
5032 case AArch64::LD1W_4Z_STRIDED_IMM:
5033 case AArch64::LD1D_4Z_IMM:
5034 case AArch64::LD1D_4Z_STRIDED_IMM:
5035 case AArch64::LD1B_4Z_IMM_PSEUDO:
5036 case AArch64::LD1H_4Z_IMM_PSEUDO:
5037 case AArch64::LD1W_4Z_IMM_PSEUDO:
5038 case AArch64::LD1D_4Z_IMM_PSEUDO:
5039 case AArch64::ST1B_4Z_IMM:
5040 case AArch64::ST1B_4Z_STRIDED_IMM:
5041 case AArch64::ST1H_4Z_IMM:
5042 case AArch64::ST1H_4Z_STRIDED_IMM:
5043 case AArch64::ST1W_4Z_IMM:
5044 case AArch64::ST1W_4Z_STRIDED_IMM:
5045 case AArch64::ST1D_4Z_IMM:
5046 case AArch64::ST1D_4Z_STRIDED_IMM:
5047 case AArch64::LDNT1B_4Z_IMM_PSEUDO:
5048 case AArch64::LDNT1B_4Z_IMM:
5049 case AArch64::LDNT1B_4Z_STRIDED_IMM:
5050 case AArch64::LDNT1H_4Z_IMM_PSEUDO:
5051 case AArch64::LDNT1H_4Z_IMM:
5052 case AArch64::LDNT1H_4Z_STRIDED_IMM:
5053 case AArch64::LDNT1W_4Z_IMM_PSEUDO:
5054 case AArch64::LDNT1W_4Z_IMM:
5055 case AArch64::LDNT1W_4Z_STRIDED_IMM:
5056 case AArch64::LDNT1D_4Z_IMM_PSEUDO:
5057 case AArch64::LDNT1D_4Z_IMM:
5058 case AArch64::LDNT1D_4Z_STRIDED_IMM:
5059 case AArch64::STNT1B_4Z_IMM:
5060 case AArch64::STNT1B_4Z_STRIDED_IMM:
5061 case AArch64::STNT1H_4Z_IMM:
5062 case AArch64::STNT1H_4Z_STRIDED_IMM:
5063 case AArch64::STNT1W_4Z_IMM:
5064 case AArch64::STNT1W_4Z_STRIDED_IMM:
5065 case AArch64::STNT1D_4Z_IMM:
5066 case AArch64::STNT1D_4Z_STRIDED_IMM:
5067 Scale = Width = TypeSize::getScalable(16 * 4);
5068 MinOffset = -8;
5069 MaxOffset = 7;
5070 break;
5071 case AArch64::LD1B_H_IMM:
5072 case AArch64::LD1SB_H_IMM:
5073 case AArch64::LD1H_S_IMM:
5074 case AArch64::LD1SH_S_IMM:
5075 case AArch64::LD1W_D_IMM:
5076 case AArch64::LD1SW_D_IMM:
5077 case AArch64::ST1B_H_IMM:
5078 case AArch64::ST1H_S_IMM:
5079 case AArch64::ST1W_D_IMM:
5080 case AArch64::LDNF1B_H_IMM:
5081 case AArch64::LDNF1SB_H_IMM:
5082 case AArch64::LDNF1H_S_IMM:
5083 case AArch64::LDNF1SH_S_IMM:
5084 case AArch64::LDNF1W_D_IMM:
5085 case AArch64::LDNF1SW_D_IMM:
5086 // A half vector worth of data
5087 // Width = mbytes * elements
5088 Scale = Width = TypeSize::getScalable(8);
5089 MinOffset = -8;
5090 MaxOffset = 7;
5091 break;
5092 case AArch64::LD1B_S_IMM:
5093 case AArch64::LD1SB_S_IMM:
5094 case AArch64::LD1H_D_IMM:
5095 case AArch64::LD1SH_D_IMM:
5096 case AArch64::ST1B_S_IMM:
5097 case AArch64::ST1H_D_IMM:
5098 case AArch64::LDNF1B_S_IMM:
5099 case AArch64::LDNF1SB_S_IMM:
5100 case AArch64::LDNF1H_D_IMM:
5101 case AArch64::LDNF1SH_D_IMM:
5102 // A quarter vector worth of data
5103 // Width = mbytes * elements
5104 Scale = Width = TypeSize::getScalable(4);
5105 MinOffset = -8;
5106 MaxOffset = 7;
5107 break;
5108 case AArch64::LD1B_D_IMM:
5109 case AArch64::LD1SB_D_IMM:
5110 case AArch64::ST1B_D_IMM:
5111 case AArch64::LDNF1B_D_IMM:
5112 case AArch64::LDNF1SB_D_IMM:
5113 // A eighth vector worth of data
5114 // Width = mbytes * elements
5115 Scale = Width = TypeSize::getScalable(2);
5116 MinOffset = -8;
5117 MaxOffset = 7;
5118 break;
5119 case AArch64::ST2Gi:
5120 case AArch64::ST2GPreIndex:
5121 case AArch64::ST2GPostIndex:
5122 case AArch64::STZ2Gi:
5123 case AArch64::STZ2GPreIndex:
5124 case AArch64::STZ2GPostIndex:
5125 Scale = TypeSize::getFixed(16);
5126 Width = TypeSize::getFixed(32);
5127 MinOffset = -256;
5128 MaxOffset = 255;
5129 break;
5130 case AArch64::STGPi:
5131 case AArch64::STGPpost:
5132 case AArch64::STGPpre:
5133 Scale = Width = TypeSize::getFixed(16);
5134 MinOffset = -64;
5135 MaxOffset = 63;
5136 break;
5137 case AArch64::LD1RB_IMM:
5138 case AArch64::LD1RB_H_IMM:
5139 case AArch64::LD1RB_S_IMM:
5140 case AArch64::LD1RB_D_IMM:
5141 case AArch64::LD1RSB_H_IMM:
5142 case AArch64::LD1RSB_S_IMM:
5143 case AArch64::LD1RSB_D_IMM:
5144 Scale = Width = TypeSize::getFixed(1);
5145 MinOffset = 0;
5146 MaxOffset = 63;
5147 break;
5148 case AArch64::LD1RH_IMM:
5149 case AArch64::LD1RH_S_IMM:
5150 case AArch64::LD1RH_D_IMM:
5151 case AArch64::LD1RSH_S_IMM:
5152 case AArch64::LD1RSH_D_IMM:
5153 Scale = Width = TypeSize::getFixed(2);
5154 MinOffset = 0;
5155 MaxOffset = 63;
5156 break;
5157 case AArch64::LD1RW_IMM:
5158 case AArch64::LD1RW_D_IMM:
5159 case AArch64::LD1RSW_IMM:
5160 Scale = Width = TypeSize::getFixed(4);
5161 MinOffset = 0;
5162 MaxOffset = 63;
5163 break;
5164 case AArch64::LD1RD_IMM:
5165 Scale = Width = TypeSize::getFixed(8);
5166 MinOffset = 0;
5167 MaxOffset = 63;
5168 break;
5169 }
5170
5171 return true;
5172}
5173
5174// Scaling factor for unscaled load or store.
// Maps a load/store opcode to a byte count: 1 for the B/BB/SB forms, 2 for
// the H/HH/SH forms, 4 for the S/W/SW forms, 8 for the D/X forms, and 16 for
// the Q forms plus STGi, STZGi, ST2Gi, STZ2Gi and STGPi. Any opcode that is
// not listed reaches llvm_unreachable.
// NOTE(review): the declaration line (embedded number 5175) is missing from
// this listing; shouldClusterFI calls this as AArch64InstrInfo::getMemScale
// and stores the result in an int.
5176 switch (Opc) {
5177 default:
5178 llvm_unreachable("Opcode has unknown scale!");
5179 case AArch64::LDRBui:
5180 case AArch64::LDRBBui:
5181 case AArch64::LDURBBi:
5182 case AArch64::LDRSBWui:
5183 case AArch64::LDURSBWi:
5184 case AArch64::STRBui:
5185 case AArch64::STRBBui:
5186 case AArch64::STURBBi:
5187 return 1;
5188 case AArch64::LDRHui:
5189 case AArch64::LDRHHui:
5190 case AArch64::LDURHHi:
5191 case AArch64::LDRSHWui:
5192 case AArch64::LDURSHWi:
5193 case AArch64::STRHui:
5194 case AArch64::STRHHui:
5195 case AArch64::STURHHi:
5196 return 2;
5197 case AArch64::LDRSui:
5198 case AArch64::LDURSi:
5199 case AArch64::LDRSpre:
5200 case AArch64::LDRSWui:
5201 case AArch64::LDURSWi:
5202 case AArch64::LDRSWpre:
5203 case AArch64::LDRWpre:
5204 case AArch64::LDRWui:
5205 case AArch64::LDURWi:
5206 case AArch64::STRSui:
5207 case AArch64::STURSi:
5208 case AArch64::STRSpre:
5209 case AArch64::STRWui:
5210 case AArch64::STURWi:
5211 case AArch64::STRWpre:
5212 case AArch64::LDPSi:
5213 case AArch64::LDPSWi:
5214 case AArch64::LDPWi:
5215 case AArch64::STPSi:
5216 case AArch64::STPWi:
5217 return 4;
5218 case AArch64::LDRDui:
5219 case AArch64::LDURDi:
5220 case AArch64::LDRDpre:
5221 case AArch64::LDRXui:
5222 case AArch64::LDURXi:
5223 case AArch64::LDRXpre:
5224 case AArch64::STRDui:
5225 case AArch64::STURDi:
5226 case AArch64::STRDpre:
5227 case AArch64::STRXui:
5228 case AArch64::STURXi:
5229 case AArch64::STRXpre:
5230 case AArch64::LDPDi:
5231 case AArch64::LDPXi:
5232 case AArch64::STPDi:
5233 case AArch64::STPXi:
5234 return 8;
5235 case AArch64::LDRQui:
5236 case AArch64::LDURQi:
5237 case AArch64::STRQui:
5238 case AArch64::STURQi:
5239 case AArch64::STRQpre:
5240 case AArch64::LDPQi:
5241 case AArch64::LDRQpre:
5242 case AArch64::STPQi:
5243 case AArch64::STGi:
5244 case AArch64::STZGi:
5245 case AArch64::ST2Gi:
5246 case AArch64::STZ2Gi:
5247 case AArch64::STGPi:
5248 return 16;
5249 }
5250}
5251
// Returns true only when MI's opcode is one of the LDR*pre opcodes listed
// below (W, X, SW, S, D and Q forms); every other opcode yields false.
// NOTE(review): the declaration line (embedded number 5252) is missing from
// this listing; the name isPreLd is inferred from the combined predicate
// further down that calls isPreLd(MI) || isPreSt(MI) - confirm.
5253 switch (MI.getOpcode()) {
5254 default:
5255 return false;
5256 case AArch64::LDRWpre:
5257 case AArch64::LDRXpre:
5258 case AArch64::LDRSWpre:
5259 case AArch64::LDRSpre:
5260 case AArch64::LDRDpre:
5261 case AArch64::LDRQpre:
5262 return true;
5263 }
5264}
5265
// Returns true only when MI's opcode is one of the STR*pre opcodes listed
// below (W, X, S, D and Q forms); every other opcode yields false.
// NOTE(review): the declaration line (embedded number 5266) is missing from
// this listing; the name isPreSt is inferred from the combined predicate
// that follows - confirm.
5267 switch (MI.getOpcode()) {
5268 default:
5269 return false;
5270 case AArch64::STRWpre:
5271 case AArch64::STRXpre:
5272 case AArch64::STRSpre:
5273 case AArch64::STRDpre:
5274 case AArch64::STRQpre:
5275 return true;
5276 }
5277}
5278
// True when MI is accepted by either isPreLd or isPreSt.
// NOTE(review): the declaration line (embedded number 5279) is missing from
// this listing.
5280 return isPreLd(MI) || isPreSt(MI);
5281}
5282
// Returns true only for the byte (BB), halfword (HH) and word (W) load
// opcodes listed below, in their LDUR*i, LDR*ui, LDR*roX and LDR*roW forms.
// NOTE(review): the declaration line (embedded number 5283) is missing from
// this listing, so the function name is not visible here. The list mirrors
// the LDRS*/LDURS* list of the next predicate, so this is presumably the
// zero-extending counterpart - confirm.
5284 switch (MI.getOpcode()) {
5285 default:
5286 return false;
5287 case AArch64::LDURBBi:
5288 case AArch64::LDURHHi:
5289 case AArch64::LDURWi:
5290 case AArch64::LDRBBui:
5291 case AArch64::LDRHHui:
5292 case AArch64::LDRWui:
5293 case AArch64::LDRBBroX:
5294 case AArch64::LDRHHroX:
5295 case AArch64::LDRWroX:
5296 case AArch64::LDRBBroW:
5297 case AArch64::LDRHHroW:
5298 case AArch64::LDRWroW:
5299 return true;
5300 }
5301}
5302
// Returns true only for the LDRS*/LDURS* opcodes listed below: SB and SH
// loads into W or X, and SW loads, each in LDURS*i, LDRS*ui, LDRS*roX and
// LDRS*roW form.
// NOTE(review): the declaration line (embedded number 5303) is missing from
// this listing, so the function name is not visible here; presumably this is
// the sign-extending load predicate - confirm.
5304 switch (MI.getOpcode()) {
5305 default:
5306 return false;
5307 case AArch64::LDURSBWi:
5308 case AArch64::LDURSHWi:
5309 case AArch64::LDURSBXi:
5310 case AArch64::LDURSHXi:
5311 case AArch64::LDURSWi:
5312 case AArch64::LDRSBWui:
5313 case AArch64::LDRSHWui:
5314 case AArch64::LDRSBXui:
5315 case AArch64::LDRSHXui:
5316 case AArch64::LDRSWui:
5317 case AArch64::LDRSBWroX:
5318 case AArch64::LDRSHWroX:
5319 case AArch64::LDRSBXroX:
5320 case AArch64::LDRSHXroX:
5321 case AArch64::LDRSWroX:
5322 case AArch64::LDRSBWroW:
5323 case AArch64::LDRSHWroW:
5324 case AArch64::LDRSBXroW:
5325 case AArch64::LDRSHXroW:
5326 case AArch64::LDRSWroW:
5327 return true;
5328 }
5329}
5330
// Returns true only for the LDP*i / STP*i opcodes listed below (S, SW, D, Q,
// W and X forms) and for STGPi; every other opcode yields false.
// NOTE(review): the declaration line (embedded number 5331) is missing from
// this listing, so the function name is not visible here.
5332 switch (MI.getOpcode()) {
5333 default:
5334 return false;
5335 case AArch64::LDPSi:
5336 case AArch64::LDPSWi:
5337 case AArch64::LDPDi:
5338 case AArch64::LDPQi:
5339 case AArch64::LDPWi:
5340 case AArch64::LDPXi:
5341 case AArch64::STPSi:
5342 case AArch64::STPDi:
5343 case AArch64::STPQi:
5344 case AArch64::STPWi:
5345 case AArch64::STPXi:
5346 case AArch64::STGPi:
5347 return true;
5348 }
5349}
5350
// Returns operand number Idx of a load/store instruction; asserts that MI may
// load or store.
// NOTE(review): the declaration line (embedded number 5351) and the condition
// that selects Idx (5354) are missing from this listing. Only the fallback
// index 1 is visible, so the alternative index and when it applies must be
// checked against the real file.
5352 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5353 unsigned Idx =
5355 : 1;
5356 return MI.getOperand(Idx);
5357}
5358
// Returns, by const reference, operand number Idx of a load/store
// instruction; asserts that MI may load or store.
// NOTE(review): the line carrying the function name and parameters (embedded
// number 5360) and the condition that selects Idx (5363) are missing from
// this listing. Only the fallback index 2 is visible.
5359const MachineOperand &
5361 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5362 unsigned Idx =
5364 : 2;
5365 return MI.getOperand(Idx);
5366}
5367
// Returns, by const reference, operand 4 of MI for the LDR*roX opcodes listed
// below; any other opcode reaches llvm_unreachable.
// NOTE(review): the line carrying the function name and parameters (embedded
// number 5369) is missing from this listing.
5368const MachineOperand &
5370 switch (MI.getOpcode()) {
5371 default:
5372 llvm_unreachable("Unexpected opcode");
5373 case AArch64::LDRBroX:
5374 case AArch64::LDRBBroX:
5375 case AArch64::LDRSBXroX:
5376 case AArch64::LDRSBWroX:
5377 case AArch64::LDRHroX:
5378 case AArch64::LDRHHroX:
5379 case AArch64::LDRSHXroX:
5380 case AArch64::LDRSHWroX:
5381 case AArch64::LDRWroX:
5382 case AArch64::LDRSroX:
5383 case AArch64::LDRSWroX:
5384 case AArch64::LDRDroX:
5385 case AArch64::LDRXroX:
5386 case AArch64::LDRQroX:
5387 return MI.getOperand(4);
5388 }
5389}
5390
// Looks up the register class recorded for Reg in the MachineRegisterInfo of
// the function that contains MI. Returns nullptr when MI is not inside a
// basic block or that block has no parent function; otherwise returns
// whatever getRegClassOrNull reports.
// NOTE(review): the first declaration line (embedded number 5391) is missing
// from this listing; callers below invoke this as ::getRegClass(MI, Reg).
5392 Register Reg) {
5393 if (MI.getParent() == nullptr)
5394 return nullptr;
5395 const MachineFunction *MF = MI.getParent()->getParent();
5396 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
5397}
5398
// Returns true if any operand of MI is a register belonging to the FPR16
// class.
// NOTE(review): the declaration line (embedded number 5399) is missing from
// this listing, so the function name is not visible here.
5400 auto IsHFPR = [&](const MachineOperand &Op) {
5401 if (!Op.isReg())
5402 return false;
5403 auto Reg = Op.getReg();
// Physical registers are tested by class membership.
5404 if (Reg.isPhysical())
5405 return AArch64::FPR16RegClass.contains(Reg);
// Otherwise compare the register class recorded for Reg against FPR16 and
// FPR16_lo.
5406 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5407 return TRC == &AArch64::FPR16RegClass ||
5408 TRC == &AArch64::FPR16_loRegClass;
5409 };
5410 return llvm::any_of(MI.operands(), IsHFPR);
5411}
5412
// Returns true if any operand of MI is a register belonging to the FPR128
// class.
// NOTE(review): the declaration line (embedded number 5413) is missing from
// this listing, so the function name is not visible here.
5414 auto IsQFPR = [&](const MachineOperand &Op) {
5415 if (!Op.isReg())
5416 return false;
5417 auto Reg = Op.getReg();
// Physical registers are tested by class membership.
5418 if (Reg.isPhysical())
5419 return AArch64::FPR128RegClass.contains(Reg);
// Otherwise compare the register class recorded for Reg against FPR128 and
// FPR128_lo.
5420 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5421 return TRC == &AArch64::FPR128RegClass ||
5422 TRC == &AArch64::FPR128_loRegClass;
5423 };
5424 return llvm::any_of(MI.operands(), IsQFPR);
5425}
5426
// Classifies MI by opcode: BRK, HLT, PACIASP, PACIBSP and PAUTH_PROLOGUE
// always yield true; HINT yields true only for the immediates 32, 34, 36, 38,
// 25 and 27; every other opcode yields false.
// NOTE(review): the declaration line (embedded number 5427) is missing from
// this listing, so the function name is not visible here.
5428 switch (MI.getOpcode()) {
5429 case AArch64::BRK:
5430 case AArch64::HLT:
5431 case AArch64::PACIASP:
5432 case AArch64::PACIBSP:
5433 // Implicit BTI behavior.
5434 return true;
5435 case AArch64::PAUTH_PROLOGUE:
5436 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
5437 return true;
5438 case AArch64::HINT: {
5439 unsigned Imm = MI.getOperand(0).getImm();
5440 // Explicit BTI instruction.
5441 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
5442 return true;
5443 // PACI(A|B)SP instructions.
5444 if (Imm == 25 || Imm == 27)
5445 return true;
5446 return false;
5447 }
5448 default:
5449 return false;
5450 }
5451}
5452
// Returns true if the physical register Reg belongs to any of the FPR128,
// FPR64, FPR32, FPR16 or FPR8 classes. Register number 0 yields false; any
// other non-physical register trips the assertion.
// NOTE(review): the declaration line (embedded number 5453) is missing from
// this listing; the name isFpOrNEON is taken from the assertion message.
5454 if (Reg == 0)
5455 return false;
5456 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
5457 return AArch64::FPR128RegClass.contains(Reg) ||
5458 AArch64::FPR64RegClass.contains(Reg) ||
5459 AArch64::FPR32RegClass.contains(Reg) ||
5460 AArch64::FPR16RegClass.contains(Reg) ||
5461 AArch64::FPR8RegClass.contains(Reg);
5462}
5463
// Returns true if any operand of MI is a register in one of the FPR classes.
// NOTE(review): the declaration line (embedded number 5464) is missing from
// this listing, so the function name is not visible here.
5465 auto IsFPR = [&](const MachineOperand &Op) {
5466 if (!Op.isReg())
5467 return false;
5468 auto Reg = Op.getReg();
// Physical registers defer to the register-based isFpOrNEON check.
5469 if (Reg.isPhysical())
5470 return isFpOrNEON(Reg);
5471
// Otherwise the recorded register class must be exactly one of the classes
// compared below (the _lo variants appear only for FPR128 and FPR64).
5472 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5473 return TRC == &AArch64::FPR128RegClass ||
5474 TRC == &AArch64::FPR128_loRegClass ||
5475 TRC == &AArch64::FPR64RegClass ||
5476 TRC == &AArch64::FPR64_loRegClass ||
5477 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
5478 TRC == &AArch64::FPR8RegClass;
5479 };
5480 return llvm::any_of(MI.operands(), IsFPR);
5481}
5482
5483// Scale the unscaled offsets. Returns false if the unscaled offset can't be
5484// scaled.
// On success Offset is divided in place by Scale; on failure it is left
// untouched.
// NOTE(review): the line that defines Scale (embedded number 5486) is missing
// from this listing; presumably it is derived from Opc - confirm.
5485static bool scaleOffset(unsigned Opc, int64_t &Offset) {
5487
5488 // If the byte-offset isn't a multiple of the stride, we can't scale this
5489 // offset.
5490 if (Offset % Scale != 0)
5491 return false;
5492
5493 // Convert the byte-offset used by unscaled into an "element" offset used
5494 // by the scaled pair load/store instructions.
5495 Offset /= Scale;
5496 return true;
5497}
5498
5499static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
5500 if (FirstOpc == SecondOpc)
5501 return true;
5502 // We can also pair sign-ext and zero-ext instructions.
5503 switch (FirstOpc) {
5504 default:
5505 return false;
5506 case AArch64::STRSui:
5507 case AArch64::STURSi:
5508 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
5509 case AArch64::STRDui:
5510 case AArch64::STURDi:
5511 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
5512 case AArch64::STRQui:
5513 case AArch64::STURQi:
5514 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
5515 case AArch64::STRWui:
5516 case AArch64::STURWi:
5517 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
5518 case AArch64::STRXui:
5519 case AArch64::STURXi:
5520 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
5521 case AArch64::LDRSui:
5522 case AArch64::LDURSi:
5523 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
5524 case AArch64::LDRDui:
5525 case AArch64::LDURDi:
5526 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
5527 case AArch64::LDRQui:
5528 case AArch64::LDURQi:
5529 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
5530 case AArch64::LDRWui:
5531 case AArch64::LDURWi:
5532 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
5533 case AArch64::LDRSWui:
5534 case AArch64::LDURSWi:
5535 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
5536 case AArch64::LDRXui:
5537 case AArch64::LDURXi:
5538 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
5539 }
5540 // These instructions can't be paired based on their opcodes.
5541 return false;
5542}
5543
5544static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
5545 int64_t Offset1, unsigned Opcode1, int FI2,
5546 int64_t Offset2, unsigned Opcode2) {
5547 // Accesses through fixed stack object frame indices may access a different
5548 // fixed stack slot. Check that the object offsets + offsets match.
5549 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
5550 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
5551 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
5552 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
5553 // Convert to scaled object offsets.
5554 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
5555 if (ObjectOffset1 % Scale1 != 0)
5556 return false;
5557 ObjectOffset1 /= Scale1;
5558 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
5559 if (ObjectOffset2 % Scale2 != 0)
5560 return false;
5561 ObjectOffset2 /= Scale2;
5562 ObjectOffset1 += Offset1;
5563 ObjectOffset2 += Offset2;
5564 return ObjectOffset1 + 1 == ObjectOffset2;
5565 }
5566
5567 return FI1 == FI2;
5568}
5569
5570/// Detect opportunities for ldp/stp formation.
5571///
5572/// Only called for LdSt for which getMemOperandWithOffset returns true.
// Returns true only when both instructions share a base (same register, or
// frame indices accepted by shouldClusterFI), ClusterSize is at most 2, both
// are pairable candidates with compatible opcodes, the first scaled offset
// fits in [-64, 63], and the second scaled offset is exactly one past the
// first.
// NOTE(review): the declaration line (embedded number 5573) is missing from
// this listing, so the function name is not visible here.
5574 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
5575 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
5576 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
5577 unsigned NumBytes) const {
5578 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
5579 const MachineOperand &BaseOp1 = *BaseOps1.front();
5580 const MachineOperand &BaseOp2 = *BaseOps2.front();
5581 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
5582 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
5583 if (BaseOp1.getType() != BaseOp2.getType())
5584 return false;
5585
5586 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
5587 "Only base registers and frame indices are supported.");
5588
5589 // Check for both base regs and base FI.
5590 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
5591 return false;
5592
5593 // Only cluster up to a single pair.
5594 if (ClusterSize > 2)
5595 return false;
5596
5597 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
5598 return false;
5599
5600 // Can we pair these instructions based on their opcodes?
5601 unsigned FirstOpc = FirstLdSt.getOpcode();
5602 unsigned SecondOpc = SecondLdSt.getOpcode();
5603 if (!canPairLdStOpc(FirstOpc, SecondOpc))
5604 return false;
5605
5606 // Can't merge volatiles or load/stores that have a hint to avoid pair
5607 // formation, for example.
5608 if (!isCandidateToMergeOrPair(FirstLdSt) ||
5609 !isCandidateToMergeOrPair(SecondLdSt))
5610 return false;
5611
5612 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
// Offsets are read from the instructions themselves (the OpOffset1/OpOffset2
// parameters are not used in the visible code) and brought into scaled units.
5613 int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
5614 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
5615 return false;
5616
5617 int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
5618 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
5619 return false;
5620
5621 // Pairwise instructions have a 7-bit signed offset field.
5622 if (Offset1 > 63 || Offset1 < -64)
5623 return false;
5624
5625 // The caller should already have ordered First/SecondLdSt by offset.
5626 // Note: except for non-equal frame index bases
5627 if (BaseOp1.isFI()) {
5628 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
5629 "Caller should have ordered offsets.");
5630
5631 const MachineFrameInfo &MFI =
5632 FirstLdSt.getParent()->getParent()->getFrameInfo();
5633 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
5634 BaseOp2.getIndex(), Offset2, SecondOpc);
5635 }
5636
5637 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
5638
5639 return Offset1 + 1 == Offset2;
5640}
5641
// Appends a register operand to MIB and returns the result of addReg. With
// SubIdx == 0, Reg is added as-is. With a non-zero SubIdx, a physical Reg is
// first resolved to its concrete sub-register through TRI; any other Reg is
// added together with SubIdx as a sub-register index.
// NOTE(review): the first declaration line (embedded number 5642) is missing
// from this listing; callers below invoke this as AddSubReg(MIB, Reg,
// SubIdx, State, TRI).
5643 MCRegister Reg, unsigned SubIdx,
5644 RegState State,
5645 const TargetRegisterInfo *TRI) {
5646 if (!SubIdx)
5647 return MIB.addReg(Reg, State);
5648
5649 if (Reg.isPhysical())
5650 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
5651 return MIB.addReg(Reg, State, SubIdx);
5652}
5653
/// Returns true if copying a tuple of \p NumRegs consecutive registers in
/// ascending sub-register order would overwrite a source register before it
/// has been read.
///
/// \param DestReg encoding of the first destination register.
/// \param SrcReg  encoding of the first source register.
/// \param NumRegs number of registers in the tuple.
///
/// The distance from source to destination is taken modulo 32, so tuples
/// that wrap from encoding 31 back to 0 are handled. Identical source and
/// destination (distance 0) also report true for any non-empty tuple.
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
                                        unsigned NumRegs) {
  // We really want the positive remainder mod 32 here, that happens to be
  // easily obtainable with a mask.
  return ((DestReg - SrcReg) & 0x1f) < NumRegs;
}
5660
// Copies a register tuple one sub-register at a time, emitting one Opcode
// instruction per entry of Indices before I in MBB. Each emitted instruction
// defines the destination sub-register and reads the source sub-register
// twice; the second use carries the kill state derived from KillSrc.
// NOTE(review): the first declaration lines (embedded numbers 5661-5662) and
// the line that defines TRI (5668) are missing from this listing.
5663 const DebugLoc &DL, MCRegister DestReg,
5664 MCRegister SrcReg, bool KillSrc,
5665 unsigned Opcode,
5666 ArrayRef<unsigned> Indices) const {
5667 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
5669 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
5670 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
5671 unsigned NumRegs = Indices.size();
5672
// Default to ascending order; switch to descending order when an ascending
// copy would overwrite source sub-registers that are still to be read.
5673 int SubReg = 0, End = NumRegs, Incr = 1;
5674 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
5675 SubReg = NumRegs - 1;
5676 End = -1;
5677 Incr = -1;
5678 }
5679
5680 for (; SubReg != End; SubReg += Incr) {
5681 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
5682 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
5683 AddSubReg(MIB, SrcReg, Indices[SubReg], {}, TRI);
5684 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
5685 }
5686}
5687
// Copies a register tuple one sub-register at a time, always in ascending
// order. For every entry of Indices it emits one Opcode instruction before I
// in MBB with the operands: destination sub-register (def), ZeroReg, source
// sub-register (kill state from KillSrc), immediate 0.
// NOTE(review): the first declaration lines (embedded numbers 5688-5689) and
// the line 5694 (presumably the definition of TRI) are missing from this
// listing.
5690 const DebugLoc &DL, MCRegister DestReg,
5691 MCRegister SrcReg, bool KillSrc,
5692 unsigned Opcode, unsigned ZeroReg,
5693 llvm::ArrayRef<unsigned> Indices) const {
5695 unsigned NumRegs = Indices.size();
5696
// Debug-only sanity check: both tuple encodings must be multiples of the
// tuple size.
5697#ifndef NDEBUG
5698 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
5699 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
5700 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
5701 "GPR reg sequences should not be able to overlap");
5702#endif
5703
5704 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
5705 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
5706 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
5707 MIB.addReg(ZeroReg);
5708 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
5709 MIB.addImm(0);
5710 }
5711}
5712
5713/// Returns true if the instruction at I is in a streaming call site region,
5714/// within a single basic block.
5715/// A "call site streaming region" starts after smstart and ends at smstop
5716/// around a call to a streaming function. This walks backward from I.
// The nearest preceding MSRpstatesvcrImm1 / MSRpstatePseudo whose operand 0
// is SVCRSM or SVCRSMZA decides the answer. If none is found before the start
// of the block, or the function has no streaming mode changes, the result is
// false.
// NOTE(review): the declaration lines (embedded numbers 5717-5718) and the
// line that defines AFI (5720) are missing from this listing; a caller below
// invokes this as isInStreamingCallSiteRegion(MBB, I).
5719 MachineFunction &MF = *MBB.getParent();
5721 if (!AFI->hasStreamingModeChanges())
5722 return false;
5723 // Walk backwards to find smstart/smstop
5724 for (MachineInstr &MI : reverse(make_range(MBB.begin(), I))) {
5725 unsigned Opc = MI.getOpcode();
5726 if (Opc == AArch64::MSRpstatesvcrImm1 || Opc == AArch64::MSRpstatePseudo) {
5727 // Check if this is SM change (not ZA)
5728 int64_t PState = MI.getOperand(0).getImm();
5729 if (PState == AArch64SVCR::SVCRSM || PState == AArch64SVCR::SVCRSMZA) {
5730 // Operand 1 is 1 for start, 0 for stop
5731 return MI.getOperand(1).getImm() == 1;
5732 }
5733 }
5734 }
5735 return false;
5736}
5737
5738/// Returns true if in a streaming call site region without SME-FA64.
// That is: the subtarget lacks SME-FA64 and isInStreamingCallSiteRegion
// reports true for (MBB, I).
// NOTE(review): the remaining parameter lines (embedded numbers 5740-5741)
// are missing from this listing; callers below pass (Subtarget, MBB, I).
5739static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget,
5742 return !Subtarget.hasSMEFA64() && isInStreamingCallSiteRegion(MBB, I);
5743}
5744
5747 const DebugLoc &DL, Register DestReg,
5748 Register SrcReg, bool KillSrc,
5749 bool RenamableDest,
5750 bool RenamableSrc) const {
5751 ++NumCopyInstrs;
5752 if (AArch64::GPR32spRegClass.contains(DestReg) &&
5753 AArch64::GPR32spRegClass.contains(SrcReg)) {
5754 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
5755 // If either operand is WSP, expand to ADD #0.
5756 if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5757 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5758 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
5759 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5760 &AArch64::GPR64spRegClass);
5761 MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
5762 &AArch64::GPR64spRegClass);
5763 // This instruction is reading and writing X registers. This may upset
5764 // the register scavenger and machine verifier, so we need to indicate
5765 // that we are reading an undefined value from SrcRegX, but a proper
5766 // value from SrcReg.
5767 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
5768 .addReg(SrcRegX, RegState::Undef)
5769 .addImm(0)
5771 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5772 ++NumZCRegMoveInstrsGPR;
5773 } else {
5774 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
5775 .addReg(SrcReg, getKillRegState(KillSrc))
5776 .addImm(0)
5778 if (Subtarget.hasZeroCycleRegMoveGPR32())
5779 ++NumZCRegMoveInstrsGPR;
5780 }
5781 } else if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5782 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5783 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
5784 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5785 &AArch64::GPR64spRegClass);
5786 assert(DestRegX.isValid() && "Destination super-reg not valid");
5787 MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
5788 &AArch64::GPR64spRegClass);
5789 assert(SrcRegX.isValid() && "Source super-reg not valid");
5790 // This instruction is reading and writing X registers. This may upset
5791 // the register scavenger and machine verifier, so we need to indicate
5792 // that we are reading an undefined value from SrcRegX, but a proper
5793 // value from SrcReg.
5794 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
5795 .addReg(AArch64::XZR)
5796 .addReg(SrcRegX, RegState::Undef)
5797 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5798 ++NumZCRegMoveInstrsGPR;
5799 } else {
5800 // Otherwise, expand to ORR WZR.
5801 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
5802 .addReg(AArch64::WZR)
5803 .addReg(SrcReg, getKillRegState(KillSrc));
5804 if (Subtarget.hasZeroCycleRegMoveGPR32())
5805 ++NumZCRegMoveInstrsGPR;
5806 }
5807 return;
5808 }
5809
5810 // GPR32 zeroing
5811 if (AArch64::GPR32spRegClass.contains(DestReg) && SrcReg == AArch64::WZR) {
5812 if (Subtarget.hasZeroCycleZeroingGPR64() &&
5813 !Subtarget.hasZeroCycleZeroingGPR32()) {
5814 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5815 &AArch64::GPR64spRegClass);
5816 assert(DestRegX.isValid() && "Destination super-reg not valid");
5817 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestRegX)
5818 .addImm(0)
5820 ++NumZCZeroingInstrsGPR;
5821 } else if (Subtarget.hasZeroCycleZeroingGPR32()) {
5822 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
5823 .addImm(0)
5825 ++NumZCZeroingInstrsGPR;
5826 } else {
5827 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
5828 .addReg(AArch64::WZR)
5829 .addReg(AArch64::WZR);
5830 }
5831 return;
5832 }
5833
5834 if (AArch64::GPR64spRegClass.contains(DestReg) &&
5835 AArch64::GPR64spRegClass.contains(SrcReg)) {
5836 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
5837 // If either operand is SP, expand to ADD #0.
5838 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
5839 .addReg(SrcReg, getKillRegState(KillSrc))
5840 .addImm(0)
5842 if (Subtarget.hasZeroCycleRegMoveGPR64())
5843 ++NumZCRegMoveInstrsGPR;
5844 } else {
5845 // Otherwise, expand to ORR XZR.
5846 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
5847 .addReg(AArch64::XZR)
5848 .addReg(SrcReg, getKillRegState(KillSrc));
5849 if (Subtarget.hasZeroCycleRegMoveGPR64())
5850 ++NumZCRegMoveInstrsGPR;
5851 }
5852 return;
5853 }
5854
5855 // GPR64 zeroing
5856 if (AArch64::GPR64spRegClass.contains(DestReg) && SrcReg == AArch64::XZR) {
5857 if (Subtarget.hasZeroCycleZeroingGPR64()) {
5858 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
5859 .addImm(0)
5861 ++NumZCZeroingInstrsGPR;
5862 } else {
5863 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
5864 .addReg(AArch64::XZR)
5865 .addReg(AArch64::XZR);
5866 }
5867 return;
5868 }
5869
5870 // Copy a Predicate register by ORRing with itself.
5871 if (AArch64::PPRRegClass.contains(DestReg) &&
5872 AArch64::PPRRegClass.contains(SrcReg)) {
5873 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5874 "Unexpected SVE register.");
5875 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
5876 .addReg(SrcReg) // Pg
5877 .addReg(SrcReg)
5878 .addReg(SrcReg, getKillRegState(KillSrc));
5879 return;
5880 }
5881
5882 // Copy a predicate-as-counter register by ORRing with itself as if it
5883 // were a regular predicate (mask) register.
5884 bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
5885 bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
5886 if (DestIsPNR || SrcIsPNR) {
5887 auto ToPPR = [](MCRegister R) -> MCRegister {
5888 return (R - AArch64::PN0) + AArch64::P0;
5889 };
5890 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg.asMCReg();
5891 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg.asMCReg();
5892
5893 if (PPRSrcReg != PPRDestReg) {
5894 auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
5895 .addReg(PPRSrcReg) // Pg
5896 .addReg(PPRSrcReg)
5897 .addReg(PPRSrcReg, getKillRegState(KillSrc));
5898 if (DestIsPNR)
5899 NewMI.addDef(DestReg, RegState::Implicit);
5900 }
5901 return;
5902 }
5903
5904 // Copy a Z register by ORRing with itself.
5905 if (AArch64::ZPRRegClass.contains(DestReg) &&
5906 AArch64::ZPRRegClass.contains(SrcReg)) {
5907 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5908 "Unexpected SVE register.");
5909 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
5910 .addReg(SrcReg)
5911 .addReg(SrcReg, getKillRegState(KillSrc));
5912 return;
5913 }
5914
5915 // Copy a Z register pair by copying the individual sub-registers.
5916 if ((AArch64::ZPR2RegClass.contains(DestReg) ||
5917 AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
5918 (AArch64::ZPR2RegClass.contains(SrcReg) ||
5919 AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
5920 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5921 "Unexpected SVE register.");
5922 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
5923 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5924 Indices);
5925 return;
5926 }
5927
5928 // Copy a Z register triple by copying the individual sub-registers.
5929 if (AArch64::ZPR3RegClass.contains(DestReg) &&
5930 AArch64::ZPR3RegClass.contains(SrcReg)) {
5931 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5932 "Unexpected SVE register.");
5933 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5934 AArch64::zsub2};
5935 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5936 Indices);
5937 return;
5938 }
5939
5940 // Copy a Z register quad by copying the individual sub-registers.
5941 if ((AArch64::ZPR4RegClass.contains(DestReg) ||
5942 AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
5943 (AArch64::ZPR4RegClass.contains(SrcReg) ||
5944 AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
5945 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5946 "Unexpected SVE register.");
5947 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5948 AArch64::zsub2, AArch64::zsub3};
5949 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5950 Indices);
5951 return;
5952 }
5953
5954 // Copy a DDDD register quad by copying the individual sub-registers.
5955 if (AArch64::DDDDRegClass.contains(DestReg) &&
5956 AArch64::DDDDRegClass.contains(SrcReg)) {
5957 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5958 AArch64::dsub2, AArch64::dsub3};
5959 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5960 Indices);
5961 return;
5962 }
5963
5964 // Copy a DDD register triple by copying the individual sub-registers.
5965 if (AArch64::DDDRegClass.contains(DestReg) &&
5966 AArch64::DDDRegClass.contains(SrcReg)) {
5967 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5968 AArch64::dsub2};
5969 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5970 Indices);
5971 return;
5972 }
5973
5974 // Copy a DD register pair by copying the individual sub-registers.
5975 if (AArch64::DDRegClass.contains(DestReg) &&
5976 AArch64::DDRegClass.contains(SrcReg)) {
5977 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
5978 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5979 Indices);
5980 return;
5981 }
5982
5983 // Copy a QQQQ register quad by copying the individual sub-registers.
5984 if (AArch64::QQQQRegClass.contains(DestReg) &&
5985 AArch64::QQQQRegClass.contains(SrcReg)) {
5986 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5987 AArch64::qsub2, AArch64::qsub3};
5988 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5989 Indices);
5990 return;
5991 }
5992
5993 // Copy a QQQ register triple by copying the individual sub-registers.
5994 if (AArch64::QQQRegClass.contains(DestReg) &&
5995 AArch64::QQQRegClass.contains(SrcReg)) {
5996 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5997 AArch64::qsub2};
5998 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5999 Indices);
6000 return;
6001 }
6002
6003 // Copy a QQ register pair by copying the individual sub-registers.
6004 if (AArch64::QQRegClass.contains(DestReg) &&
6005 AArch64::QQRegClass.contains(SrcReg)) {
6006 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
6007 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
6008 Indices);
6009 return;
6010 }
6011
6012 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
6013 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
6014 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
6015 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
6016 AArch64::XZR, Indices);
6017 return;
6018 }
6019
6020 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
6021 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
6022 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
6023 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
6024 AArch64::WZR, Indices);
6025 return;
6026 }
6027
6028 if (AArch64::FPR128RegClass.contains(DestReg) &&
6029 AArch64::FPR128RegClass.contains(SrcReg)) {
6030 // In streaming regions, NEON is illegal but streaming-SVE is available.
6031 // Use SVE for copies if we're in a streaming region and SME is available.
6032 // With +sme-fa64, NEON is legal in streaming mode so we can use it.
6033 if ((Subtarget.isSVEorStreamingSVEAvailable() &&
6034 !Subtarget.isNeonAvailable()) ||
6035 mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
6036 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
6037 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
6038 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
6039 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
6040 } else if (Subtarget.isNeonAvailable()) {
6041 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
6042 .addReg(SrcReg)
6043 .addReg(SrcReg, getKillRegState(KillSrc));
6044 if (Subtarget.hasZeroCycleRegMoveFPR128())
6045 ++NumZCRegMoveInstrsFPR;
6046 } else {
6047 BuildMI(MBB, I, DL, get(AArch64::STRQpre))
6048 .addReg(AArch64::SP, RegState::Define)
6049 .addReg(SrcReg, getKillRegState(KillSrc))
6050 .addReg(AArch64::SP)
6051 .addImm(-16);
6052 BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
6053 .addReg(AArch64::SP, RegState::Define)
6054 .addReg(DestReg, RegState::Define)
6055 .addReg(AArch64::SP)
6056 .addImm(16);
6057 }
6058 return;
6059 }
6060
6061 if (AArch64::FPR64RegClass.contains(DestReg) &&
6062 AArch64::FPR64RegClass.contains(SrcReg)) {
6063 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
6064 !Subtarget.hasZeroCycleRegMoveFPR64() &&
6065 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
6066 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
6067 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
6068 &AArch64::FPR128RegClass);
6069 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
6070 &AArch64::FPR128RegClass);
6071 // This instruction is reading and writing Q registers. This may upset
6072 // the register scavenger and machine verifier, so we need to indicate
6073 // that we are reading an undefined value from SrcRegQ, but a proper
6074 // value from SrcReg.
6075 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
6076 .addReg(SrcRegQ, RegState::Undef)
6077 .addReg(SrcRegQ, RegState::Undef)
6078 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6079 ++NumZCRegMoveInstrsFPR;
6080 } else {
6081 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
6082 .addReg(SrcReg, getKillRegState(KillSrc));
6083 if (Subtarget.hasZeroCycleRegMoveFPR64())
6084 ++NumZCRegMoveInstrsFPR;
6085 }
6086 return;
6087 }
6088
6089 if (AArch64::FPR32RegClass.contains(DestReg) &&
6090 AArch64::FPR32RegClass.contains(SrcReg)) {
6091 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
6092 !Subtarget.hasZeroCycleRegMoveFPR64() &&
6093 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
6094 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
6095 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
6096 &AArch64::FPR128RegClass);
6097 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
6098 &AArch64::FPR128RegClass);
6099 // This instruction is reading and writing Q registers. This may upset
6100 // the register scavenger and machine verifier, so we need to indicate
6101 // that we are reading an undefined value from SrcRegQ, but a proper
6102 // value from SrcReg.
6103 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
6104 .addReg(SrcRegQ, RegState::Undef)
6105 .addReg(SrcRegQ, RegState::Undef)
6106 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6107 ++NumZCRegMoveInstrsFPR;
6108 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
6109 !Subtarget.hasZeroCycleRegMoveFPR32()) {
6110 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
6111 &AArch64::FPR64RegClass);
6112 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
6113 &AArch64::FPR64RegClass);
6114 // This instruction is reading and writing D registers. This may upset
6115 // the register scavenger and machine verifier, so we need to indicate
6116 // that we are reading an undefined value from SrcRegD, but a proper
6117 // value from SrcReg.
6118 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
6119 .addReg(SrcRegD, RegState::Undef)
6120 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6121 ++NumZCRegMoveInstrsFPR;
6122 } else {
6123 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
6124 .addReg(SrcReg, getKillRegState(KillSrc));
6125 if (Subtarget.hasZeroCycleRegMoveFPR32())
6126 ++NumZCRegMoveInstrsFPR;
6127 }
6128 return;
6129 }
6130
6131 if (AArch64::FPR16RegClass.contains(DestReg) &&
6132 AArch64::FPR16RegClass.contains(SrcReg)) {
6133 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
6134 !Subtarget.hasZeroCycleRegMoveFPR64() &&
6135 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
6136 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
6137 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
6138 &AArch64::FPR128RegClass);
6139 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
6140 &AArch64::FPR128RegClass);
6141 // This instruction is reading and writing Q registers. This may upset
6142 // the register scavenger and machine verifier, so we need to indicate
6143 // that we are reading an undefined value from SrcRegQ, but a proper
6144 // value from SrcReg.
6145 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
6146 .addReg(SrcRegQ, RegState::Undef)
6147 .addReg(SrcRegQ, RegState::Undef)
6148 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6149 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
6150 !Subtarget.hasZeroCycleRegMoveFPR32()) {
6151 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
6152 &AArch64::FPR64RegClass);
6153 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
6154 &AArch64::FPR64RegClass);
6155 // This instruction is reading and writing D registers. This may upset
6156 // the register scavenger and machine verifier, so we need to indicate
6157 // that we are reading an undefined value from SrcRegD, but a proper
6158 // value from SrcReg.
6159 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
6160 .addReg(SrcRegD, RegState::Undef)
6161 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6162 } else {
6163 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
6164 &AArch64::FPR32RegClass);
6165 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
6166 &AArch64::FPR32RegClass);
6167 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
6168 .addReg(SrcReg, getKillRegState(KillSrc));
6169 }
6170 return;
6171 }
6172
6173 if (AArch64::FPR8RegClass.contains(DestReg) &&
6174 AArch64::FPR8RegClass.contains(SrcReg)) {
6175 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
6176 !Subtarget.hasZeroCycleRegMoveFPR64() &&
6177 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
6178 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
6179 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
6180 &AArch64::FPR128RegClass);
6181 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
6182 &AArch64::FPR128RegClass);
6183 // This instruction is reading and writing Q registers. This may upset
6184 // the register scavenger and machine verifier, so we need to indicate
6185 // that we are reading an undefined value from SrcRegQ, but a proper
6186 // value from SrcReg.
6187 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
6188 .addReg(SrcRegQ, RegState::Undef)
6189 .addReg(SrcRegQ, RegState::Undef)
6190 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6191 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
6192 !Subtarget.hasZeroCycleRegMoveFPR32()) {
6193 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
6194 &AArch64::FPR64RegClass);
6195 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
6196 &AArch64::FPR64RegClass);
6197 // This instruction is reading and writing D registers. This may upset
6198 // the register scavenger and machine verifier, so we need to indicate
6199 // that we are reading an undefined value from SrcRegD, but a proper
6200 // value from SrcReg.
6201 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
6202 .addReg(SrcRegD, RegState::Undef)
6203 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6204 } else {
6205 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
6206 &AArch64::FPR32RegClass);
6207 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
6208 &AArch64::FPR32RegClass);
6209 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
6210 .addReg(SrcReg, getKillRegState(KillSrc));
6211 }
6212 return;
6213 }
6214
6215 // Copies between GPR64 and FPR64.
6216 if (AArch64::FPR64RegClass.contains(DestReg) &&
6217 AArch64::GPR64RegClass.contains(SrcReg)) {
6218 if (AArch64::XZR == SrcReg) {
6219 BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg);
6220 } else {
6221 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
6222 .addReg(SrcReg, getKillRegState(KillSrc));
6223 }
6224 return;
6225 }
6226 if (AArch64::GPR64RegClass.contains(DestReg) &&
6227 AArch64::FPR64RegClass.contains(SrcReg)) {
6228 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
6229 .addReg(SrcReg, getKillRegState(KillSrc));
6230 return;
6231 }
6232 // Copies between GPR32 and FPR32.
6233 if (AArch64::FPR32RegClass.contains(DestReg) &&
6234 AArch64::GPR32RegClass.contains(SrcReg)) {
6235 if (AArch64::WZR == SrcReg) {
6236 BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg);
6237 } else {
6238 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
6239 .addReg(SrcReg, getKillRegState(KillSrc));
6240 }
6241 return;
6242 }
6243 if (AArch64::GPR32RegClass.contains(DestReg) &&
6244 AArch64::FPR32RegClass.contains(SrcReg)) {
6245 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
6246 .addReg(SrcReg, getKillRegState(KillSrc));
6247 return;
6248 }
6249
6250 if (DestReg == AArch64::NZCV) {
6251 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
6252 BuildMI(MBB, I, DL, get(AArch64::MSR))
6253 .addImm(AArch64SysReg::NZCV)
6254 .addReg(SrcReg, getKillRegState(KillSrc))
6255 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
6256 return;
6257 }
6258
6259 if (SrcReg == AArch64::NZCV) {
6260 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
6261 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
6262 .addImm(AArch64SysReg::NZCV)
6263 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
6264 return;
6265 }
6266
6267#ifndef NDEBUG
6268 errs() << RI.getRegAsmName(DestReg) << " = COPY " << RI.getRegAsmName(SrcReg)
6269 << "\n";
6270#endif
6271 llvm_unreachable("unimplemented reg-to-reg copy");
6272}
6273
// NOTE(review): the opening of this signature (listing lines 6274-6275) is
// absent from this extract; only the trailing parameters are visible.
//
// Emits one instruction described by MCID whose operands are, in order: two
// source registers derived from SrcReg, the frame index FI, an immediate 0 and
// the memory operand MMO. Both register operands get the kill state computed
// from IsKill.
6276 MachineBasicBlock::iterator InsertBefore,
6277 const MCInstrDesc &MCID,
6278 Register SrcReg, bool IsKill,
6279 unsigned SubIdx0, unsigned SubIdx1, int FI,
6280 MachineMemOperand *MMO) {
6281 Register SrcReg0 = SrcReg;
6282 Register SrcReg1 = SrcReg;
// A physical register is resolved to its two concrete sub-registers here, so
// the sub-register indices are zeroed before being handed to addReg. For any
// other register, SrcReg itself is used twice together with SubIdx0/SubIdx1.
6283 if (SrcReg.isPhysical()) {
6284 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
6285 SubIdx0 = 0;
6286 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
6287 SubIdx1 = 0;
6288 }
// The instruction is built with a default-constructed DebugLoc.
6289 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
6290 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
6291 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
6292 .addFrameIndex(FI)
6293 .addImm(0)
6294 .addMemOperand(MMO);
6295}
6296
// NOTE(review): several listing lines are absent from this extract: the start
// of the signature, part of the MMO initializer, the declarations of PNRReg
// and MI, the call heads for the register-pair cases, and one line in each
// PNR/PPR/ZPR branch. The comments below describe only the visible code.
//
// Chooses a store opcode from the spill size of RC and the register class it
// belongs to, records StackID on frame index FI, and emits the store of SrcReg
// to FI.
6299 Register SrcReg, bool isKill, int FI,
6300 const TargetRegisterClass *RC,
6301 Register VReg,
6302 MachineInstr::MIFlag Flags) const {
6303 MachineFunction &MF = *MBB.getParent();
6304 MachineFrameInfo &MFI = MF.getFrameInfo();
6305
6307 MachineMemOperand *MMO =
6309 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
// Opc stays 0 until a case below recognises RC; that is asserted after the
// switch.
6310 unsigned Opc = 0;
// Whether an immediate 0 operand is appended after the frame index. The ST1*
// opcodes selected below clear it.
6311 bool Offset = true;
6313 unsigned StackID = TargetStackID::Default;
6314 switch (RI.getSpillSize(*RC)) {
6315 case 1:
6316 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6317 Opc = AArch64::STRBui;
6318 break;
6319 case 2: {
6320 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6321 Opc = AArch64::STRHui;
6322 else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
6323 AArch64::PPRRegClass.hasSubClassEq(RC)) {
6324 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6325 "Unexpected register store without SVE store instructions");
6326 Opc = AArch64::STR_PXI;
6328 }
6329 break;
6330 }
6331 case 4:
6332 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6333 Opc = AArch64::STRWui;
// A virtual source is constrained to GPR32; a physical source must not be WSP.
6334 if (SrcReg.isVirtual())
6335 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
6336 else
6337 assert(SrcReg != AArch64::WSP);
6338 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6339 Opc = AArch64::STRSui;
6340 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6341 Opc = AArch64::STR_PPXI;
6343 }
6344 break;
6345 case 8:
6346 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6347 Opc = AArch64::STRXui;
// A virtual source is constrained to GPR64; a physical source must not be SP.
6348 if (SrcReg.isVirtual())
6349 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
6350 else
6351 assert(SrcReg != AArch64::SP);
6352 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6353 Opc = AArch64::STRDui;
6354 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
// W register pairs are handed to a separate call (its head is not visible in
// this extract) with the STPWi descriptor and the sube32/subo32 indices; this
// function then returns without reaching the common emission code.
6356 get(AArch64::STPWi), SrcReg, isKill,
6357 AArch64::sube32, AArch64::subo32, FI, MMO);
6358 return;
6359 }
6360 break;
6361 case 16:
6362 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6363 Opc = AArch64::STRQui;
6364 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6365 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6366 Opc = AArch64::ST1Twov1d;
6367 Offset = false;
6368 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
// X register pairs take the same early-return route, with STPXi and the
// sube64/subo64 indices.
6370 get(AArch64::STPXi), SrcReg, isKill,
6371 AArch64::sube64, AArch64::subo64, FI, MMO);
6372 return;
6373 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6374 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6375 "Unexpected register store without SVE store instructions");
6376 Opc = AArch64::STR_ZXI;
6378 }
6379 break;
6380 case 24:
6381 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6382 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6383 Opc = AArch64::ST1Threev1d;
6384 Offset = false;
6385 }
6386 break;
6387 case 32:
6388 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6389 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6390 Opc = AArch64::ST1Fourv1d;
6391 Offset = false;
6392 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6393 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6394 Opc = AArch64::ST1Twov2d;
6395 Offset = false;
6396 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6397 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6398 "Unexpected register store without SVE store instructions");
6399 Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
6401 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6402 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6403 "Unexpected register store without SVE store instructions");
6404 Opc = AArch64::STR_ZZXI;
6406 }
6407 break;
6408 case 48:
6409 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6410 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6411 Opc = AArch64::ST1Threev2d;
6412 Offset = false;
6413 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6414 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6415 "Unexpected register store without SVE store instructions");
6416 Opc = AArch64::STR_ZZZXI;
6418 }
6419 break;
6420 case 64:
6421 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6422 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6423 Opc = AArch64::ST1Fourv2d;
6424 Offset = false;
6425 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6426 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6427 "Unexpected register store without SVE store instructions");
6428 Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
6430 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6431 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6432 "Unexpected register store without SVE store instructions");
6433 Opc = AArch64::STR_ZZZZXI;
6435 }
6436 break;
6437 }
// Reaching here with Opc == 0 means no case above matched RC.
6438 assert(Opc && "Unknown register class");
6439 MFI.setStackID(FI, StackID);
6440
6442 .addReg(SrcReg, getKillRegState(isKill))
6443 .addFrameIndex(FI);
6444
6445 if (Offset)
6446 MI.addImm(0);
// An implicit def of PNRReg is attached only when PNRReg holds a valid
// register.
6447 if (PNRReg.isValid())
6448 MI.addDef(PNRReg, RegState::Implicit);
6449 MI.addMemOperand(MMO);
6450}
6451
// NOTE(review): the opening of this signature (listing lines 6452-6453) is
// absent from this extract; only the trailing parameters are visible.
//
// Emits one instruction described by MCID that defines two registers derived
// from DestReg and takes the frame index FI, an immediate 0 and the memory
// operand MMO.
6454 MachineBasicBlock::iterator InsertBefore,
6455 const MCInstrDesc &MCID,
6456 Register DestReg, unsigned SubIdx0,
6457 unsigned SubIdx1, int FI,
6458 MachineMemOperand *MMO) {
6459 Register DestReg0 = DestReg;
6460 Register DestReg1 = DestReg;
// IsUndef remains true only when DestReg is not physical; both defining
// operands then also carry the undef state.
6461 bool IsUndef = true;
// A physical register is resolved to its two concrete sub-registers, so the
// sub-register indices are zeroed before being handed to addReg.
6462 if (DestReg.isPhysical()) {
6463 DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
6464 SubIdx0 = 0;
6465 DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
6466 SubIdx1 = 0;
6467 IsUndef = false;
6468 }
// The instruction is built with a default-constructed DebugLoc.
6469 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
6470 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
6471 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
6472 .addFrameIndex(FI)
6473 .addImm(0)
6474 .addMemOperand(MMO);
6475}
6476
// NOTE(review): several listing lines are absent from this extract: the start
// of the signature, part of the MMO initializer, the declarations of PNRReg
// and MI, the call heads for the register-pair cases, and one line in each
// PNR/PPR/ZPR branch. The comments below describe only the visible code.
//
// Chooses a load opcode from the spill size of RC and the register class it
// belongs to, records StackID on frame index FI, and emits the load of DestReg
// from FI.
6479 Register DestReg, int FI,
6480 const TargetRegisterClass *RC,
6481 Register VReg, unsigned SubReg,
6482 MachineInstr::MIFlag Flags) const {
6483 MachineFunction &MF = *MBB.getParent();
6484 MachineFrameInfo &MFI = MF.getFrameInfo();
6486 MachineMemOperand *MMO =
6488 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
6489
// Opc stays 0 until a case below recognises RC; that is asserted after the
// switch.
6490 unsigned Opc = 0;
// Whether an immediate 0 operand is appended after the frame index. The LD1*
// opcodes selected below clear it.
6491 bool Offset = true;
6492 unsigned StackID = TargetStackID::Default;
6494 switch (TRI.getSpillSize(*RC)) {
6495 case 1:
6496 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6497 Opc = AArch64::LDRBui;
6498 break;
6499 case 2: {
6500 bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
6501 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6502 Opc = AArch64::LDRHui;
6503 else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
6504 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6505 "Unexpected register load without SVE load instructions");
// Remember the destination for PNR classes; it is used after the switch to
// decide whether an implicit def is added.
6506 if (IsPNR)
6507 PNRReg = DestReg;
6508 Opc = AArch64::LDR_PXI;
6510 }
6511 break;
6512 }
6513 case 4:
6514 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6515 Opc = AArch64::LDRWui;
// A virtual destination is constrained to GPR32; a physical destination must
// not be WSP.
6516 if (DestReg.isVirtual())
6517 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
6518 else
6519 assert(DestReg != AArch64::WSP);
6520 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6521 Opc = AArch64::LDRSui;
6522 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6523 Opc = AArch64::LDR_PPXI;
6525 }
6526 break;
6527 case 8:
6528 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6529 Opc = AArch64::LDRXui;
// A virtual destination is constrained to GPR64; a physical destination must
// not be SP.
6530 if (DestReg.isVirtual())
6531 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
6532 else
6533 assert(DestReg != AArch64::SP);
6534 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6535 Opc = AArch64::LDRDui;
6536 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
// W register pairs are handed to a separate call (its head is not visible in
// this extract) with the LDPWi descriptor and the sube32/subo32 indices; this
// function then returns without reaching the common emission code.
6538 get(AArch64::LDPWi), DestReg, AArch64::sube32,
6539 AArch64::subo32, FI, MMO);
6540 return;
6541 }
6542 break;
6543 case 16:
6544 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6545 Opc = AArch64::LDRQui;
6546 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6547 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6548 Opc = AArch64::LD1Twov1d;
6549 Offset = false;
6550 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
// X register pairs take the same early-return route, with LDPXi and the
// sube64/subo64 indices.
6552 get(AArch64::LDPXi), DestReg, AArch64::sube64,
6553 AArch64::subo64, FI, MMO);
6554 return;
6555 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6556 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6557 "Unexpected register load without SVE load instructions");
6558 Opc = AArch64::LDR_ZXI;
6560 }
6561 break;
6562 case 24:
6563 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6564 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6565 Opc = AArch64::LD1Threev1d;
6566 Offset = false;
6567 }
6568 break;
6569 case 32:
6570 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6571 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6572 Opc = AArch64::LD1Fourv1d;
6573 Offset = false;
6574 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6575 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6576 Opc = AArch64::LD1Twov2d;
6577 Offset = false;
6578 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6579 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6580 "Unexpected register load without SVE load instructions");
6581 Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
6583 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6584 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6585 "Unexpected register load without SVE load instructions");
6586 Opc = AArch64::LDR_ZZXI;
6588 }
6589 break;
6590 case 48:
6591 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6592 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6593 Opc = AArch64::LD1Threev2d;
6594 Offset = false;
6595 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6596 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6597 "Unexpected register load without SVE load instructions");
6598 Opc = AArch64::LDR_ZZZXI;
6600 }
6601 break;
6602 case 64:
6603 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6604 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6605 Opc = AArch64::LD1Fourv2d;
6606 Offset = false;
6607 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6608 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6609 "Unexpected register load without SVE load instructions");
6610 Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
6612 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6613 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6614 "Unexpected register load without SVE load instructions");
6615 Opc = AArch64::LDR_ZZZZXI;
6617 }
6618 break;
6619 }
6620
// Reaching here with Opc == 0 means no case above matched RC.
6621 assert(Opc && "Unknown register class");
6622 MFI.setStackID(FI, StackID);
6623
6625 .addReg(DestReg, getDefRegState(true))
6626 .addFrameIndex(FI);
6627 if (Offset)
6628 MI.addImm(0);
// The implicit def is added only when PNRReg is valid and not a virtual
// register.
6629 if (PNRReg.isValid() && !PNRReg.isVirtual())
6630 MI.addDef(PNRReg, RegState::Implicit);
6631 MI.addMemOperand(MMO);
6632}
6633
// NOTE(review): the first line of this signature (listing line 6634) is absent
// from this extract, so the function name and the DefMI parameter declaration
// are not visible here.
//
// Returns true if any instruction in the range that starts just after DefMI
// and is bounded by UseMI either modifies or reads NZCV. The range is obtained
// through instructionsWithoutDebug, which presumably skips debug instructions
// and treats UseMI as an exclusive end -- confirm against its declaration.
6635 const MachineInstr &UseMI,
6636 const TargetRegisterInfo *TRI) {
6637 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
6638 UseMI.getIterator()),
6639 [TRI](const MachineInstr &I) {
6640 return I.modifiesRegister(AArch64::NZCV, TRI) ||
6641 I.readsRegister(AArch64::NZCV, TRI);
6642 });
6643}
6644
6645void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6646 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
6647 // The smallest scalable element supported by scaled SVE addressing
6648 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6649 // byte offset must always be a multiple of 2.
6650 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6651
6652 // VGSized offsets are divided by '2', because the VG register is the
6653 // the number of 64bit granules as opposed to 128bit vector chunks,
6654 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
6655 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
6656 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
6657 ByteSized = Offset.getFixed();
6658 VGSized = Offset.getScalable() / 2;
6659}
6660
6661/// Returns the offset in parts to which this frame offset can be
6662/// decomposed for the purpose of describing a frame offset.
6663/// For non-scalable offsets this is simply its byte size.
6664void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6665 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
6666 int64_t &NumDataVectors) {
6667 // The smallest scalable element supported by scaled SVE addressing
6668 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6669 // byte offset must always be a multiple of 2.
6670 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6671
6672 NumBytes = Offset.getFixed();
6673 NumDataVectors = 0;
6674 NumPredicateVectors = Offset.getScalable() / 2;
6675 // This method is used to get the offsets to adjust the frame offset.
6676 // If the function requires ADDPL to be used and needs more than two ADDPL
6677 // instructions, part of the offset is folded into NumDataVectors so that it
6678 // uses ADDVL for part of it, reducing the number of ADDPL instructions.
6679 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
6680 NumPredicateVectors > 62) {
6681 NumDataVectors = NumPredicateVectors / 8;
6682 NumPredicateVectors -= NumDataVectors * 8;
6683 }
6684}
6685
6686// Convenience function to create a DWARF expression for: Constant `Operation`.
6687// This helper emits compact sequences for common cases. For example, for `-15
6688// DW_OP_plus`, this helper would create DW_OP_lit15 DW_OP_minus.
// NOTE(review): the signature (listing lines 6689-6690) and the statement that
// follows DW_OP_consts (listing line 6701) are absent from this extract.
//
// Adding a small negative constant is rewritten as subtracting its magnitude
// so that the single-byte DW_OP_lit<N> form can be used.
6691 if (Operation == dwarf::DW_OP_plus && Constant < 0 && -Constant <= 31) {
6692 // -Constant (1 to 31)
6693 Expr.push_back(dwarf::DW_OP_lit0 - Constant);
6694 Operation = dwarf::DW_OP_minus;
6695 } else if (Constant >= 0 && Constant <= 31) {
6696 // Literal value 0 to 31
6697 Expr.push_back(dwarf::DW_OP_lit0 + Constant);
6698 } else {
6699 // Signed constant
6700 Expr.push_back(dwarf::DW_OP_consts);
6702 }
// The (possibly rewritten) operation always comes last.
6703 return Expr.push_back(Operation);
6704}
6705
6706// Convenience function to create a DWARF expression for a register.
// Appends DW_OP_bregx to Expr, followed by a trailing 0 byte.
// NOTE(review): listing line 6709, between the two push_back calls, is absent
// from this extract; presumably it appends RegNum as the DW_OP_bregx register
// operand, making the trailing 0 the offset operand -- confirm against the
// full source.
6707static void appendReadRegExpr(SmallVectorImpl<char> &Expr, unsigned RegNum) {
6708 Expr.push_back((char)dwarf::DW_OP_bregx);
6710 Expr.push_back(0);
6711}
6712
6713// Convenience function to create a DWARF expression for loading a register from
6714// a CFA offset.
// NOTE(review): the first line of the signature (listing line 6715) is absent
// from this extract.
// Appends, in order: DW_OP_dup, the "+ OffsetFromDefCFA" sequence produced by
// appendConstantExpr, and DW_OP_deref.
6716 int64_t OffsetFromDefCFA) {
6717 // This assumes the top of the DWARF stack contains the CFA.
6718 Expr.push_back(dwarf::DW_OP_dup);
6719 // Add the offset to the register.
6720 appendConstantExpr(Expr, OffsetFromDefCFA, dwarf::DW_OP_plus);
6721 // Dereference the address (loads a 64 bit value).
6722 Expr.push_back(dwarf::DW_OP_deref);
6723}
6724
6725// Convenience function to create a comment for
6726// (+/-) NumBytes (* RegScale)?
6727static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment,
6728 StringRef RegScale = {}) {
6729 if (NumBytes) {
6730 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
6731 if (!RegScale.empty())
6732 Comment << ' ' << RegScale;
6733 }
6734}
6735
6736// Creates an MCCFIInstruction:
6737// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
// NOTE(review): the first line of the signature (listing line 6738) is absent
// from this extract.
// The expression evaluates Reg + fixed bytes, plus VG * scaled bytes when the
// scalable part of Offset is non-zero. A human-readable form of the same sum
// is attached to the instruction as its comment.
6739 unsigned Reg,
6740 const StackOffset &Offset) {
6741 int64_t NumBytes, NumVGScaledBytes;
6742 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
6743 NumVGScaledBytes);
6744 std::string CommentBuffer;
6745 llvm::raw_string_ostream Comment(CommentBuffer);
6746
// SP and FP get short fixed spellings in the comment; any other register is
// printed through printReg.
6747 if (Reg == AArch64::SP)
6748 Comment << "sp";
6749 else if (Reg == AArch64::FP)
6750 Comment << "fp";
6751 else
6752 Comment << printReg(Reg, &TRI);
6753
6754 // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes)
6755 SmallString<64> Expr;
6756 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
// DW_OP_breg0 + DwarfReg below selects one of the per-register DW_OP_breg<N>
// opcodes, which is why the register number is bounded to 0..31.
6757 assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
6758 // Reg + NumBytes
6759 Expr.push_back(dwarf::DW_OP_breg0 + DwarfReg);
6760 appendLEB128<LEB128Sign::Signed>(Expr, NumBytes);
6761 appendOffsetComment(NumBytes, Comment);
6762 if (NumVGScaledBytes) {
6763 // + VG * NumVGScaledBytes
6764 appendOffsetComment(NumVGScaledBytes, Comment, "* VG");
6765 appendReadRegExpr(Expr, TRI.getDwarfRegNum(AArch64::VG, true));
6766 appendConstantExpr(Expr, NumVGScaledBytes, dwarf::DW_OP_mul);
6767 Expr.push_back(dwarf::DW_OP_plus);
6768 }
6769
6770 // Wrap this into DW_CFA_def_cfa_expression.
6771 SmallString<64> DefCfaExpr;
6772 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
6773 appendLEB128<LEB128Sign::Unsigned>(DefCfaExpr, Expr.size());
6774 DefCfaExpr.append(Expr.str());
6775 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
6776 Comment.str());
6777}
6778
// NOTE(review): the first line of the signature (listing line 6779) is absent
// from this extract.
//
// Picks the form of the CFA definition for Reg + Offset:
//  - any scalable component: a DWARF expression via createDefCFAExpression;
//  - Reg equal to FrameReg and the last adjustment not scalable: an
//    offset-only cfiDefCfaOffset;
//  - otherwise: cfiDefCfa naming both the DWARF register and the offset.
6780 unsigned FrameReg, unsigned Reg,
6781 const StackOffset &Offset,
6782 bool LastAdjustmentWasScalable) {
6783 if (Offset.getScalable())
6784 return createDefCFAExpression(TRI, Reg, Offset);
6785
// The fixed offset is narrowed to int in both returns below.
6786 if (FrameReg == Reg && !LastAdjustmentWasScalable)
6787 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
6788
6789 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6790 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
6791}
6792
// NOTE(review): the start of the signature (listing lines 6793-6794) is absent
// from this extract.
//
// Builds the CFI instruction describing where Reg is located relative to the
// CFA. With no scalable component this is a plain createOffset. Otherwise it
// is a DW_CFA_expression escape computing
// CFA + VG * NumVGScaledBytes + NumBytes, where the VG value is either read
// from the VG register or, when IncomingVGOffsetFromDefCFA is set, loaded from
// that offset from the CFA.
6795 const StackOffset &OffsetFromDefCFA,
6796 std::optional<int64_t> IncomingVGOffsetFromDefCFA) {
6797 int64_t NumBytes, NumVGScaledBytes;
6798 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6799 OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
6800
6801 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6802
6803 // Non-scalable offsets can use DW_CFA_offset directly.
6804 if (!NumVGScaledBytes)
6805 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
6806
6807 std::string CommentBuffer;
6808 llvm::raw_string_ostream Comment(CommentBuffer);
6809 Comment << printReg(Reg, &TRI) << " @ cfa";
6810
6811 // Build up expression (CFA + VG * NumVGScaledBytes + NumBytes)
6812 assert(NumVGScaledBytes && "Expected scalable offset");
6813 SmallString<64> OffsetExpr;
6814 // + VG * NumVGScaledBytes
6815 StringRef VGRegScale;
// Choose the source of the VG value and the matching label for the comment.
6816 if (IncomingVGOffsetFromDefCFA) {
6817 appendLoadRegExpr(OffsetExpr, *IncomingVGOffsetFromDefCFA);
6818 VGRegScale = "* IncomingVG";
6819 } else {
6820 appendReadRegExpr(OffsetExpr, TRI.getDwarfRegNum(AArch64::VG, true));
6821 VGRegScale = "* VG";
6822 }
6823 appendConstantExpr(OffsetExpr, NumVGScaledBytes, dwarf::DW_OP_mul);
6824 appendOffsetComment(NumVGScaledBytes, Comment, VGRegScale);
6825 OffsetExpr.push_back(dwarf::DW_OP_plus);
6826 if (NumBytes) {
6827 // + NumBytes
6828 appendOffsetComment(NumBytes, Comment);
6829 appendConstantExpr(OffsetExpr, NumBytes, dwarf::DW_OP_plus);
6830 }
6831
6832 // Wrap this into DW_CFA_expression
// Layout: opcode, ULEB128 register number, ULEB128 expression length, then the
// expression bytes.
6833 SmallString<64> CfaExpr;
6834 CfaExpr.push_back(dwarf::DW_CFA_expression);
6835 appendLEB128<LEB128Sign::Unsigned>(CfaExpr, DwarfReg);
6836 appendLEB128<LEB128Sign::Unsigned>(CfaExpr, OffsetExpr.size());
6837 CfaExpr.append(OffsetExpr.str());
6838
6839 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
6840 Comment.str());
6841}
6842
6843// Helper function to emit a frame offset adjustment from a given
6844// pointer (SrcReg), stored into DestReg. This function is explicit
6845// in that it requires the opcode.
// NOTE(review): the start of the signature (listing lines 6846-6847) and one
// line of the shift-operand statement (listing line 6921) are absent from this
// extract.
//
// The offset is applied by a loop that emits one Opc instruction per
// iteration until the remaining Offset reaches zero. Optional DWARF CFI and
// Windows SEH pseudo-instructions are emitted alongside.
6848 const DebugLoc &DL, unsigned DestReg,
6849 unsigned SrcReg, int64_t Offset, unsigned Opc,
6850 const TargetInstrInfo *TII,
6851 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
6852 bool *HasWinCFI, bool EmitCFAOffset,
6853 StackOffset CFAOffset, unsigned FrameReg) {
6854 int Sign = 1;
// MaxEncoding bounds the unshifted immediate of a single instruction and
// ShiftSize is the left shift that may be applied to it; both are used below
// to cap how much of Offset one instruction consumes.
6855 unsigned MaxEncoding, ShiftSize;
6856 switch (Opc) {
6857 case AArch64::ADDXri:
6858 case AArch64::ADDSXri:
6859 case AArch64::SUBXri:
6860 case AArch64::SUBSXri:
6861 MaxEncoding = 0xfff;
6862 ShiftSize = 12;
6863 break;
6864 case AArch64::ADDVL_XXI:
6865 case AArch64::ADDPL_XXI:
6866 case AArch64::ADDSVL_XXI:
6867 case AArch64::ADDSPL_XXI:
6868 MaxEncoding = 31;
6869 ShiftSize = 0;
// A negative offset is turned into a positive magnitude with Sign = -1; the
// bound in that direction is 32 rather than 31.
6870 if (Offset < 0) {
6871 MaxEncoding = 32;
6872 Sign = -1;
6873 Offset = -Offset;
6874 }
6875 break;
6876 default:
6877 llvm_unreachable("Unsupported opcode");
6878 }
6879
6880 // `Offset` can be in bytes or in "scalable bytes".
6881 int VScale = 1;
6882 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
6883 VScale = 16;
6884 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
6885 VScale = 2;
6886
6887 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
6888 // scratch register. If DestReg is a virtual register, use it as the
6889 // scratch register; otherwise, create a new virtual register (to be
6890 // replaced by the scavenger at the end of PEI). That case can be optimized
6891 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
6892 // register can be loaded with offset%8 and the add/sub can use an extending
6893 // instruction with LSL#3.
6894 // Currently the function handles any offsets but generates a poor sequence
6895 // of code.
6896 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
6897
6898 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
// Intermediate results go to TmpReg. When the destination is XZR a fresh
// virtual GPR64 is used for them instead.
6899 Register TmpReg = DestReg;
6900 if (TmpReg == AArch64::XZR)
6901 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
6902 &AArch64::GPR64RegClass);
6903 do {
// Take as much of the remaining offset as one instruction can encode. Values
// above MaxEncoding are expressed as a shifted immediate, which drops their
// low ShiftSize bits for this step; those bits stay in Offset for a later
// iteration.
6904 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
6905 unsigned LocalShiftSize = 0;
6906 if (ThisVal > MaxEncoding) {
6907 ThisVal = ThisVal >> ShiftSize;
6908 LocalShiftSize = ShiftSize;
6909 }
6910 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
6911 "Encoding cannot handle value that big");
6912
6913 Offset -= ThisVal << LocalShiftSize;
// The instruction that consumes the last of the offset writes DestReg.
6914 if (Offset == 0)
6915 TmpReg = DestReg;
6916 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
6917 .addReg(SrcReg)
6918 .addImm(Sign * (int)ThisVal);
6919 if (ShiftSize)
6920 MBI = MBI.addImm(
6922 MBI = MBI.setMIFlag(Flag);
6923
// Track the effect of this step on the CFA offset: subtracting steps (SUB
// opcodes, or scalable opcodes with a negated offset) increase it, adding
// steps decrease it.
6924 auto Change =
6925 VScale == 1
6926 ? StackOffset::getFixed(ThisVal << LocalShiftSize)
6927 : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
6928 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
6929 CFAOffset += Change;
6930 else
6931 CFAOffset -= Change;
// CFI is emitted only after an instruction that wrote DestReg.
6932 if (EmitCFAOffset && DestReg == TmpReg) {
6933 MachineFunction &MF = *MBB.getParent();
6934 const TargetSubtargetInfo &STI = MF.getSubtarget();
6935 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
6936
6937 unsigned CFIIndex = MF.addFrameInst(
6938 createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
6939 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
6940 .addCFIIndex(CFIIndex)
6941 .setMIFlags(Flag);
6942 }
6943
// Windows unwind info: pick the SEH pseudo matching this step. *HasWinCFI is
// set (when the pointer is non-null) whenever one is emitted.
6944 if (NeedsWinCFI) {
6945 int Imm = (int)(ThisVal << LocalShiftSize);
6946 if (VScale != 1 && DestReg == AArch64::SP) {
6947 if (HasWinCFI)
6948 *HasWinCFI = true;
6949 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AllocZ))
6950 .addImm(ThisVal)
6951 .setMIFlag(Flag);
6952 } else if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
6953 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
6954 assert(VScale == 1 && "Expected non-scalable operation");
6955 if (HasWinCFI)
6956 *HasWinCFI = true;
6957 if (Imm == 0)
6958 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
6959 else
6960 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
6961 .addImm(Imm)
6962 .setMIFlag(Flag);
6963 assert(Offset == 0 && "Expected remaining offset to be zero to "
6964 "emit a single SEH directive");
6965 } else if (DestReg == AArch64::SP) {
6966 assert(VScale == 1 && "Expected non-scalable operation");
6967 if (HasWinCFI)
6968 *HasWinCFI = true;
6969 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
6970 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
6971 .addImm(Imm)
6972 .setMIFlag(Flag);
6973 }
6974 }
6975
// Later steps continue from the register just written.
6976 SrcReg = TmpReg;
6977 } while (Offset);
6978}
6979
6982 unsigned DestReg, unsigned SrcReg,
6984 MachineInstr::MIFlag Flag, bool SetNZCV,
6985 bool NeedsWinCFI, bool *HasWinCFI,
6986 bool EmitCFAOffset, StackOffset CFAOffset,
6987 unsigned FrameReg) {
// NOTE(review): the first line(s) of this signature are not present in this
// listing (the embedded numbering skips them). The parameter list matches the
// emitFrameOffset(...) call made when ADDXri/ADDSXri frame-index users are
// rewritten later in this file -- confirm the name against the full source.
//
// Overview: Offset is decomposed into a fixed byte count plus counts of
// scalable data vectors and predicate vectors. Each non-zero part is emitted
// through emitFrameOffsetAdj(); after a part is emitted SrcReg is advanced to
// DestReg so the parts chain, and the local copy of CFAOffset is kept in step.
6988 // If a function is marked as arm_locally_streaming, then the runtime value of
6989 // vscale in the prologue/epilogue is different from the runtime value of vscale
6990 // in the function's body. To avoid having to consider multiple vscales,
6991 // we can use `addsvl` to allocate any scalable stack-slots, which under
6992 // most circumstances will be only locals, not callee-save slots.
6993 const Function &F = MBB.getParent()->getFunction();
6994 bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
6995
6996 int64_t Bytes, NumPredicateVectors, NumDataVectors;
6997 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6998 Offset, Bytes, NumPredicateVectors, NumDataVectors);
6999
7000 // Insert ADDSXri for scalable offset at the end.
// When flags are requested and a scalable part exists, flag setting is
// disabled for the individual steps and performed once by the trailing
// `ADDSXri DestReg, DestReg, 0, 0` at the bottom of this function.
7001 bool NeedsFinalDefNZCV = SetNZCV && (NumPredicateVectors || NumDataVectors);
7002 if (NeedsFinalDefNZCV)
7003 SetNZCV = false;
7004
7005 // First emit non-scalable frame offsets, or a simple 'mov'.
7006 if (Bytes || (!Offset && SrcReg != DestReg)) {
7007 assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
7008 "SP increment/decrement not 8-byte aligned");
7009 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
// A negative byte count is emitted as a SUB of its magnitude.
7010 if (Bytes < 0) {
7011 Bytes = -Bytes;
7012 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
7013 }
7014 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
7015 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
7016 FrameReg);
// Mirror the adjustment just emitted in the local CFA offset: an ADD
// contributes -Bytes, a SUB contributes +Bytes.
7017 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
7018 ? StackOffset::getFixed(-Bytes)
7019 : StackOffset::getFixed(Bytes);
// Later (scalable) steps start from, and describe the CFA relative to, the
// register that was just written.
7020 SrcReg = DestReg;
7021 FrameReg = DestReg;
7022 }
7023
7024 assert(!(NeedsWinCFI && NumPredicateVectors) &&
7025 "WinCFI can't allocate fractions of an SVE data vector");
7026
// Whole data vectors: ADDVL, or ADDSVL when UseSVL is set.
7027 if (NumDataVectors) {
7028 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
7029 UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, TII,
7030 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
7031 FrameReg);
7032 CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
7033 SrcReg = DestReg;
7034 }
7035
// Predicate-vector granules: ADDPL, or ADDSPL when UseSVL is set. Not allowed
// when the destination is SP (see the assert).
7036 if (NumPredicateVectors) {
7037 assert(DestReg != AArch64::SP && "Unaligned access to SP");
7038 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
7039 UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, TII,
7040 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
7041 FrameReg);
7042 }
7043
// Deferred flag-setting step (see NeedsFinalDefNZCV above): add zero to
// DestReg with the flag-setting ADDSXri opcode.
7044 if (NeedsFinalDefNZCV)
7045 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDSXri), DestReg)
7046 .addReg(DestReg)
7047 .addImm(0)
7048 .addImm(0);
7049}
7050
7053 int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS,
7054 VirtRegMap *VRM) const {
// NOTE(review): the first signature line(s) and two interior lines are absent
// from this listing (the embedded numbering skips them). MF, MI, Ops,
// InsertPt and TRI are used below but introduced on lines not shown here;
// confirm against the full source.
//
// Overview: tries to replace a COPY that is being spilled or filled with a
// direct store to / load from stack slot FrameIndex. Returns the newly
// created instruction on success and nullptr when nothing is folded.
7056 // This is a bit of a hack. Consider this instruction:
7057 //
7058 // %0 = COPY %sp; GPR64all:%0
7059 //
7060 // We explicitly chose GPR64all for the virtual register so such a copy might
7061 // be eliminated by RegisterCoalescer. However, that may not be possible, and
7062 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
7063 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
7064 //
7065 // To prevent that, we are going to constrain the %0 register class here.
7066 if (MI.isFullCopy()) {
7067 Register DstReg = MI.getOperand(0).getReg();
7068 Register SrcReg = MI.getOperand(1).getReg();
7069 if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
7070 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
7071 return nullptr;
7072 }
7073 if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
7074 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
7075 return nullptr;
7076 }
7077 // Nothing can be folded with copy from/to NZCV.
7078 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
7079 return nullptr;
7080 }
7081
7082 // Handle the case where a copy is being spilled or filled but the source
7083 // and destination register class don't match. For example:
7084 //
7085 // %0 = COPY %xzr; GPR64common:%0
7086 //
7087 // In this case we can still safely fold away the COPY and generate the
7088 // following spill code:
7089 //
7090 // STRXui %xzr, %stack.0
7091 //
7092 // This also eliminates spilled cross register class COPYs (e.g. between x and
7093 // d regs) of the same size. For example:
7094 //
7095 // %0 = COPY %1; GPR64:%0, FPR64:%1
7096 //
7097 // will be filled as
7098 //
7099 // LDRDui %0, fi<#0>
7100 //
7101 // instead of
7102 //
7103 // LDRXui %Temp, fi<#0>
7104 // %0 = FMOV %Temp
7105 //
7106 if (MI.isCopy() && Ops.size() == 1 &&
7107 // Make sure we're only folding the explicit COPY defs/uses.
7108 (Ops[0] == 0 || Ops[0] == 1)) {
// Operand 0 is the COPY's destination: folding it means the copied value is
// stored to the slot (spill). Operand 1 is the source: folding it means the
// value is loaded from the slot (fill).
7109 bool IsSpill = Ops[0] == 0;
7110 bool IsFill = !IsSpill;
7112 const MachineRegisterInfo &MRI = MF.getRegInfo();
7113 MachineBasicBlock &MBB = *MI.getParent();
7114 const MachineOperand &DstMO = MI.getOperand(0);
7115 const MachineOperand &SrcMO = MI.getOperand(1);
7116 Register DstReg = DstMO.getReg();
7117 Register SrcReg = SrcMO.getReg();
7118 // This is slightly expensive to compute for physical regs since
7119 // getMinimalPhysRegClass is slow.
7120 auto getRegClass = [&](unsigned Reg) {
7121 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
7122 : TRI.getMinimalPhysRegClass(Reg);
7123 };
7124
// Case 1: full-register COPY (no sub-register index on either side). Store
// the source or load the destination directly, using that register's own
// class.
7125 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
7126 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
7127 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
7128 "Mismatched register size in non subreg COPY");
7129 if (IsSpill)
7130 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
7131 getRegClass(SrcReg), Register());
7132 else
7133 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
7134 getRegClass(DstReg), Register());
// The store/load was inserted immediately before InsertPt; step back one
// instruction to return it.
7135 return &*--InsertPt;
7136 }
7137
7138 // Handle cases like spilling def of:
7139 //
7140 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
7141 //
7142 // where the physical register source can be widened and stored to the full
7143 // virtual reg destination stack slot, in this case producing:
7144 //
7145 // STRXui %xzr, %stack.0
7146 //
7147 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
7148 TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
7149 assert(SrcMO.getSubReg() == 0 &&
7150 "Unexpected subreg on physical register");
7151 storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
7152 FrameIndex, &AArch64::GPR64RegClass, Register());
7153 return &*--InsertPt;
7154 }
7155
7156 // Handle cases like filling use of:
7157 //
7158 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
7159 //
7160 // where we can load the full virtual reg source stack slot, into the subreg
7161 // destination, in this case producing:
7162 //
7163 // LDRWui %0:sub_32<def,read-undef>, %stack.0
7164 //
7165 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
// Pick the register class to load with from the destination's sub-register
// index; FillRC stays null (no fold) for any other index, or for sub_32
// when DstReg's class is not GPR64 or a subclass of it.
7166 const TargetRegisterClass *FillRC = nullptr;
7167 switch (DstMO.getSubReg()) {
7168 default:
7169 break;
7170 case AArch64::sub_32:
7171 if (AArch64::GPR64RegClass.hasSubClassEq(getRegClass(DstReg)))
7172 FillRC = &AArch64::GPR32RegClass;
7173 break;
7174 case AArch64::ssub:
7175 FillRC = &AArch64::FPR32RegClass;
7176 break;
7177 case AArch64::dsub:
7178 FillRC = &AArch64::FPR64RegClass;
7179 break;
7180 }
7181
7182 if (FillRC) {
7183 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
7184 TRI.getRegSizeInBits(*FillRC) &&
7185 "Mismatched regclass size on folded subreg COPY");
7186 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC,
7187 Register());
// Patch the freshly emitted load so that its destination operand carries
// the COPY's sub-register index and the undef flag.
7188 MachineInstr &LoadMI = *--InsertPt;
7189 MachineOperand &LoadDst = LoadMI.getOperand(0);
7190 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
7191 LoadDst.setSubReg(DstMO.getSubReg());
7192 LoadDst.setIsUndef();
7193 return &LoadMI;
7194 }
7195 }
7196 }
7197
7198 // Cannot fold.
7199 return nullptr;
7200}
7201
7203 StackOffset &SOffset,
7204 bool *OutUseUnscaledOp,
7205 unsigned *OutUnscaledOp,
7206 int64_t *EmittableOffset) {
// NOTE(review): several lines of this definition are absent from this listing
// (the embedded numbering skips them): the first signature line, the statement
// under the early-exit case list, the initialiser of UnscaledOp, part of the
// HighPart condition, and the start of the final return expression. The
// comments below describe only what the visible lines establish.
//
// Overview: combines SOffset with MI's current immediate and splits the
// result into an immediate that MI (or its unscaled variant) can encode,
// reported via *EmittableOffset, and a residue that is written back to
// SOffset. The optional out-pointers may be null; each is checked before use.
7207 // Set output values in case of early exit.
7208 if (EmittableOffset)
7209 *EmittableOffset = 0;
7210 if (OutUseUnscaledOp)
7211 *OutUseUnscaledOp = false;
7212 if (OutUnscaledOp)
7213 *OutUnscaledOp = 0;
7214
7215 // Exit early for structured vector spills/fills as they can't take an
7216 // immediate offset.
7217 switch (MI.getOpcode()) {
7218 default:
7219 break;
7220 case AArch64::LD1Rv1d:
7221 case AArch64::LD1Rv2s:
7222 case AArch64::LD1Rv2d:
7223 case AArch64::LD1Rv4h:
7224 case AArch64::LD1Rv4s:
7225 case AArch64::LD1Rv8b:
7226 case AArch64::LD1Rv8h:
7227 case AArch64::LD1Rv16b:
7228 case AArch64::LD1Twov2d:
7229 case AArch64::LD1Threev2d:
7230 case AArch64::LD1Fourv2d:
7231 case AArch64::LD1Twov1d:
7232 case AArch64::LD1Threev1d:
7233 case AArch64::LD1Fourv1d:
7234 case AArch64::ST1Twov2d:
7235 case AArch64::ST1Threev2d:
7236 case AArch64::ST1Fourv2d:
7237 case AArch64::ST1Twov1d:
7238 case AArch64::ST1Threev1d:
7239 case AArch64::ST1Fourv1d:
7240 case AArch64::ST1i8:
7241 case AArch64::ST1i16:
7242 case AArch64::ST1i32:
7243 case AArch64::ST1i64:
7244 case AArch64::IRG:
7245 case AArch64::IRGstack:
7246 case AArch64::STGloop:
7247 case AArch64::STZGloop:
7249 }
7250
7251 // Get the min/max offset and the scale.
7252 TypeSize ScaleValue(0U, false), Width(0U, false);
7253 int64_t MinOff, MaxOff;
7254 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
7255 MaxOff))
7256 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
7257
7258 // Construct the complete offset.
// A scalable ScaleValue selects the scalable component of SOffset, otherwise
// the fixed component is used. MI's existing immediate is scaled and added.
7259 bool IsMulVL = ScaleValue.isScalable();
7260 unsigned Scale = ScaleValue.getKnownMinValue();
7261 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
7262
7263 const MachineOperand &ImmOpnd =
7264 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
7265 Offset += ImmOpnd.getImm() * Scale;
7266
7267 // If the offset doesn't match the scale, we rewrite the instruction to
7268 // use the unscaled instruction instead. Likewise, if we have a negative
7269 // offset and there is an unscaled op to use.
7270 std::optional<unsigned> UnscaledOp =
7272 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
// When switching to the unscaled opcode, re-query the scale and offset range
// for that opcode.
7273 if (useUnscaledOp &&
7274 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
7275 MaxOff))
7276 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
7277
7278 Scale = ScaleValue.getKnownMinValue();
7279 assert(IsMulVL == ScaleValue.isScalable() &&
7280 "Unscaled opcode has different value for scalable");
7281
7282 int64_t Remainder = Offset % Scale;
7283 assert(!(Remainder && useUnscaledOp) &&
7284 "Cannot have remainder when using unscaled op");
7285
7286 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
// NewOffset is the candidate immediate in units of Scale. If it is within
// [MinOff, MaxOff], only the sub-Scale remainder is left over in Offset.
7287 int64_t NewOffset = Offset / Scale;
7288 if (MinOff <= NewOffset && NewOffset <= MaxOff)
7289 Offset = Remainder;
7290 else {
7291 // Try to minimise the number of instructions required to materialise the
7292 // offset calculation. Specifically, for fixed offsets, if masking out the
7293 // low 12 bits leaves a legal add immediate, we can realise the offset
7294 // calculation with a single add instruction. Whenever this is possible,
7295 // prefer this split.
7296 int64_t HighPart = Offset & ~0xFFF;
7297 int64_t LowPart = Offset & 0xFFF;
7298 int64_t LowScaled = LowPart / Scale;
7299 if (!IsMulVL && NewOffset >= 0 && LowPart % Scale == 0 &&
7300 MinOff <= LowScaled && LowScaled <= MaxOff &&
7302 NewOffset = LowScaled;
7303 Offset = HighPart;
7304 } else {
7305 // Default to a greedy split: take the memop immediate to be maximum /
7306 // minimum expressible offset and materialise the remainder.
7307 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
7308 Offset = Offset - (NewOffset * Scale);
7309 }
7310 }
7311
7312 if (EmittableOffset)
7313 *EmittableOffset = NewOffset;
7314 if (OutUseUnscaledOp)
7315 *OutUseUnscaledOp = useUnscaledOp;
7316 if (OutUnscaledOp && UnscaledOp)
7317 *OutUnscaledOp = *UnscaledOp;
7318
// Write the residue back into the component of SOffset that was consumed,
// leaving the other component unchanged.
7319 if (IsMulVL)
7320 SOffset = StackOffset::get(SOffset.getFixed(), Offset);
7321 else
7322 SOffset = StackOffset::get(Offset, SOffset.getScalable());
// The visible tail of the return expression contributes
// AArch64FrameOffsetIsLegal only when no residue remains in SOffset.
7324 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
7325}
7326
7328 unsigned FrameReg, StackOffset &Offset,
7329 const AArch64InstrInfo *TII) {
// NOTE(review): the first signature line and the two lines that follow the
// isAArch64FrameOffsetLegal() call are absent from this listing (the embedded
// numbering skips them), so the condition guarding the rewrite block below is
// not visible here. MI and FrameRegIdx are presumably parameters from the
// elided signature line -- confirm against the full source.
//
// Overview: rewrites the frame-index operand at FrameRegIdx of MI to use
// FrameReg plus an immediate. Offset is updated in place to whatever part of
// the offset could not be absorbed by MI.
7330 unsigned Opcode = MI.getOpcode();
// The immediate operand is taken to be the one right after the frame operand.
7331 unsigned ImmIdx = FrameRegIdx + 1;
7332
// ADDXri/ADDSXri: fold the instruction's own immediate into Offset, emit the
// whole computation afresh via emitFrameOffset() (flag-setting only for
// ADDSXri), erase MI, and report that nothing is left over.
7333 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
7334 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
7335 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
7336 MI.getOperand(0).getReg(), FrameReg, Offset, TII,
7337 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
7338 MI.eraseFromParent();
7339 Offset = StackOffset();
7340 return true;
7341 }
7342
7343 int64_t NewOffset;
7344 unsigned UnscaledOp;
7345 bool UseUnscaledOp;
7346 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
7347 &UnscaledOp, &NewOffset);
7350 // Replace the FrameIndex with FrameReg.
7351 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
// Switch to the unscaled opcode when isAArch64FrameOffsetLegal asked for it.
7352 if (UseUnscaledOp)
7353 MI.setDesc(TII->get(UnscaledOp));
7354
7355 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
// True only when no residual offset remains in Offset.
7356 return !Offset;
7357 }
7358
7359 return false;
7360}
7361
7367
7368MCInst AArch64InstrInfo::getNop() const { return MCInstBuilder(AArch64::NOP); }
7369
7370// AArch64 supports MachineCombiner.
7371bool AArch64InstrInfo::useMachineCombiner() const { return true; }
7372
7373// True when Opc sets flag
7374static bool isCombineInstrSettingFlag(unsigned Opc) {
7375 switch (Opc) {
7376 case AArch64::ADDSWrr:
7377 case AArch64::ADDSWri:
7378 case AArch64::ADDSXrr:
7379 case AArch64::ADDSXri:
7380 case AArch64::SUBSWrr:
7381 case AArch64::SUBSXrr:
7382 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7383 case AArch64::SUBSWri:
7384 case AArch64::SUBSXri:
7385 return true;
7386 default:
7387 break;
7388 }
7389 return false;
7390}
7391
7392// 32b Opcodes that can be combined with a MUL
7393static bool isCombineInstrCandidate32(unsigned Opc) {
7394 switch (Opc) {
7395 case AArch64::ADDWrr:
7396 case AArch64::ADDWri:
7397 case AArch64::SUBWrr:
7398 case AArch64::ADDSWrr:
7399 case AArch64::ADDSWri:
7400 case AArch64::SUBSWrr:
7401 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7402 case AArch64::SUBWri:
7403 case AArch64::SUBSWri:
7404 return true;
7405 default:
7406 break;
7407 }
7408 return false;
7409}
7410
7411// 64b Opcodes that can be combined with a MUL
7412static bool isCombineInstrCandidate64(unsigned Opc) {
7413 switch (Opc) {
7414 case AArch64::ADDXrr:
7415 case AArch64::ADDXri:
7416 case AArch64::SUBXrr:
7417 case AArch64::ADDSXrr:
7418 case AArch64::ADDSXri:
7419 case AArch64::SUBSXrr:
7420 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7421 case AArch64::SUBXri:
7422 case AArch64::SUBSXri:
7423 case AArch64::ADDv8i8:
7424 case AArch64::ADDv16i8:
7425 case AArch64::ADDv4i16:
7426 case AArch64::ADDv8i16:
7427 case AArch64::ADDv2i32:
7428 case AArch64::ADDv4i32:
7429 case AArch64::SUBv8i8:
7430 case AArch64::SUBv16i8:
7431 case AArch64::SUBv4i16:
7432 case AArch64::SUBv8i16:
7433 case AArch64::SUBv2i32:
7434 case AArch64::SUBv4i32:
7435 return true;
7436 default:
7437 break;
7438 }
7439 return false;
7440}
7441
7442// FP Opcodes that can be combined with a FMUL.
7443static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
7444 switch (Inst.getOpcode()) {
7445 default:
7446 break;
7447 case AArch64::FADDHrr:
7448 case AArch64::FADDSrr:
7449 case AArch64::FADDDrr:
7450 case AArch64::FADDv4f16:
7451 case AArch64::FADDv8f16:
7452 case AArch64::FADDv2f32:
7453 case AArch64::FADDv2f64:
7454 case AArch64::FADDv4f32:
7455 case AArch64::FSUBHrr:
7456 case AArch64::FSUBSrr:
7457 case AArch64::FSUBDrr:
7458 case AArch64::FSUBv4f16:
7459 case AArch64::FSUBv8f16:
7460 case AArch64::FSUBv2f32:
7461 case AArch64::FSUBv2f64:
7462 case AArch64::FSUBv4f32:
7464 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
7465 // the target options or if FADD/FSUB has the contract fast-math flag.
7466 return Options.AllowFPOpFusion == FPOpFusion::Fast ||
7468 }
7469 return false;
7470}
7471
7472// Opcodes that can be combined with a MUL
7476
7477//
7478// Utility routine that checks if \param MO is defined by an
7479// \param CombineOpc instruction in the basic block \param MBB
// NOTE(review): the first signature line is absent from this listing (the
// embedded numbering skips it); MBB and MO are presumably declared there.
//
// When \param CheckZeroReg is set, the defining instruction must additionally
// have \param ZeroReg as its operand 3. Returns true only if every check
// below passes.
7481 unsigned CombineOpc, unsigned ZeroReg = 0,
7482 bool CheckZeroReg = false) {
7483 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7484 MachineInstr *MI = nullptr;
7485
// Only virtual registers are looked up; MI stays null otherwise and the
// check below rejects the operand.
7486 if (MO.isReg() && MO.getReg().isVirtual())
7487 MI = MRI.getUniqueVRegDef(MO.getReg());
7488 // And it needs to be in the trace (otherwise, it won't have a depth).
7489 if (!MI || MI->getParent() != &MBB || MI->getOpcode() != CombineOpc)
7490 return false;
7491 // Must only be used by the user we combine with.
7492 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
7493 return false;
7494
7495 if (CheckZeroReg) {
7496 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
7497 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
7498 MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
7499 // The third input reg must be zero.
7500 if (MI->getOperand(3).getReg() != ZeroReg)
7501 return false;
7502 }
7503
// For flag-setting CombineOpc values, reject the candidate when this lookup
// finds no matching NZCV def on MI (a result of -1 from the same lookup is
// treated as "NZCV is live" elsewhere in this file).
7504 if (isCombineInstrSettingFlag(CombineOpc) &&
7505 MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
7506 return false;
7507
7508 return true;
7509}
7510
7511//
7512// Is \param MO defined by an integer multiply and can be combined?
// NOTE(review): the first signature line is absent from this listing; MBB and
// MO are presumably declared there.
// Forwards to canCombine() with the zero-register check enabled, so the
// defining \param MulOpc instruction must have \param ZeroReg as operand 3.
7514 unsigned MulOpc, unsigned ZeroReg) {
7515 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
7516}
7517
7518//
7519// Is \param MO defined by a floating-point multiply and can be combined?
// NOTE(review): the first signature line is absent from this listing; MBB and
// MO are presumably declared there.
// Forwards to canCombine() with its defaults, i.e. without the zero-register
// check.
7521 unsigned MulOpc) {
7522 return canCombine(MBB, MO, MulOpc);
7523}
7524
7525// TODO: There are many more machine instruction opcodes to match:
7526// 1. Other data types (integer, vectors)
7527// 2. Other math / logic operations (xor, or)
7528// 3. Other forms of the same operation (intrinsics and other variants)
7529bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
7530 bool Invert) const {
7531 if (Invert)
7532 return false;
7533 switch (Inst.getOpcode()) {
7534 // == Floating-point types ==
7535 // -- Floating-point instructions --
7536 case AArch64::FADDHrr:
7537 case AArch64::FADDSrr:
7538 case AArch64::FADDDrr:
7539 case AArch64::FMULHrr:
7540 case AArch64::FMULSrr:
7541 case AArch64::FMULDrr:
7542 case AArch64::FMULX16:
7543 case AArch64::FMULX32:
7544 case AArch64::FMULX64:
7545 // -- Advanced SIMD instructions --
7546 case AArch64::FADDv4f16:
7547 case AArch64::FADDv8f16:
7548 case AArch64::FADDv2f32:
7549 case AArch64::FADDv4f32:
7550 case AArch64::FADDv2f64:
7551 case AArch64::FMULv4f16:
7552 case AArch64::FMULv8f16:
7553 case AArch64::FMULv2f32:
7554 case AArch64::FMULv4f32:
7555 case AArch64::FMULv2f64:
7556 case AArch64::FMULXv4f16:
7557 case AArch64::FMULXv8f16:
7558 case AArch64::FMULXv2f32:
7559 case AArch64::FMULXv4f32:
7560 case AArch64::FMULXv2f64:
7561 // -- SVE instructions --
7562 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
7563 // in the SVE instruction set (though there are predicated ones).
7564 case AArch64::FADD_ZZZ_H:
7565 case AArch64::FADD_ZZZ_S:
7566 case AArch64::FADD_ZZZ_D:
7567 case AArch64::FMUL_ZZZ_H:
7568 case AArch64::FMUL_ZZZ_S:
7569 case AArch64::FMUL_ZZZ_D:
7572
7573 // == Integer types ==
7574 // -- Base instructions --
7575 // Opcodes MULWrr and MULXrr don't exist because
7576 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
7577 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
7578 // The machine-combiner does not support three-source-operands machine
7579 // instruction. So we cannot reassociate MULs.
7580 case AArch64::ADDWrr:
7581 case AArch64::ADDXrr:
7582 case AArch64::ANDWrr:
7583 case AArch64::ANDXrr:
7584 case AArch64::ORRWrr:
7585 case AArch64::ORRXrr:
7586 case AArch64::EORWrr:
7587 case AArch64::EORXrr:
7588 case AArch64::EONWrr:
7589 case AArch64::EONXrr:
7590 // -- Advanced SIMD instructions --
7591 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
7592 // in the Advanced SIMD instruction set.
7593 case AArch64::ADDv8i8:
7594 case AArch64::ADDv16i8:
7595 case AArch64::ADDv4i16:
7596 case AArch64::ADDv8i16:
7597 case AArch64::ADDv2i32:
7598 case AArch64::ADDv4i32:
7599 case AArch64::ADDv1i64:
7600 case AArch64::ADDv2i64:
7601 case AArch64::MULv8i8:
7602 case AArch64::MULv16i8:
7603 case AArch64::MULv4i16:
7604 case AArch64::MULv8i16:
7605 case AArch64::MULv2i32:
7606 case AArch64::MULv4i32:
7607 case AArch64::ANDv8i8:
7608 case AArch64::ANDv16i8:
7609 case AArch64::ORRv8i8:
7610 case AArch64::ORRv16i8:
7611 case AArch64::EORv8i8:
7612 case AArch64::EORv16i8:
7613 // -- SVE instructions --
7614 case AArch64::ADD_ZZZ_B:
7615 case AArch64::ADD_ZZZ_H:
7616 case AArch64::ADD_ZZZ_S:
7617 case AArch64::ADD_ZZZ_D:
7618 case AArch64::MUL_ZZZ_B:
7619 case AArch64::MUL_ZZZ_H:
7620 case AArch64::MUL_ZZZ_S:
7621 case AArch64::MUL_ZZZ_D:
7622 case AArch64::AND_ZZZ:
7623 case AArch64::ORR_ZZZ:
7624 case AArch64::EOR_ZZZ:
7625 return true;
7626
7627 default:
7628 return false;
7629 }
7630}
7631
7632/// Find instructions that can be turned into madd.
///
/// For each operand of \p Root that is defined by a combinable multiply, the
/// matching pattern ID is appended to \p Patterns. Returns true if at least
/// one pattern was appended.
///
/// NOTE(review): the first signature line and a few interior lines are absent
/// from this listing (the embedded numbering skips them), among them the
/// opening of the block that handles flag-setting opcodes. \p Root is
/// presumably declared on the elided signature line.
7634 SmallVectorImpl<unsigned> &Patterns) {
7635 unsigned Opc = Root.getOpcode();
7636 MachineBasicBlock &MBB = *Root.getParent();
7637 bool Found = false;
7638
7640 return false;
7642 int Cmp_NZCV =
7643 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
7644 // When NZCV is live bail out.
7645 if (Cmp_NZCV == -1)
7646 return false;
// From here on, match on the non-flag-setting form of the opcode.
7647 unsigned NewOpc = convertToNonFlagSettingOpc(Root);
7648 // When opcode can't change bail out.
7649 // CHECKME: do we miss any cases for opcode conversion?
7650 if (NewOpc == Opc)
7651 return false;
7652 Opc = NewOpc;
7653 }
7654
// Scalar helper: records Pattern when operand number Operand of Root is
// defined by a combinable Opcode instruction whose operand 3 is ZeroReg
// (canCombineWithMUL enables that check).
7655 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
7656 unsigned Pattern) {
7657 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
7658 Patterns.push_back(Pattern);
7659 Found = true;
7660 }
7661 };
7662
// Vector helper: same as above but without the zero-register check.
7663 auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
7664 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
7665 Patterns.push_back(Pattern);
7666 Found = true;
7667 }
7668 };
7669
7671
// Each case probes the relevant operand(s) of Root. Several probes may
// succeed, in which case several patterns are appended.
7672 switch (Opc) {
7673 default:
7674 break;
7675 case AArch64::ADDWrr:
7676 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7677 "ADDWrr does not have register operands");
7678 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
7679 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
7680 break;
7681 case AArch64::ADDXrr:
7682 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
7683 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
7684 break;
// Note: for the scalar SUB forms operand 2 is probed before operand 1, so
// the *_OP2 pattern is appended first.
7685 case AArch64::SUBWrr:
7686 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
7687 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
7688 break;
7689 case AArch64::SUBXrr:
7690 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
7691 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
7692 break;
// Immediate forms: only operand 1 is probed.
7693 case AArch64::ADDWri:
7694 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
7695 break;
7696 case AArch64::ADDXri:
7697 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
7698 break;
7699 case AArch64::SUBWri:
7700 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
7701 break;
7702 case AArch64::SUBXri:
7703 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
7704 break;
// Vector forms: the 16- and 32-bit element types also probe the *_indexed
// multiply opcodes; the 8-bit element types do not.
7705 case AArch64::ADDv8i8:
7706 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
7707 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
7708 break;
7709 case AArch64::ADDv16i8:
7710 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
7711 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
7712 break;
7713 case AArch64::ADDv4i16:
7714 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
7715 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
7716 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
7717 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
7718 break;
7719 case AArch64::ADDv8i16:
7720 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
7721 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
7722 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
7723 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
7724 break;
7725 case AArch64::ADDv2i32:
7726 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
7727 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
7728 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
7729 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
7730 break;
7731 case AArch64::ADDv4i32:
7732 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
7733 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
7734 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
7735 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
7736 break;
7737 case AArch64::SUBv8i8:
7738 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
7739 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
7740 break;
7741 case AArch64::SUBv16i8:
7742 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
7743 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
7744 break;
7745 case AArch64::SUBv4i16:
7746 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
7747 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
7748 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
7749 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
7750 break;
7751 case AArch64::SUBv8i16:
7752 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
7753 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
7754 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
7755 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
7756 break;
7757 case AArch64::SUBv2i32:
7758 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
7759 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
7760 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
7761 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
7762 break;
7763 case AArch64::SUBv4i32:
7764 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
7765 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
7766 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
7767 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
7768 break;
7769 }
7770 return Found;
7771}
7772
7773bool AArch64InstrInfo::isAccumulationOpcode(unsigned Opcode) const {
7774 switch (Opcode) {
7775 default:
7776 break;
7777 case AArch64::UABALB_ZZZ_D:
7778 case AArch64::UABALB_ZZZ_H:
7779 case AArch64::UABALB_ZZZ_S:
7780 case AArch64::UABALT_ZZZ_D:
7781 case AArch64::UABALT_ZZZ_H:
7782 case AArch64::UABALT_ZZZ_S:
7783 case AArch64::SABALB_ZZZ_D:
7784 case AArch64::SABALB_ZZZ_S:
7785 case AArch64::SABALB_ZZZ_H:
7786 case AArch64::SABALT_ZZZ_D:
7787 case AArch64::SABALT_ZZZ_S:
7788 case AArch64::SABALT_ZZZ_H:
7789 case AArch64::UABALv16i8_v8i16:
7790 case AArch64::UABALv2i32_v2i64:
7791 case AArch64::UABALv4i16_v4i32:
7792 case AArch64::UABALv4i32_v2i64:
7793 case AArch64::UABALv8i16_v4i32:
7794 case AArch64::UABALv8i8_v8i16:
7795 case AArch64::UABAv16i8:
7796 case AArch64::UABAv2i32:
7797 case AArch64::UABAv4i16:
7798 case AArch64::UABAv4i32:
7799 case AArch64::UABAv8i16:
7800 case AArch64::UABAv8i8:
7801 case AArch64::SABALv16i8_v8i16:
7802 case AArch64::SABALv2i32_v2i64:
7803 case AArch64::SABALv4i16_v4i32:
7804 case AArch64::SABALv4i32_v2i64:
7805 case AArch64::SABALv8i16_v4i32:
7806 case AArch64::SABALv8i8_v8i16:
7807 case AArch64::SABAv16i8:
7808 case AArch64::SABAv2i32:
7809 case AArch64::SABAv4i16:
7810 case AArch64::SABAv4i32:
7811 case AArch64::SABAv8i16:
7812 case AArch64::SABAv8i8:
7813 return true;
7814 }
7815
7816 return false;
7817}
7818
7819unsigned AArch64InstrInfo::getAccumulationStartOpcode(
7820 unsigned AccumulationOpcode) const {
7821 switch (AccumulationOpcode) {
7822 default:
7823 llvm_unreachable("Unsupported accumulation Opcode!");
7824 case AArch64::UABALB_ZZZ_D:
7825 return AArch64::UABDLB_ZZZ_D;
7826 case AArch64::UABALB_ZZZ_H:
7827 return AArch64::UABDLB_ZZZ_H;
7828 case AArch64::UABALB_ZZZ_S:
7829 return AArch64::UABDLB_ZZZ_S;
7830 case AArch64::UABALT_ZZZ_D:
7831 return AArch64::UABDLT_ZZZ_D;
7832 case AArch64::UABALT_ZZZ_H:
7833 return AArch64::UABDLT_ZZZ_H;
7834 case AArch64::UABALT_ZZZ_S:
7835 return AArch64::UABDLT_ZZZ_S;
7836 case AArch64::UABALv16i8_v8i16:
7837 return AArch64::UABDLv16i8_v8i16;
7838 case AArch64::UABALv2i32_v2i64:
7839 return AArch64::UABDLv2i32_v2i64;
7840 case AArch64::UABALv4i16_v4i32:
7841 return AArch64::UABDLv4i16_v4i32;
7842 case AArch64::UABALv4i32_v2i64:
7843 return AArch64::UABDLv4i32_v2i64;
7844 case AArch64::UABALv8i16_v4i32:
7845 return AArch64::UABDLv8i16_v4i32;
7846 case AArch64::UABALv8i8_v8i16:
7847 return AArch64::UABDLv8i8_v8i16;
7848 case AArch64::UABAv16i8:
7849 return AArch64::UABDv16i8;
7850 case AArch64::UABAv2i32:
7851 return AArch64::UABDv2i32;
7852 case AArch64::UABAv4i16:
7853 return AArch64::UABDv4i16;
7854 case AArch64::UABAv4i32:
7855 return AArch64::UABDv4i32;
7856 case AArch64::UABAv8i16:
7857 return AArch64::UABDv8i16;
7858 case AArch64::UABAv8i8:
7859 return AArch64::UABDv8i8;
7860 case AArch64::SABALB_ZZZ_D:
7861 return AArch64::SABDLB_ZZZ_D;
7862 case AArch64::SABALB_ZZZ_S:
7863 return AArch64::SABDLB_ZZZ_S;
7864 case AArch64::SABALB_ZZZ_H:
7865 return AArch64::SABDLB_ZZZ_H;
7866 case AArch64::SABALT_ZZZ_D:
7867 return AArch64::SABDLT_ZZZ_D;
7868 case AArch64::SABALT_ZZZ_S:
7869 return AArch64::SABDLT_ZZZ_S;
7870 case AArch64::SABALT_ZZZ_H:
7871 return AArch64::SABDLT_ZZZ_H;
7872 case AArch64::SABALv16i8_v8i16:
7873 return AArch64::SABDLv16i8_v8i16;
7874 case AArch64::SABALv2i32_v2i64:
7875 return AArch64::SABDLv2i32_v2i64;
7876 case AArch64::SABALv4i16_v4i32:
7877 return AArch64::SABDLv4i16_v4i32;
7878 case AArch64::SABALv4i32_v2i64:
7879 return AArch64::SABDLv4i32_v2i64;
7880 case AArch64::SABALv8i16_v4i32:
7881 return AArch64::SABDLv8i16_v4i32;
7882 case AArch64::SABALv8i8_v8i16:
7883 return AArch64::SABDLv8i8_v8i16;
7884 case AArch64::SABAv16i8:
7885 return AArch64::SABDv16i8;
7886 case AArch64::SABAv2i32:
7887 return AArch64::SABAv2i32;
7888 case AArch64::SABAv4i16:
7889 return AArch64::SABDv4i16;
7890 case AArch64::SABAv4i32:
7891 return AArch64::SABDv4i32;
7892 case AArch64::SABAv8i16:
7893 return AArch64::SABDv8i16;
7894 case AArch64::SABAv8i8:
7895 return AArch64::SABDv8i8;
7896 }
7897}
7898
7899/// Floating-Point Support
7900
7901/// Find floating-point add/subtract instructions whose operand is produced by
/// an FMUL/FNMUL that canCombineWithFMUL accepts, and record the matching
/// fused pattern (FMULADD*/FMULSUB*/FNMULSUB*/FMLA*/FMLS*) for that operand.
///
/// \param Root instruction to inspect; rejected up front unless
///        isCombineInstrCandidateFP(Root) holds.
/// \param [out] Patterns receives one entry per successful Match call.
/// \returns true if at least one pattern was appended.
// NOTE(review): listing lines 7902 and 7919 are missing from this extract.
// 7902 presumably holds the function name and the `Root` parameter, and 7919
// presumably introduces the `MCP` alias used below -- confirm against the file.
7903 SmallVectorImpl<unsigned> &Patterns) {
7904
7905 if (!isCombineInstrCandidateFP(Root))
7906 return false;
7907
7908 MachineBasicBlock &MBB = *Root.getParent();
7909 bool Found = false;
7910
 // Appends Pattern when operand `Operand` of Root can be combined with an
 // `Opcode` multiply; the actual legality check lives in canCombineWithFMUL.
7911 auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
7912 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
7913 Patterns.push_back(Pattern);
7914 return true;
7915 }
7916 return false;
7917 };
7918
7920
7921 switch (Root.getOpcode()) {
7922 default:
 // With assertions compiled out this falls through to `break` and the
 // function returns false.
7923 assert(false && "Unsupported FP instruction in combiner\n");
7924 break;
7925 case AArch64::FADDHrr:
7926 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7927 "FADDHrr does not have register operands");
7928
 // Both operands are always tried; each may contribute its own pattern.
7929 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
7930 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
7931 break;
7932 case AArch64::FADDSrr:
7933 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7934 "FADDSrr does not have register operands");
7935
 // `A || B` short-circuits: the second multiply form is tried only when the
 // first did not match, so each operand contributes at most one pattern.
 // The same shape is used by all the cases below.
7936 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
7937 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
7938
7939 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
7940 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
7941 break;
7942 case AArch64::FADDDrr:
7943 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
7944 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
7945
7946 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
7947 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
7948 break;
7949 case AArch64::FADDv4f16:
7950 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
7951 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
7952
7953 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
7954 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
7955 break;
7956 case AArch64::FADDv8f16:
7957 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
7958 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
7959
7960 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
7961 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
7962 break;
7963 case AArch64::FADDv2f32:
7964 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
7965 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
7966
7967 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
7968 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
7969 break;
7970 case AArch64::FADDv2f64:
7971 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
7972 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
7973
7974 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
7975 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
7976 break;
7977 case AArch64::FADDv4f32:
7978 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
7979 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
7980
7981 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
7982 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
7983 break;
7984 case AArch64::FSUBHrr:
 // Scalar subtracts additionally try an FNMUL feeding operand 1.
7985 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
7986 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
7987 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
7988 break;
7989 case AArch64::FSUBSrr:
7990 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
7991
7992 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
7993 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
7994
7995 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
7996 break;
7997 case AArch64::FSUBDrr:
7998 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
7999
8000 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
8001 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
8002
8003 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
8004 break;
8005 case AArch64::FSUBv4f16:
 // Vector subtracts try operand 2 before operand 1, so an operand-2 pattern
 // is pushed ahead of an operand-1 pattern in Patterns.
8006 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
8007 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
8008
8009 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
8010 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
8011 break;
8012 case AArch64::FSUBv8f16:
8013 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
8014 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
8015
8016 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
8017 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
8018 break;
8019 case AArch64::FSUBv2f32:
8020 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
8021 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
8022
8023 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
8024 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
8025 break;
8026 case AArch64::FSUBv2f64:
8027 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
8028 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
8029
8030 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
8031 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
8032 break;
8033 case AArch64::FSUBv4f32:
8034 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
8035 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
8036
8037 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
8038 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
8039 break;
8040 }
8041 return Found;
8042}
8043
/// Match a vector FMUL whose operand 1 or 2 is defined by a DUP-lane
/// instruction (optionally seen through one COPY of a virtual register) and
/// record the corresponding FMUL*_indexed pattern for that operand.
///
/// \param [out] Patterns receives one entry per matching operand.
/// \returns true if at least one pattern was appended; false for any Root
///          opcode not listed in the switch.
// NOTE(review): listing lines 8044 and 8066 are missing from this extract
// (presumably the function name with the `Root` parameter, and the `MCP`
// alias) -- confirm against the file.
8045 SmallVectorImpl<unsigned> &Patterns) {
8046 MachineBasicBlock &MBB = *Root.getParent();
8047 bool Found = false;
8048
 // Appends Pattern when operand `Operand` of Root is a virtual register whose
 // unique definition (after skipping one COPY) has opcode `Opcode`.
8049 auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
8050 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8051 MachineOperand &MO = Root.getOperand(Operand);
8052 MachineInstr *MI = nullptr;
8053 if (MO.isReg() && MO.getReg().isVirtual())
8054 MI = MRI.getUniqueVRegDef(MO.getReg());
8055 // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
8056 if (MI && MI->getOpcode() == TargetOpcode::COPY &&
8057 MI->getOperand(1).getReg().isVirtual())
8058 MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
8059 if (MI && MI->getOpcode() == Opcode) {
8060 Patterns.push_back(Pattern);
8061 return true;
8062 }
8063 return false;
8064 };
8065
8067
 // Both operands are always tried, so a FMUL of two DUPs yields two patterns.
8068 switch (Root.getOpcode()) {
8069 default:
8070 return false;
8071 case AArch64::FMULv2f32:
8072 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
8073 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
8074 break;
8075 case AArch64::FMULv2f64:
8076 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
8077 Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
8078 break;
8079 case AArch64::FMULv4f16:
8080 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
8081 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
8082 break;
8083 case AArch64::FMULv4f32:
8084 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
8085 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
8086 break;
8087 case AArch64::FMULv8f16:
8088 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
8089 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
8090 break;
8091 }
8092
8093 return Found;
8094}
8095
/// Match a scalar FNEG (FNEGDr/FNEGSr) whose source is an FMADD of the same
/// width and record the FNMADD combiner pattern.
///
/// \param [out] Patterns receives FNMADD on a match.
/// \returns true if the pattern was appended.
// NOTE(review): listing lines 8096, 8104 and 8107-8109 are missing from this
// extract. 8104 presumably defines `MI` (used below but not declared in the
// visible text), and 8107-8109 presumably hold further conditions of the `if`
// -- confirm against the file before relying on this description.
8097 SmallVectorImpl<unsigned> &Patterns) {
8098 unsigned Opc = Root.getOpcode();
8099 MachineBasicBlock &MBB = *Root.getParent();
8100 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8101
 // Visible requirements: MI has opcode `Opcode`, its result has exactly one
 // non-debug use, and MI carries the FmNsz flag.
8102 auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
8103 MachineOperand &MO = Root.getOperand(1);
8105 if (MI != nullptr && (MI->getOpcode() == Opcode) &&
8106 MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
8110 MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
8111 Patterns.push_back(Pattern);
8112 return true;
8113 }
8114 return false;
8115 };
8116
8117 switch (Opc) {
8118 default:
8119 break;
8120 case AArch64::FNEGDr:
8121 return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
8122 case AArch64::FNEGSr:
8123 return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
8124 }
8125
8126 return false;
8127}
8128
8129/// Return true when a code sequence can improve throughput. It
8130/// should be called only for instructions in loops.
8131/// \param Pattern - combiner pattern
// NOTE(review): listing line 8132 (the function signature) and lines 8136-8240
// (presumably the `case` labels that reach `return true`) are missing from
// this extract; only the default/fallback path is visible here.
8133 switch (Pattern) {
8134 default:
 // Any pattern not listed in the (missing) case labels returns false.
8135 break;
8241 return true;
8242 } // end switch (Pattern)
8243 return false;
8244}
8245
8246/// Find other MI combine patterns: a SUB/SUBS (W or X form) whose second
/// source operand can be combined with an ADD/ADDS (W or X form).
/// \returns false for any other root opcode or when no such ADD is found.
// NOTE(review): listing lines 8247, 8264 and 8273-8274 are missing from this
// extract. 8264 presumably opens the `if` whose tail is visible at 8265-8266,
// and 8273-8274 presumably push the pattern(s) into Patterns before the
// `return true` -- confirm against the file.
8248 SmallVectorImpl<unsigned> &Patterns) {
8249 // A - (B + C) ==> (A - B) - C or (A - C) - B
8250 unsigned Opc = Root.getOpcode();
8251 MachineBasicBlock &MBB = *Root.getParent();
8252
8253 switch (Opc) {
8254 case AArch64::SUBWrr:
8255 case AArch64::SUBSWrr:
8256 case AArch64::SUBXrr:
8257 case AArch64::SUBSXrr:
8258 // Found candidate root.
8259 break;
8260 default:
8261 return false;
8262 }
8263
 // Bail out when no NZCV def operand is found on Root by this lookup (the
 // first half of the condition is on the missing line 8264).
8265 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
8266 -1)
8267 return false;
8268
 // Operand 2 of Root is the subtrahend; accept any of the four ADD forms.
8269 if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
8270 canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
8271 canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
8272 canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
8275 return true;
8276 }
8277
8278 return false;
8279}
8280
8281/// Check if the given instruction forms a gather load pattern that can be
8282/// optimized for better Memory-Level Parallelism (MLP). This function
8283/// identifies chains of NEON lane load instructions that load data from
8284/// different memory addresses into individual lanes of a 128-bit vector
8285/// register, then attempts to split the pattern into parallel loads to break
8286/// the serial dependency between instructions.
8287///
8288/// Pattern Matched:
8289/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
8290/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
8291///
8292/// Transformed Into:
8293/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
8294/// to combine the results, enabling better memory-level parallelism.
8295///
8296/// Supported Element Types:
8297/// - 32-bit elements (LD1i32, 4 lanes total)
8298/// - 16-bit elements (LD1i16, 8 lanes total)
8299/// - 8-bit elements (LD1i8, 16 lanes total)
///
/// \param Root candidate LD1i* that must write lane NumLanes - 1.
/// \param LoadLaneOpCode opcode every lane load in the chain must have.
/// \param NumLanes total number of lanes (4, 8 or 16; anything else is
///        llvm_unreachable).
/// \returns true when the whole chain was matched and no load-fold barrier
///          was seen while walking back from Root.
// NOTE(review): listing lines 8300, 8310, 8324, 8385, 8388 and 8391 are
// missing from this extract. They presumably hold the function name with the
// `Root` parameter, the declarations of `TRI` and `LoadInstrs` (both used
// below), and the Patterns.push_back calls of the final switch -- confirm.
8301 SmallVectorImpl<unsigned> &Patterns,
8302 unsigned LoadLaneOpCode, unsigned NumLanes) {
8303 const MachineFunction *MF = Root.getMF();
8304
8305 // Early exit if optimizing for size.
8306 if (MF->getFunction().hasMinSize())
8307 return false;
8308
8309 const MachineRegisterInfo &MRI = MF->getRegInfo();
8311
8312 // The root of the pattern must load into the last lane of the vector.
8313 if (Root.getOperand(2).getImm() != NumLanes - 1)
8314 return false;
8315
8316 // Check that we have load into all lanes except lane 0.
8317 // For each load we also want to check that:
8318 // 1. It has a single non-debug use (since we will be replacing the virtual
8319 // register)
8320 // 2. That the addressing mode only uses a single pointer operand
8321 auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
 // llvm::seq excludes its end value, so this is lanes 1 .. NumLanes - 2; lane
 // NumLanes - 1 is Root itself (checked above).
8322 auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
8323 SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
8325 while (!RemainingLanes.empty() && CurrInstr &&
8326 CurrInstr->getOpcode() == LoadLaneOpCode &&
8327 MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
8328 CurrInstr->getNumOperands() == 4) {
8329 RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
8330 LoadInstrs.push_back(CurrInstr);
8331 CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
8332 }
8333
8334 // Check that we have found a match for lanes N-1.. 1.
8335 if (!RemainingLanes.empty())
8336 return false;
8337
8338 // Match the SUBREG_TO_REG sequence.
 // NOTE(review): CurrInstr is dereferenced without a null check here. The
 // loop above can exit because RemainingLanes became empty while the last
 // getUniqueVRegDef result was null -- confirm that cannot happen in practice.
8339 if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
8340 return false;
8341
8342 // Verify that the subreg to reg loads an integer into the first lane.
8343 auto Lane0LoadReg = CurrInstr->getOperand(1).getReg();
8344 unsigned SingleLaneSizeInBits = 128 / NumLanes;
8345 if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
8346 return false;
8347
8348 // Verify that it also has a single non debug use.
8349 if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
8350 return false;
8351
8352 LoadInstrs.push_back(MRI.getUniqueVRegDef(Lane0LoadReg));
8353
8354 // If there is any chance of aliasing, do not apply the pattern.
8355 // Walk backward through the MBB starting from Root.
8356 // Exit early if we've encountered all load instructions or hit the search
8357 // limit.
8358 auto MBBItr = Root.getIterator();
8359 unsigned RemainingSteps = GatherOptSearchLimit;
8360 SmallPtrSet<const MachineInstr *, 16> RemainingLoadInstrs;
8361 RemainingLoadInstrs.insert(LoadInstrs.begin(), LoadInstrs.end());
8362 const MachineBasicBlock *MBB = Root.getParent();
8363
 // The loop stops before visiting MBB->begin(), so the first instruction of
 // the block is never inspected by this walk.
8364 for (; MBBItr != MBB->begin() && RemainingSteps > 0 &&
8365 !RemainingLoadInstrs.empty();
8366 --MBBItr, --RemainingSteps) {
8367 const MachineInstr &CurrInstr = *MBBItr;
8368
8369 // Remove this instruction from remaining loads if it's one we're tracking.
8370 RemainingLoadInstrs.erase(&CurrInstr);
8371
8372 // Check for potential aliasing with any of the load instructions to
8373 // optimize.
8374 if (CurrInstr.isLoadFoldBarrier())
8375 return false;
8376 }
8377
8378 // If we hit the search limit without finding all load instructions,
8379 // don't match the pattern.
8380 if (RemainingSteps == 0 && !RemainingLoadInstrs.empty())
8381 return false;
8382
 // One pattern per supported lane count (the push_back lines are among the
 // missing listing lines noted at the top).
8383 switch (NumLanes) {
8384 case 4:
8386 break;
8387 case 8:
8389 break;
8390 case 16:
8392 break;
8393 default:
8394 llvm_unreachable("Got bad number of lanes for gather pattern.");
8395 }
8396
8397 return true;
8398}
8399
8400/// Search for patterns of LD instructions we can optimize.
/// Dispatches single-lane loads to getGatherLanePattern with the lane count
/// implied by the element size: LD1i32 -> 4, LD1i16 -> 8, LD1i8 -> 16.
/// \returns false for every other root opcode.
// NOTE(review): listing line 8401 (presumably the function name and the
// `Root` parameter) is missing from this extract.
8402 SmallVectorImpl<unsigned> &Patterns) {
8403
8404 // The pattern searches for loads into single lanes.
8405 switch (Root.getOpcode()) {
8406 case AArch64::LD1i32:
8407 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 4);
8408 case AArch64::LD1i16:
8409 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 8);
8410 case AArch64::LD1i8:
8411 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 16);
8412 default:
8413 return false;
8414 }
8415}
8416
8417/// Generate optimized instruction sequence for gather load patterns to improve
8418/// Memory-Level Parallelism (MLP). This function transforms a chain of
8419/// sequential NEON lane loads into parallel vector loads that can execute
8420/// concurrently.
///
/// Outline of what the body builds (all appended to InsInstrs):
///  1. lane loads for the lower half, chained onto the original
///     SUBREG_TO_REG result;
///  2. a scalar LDR*ui (immediate 0) for the lane at index NumLanes / 2,
///     followed by a new SUBREG_TO_REG;
///  3. lane loads for the rest of the upper half, chained onto that;
///  4. a ZIP1v2i64 of the two chains writing Root's result register.
/// Replaced loads are appended to DelInstrs, except the last one visited.
// NOTE(review): listing lines 8422-8424, 8429, 8475, 8488 and 8546 are missing
// from this extract. They presumably hold the function name and leading
// parameters (Root, InsInstrs, DelInstrs), the declaration of `TII`, the
// `MMOs` lambda parameter, and the openings of the two "unsupported lane
// count" diagnostics -- confirm against the file.
8421static void
8425 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8426 unsigned Pattern, unsigned NumLanes) {
8427 MachineFunction &MF = *Root.getParent()->getParent();
8428 MachineRegisterInfo &MRI = MF.getRegInfo();
8430
8431 // Gather the initial load instructions to build the pattern.
 // Starting at Root, follow operand 1 back NumLanes - 1 times; afterwards
 // CurrInstr is the instruction feeding the lowest lane load.
8432 SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
8433 MachineInstr *CurrInstr = &Root;
8434 for (unsigned i = 0; i < NumLanes - 1; ++i) {
8435 LoadToLaneInstrs.push_back(CurrInstr);
8436 CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
8437 }
8438
8439 // Sort the load instructions according to the lane.
 // Descending by lane index; the lane-0 load is appended afterwards so that
 // the reversed view below is ascending starting with lane 0.
8440 llvm::sort(LoadToLaneInstrs,
8441 [](const MachineInstr *A, const MachineInstr *B) {
8442 return A->getOperand(2).getImm() > B->getOperand(2).getImm();
8443 });
8444
8445 MachineInstr *SubregToReg = CurrInstr;
8446 LoadToLaneInstrs.push_back(
8447 MRI.getUniqueVRegDef(SubregToReg->getOperand(1).getReg()));
8448 auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
8449
8450 const TargetRegisterClass *FPR128RegClass =
8451 MRI.getRegClass(Root.getOperand(0).getReg());
8452
8453 // Helper lambda to create a LD1 instruction.
 // Builds a copy of Root's opcode writing a fresh virtual register, reuses
 // the original instruction's memory operands, records the new register in
 // InstrIdxForVirtReg, queues the instruction and returns the new register.
8454 auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr,
8455 Register SrcRegister, unsigned Lane,
8456 Register OffsetRegister,
8457 bool OffsetRegisterKillState) {
8458 auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
8459 MachineInstrBuilder LoadIndexIntoRegister =
8460 BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
8461 NewRegister)
8462 .addReg(SrcRegister)
8463 .addImm(Lane)
8464 .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState))
8465 .setMemRefs(OriginalInstr->memoperands());
8466 InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
8467 InsInstrs.push_back(LoadIndexIntoRegister);
8468 return NewRegister;
8469 };
8470
8471 // Helper to create load instruction based on the NumLanes in the NEON
8472 // register we are rewriting.
 // 4 lanes -> LDRSui, 8 -> LDRHui, 16 -> LDRBui. Unlike the helper above,
 // this one does not queue the instruction; the caller does.
8473 auto CreateLDRInstruction =
8474 [&](unsigned NumLanes, Register DestReg, Register OffsetReg,
8476 unsigned Opcode;
8477 switch (NumLanes) {
8478 case 4:
8479 Opcode = AArch64::LDRSui;
8480 break;
8481 case 8:
8482 Opcode = AArch64::LDRHui;
8483 break;
8484 case 16:
8485 Opcode = AArch64::LDRBui;
8486 break;
8487 default:
8489 "Got unsupported number of lanes in machine-combiner gather pattern");
8490 }
8491 // Immediate offset load
8492 return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
8493 .addReg(OffsetReg)
8494 .addImm(0)
8495 .setMemRefs(MMOs);
8496 };
8497
8498 // Load the remaining lanes into register 0.
 // Positions 1 .. NumLanes / 2 - 1 of the ascending view; Index is
 // zero-based within this sub-range, hence lane Index + 1.
8499 auto LanesToLoadToReg0 =
8500 llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
8501 LoadToLaneInstrsAscending.begin() + NumLanes / 2);
8502 Register PrevReg = SubregToReg->getOperand(0).getReg();
8503 for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
8504 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
8505 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8506 OffsetRegOperand.getReg(),
8507 OffsetRegOperand.isKill());
8508 DelInstrs.push_back(LoadInstr);
8509 }
8510 Register LastLoadReg0 = PrevReg;
8511
8512 // First load into register 1. Perform an integer load to zero out the upper
8513 // lanes in a single instruction.
8514 MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin();
8515 MachineInstr *OriginalSplitLoad =
8516 *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
 // The new scalar destination gets the same register class as the original
 // lane-0 load's result.
8517 Register DestRegForMiddleIndex = MRI.createVirtualRegister(
8518 MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
8519
8520 const MachineOperand &OriginalSplitToLoadOffsetOperand =
8521 OriginalSplitLoad->getOperand(3);
8522 MachineInstrBuilder MiddleIndexLoadInstr =
8523 CreateLDRInstruction(NumLanes, DestRegForMiddleIndex,
8524 OriginalSplitToLoadOffsetOperand.getReg(),
8525 OriginalSplitLoad->memoperands());
8526
8527 InstrIdxForVirtReg.insert(
8528 std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
8529 InsInstrs.push_back(MiddleIndexLoadInstr);
8530 DelInstrs.push_back(OriginalSplitLoad);
8531
8532 // Subreg To Reg instruction for register 1.
8533 Register DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
8534 unsigned SubregType;
8535 switch (NumLanes) {
8536 case 4:
8537 SubregType = AArch64::ssub;
8538 break;
8539 case 8:
8540 SubregType = AArch64::hsub;
8541 break;
8542 case 16:
8543 SubregType = AArch64::bsub;
8544 break;
8545 default:
8547 "Got invalid NumLanes for machine-combiner gather pattern");
8548 }
8549
8550 auto SubRegToRegInstr =
8551 BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
8552 DestRegForSubregToReg)
8553 .addReg(DestRegForMiddleIndex, getKillRegState(true))
8554 .addImm(SubregType);
8555 InstrIdxForVirtReg.insert(
8556 std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
8557 InsInstrs.push_back(SubRegToRegInstr);
8558
8559 // Load remaining lanes into register 1.
 // Lane numbers restart at 1 (Index + 1) in the second register.
8560 auto LanesToLoadToReg1 =
8561 llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
8562 LoadToLaneInstrsAscending.end());
8563 PrevReg = SubRegToRegInstr->getOperand(0).getReg();
8564 for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
8565 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
8566 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8567 OffsetRegOperand.getReg(),
8568 OffsetRegOperand.isKill());
8569
8570 // Do not add the last reg to DelInstrs - it will be removed later.
8571 if (Index == NumLanes / 2 - 2) {
8572 break;
8573 }
8574 DelInstrs.push_back(LoadInstr);
8575 }
8576 Register LastLoadReg1 = PrevReg;
8577
8578 // Create the final zip instruction to combine the results.
 // The ZIP defines Root's original result register and is not entered into
 // InstrIdxForVirtReg.
8579 MachineInstrBuilder ZipInstr =
8580 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
8581 Root.getOperand(0).getReg())
8582 .addReg(LastLoadReg0)
8583 .addReg(LastLoadReg1);
8584 InsInstrs.push_back(ZipInstr);
8585}
8586
8600
8601/// Return true when there is potentially a faster code sequence for an
8602/// instruction chain ending in \p Root. All potential patterns are listed in
8603/// the \p Pattern vector. Pattern should be sorted in priority order since the
8604/// pattern evaluator stops checking as soon as it finds a faster sequence.
8605
8606bool AArch64InstrInfo::getMachineCombinerPatterns(
8607 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
8608 bool DoRegPressureReduce) const {
8609 // Integer patterns
8610 if (getMaddPatterns(Root, Patterns))
8611 return true;
8612 // Floating point patterns
8613 if (getFMULPatterns(Root, Patterns))
8614 return true;
8615 if (getFMAPatterns(Root, Patterns))
8616 return true;
8617 if (getFNEGPatterns(Root, Patterns))
8618 return true;
8619
8620 // Other patterns
8621 if (getMiscPatterns(Root, Patterns))
8622 return true;
8623
8624 // Load patterns
8625 if (getLoadPatterns(Root, Patterns))
8626 return true;
8627
8628 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
8629 DoRegPressureReduce);
8630}
8631
8633/// genFusedMultiply - Generate fused multiply instructions.
8634/// This function supports both integer and floating point instructions.
8635/// A typical example:
8636/// F|MUL I=A,B,0
8637/// F|ADD R,I,C
8638/// ==> F|MADD R,A,B,C
8639/// \param MF Containing MachineFunction
8640/// \param MRI Register information
8641/// \param TII Target information
8642/// \param Root is the F|ADD instruction
8643/// \param [out] InsInstrs is a vector of machine instructions and will
8644/// contain the generated madd instruction
8645/// \param IdxMulOpd is index of operand in Root that is the result of
8646/// the F|MUL. In the example above IdxMulOpd is 1.
8647/// \param MaddOpc the opcode of the f|madd instruction
8648/// \param RC Register class of operands
8649/// \param kind of fma instruction (addressing mode) to be generated
8650/// \param ReplacedAddend is the result register from the instruction
8651/// replacing the non-combined operand, if any.
/// \returns the multiply instruction that defines operand IdxMulOpd of Root.
// NOTE(review): listing lines 8653, 8657 and 8689 are missing from this
// extract (presumably the function name with the MF/MRI parameters, the
// `kind` parameter, and the declaration of `MIB`) -- confirm against the file.
8652static MachineInstr *
8654 const TargetInstrInfo *TII, MachineInstr &Root,
8655 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
8656 unsigned MaddOpc, const TargetRegisterClass *RC,
8658 const Register *ReplacedAddend = nullptr) {
8659 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8660
8661 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
8662 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
8663 Register ResultReg = Root.getOperand(0).getReg();
8664 Register SrcReg0 = MUL->getOperand(1).getReg();
8665 bool Src0IsKill = MUL->getOperand(1).isKill();
8666 Register SrcReg1 = MUL->getOperand(2).getReg();
8667 bool Src1IsKill = MUL->getOperand(2).isKill();
8668
 // SrcReg2 is the addend: either the caller-supplied replacement or Root's
 // other (non-multiply) operand.
8669 Register SrcReg2;
8670 bool Src2IsKill;
8671 if (ReplacedAddend) {
8672 // If we just generated a new addend, we must be its only use.
8673 SrcReg2 = *ReplacedAddend;
8674 Src2IsKill = true;
8675 } else {
8676 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
8677 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
8678 }
8679
 // Only virtual registers are constrained to RC.
8680 if (ResultReg.isVirtual())
8681 MRI.constrainRegClass(ResultReg, RC);
8682 if (SrcReg0.isVirtual())
8683 MRI.constrainRegClass(SrcReg0, RC);
8684 if (SrcReg1.isVirtual())
8685 MRI.constrainRegClass(SrcReg1, RC);
8686 if (SrcReg2.isVirtual())
8687 MRI.constrainRegClass(SrcReg2, RC);
8688
 // Operand order depends on the kind:
 //   Default     -> mul0, mul1, addend
 //   Indexed     -> addend, mul0, mul1, lane (taken from MUL operand 3)
 //   Accumulator -> addend, mul0, mul1
8690 if (kind == FMAInstKind::Default)
8691 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8692 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8693 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8694 .addReg(SrcReg2, getKillRegState(Src2IsKill));
8695 else if (kind == FMAInstKind::Indexed)
8696 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8697 .addReg(SrcReg2, getKillRegState(Src2IsKill))
8698 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8699 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8700 .addImm(MUL->getOperand(3).getImm());
8701 else if (kind == FMAInstKind::Accumulator)
8702 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8703 .addReg(SrcReg2, getKillRegState(Src2IsKill))
8704 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8705 .addReg(SrcReg1, getKillRegState(Src1IsKill));
8706 else
8707 assert(false && "Invalid FMA instruction kind \n");
8708 // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
8709 InsInstrs.push_back(MIB);
8710 return MUL;
8711}
8712
/// Build an FNMADD that replaces Root together with the multiply-add that
/// defines Root's operand 1, and append it to InsInstrs.
///
/// The opcode is chosen from the register class of that instruction's result:
/// FPR32 -> FNMADDSrrr, FPR64 -> FNMADDDrrr.
/// \returns the instruction defining Root's operand 1, or nullptr (with
///          nothing inserted) when its result is in neither class.
// NOTE(review): listing lines 8714, 8716 and 8744 are missing from this
// extract (presumably the function name with the MF/MRI parameters, the
// InsInstrs parameter, and the start of the `MIB` declaration) -- the name
// used for this block is therefore an assumption; confirm against the file.
8713static MachineInstr *
8715 const TargetInstrInfo *TII, MachineInstr &Root,
8717 MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
8718
8719 unsigned Opc = 0;
8720 const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
8721 if (AArch64::FPR32RegClass.hasSubClassEq(RC))
8722 Opc = AArch64::FNMADDSrrr;
8723 else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
8724 Opc = AArch64::FNMADDDrrr;
8725 else
8726 return nullptr;
8727
 // The new instruction reuses MAD's three sources (with their kill flags)
 // and writes Root's result register.
8728 Register ResultReg = Root.getOperand(0).getReg();
8729 Register SrcReg0 = MAD->getOperand(1).getReg();
8730 Register SrcReg1 = MAD->getOperand(2).getReg();
8731 Register SrcReg2 = MAD->getOperand(3).getReg();
8732 bool Src0IsKill = MAD->getOperand(1).isKill();
8733 bool Src1IsKill = MAD->getOperand(2).isKill();
8734 bool Src2IsKill = MAD->getOperand(3).isKill();
8735 if (ResultReg.isVirtual())
8736 MRI.constrainRegClass(ResultReg, RC);
8737 if (SrcReg0.isVirtual())
8738 MRI.constrainRegClass(SrcReg0, RC);
8739 if (SrcReg1.isVirtual())
8740 MRI.constrainRegClass(SrcReg1, RC);
8741 if (SrcReg2.isVirtual())
8742 MRI.constrainRegClass(SrcReg2, RC);
8743
8745 BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
8746 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8747 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8748 .addReg(SrcReg2, getKillRegState(Src2IsKill));
8749 InsInstrs.push_back(MIB);
8750
8751 return MAD;
8752}
8753
8754/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
///
/// \param IdxDupOp index (1 or 2) of the Root operand produced by the DUP.
/// \param MulOpc opcode of the indexed multiply to build.
/// \param RC register class the DUP source register is constrained to.
/// \returns &Root; the new instruction is appended to InsInstrs.
// NOTE(review): listing lines 8756-8757, 8764 and 8783 are missing from this
// extract (presumably the function name with the Root/InsInstrs parameters,
// the declaration of `TII`, and the declaration of `MIB`) -- the name used for
// this block is an assumption; confirm against the file.
8755static MachineInstr *
8758 unsigned IdxDupOp, unsigned MulOpc,
8759 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
8760 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
8761 "Invalid index of FMUL operand");
8762
8763 MachineFunction &MF = *Root.getMF();
8765
8766 MachineInstr *Dup =
8767 MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
8768
 // Look through a single COPY between the DUP and the multiply.
8769 if (Dup->getOpcode() == TargetOpcode::COPY)
8770 Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());
8771
 // The DUP's source gains a new use in the instruction built below, so its
 // kill flags are cleared before it is reused.
8772 Register DupSrcReg = Dup->getOperand(1).getReg();
8773 MRI.clearKillFlags(DupSrcReg);
8774 MRI.constrainRegClass(DupSrcReg, RC);
8775
8776 unsigned DupSrcLane = Dup->getOperand(2).getImm();
8777
8778 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
8779 MachineOperand &MulOp = Root.getOperand(IdxMulOp);
8780
8781 Register ResultReg = Root.getOperand(0).getReg();
8782
 // Operands: the non-DUP multiplicand, the DUP source vector, the lane.
8784 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
8785 .add(MulOp)
8786 .addReg(DupSrcReg)
8787 .addImm(DupSrcLane);
8788
8789 InsInstrs.push_back(MIB);
8790 return &Root;
8791}
8792
8793/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
8794/// instructions.
8795///
8796/// \see genFusedMultiply
// Thin forwarder: passes all of its arguments straight to genFusedMultiply.
// NOTE(review): listing lines 8797-8799 (start of the signature) and 8802 (the
// final argument(s) of the call, presumably the FMA kind) are missing from
// this extract -- confirm against the file.
8800 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8801 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8803}
8804
8805/// genNeg - Helper to generate an intermediate negation of the second operand
8806/// of Root
///
/// Creates a fresh virtual register of class \p RC, builds a \p MnegOpc
/// instruction that defines it from Root's operand 2, and appends that
/// instruction to InsInstrs.
/// \returns the new virtual register.
// NOTE(review): listing lines 8807, 8809 and 8813 are missing from this
// extract (presumably the function name with the MF/MRI parameters, the
// InsInstrs parameter, and the start of the `MIB` declaration).
8808 const TargetInstrInfo *TII, MachineInstr &Root,
8810 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8811 unsigned MnegOpc, const TargetRegisterClass *RC) {
8812 Register NewVR = MRI.createVirtualRegister(RC);
8814 BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
8815 .add(Root.getOperand(2));
8816 InsInstrs.push_back(MIB);
8817
 // The map must be empty on entry and the new register is recorded at index
 // 0 -- presumably because the negation is expected to be the first entry of
 // InsInstrs; verify against callers.
8818 assert(InstrIdxForVirtReg.empty());
8819 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8820
8821 return NewVR;
8822}
8823
8824/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
8825/// instructions with an additional negation of the accumulator
///
/// First emits the negation via genNeg (of Root's operand 2, using
/// \p MnegOpc), then calls genFusedMultiply with FMAInstKind::Accumulator and
/// the negated register as the replaced addend. Only IdxMulOpd == 1 is
/// accepted (asserted).
// NOTE(review): listing lines 8826-8828 (the start of the signature) are
// missing from this extract.
8829 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8830 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8831 assert(IdxMulOpd == 1);
8832
8833 Register NewVR =
8834 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8835 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8836 FMAInstKind::Accumulator, &NewVR);
8837}
8838
8839/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
8840/// instructions.
8841///
8842/// \see genFusedMultiply
// Thin forwarder: passes all of its arguments straight to genFusedMultiply.
// NOTE(review): listing lines 8843-8845 (start of the signature) and 8848 (the
// final argument(s) of the call, presumably the FMA kind) are missing from
// this extract -- confirm against the file.
8846 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8847 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8849}
8850
8851/// Helper to generate indexed fused multiply accumulate instructions
8852/// with an additional negation of the accumulator.
///
/// The original comment named this helper "genFusedMultiplyAccNeg", but the
/// body requests FMAInstKind::Indexed, not Accumulator. It first emits the
/// negation via genNeg (of Root's operand 2, using \p MnegOpc) and then calls
/// genFusedMultiply with the negated register as the replaced addend. Only
/// IdxMulOpd == 1 is accepted (asserted).
// NOTE(review): listing lines 8853-8855 (the start of the signature, including
// the function name -- presumably genFusedMultiplyIdxNeg) are missing from
// this extract; confirm against the file.
8856 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8857 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8858 assert(IdxMulOpd == 1);
8859
8860 Register NewVR =
8861 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8862
8863 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8864 FMAInstKind::Indexed, &NewVR);
8865}
8866
8867/// genMaddR - Generate madd instruction and combine mul and add using
8868/// an extra virtual register
8869/// Example - an ADD intermediate needs to be stored in a register:
8870/// MUL I=A,B,0
8871/// ADD R,I,Imm
8872/// ==> ORR V, ZR, Imm
8873/// ==> MADD R,A,B,V
8874/// \param MF Containing MachineFunction
8875/// \param MRI Register information
8876/// \param TII Target information
8877/// \param Root is the ADD instruction
8878/// \param [out] InsInstrs is a vector of machine instructions and will
8879/// contain the generated madd instruction
8880/// \param IdxMulOpd is index of operand in Root that is the result of
8881/// the MUL. In the example above IdxMulOpd is 1.
8882/// \param MaddOpc the opcode of the madd instruction
8883/// \param VR is a virtual register that holds the value of an ADD operand
8884/// (V in the example above).
8885/// \param RC Register class of operands
/// \returns the MUL instruction that defines operand IdxMulOpd of Root.
// NOTE(review): listing lines 8886, 8888, 8906 and 8909 are missing from this
// extract (presumably the function name with the MF/MRI parameters, the
// InsInstrs parameter, a guard in front of the VR constraint at 8907, and the
// start of the `MIB` declaration) -- confirm against the file.
8887 const TargetInstrInfo *TII, MachineInstr &Root,
8889 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
8890 const TargetRegisterClass *RC) {
8891 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8892
8893 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
8894 Register ResultReg = Root.getOperand(0).getReg();
8895 Register SrcReg0 = MUL->getOperand(1).getReg();
8896 bool Src0IsKill = MUL->getOperand(1).isKill();
8897 Register SrcReg1 = MUL->getOperand(2).getReg();
8898 bool Src1IsKill = MUL->getOperand(2).isKill();
8899
8900 if (ResultReg.isVirtual())
8901 MRI.constrainRegClass(ResultReg, RC);
8902 if (SrcReg0.isVirtual())
8903 MRI.constrainRegClass(SrcReg0, RC);
8904 if (SrcReg1.isVirtual())
8905 MRI.constrainRegClass(SrcReg1, RC);
8907 MRI.constrainRegClass(VR, RC);
8908
 // VR is added without a kill flag, unlike the two multiply sources.
8910 BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8911 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8912 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8913 .addReg(VR);
8914 // Insert the MADD
8915 InsInstrs.push_back(MIB);
8916 return MUL;
8917}
8918
8919/// Do the following transformation
8920/// A - (B + C) ==> (A - B) - C
8921/// A - (B + C) ==> (A - C) - B
8923 const TargetInstrInfo *TII, MachineInstr &Root,
8926 unsigned IdxOpd1,
8927 DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
8928 assert(IdxOpd1 == 1 || IdxOpd1 == 2);
8929 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
8930 MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
8931
8932 Register ResultReg = Root.getOperand(0).getReg();
8933 Register RegA = Root.getOperand(1).getReg();
8934 bool RegAIsKill = Root.getOperand(1).isKill();
8935 Register RegB = AddMI->getOperand(IdxOpd1).getReg();
8936 bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
8937 Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
8938 bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
8939 Register NewVR =
8941
8942 unsigned Opcode = Root.getOpcode();
8943 if (Opcode == AArch64::SUBSWrr)
8944 Opcode = AArch64::SUBWrr;
8945 else if (Opcode == AArch64::SUBSXrr)
8946 Opcode = AArch64::SUBXrr;
8947 else
8948 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
8949 "Unexpected instruction opcode.");
8950
8951 uint32_t Flags = Root.mergeFlagsWith(*AddMI);
8952 Flags &= ~MachineInstr::NoSWrap;
8953 Flags &= ~MachineInstr::NoUWrap;
8954
8955 MachineInstrBuilder MIB1 =
8956 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
8957 .addReg(RegA, getKillRegState(RegAIsKill))
8958 .addReg(RegB, getKillRegState(RegBIsKill))
8959 .setMIFlags(Flags);
8960 MachineInstrBuilder MIB2 =
8961 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
8962 .addReg(NewVR, getKillRegState(true))
8963 .addReg(RegC, getKillRegState(RegCIsKill))
8964 .setMIFlags(Flags);
8965
8966 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8967 InsInstrs.push_back(MIB1);
8968 InsInstrs.push_back(MIB2);
8969 DelInstrs.push_back(AddMI);
8970 DelInstrs.push_back(&Root);
8971}
8972
8973unsigned AArch64InstrInfo::getReduceOpcodeForAccumulator(
8974 unsigned int AccumulatorOpCode) const {
8975 switch (AccumulatorOpCode) {
8976 case AArch64::UABALB_ZZZ_D:
8977 case AArch64::SABALB_ZZZ_D:
8978 case AArch64::UABALT_ZZZ_D:
8979 case AArch64::SABALT_ZZZ_D:
8980 return AArch64::ADD_ZZZ_D;
8981 case AArch64::UABALB_ZZZ_H:
8982 case AArch64::SABALB_ZZZ_H:
8983 case AArch64::UABALT_ZZZ_H:
8984 case AArch64::SABALT_ZZZ_H:
8985 return AArch64::ADD_ZZZ_H;
8986 case AArch64::UABALB_ZZZ_S:
8987 case AArch64::SABALB_ZZZ_S:
8988 case AArch64::UABALT_ZZZ_S:
8989 case AArch64::SABALT_ZZZ_S:
8990 return AArch64::ADD_ZZZ_S;
8991 case AArch64::UABALv16i8_v8i16:
8992 case AArch64::SABALv8i8_v8i16:
8993 case AArch64::SABAv8i16:
8994 case AArch64::UABAv8i16:
8995 return AArch64::ADDv8i16;
8996 case AArch64::SABALv2i32_v2i64:
8997 case AArch64::UABALv2i32_v2i64:
8998 case AArch64::SABALv4i32_v2i64:
8999 return AArch64::ADDv2i64;
9000 case AArch64::UABALv4i16_v4i32:
9001 case AArch64::SABALv4i16_v4i32:
9002 case AArch64::SABALv8i16_v4i32:
9003 case AArch64::SABAv4i32:
9004 case AArch64::UABAv4i32:
9005 return AArch64::ADDv4i32;
9006 case AArch64::UABALv4i32_v2i64:
9007 return AArch64::ADDv2i64;
9008 case AArch64::UABALv8i16_v4i32:
9009 return AArch64::ADDv4i32;
9010 case AArch64::UABALv8i8_v8i16:
9011 case AArch64::SABALv16i8_v8i16:
9012 return AArch64::ADDv8i16;
9013 case AArch64::UABAv16i8:
9014 case AArch64::SABAv16i8:
9015 return AArch64::ADDv16i8;
9016 case AArch64::UABAv4i16:
9017 case AArch64::SABAv4i16:
9018 return AArch64::ADDv4i16;
9019 case AArch64::UABAv2i32:
9020 case AArch64::SABAv2i32:
9021 return AArch64::ADDv2i32;
9022 case AArch64::UABAv8i8:
9023 case AArch64::SABAv8i8:
9024 return AArch64::ADDv8i8;
9025 default:
9026 llvm_unreachable("Unknown accumulator opcode");
9027 }
9028}
9029
9030/// When getMachineCombinerPatterns() finds potential patterns,
9031/// this function generates the instructions that could replace the
9032/// original code sequence
9033void AArch64InstrInfo::genAlternativeCodeSequence(
9034 MachineInstr &Root, unsigned Pattern,
9037 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
9038 MachineBasicBlock &MBB = *Root.getParent();
9039 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9040 MachineFunction &MF = *MBB.getParent();
9041 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
9042
9043 MachineInstr *MUL = nullptr;
9044 const TargetRegisterClass *RC;
9045 unsigned Opc;
9046 switch (Pattern) {
9047 default:
9048 // Reassociate instructions.
9049 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
9050 DelInstrs, InstrIdxForVirtReg);
9051 return;
9053 // A - (B + C)
9054 // ==> (A - B) - C
9055 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
9056 InstrIdxForVirtReg);
9057 return;
9059 // A - (B + C)
9060 // ==> (A - C) - B
9061 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
9062 InstrIdxForVirtReg);
9063 return;
9066 // MUL I=A,B,0
9067 // ADD R,I,C
9068 // ==> MADD R,A,B,C
9069 // --- Create(MADD);
9071 Opc = AArch64::MADDWrrr;
9072 RC = &AArch64::GPR32RegClass;
9073 } else {
9074 Opc = AArch64::MADDXrrr;
9075 RC = &AArch64::GPR64RegClass;
9076 }
9077 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9078 break;
9081 // MUL I=A,B,0
9082 // ADD R,C,I
9083 // ==> MADD R,A,B,C
9084 // --- Create(MADD);
9086 Opc = AArch64::MADDWrrr;
9087 RC = &AArch64::GPR32RegClass;
9088 } else {
9089 Opc = AArch64::MADDXrrr;
9090 RC = &AArch64::GPR64RegClass;
9091 }
9092 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9093 break;
9098 // MUL I=A,B,0
9099 // ADD/SUB R,I,Imm
9100 // ==> MOV V, Imm/-Imm
9101 // ==> MADD R,A,B,V
9102 // --- Create(MADD);
9103 const TargetRegisterClass *RC;
9104 unsigned BitSize, MovImm;
9107 MovImm = AArch64::MOVi32imm;
9108 RC = &AArch64::GPR32spRegClass;
9109 BitSize = 32;
9110 Opc = AArch64::MADDWrrr;
9111 RC = &AArch64::GPR32RegClass;
9112 } else {
9113 MovImm = AArch64::MOVi64imm;
9114 RC = &AArch64::GPR64spRegClass;
9115 BitSize = 64;
9116 Opc = AArch64::MADDXrrr;
9117 RC = &AArch64::GPR64RegClass;
9118 }
9119 Register NewVR = MRI.createVirtualRegister(RC);
9120 uint64_t Imm = Root.getOperand(2).getImm();
9121
9122 if (Root.getOperand(3).isImm()) {
9123 unsigned Val = Root.getOperand(3).getImm();
9124 Imm = Imm << Val;
9125 }
9126 bool IsSub = Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1 ||
9128 uint64_t UImm = SignExtend64(IsSub ? -Imm : Imm, BitSize);
9129 // Check that the immediate can be composed via a single instruction.
9131 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
9132 if (Insn.size() != 1)
9133 return;
9134 MachineInstrBuilder MIB1 =
9135 BuildMI(MF, MIMetadata(Root), TII->get(MovImm), NewVR)
9136 .addImm(IsSub ? -Imm : Imm);
9137 InsInstrs.push_back(MIB1);
9138 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9139 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
9140 break;
9141 }
9144 // MUL I=A,B,0
9145 // SUB R,I, C
9146 // ==> SUB V, 0, C
9147 // ==> MADD R,A,B,V // = -C + A*B
9148 // --- Create(MADD);
9149 const TargetRegisterClass *SubRC;
9150 unsigned SubOpc, ZeroReg;
9152 SubOpc = AArch64::SUBWrr;
9153 SubRC = &AArch64::GPR32spRegClass;
9154 ZeroReg = AArch64::WZR;
9155 Opc = AArch64::MADDWrrr;
9156 RC = &AArch64::GPR32RegClass;
9157 } else {
9158 SubOpc = AArch64::SUBXrr;
9159 SubRC = &AArch64::GPR64spRegClass;
9160 ZeroReg = AArch64::XZR;
9161 Opc = AArch64::MADDXrrr;
9162 RC = &AArch64::GPR64RegClass;
9163 }
9164 Register NewVR = MRI.createVirtualRegister(SubRC);
9165 // SUB NewVR, 0, C
9166 MachineInstrBuilder MIB1 =
9167 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
9168 .addReg(ZeroReg)
9169 .add(Root.getOperand(2));
9170 InsInstrs.push_back(MIB1);
9171 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9172 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
9173 break;
9174 }
9177 // MUL I=A,B,0
9178 // SUB R,C,I
9179 // ==> MSUB R,A,B,C (computes C - A*B)
9180 // --- Create(MSUB);
9182 Opc = AArch64::MSUBWrrr;
9183 RC = &AArch64::GPR32RegClass;
9184 } else {
9185 Opc = AArch64::MSUBXrrr;
9186 RC = &AArch64::GPR64RegClass;
9187 }
9188 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9189 break;
9191 Opc = AArch64::MLAv8i8;
9192 RC = &AArch64::FPR64RegClass;
9193 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9194 break;
9196 Opc = AArch64::MLAv8i8;
9197 RC = &AArch64::FPR64RegClass;
9198 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9199 break;
9201 Opc = AArch64::MLAv16i8;
9202 RC = &AArch64::FPR128RegClass;
9203 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9204 break;
9206 Opc = AArch64::MLAv16i8;
9207 RC = &AArch64::FPR128RegClass;
9208 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9209 break;
9211 Opc = AArch64::MLAv4i16;
9212 RC = &AArch64::FPR64RegClass;
9213 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9214 break;
9216 Opc = AArch64::MLAv4i16;
9217 RC = &AArch64::FPR64RegClass;
9218 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9219 break;
9221 Opc = AArch64::MLAv8i16;
9222 RC = &AArch64::FPR128RegClass;
9223 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9224 break;
9226 Opc = AArch64::MLAv8i16;
9227 RC = &AArch64::FPR128RegClass;
9228 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9229 break;
9231 Opc = AArch64::MLAv2i32;
9232 RC = &AArch64::FPR64RegClass;
9233 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9234 break;
9236 Opc = AArch64::MLAv2i32;
9237 RC = &AArch64::FPR64RegClass;
9238 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9239 break;
9241 Opc = AArch64::MLAv4i32;
9242 RC = &AArch64::FPR128RegClass;
9243 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9244 break;
9246 Opc = AArch64::MLAv4i32;
9247 RC = &AArch64::FPR128RegClass;
9248 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9249 break;
9250
9252 Opc = AArch64::MLAv8i8;
9253 RC = &AArch64::FPR64RegClass;
9254 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9255 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
9256 RC);
9257 break;
9259 Opc = AArch64::MLSv8i8;
9260 RC = &AArch64::FPR64RegClass;
9261 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9262 break;
9264 Opc = AArch64::MLAv16i8;
9265 RC = &AArch64::FPR128RegClass;
9266 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9267 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
9268 RC);
9269 break;
9271 Opc = AArch64::MLSv16i8;
9272 RC = &AArch64::FPR128RegClass;
9273 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9274 break;
9276 Opc = AArch64::MLAv4i16;
9277 RC = &AArch64::FPR64RegClass;
9278 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9279 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
9280 RC);
9281 break;
9283 Opc = AArch64::MLSv4i16;
9284 RC = &AArch64::FPR64RegClass;
9285 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9286 break;
9288 Opc = AArch64::MLAv8i16;
9289 RC = &AArch64::FPR128RegClass;
9290 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9291 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
9292 RC);
9293 break;
9295 Opc = AArch64::MLSv8i16;
9296 RC = &AArch64::FPR128RegClass;
9297 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9298 break;
9300 Opc = AArch64::MLAv2i32;
9301 RC = &AArch64::FPR64RegClass;
9302 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9303 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
9304 RC);
9305 break;
9307 Opc = AArch64::MLSv2i32;
9308 RC = &AArch64::FPR64RegClass;
9309 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9310 break;
9312 Opc = AArch64::MLAv4i32;
9313 RC = &AArch64::FPR128RegClass;
9314 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9315 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
9316 RC);
9317 break;
9319 Opc = AArch64::MLSv4i32;
9320 RC = &AArch64::FPR128RegClass;
9321 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9322 break;
9323
9325 Opc = AArch64::MLAv4i16_indexed;
9326 RC = &AArch64::FPR64RegClass;
9327 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9328 break;
9330 Opc = AArch64::MLAv4i16_indexed;
9331 RC = &AArch64::FPR64RegClass;
9332 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9333 break;
9335 Opc = AArch64::MLAv8i16_indexed;
9336 RC = &AArch64::FPR128RegClass;
9337 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9338 break;
9340 Opc = AArch64::MLAv8i16_indexed;
9341 RC = &AArch64::FPR128RegClass;
9342 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9343 break;
9345 Opc = AArch64::MLAv2i32_indexed;
9346 RC = &AArch64::FPR64RegClass;
9347 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9348 break;
9350 Opc = AArch64::MLAv2i32_indexed;
9351 RC = &AArch64::FPR64RegClass;
9352 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9353 break;
9355 Opc = AArch64::MLAv4i32_indexed;
9356 RC = &AArch64::FPR128RegClass;
9357 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9358 break;
9360 Opc = AArch64::MLAv4i32_indexed;
9361 RC = &AArch64::FPR128RegClass;
9362 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9363 break;
9364
9366 Opc = AArch64::MLAv4i16_indexed;
9367 RC = &AArch64::FPR64RegClass;
9368 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9369 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
9370 RC);
9371 break;
9373 Opc = AArch64::MLSv4i16_indexed;
9374 RC = &AArch64::FPR64RegClass;
9375 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9376 break;
9378 Opc = AArch64::MLAv8i16_indexed;
9379 RC = &AArch64::FPR128RegClass;
9380 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9381 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
9382 RC);
9383 break;
9385 Opc = AArch64::MLSv8i16_indexed;
9386 RC = &AArch64::FPR128RegClass;
9387 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9388 break;
9390 Opc = AArch64::MLAv2i32_indexed;
9391 RC = &AArch64::FPR64RegClass;
9392 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9393 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
9394 RC);
9395 break;
9397 Opc = AArch64::MLSv2i32_indexed;
9398 RC = &AArch64::FPR64RegClass;
9399 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9400 break;
9402 Opc = AArch64::MLAv4i32_indexed;
9403 RC = &AArch64::FPR128RegClass;
9404 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9405 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
9406 RC);
9407 break;
9409 Opc = AArch64::MLSv4i32_indexed;
9410 RC = &AArch64::FPR128RegClass;
9411 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9412 break;
9413
9414 // Floating Point Support
9416 Opc = AArch64::FMADDHrrr;
9417 RC = &AArch64::FPR16RegClass;
9418 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9419 break;
9421 Opc = AArch64::FMADDSrrr;
9422 RC = &AArch64::FPR32RegClass;
9423 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9424 break;
9426 Opc = AArch64::FMADDDrrr;
9427 RC = &AArch64::FPR64RegClass;
9428 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9429 break;
9430
9432 Opc = AArch64::FMADDHrrr;
9433 RC = &AArch64::FPR16RegClass;
9434 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9435 break;
9437 Opc = AArch64::FMADDSrrr;
9438 RC = &AArch64::FPR32RegClass;
9439 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9440 break;
9442 Opc = AArch64::FMADDDrrr;
9443 RC = &AArch64::FPR64RegClass;
9444 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9445 break;
9446
9448 Opc = AArch64::FMLAv1i32_indexed;
9449 RC = &AArch64::FPR32RegClass;
9450 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9452 break;
9454 Opc = AArch64::FMLAv1i32_indexed;
9455 RC = &AArch64::FPR32RegClass;
9456 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9458 break;
9459
9461 Opc = AArch64::FMLAv1i64_indexed;
9462 RC = &AArch64::FPR64RegClass;
9463 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9465 break;
9467 Opc = AArch64::FMLAv1i64_indexed;
9468 RC = &AArch64::FPR64RegClass;
9469 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9471 break;
9472
9474 RC = &AArch64::FPR64RegClass;
9475 Opc = AArch64::FMLAv4i16_indexed;
9476 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9478 break;
9480 RC = &AArch64::FPR64RegClass;
9481 Opc = AArch64::FMLAv4f16;
9482 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9484 break;
9486 RC = &AArch64::FPR64RegClass;
9487 Opc = AArch64::FMLAv4i16_indexed;
9488 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9490 break;
9492 RC = &AArch64::FPR64RegClass;
9493 Opc = AArch64::FMLAv4f16;
9494 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9496 break;
9497
9500 RC = &AArch64::FPR64RegClass;
9502 Opc = AArch64::FMLAv2i32_indexed;
9503 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9505 } else {
9506 Opc = AArch64::FMLAv2f32;
9507 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9509 }
9510 break;
9513 RC = &AArch64::FPR64RegClass;
9515 Opc = AArch64::FMLAv2i32_indexed;
9516 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9518 } else {
9519 Opc = AArch64::FMLAv2f32;
9520 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9522 }
9523 break;
9524
9526 RC = &AArch64::FPR128RegClass;
9527 Opc = AArch64::FMLAv8i16_indexed;
9528 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9530 break;
9532 RC = &AArch64::FPR128RegClass;
9533 Opc = AArch64::FMLAv8f16;
9534 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9536 break;
9538 RC = &AArch64::FPR128RegClass;
9539 Opc = AArch64::FMLAv8i16_indexed;
9540 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9542 break;
9544 RC = &AArch64::FPR128RegClass;
9545 Opc = AArch64::FMLAv8f16;
9546 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9548 break;
9549
9552 RC = &AArch64::FPR128RegClass;
9554 Opc = AArch64::FMLAv2i64_indexed;
9555 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9557 } else {
9558 Opc = AArch64::FMLAv2f64;
9559 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9561 }
9562 break;
9565 RC = &AArch64::FPR128RegClass;
9567 Opc = AArch64::FMLAv2i64_indexed;
9568 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9570 } else {
9571 Opc = AArch64::FMLAv2f64;
9572 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9574 }
9575 break;
9576
9579 RC = &AArch64::FPR128RegClass;
9581 Opc = AArch64::FMLAv4i32_indexed;
9582 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9584 } else {
9585 Opc = AArch64::FMLAv4f32;
9586 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9588 }
9589 break;
9590
9593 RC = &AArch64::FPR128RegClass;
9595 Opc = AArch64::FMLAv4i32_indexed;
9596 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9598 } else {
9599 Opc = AArch64::FMLAv4f32;
9600 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9602 }
9603 break;
9604
9606 Opc = AArch64::FNMSUBHrrr;
9607 RC = &AArch64::FPR16RegClass;
9608 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9609 break;
9611 Opc = AArch64::FNMSUBSrrr;
9612 RC = &AArch64::FPR32RegClass;
9613 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9614 break;
9616 Opc = AArch64::FNMSUBDrrr;
9617 RC = &AArch64::FPR64RegClass;
9618 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9619 break;
9620
9622 Opc = AArch64::FNMADDHrrr;
9623 RC = &AArch64::FPR16RegClass;
9624 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9625 break;
9627 Opc = AArch64::FNMADDSrrr;
9628 RC = &AArch64::FPR32RegClass;
9629 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9630 break;
9632 Opc = AArch64::FNMADDDrrr;
9633 RC = &AArch64::FPR64RegClass;
9634 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9635 break;
9636
9638 Opc = AArch64::FMSUBHrrr;
9639 RC = &AArch64::FPR16RegClass;
9640 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9641 break;
9643 Opc = AArch64::FMSUBSrrr;
9644 RC = &AArch64::FPR32RegClass;
9645 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9646 break;
9648 Opc = AArch64::FMSUBDrrr;
9649 RC = &AArch64::FPR64RegClass;
9650 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9651 break;
9652
9654 Opc = AArch64::FMLSv1i32_indexed;
9655 RC = &AArch64::FPR32RegClass;
9656 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9658 break;
9659
9661 Opc = AArch64::FMLSv1i64_indexed;
9662 RC = &AArch64::FPR64RegClass;
9663 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9665 break;
9666
9669 RC = &AArch64::FPR64RegClass;
9670 Register NewVR = MRI.createVirtualRegister(RC);
9671 MachineInstrBuilder MIB1 =
9672 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
9673 .add(Root.getOperand(2));
9674 InsInstrs.push_back(MIB1);
9675 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9677 Opc = AArch64::FMLAv4f16;
9678 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9679 FMAInstKind::Accumulator, &NewVR);
9680 } else {
9681 Opc = AArch64::FMLAv4i16_indexed;
9682 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9683 FMAInstKind::Indexed, &NewVR);
9684 }
9685 break;
9686 }
9688 RC = &AArch64::FPR64RegClass;
9689 Opc = AArch64::FMLSv4f16;
9690 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9692 break;
9694 RC = &AArch64::FPR64RegClass;
9695 Opc = AArch64::FMLSv4i16_indexed;
9696 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9698 break;
9699
9702 RC = &AArch64::FPR64RegClass;
9704 Opc = AArch64::FMLSv2i32_indexed;
9705 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9707 } else {
9708 Opc = AArch64::FMLSv2f32;
9709 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9711 }
9712 break;
9713
9716 RC = &AArch64::FPR128RegClass;
9717 Register NewVR = MRI.createVirtualRegister(RC);
9718 MachineInstrBuilder MIB1 =
9719 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
9720 .add(Root.getOperand(2));
9721 InsInstrs.push_back(MIB1);
9722 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9724 Opc = AArch64::FMLAv8f16;
9725 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9726 FMAInstKind::Accumulator, &NewVR);
9727 } else {
9728 Opc = AArch64::FMLAv8i16_indexed;
9729 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9730 FMAInstKind::Indexed, &NewVR);
9731 }
9732 break;
9733 }
9735 RC = &AArch64::FPR128RegClass;
9736 Opc = AArch64::FMLSv8f16;
9737 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9739 break;
9741 RC = &AArch64::FPR128RegClass;
9742 Opc = AArch64::FMLSv8i16_indexed;
9743 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9745 break;
9746
9749 RC = &AArch64::FPR128RegClass;
9751 Opc = AArch64::FMLSv2i64_indexed;
9752 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9754 } else {
9755 Opc = AArch64::FMLSv2f64;
9756 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9758 }
9759 break;
9760
9763 RC = &AArch64::FPR128RegClass;
9765 Opc = AArch64::FMLSv4i32_indexed;
9766 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9768 } else {
9769 Opc = AArch64::FMLSv4f32;
9770 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9772 }
9773 break;
9776 RC = &AArch64::FPR64RegClass;
9777 Register NewVR = MRI.createVirtualRegister(RC);
9778 MachineInstrBuilder MIB1 =
9779 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
9780 .add(Root.getOperand(2));
9781 InsInstrs.push_back(MIB1);
9782 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9784 Opc = AArch64::FMLAv2i32_indexed;
9785 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9786 FMAInstKind::Indexed, &NewVR);
9787 } else {
9788 Opc = AArch64::FMLAv2f32;
9789 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9790 FMAInstKind::Accumulator, &NewVR);
9791 }
9792 break;
9793 }
9796 RC = &AArch64::FPR128RegClass;
9797 Register NewVR = MRI.createVirtualRegister(RC);
9798 MachineInstrBuilder MIB1 =
9799 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
9800 .add(Root.getOperand(2));
9801 InsInstrs.push_back(MIB1);
9802 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9804 Opc = AArch64::FMLAv4i32_indexed;
9805 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9806 FMAInstKind::Indexed, &NewVR);
9807 } else {
9808 Opc = AArch64::FMLAv4f32;
9809 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9810 FMAInstKind::Accumulator, &NewVR);
9811 }
9812 break;
9813 }
9816 RC = &AArch64::FPR128RegClass;
9817 Register NewVR = MRI.createVirtualRegister(RC);
9818 MachineInstrBuilder MIB1 =
9819 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
9820 .add(Root.getOperand(2));
9821 InsInstrs.push_back(MIB1);
9822 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9824 Opc = AArch64::FMLAv2i64_indexed;
9825 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9826 FMAInstKind::Indexed, &NewVR);
9827 } else {
9828 Opc = AArch64::FMLAv2f64;
9829 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9830 FMAInstKind::Accumulator, &NewVR);
9831 }
9832 break;
9833 }
9836 unsigned IdxDupOp =
9838 : 2;
9839 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
9840 &AArch64::FPR128RegClass, MRI);
9841 break;
9842 }
9845 unsigned IdxDupOp =
9847 : 2;
9848 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
9849 &AArch64::FPR128RegClass, MRI);
9850 break;
9851 }
9854 unsigned IdxDupOp =
9856 : 2;
9857 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
9858 &AArch64::FPR128_loRegClass, MRI);
9859 break;
9860 }
9863 unsigned IdxDupOp =
9865 : 2;
9866 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
9867 &AArch64::FPR128RegClass, MRI);
9868 break;
9869 }
9872 unsigned IdxDupOp =
9874 : 2;
9875 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
9876 &AArch64::FPR128_loRegClass, MRI);
9877 break;
9878 }
9880 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
9881 break;
9882 }
9884 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9885 Pattern, 4);
9886 break;
9887 }
9889 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9890 Pattern, 8);
9891 break;
9892 }
9894 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9895 Pattern, 16);
9896 break;
9897 }
9898
9899 } // end switch (Pattern)
9900 // Record MUL and ADD/SUB for deletion
9901 if (MUL)
9902 DelInstrs.push_back(MUL);
9903 DelInstrs.push_back(&Root);
9904
9905 // Set the flags on the inserted instructions to be the merged flags of the
9906 // instructions that we have combined.
9907 uint32_t Flags = Root.getFlags();
9908 if (MUL)
9909 Flags = Root.mergeFlagsWith(*MUL);
9910 for (auto *MI : InsInstrs)
9911 MI->setFlags(Flags);
9912}
9913
9914/// Replace csinc-branch sequence with a simple conditional branch
9915///
9916/// Examples:
9917/// 1. \code
9918/// csinc w9, wzr, wzr, <condition code>
9919/// tbnz w9, #0, 0x44
9920/// \endcode
9921/// to
9922/// \code
9923/// b.<inverted condition code>
9924/// \endcode
9925///
9926/// 2. \code
9927/// csinc w9, wzr, wzr, <condition code>
9928/// tbz w9, #0, 0x44
9929/// \endcode
9930/// to
9931/// \code
9932/// b.<condition code>
9933/// \endcode
9934///
9935/// Replace an AND followed by a compare-and-branch with a TBZ/TBNZ
9936/// instruction when the AND's constant operand is a power of 2.
9937///
9938/// Examples:
9939/// \code
9940/// and w8, w8, #0x400
9941/// cbnz w8, L1
9942/// \endcode
9943/// to
9944/// \code
9945/// tbnz w8, #10, L1
9946/// \endcode
9947///
9948/// \param MI Conditional Branch
9949/// \return True when the simple conditional branch is generated
9950///
9952 bool IsNegativeBranch = false;
9953 bool IsTestAndBranch = false;
9954 unsigned TargetBBInMI = 0;
9955 switch (MI.getOpcode()) {
9956 default:
9957 llvm_unreachable("Unknown branch instruction?");
9958 case AArch64::Bcc:
9959 case AArch64::CBWPri:
9960 case AArch64::CBXPri:
9961 case AArch64::CBBAssertExt:
9962 case AArch64::CBHAssertExt:
9963 case AArch64::CBWPrr:
9964 case AArch64::CBXPrr:
9965 return false;
9966 case AArch64::CBZW:
9967 case AArch64::CBZX:
9968 TargetBBInMI = 1;
9969 break;
9970 case AArch64::CBNZW:
9971 case AArch64::CBNZX:
9972 TargetBBInMI = 1;
9973 IsNegativeBranch = true;
9974 break;
9975 case AArch64::TBZW:
9976 case AArch64::TBZX:
9977 TargetBBInMI = 2;
9978 IsTestAndBranch = true;
9979 break;
9980 case AArch64::TBNZW:
9981 case AArch64::TBNZX:
9982 TargetBBInMI = 2;
9983 IsNegativeBranch = true;
9984 IsTestAndBranch = true;
9985 break;
9986 }
9987 // So we increment a zero register and test for bits other
9988 // than bit 0? Conservatively bail out in case the verifier
9989 // missed this case.
9990 if (IsTestAndBranch && MI.getOperand(1).getImm())
9991 return false;
9992
9993 // Find Definition.
9994 assert(MI.getParent() && "Incomplete machine instruction\n");
9995 MachineBasicBlock *MBB = MI.getParent();
9996 MachineFunction *MF = MBB->getParent();
9997 MachineRegisterInfo *MRI = &MF->getRegInfo();
9998 Register VReg = MI.getOperand(0).getReg();
9999 if (!VReg.isVirtual())
10000 return false;
10001
10002 MachineInstr *DefMI = MRI->getVRegDef(VReg);
10003
10004 // Look through COPY instructions to find definition.
10005 while (DefMI->isCopy()) {
10006 Register CopyVReg = DefMI->getOperand(1).getReg();
10007 if (!MRI->hasOneNonDBGUse(CopyVReg))
10008 return false;
10009 if (!MRI->hasOneDef(CopyVReg))
10010 return false;
10011 DefMI = MRI->getVRegDef(CopyVReg);
10012 }
10013
10014 switch (DefMI->getOpcode()) {
10015 default:
10016 return false;
10017 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
10018 case AArch64::ANDWri:
10019 case AArch64::ANDXri: {
10020 if (IsTestAndBranch)
10021 return false;
10022 if (DefMI->getParent() != MBB)
10023 return false;
10024 if (!MRI->hasOneNonDBGUse(VReg))
10025 return false;
10026
10027 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
10029 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
10030 if (!isPowerOf2_64(Mask))
10031 return false;
10032
10033 MachineOperand &MO = DefMI->getOperand(1);
10034 Register NewReg = MO.getReg();
10035 if (!NewReg.isVirtual())
10036 return false;
10037
10038 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
10039
10040 MachineBasicBlock &RefToMBB = *MBB;
10041 MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
10042 DebugLoc DL = MI.getDebugLoc();
10043 unsigned Imm = Log2_64(Mask);
10044 unsigned Opc = (Imm < 32)
10045 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
10046 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
10047 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
10048 .addReg(NewReg)
10049 .addImm(Imm)
10050 .addMBB(TBB);
10051 // Register lives on to the CBZ now.
10052 MO.setIsKill(false);
10053
10054 // For immediates smaller than 32, we need to use the 32-bit
10055 // variant (W) in all cases, because the 64-bit variant cannot
10056 // encode them.
10057 // Therefore, if the input register is 64-bit, we need to take the
10058 // 32-bit sub-part.
10059 if (!Is32Bit && Imm < 32)
10060 NewMI->getOperand(0).setSubReg(AArch64::sub_32);
10061 MI.eraseFromParent();
10062 return true;
10063 }
10064 // Look for CSINC
10065 case AArch64::CSINCWr:
10066 case AArch64::CSINCXr: {
10067 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
10068 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
10069 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
10070 DefMI->getOperand(2).getReg() == AArch64::XZR))
10071 return false;
10072
10073 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
10074 true) != -1)
10075 return false;
10076
10077 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
10078 // Convert only when the condition code is not modified between
10079 // the CSINC and the branch. The CC may be used by other
10080 // instructions in between.
10082 return false;
10083 MachineBasicBlock &RefToMBB = *MBB;
10084 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
10085 DebugLoc DL = MI.getDebugLoc();
10086 if (IsNegativeBranch)
10088 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
10089 MI.eraseFromParent();
10090 return true;
10091 }
10092 }
10093}
10094
10095std::pair<unsigned, unsigned>
10096AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
10097 const unsigned Mask = AArch64II::MO_FRAGMENT;
10098 return std::make_pair(TF & Mask, TF & ~Mask);
10099}
10100
10102AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
10103 using namespace AArch64II;
10104
10105 static const std::pair<unsigned, const char *> TargetFlags[] = {
10106 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
10107 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
10108 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
10109 {MO_HI12, "aarch64-hi12"}};
10110 return ArrayRef(TargetFlags);
10111}
10112
10114AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
10115 using namespace AArch64II;
10116
10117 static const std::pair<unsigned, const char *> TargetFlags[] = {
10118 {MO_COFFSTUB, "aarch64-coffstub"},
10119 {MO_GOT, "aarch64-got"},
10120 {MO_NC, "aarch64-nc"},
10121 {MO_S, "aarch64-s"},
10122 {MO_TLS, "aarch64-tls"},
10123 {MO_DLLIMPORT, "aarch64-dllimport"},
10124 {MO_PREL, "aarch64-prel"},
10125 {MO_TAGGED, "aarch64-tagged"},
10126 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
10127 };
10128 return ArrayRef(TargetFlags);
10129}
10130
10132AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
10133 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
10134 {{MOSuppressPair, "aarch64-suppress-pair"},
10135 {MOStridedAccess, "aarch64-strided-access"}};
10136 return ArrayRef(TargetFlags);
10137}
10138
/// Constants defining how certain sequences should be outlined.
/// This encompasses how an outlined function should be called, and what kind of
/// frame should be emitted for that outlined function.
///
/// \p MachineOutlinerDefault implies that the function should be called with
/// a save and restore of LR to the stack.
///
/// That is,
///
/// I1     Save LR                    OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3     Restore LR                 I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? Yes
///
/// \p MachineOutlinerTailCall implies that the function is being created from
/// a sequence of instructions ending in a return.
///
/// That is,
///
/// I1                                OUTLINED_FUNCTION:
/// I2 --> B OUTLINED_FUNCTION        I1
/// RET                               I2
///                                   RET
///
/// * Call construction overhead: 1 (B)
/// * Frame construction overhead: 0 (Return included in sequence)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerNoLRSave implies that the function should be called using
/// a BL instruction, but doesn't require LR to be saved and restored. This
/// happens when LR is known to be dead.
///
/// That is,
///
/// I1                                OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3                                I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 1 (RET)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerThunk implies that the function is being created from
/// a sequence of instructions ending in a call. The outlined function is
/// called with a BL instruction, and the outlined function tail-calls the
/// original call destination.
///
/// That is,
///
/// I1                                OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// BL f                              I2
///                                   B f
///
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 0
/// * Requires stack fixups? No
///
/// \p MachineOutlinerRegSave implies that the function should be called with a
/// save and restore of LR to an available register. This allows us to avoid
/// stack fixups. Note that this outlining variant is compatible with the
/// NoLRSave case.
///
/// That is,
///
/// I1     Save LR                    OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3     Restore LR                 I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? No
enum MachineOutlinerClass {
  MachineOutlinerDefault,  ///< Emit a save, restore, call, and return.
  MachineOutlinerTailCall, ///< Only emit a branch.
  MachineOutlinerNoLRSave, ///< Emit a call and return.
  MachineOutlinerThunk,    ///< Emit a call and tail-call.
  MachineOutlinerRegSave   ///< Same as default, but save to a register.
};
10226
10232
10234AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
10235 MachineFunction *MF = C.getMF();
10236 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
10237 const AArch64RegisterInfo *ARI =
10238 static_cast<const AArch64RegisterInfo *>(&TRI);
10239 // Check if there is an available register across the sequence that we can
10240 // use.
10241 for (unsigned Reg : AArch64::GPR64RegClass) {
10242 if (!ARI->isReservedReg(*MF, Reg) &&
10243 Reg != AArch64::LR && // LR is not reserved, but don't use it.
10244 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
10245 Reg != AArch64::X17 && // Ditto for X17.
10246 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
10247 C.isAvailableInsideSeq(Reg, TRI))
10248 return Reg;
10249 }
10250 return Register();
10251}
10252
10253static bool
10255 const outliner::Candidate &b) {
10256 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
10257 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
10258
10259 return MFIa->getSignReturnAddressCondition() ==
10261}
10262
10263static bool
10265 const outliner::Candidate &b) {
10266 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
10267 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
10268
10269 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
10270}
10271
10273 const outliner::Candidate &b) {
10274 const AArch64Subtarget &SubtargetA =
10276 const AArch64Subtarget &SubtargetB =
10277 b.getMF()->getSubtarget<AArch64Subtarget>();
10278 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
10279}
10280
10281std::optional<std::unique_ptr<outliner::OutlinedFunction>>
10282AArch64InstrInfo::getOutliningCandidateInfo(
10283 const MachineModuleInfo &MMI,
10284 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
10285 unsigned MinRepeats) const {
10286 unsigned SequenceSize = 0;
10287 for (auto &MI : RepeatedSequenceLocs[0])
10288 SequenceSize += getInstSizeInBytes(MI);
10289
10290 unsigned NumBytesToCreateFrame = 0;
10291
10292 // Avoid splitting ADRP ADD/LDR pair into outlined functions.
10293 // These instructions are fused together by the scheduler.
10294 // Any candidate where ADRP is the last instruction should be rejected
10295 // as that will lead to splitting ADRP pair.
10296 MachineInstr &LastMI = RepeatedSequenceLocs[0].back();
10297 MachineInstr &FirstMI = RepeatedSequenceLocs[0].front();
10298 if (LastMI.getOpcode() == AArch64::ADRP &&
10299 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_PAGE) != 0 &&
10300 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10301 return std::nullopt;
10302 }
10303
10304 // Similarly any candidate where the first instruction is ADD/LDR with a
10305 // page offset should be rejected to avoid ADRP splitting.
10306 if ((FirstMI.getOpcode() == AArch64::ADDXri ||
10307 FirstMI.getOpcode() == AArch64::LDRXui) &&
10308 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_PAGEOFF) != 0 &&
10309 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10310 return std::nullopt;
10311 }
10312
10313 // We only allow outlining for functions having exactly matching return
10314 // address signing attributes, i.e., all share the same value for the
10315 // attribute "sign-return-address" and all share the same type of key they
10316 // are signed with.
10317 // Additionally we require all functions to simultaneously either support
10318 // v8.3a features or not. Otherwise an outlined function could get signed
10319 // using dedicated v8.3 instructions and a call from a function that doesn't
10320 // support v8.3 instructions would therefore be invalid.
10321 if (std::adjacent_find(
10322 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
10323 [](const outliner::Candidate &a, const outliner::Candidate &b) {
10324 // Return true if a and b are non-equal w.r.t. return address
10325 // signing or support of v8.3a features
10326 if (outliningCandidatesSigningScopeConsensus(a, b) &&
10327 outliningCandidatesSigningKeyConsensus(a, b) &&
10328 outliningCandidatesV8_3OpsConsensus(a, b)) {
10329 return false;
10330 }
10331 return true;
10332 }) != RepeatedSequenceLocs.end()) {
10333 return std::nullopt;
10334 }
10335
10336 // Since at this point all candidates agree on their return address signing
10337 // picking just one is fine. If the candidate functions potentially sign their
10338 // return addresses, the outlined function should do the same. Note that in
10339 // the case of "sign-return-address"="non-leaf" this is an assumption: It is
10340 // not certainly true that the outlined function will have to sign its return
10341 // address but this decision is made later, when the decision to outline
10342 // has already been made.
10343 // The same holds for the number of additional instructions we need: On
10344 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
10345 // necessary. However, at this point we don't know if the outlined function
10346 // will have a RET instruction so we assume the worst.
10347 const TargetRegisterInfo &TRI = getRegisterInfo();
10348 // Performing a tail call may require extra checks when PAuth is enabled.
10349 // If PAuth is disabled, set it to zero for uniformity.
10350 unsigned NumBytesToCheckLRInTCEpilogue = 0;
10351 const auto RASignCondition = RepeatedSequenceLocs[0]
10352 .getMF()
10353 ->getInfo<AArch64FunctionInfo>()
10354 ->getSignReturnAddressCondition();
10355 if (RASignCondition != SignReturnAddress::None) {
10356 // One PAC and one AUT instructions
10357 NumBytesToCreateFrame += 8;
10358
10359 // PAuth is enabled - set extra tail call cost, if any.
10360 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
10361 *RepeatedSequenceLocs[0].getMF());
10362 NumBytesToCheckLRInTCEpilogue =
10364 // Checking the authenticated LR value may significantly impact
10365 // SequenceSize, so account for it for more precise results.
10366 if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
10367 SequenceSize += NumBytesToCheckLRInTCEpilogue;
10368
10369 // We have to check if sp modifying instructions would get outlined.
10370 // If so we only allow outlining if sp is unchanged overall, so matching
10371 // sub and add instructions are okay to outline, all other sp modifications
10372 // are not
10373 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
10374 int SPValue = 0;
10375 for (auto &MI : C) {
10376 if (MI.modifiesRegister(AArch64::SP, &TRI)) {
10377 switch (MI.getOpcode()) {
10378 case AArch64::ADDXri:
10379 case AArch64::ADDWri:
10380 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10381 assert(MI.getOperand(2).isImm() &&
10382 "Expected operand to be immediate");
10383 assert(MI.getOperand(1).isReg() &&
10384 "Expected operand to be a register");
10385 // Check if the add just increments sp. If so, we search for
10386 // matching sub instructions that decrement sp. If not, the
10387 // modification is illegal
10388 if (MI.getOperand(1).getReg() == AArch64::SP)
10389 SPValue += MI.getOperand(2).getImm();
10390 else
10391 return true;
10392 break;
10393 case AArch64::SUBXri:
10394 case AArch64::SUBWri:
10395 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10396 assert(MI.getOperand(2).isImm() &&
10397 "Expected operand to be immediate");
10398 assert(MI.getOperand(1).isReg() &&
10399 "Expected operand to be a register");
10400 // Check if the sub just decrements sp. If so, we search for
10401 // matching add instructions that increment sp. If not, the
10402 // modification is illegal
10403 if (MI.getOperand(1).getReg() == AArch64::SP)
10404 SPValue -= MI.getOperand(2).getImm();
10405 else
10406 return true;
10407 break;
10408 default:
10409 return true;
10410 }
10411 }
10412 }
10413 if (SPValue)
10414 return true;
10415 return false;
10416 };
10417 // Remove candidates with illegal stack modifying instructions
10418 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
10419
10420 // If the sequence doesn't have enough candidates left, then we're done.
10421 if (RepeatedSequenceLocs.size() < MinRepeats)
10422 return std::nullopt;
10423 }
10424
10425 // Properties about candidate MBBs that hold for all of them.
10426 unsigned FlagsSetInAll = 0xF;
10427
10428 // Compute liveness information for each candidate, and set FlagsSetInAll.
10429 for (outliner::Candidate &C : RepeatedSequenceLocs)
10430 FlagsSetInAll &= C.Flags;
10431
10432 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
10433
10434 // Helper lambda which sets call information for every candidate.
10435 auto SetCandidateCallInfo =
10436 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
10437 for (outliner::Candidate &C : RepeatedSequenceLocs)
10438 C.setCallInfo(CallID, NumBytesForCall);
10439 };
10440
10441 unsigned FrameID = MachineOutlinerDefault;
10442 NumBytesToCreateFrame += 4;
10443
10444 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
10445 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
10446 });
10447
10448 // We check to see if CFI Instructions are present, and if they are
10449 // we find the number of CFI Instructions in the candidates.
10450 unsigned CFICount = 0;
10451 for (auto &I : RepeatedSequenceLocs[0]) {
10452 if (I.isCFIInstruction())
10453 CFICount++;
10454 }
10455
10456 // We compare the number of found CFI Instructions to the number of CFI
10457 // instructions in the parent function for each candidate. We must check this
10458 // since if we outline one of the CFI instructions in a function, we have to
10459 // outline them all for correctness. If we do not, the address offsets will be
10460 // incorrect between the two sections of the program.
10461 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10462 std::vector<MCCFIInstruction> CFIInstructions =
10463 C.getMF()->getFrameInstructions();
10464
10465 if (CFICount > 0 && CFICount != CFIInstructions.size())
10466 return std::nullopt;
10467 }
10468
10469 // Returns true if an instructions is safe to fix up, false otherwise.
10470 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
10471 if (MI.isCall())
10472 return true;
10473
10474 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
10475 !MI.readsRegister(AArch64::SP, &TRI))
10476 return true;
10477
10478 // Any modification of SP will break our code to save/restore LR.
10479 // FIXME: We could handle some instructions which add a constant
10480 // offset to SP, with a bit more work.
10481 if (MI.modifiesRegister(AArch64::SP, &TRI))
10482 return false;
10483
10484 // At this point, we have a stack instruction that we might need to
10485 // fix up. We'll handle it if it's a load or store.
10486 if (MI.mayLoadOrStore()) {
10487 const MachineOperand *Base; // Filled with the base operand of MI.
10488 int64_t Offset; // Filled with the offset of MI.
10489 bool OffsetIsScalable;
10490
10491 // Does it allow us to offset the base operand and is the base the
10492 // register SP?
10493 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
10494 !Base->isReg() || Base->getReg() != AArch64::SP)
10495 return false;
10496
10497 // Fixe-up code below assumes bytes.
10498 if (OffsetIsScalable)
10499 return false;
10500
10501 // Find the minimum/maximum offset for this instruction and check
10502 // if fixing it up would be in range.
10503 int64_t MinOffset,
10504 MaxOffset; // Unscaled offsets for the instruction.
10505 // The scale to multiply the offsets by.
10506 TypeSize Scale(0U, false), DummyWidth(0U, false);
10507 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
10508
10509 Offset += 16; // Update the offset to what it would be if we outlined.
10510 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
10511 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
10512 return false;
10513
10514 // It's in range, so we can outline it.
10515 return true;
10516 }
10517
10518 // FIXME: Add handling for instructions like "add x0, sp, #8".
10519
10520 // We can't fix it up, so don't outline it.
10521 return false;
10522 };
10523
10524 // True if it's possible to fix up each stack instruction in this sequence.
10525 // Important for frames/call variants that modify the stack.
10526 bool AllStackInstrsSafe =
10527 llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);
10528
10529 // If the last instruction in any candidate is a terminator, then we should
10530 // tail call all of the candidates.
10531 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10532 FrameID = MachineOutlinerTailCall;
10533 NumBytesToCreateFrame = 0;
10534 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
10535 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
10536 }
10537
10538 else if (LastInstrOpcode == AArch64::BL ||
10539 ((LastInstrOpcode == AArch64::BLR ||
10540 LastInstrOpcode == AArch64::BLRNoIP) &&
10541 !HasBTI)) {
10542 // FIXME: Do we need to check if the code after this uses the value of LR?
10543 FrameID = MachineOutlinerThunk;
10544 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
10545 SetCandidateCallInfo(MachineOutlinerThunk, 4);
10546 }
10547
10548 else {
10549 // We need to decide how to emit calls + frames. We can always emit the same
10550 // frame if we don't need to save to the stack. If we have to save to the
10551 // stack, then we need a different frame.
10552 unsigned NumBytesNoStackCalls = 0;
10553 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
10554
10555 // Check if we have to save LR.
10556 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10557 bool LRAvailable =
10559 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
10560 : true;
10561 // If we have a noreturn caller, then we're going to be conservative and
10562 // say that we have to save LR. If we don't have a ret at the end of the
10563 // block, then we can't reason about liveness accurately.
10564 //
10565 // FIXME: We can probably do better than always disabling this in
10566 // noreturn functions by fixing up the liveness info.
10567 bool IsNoReturn =
10568 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
10569
10570 // Is LR available? If so, we don't need a save.
10571 if (LRAvailable && !IsNoReturn) {
10572 NumBytesNoStackCalls += 4;
10573 C.setCallInfo(MachineOutlinerNoLRSave, 4);
10574 CandidatesWithoutStackFixups.push_back(C);
10575 }
10576
10577 // Is an unused register available? If so, we won't modify the stack, so
10578 // we can outline with the same frame type as those that don't save LR.
10579 else if (findRegisterToSaveLRTo(C)) {
10580 NumBytesNoStackCalls += 12;
10581 C.setCallInfo(MachineOutlinerRegSave, 12);
10582 CandidatesWithoutStackFixups.push_back(C);
10583 }
10584
10585 // Is SP used in the sequence at all? If not, we don't have to modify
10586 // the stack, so we are guaranteed to get the same frame.
10587 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
10588 NumBytesNoStackCalls += 12;
10589 C.setCallInfo(MachineOutlinerDefault, 12);
10590 CandidatesWithoutStackFixups.push_back(C);
10591 }
10592
10593 // If we outline this, we need to modify the stack. Pretend we don't
10594 // outline this by saving all of its bytes.
10595 else {
10596 NumBytesNoStackCalls += SequenceSize;
10597 }
10598 }
10599
10600 // If there are no places where we have to save LR, then note that we
10601 // don't have to update the stack. Otherwise, give every candidate the
10602 // default call type, as long as it's safe to do so.
10603 if (!AllStackInstrsSafe ||
10604 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
10605 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
10606 FrameID = MachineOutlinerNoLRSave;
10607 if (RepeatedSequenceLocs.size() < MinRepeats)
10608 return std::nullopt;
10609 } else {
10610 SetCandidateCallInfo(MachineOutlinerDefault, 12);
10611
10612 // Bugzilla ID: 46767
10613 // TODO: Check if fixing up the stack more than once is safe so we can
10614 // outline these.
10615 //
10616 // An outline resulting in a caller that requires stack fixups at the
10617 // callsite to a callee that also requires stack fixups can happen when
10618 // there are no available registers at the candidate callsite for a
10619 // candidate that itself also has calls.
10620 //
10621 // In other words if function_containing_sequence in the following pseudo
10622 // assembly requires that we save LR at the point of the call, but there
10623 // are no available registers: in this case we save using SP and as a
10624 // result the SP offsets requires stack fixups by multiples of 16.
10625 //
10626 // function_containing_sequence:
10627 // ...
10628 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10629 // call OUTLINED_FUNCTION_N
10630 // restore LR from SP
10631 // ...
10632 //
10633 // OUTLINED_FUNCTION_N:
10634 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10635 // ...
10636 // bl foo
10637 // restore LR from SP
10638 // ret
10639 //
10640 // Because the code to handle more than one stack fixup does not
10641 // currently have the proper checks for legality, these cases will assert
10642 // in the AArch64 MachineOutliner. This is because the code to do this
10643 // needs more hardening, testing, better checks that generated code is
10644 // legal, etc and because it is only verified to handle a single pass of
10645 // stack fixup.
10646 //
10647 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
10648 // these cases until they are known to be handled. Bugzilla 46767 is
10649 // referenced in comments at the assert site.
10650 //
10651 // To avoid asserting (or generating non-legal code on noassert builds)
10652 // we remove all candidates which would need more than one stack fixup by
10653 // pruning the cases where the candidate has calls while also having no
10654 // available LR and having no available general purpose registers to copy
10655 // LR to (ie one extra stack save/restore).
10656 //
10657 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10658 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
10659 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
10660 return (llvm::any_of(C, IsCall)) &&
10661 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
10662 !findRegisterToSaveLRTo(C));
10663 });
10664 }
10665 }
10666
10667 // If we dropped all of the candidates, bail out here.
10668 if (RepeatedSequenceLocs.size() < MinRepeats)
10669 return std::nullopt;
10670 }
10671
10672 // Does every candidate's MBB contain a call? If so, then we might have a call
10673 // in the range.
10674 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10675 // Check if the range contains a call. These require a save + restore of the
10676 // link register.
10677 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
10678 bool ModStackToSaveLR = false;
10679 if (any_of(drop_end(FirstCand),
10680 [](const MachineInstr &MI) { return MI.isCall(); }))
10681 ModStackToSaveLR = true;
10682
10683 // Handle the last instruction separately. If this is a tail call, then the
10684 // last instruction is a call. We don't want to save + restore in this case.
10685 // However, it could be possible that the last instruction is a call without
10686 // it being valid to tail call this sequence. We should consider this as
10687 // well.
10688 else if (FrameID != MachineOutlinerThunk &&
10689 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
10690 ModStackToSaveLR = true;
10691
10692 if (ModStackToSaveLR) {
10693 // We can't fix up the stack. Bail out.
10694 if (!AllStackInstrsSafe)
10695 return std::nullopt;
10696
10697 // Save + restore LR.
10698 NumBytesToCreateFrame += 8;
10699 }
10700 }
10701
10702 // If we have CFI instructions, we can only outline if the outlined section
10703 // can be a tail call
10704 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
10705 return std::nullopt;
10706
10707 return std::make_unique<outliner::OutlinedFunction>(
10708 RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID);
10709}
10710
10711void AArch64InstrInfo::mergeOutliningCandidateAttributes(
10712 Function &F, std::vector<outliner::Candidate> &Candidates) const {
10713 // If a bunch of candidates reach this point they must agree on their return
10714 // address signing. It is therefore enough to just consider the signing
10715 // behaviour of one of them
10716 const auto &CFn = Candidates.front().getMF()->getFunction();
10717
10718 if (CFn.hasFnAttribute("ptrauth-returns"))
10719 F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
10720 if (CFn.hasFnAttribute("ptrauth-auth-traps"))
10721 F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
10722 // Since all candidates belong to the same module, just copy the
10723 // function-level attributes of an arbitrary function.
10724 if (CFn.hasFnAttribute("sign-return-address"))
10725 F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
10726 if (CFn.hasFnAttribute("sign-return-address-key"))
10727 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
10728
10729 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
10730}
10731
10732bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
10733 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10734 const Function &F = MF.getFunction();
10735
10736 // Can F be deduplicated by the linker? If it can, don't outline from it.
10737 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10738 return false;
10739
10740 // Don't outline from functions with section markings; the program could
10741 // expect that all the code is in the named section.
10742 // FIXME: Allow outlining from multiple functions with the same section
10743 // marking.
10744 if (F.hasSection())
10745 return false;
10746
10747 // Outlining from functions with redzones is unsafe since the outliner may
10748 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
10749 // outline from it.
10750 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
10751 if (!AFI || AFI->hasRedZone().value_or(true))
10752 return false;
10753
10754 // FIXME: Determine whether it is safe to outline from functions which contain
10755 // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
10756 // outlined together and ensure it is safe to outline with async unwind info,
10757 // required for saving & restoring VG around calls.
10758 if (AFI->hasStreamingModeChanges())
10759 return false;
10760
10761 // FIXME: Teach the outliner to generate/handle Windows unwind info.
10763 return false;
10764
10765 // It's safe to outline from MF.
10766 return true;
10767}
10768
10770AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
10771 unsigned &Flags) const {
10773 "Must track liveness!");
10775 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
10776 Ranges;
10777 // According to the AArch64 Procedure Call Standard, the following are
10778 // undefined on entry/exit from a function call:
10779 //
10780 // * Registers x16, x17, (and thus w16, w17)
10781 // * Condition codes (and thus the NZCV register)
10782 //
10783 // If any of these registers are used inside or live across an outlined
10784 // function, then they may be modified later, either by the compiler or
10785 // some other tool (like the linker).
10786 //
10787 // To avoid outlining in these situations, partition each block into ranges
10788 // where these registers are dead. We will only outline from those ranges.
10789 LiveRegUnits LRU(getRegisterInfo());
10790 auto AreAllUnsafeRegsDead = [&LRU]() {
10791 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
10792 LRU.available(AArch64::NZCV);
10793 };
10794
10795 // We need to know if LR is live across an outlining boundary later on in
10796 // order to decide how we'll create the outlined call, frame, etc.
10797 //
10798 // It's pretty expensive to check this for *every candidate* within a block.
10799 // That's some potentially n^2 behaviour, since in the worst case, we'd need
10800 // to compute liveness from the end of the block for O(n) candidates within
10801 // the block.
10802 //
10803 // So, to improve the average case, let's keep track of liveness from the end
10804 // of the block to the beginning of *every outlinable range*. If we know that
10805 // LR is available in every range we could outline from, then we know that
10806 // we don't need to check liveness for any candidate within that range.
10807 bool LRAvailableEverywhere = true;
10808 // Compute liveness bottom-up.
10809 LRU.addLiveOuts(MBB);
10810 // Update flags that require info about the entire MBB.
10811 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
10812 if (MI.isCall() && !MI.isTerminator())
10814 };
10815 // Range: [RangeBegin, RangeEnd)
10816 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
10817 unsigned RangeLen;
10818 auto CreateNewRangeStartingAt =
10819 [&RangeBegin, &RangeEnd,
10820 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
10821 RangeBegin = NewBegin;
10822 RangeEnd = std::next(RangeBegin);
10823 RangeLen = 0;
10824 };
10825 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
10826 // At least one unsafe register is not dead. We do not want to outline at
10827 // this point. If it is long enough to outline from and does not cross a
10828 // bundle boundary, save the range [RangeBegin, RangeEnd).
10829 if (RangeLen <= 1)
10830 return;
10831 if (!RangeBegin.isEnd() && RangeBegin->isBundledWithPred())
10832 return;
10833 if (!RangeEnd.isEnd() && RangeEnd->isBundledWithPred())
10834 return;
10835 Ranges.emplace_back(RangeBegin, RangeEnd);
10836 };
10837 // Find the first point where all unsafe registers are dead.
10838 // FIND: <safe instr> <-- end of first potential range
10839 // SKIP: <unsafe def>
10840 // SKIP: ... everything between ...
10841 // SKIP: <unsafe use>
10842 auto FirstPossibleEndPt = MBB.instr_rbegin();
10843 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
10844 if (!FirstPossibleEndPt->isDebugInstr())
10845 LRU.stepBackward(*FirstPossibleEndPt);
10846 // Update flags that impact how we outline across the entire block,
10847 // regardless of safety.
10848 UpdateWholeMBBFlags(*FirstPossibleEndPt);
10849 if (AreAllUnsafeRegsDead())
10850 break;
10851 }
10852 // If we exhausted the entire block, we have no safe ranges to outline.
10853 if (FirstPossibleEndPt == MBB.instr_rend())
10854 return Ranges;
10855 // Current range.
10856 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
10857 // StartPt points to the first place where all unsafe registers
10858 // are dead (if there is any such point). Begin partitioning the MBB into
10859 // ranges.
10860 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
10861 if (!MI.isDebugInstr())
10862 LRU.stepBackward(MI);
10863 UpdateWholeMBBFlags(MI);
10864 if (!AreAllUnsafeRegsDead()) {
10865 SaveRangeIfNonEmpty();
10866 CreateNewRangeStartingAt(MI.getIterator());
10867 continue;
10868 }
10869 LRAvailableEverywhere &= LRU.available(AArch64::LR);
10870 RangeBegin = MI.getIterator();
10871 ++RangeLen;
10872 }
10873 // Above loop misses the last (or only) range. If we are still safe, then
10874 // let's save the range.
10875 if (AreAllUnsafeRegsDead())
10876 SaveRangeIfNonEmpty();
10877 if (Ranges.empty())
10878 return Ranges;
10879 // We found the ranges bottom-up. Mapping expects the top-down. Reverse
10880 // the order.
10881 std::reverse(Ranges.begin(), Ranges.end());
10882 // If there is at least one outlinable range where LR is unavailable
10883 // somewhere, remember that.
10884 if (!LRAvailableEverywhere)
10886 return Ranges;
10887}
10888
// Determines how the machine outliner may treat the instruction that MIT
// points to.
// NOTE(review): this listing skips numbered lines 10889, 10891, 10918, 10928,
// 10934, 10945, 10952, 10981, 11013, 11019, 11024 and 11026, so the return
// type, one parameter and the statement guarded by most checks below are not
// visible here. They are presumably outliner::InstrType results (the visible
// call handling below uses that type) -- confirm against the upstream file.
10890AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
10892 unsigned Flags) const {
10893 MachineInstr &MI = *MIT;
10894
10895 // Don't outline anything used for return address signing. The outlined
10896 // function will get signed later if needed
10897 switch (MI.getOpcode()) {
10898 case AArch64::PACM:
10899 case AArch64::PACIASP:
10900 case AArch64::PACIBSP:
10901 case AArch64::PACIASPPC:
10902 case AArch64::PACIBSPPC:
10903 case AArch64::AUTIASP:
10904 case AArch64::AUTIBSP:
10905 case AArch64::AUTIASPPCi:
10906 case AArch64::AUTIASPPCr:
10907 case AArch64::AUTIBSPPCi:
10908 case AArch64::AUTIBSPPCr:
10909 case AArch64::RETAA:
10910 case AArch64::RETAB:
10911 case AArch64::RETAASPPCi:
10912 case AArch64::RETAASPPCr:
10913 case AArch64::RETABSPPCi:
10914 case AArch64::RETABSPPCr:
10915 case AArch64::EMITBKEY:
10916 case AArch64::PAUTH_PROLOGUE:
10917 case AArch64::PAUTH_EPILOGUE:
10919 }
10920
10921 // We can only outline these if we will tail call the outlined function, or
10922 // fix up the CFI offsets. Currently, CFI instructions are outlined only if
10923 // in a tail call.
10924 //
10925 // FIXME: If the proper fixups for the offset are implemented, this should be
10926 // possible.
10927 if (MI.isCFIInstruction())
10929
10930 // Is this a terminator for a basic block?
10931 if (MI.isTerminator())
10932 // TargetInstrInfo::getOutliningType has already filtered out anything
10933 // that would break this, so we can allow it here.
10935
10936 // Make sure none of the operands are un-outlinable.
10937 for (const MachineOperand &MOP : MI.operands()) {
10938 // A check preventing CFI indices was here before, but only CFI
10939 // instructions should have those.
10940 assert(!MOP.isCFIIndex());
10941
10942 // If it uses LR or W30 explicitly, then don't touch it.
10943 if (MOP.isReg() && !MOP.isImplicit() &&
10944 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
10946 }
10947
10948 // Special cases for instructions that can always be outlined, but will fail
10949 // the later tests. e.g, ADRPs, which are PC-relative use LR, but can always
10950 // be outlined because they don't require a *specific* value to be in LR.
10951 if (MI.getOpcode() == AArch64::ADRP)
10953
10954 // If MI is a call we might be able to outline it. We don't want to outline
10955 // any calls that rely on the position of items on the stack. When we outline
10956 // something containing a call, we have to emit a save and restore of LR in
10957 // the outlined function. Currently, this always happens by saving LR to the
10958 // stack. Thus, if we outline, say, half the parameters for a function call
10959 // plus the call, then we'll break the callee's expectations for the layout
10960 // of the stack.
10961 //
10962 // FIXME: Allow calls to functions which construct a stack frame, as long
10963 // as they don't access arguments on the stack.
10964 // FIXME: Figure out some way to analyze functions defined in other modules.
10965 // We should be able to compute the memory usage based on the IR calling
10966 // convention, even if we can't see the definition.
10967 if (MI.isCall()) {
10968 // Get the function associated with the call. Look at each operand and find
10969 // the one that represents the callee and get its name.
// Only the first global operand is inspected; Callee stays null when that
// global is not a Function or when there is no global operand at all.
10970 const Function *Callee = nullptr;
10971 for (const MachineOperand &MOP : MI.operands()) {
10972 if (MOP.isGlobal()) {
10973 Callee = dyn_cast<Function>(MOP.getGlobal());
10974 break;
10975 }
10976 }
10977
10978 // Never outline calls to mcount. There isn't any rule that would require
10979 // this, but the Linux kernel's "ftrace" feature depends on it.
10980 if (Callee && Callee->getName() == "\01_mcount")
10982
10983 // If we don't know anything about the callee, assume it depends on the
10984 // stack layout of the caller. In that case, it's only legal to outline
10985 // as a tail-call. Explicitly list the call instructions we know about so we
10986 // don't get unexpected results with call pseudo-instructions.
10987 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
10988 if (MI.getOpcode() == AArch64::BLR ||
10989 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
10990 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
10991
10992 if (!Callee)
10993 return UnknownCallOutlineType;
10994
10995 // We have a function we have information about. Check whether it's
10996 // something we can safely outline.
10997 MachineFunction *CalleeMF = MMI.getMachineFunction(*Callee);
10998
10999 // We don't know what's going on with the callee at all. Don't touch it.
11000 if (!CalleeMF)
11001 return UnknownCallOutlineType;
11002
11003 // Check if we know anything about the callee saves on the function. If we
11004 // don't, then don't touch it, since that implies that we haven't
11005 // computed anything about its stack frame yet.
// A non-zero stack size or any frame object is also treated as "unknown".
11006 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
11007 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
11008 MFI.getNumObjects() > 0)
11009 return UnknownCallOutlineType;
11010
11011 // At this point, we can say that CalleeMF ought to not pass anything on the
11012 // stack. Therefore, we can outline it.
11014 }
11015
11016 // Don't touch the link register or W30.
11017 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
11018 MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
11020
11021 // Don't outline BTI instructions, because that will prevent the outlining
11022 // site from being indirectly callable.
11023 if (hasBTISemantics(MI))
11025
11027}
11028
11029void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
11030 for (MachineInstr &MI : MBB) {
11031 const MachineOperand *Base;
11032 TypeSize Width(0, false);
11033 int64_t Offset;
11034 bool OffsetIsScalable;
11035
11036 // Is this a load or store with an immediate offset with SP as the base?
11037 if (!MI.mayLoadOrStore() ||
11038 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
11039 &RI) ||
11040 (Base->isReg() && Base->getReg() != AArch64::SP))
11041 continue;
11042
11043 // It is, so we have to fix it up.
11044 TypeSize Scale(0U, false);
11045 int64_t Dummy1, Dummy2;
11046
11047 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
11048 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
11049 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
11050 assert(Scale != 0 && "Unexpected opcode!");
11051 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
11052
11053 // We've pushed the return address to the stack, so add 16 to the offset.
11054 // This is safe, since we already checked if it would overflow when we
11055 // checked if this instruction was legal to outline.
11056 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
11057 StackOffsetOperand.setImm(NewImm);
11058 }
11059}
11060
// Helper invoked by buildOutlinedFrame as signOutlinedFunction(MF, MBB, TII,
// ShouldSignReturnAddr). When ShouldSignReturnAddr is false it does nothing;
// otherwise it builds a PAUTH_PROLOGUE at the start of MBB and asks TII to
// create the matching PAuth epilogue instruction.
// NOTE(review): numbered lines 11061 (start of the signature) and 11068 (the
// continuation of the BuildMI chain) are missing from this listing.
11062 const AArch64InstrInfo *TII,
11063 bool ShouldSignReturnAddr) {
11064 if (!ShouldSignReturnAddr)
11065 return;
11066
11067 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
11069 TII->createPauthEpilogueInstr(MBB, DebugLoc());
11070}
11071
// Finishes the body of an outlined function according to
// OF.FrameConstructionID: turns a trailing call into a tail call for thunks,
// saves/restores LR around the body when it contains a non-tail call, inserts
// a RET when the function is not a tail call or thunk, and applies
// return-address signing when required.
// NOTE(review): numbered lines 11073 (remaining parameters, presumably MBB and
// MF), 11096 and 11124-11125 (presumably the declarations of `It` and `Et`)
// are missing from this listing -- confirm against the upstream file.
11072void AArch64InstrInfo::buildOutlinedFrame(
11074 const outliner::OutlinedFunction &OF) const {
11075
11076 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
11077
11078 if (OF.FrameConstructionID == MachineOutlinerTailCall)
11079 FI->setOutliningStyle("Tail Call");
11080 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
11081 // For thunk outlining, rewrite the last instruction from a call to a
11082 // tail-call.
11083 MachineInstr *Call = &*--MBB.instr_end();
11084 unsigned TailOpcode;
11085 if (Call->getOpcode() == AArch64::BL) {
11086 TailOpcode = AArch64::TCRETURNdi;
11087 } else {
11088 assert(Call->getOpcode() == AArch64::BLR ||
11089 Call->getOpcode() == AArch64::BLRNoIP);
11090 TailOpcode = AArch64::TCRETURNriALL;
11091 }
// The tail call reuses the original call's first operand as its target.
11092 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
11093 .add(Call->getOperand(0))
11094 .addImm(0);
11095 MBB.insert(MBB.end(), TC);
11097
11098 FI->setOutliningStyle("Thunk");
11099 }
11100
11101 bool IsLeafFunction = true;
11102
11103 // Is there a call in the outlined range?
11104 auto IsNonTailCall = [](const MachineInstr &MI) {
11105 return MI.isCall() && !MI.isReturn();
11106 };
11107
11108 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
11109 // Fix up the instructions in the range, since we're going to modify the
11110 // stack.
11111
11112 // Bugzilla ID: 46767
11113 // TODO: Check if fixing up twice is safe so we can outline these.
11114 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
11115 "Can only fix up stack references once");
11116 fixupPostOutline(MBB);
11117
11118 IsLeafFunction = false;
11119
11120 // LR has to be a live in so that we can save it.
11121 if (!MBB.isLiveIn(AArch64::LR))
11122 MBB.addLiveIn(AArch64::LR);
11123
11126
// For tail calls and thunks the restore must go before the final
// (terminating) instruction rather than at the very end of the block.
11127 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
11128 OF.FrameConstructionID == MachineOutlinerThunk)
11129 Et = std::prev(MBB.end());
11130
11131 // Insert a save before the outlined region
11132 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
11133 .addReg(AArch64::SP, RegState::Define)
11134 .addReg(AArch64::LR)
11135 .addReg(AArch64::SP)
11136 .addImm(-16);
11137 It = MBB.insert(It, STRXpre);
11138
11139 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
11140 CFIInstBuilder CFIBuilder(MBB, It, MachineInstr::FrameSetup);
11141
11142 // Add a CFI saying the stack was moved 16 B down.
11143 CFIBuilder.buildDefCFAOffset(16);
11144
11145 // Add a CFI saying that the LR that we want to find is now 16 B higher
11146 // than before.
11147 CFIBuilder.buildOffset(AArch64::LR, -16);
11148 }
11149
11150 // Insert a restore before the terminator for the function.
11151 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
11152 .addReg(AArch64::SP, RegState::Define)
11153 .addReg(AArch64::LR, RegState::Define)
11154 .addReg(AArch64::SP)
11155 .addImm(16);
11156 Et = MBB.insert(Et, LDRXpost);
11157 }
11158
// The signing decision is derived from the function's signing condition and
// from whether the outlined body is a non-leaf (contains a non-tail call).
11159 auto RASignCondition = FI->getSignReturnAddressCondition();
11160 bool ShouldSignReturnAddr = AArch64FunctionInfo::shouldSignReturnAddress(
11161 RASignCondition, !IsLeafFunction);
11162
11163 // If this is a tail call outlined function, then there's already a return.
11164 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
11165 OF.FrameConstructionID == MachineOutlinerThunk) {
11166 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
11167 return;
11168 }
11169
11170 // It's not a tail call, so we have to insert the return ourselves.
11171
11172 // LR has to be a live in so that we can return to it.
11173 if (!MBB.isLiveIn(AArch64::LR))
11174 MBB.addLiveIn(AArch64::LR);
11175
11176 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
11177 .addReg(AArch64::LR);
11178 MBB.insert(MBB.end(), ret);
11179
11180 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
11181
11182 FI->setOutliningStyle("Function");
11183
11184 // Did we have to modify the stack by saving the link register?
11185 if (OF.FrameConstructionID != MachineOutlinerDefault)
11186 return;
11187
11188 // We modified the stack.
11189 // Walk over the basic block and fix up all the stack accesses.
11190 fixupPostOutline(MBB);
11191}
11192
// Inserts the call to the outlined function MF at It, in the form selected by
// C.CallConstructionID: a TCRETURNdi tail call, a plain BL, or a BL wrapped in
// a save/restore of LR (to a register or to the stack). Returns the iterator
// of the inserted call instruction.
// NOTE(review): numbered lines 11194-11195 (the parameter list) and 11216
// (presumably the declaration of `CallPt`) are missing from this listing.
11193MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
11196
11197 // Are we tail calling?
11198 if (C.CallConstructionID == MachineOutlinerTailCall) {
11199 // If yes, then we can just branch to the label.
11200 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
11201 .addGlobalAddress(M.getNamedValue(MF.getName()))
11202 .addImm(0));
11203 return It;
11204 }
11205
11206 // Are we saving the link register?
11207 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
11208 C.CallConstructionID == MachineOutlinerThunk) {
11209 // No, so just insert the call.
11210 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
11211 .addGlobalAddress(M.getNamedValue(MF.getName())));
11212 return It;
11213 }
11214
11215 // We want to return the spot where we inserted the call.
11217
11218 // Instructions for saving and restoring LR around the call instruction we're
11219 // going to insert.
11220 MachineInstr *Save;
11221 MachineInstr *Restore;
11222 // Can we save to a register?
11223 if (C.CallConstructionID == MachineOutlinerRegSave) {
11224 // FIXME: This logic should be sunk into a target-specific interface so that
11225 // we don't have to recompute the register.
11226 Register Reg = findRegisterToSaveLRTo(C);
11227 assert(Reg && "No callee-saved register available?");
11228
11229 // LR has to be a live in so that we can save it.
11230 if (!MBB.isLiveIn(AArch64::LR))
11231 MBB.addLiveIn(AArch64::LR);
11232
11233 // Save and restore LR from Reg.
// Both moves are built as ORRXrs with XZR as the first source and a zero
// immediate.
11234 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
11235 .addReg(AArch64::XZR)
11236 .addReg(AArch64::LR)
11237 .addImm(0);
11238 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
11239 .addReg(AArch64::XZR)
11240 .addReg(Reg)
11241 .addImm(0);
11242 } else {
11243 // We have the default case. Save and restore from SP.
11244 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
11245 .addReg(AArch64::SP, RegState::Define)
11246 .addReg(AArch64::LR)
11247 .addReg(AArch64::SP)
11248 .addImm(-16);
11249 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
11250 .addReg(AArch64::SP, RegState::Define)
11251 .addReg(AArch64::LR, RegState::Define)
11252 .addReg(AArch64::SP)
11253 .addImm(16);
11254 }
11255
// Emit Save, then the call, then Restore; It is advanced past each inserted
// instruction so the three end up in that order.
11256 It = MBB.insert(It, Save);
11257 It++;
11258
11259 // Insert the call.
11260 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
11261 .addGlobalAddress(M.getNamedValue(MF.getName())));
11262 CallPt = It;
11263 It++;
11264
11265 It = MBB.insert(It, Restore);
11266 return CallPt;
11267}
11268
11269bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
11270 MachineFunction &MF) const {
11271 return MF.getFunction().hasMinSize();
11272}
11273
// Emits, before Iter in MBB, one instruction that zeroes Reg. The opcode is
// chosen by register kind and subtarget features: MOVZXi for general-purpose
// registers, otherwise DUP_ZI_D when SVE/streaming SVE is available,
// MOVIv2d_ns when NEON is available, and FMOVD0 on the dsub sub-register as
// the fallback.
// NOTE(review): numbered line 11275 (presumably the `Iter` parameter) is
// missing from this listing; AllowSideEffects is not read in the visible body.
11274void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
11276 DebugLoc &DL,
11277 bool AllowSideEffects) const {
11278 const MachineFunction &MF = *MBB.getParent();
11279 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
11280 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
11281
11282 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
11283 BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
11284 } else if (STI.isSVEorStreamingSVEAvailable()) {
11285 BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
11286 .addImm(0)
11287 .addImm(0);
11288 } else if (STI.isNeonAvailable()) {
11289 BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
11290 .addImm(0);
11291 } else {
11292 // This is a streaming-compatible function without SVE. We don't have full
11293 // Neon (just FPRs), so we can at most use the first 64-bit sub-register.
11294 // So given `movi v..` would be illegal use `fmov d..` instead.
11295 assert(STI.hasNEON() && "Expected to have NEON.");
11296 Register Reg64 = TRI.getSubReg(Reg, AArch64::dsub);
11297 BuildMI(MBB, Iter, DL, get(AArch64::FMOVD0), Reg64);
11298 }
11299}
11300
// Recognises ORR-based register moves and returns their destination/source
// operand pair, or std::nullopt for anything else.
// NOTE(review): numbered line 11302 (the function name and parameter list) is
// missing from this listing; presumably this is the copy-instruction hook
// taking `const MachineInstr &MI` -- confirm against the upstream file.
11301std::optional<DestSourcePair>
11303
11304 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
11305 // and zero immediate operands used as an alias for mov instruction.
11306 if (((MI.getOpcode() == AArch64::ORRWrs &&
11307 MI.getOperand(1).getReg() == AArch64::WZR &&
11308 MI.getOperand(3).getImm() == 0x0) ||
11309 (MI.getOpcode() == AArch64::ORRWrr &&
11310 MI.getOperand(1).getReg() == AArch64::WZR)) &&
11311 // Check that the w->w move is not a zero-extending w->x mov.
// Virtual destination: it must not carry a sub-register index.
11312 (!MI.getOperand(0).getReg().isVirtual() ||
11313 MI.getOperand(0).getSubReg() == 0) &&
// Physical destination: the instruction must not also define the matching
// X register.
11314 (!MI.getOperand(0).getReg().isPhysical() ||
11315 MI.findRegisterDefOperandIdx(getXRegFromWReg(MI.getOperand(0).getReg()),
11316 /*TRI=*/nullptr) == -1))
11317 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11318
11319 if (MI.getOpcode() == AArch64::ORRXrs &&
11320 MI.getOperand(1).getReg() == AArch64::XZR &&
11321 MI.getOperand(3).getImm() == 0x0)
11322 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11323
11324 return std::nullopt;
11325}
11326
// Returns the destination/source operand pair for a 32-bit ORR whose first
// source is WZR (ORRWrs with a zero immediate, or ORRWrr); std::nullopt
// otherwise. No zero-extension check on the destination is made in the
// visible body.
// NOTE(review): numbered line 11328 (the function name and parameter list) is
// missing from this listing -- confirm against the upstream file.
11327std::optional<DestSourcePair>
11329 if ((MI.getOpcode() == AArch64::ORRWrs &&
11330 MI.getOperand(1).getReg() == AArch64::WZR &&
11331 MI.getOperand(3).getImm() == 0x0) ||
11332 (MI.getOpcode() == AArch64::ORRWrr &&
11333 MI.getOperand(1).getReg() == AArch64::WZR))
11334 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11335 return std::nullopt;
11336}
11337
11338std::optional<RegImmPair>
11339AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
11340 int Sign = 1;
11341 int64_t Offset = 0;
11342
11343 // TODO: Handle cases where Reg is a super- or sub-register of the
11344 // destination register.
11345 const MachineOperand &Op0 = MI.getOperand(0);
11346 if (!Op0.isReg() || Reg != Op0.getReg())
11347 return std::nullopt;
11348
11349 switch (MI.getOpcode()) {
11350 default:
11351 return std::nullopt;
11352 case AArch64::SUBWri:
11353 case AArch64::SUBXri:
11354 case AArch64::SUBSWri:
11355 case AArch64::SUBSXri:
11356 Sign *= -1;
11357 [[fallthrough]];
11358 case AArch64::ADDSWri:
11359 case AArch64::ADDSXri:
11360 case AArch64::ADDWri:
11361 case AArch64::ADDXri: {
11362 // TODO: Third operand can be global address (usually some string).
11363 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
11364 !MI.getOperand(2).isImm())
11365 return std::nullopt;
11366 int Shift = MI.getOperand(3).getImm();
11367 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
11368 Offset = Sign * (MI.getOperand(2).getImm() << Shift);
11369 }
11370 }
11371 return RegImmPair{MI.getOperand(1).getReg(), Offset};
11372}
11373
11374/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
11375/// the destination register then, if possible, describe the value in terms of
11376/// the source register.
/// Returns std::nullopt when \p MI is not copy-like, when either copy register
/// is invalid, or when none of the handled overlap cases applies.
/// NOTE(review): numbered line 11378 (the function name and its leading
/// parameters, presumably `MI` and `DescribedReg`) is missing from this
/// listing; the name is taken from the call in describeLoadedValue.
11377static std::optional<ParamLoadedValue>
11379 const TargetInstrInfo *TII,
11380 const TargetRegisterInfo *TRI) {
11381 auto DestSrc = TII->isCopyLikeInstr(MI);
11382 if (!DestSrc)
11383 return std::nullopt;
11384
11385 Register DestReg = DestSrc->Destination->getReg();
11386 Register SrcReg = DestSrc->Source->getReg();
11387
11388 if (!DestReg.isValid() || !SrcReg.isValid())
11389 return std::nullopt;
11390
// Every successful result below carries an empty DIExpression.
11391 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
11392
11393 // If the described register is the destination, just return the source.
11394 if (DestReg == DescribedReg)
11395 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
11396
11397 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
11398 if (MI.getOpcode() == AArch64::ORRWrs &&
11399 TRI->isSuperRegister(DestReg, DescribedReg))
11400 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
11401
11402 // We may need to describe the lower part of a ORRXrs move.
11403 if (MI.getOpcode() == AArch64::ORRXrs &&
11404 TRI->isSubRegister(DestReg, DescribedReg)) {
11405 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
11406 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
11407 }
11408
11409 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
11410 "Unhandled ORR[XW]rs copy case");
11411
11412 return std::nullopt;
11413}
11414
// Reports whether MF may be split across sections. A function whose red-zone
// status is set, or is unknown (value_or(true)), is rejected.
// NOTE(review): numbered line 11423 (the final return statement) is missing
// from this listing, so the result for the remaining functions is not visible
// here -- confirm against the upstream file.
11415bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
11416 // Functions cannot be split to different sections on AArch64 if they have
11417 // a red zone. This is because relaxing a cross-section branch may require
11418 // incrementing the stack pointer to spill a register, which would overwrite
11419 // the red zone.
11420 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
11421 return false;
11422
11424}
11425
11426bool AArch64InstrInfo::isMBBSafeToSplitToCold(
11427 const MachineBasicBlock &MBB) const {
11428 // Asm Goto blocks can contain conditional branches to goto labels, which can
11429 // get moved out of range of the branch instruction.
11430 auto isAsmGoto = [](const MachineInstr &MI) {
11431 return MI.getOpcode() == AArch64::INLINEASM_BR;
11432 };
11433 if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
11434 return false;
11435
11436 // Because jump tables are label-relative instead of table-relative, they all
11437 // must be in the same section or relocation fixup handling will fail.
11438
11439 // Check if MBB is a jump table target
11440 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
11441 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
11442 return llvm::is_contained(JTE.MBBs, &MBB);
11443 };
11444 if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
11445 return false;
11446
11447 // Check if MBB contains a jump table lookup
11448 for (const MachineInstr &MI : MBB) {
11449 switch (MI.getOpcode()) {
11450 case TargetOpcode::G_BRJT:
11451 case AArch64::JumpTableDest32:
11452 case AArch64::JumpTableDest16:
11453 case AArch64::JumpTableDest8:
11454 return false;
11455 default:
11456 continue;
11457 }
11458 }
11459
11460 // MBB isn't a special case, so it's safe to be split to the cold section.
11461 return true;
11462}
11463
// Describes the value MI loads into Reg for the opcodes handled here:
// MOVZWi/MOVZXi yield the shifted immediate, ORRWrs/ORRXrs are delegated to
// describeORRLoadedValue.
// NOTE(review): numbered line 11489 (the statement after the switch, i.e. the
// handling of every other opcode) is missing from this listing -- confirm
// against the upstream file.
11464std::optional<ParamLoadedValue>
11465AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
11466 Register Reg) const {
11467 const MachineFunction *MF = MI.getMF();
11468 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
11469 switch (MI.getOpcode()) {
11470 case AArch64::MOVZWi:
11471 case AArch64::MOVZXi: {
11472 // MOVZWi may be used for producing zero-extended 32-bit immediates in
11473 // 64-bit parameters, so we need to consider super-registers.
11474 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
11475 return std::nullopt;
11476
11477 if (!MI.getOperand(1).isImm())
11478 return std::nullopt;
// Operand 1 is the immediate and operand 2 the shift amount applied to it;
// the result is returned with a null expression.
11479 int64_t Immediate = MI.getOperand(1).getImm();
11480 int Shift = MI.getOperand(2).getImm();
11481 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
11482 nullptr);
11483 }
11484 case AArch64::ORRWrs:
11485 case AArch64::ORRXrs:
11486 return describeORRLoadedValue(MI, Reg, this, TRI);
11487 }
11488
11490}
11491
11492bool AArch64InstrInfo::isExtendLikelyToBeFolded(
11493 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
11494 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
11495 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
11496 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
11497
11498 // Anyexts are nops.
11499 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
11500 return true;
11501
11502 Register DefReg = ExtMI.getOperand(0).getReg();
11503 if (!MRI.hasOneNonDBGUse(DefReg))
11504 return false;
11505
11506 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
11507 // addressing mode.
11508 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
11509 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
11510}
11511
11512uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
11513 return get(Opc).TSFlags & AArch64::ElementSizeMask;
11514}
11515
11516bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
11517 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
11518}
11519
11520bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
11521 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
11522}
11523
11524unsigned int
11525AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
11526 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
11527}
11528
11529bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
11530 unsigned Scale) const {
11531 if (Offset && Scale)
11532 return false;
11533
11534 // Check Reg + Imm
11535 if (!Scale) {
11536 // 9-bit signed offset
11537 if (isInt<9>(Offset))
11538 return true;
11539
11540 // 12-bit unsigned offset
11541 unsigned Shift = Log2_64(NumBytes);
11542 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
11543 // Must be a multiple of NumBytes (NumBytes is a power of 2)
11544 (Offset >> Shift) << Shift == Offset)
11545 return true;
11546 return false;
11547 }
11548
11549 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
11550 return Scale == 1 || (Scale > 0 && Scale == NumBytes);
11551}
11552
// Selects the indirect-call opcode: BLRNoIP when the subtarget reports
// hardenSlsBlr(), plain BLR otherwise.
// NOTE(review): numbered line 11553 (the function signature, which must
// provide `MF`) is missing from this listing -- confirm the name and
// signature against the upstream file.
11554 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
11555 return AArch64::BLRNoIP;
11556 else
11557 return AArch64::BLR;
11558}
11559
// Builds a PAUTH_EPILOGUE before the first terminator of MBB and attaches the
// implicit register definitions it needs: X17 and X16 (plus X15 with PAuthLR)
// when the frame lowering reports argument stack to restore, otherwise X16
// only when PAuthLR branch protection is requested without hardware PAuthLR.
// NOTE(review): numbered lines 11560 (start of the signature, presumably
// providing `MBB`) and 11564 (continuation of the BuildMI chain) are missing
// from this listing; the name is taken from the call in signOutlinedFunction.
11561 DebugLoc DL) const {
11562 MachineBasicBlock::iterator InsertPt = MBB.getFirstTerminator();
11563 auto Builder = BuildMI(MBB, InsertPt, DL, get(AArch64::PAUTH_EPILOGUE))
11565
11566 MachineFunction &MF = *MBB.getParent();
11567 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
11568 auto &AFL = *static_cast<const AArch64FrameLowering *>(
11569 MF.getSubtarget().getFrameLowering());
11570 if (AFL.getArgumentStackToRestore(MF, MBB)) {
11571 Builder.addReg(AArch64::X17, RegState::ImplicitDefine);
11572 Builder.addReg(AArch64::X16, RegState::ImplicitDefine);
11573 if (Subtarget.hasPAuthLR())
11574 Builder.addReg(AArch64::X15, RegState::ImplicitDefine);
11575 return;
11576 }
11577
11578 if (AFI->branchProtectionPAuthLR() && !Subtarget.hasPAuthLR())
11579 Builder.addReg(AArch64::X16, RegState::ImplicitDefine);
11580}
11581
// Emits a stack-probing loop after MBBI. Three new blocks are created after
// MBB: LoopTest lowers SP by the probe size and compares it with TargetReg,
// LoopBody loads from [SP] and branches back, and Exit sets SP to TargetReg,
// loads from [SP] once more and receives the instructions that followed MBBI.
// Returns an iterator to the beginning of the exit block.
// NOTE(review): numbered lines 11582 (return type), 11604 (initialiser of
// `Flags`), 11616, 11621 (branch condition), 11630-11632, 11646 and 11657 are
// missing from this listing -- confirm against the upstream file.
11583AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
11584 Register TargetReg, bool FrameSetup) const {
11585 assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
11586
11587 MachineBasicBlock &MBB = *MBBI->getParent();
11588 MachineFunction &MF = *MBB.getParent();
11589 const AArch64InstrInfo *TII =
11590 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
11591 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
11592 DebugLoc DL = MBB.findDebugLoc(MBBI);
11593
// The new blocks are inserted directly after MBB in the order
// LoopTest, LoopBody, Exit.
11594 MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
11595 MachineBasicBlock *LoopTestMBB =
11596 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11597 MF.insert(MBBInsertPoint, LoopTestMBB);
11598 MachineBasicBlock *LoopBodyMBB =
11599 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11600 MF.insert(MBBInsertPoint, LoopBodyMBB);
11601 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11602 MF.insert(MBBInsertPoint, ExitMBB);
11603 MachineInstr::MIFlag Flags =
11605
11606 // LoopTest:
11607 // SUB SP, SP, #ProbeSize
11608 emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
11609 AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
11610
11611 // CMP SP, TargetReg
11612 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
11613 AArch64::XZR)
11614 .addReg(AArch64::SP)
11615 .addReg(TargetReg)
11617 .setMIFlags(Flags);
11618
11619 // B.<Cond> LoopExit
11620 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
11622 .addMBB(ExitMBB)
11623 .setMIFlags(Flags);
11624
11625 // LDR XZR, [SP]
11626 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::LDRXui))
11627 .addDef(AArch64::XZR)
11628 .addReg(AArch64::SP)
11629 .addImm(0)
11633 Align(8)))
11634 .setMIFlags(Flags);
11635
11636 // B loop
11637 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
11638 .addMBB(LoopTestMBB)
11639 .setMIFlags(Flags);
11640
11641 // LoopExit:
11642 // MOV SP, TargetReg
11643 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
11644 .addReg(TargetReg)
11645 .addImm(0)
11647 .setMIFlags(Flags);
11648
11649 // LDR XZR, [SP]
11650 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
11651 .addReg(AArch64::XZR, RegState::Define)
11652 .addReg(AArch64::SP)
11653 .addImm(0)
11654 .setMIFlags(Flags);
11655
// Move everything that followed MBBI in the original block into the exit
// block, then wire up the control-flow edges of the new loop.
11656 ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
11658
11659 LoopTestMBB->addSuccessor(ExitMBB);
11660 LoopTestMBB->addSuccessor(LoopBodyMBB);
11661 LoopBodyMBB->addSuccessor(LoopTestMBB);
11662 MBB.addSuccessor(LoopTestMBB);
11663
11664 // Update liveins.
11665 if (MF.getRegInfo().reservedRegsFrozen())
11666 fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});
11667
11668 return ExitMBB->begin();
11669}
11670
11671namespace {
// File-local TargetInstrInfo::PipelinerLoopInfo implementation. It stores the
// loop block, its conditional branch, the compare and counter-update
// instructions, the initial counter register and a normalized branch
// condition captured at construction time.
// NOTE(review): numbered line 11696 (presumably the declaration of the `Cond`
// member initialised by the constructor) is missing from this listing.
11672class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
11673 MachineFunction *MF;
11674 const TargetInstrInfo *TII;
11675 const TargetRegisterInfo *TRI;
11676 MachineRegisterInfo &MRI;
11677
11678 /// The block of the loop
11679 MachineBasicBlock *LoopBB;
11680 /// The conditional branch of the loop
11681 MachineInstr *CondBranch;
11682 /// The compare instruction for loop control
11683 MachineInstr *Comp;
11684 /// The number of the operand of the loop counter value in Comp
11685 unsigned CompCounterOprNum;
11686 /// The instruction that updates the loop counter value
11687 MachineInstr *Update;
11688 /// The number of the operand of the loop counter value in Update
11689 unsigned UpdateCounterOprNum;
11690 /// The initial value of the loop counter
11691 Register Init;
11692 /// True iff Update is a predecessor of Comp
11693 bool IsUpdatePriorComp;
11694
11695 /// The normalized condition used by createTripCountGreaterCondition()
11697
11698public:
// MF, TII, TRI and MRI are all derived from the function that contains Comp;
// Cond is copied element-wise into the member of the same name.
11699 AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
11700 MachineInstr *Comp, unsigned CompCounterOprNum,
11701 MachineInstr *Update, unsigned UpdateCounterOprNum,
11702 Register Init, bool IsUpdatePriorComp,
11703 const SmallVectorImpl<MachineOperand> &Cond)
11704 : MF(Comp->getParent()->getParent()),
11705 TII(MF->getSubtarget().getInstrInfo()),
11706 TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
11707 LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
11708 CompCounterOprNum(CompCounterOprNum), Update(Update),
11709 UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
11710 IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
11711
11712 bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
11713 // Make the instructions for loop control be placed in stage 0.
11714 // The predecessors of Comp are considered by the caller.
11715 return MI == Comp;
11716 }
11717
// Hands back the stored normalized condition and returns an empty optional;
// TC and MBB are not used here.
11718 std::optional<bool> createTripCountGreaterCondition(
11719 int TC, MachineBasicBlock &MBB,
11720 SmallVectorImpl<MachineOperand> &CondParam) override {
11721 // A branch instruction will be inserted as "if (Cond) goto epilogue".
11722 // Cond is normalized for such use.
11723 // The predecessors of the branch are assumed to have already been inserted.
11724 CondParam = Cond;
11725 return {};
11726 }
11727
// Declared here only; the definition is not part of this class body.
11728 void createRemainingIterationsGreaterCondition(
11729 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11730 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
11731
// The next two hooks are intentionally empty in this implementation.
11732 void setPreheader(MachineBasicBlock *NewPreheader) override {}
11733
11734 void adjustTripCount(int TripCountAdjust) override {}
11735
11736 bool isMVEExpanderSupported() override { return true; }
11737};
11738} // namespace
11739
11740/// Clone an instruction from MI. The register of ReplaceOprNum-th operand
11741/// is replaced by ReplaceReg. The output register is newly created.
11742/// The other operands are unchanged from MI.
11743static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
11744 Register ReplaceReg, MachineBasicBlock &MBB,
11745 MachineBasicBlock::iterator InsertTo) {
11746 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
11747 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
11748 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
11749 Register Result = 0;
11750 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
11751 if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
11752 Result = MRI.createVirtualRegister(
11753 MRI.getRegClass(NewMI->getOperand(0).getReg()));
11754 NewMI->getOperand(I).setReg(Result);
11755 } else if (I == ReplaceOprNum) {
11756 MRI.constrainRegClass(ReplaceReg, TII->getRegClass(NewMI->getDesc(), I));
11757 NewMI->getOperand(I).setReg(ReplaceReg);
11758 }
11759 }
11760 MBB.insert(InsertTo, NewMI);
11761 return Result;
11762}
11763
11764void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
11767 // Create and accumulate conditions for next TC iterations.
11768 // Example:
11769 // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
11770 // # iteration of the kernel
11771 //
11772 // # insert the following instructions
11773 // cond = CSINCXr 0, 0, C, implicit $nzcv
11774 // counter = ADDXri counter, 1 # clone from this->Update
11775 // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
11776 // cond = CSINCXr cond, cond, C, implicit $nzcv
11777 // ... (repeat TC times)
11778 // SUBSXri cond, 0, implicit-def $nzcv
11779
11780 assert(CondBranch->getOpcode() == AArch64::Bcc);
11781 // CondCode to exit the loop
11783 (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
11784 if (CondBranch->getOperand(1).getMBB() == LoopBB)
11786
11787 // Accumulate conditions to exit the loop
11788 Register AccCond = AArch64::XZR;
11789
11790 // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
11791 auto AccumulateCond = [&](Register CurCond,
11793 Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
11794 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
11795 .addReg(NewCond, RegState::Define)
11796 .addReg(CurCond)
11797 .addReg(CurCond)
11799 return NewCond;
11800 };
11801
11802 if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
11803 // Update and Comp for I==0 are already exists in MBB
11804 // (MBB is an unrolled kernel)
11805 Register Counter;
11806 for (int I = 0; I <= TC; ++I) {
11807 Register NextCounter;
11808 if (I != 0)
11809 NextCounter =
11810 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11811
11812 AccCond = AccumulateCond(AccCond, CC);
11813
11814 if (I != TC) {
11815 if (I == 0) {
11816 if (Update != Comp && IsUpdatePriorComp) {
11817 Counter =
11818 LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11819 NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
11820 MBB.end());
11821 } else {
11822 // can use already calculated value
11823 NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
11824 }
11825 } else if (Update != Comp) {
11826 NextCounter =
11827 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11828 }
11829 }
11830 Counter = NextCounter;
11831 }
11832 } else {
11833 Register Counter;
11834 if (LastStage0Insts.empty()) {
11835 // use initial counter value (testing if the trip count is sufficient to
11836 // be executed by pipelined code)
11837 Counter = Init;
11838 if (IsUpdatePriorComp)
11839 Counter =
11840 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11841 } else {
11842 // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
11843 Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11844 }
11845
11846 for (int I = 0; I <= TC; ++I) {
11847 Register NextCounter;
11848 NextCounter =
11849 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11850 AccCond = AccumulateCond(AccCond, CC);
11851 if (I != TC && Update != Comp)
11852 NextCounter =
11853 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11854 Counter = NextCounter;
11855 }
11856 }
11857
11858 // If AccCond == 0, the remainder is greater than TC.
11859 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
11860 .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
11861 .addReg(AccCond)
11862 .addImm(0)
11863 .addImm(0);
11864 Cond.clear();
11866}
11867
11868static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
11869 Register &RegMBB, Register &RegOther) {
11870 assert(Phi.getNumOperands() == 5);
11871 if (Phi.getOperand(2).getMBB() == MBB) {
11872 RegMBB = Phi.getOperand(1).getReg();
11873 RegOther = Phi.getOperand(3).getReg();
11874 } else {
11875 assert(Phi.getOperand(4).getMBB() == MBB);
11876 RegMBB = Phi.getOperand(3).getReg();
11877 RegOther = Phi.getOperand(1).getReg();
11878 }
11879}
11880
11882 if (!Reg.isVirtual())
11883 return false;
11884 const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11885 return MRI.getVRegDef(Reg)->getParent() != BB;
11886}
11887
11888/// If Reg is an induction variable, return true and set some parameters
11889static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
11890 MachineInstr *&UpdateInst,
11891 unsigned &UpdateCounterOprNum, Register &InitReg,
11892 bool &IsUpdatePriorComp) {
11893 // Example:
11894 //
11895 // Preheader:
11896 // InitReg = ...
11897 // LoopBB:
11898 // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
11899 // Reg = COPY Reg0 ; COPY is ignored.
11900 // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
11901 // ; Reg is the value calculated in the previous
11902 // ; iteration, so IsUpdatePriorComp == false.
11903
11904 if (LoopBB->pred_size() != 2)
11905 return false;
11906 if (!Reg.isVirtual())
11907 return false;
11908 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
11909 UpdateInst = nullptr;
11910 UpdateCounterOprNum = 0;
11911 InitReg = 0;
11912 IsUpdatePriorComp = true;
11913 Register CurReg = Reg;
11914 while (true) {
11915 MachineInstr *Def = MRI.getVRegDef(CurReg);
11916 if (Def->getParent() != LoopBB)
11917 return false;
11918 if (Def->isCopy()) {
11919 // Ignore copy instructions unless they contain subregisters
11920 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
11921 return false;
11922 CurReg = Def->getOperand(1).getReg();
11923 } else if (Def->isPHI()) {
11924 if (InitReg != 0)
11925 return false;
11926 if (!UpdateInst)
11927 IsUpdatePriorComp = false;
11928 extractPhiReg(*Def, LoopBB, CurReg, InitReg);
11929 } else {
11930 if (UpdateInst)
11931 return false;
11932 switch (Def->getOpcode()) {
11933 case AArch64::ADDSXri:
11934 case AArch64::ADDSWri:
11935 case AArch64::SUBSXri:
11936 case AArch64::SUBSWri:
11937 case AArch64::ADDXri:
11938 case AArch64::ADDWri:
11939 case AArch64::SUBXri:
11940 case AArch64::SUBWri:
11941 UpdateInst = Def;
11942 UpdateCounterOprNum = 1;
11943 break;
11944 case AArch64::ADDSXrr:
11945 case AArch64::ADDSWrr:
11946 case AArch64::SUBSXrr:
11947 case AArch64::SUBSWrr:
11948 case AArch64::ADDXrr:
11949 case AArch64::ADDWrr:
11950 case AArch64::SUBXrr:
11951 case AArch64::SUBWrr:
11952 UpdateInst = Def;
11953 if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
11954 UpdateCounterOprNum = 1;
11955 else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
11956 UpdateCounterOprNum = 2;
11957 else
11958 return false;
11959 break;
11960 default:
11961 return false;
11962 }
11963 CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
11964 }
11965
11966 if (!CurReg.isVirtual())
11967 return false;
11968 if (Reg == CurReg)
11969 break;
11970 }
11971
11972 if (!UpdateInst)
11973 return false;
11974
11975 return true;
11976}
11977
11978std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
11980 // Accept loops that meet the following conditions
11981 // * The conditional branch is BCC
11982 // * The compare instruction is ADDS/SUBS/WHILEXX
11983 // * One operand of the compare is an induction variable and the other is a
11984 // loop invariant value
11985 // * The induction variable is incremented/decremented by a single instruction
11986 // * Does not contain CALL or instructions which have unmodeled side effects
11987
11988 for (MachineInstr &MI : *LoopBB)
11989 if (MI.isCall() || MI.hasUnmodeledSideEffects())
11990 // This instruction may use NZCV, which interferes with the instruction to
11991 // be inserted for loop control.
11992 return nullptr;
11993
11994 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
11996 if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
11997 return nullptr;
11998
11999 // Infinite loops are not supported
12000 if (TBB == LoopBB && FBB == LoopBB)
12001 return nullptr;
12002
12003 // Must be conditional branch
12004 if (TBB != LoopBB && FBB == nullptr)
12005 return nullptr;
12006
12007 assert((TBB == LoopBB || FBB == LoopBB) &&
12008 "The Loop must be a single-basic-block loop");
12009
12010 MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
12012
12013 if (CondBranch->getOpcode() != AArch64::Bcc)
12014 return nullptr;
12015
12016 // Normalization for createTripCountGreaterCondition()
12017 if (TBB == LoopBB)
12019
12020 MachineInstr *Comp = nullptr;
12021 unsigned CompCounterOprNum = 0;
12022 for (MachineInstr &MI : reverse(*LoopBB)) {
12023 if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
12024 // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
12025 // operands is a loop invariant value
12026
12027 switch (MI.getOpcode()) {
12028 case AArch64::SUBSXri:
12029 case AArch64::SUBSWri:
12030 case AArch64::ADDSXri:
12031 case AArch64::ADDSWri:
12032 Comp = &MI;
12033 CompCounterOprNum = 1;
12034 break;
12035 case AArch64::ADDSWrr:
12036 case AArch64::ADDSXrr:
12037 case AArch64::SUBSWrr:
12038 case AArch64::SUBSXrr:
12039 Comp = &MI;
12040 break;
12041 default:
12042 if (isWhileOpcode(MI.getOpcode())) {
12043 Comp = &MI;
12044 break;
12045 }
12046 return nullptr;
12047 }
12048
12049 if (CompCounterOprNum == 0) {
12050 if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
12051 CompCounterOprNum = 2;
12052 else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
12053 CompCounterOprNum = 1;
12054 else
12055 return nullptr;
12056 }
12057 break;
12058 }
12059 }
12060 if (!Comp)
12061 return nullptr;
12062
12063 MachineInstr *Update = nullptr;
12064 Register Init;
12065 bool IsUpdatePriorComp;
12066 unsigned UpdateCounterOprNum;
12067 if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
12068 Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
12069 return nullptr;
12070
12071 return std::make_unique<AArch64PipelinerLoopInfo>(
12072 LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
12073 Init, IsUpdatePriorComp, Cond);
12074}
12075
12076/// verifyInstruction - Perform target specific instruction verification.
12077bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI,
12078 StringRef &ErrInfo) const {
12079 // Verify that immediate offsets on load/store instructions are within range.
12080 // Stack objects with an FI operand are excluded as they can be fixed up
12081 // during PEI.
12082 TypeSize Scale(0U, false), Width(0U, false);
12083 int64_t MinOffset, MaxOffset;
12084 if (getMemOpInfo(MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) {
12085 unsigned ImmIdx = getLoadStoreImmIdx(MI.getOpcode());
12086 if (MI.getOperand(ImmIdx).isImm() && !MI.getOperand(ImmIdx - 1).isFI()) {
12087 int64_t Imm = MI.getOperand(ImmIdx).getImm();
12088 if (Imm < MinOffset || Imm > MaxOffset) {
12089 ErrInfo = "Unexpected immediate on load/store instruction";
12090 return false;
12091 }
12092 }
12093 }
12094
12095 const MCInstrDesc &MCID = MI.getDesc();
12096 for (unsigned Op = 0; Op < MCID.getNumOperands(); Op++) {
12097 const MachineOperand &MO = MI.getOperand(Op);
12098 switch (MCID.operands()[Op].OperandType) {
12100 if (!MO.isImm() || MO.getImm() != 0) {
12101 ErrInfo = "OPERAND_IMPLICIT_IMM_0 should be 0";
12102 return false;
12103 }
12104 break;
12106 if (!MO.isImm() ||
12108 (AArch64_AM::getShiftValue(MO.getImm()) != 8 &&
12109 AArch64_AM::getShiftValue(MO.getImm()) != 16)) {
12110 ErrInfo = "OPERAND_SHIFT_MSL should be msl shift of 8 or 16";
12111 return false;
12112 }
12113 break;
12114 default:
12115 break;
12116 }
12117 }
12118 return true;
12119}
12120
12121#define GET_INSTRINFO_HELPERS
12122#define GET_INSTRMAP_INFO
12123#include "AArch64GenInstrInfo.inc"
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, unsigned NumRegs)
static cl::opt< unsigned > BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of Bcc instructions (DEBUG)"))
static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned MnegOpc, const TargetRegisterClass *RC)
genNeg - Helper to generate an intermediate negation of the second operand of Root
static bool isFrameStoreOpcode(int Opcode)
static cl::opt< unsigned > GatherOptSearchLimit("aarch64-search-limit", cl::Hidden, cl::init(2048), cl::desc("Restrict range of instructions to search for the " "machine-combiner gather pattern optimization"))
static bool getMaddPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find instructions that can be turned into madd.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr)
Find a condition code used by the instruction.
static MachineInstr * genFusedMultiplyAcc(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyAcc - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genFusedMultiplyAccNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate64(unsigned Opc)
static bool isFrameLoadOpcode(int Opcode)
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg)
static bool areCFlagsAccessedBetweenInstrs(MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, const TargetRegisterInfo *TRI, const AccessKind AccessToCheck=AK_All)
True when condition flags are accessed (either by writing or reading) on the instruction trace starti...
static bool getFMAPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Floating-Point Support.
static bool isADDSRegImm(unsigned Opcode)
static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI)
static bool isANDOpcode(MachineInstr &MI)
static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment, StringRef RegScale={})
static unsigned sForm(MachineInstr &Instr)
Get opcode of S version of Instr.
static bool isCombineInstrSettingFlag(unsigned Opc)
static bool getFNEGPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB, MachineInstr *&UpdateInst, unsigned &UpdateCounterOprNum, Register &InitReg, bool &IsUpdatePriorComp)
If Reg is an induction variable, return true and set some parameters.
static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc)
static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget, MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
Returns true if in a streaming call site region without SME-FA64.
static bool isPostIndexLdStOpcode(unsigned Opcode)
Return true if the opcode is a post-index ld/st instruction, which really loads from base+0.
static std::optional< unsigned > getLFIInstSizeInBytes(const MachineInstr &MI)
Return the maximum number of bytes of code the specified instruction may be after LFI rewriting.
static unsigned getBranchDisplacementBits(unsigned Opc)
static cl::opt< unsigned > CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9), cl::desc("Restrict range of CB instructions (DEBUG)"))
static std::optional< ParamLoadedValue > describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
If the given ORR instruction is a copy, and DescribedReg overlaps with the destination register then,...
static bool getFMULPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static void appendReadRegExpr(SmallVectorImpl< char > &Expr, unsigned RegNum)
static MachineInstr * genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, const TargetRegisterClass *RC)
genMaddR - Generate madd instruction and combine mul and add using an extra virtual register Example ...
static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, Register ReplaceReg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertTo)
Clone an instruction from MI.
static bool scaleOffset(unsigned Opc, int64_t &Offset)
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc)
unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale)
static MachineInstr * genFusedMultiplyIdx(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyIdx - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genIndexedMultiply(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxDupOp, unsigned MulOpc, const TargetRegisterClass *RC, MachineRegisterInfo &MRI)
Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
static bool isSUBSRegImm(unsigned Opcode)
static bool UpdateOperandRegClass(MachineInstr &Instr)
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
static bool isInStreamingCallSiteRegion(MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
Returns true if the instruction at I is in a streaming call site region, within a single basic block.
static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, int CmpValue, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > &CCUseInstrs, bool &IsInvertCC)
unsigned unscaledOffsetOpcode(unsigned Opcode)
static bool getLoadPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Search for patterns of LD instructions we can optimize.
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI)
Check if CmpInstr can be substituted by MI.
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC)
static bool isCombineInstrCandidateFP(const MachineInstr &Inst)
static void appendLoadRegExpr(SmallVectorImpl< char > &Expr, int64_t OffsetFromDefCFA)
static void appendConstantExpr(SmallVectorImpl< char > &Expr, int64_t Constant, dwarf::LocationAtom Operation)
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI)
Return the opcode that does not set flags when possible - otherwise return the original opcode.
static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool isCombineInstrCandidate32(unsigned Opc)
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, SmallVectorImpl< MachineOperand > &Cond)
static unsigned offsetExtendOpcode(unsigned Opcode)
MachineOutlinerMBBFlags
@ LRUnavailableSomewhere
@ UnsafeRegsDead
static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register DestReg, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
static void generateGatherLanePattern(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned Pattern, unsigned NumLanes)
Generate optimized instruction sequence for gather load patterns to improve Memory-Level Parallelism ...
static bool getMiscPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find other MI combine patterns.
static bool outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static const MachineInstrBuilder & AddSubReg(const MachineInstrBuilder &MIB, MCRegister Reg, unsigned SubIdx, RegState State, const TargetRegisterInfo *TRI)
static bool outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, int64_t Offset1, unsigned Opcode1, int FI2, int64_t Offset2, unsigned Opcode2)
static cl::opt< unsigned > TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"))
static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB, Register &RegMBB, Register &RegOther)
static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, unsigned Reg, const StackOffset &Offset)
static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB)
static MachineInstr * genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC, FMAInstKind kind=FMAInstKind::Default, const Register *ReplacedAddend=nullptr)
genFusedMultiply - Generate fused multiply instructions.
static bool getGatherLanePattern(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, unsigned LoadLaneOpCode, unsigned NumLanes)
Check if the given instruction forms a gather load pattern that can be optimized for better Memory-Le...
static MachineInstr * genFusedMultiplyIdxNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate(unsigned Opc)
static unsigned regOffsetOpcode(unsigned Opcode)
MachineOutlinerClass
Constants defining how certain sequences should be outlined.
@ MachineOutlinerTailCall
Emit a save, restore, call, and return.
@ MachineOutlinerRegSave
Emit a call and tail-call.
@ MachineOutlinerNoLRSave
Only emit a branch.
@ MachineOutlinerThunk
Emit a call and return.
@ MachineOutlinerDefault
static cl::opt< unsigned > BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26), cl::desc("Restrict range of B instructions (DEBUG)"))
static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB)
Check if AArch64::NZCV should be alive in successors of MBB.
static void emitFrameOffsetAdj(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, int64_t Offset, unsigned Opc, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFAOffset, StackOffset CFAOffset, unsigned FrameReg)
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize)
static cl::opt< unsigned > CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"))
static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, unsigned IdxOpd1, DenseMap< Register, unsigned > &InstrIdxForVirtReg)
Do the following transformation A - (B + C) ==> (A - B) - C A - (B + C) ==> (A - C) - B.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, unsigned *NewReg=nullptr)
static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, const AArch64InstrInfo *TII, bool ShouldSignReturnAddr)
static MachineInstr * genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs)
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc, unsigned ZeroReg)
static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register SrcReg, bool IsKill, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
DXIL Forward Handle Accesses
@ Default
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Machine Check Debug Module
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
PowerPC Reduce CR logical Operation
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:484
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition Debug.h:72
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, unsigned CombineOpc=0)
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
SignReturnAddress getSignReturnAddressCondition() const
void setOutliningStyle(const std::string &Style)
std::optional< bool > hasRedZone() const
static bool shouldSignReturnAddress(SignReturnAddress Condition, bool IsLRSpilled)
static bool isHForm(const MachineInstr &MI)
Returns whether the instruction is in H form (16 bit operands)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool hasBTISemantics(const MachineInstr &MI)
Returns whether the instruction can be compatible with non-zero BTYPE.
static bool isQForm(const MachineInstr &MI)
Returns whether the instruction is in Q form (128 bit operands)
static bool getMemOpInfo(unsigned Opcode, TypeSize &Scale, TypeSize &Width, int64_t &MinOffset, int64_t &MaxOffset)
Returns true if opcode Opc is a memory operation.
static bool isTailCallReturnInst(const MachineInstr &MI)
Returns true if MI is one of the TCRETURN* instructions.
static bool isFPRCopy(const MachineInstr &MI)
Does this instruction rename an FPR without modifying bits?
MachineInstr * emitLdStWithAddr(MachineInstr &MemI, const ExtAddrMode &AM) const override
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies value from one register to an...
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
GetInstSize - Return the number of bytes of code the specified instruction may be.
static bool isZExtLoad(const MachineInstr &MI)
Returns whether the instruction is a zero-extending load.
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
static bool isGPRCopy(const MachineInstr &MI)
Does this instruction rename a GPR without modifying bits?
static unsigned convertToFlagSettingOpc(unsigned Opc)
Return the opcode that set flags when possible.
void createPauthEpilogueInstr(MachineBasicBlock &MBB, DebugLoc DL) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
bool canInsertSelect(const MachineBasicBlock &, ArrayRef< MachineOperand > Cond, Register, Register, Register, int &, int &, int &) const override
Register isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
static const MachineOperand & getLdStOffsetOp(const MachineInstr &MI)
Returns the immediate offset operator of a load/store.
bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, unsigned &SubIdx) const override
static std::optional< unsigned > getUnscaledLdSt(unsigned Opc)
Returns the unscaled load/store for the scaled load/store opcode, if there is a corresponding unscale...
static bool hasUnscaledLdStOffset(unsigned Opc)
Return true if it has an unscaled load/store offset.
static const MachineOperand & getLdStAmountOp(const MachineInstr &MI)
Returns the shift amount operand of a load/store.
static bool isPreLdSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load/store.
std::optional< ExtAddrMode > getAddrModeFromMemoryOp(const MachineInstr &MemI, const TargetRegisterInfo *TRI) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &MI, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const override
bool analyzeBranchPredicate(MachineBasicBlock &MBB, MachineBranchPredicate &MBP, bool AllowModify) const override
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isPairableLdStInst(const MachineInstr &MI)
Return true if the given load or store may be paired with another.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSExtLoad(const MachineInstr &MI)
Returns whether the instruction is a sign-extending load.
const AArch64RegisterInfo & getRegisterInfo() const
getRegisterInfo - TargetInstrInfo is a superset of MRegister info.
static bool isPreSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed store.
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
AArch64InstrInfo(const AArch64Subtarget &STI)
static bool isPairedLdSt(const MachineInstr &MI)
Returns whether the instruction is a paired load/store.
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
bool getMemOperandWithOffsetWidth(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, TypeSize &Width, const TargetRegisterInfo *TRI) const
If OffsetIsScalable is set to 'true', the offset is scaled by vscale.
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isStridedAccess(const MachineInstr &MI)
Return true if the given load or store is a strided memory access.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
Detect opportunities for ldp/stp formation.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
bool isThroughputPattern(unsigned Pattern) const override
Return true when a code sequence can improve throughput.
MachineOperand & getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const
Return the immediate offset of the base register in a load/store LdSt.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg, const MachineInstr &AddrI, ExtAddrMode &AM) const override
static bool isLdStPairSuppressed(const MachineInstr &MI)
Return true if pairing the given load or store is hinted to be unprofitable.
Register isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
optimizeCompareInstr - Convert the instruction supplying the argument to the comparison into one that...
static unsigned getLoadStoreImmIdx(unsigned Opc)
Returns the index for the immediate for a given instruction.
static bool isGPRZero(const MachineInstr &MI)
Does this instruction set its full destination register to zero?
void copyGPRRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, unsigned ZeroReg, llvm::ArrayRef< unsigned > Indices) const
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
analyzeCompare - For a comparison instruction, return the source registers in SrcReg and SrcReg2,...
CombinerObjective getCombinerObjective(unsigned Pattern) const override
static bool isFpOrNEON(Register Reg)
Returns whether the physical register is FP or NEON.
bool isAsCheapAsAMove(const MachineInstr &MI) const override
std::optional< DestSourcePair > isCopyLikeInstrImpl(const MachineInstr &MI) const override
static void suppressLdStPair(MachineInstr &MI)
Hint that pairing the given load or store is unprofitable.
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static bool isPreLd(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load.
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef< unsigned > Indices) const
bool optimizeCondBranch(MachineInstr &MI) const override
Replace csincr-branch sequence by simple conditional branch.
static int getMemScale(unsigned Opc)
Scaling factor for (scaled or unscaled) load or store.
bool isCandidateToMergeOrPair(const MachineInstr &MI) const
Return true if this is a load/store that can be potentially paired/merged.
MCInst getNop() const override
static const MachineOperand & getLdStBaseOp(const MachineInstr &MI)
Returns the base register operand of a load/store.
bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
This is an important base class in LLVM.
Definition Constant.h:43
A debug info location.
Definition DebugLoc.h:124
bool empty() const
Definition DenseMap.h:173
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:286
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:688
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:685
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
static LocationSize precise(uint64_t Value)
This class is intended to be used as a base class for asm properties and features specific to the tar...
Definition MCAsmInfo.h:66
bool usesWindowsCFI() const
Definition MCAsmInfo.h:674
static MCCFIInstruction cfiDefCfa(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa defines a rule for computing CFA as: take address from Register and add Offset to it.
Definition MCDwarf.h:615
static MCCFIInstruction createOffset(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_offset Previous value of Register is saved at offset Offset from CFA.
Definition MCDwarf.h:657
static MCCFIInstruction cfiDefCfaOffset(MCSymbol *L, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa_offset modifies a rule for computing CFA.
Definition MCDwarf.h:630
static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals, SMLoc Loc={}, StringRef Comment="")
.cfi_escape Allows the user to add arbitrary bytes to the unwind info.
Definition MCDwarf.h:727
Instances of this class represent a single low-level machine instruction.
Definition MCInst.h:188
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isValid() const
Definition MCRegister.h:84
static constexpr unsigned NoRegister
Definition MCRegister.h:60
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1561
Set of metadata that should be preserved when using BuildMI().
bool isInlineAsmBrIndirectTarget() const
Returns true if this is the indirect dest of an INLINEASM_BR.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
reverse_instr_iterator instr_rbegin()
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
reverse_instr_iterator instr_rend()
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
void setStackID(int ObjectIdx, uint8_t ID)
bool isCalleeSavedInfoValid() const
Has the callee saved info been calculated yet?
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
unsigned getNumObjects() const
Return the number of objects.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
unsigned addFrameInst(const MCCFIInstruction &Inst)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addCFIIndex(unsigned CFIIndex) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
reverse_iterator getReverse() const
Get a reverse iterator to the same node.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
bool isCall(QueryType Type=AnyInBundle) const
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
LLVM_ABI uint32_t mergeFlagsWith(const MachineInstr &Other) const
Return the MIFlags which represent both MachineInstrs.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool registerDefIsDead(Register Reg, const TargetRegisterInfo *TRI) const
Returns true if the register is dead in this machine instruction.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI bool isLoadFoldBarrier() const
Returns true if it is illegal to fold a load across this instruction.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void addRegisterDefined(Register Reg, const TargetRegisterInfo *RegInfo=nullptr)
We have determined MI defines a register.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
A description of a memory reference used in the backend.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
This class contains meta information specific to a module.
LLVM_ABI MachineFunction * getMachineFunction(const Function &F) const
Returns the MachineFunction associated to IR function F if there is one, otherwise nullptr.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineBasicBlock * getMBB() const
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
bool tracksLiveness() const
tracksLiveness - Returns true when tracking register liveness accurately.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
bool def_empty(Register RegNo) const
def_empty - Return true if there are no instructions defining the specified register (it may be live-...
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
bool hasOneDef(Register RegNo) const
Return true if there is exactly one operand defining the specified register.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
MI-level patchpoint operands.
Definition StackMaps.h:77
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given patchpoint should emit.
Definition StackMaps.h:105
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static constexpr bool isVirtualRegister(unsigned Reg)
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:66
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents a location in source code.
Definition SMLoc.h:22
bool erase(PtrType Ptr)
Remove pointer from the set.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
bool empty() const
Definition SmallSet.h:169
bool erase(const T &V)
Definition SmallSet.h:200
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
MI-level stackmap operands.
Definition StackMaps.h:36
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given stackmap should emit.
Definition StackMaps.h:51
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
int64_t getFixed() const
Returns the fixed component of the stack.
Definition TypeSize.h:46
int64_t getScalable() const
Returns the scalable component of the stack.
Definition TypeSize.h:49
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:41
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
MI-level Statepoint operands.
Definition StackMaps.h:159
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given statepoint should emit.
Definition StackMaps.h:208
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
Object returned by analyzeLoopForPipelining.
TargetInstrInfo - Interface to description of machine instruction set.
virtual void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstIdxForVirtReg) const
When getMachineCombinerPatterns() finds patterns, this function generates the instructions that could...
virtual std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const
Produce the expression describing the MI loading a value into the physical register Reg.
virtual bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
virtual bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const
Test if the given instruction should be considered a scheduling boundary.
virtual CombinerObjective getCombinerObjective(unsigned Pattern) const
Return the objective of a combiner pattern.
virtual bool isFunctionSafeToSplit(const MachineFunction &MF) const
Return true if the function is a viable candidate for machine function splitting.
const MCAsmInfo & getMCAsmInfo() const
Return target specific asm information.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
Value * getOperand(unsigned i) const
Definition User.h:207
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
self_iterator getIterator()
Definition ilist_node.h:123
A raw_ostream that writes to an std::string.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_S
MO_S - Indicates that the bits of the symbol operand represented by MO_G0 etc are signed.
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_PREL
MO_PREL - Indicates that the bits of the symbol operand represented by MO_G0 etc are PC relative.
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_ARM64EC_CALLMANGLE
MO_ARM64EC_CALLMANGLE - Operand refers to the Arm64EC-mangled version of a symbol,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_TAGGED
MO_TAGGED - With MO_PAGE, indicates that the page includes a memory tag in bits 56-63.
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
unsigned getCheckerSizeInBytes(AuthCheckMethod Method)
Returns the number of bytes added by checkAuthenticatedRegister.
static uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize)
decodeLogicalImmediate - Decode a logical immediate value in the form "N:immr:imms" (where the immr a...
static unsigned getShiftValue(unsigned Imm)
getShiftValue - Extract the shift value.
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
constexpr bool isLegalArithImmed(const uint64_t C)
isLegalArithImmed -
static unsigned getArithShiftValue(unsigned Imm)
getArithShiftValue - get the arithmetic shift value.
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static AArch64_AM::ShiftExtendType getExtendType(unsigned Imm)
getExtendType - Extract the extend type for operands of arithmetic ops.
static AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm)
static AArch64_AM::ShiftExtendType getShiftType(unsigned Imm)
getShiftType - Extract the shift type.
static unsigned getShifterImm(AArch64_AM::ShiftExtendType ST, unsigned Imm)
getShifterImm - Encode the shift type and amount: imm: 6-bit shift amount shifter: 000 ==> lsl 001 ==...
void expandMOVAddr(unsigned Opcode, unsigned TargetFlags, bool IsTargetMachO, SmallVectorImpl< AddrInsnModel > &Insn)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
static const uint64_t InstrFlagIsWhile
static const uint64_t InstrFlagIsPTestLike
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
constexpr double e
InstrType
Represents how an instruction should be mapped by the outliner.
NodeAddr< InstrNode * > Instr
Definition RDFGraph.h:389
iterator end() const
Definition BasicBlock.h:89
LLVM_ABI Instruction & back() const
LLVM_ABI iterator begin() const
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Offset
Definition DWP.cpp:558
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
static bool isCondBranchOpcode(int Opc)
MCCFIInstruction createDefCFA(const TargetRegisterInfo &TRI, unsigned FrameReg, unsigned Reg, const StackOffset &Offset, bool LastAdjustmentWasScalable=true)
static bool isPTrueOpcode(unsigned Opc)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool succeeded(LogicalResult Result)
Utility function that returns true if the provided LogicalResult corresponds to a success value.
int isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &Offset, bool *OutUseUnscaledOp=nullptr, unsigned *OutUnscaledOp=nullptr, int64_t *EmittableOffset=nullptr)
Check if the Offset is a valid frame offset for MI.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Renamable
Register that may be renamed.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
static bool isIndirectBranchOpcode(int Opc)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
unsigned getBLRCallOpcode(const MachineFunction &MF)
Return opcode to be used for indirect calls.
@ AArch64FrameOffsetIsLegal
Offset is legal.
@ AArch64FrameOffsetCanUpdate
Offset can apply, at least partly.
@ AArch64FrameOffsetCannotUpdate
Offset cannot apply.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition MathExtras.h:284
Op::Description Desc
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
static bool isSEHInstruction(const MachineInstr &MI)
bool isLFIPrePostMemAccess(unsigned Opcode)
Returns true if Opcode is a pre- or post-indexed memory access that the LFI rewriter expands with a b...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
AArch64MachineCombinerPattern
@ MULSUBv8i16_OP2
@ FMULv4i16_indexed_OP1
@ FMLSv1i32_indexed_OP2
@ MULSUBv2i32_indexed_OP1
@ FMLAv2i32_indexed_OP2
@ MULADDv4i16_indexed_OP2
@ FMLAv1i64_indexed_OP1
@ MULSUBv16i8_OP1
@ FMLAv8i16_indexed_OP2
@ FMULv2i32_indexed_OP1
@ MULSUBv8i16_indexed_OP2
@ FMLAv1i64_indexed_OP2
@ MULSUBv4i16_indexed_OP2
@ FMLAv1i32_indexed_OP1
@ FMLAv2i64_indexed_OP2
@ FMLSv8i16_indexed_OP1
@ MULSUBv2i32_OP1
@ FMULv4i16_indexed_OP2
@ MULSUBv4i32_indexed_OP2
@ FMULv2i64_indexed_OP2
@ FMLAv4i32_indexed_OP1
@ MULADDv4i16_OP2
@ FMULv8i16_indexed_OP2
@ MULSUBv4i16_OP1
@ MULADDv4i32_OP2
@ MULADDv2i32_OP2
@ MULADDv16i8_OP2
@ FMLSv4i16_indexed_OP1
@ MULADDv16i8_OP1
@ FMLAv2i64_indexed_OP1
@ FMLAv1i32_indexed_OP2
@ FMLSv2i64_indexed_OP2
@ MULADDv2i32_OP1
@ MULADDv4i32_OP1
@ MULADDv2i32_indexed_OP1
@ MULSUBv16i8_OP2
@ MULADDv4i32_indexed_OP1
@ MULADDv2i32_indexed_OP2
@ FMLAv4i16_indexed_OP2
@ MULSUBv8i16_OP1
@ FMULv2i32_indexed_OP2
@ FMLSv2i32_indexed_OP2
@ FMLSv4i32_indexed_OP1
@ FMULv2i64_indexed_OP1
@ MULSUBv4i16_OP2
@ FMLSv4i16_indexed_OP2
@ FMLAv2i32_indexed_OP1
@ FMLSv2i32_indexed_OP1
@ FMLAv8i16_indexed_OP1
@ MULSUBv4i16_indexed_OP1
@ FMLSv4i32_indexed_OP2
@ MULADDv4i32_indexed_OP2
@ MULSUBv4i32_OP2
@ MULSUBv8i16_indexed_OP1
@ MULADDv8i16_OP2
@ MULSUBv2i32_indexed_OP2
@ FMULv4i32_indexed_OP2
@ FMLSv2i64_indexed_OP1
@ MULADDv4i16_OP1
@ FMLAv4i32_indexed_OP2
@ MULADDv8i16_indexed_OP1
@ FMULv4i32_indexed_OP1
@ FMLAv4i16_indexed_OP1
@ FMULv8i16_indexed_OP1
@ MULADDv8i16_OP1
@ MULSUBv4i32_indexed_OP1
@ MULSUBv4i32_OP1
@ FMLSv8i16_indexed_OP2
@ MULADDv8i16_indexed_OP2
@ MULSUBv2i32_OP2
@ FMLSv1i64_indexed_OP2
@ MULADDv4i16_indexed_OP1
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag=MachineInstr::NoFlags, bool SetNZCV=false, bool NeedsWinCFI=false, bool *HasWinCFI=nullptr, bool EmitCFAOffset=false, StackOffset InitialOffset={}, unsigned FrameReg=AArch64::SP)
emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg plus Offset.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr RegState getDefRegState(bool B)
CombinerObjective
The combiner's goal may differ based on which pattern it is attempting to optimize.
std::optional< UsedNZCV > examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > *CCUseInstrs=nullptr)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
static MCRegister getXRegFromWReg(MCRegister Reg)
MCCFIInstruction createCFAOffset(const TargetRegisterInfo &MRI, unsigned Reg, const StackOffset &OffsetFromDefCFA, std::optional< int64_t > IncomingVGOffsetFromDefCFA)
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
static bool isUncondBranchOpcode(int Opc)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2192
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, StackOffset &Offset, const AArch64InstrInfo *TII)
rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the FP.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
static const MachineMemOperand::Flags MOSuppressPair
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
void appendLEB128(SmallVectorImpl< U > &Buffer, T Value)
Definition LEB128.h:236
bool optimizeTerminators(MachineBasicBlock *MBB, const TargetInstrInfo &TII)
std::pair< MachineOperand, DIExpression * > ParamLoadedValue
bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, const MachineInstr &UseMI, const TargetRegisterInfo *TRI)
Return true if there is an instruction /after/ DefMI and before UseMI which either reads or clobbers ...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
static const MachineMemOperand::Flags MOStridedAccess
constexpr RegState getUndefRegState(bool B)
void fullyRecomputeLiveIns(ArrayRef< MachineBasicBlock * > MBBs)
Convenience function for recomputing live-ins for a set of MBBs until the computation converges.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to describe addressing mode similar to ExtAddrMode in CodeGenPrepare.
LLVM_ABI static const MBBSectionID ColdSectionID
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
An individual sequence of instructions to be replaced with a call to an outlined function.
MachineFunction * getMF() const
The information necessary to create an outlined function for some class of candidate.