LLVM 23.0.0git
AArch64InstrInfo.cpp
Go to the documentation of this file.
1//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the AArch64 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64InstrInfo.h"
14#include "AArch64ExpandImm.h"
16#include "AArch64PointerAuth.h"
17#include "AArch64Subtarget.h"
22#include "llvm/ADT/ArrayRef.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
45#include "llvm/IR/DebugLoc.h"
46#include "llvm/IR/GlobalValue.h"
47#include "llvm/IR/Module.h"
48#include "llvm/MC/MCAsmInfo.h"
49#include "llvm/MC/MCInst.h"
51#include "llvm/MC/MCInstrDesc.h"
56#include "llvm/Support/LEB128.h"
60#include <cassert>
61#include <cstdint>
62#include <iterator>
63#include <utility>
64
65using namespace llvm;
66
67#define GET_INSTRINFO_CTOR_DTOR
68#include "AArch64GenInstrInfo.inc"
69
70#define DEBUG_TYPE "AArch64InstrInfo"
71
72STATISTIC(NumCopyInstrs, "Number of COPY instructions expanded");
73STATISTIC(NumZCRegMoveInstrsGPR, "Number of zero-cycle GPR register move "
74 "instructions expanded from canonical COPY");
75STATISTIC(NumZCRegMoveInstrsFPR, "Number of zero-cycle FPR register move "
76 "instructions expanded from canonical COPY");
77STATISTIC(NumZCZeroingInstrsGPR, "Number of zero-cycle GPR zeroing "
78 "instructions expanded from canonical COPY");
79// NumZCZeroingInstrsFPR is counted at AArch64AsmPrinter
80
82 CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9),
83 cl::desc("Restrict range of CB instructions (DEBUG)"));
84
86 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
87 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
88
90 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
91 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
92
94 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
95 cl::desc("Restrict range of Bcc instructions (DEBUG)"));
96
98 BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
99 cl::desc("Restrict range of B instructions (DEBUG)"));
100
102 "aarch64-search-limit", cl::Hidden, cl::init(2048),
103 cl::desc("Restrict range of instructions to search for the "
104 "machine-combiner gather pattern optimization"));
105
107 : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN,
108 AArch64::ADJCALLSTACKUP, AArch64::CATCHRET),
109 RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}
110
111/// Return the maximum number of bytes of code the specified instruction may be
112/// after LFI rewriting. If the instruction is not rewritten, std::nullopt is
113/// returned (use default sizing).
114///
115/// NOTE: the size estimates here must be kept in sync with the rewrites in
116/// AArch64MCLFIRewriter.cpp. Sizes may be overestimates of the rewritten
117/// instruction sequences.
118static std::optional<unsigned> getLFIInstSizeInBytes(const MachineInstr &MI) {
119 switch (MI.getOpcode()) {
120 case AArch64::SVC:
121 // SVC expands to 4 instructions.
122 return 16;
123 case AArch64::BR:
124 case AArch64::BLR:
125 // Indirect branches/calls expand to 2 instructions (guard + br/blr).
126 return 8;
127 case AArch64::RET:
128 // RET through LR is not rewritten, but RET through another register
129 // expands to 2 instructions (guard + ret).
130 if (MI.getOperand(0).getReg() != AArch64::LR)
131 return 8;
132 return 4;
133 case AArch64::SYSxt:
134 // VA-based DC/IC ops (op1=3, Cn=7, op2=1) expand to 2 instructions.
135 if (MI.getOperand(0).getImm() == 3 && MI.getOperand(1).getImm() == 7 &&
136 MI.getOperand(3).getImm() == 1)
137 return 8;
138 return std::nullopt;
139 default:
140 break;
141 }
142
143 // Detect instructions that explicitly define SP or LR.
144 bool ModifiesLR = false;
145 bool ModifiesSP = false;
146 for (const MachineOperand &MO : MI.defs()) {
147 if (!MO.isReg())
148 continue;
149 if (MO.getReg() == AArch64::LR)
150 ModifiesLR = true;
151 else if (MO.getReg() == AArch64::SP)
152 ModifiesSP = true;
153 }
154
155 // Memory accesses expand to a base-register guard plus the rewritten access
156 // (8 bytes), with an extra base-register update for pre/post-index forms (12
157 // bytes total). If the access also defines LR, an LR mask is appended (+4
158 // bytes). Depending on additional optimizations that the rewriter performs,
159 // this may be an overestimate.
160 if (MI.mayLoadOrStore()) {
161 unsigned Size = isLFIPrePostMemAccess(MI.getOpcode()) ? 12 : 8;
162 if (ModifiesLR)
163 Size += 4;
164 return Size;
165 }
166
167 // Non memory operations that modify LR or SP expand to 2 instructions.
168 if (ModifiesSP || ModifiesLR)
169 return 8;
170
171 // Default case: instructions that don't cause expansion.
172 // - TP accesses in LFI are a single load/store, so no expansion.
173 // - All remaining instructions are not rewritten.
174 return std::nullopt;
175}
176
177/// GetInstSize - Return the number of bytes of code the specified
178/// instruction may be. This returns the maximum number of bytes.
180 const MachineBasicBlock &MBB = *MI.getParent();
181 const MachineFunction *MF = MBB.getParent();
182 const Function &F = MF->getFunction();
183 const MCAsmInfo &MAI = MF->getTarget().getMCAsmInfo();
184
185 {
186 auto Op = MI.getOpcode();
187 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
188 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), MAI);
189 }
190
191 // Meta-instructions emit no code.
192 if (MI.isMetaInstruction())
193 return 0;
194
195 // FIXME: We currently only handle pseudoinstructions that don't get expanded
196 // before the assembly printer.
197 unsigned NumBytes = 0;
198 const MCInstrDesc &Desc = MI.getDesc();
199
200 // LFI rewriter expansions that supersede normal sizing.
201 const auto &STI = MF->getSubtarget<AArch64Subtarget>();
202 if (STI.isLFI())
203 if (auto Size = getLFIInstSizeInBytes(MI))
204 return *Size;
205
206 if (!MI.isBundle() && isTailCallReturnInst(MI)) {
207 NumBytes = Desc.getSize() ? Desc.getSize() : 4;
208
209 const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
210 if (!MFI->shouldSignReturnAddress(*MF))
211 return NumBytes;
212
213 auto Method = STI.getAuthenticatedLRCheckMethod(*MF);
214 NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
215 return NumBytes;
216 }
217
218 // Size should be preferably set in
219 // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
220 // Specific cases handle instructions of variable sizes
221 switch (Desc.getOpcode()) {
222 default:
223 if (Desc.getSize())
224 return Desc.getSize();
225
226 // Anything not explicitly designated otherwise (i.e. pseudo-instructions
227 // with fixed constant size but not specified in .td file) is a normal
228 // 4-byte insn.
229 NumBytes = 4;
230 break;
231 case TargetOpcode::STACKMAP:
232 // The upper bound for a stackmap intrinsic is the full length of its shadow
233 NumBytes = StackMapOpers(&MI).getNumPatchBytes();
234 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
235 break;
236 case TargetOpcode::PATCHPOINT:
237 // The size of the patchpoint intrinsic is the number of bytes requested
238 NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
239 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
240 break;
241 case TargetOpcode::STATEPOINT:
242 NumBytes = StatepointOpers(&MI).getNumPatchBytes();
243 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
244 // No patch bytes means a normal call inst is emitted
245 if (NumBytes == 0)
246 NumBytes = 4;
247 break;
248 case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
249 // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
250 // instructions are expanded to the specified number of NOPs. Otherwise,
251 // they are expanded to 36-byte XRay sleds.
252 NumBytes =
253 F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
254 break;
255 case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
256 case TargetOpcode::PATCHABLE_TAIL_CALL:
257 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
258 // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
259 NumBytes = 36;
260 break;
261 case TargetOpcode::PATCHABLE_EVENT_CALL:
262 // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
263 NumBytes = 24;
264 break;
265
266 case AArch64::SPACE:
267 NumBytes = MI.getOperand(1).getImm();
268 break;
269 case AArch64::MOVaddr:
270 case AArch64::MOVaddrJT:
271 case AArch64::MOVaddrCP:
272 case AArch64::MOVaddrBA:
273 case AArch64::MOVaddrTLS:
274 case AArch64::MOVaddrEXT: {
275 // Use the same logic as the pseudo expansion to count instructions.
278 MI.getOperand(1).getTargetFlags(),
279 Subtarget.isTargetMachO(), Insn);
280 NumBytes = Insn.size() * 4;
281 break;
282 }
283
284 case AArch64::MOVi32imm:
285 case AArch64::MOVi64imm: {
286 // Use the same logic as the pseudo expansion to count instructions.
287 unsigned BitSize = Desc.getOpcode() == AArch64::MOVi32imm ? 32 : 64;
289 AArch64_IMM::expandMOVImm(MI.getOperand(1).getImm(), BitSize, Insn);
290 NumBytes = Insn.size() * 4;
291 break;
292 }
293
294 case TargetOpcode::BUNDLE:
295 NumBytes = getInstBundleSize(MI);
296 break;
297 }
298
299 return NumBytes;
300}
301
304 // Block ends with fall-through condbranch.
305 switch (LastInst->getOpcode()) {
306 default:
307 llvm_unreachable("Unknown branch instruction?");
308 case AArch64::Bcc:
309 Target = LastInst->getOperand(1).getMBB();
310 Cond.push_back(LastInst->getOperand(0));
311 break;
312 case AArch64::CBZW:
313 case AArch64::CBZX:
314 case AArch64::CBNZW:
315 case AArch64::CBNZX:
316 Target = LastInst->getOperand(1).getMBB();
317 Cond.push_back(MachineOperand::CreateImm(-1));
318 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
319 Cond.push_back(LastInst->getOperand(0));
320 break;
321 case AArch64::TBZW:
322 case AArch64::TBZX:
323 case AArch64::TBNZW:
324 case AArch64::TBNZX:
325 Target = LastInst->getOperand(2).getMBB();
326 Cond.push_back(MachineOperand::CreateImm(-1));
327 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
328 Cond.push_back(LastInst->getOperand(0));
329 Cond.push_back(LastInst->getOperand(1));
330 break;
331 case AArch64::CBWPri:
332 case AArch64::CBXPri:
333 case AArch64::CBWPrr:
334 case AArch64::CBXPrr:
335 Target = LastInst->getOperand(3).getMBB();
336 Cond.push_back(MachineOperand::CreateImm(-1));
337 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
338 Cond.push_back(LastInst->getOperand(0));
339 Cond.push_back(LastInst->getOperand(1));
340 Cond.push_back(LastInst->getOperand(2));
341 break;
342 case AArch64::CBBAssertExt:
343 case AArch64::CBHAssertExt:
344 Target = LastInst->getOperand(3).getMBB();
345 Cond.push_back(MachineOperand::CreateImm(-1)); // -1
346 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); // Opc
347 Cond.push_back(LastInst->getOperand(0)); // Cond
348 Cond.push_back(LastInst->getOperand(1)); // Op0
349 Cond.push_back(LastInst->getOperand(2)); // Op1
350 Cond.push_back(LastInst->getOperand(4)); // Ext0
351 Cond.push_back(LastInst->getOperand(5)); // Ext1
352 break;
353 }
354}
355
356static unsigned getBranchDisplacementBits(unsigned Opc) {
357 switch (Opc) {
358 default:
359 llvm_unreachable("unexpected opcode!");
360 case AArch64::B:
361 return BDisplacementBits;
362 case AArch64::TBNZW:
363 case AArch64::TBZW:
364 case AArch64::TBNZX:
365 case AArch64::TBZX:
366 return TBZDisplacementBits;
367 case AArch64::CBNZW:
368 case AArch64::CBZW:
369 case AArch64::CBNZX:
370 case AArch64::CBZX:
371 return CBZDisplacementBits;
372 case AArch64::Bcc:
373 return BCCDisplacementBits;
374 case AArch64::CBWPri:
375 case AArch64::CBXPri:
376 case AArch64::CBBAssertExt:
377 case AArch64::CBHAssertExt:
378 case AArch64::CBWPrr:
379 case AArch64::CBXPrr:
380 return CBDisplacementBits;
381 }
382}
383
385 int64_t BrOffset) const {
386 unsigned Bits = getBranchDisplacementBits(BranchOp);
387 assert(Bits >= 3 && "max branch displacement must be enough to jump"
388 "over conditional branch expansion");
389 return isIntN(Bits, BrOffset / 4);
390}
391
394 switch (MI.getOpcode()) {
395 default:
396 llvm_unreachable("unexpected opcode!");
397 case AArch64::B:
398 return MI.getOperand(0).getMBB();
399 case AArch64::TBZW:
400 case AArch64::TBNZW:
401 case AArch64::TBZX:
402 case AArch64::TBNZX:
403 return MI.getOperand(2).getMBB();
404 case AArch64::CBZW:
405 case AArch64::CBNZW:
406 case AArch64::CBZX:
407 case AArch64::CBNZX:
408 case AArch64::Bcc:
409 return MI.getOperand(1).getMBB();
410 case AArch64::CBWPri:
411 case AArch64::CBXPri:
412 case AArch64::CBBAssertExt:
413 case AArch64::CBHAssertExt:
414 case AArch64::CBWPrr:
415 case AArch64::CBXPrr:
416 return MI.getOperand(3).getMBB();
417 }
418}
419
421 MachineBasicBlock &NewDestBB,
422 MachineBasicBlock &RestoreBB,
423 const DebugLoc &DL,
424 int64_t BrOffset,
425 RegScavenger *RS) const {
426 assert(RS && "RegScavenger required for long branching");
427 assert(MBB.empty() &&
428 "new block should be inserted for expanding unconditional branch");
429 assert(MBB.pred_size() == 1);
430 assert(RestoreBB.empty() &&
431 "restore block should be inserted for restoring clobbered registers");
432
433 auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
434 // Offsets outside of the signed 33-bit range are not supported for ADRP +
435 // ADD.
436 if (!isInt<33>(BrOffset))
438 "Branch offsets outside of the signed 33-bit range not supported");
439
440 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
441 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
442 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
443 .addReg(Reg)
444 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
445 .addImm(0);
446 BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
447 };
448
449 RS->enterBasicBlockEnd(MBB);
450 // If X16 is unused, we can rely on the linker to insert a range extension
451 // thunk if NewDestBB is out of range of a single B instruction.
452 constexpr Register Reg = AArch64::X16;
453 if (!RS->isRegUsed(Reg)) {
454 insertUnconditionalBranch(MBB, &NewDestBB, DL);
455 RS->setRegUsed(Reg);
456 return;
457 }
458
459 // In a cold block without BTI, insert the indirect branch if a register is
460 // free. Skip this if BTI is enabled to avoid inserting a BTI at the target,
461 // prioritizing a dynamic cost in cold code over a static cost in hot code.
462 AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
463 bool HasBTI = AFI && AFI->branchTargetEnforcement();
464 if (MBB.getSectionID() == MBBSectionID::ColdSectionID && !HasBTI) {
465 Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
466 if (Scavenged != AArch64::NoRegister) {
467 buildIndirectBranch(Scavenged, NewDestBB);
468 RS->setRegUsed(Scavenged);
469 return;
470 }
471 }
472
473 // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
474 // with red zones.
475 if (!AFI || AFI->hasRedZone().value_or(true))
477 "Unable to insert indirect branch inside function that has red zone");
478
479 // Otherwise, spill X16 and defer range extension to the linker.
480 BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
481 .addReg(AArch64::SP, RegState::Define)
482 .addReg(Reg)
483 .addReg(AArch64::SP)
484 .addImm(-16);
485
486 BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);
487
488 BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
489 .addReg(AArch64::SP, RegState::Define)
491 .addReg(AArch64::SP)
492 .addImm(16);
493}
494
495// Branch analysis.
498 MachineBasicBlock *&FBB,
500 bool AllowModify) const {
501 // If the block has no terminators, it just falls into the block after it.
502 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
503 if (I == MBB.end())
504 return false;
505
506 // Skip over SpeculationBarrierEndBB terminators
507 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
508 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
509 --I;
510 }
511
512 if (!isUnpredicatedTerminator(*I))
513 return false;
514
515 // Get the last instruction in the block.
516 MachineInstr *LastInst = &*I;
517
518 // If there is only one terminator instruction, process it.
519 unsigned LastOpc = LastInst->getOpcode();
520 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
521 if (isUncondBranchOpcode(LastOpc)) {
522 TBB = LastInst->getOperand(0).getMBB();
523 return false;
524 }
525 if (isCondBranchOpcode(LastOpc)) {
526 // Block ends with fall-through condbranch.
527 parseCondBranch(LastInst, TBB, Cond);
528 return false;
529 }
530 return true; // Can't handle indirect branch.
531 }
532
533 // Get the instruction before it if it is a terminator.
534 MachineInstr *SecondLastInst = &*I;
535 unsigned SecondLastOpc = SecondLastInst->getOpcode();
536
537 // If AllowModify is true and the block ends with two or more unconditional
538 // branches, delete all but the first unconditional branch.
539 if (AllowModify && isUncondBranchOpcode(LastOpc)) {
540 while (isUncondBranchOpcode(SecondLastOpc)) {
541 LastInst->eraseFromParent();
542 LastInst = SecondLastInst;
543 LastOpc = LastInst->getOpcode();
544 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
545 // Return now the only terminator is an unconditional branch.
546 TBB = LastInst->getOperand(0).getMBB();
547 return false;
548 }
549 SecondLastInst = &*I;
550 SecondLastOpc = SecondLastInst->getOpcode();
551 }
552 }
553
554 // If we're allowed to modify and the block ends in an unconditional branch
555 // which could simply fallthrough, remove the branch. (Note: This case only
556 // matters when we can't understand the whole sequence, otherwise it's also
557 // handled by BranchFolding.cpp.)
558 if (AllowModify && isUncondBranchOpcode(LastOpc) &&
559 MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
560 LastInst->eraseFromParent();
561 LastInst = SecondLastInst;
562 LastOpc = LastInst->getOpcode();
563 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
564 assert(!isUncondBranchOpcode(LastOpc) &&
565 "unreachable unconditional branches removed above");
566
567 if (isCondBranchOpcode(LastOpc)) {
568 // Block ends with fall-through condbranch.
569 parseCondBranch(LastInst, TBB, Cond);
570 return false;
571 }
572 return true; // Can't handle indirect branch.
573 }
574 SecondLastInst = &*I;
575 SecondLastOpc = SecondLastInst->getOpcode();
576 }
577
578 // If there are three terminators, we don't know what sort of block this is.
579 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
580 return true;
581
582 // If the block ends with a B and a Bcc, handle it.
583 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
584 parseCondBranch(SecondLastInst, TBB, Cond);
585 FBB = LastInst->getOperand(0).getMBB();
586 return false;
587 }
588
589 // If the block ends with two unconditional branches, handle it. The second
590 // one is not executed, so remove it.
591 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
592 TBB = SecondLastInst->getOperand(0).getMBB();
593 I = LastInst;
594 if (AllowModify)
595 I->eraseFromParent();
596 return false;
597 }
598
599 // ...likewise if it ends with an indirect branch followed by an unconditional
600 // branch.
601 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
602 I = LastInst;
603 if (AllowModify)
604 I->eraseFromParent();
605 return true;
606 }
607
608 // Otherwise, can't handle this.
609 return true;
610}
611
613 MachineBranchPredicate &MBP,
614 bool AllowModify) const {
615 // Use analyzeBranch to validate the branch pattern.
616 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
618 if (analyzeBranch(MBB, TBB, FBB, Cond, AllowModify))
619 return true;
620
621 // analyzeBranch returns success with empty Cond for unconditional branches.
622 if (Cond.empty())
623 return true;
624
625 MBP.TrueDest = TBB;
626 assert(MBP.TrueDest && "expected!");
627 MBP.FalseDest = FBB ? FBB : MBB.getNextNode();
628
629 MBP.ConditionDef = nullptr;
630 MBP.SingleUseCondition = false;
631
632 // Find the conditional branch. After analyzeBranch succeeds with non-empty
633 // Cond, there's exactly one conditional branch - either last (fallthrough)
634 // or second-to-last (followed by unconditional B).
635 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
636 if (I == MBB.end())
637 return true;
638
639 if (isUncondBranchOpcode(I->getOpcode())) {
640 if (I == MBB.begin())
641 return true;
642 --I;
643 }
644
645 MachineInstr *CondBranch = &*I;
646 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
647
648 switch (CondBranch->getOpcode()) {
649 default:
650 return true;
651
652 case AArch64::Bcc:
653 // Bcc takes the NZCV flag as the operand to branch on, walk up the
654 // instruction stream to find the last instruction to define NZCV.
656 if (MI.modifiesRegister(AArch64::NZCV, /*TRI=*/nullptr)) {
657 MBP.ConditionDef = &MI;
658 break;
659 }
660 }
661 return false;
662
663 case AArch64::CBZW:
664 case AArch64::CBZX:
665 case AArch64::CBNZW:
666 case AArch64::CBNZX: {
667 MBP.LHS = CondBranch->getOperand(0);
668 MBP.RHS = MachineOperand::CreateImm(0);
669 unsigned Opc = CondBranch->getOpcode();
670 MBP.Predicate = (Opc == AArch64::CBNZX || Opc == AArch64::CBNZW)
671 ? MachineBranchPredicate::PRED_NE
672 : MachineBranchPredicate::PRED_EQ;
673 Register CondReg = MBP.LHS.getReg();
674 if (CondReg.isVirtual())
675 MBP.ConditionDef = MRI.getVRegDef(CondReg);
676 return false;
677 }
678
679 case AArch64::TBZW:
680 case AArch64::TBZX:
681 case AArch64::TBNZW:
682 case AArch64::TBNZX: {
683 Register CondReg = CondBranch->getOperand(0).getReg();
684 if (CondReg.isVirtual())
685 MBP.ConditionDef = MRI.getVRegDef(CondReg);
686 return false;
687 }
688 }
689}
690
693 if (Cond[0].getImm() != -1) {
694 // Regular Bcc
695 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
697 } else {
698 // Folded compare-and-branch
699 switch (Cond[1].getImm()) {
700 default:
701 llvm_unreachable("Unknown conditional branch!");
702 case AArch64::CBZW:
703 Cond[1].setImm(AArch64::CBNZW);
704 break;
705 case AArch64::CBNZW:
706 Cond[1].setImm(AArch64::CBZW);
707 break;
708 case AArch64::CBZX:
709 Cond[1].setImm(AArch64::CBNZX);
710 break;
711 case AArch64::CBNZX:
712 Cond[1].setImm(AArch64::CBZX);
713 break;
714 case AArch64::TBZW:
715 Cond[1].setImm(AArch64::TBNZW);
716 break;
717 case AArch64::TBNZW:
718 Cond[1].setImm(AArch64::TBZW);
719 break;
720 case AArch64::TBZX:
721 Cond[1].setImm(AArch64::TBNZX);
722 break;
723 case AArch64::TBNZX:
724 Cond[1].setImm(AArch64::TBZX);
725 break;
726
727 // Cond is { -1, Opcode, CC, Op0, Op1, ... }
728 case AArch64::CBWPri:
729 case AArch64::CBXPri:
730 case AArch64::CBBAssertExt:
731 case AArch64::CBHAssertExt:
732 case AArch64::CBWPrr:
733 case AArch64::CBXPrr: {
734 // Pseudos using standard 4bit Arm condition codes
736 static_cast<AArch64CC::CondCode>(Cond[2].getImm());
738 }
739 }
740 }
741
742 return false;
743}
744
746 int *BytesRemoved) const {
747 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
748 if (I == MBB.end())
749 return 0;
750
751 if (!isUncondBranchOpcode(I->getOpcode()) &&
752 !isCondBranchOpcode(I->getOpcode()))
753 return 0;
754
755 // Remove the branch.
756 I->eraseFromParent();
757
758 I = MBB.end();
759
760 if (I == MBB.begin()) {
761 if (BytesRemoved)
762 *BytesRemoved = 4;
763 return 1;
764 }
765 --I;
766 if (!isCondBranchOpcode(I->getOpcode())) {
767 if (BytesRemoved)
768 *BytesRemoved = 4;
769 return 1;
770 }
771
772 // Remove the branch.
773 I->eraseFromParent();
774 if (BytesRemoved)
775 *BytesRemoved = 8;
776
777 return 2;
778}
779
780void AArch64InstrInfo::instantiateCondBranch(
783 if (Cond[0].getImm() != -1) {
784 // Regular Bcc
785 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
786 } else {
787 // Folded compare-and-branch
788 // Note that we use addOperand instead of addReg to keep the flags.
789
790 // cbz, cbnz
791 const MachineInstrBuilder MIB =
792 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
793
794 // tbz/tbnz
795 if (Cond.size() > 3)
796 MIB.add(Cond[3]);
797
798 // cb
799 if (Cond.size() > 4)
800 MIB.add(Cond[4]);
801
802 MIB.addMBB(TBB);
803
804 // cb[b,h]
805 if (Cond.size() > 5) {
806 MIB.addImm(Cond[5].getImm());
807 MIB.addImm(Cond[6].getImm());
808 }
809 }
810}
811
814 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
815 // Shouldn't be a fall through.
816 assert(TBB && "insertBranch must not be told to insert a fallthrough");
817
818 if (!FBB) {
819 if (Cond.empty()) // Unconditional branch?
820 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
821 else
822 instantiateCondBranch(MBB, DL, TBB, Cond);
823
824 if (BytesAdded)
825 *BytesAdded = 4;
826
827 return 1;
828 }
829
830 // Two-way conditional branch.
831 instantiateCondBranch(MBB, DL, TBB, Cond);
832 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
833
834 if (BytesAdded)
835 *BytesAdded = 8;
836
837 return 2;
838}
839
841 const TargetInstrInfo &TII) {
842 for (MachineInstr &MI : MBB->terminators()) {
843 unsigned Opc = MI.getOpcode();
844 switch (Opc) {
845 case AArch64::CBZW:
846 case AArch64::CBZX:
847 case AArch64::TBZW:
848 case AArch64::TBZX:
849 // CBZ/TBZ with WZR/XZR -> unconditional B
850 if (MI.getOperand(0).getReg() == AArch64::WZR ||
851 MI.getOperand(0).getReg() == AArch64::XZR) {
852 DEBUG_WITH_TYPE("optimizeTerminators",
853 dbgs() << "Removing always taken branch: " << MI);
854 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
855 SmallVector<MachineBasicBlock *> Succs(MBB->successors());
856 for (auto *S : Succs)
857 if (S != Target)
858 MBB->removeSuccessor(S);
859 DebugLoc DL = MI.getDebugLoc();
860 while (MBB->rbegin() != &MI)
861 MBB->rbegin()->eraseFromParent();
862 MI.eraseFromParent();
863 BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
864 return true;
865 }
866 break;
867 case AArch64::CBNZW:
868 case AArch64::CBNZX:
869 case AArch64::TBNZW:
870 case AArch64::TBNZX:
871 // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
872 if (MI.getOperand(0).getReg() == AArch64::WZR ||
873 MI.getOperand(0).getReg() == AArch64::XZR) {
874 DEBUG_WITH_TYPE("optimizeTerminators",
875 dbgs() << "Removing never taken branch: " << MI);
876 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
877 MI.getParent()->removeSuccessor(Target);
878 MI.eraseFromParent();
879 return true;
880 }
881 break;
882 }
883 }
884 return false;
885}
886
887// Find the original register that VReg is copied from.
888static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
889 while (Register::isVirtualRegister(VReg)) {
890 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
891 if (!DefMI->isFullCopy())
892 return VReg;
893 VReg = DefMI->getOperand(1).getReg();
894 }
895 return VReg;
896}
897
898// Determine if VReg is defined by an instruction that can be folded into a
899// csel instruction. If so, return the folded opcode, and the replacement
900// register.
901static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
902 unsigned *NewReg = nullptr) {
903 VReg = removeCopies(MRI, VReg);
905 return 0;
906
907 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
908 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
909 unsigned Opc = 0;
910 unsigned SrcReg = 0;
911 switch (DefMI->getOpcode()) {
912 case AArch64::SUBREG_TO_REG:
913 // Check for the following way to define an 64-bit immediate:
914 // %0:gpr32 = MOVi32imm 1
915 // %1:gpr64 = SUBREG_TO_REG %0:gpr32, %subreg.sub_32
916 if (!DefMI->getOperand(1).isReg())
917 return 0;
918 if (!DefMI->getOperand(2).isImm() ||
919 DefMI->getOperand(2).getImm() != AArch64::sub_32)
920 return 0;
921 DefMI = MRI.getVRegDef(DefMI->getOperand(1).getReg());
922 if (DefMI->getOpcode() != AArch64::MOVi32imm)
923 return 0;
924 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
925 return 0;
926 assert(Is64Bit);
927 SrcReg = AArch64::XZR;
928 Opc = AArch64::CSINCXr;
929 break;
930
931 case AArch64::MOVi32imm:
932 case AArch64::MOVi64imm:
933 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
934 return 0;
935 SrcReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
936 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
937 break;
938
939 case AArch64::ADDSXri:
940 case AArch64::ADDSWri:
941 // if NZCV is used, do not fold.
942 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
943 true) == -1)
944 return 0;
945 // fall-through to ADDXri and ADDWri.
946 [[fallthrough]];
947 case AArch64::ADDXri:
948 case AArch64::ADDWri:
949 // add x, 1 -> csinc.
950 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
951 DefMI->getOperand(3).getImm() != 0)
952 return 0;
953 SrcReg = DefMI->getOperand(1).getReg();
954 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
955 break;
956
957 case AArch64::ORNXrr:
958 case AArch64::ORNWrr: {
959 // not x -> csinv, represented as orn dst, xzr, src.
960 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
961 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
962 return 0;
963 SrcReg = DefMI->getOperand(2).getReg();
964 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
965 break;
966 }
967
968 case AArch64::SUBSXrr:
969 case AArch64::SUBSWrr:
970 // if NZCV is used, do not fold.
971 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
972 true) == -1)
973 return 0;
974 // fall-through to SUBXrr and SUBWrr.
975 [[fallthrough]];
976 case AArch64::SUBXrr:
977 case AArch64::SUBWrr: {
978 // neg x -> csneg, represented as sub dst, xzr, src.
979 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
980 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
981 return 0;
982 SrcReg = DefMI->getOperand(2).getReg();
983 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
984 break;
985 }
986 default:
987 return 0;
988 }
989 assert(Opc && SrcReg && "Missing parameters");
990
991 if (NewReg)
992 *NewReg = SrcReg;
993 return Opc;
994}
995
998 Register DstReg, Register TrueReg,
999 Register FalseReg, int &CondCycles,
1000 int &TrueCycles,
1001 int &FalseCycles) const {
1002 // Check register classes.
1003 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1004 const TargetRegisterClass *RC =
1005 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
1006 if (!RC)
1007 return false;
1008
1009 // Also need to check the dest regclass, in case we're trying to optimize
1010 // something like:
1011 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
1012 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
1013 return false;
1014
1015 // Expanding cbz/tbz requires an extra cycle of latency on the condition.
1016 unsigned ExtraCondLat = Cond.size() != 1;
1017
1018 // GPRs are handled by csel.
1019 // FIXME: Fold in x+1, -x, and ~x when applicable.
1020 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
1021 AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
1022 // Single-cycle csel, csinc, csinv, and csneg.
1023 CondCycles = 1 + ExtraCondLat;
1024 TrueCycles = FalseCycles = 1;
1025 if (canFoldIntoCSel(MRI, TrueReg))
1026 TrueCycles = 0;
1027 else if (canFoldIntoCSel(MRI, FalseReg))
1028 FalseCycles = 0;
1029 return true;
1030 }
1031
1032 // Scalar floating point is handled by fcsel.
1033 // FIXME: Form fabs, fmin, and fmax when applicable.
1034 if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
1035 AArch64::FPR32RegClass.hasSubClassEq(RC)) {
1036 CondCycles = 5 + ExtraCondLat;
1037 TrueCycles = FalseCycles = 2;
1038 return true;
1039 }
1040
1041 // Can't do vectors.
1042 return false;
1043}
1044
1047 const DebugLoc &DL, Register DstReg,
1049 Register TrueReg, Register FalseReg) const {
1050 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1051
1052 // Parse the condition code, see parseCondBranch() above.
1054 switch (Cond.size()) {
1055 default:
1056 llvm_unreachable("Unknown condition opcode in Cond");
1057 case 1: // b.cc
1058 CC = AArch64CC::CondCode(Cond[0].getImm());
1059 break;
1060 case 3: { // cbz/cbnz
1061 // We must insert a compare against 0.
1062 bool Is64Bit;
1063 switch (Cond[1].getImm()) {
1064 default:
1065 llvm_unreachable("Unknown branch opcode in Cond");
1066 case AArch64::CBZW:
1067 Is64Bit = false;
1068 CC = AArch64CC::EQ;
1069 break;
1070 case AArch64::CBZX:
1071 Is64Bit = true;
1072 CC = AArch64CC::EQ;
1073 break;
1074 case AArch64::CBNZW:
1075 Is64Bit = false;
1076 CC = AArch64CC::NE;
1077 break;
1078 case AArch64::CBNZX:
1079 Is64Bit = true;
1080 CC = AArch64CC::NE;
1081 break;
1082 }
1083 Register SrcReg = Cond[2].getReg();
1084 if (Is64Bit) {
1085 // cmp reg, #0 is actually subs xzr, reg, #0.
1086 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
1087 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
1088 .addReg(SrcReg)
1089 .addImm(0)
1090 .addImm(0);
1091 } else {
1092 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
1093 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
1094 .addReg(SrcReg)
1095 .addImm(0)
1096 .addImm(0);
1097 }
1098 break;
1099 }
1100 case 4: { // tbz/tbnz
1101 // We must insert a tst instruction.
1102 switch (Cond[1].getImm()) {
1103 default:
1104 llvm_unreachable("Unknown branch opcode in Cond");
1105 case AArch64::TBZW:
1106 case AArch64::TBZX:
1107 CC = AArch64CC::EQ;
1108 break;
1109 case AArch64::TBNZW:
1110 case AArch64::TBNZX:
1111 CC = AArch64CC::NE;
1112 break;
1113 }
1114 // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
1115 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
1116 BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
1117 .addReg(Cond[2].getReg())
1118 .addImm(
1120 else
1121 BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
1122 .addReg(Cond[2].getReg())
1123 .addImm(
1125 break;
1126 }
1127 case 5: { // cb
1128 // We must insert a cmp, that is a subs
1129 // 0 1 2 3 4
1130 // Cond is { -1, Opcode, CC, Op0, Op1 }
1131
1132 unsigned SubsOpc, SubsDestReg;
1133 bool IsImm = false;
1134 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1135 switch (Cond[1].getImm()) {
1136 default:
1137 llvm_unreachable("Unknown branch opcode in Cond");
1138 case AArch64::CBWPri:
1139 SubsOpc = AArch64::SUBSWri;
1140 SubsDestReg = AArch64::WZR;
1141 IsImm = true;
1142 break;
1143 case AArch64::CBXPri:
1144 SubsOpc = AArch64::SUBSXri;
1145 SubsDestReg = AArch64::XZR;
1146 IsImm = true;
1147 break;
1148 case AArch64::CBWPrr:
1149 SubsOpc = AArch64::SUBSWrr;
1150 SubsDestReg = AArch64::WZR;
1151 IsImm = false;
1152 break;
1153 case AArch64::CBXPrr:
1154 SubsOpc = AArch64::SUBSXrr;
1155 SubsDestReg = AArch64::XZR;
1156 IsImm = false;
1157 break;
1158 }
1159
1160 if (IsImm)
1161 BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
1162 .addReg(Cond[3].getReg())
1163 .addImm(Cond[4].getImm())
1164 .addImm(0);
1165 else
1166 BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
1167 .addReg(Cond[3].getReg())
1168 .addReg(Cond[4].getReg());
1169 } break;
1170 case 7: { // cb[b,h]
1171 // We must insert a cmp, that is a subs, but also zero- or sign-extensions
1172 // that have been folded. For the first operand we codegen an explicit
1173 // extension, for the second operand we fold the extension into cmp.
1174 // 0 1 2 3 4 5 6
1175 // Cond is { -1, Opcode, CC, Op0, Op1, Ext0, Ext1 }
1176
1177 // We need a new register for the now explicitly extended register
1178 Register Reg = Cond[4].getReg();
1180 unsigned ExtOpc;
1181 unsigned ExtBits;
1182 AArch64_AM::ShiftExtendType ExtendType =
1184 switch (ExtendType) {
1185 default:
1186 llvm_unreachable("Unknown shift-extend for CB instruction");
1187 case AArch64_AM::SXTB:
1188 assert(
1189 Cond[1].getImm() == AArch64::CBBAssertExt &&
1190 "Unexpected compare-and-branch instruction for SXTB shift-extend");
1191 ExtOpc = AArch64::SBFMWri;
1192 ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
1193 break;
1194 case AArch64_AM::SXTH:
1195 assert(
1196 Cond[1].getImm() == AArch64::CBHAssertExt &&
1197 "Unexpected compare-and-branch instruction for SXTH shift-extend");
1198 ExtOpc = AArch64::SBFMWri;
1199 ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
1200 break;
1201 case AArch64_AM::UXTB:
1202 assert(
1203 Cond[1].getImm() == AArch64::CBBAssertExt &&
1204 "Unexpected compare-and-branch instruction for UXTB shift-extend");
1205 ExtOpc = AArch64::ANDWri;
1206 ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
1207 break;
1208 case AArch64_AM::UXTH:
1209 assert(
1210 Cond[1].getImm() == AArch64::CBHAssertExt &&
1211 "Unexpected compare-and-branch instruction for UXTH shift-extend");
1212 ExtOpc = AArch64::ANDWri;
1213 ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
1214 break;
1215 }
1216
1217 // Build the explicit extension of the first operand
1218 Reg = MRI.createVirtualRegister(&AArch64::GPR32spRegClass);
1220 BuildMI(MBB, I, DL, get(ExtOpc), Reg).addReg(Cond[4].getReg());
1221 if (ExtOpc != AArch64::ANDWri)
1222 MBBI.addImm(0);
1223 MBBI.addImm(ExtBits);
1224 }
1225
1226 // Now, subs with an extended second operand
1228 AArch64_AM::ShiftExtendType ExtendType =
1230 MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
1231 MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
1232 BuildMI(MBB, I, DL, get(AArch64::SUBSWrx), AArch64::WZR)
1233 .addReg(Cond[3].getReg())
1234 .addReg(Reg)
1235 .addImm(AArch64_AM::getArithExtendImm(ExtendType, 0));
1236 } // If no extension is needed, just a regular subs
1237 else {
1238 MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
1239 MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
1240 BuildMI(MBB, I, DL, get(AArch64::SUBSWrr), AArch64::WZR)
1241 .addReg(Cond[3].getReg())
1242 .addReg(Reg);
1243 }
1244
1245 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1246 } break;
1247 }
1248
1249 unsigned Opc = 0;
1250 const TargetRegisterClass *RC = nullptr;
1251 bool TryFold = false;
1252 if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
1253 RC = &AArch64::GPR64RegClass;
1254 Opc = AArch64::CSELXr;
1255 TryFold = true;
1256 } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
1257 RC = &AArch64::GPR32RegClass;
1258 Opc = AArch64::CSELWr;
1259 TryFold = true;
1260 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
1261 RC = &AArch64::FPR64RegClass;
1262 Opc = AArch64::FCSELDrrr;
1263 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
1264 RC = &AArch64::FPR32RegClass;
1265 Opc = AArch64::FCSELSrrr;
1266 }
1267 assert(RC && "Unsupported regclass");
1268
1269 // Try folding simple instructions into the csel.
1270 if (TryFold) {
1271 unsigned NewReg = 0;
1272 unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewReg);
1273 if (FoldedOpc) {
1274 // The folded opcodes csinc, csinv and csneg apply the operation to
1275 // FalseReg, so we need to invert the condition.
1277 TrueReg = FalseReg;
1278 } else
1279 FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewReg);
1280
1281 // Fold the operation. Leave any dead instructions for DCE to clean up.
1282 if (FoldedOpc) {
1283 FalseReg = NewReg;
1284 Opc = FoldedOpc;
1285 // Extend the live range of NewReg.
1286 MRI.clearKillFlags(NewReg);
1287 }
1288 }
1289
1290 // Pull all virtual register into the appropriate class.
1291 MRI.constrainRegClass(TrueReg, RC);
1292 // FalseReg might be WZR or XZR if the folded operand is a literal 1.
1293 assert(
1294 (FalseReg.isVirtual() || FalseReg == AArch64::WZR ||
1295 FalseReg == AArch64::XZR) &&
1296 "FalseReg was folded into a non-virtual register other than WZR or XZR");
1297 if (FalseReg.isVirtual())
1298 MRI.constrainRegClass(FalseReg, RC);
1299
1300 // Insert the csel.
1301 BuildMI(MBB, I, DL, get(Opc), DstReg)
1302 .addReg(TrueReg)
1303 .addReg(FalseReg)
1304 .addImm(CC);
1305}
1306
1307// Return true if Imm can be loaded into a register by a "cheap" sequence of
1308// instructions. For now, "cheap" means at most two instructions.
1309static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
1310 if (BitSize == 32)
1311 return true;
1312
1313 assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
1314 uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
1316 AArch64_IMM::expandMOVImm(Imm, BitSize, Is);
1317
1318 return Is.size() <= 2;
1319}
1320
1321// Check if a COPY instruction is cheap.
1322static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) {
1323 assert(MI.isCopy() && "Expected COPY instruction");
1324 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
1325
1326 // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64,
1327 // typically requiring an FMOV instruction with a 2-6 cycle latency.
1328 auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * {
1329 if (Reg.isVirtual())
1330 return MRI.getRegClass(Reg);
1331 if (Reg.isPhysical())
1332 return RI.getMinimalPhysRegClass(Reg);
1333 return nullptr;
1334 };
1335 const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg());
1336 const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg());
1337 if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC))
1338 return false;
1339
1340 return MI.isAsCheapAsAMove();
1341}
1342
1343// FIXME: this implementation should be micro-architecture dependent, so a
1344// micro-architecture target hook should be introduced here in future.
1346 if (Subtarget.hasExynosCheapAsMoveHandling()) {
1347 if (isExynosCheapAsMove(MI))
1348 return true;
1349 return MI.isAsCheapAsAMove();
1350 }
1351
1352 switch (MI.getOpcode()) {
1353 default:
1354 return MI.isAsCheapAsAMove();
1355
1356 case TargetOpcode::COPY:
1357 return isCheapCopy(MI, RI);
1358
1359 case AArch64::ADDWrs:
1360 case AArch64::ADDXrs:
1361 case AArch64::SUBWrs:
1362 case AArch64::SUBXrs:
1363 return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;
1364
1365 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
1366 // ORRXri, it is as cheap as MOV.
1367 // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
1368 case AArch64::MOVi32imm:
1369 return isCheapImmediate(MI, 32);
1370 case AArch64::MOVi64imm:
1371 return isCheapImmediate(MI, 64);
1372 }
1373}
1374
1375bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
1376 switch (MI.getOpcode()) {
1377 default:
1378 return false;
1379
1380 case AArch64::ADDWrs:
1381 case AArch64::ADDXrs:
1382 case AArch64::ADDSWrs:
1383 case AArch64::ADDSXrs: {
1384 unsigned Imm = MI.getOperand(3).getImm();
1385 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1386 if (ShiftVal == 0)
1387 return true;
1388 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
1389 }
1390
1391 case AArch64::ADDWrx:
1392 case AArch64::ADDXrx:
1393 case AArch64::ADDXrx64:
1394 case AArch64::ADDSWrx:
1395 case AArch64::ADDSXrx:
1396 case AArch64::ADDSXrx64: {
1397 unsigned Imm = MI.getOperand(3).getImm();
1398 switch (AArch64_AM::getArithExtendType(Imm)) {
1399 default:
1400 return false;
1401 case AArch64_AM::UXTB:
1402 case AArch64_AM::UXTH:
1403 case AArch64_AM::UXTW:
1404 case AArch64_AM::UXTX:
1405 return AArch64_AM::getArithShiftValue(Imm) <= 4;
1406 }
1407 }
1408
1409 case AArch64::SUBWrs:
1410 case AArch64::SUBSWrs: {
1411 unsigned Imm = MI.getOperand(3).getImm();
1412 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1413 return ShiftVal == 0 ||
1414 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
1415 }
1416
1417 case AArch64::SUBXrs:
1418 case AArch64::SUBSXrs: {
1419 unsigned Imm = MI.getOperand(3).getImm();
1420 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1421 return ShiftVal == 0 ||
1422 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
1423 }
1424
1425 case AArch64::SUBWrx:
1426 case AArch64::SUBXrx:
1427 case AArch64::SUBXrx64:
1428 case AArch64::SUBSWrx:
1429 case AArch64::SUBSXrx:
1430 case AArch64::SUBSXrx64: {
1431 unsigned Imm = MI.getOperand(3).getImm();
1432 switch (AArch64_AM::getArithExtendType(Imm)) {
1433 default:
1434 return false;
1435 case AArch64_AM::UXTB:
1436 case AArch64_AM::UXTH:
1437 case AArch64_AM::UXTW:
1438 case AArch64_AM::UXTX:
1439 return AArch64_AM::getArithShiftValue(Imm) == 0;
1440 }
1441 }
1442
1443 case AArch64::LDRBBroW:
1444 case AArch64::LDRBBroX:
1445 case AArch64::LDRBroW:
1446 case AArch64::LDRBroX:
1447 case AArch64::LDRDroW:
1448 case AArch64::LDRDroX:
1449 case AArch64::LDRHHroW:
1450 case AArch64::LDRHHroX:
1451 case AArch64::LDRHroW:
1452 case AArch64::LDRHroX:
1453 case AArch64::LDRQroW:
1454 case AArch64::LDRQroX:
1455 case AArch64::LDRSBWroW:
1456 case AArch64::LDRSBWroX:
1457 case AArch64::LDRSBXroW:
1458 case AArch64::LDRSBXroX:
1459 case AArch64::LDRSHWroW:
1460 case AArch64::LDRSHWroX:
1461 case AArch64::LDRSHXroW:
1462 case AArch64::LDRSHXroX:
1463 case AArch64::LDRSWroW:
1464 case AArch64::LDRSWroX:
1465 case AArch64::LDRSroW:
1466 case AArch64::LDRSroX:
1467 case AArch64::LDRWroW:
1468 case AArch64::LDRWroX:
1469 case AArch64::LDRXroW:
1470 case AArch64::LDRXroX:
1471 case AArch64::PRFMroW:
1472 case AArch64::PRFMroX:
1473 case AArch64::STRBBroW:
1474 case AArch64::STRBBroX:
1475 case AArch64::STRBroW:
1476 case AArch64::STRBroX:
1477 case AArch64::STRDroW:
1478 case AArch64::STRDroX:
1479 case AArch64::STRHHroW:
1480 case AArch64::STRHHroX:
1481 case AArch64::STRHroW:
1482 case AArch64::STRHroX:
1483 case AArch64::STRQroW:
1484 case AArch64::STRQroX:
1485 case AArch64::STRSroW:
1486 case AArch64::STRSroX:
1487 case AArch64::STRWroW:
1488 case AArch64::STRWroX:
1489 case AArch64::STRXroW:
1490 case AArch64::STRXroX: {
1491 unsigned IsSigned = MI.getOperand(3).getImm();
1492 return !IsSigned;
1493 }
1494 }
1495}
1496
1497bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1498 unsigned Opc = MI.getOpcode();
1499 switch (Opc) {
1500 default:
1501 return false;
1502 case AArch64::SEH_StackAlloc:
1503 case AArch64::SEH_SaveFPLR:
1504 case AArch64::SEH_SaveFPLR_X:
1505 case AArch64::SEH_SaveReg:
1506 case AArch64::SEH_SaveReg_X:
1507 case AArch64::SEH_SaveRegP:
1508 case AArch64::SEH_SaveRegP_X:
1509 case AArch64::SEH_SaveFReg:
1510 case AArch64::SEH_SaveFReg_X:
1511 case AArch64::SEH_SaveFRegP:
1512 case AArch64::SEH_SaveFRegP_X:
1513 case AArch64::SEH_SetFP:
1514 case AArch64::SEH_AddFP:
1515 case AArch64::SEH_Nop:
1516 case AArch64::SEH_PrologEnd:
1517 case AArch64::SEH_EpilogStart:
1518 case AArch64::SEH_EpilogEnd:
1519 case AArch64::SEH_PACSignLR:
1520 case AArch64::SEH_SaveAnyRegI:
1521 case AArch64::SEH_SaveAnyRegIP:
1522 case AArch64::SEH_SaveAnyRegQP:
1523 case AArch64::SEH_SaveAnyRegQPX:
1524 case AArch64::SEH_AllocZ:
1525 case AArch64::SEH_SaveZReg:
1526 case AArch64::SEH_SavePReg:
1527 return true;
1528 }
1529}
1530
1532 Register &SrcReg, Register &DstReg,
1533 unsigned &SubIdx) const {
1534 switch (MI.getOpcode()) {
1535 default:
1536 return false;
1537 case AArch64::SBFMXri: // aka sxtw
1538 case AArch64::UBFMXri: // aka uxtw
1539 // Check for the 32 -> 64 bit extension case, these instructions can do
1540 // much more.
1541 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1542 return false;
1543 // This is a signed or unsigned 32 -> 64 bit extension.
1544 SrcReg = MI.getOperand(1).getReg();
1545 DstReg = MI.getOperand(0).getReg();
1546 SubIdx = AArch64::sub_32;
1547 return true;
1548 }
1549}
1550
1552 const MachineInstr &MIa, const MachineInstr &MIb) const {
1554 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1555 int64_t OffsetA = 0, OffsetB = 0;
1556 TypeSize WidthA(0, false), WidthB(0, false);
1557 bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1558
1559 assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1560 assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1561
1564 return false;
1565
1566 // Retrieve the base, offset from the base and width. Width
1567 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
1568 // base are identical, and the offset of a lower memory access +
1569 // the width doesn't overlap the offset of a higher memory access,
1570 // then the memory accesses are different.
1571 // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1572 // are assumed to have the same scale (vscale).
1573 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1574 WidthA, TRI) &&
1575 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1576 WidthB, TRI)) {
1577 if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1578 OffsetAIsScalable == OffsetBIsScalable) {
1579 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1580 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1581 TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1582 if (LowWidth.isScalable() == OffsetAIsScalable &&
1583 LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1584 return true;
1585 }
1586 }
1587 return false;
1588}
1589
1591 const MachineBasicBlock *MBB,
1592 const MachineFunction &MF) const {
1594 return true;
1595
1596 // Do not move an instruction that can be recognized as a branch target.
1597 if (hasBTISemantics(MI))
1598 return true;
1599
1600 switch (MI.getOpcode()) {
1601 case AArch64::HINT:
1602 // CSDB hints are scheduling barriers.
1603 if (MI.getOperand(0).getImm() == 0x14)
1604 return true;
1605 break;
1606 case AArch64::DSB:
1607 case AArch64::ISB:
1608 // DSB and ISB also are scheduling barriers.
1609 return true;
1610 case AArch64::MSRpstatesvcrImm1:
1611 // SMSTART and SMSTOP are also scheduling barriers.
1612 return true;
1613 default:;
1614 }
1615 if (isSEHInstruction(MI))
1616 return true;
1617 auto Next = std::next(MI.getIterator());
1618 return Next != MBB->end() && Next->isCFIInstruction();
1619}
1620
1621/// analyzeCompare - For a comparison instruction, return the source registers
1622/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1623/// Return true if the comparison instruction can be analyzed.
1625 Register &SrcReg2, int64_t &CmpMask,
1626 int64_t &CmpValue) const {
1627 // The first operand can be a frame index where we'd normally expect a
1628 // register.
1629 // FIXME: Pass subregisters out of analyzeCompare
1630 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1631 if (!MI.getOperand(1).isReg() || MI.getOperand(1).getSubReg())
1632 return false;
1633
1634 switch (MI.getOpcode()) {
1635 default:
1636 break;
1637 case AArch64::PTEST_PP:
1638 case AArch64::PTEST_PP_ANY:
1639 case AArch64::PTEST_PP_FIRST:
1640 SrcReg = MI.getOperand(0).getReg();
1641 SrcReg2 = MI.getOperand(1).getReg();
1642 if (MI.getOperand(2).getSubReg())
1643 return false;
1644
1645 // Not sure about the mask and value for now...
1646 CmpMask = ~0;
1647 CmpValue = 0;
1648 return true;
1649 case AArch64::SUBSWrr:
1650 case AArch64::SUBSWrs:
1651 case AArch64::SUBSWrx:
1652 case AArch64::SUBSXrr:
1653 case AArch64::SUBSXrs:
1654 case AArch64::SUBSXrx:
1655 case AArch64::ADDSWrr:
1656 case AArch64::ADDSWrs:
1657 case AArch64::ADDSWrx:
1658 case AArch64::ADDSXrr:
1659 case AArch64::ADDSXrs:
1660 case AArch64::ADDSXrx:
1661 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1662 SrcReg = MI.getOperand(1).getReg();
1663 SrcReg2 = MI.getOperand(2).getReg();
1664
1665 // FIXME: Pass subregisters out of analyzeCompare
1666 if (MI.getOperand(2).getSubReg())
1667 return false;
1668
1669 CmpMask = ~0;
1670 CmpValue = 0;
1671 return true;
1672 case AArch64::SUBSWri:
1673 case AArch64::ADDSWri:
1674 case AArch64::SUBSXri:
1675 case AArch64::ADDSXri:
1676 SrcReg = MI.getOperand(1).getReg();
1677 SrcReg2 = 0;
1678 CmpMask = ~0;
1679 CmpValue = MI.getOperand(2).getImm();
1680 return true;
1681 case AArch64::ANDSWri:
1682 case AArch64::ANDSXri:
1683 // ANDS does not use the same encoding scheme as the others xxxS
1684 // instructions.
1685 SrcReg = MI.getOperand(1).getReg();
1686 SrcReg2 = 0;
1687 CmpMask = ~0;
1689 MI.getOperand(2).getImm(),
1690 MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1691 return true;
1692 }
1693
1694 return false;
1695}
1696
1698 MachineBasicBlock *MBB = Instr.getParent();
1699 assert(MBB && "Can't get MachineBasicBlock here");
1700 MachineFunction *MF = MBB->getParent();
1701 assert(MF && "Can't get MachineFunction here");
1704 MachineRegisterInfo *MRI = &MF->getRegInfo();
1705
1706 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1707 ++OpIdx) {
1708 MachineOperand &MO = Instr.getOperand(OpIdx);
1709 const TargetRegisterClass *OpRegCstraints =
1710 Instr.getRegClassConstraint(OpIdx, TII, TRI);
1711
1712 // If there's no constraint, there's nothing to do.
1713 if (!OpRegCstraints)
1714 continue;
1715 // If the operand is a frame index, there's nothing to do here.
1716 // A frame index operand will resolve correctly during PEI.
1717 if (MO.isFI())
1718 continue;
1719
1720 assert(MO.isReg() &&
1721 "Operand has register constraints without being a register!");
1722
1723 Register Reg = MO.getReg();
1724 if (Reg.isPhysical()) {
1725 if (!OpRegCstraints->contains(Reg))
1726 return false;
1727 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1728 !MRI->constrainRegClass(Reg, OpRegCstraints))
1729 return false;
1730 }
1731
1732 return true;
1733}
1734
1735/// Return the opcode that does not set flags when possible - otherwise
1736/// return the original opcode. The caller is responsible to do the actual
1737/// substitution and legality checking.
1739 // Don't convert all compare instructions, because for some the zero register
1740 // encoding becomes the sp register.
1741 bool MIDefinesZeroReg = false;
1742 if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1743 MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
1744 MIDefinesZeroReg = true;
1745
1746 switch (MI.getOpcode()) {
1747 default:
1748 return MI.getOpcode();
1749 case AArch64::ADDSWrr:
1750 return AArch64::ADDWrr;
1751 case AArch64::ADDSWri:
1752 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1753 case AArch64::ADDSWrs:
1754 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1755 case AArch64::ADDSWrx:
1756 return AArch64::ADDWrx;
1757 case AArch64::ADDSXrr:
1758 return AArch64::ADDXrr;
1759 case AArch64::ADDSXri:
1760 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1761 case AArch64::ADDSXrs:
1762 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1763 case AArch64::ADDSXrx:
1764 return AArch64::ADDXrx;
1765 case AArch64::SUBSWrr:
1766 return AArch64::SUBWrr;
1767 case AArch64::SUBSWri:
1768 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1769 case AArch64::SUBSWrs:
1770 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1771 case AArch64::SUBSWrx:
1772 return AArch64::SUBWrx;
1773 case AArch64::SUBSXrr:
1774 return AArch64::SUBXrr;
1775 case AArch64::SUBSXri:
1776 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1777 case AArch64::SUBSXrs:
1778 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1779 case AArch64::SUBSXrx:
1780 return AArch64::SUBXrx;
1781 }
1782}
1783
/// Bit mask selecting which kinds of condition-flag access to look for.
enum AccessKind {
  AK_Write = 0x01, ///< The flags are written.
  AK_Read = 0x10,  ///< The flags are read.
  AK_All = 0x11    ///< Either kind of access (AK_Write | AK_Read).
};
1785
1786/// True when condition flags are accessed (either by writing or reading)
1787/// on the instruction trace starting at From and ending at To.
1788///
1789/// Note: If From and To are from different blocks it's assumed CC are accessed
1790/// on the path.
1793 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1794 // Early exit if To is at the beginning of the BB.
1795 if (To == To->getParent()->begin())
1796 return true;
1797
1798 // Check whether the instructions are in the same basic block
1799 // If not, assume the condition flags might get modified somewhere.
1800 if (To->getParent() != From->getParent())
1801 return true;
1802
1803 // From must be above To.
1804 assert(std::any_of(
1805 ++To.getReverse(), To->getParent()->rend(),
1806 [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1807
1808 // We iterate backward starting at \p To until we hit \p From.
1809 for (const MachineInstr &Instr :
1811 if (((AccessToCheck & AK_Write) &&
1812 Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1813 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1814 return true;
1815 }
1816 return false;
1817}
1818
1819std::optional<unsigned>
1820AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
1821 MachineInstr *Pred,
1822 const MachineRegisterInfo *MRI) const {
1823 unsigned MaskOpcode = Mask->getOpcode();
1824 unsigned PredOpcode = Pred->getOpcode();
1825 bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1826 bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1827
1828 if (PredIsWhileLike) {
1829 // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
1830 // instruction and the condition is "any" since WHILcc does an implicit
1831 // PTEST(ALL, PG) check and PG is always a subset of ALL.
1832 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1833 return PredOpcode;
1834
1835 // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1836 // redundant since WHILE performs an implicit PTEST with an all active
1837 // mask.
1838 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1839 getElementSizeForOpcode(MaskOpcode) ==
1840 getElementSizeForOpcode(PredOpcode))
1841 return PredOpcode;
1842
1843 // For PTEST_FIRST(PTRUE_ALL, WHILE), the PTEST_FIRST is redundant since
1844 // WHILEcc performs an implicit PTEST with an all active mask, setting
1845 // the N flag as the PTEST_FIRST would.
1846 if (PTest->getOpcode() == AArch64::PTEST_PP_FIRST &&
1847 isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31)
1848 return PredOpcode;
1849
1850 return {};
1851 }
1852
1853 if (PredIsPTestLike) {
1854 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1855 // instruction that sets the flags as PTEST would and the condition is
1856 // "any" since PG is always a subset of the governing predicate of the
1857 // ptest-like instruction.
1858 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1859 return PredOpcode;
1860
1861 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1862
1863 // If the PTEST like instruction's general predicate is not `Mask`, attempt
1864 // to look through a copy and try again. This is because some instructions
1865 // take a predicate whose register class is a subset of its result class.
1866 if (Mask != PTestLikeMask && PTestLikeMask->isFullCopy() &&
1867 PTestLikeMask->getOperand(1).getReg().isVirtual())
1868 PTestLikeMask =
1869 MRI->getUniqueVRegDef(PTestLikeMask->getOperand(1).getReg());
1870
1871 // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
1872 // the element size matches and either the PTEST_LIKE instruction uses
1873 // the same all active mask or the condition is "any".
1874 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1875 getElementSizeForOpcode(MaskOpcode) ==
1876 getElementSizeForOpcode(PredOpcode)) {
1877 if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1878 return PredOpcode;
1879 }
1880
1881 // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1882 // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1883 // on 8-bit predicates like the PTEST. Otherwise, for instructions like
1884 // compare that also support 16/32/64-bit predicates, the implicit PTEST
1885 // performed by the compare could consider fewer lanes for these element
1886 // sizes.
1887 //
1888 // For example, consider
1889 //
1890 // ptrue p0.b ; P0=1111-1111-1111-1111
1891 // index z0.s, #0, #1 ; Z0=<0,1,2,3>
1892 // index z1.s, #1, #1 ; Z1=<1,2,3,4>
1893 // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001
1894 // ; ^ last active
1895 // ptest p0, p1.b ; P1=0001-0001-0001-0001
1896 // ; ^ last active
1897 //
1898 // where the compare generates a canonical all active 32-bit predicate
1899 // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1900 // active flag, whereas the PTEST instruction with the same mask doesn't.
1901 // For PTEST_ANY this doesn't apply as the flags in this case would be
1902 // identical regardless of element size.
1903 uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1904 if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
1905 PTest->getOpcode() == AArch64::PTEST_PP_ANY))
1906 return PredOpcode;
1907
1908 return {};
1909 }
1910
1911 // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1912 // opcode so the PTEST becomes redundant.
1913 switch (PredOpcode) {
1914 case AArch64::AND_PPzPP:
1915 case AArch64::BIC_PPzPP:
1916 case AArch64::EOR_PPzPP:
1917 case AArch64::NAND_PPzPP:
1918 case AArch64::NOR_PPzPP:
1919 case AArch64::ORN_PPzPP:
1920 case AArch64::ORR_PPzPP:
1921 case AArch64::BRKA_PPzP:
1922 case AArch64::BRKPA_PPzPP:
1923 case AArch64::BRKB_PPzP:
1924 case AArch64::BRKPB_PPzPP:
1925 case AArch64::RDFFR_PPz: {
1926 // Check to see if our mask is the same. If not the resulting flag bits
1927 // may be different and we can't remove the ptest.
1928 auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1929 if (Mask != PredMask)
1930 return {};
1931 break;
1932 }
1933 case AArch64::BRKN_PPzP: {
1934 // BRKN uses an all active implicit mask to set flags unlike the other
1935 // flag-setting instructions.
1936 // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1937 if ((MaskOpcode != AArch64::PTRUE_B) ||
1938 (Mask->getOperand(1).getImm() != 31))
1939 return {};
1940 break;
1941 }
1942 case AArch64::PTRUE_B:
1943 // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1944 break;
1945 default:
1946 // Bail out if we don't recognize the input
1947 return {};
1948 }
1949
1950 return convertToFlagSettingOpc(PredOpcode);
1951}
1952
1953/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1954/// operation which could set the flags in an identical manner
1955bool AArch64InstrInfo::optimizePTestInstr(
1956 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1957 const MachineRegisterInfo *MRI) const {
1958 auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1959 auto *Pred = MRI->getUniqueVRegDef(PredReg);
1960
1961 if (Pred->isCopy() && PTest->getOpcode() == AArch64::PTEST_PP_FIRST) {
1962 // Instructions which return a multi-vector (e.g. WHILECC_x2) require copies
1963 // before the branch to extract each subregister.
1964 auto Op = Pred->getOperand(1);
1965 if (Op.isReg() && Op.getReg().isVirtual() &&
1966 Op.getSubReg() == AArch64::psub0)
1967 Pred = MRI->getUniqueVRegDef(Op.getReg());
1968 }
1969
1970 unsigned PredOpcode = Pred->getOpcode();
1971 auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1972 if (!NewOp)
1973 return false;
1974
1975 const TargetRegisterInfo *TRI = &getRegisterInfo();
1976
1977 // If another instruction between Pred and PTest accesses flags, don't remove
1978 // the ptest or update the earlier instruction to modify them.
1979 if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1980 return false;
1981
1982 // If we pass all the checks, it's safe to remove the PTEST and use the flags
1983 // as they are prior to PTEST. Sometimes this requires the tested PTEST
1984 // operand to be replaced with an equivalent instruction that also sets the
1985 // flags.
1986 PTest->eraseFromParent();
1987 if (*NewOp != PredOpcode) {
1988 Pred->setDesc(get(*NewOp));
1989 bool succeeded = UpdateOperandRegClass(*Pred);
1990 (void)succeeded;
1991 assert(succeeded && "Operands have incompatible register classes!");
1992 Pred->addRegisterDefined(AArch64::NZCV, TRI);
1993 }
1994
1995 // Ensure that the flags def is live.
1996 if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1997 unsigned i = 0, e = Pred->getNumOperands();
1998 for (; i != e; ++i) {
1999 MachineOperand &MO = Pred->getOperand(i);
2000 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
2001 MO.setIsDead(false);
2002 break;
2003 }
2004 }
2005 }
2006 return true;
2007}
2008
2009/// Try to optimize a compare instruction. A compare instruction is an
2010/// instruction which produces AArch64::NZCV. It is a pure compare
2011/// instruction
2012/// when there are no uses of its destination register.
2013///
2014/// The following steps are tried in order:
2015/// 1. If the NZCV def of CmpInstr is dead, erase CmpInstr (when it defines
/// WZR or XZR) or convert it into its non-flag-setting version.
/// 2. Hand PTEST_PP / PTEST_PP_ANY / PTEST_PP_FIRST to optimizePTestInstr.
2016/// 3. Remove CmpInstr if above there is an instruction producing a needed
2017/// condition code or an instruction which can be converted into such an
2018/// instruction.
2019/// Step 3 handles comparison with zero (substituteCmpToZero) and comparison
/// with zero or one (removeCmpToZeroOrOne) only.
///
/// Returns true if CmpInstr was erased or rewritten.
// NOTE(review): the declarator line that precedes the parameter list is not
// present in this listing.
2021 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
2022 int64_t CmpValue, const MachineRegisterInfo *MRI) const {
2023 assert(CmpInstr.getParent());
2024 assert(MRI);
2025
2026 // Replace SUBSWrr with SUBWrr if NZCV is not used.
2027 int DeadNZCVIdx =
2028 CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
2029 if (DeadNZCVIdx != -1) {
// The result is written to a zero register and the flags are dead, so the
// instruction is removed outright.
2030 if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
2031 CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
2032 CmpInstr.eraseFromParent();
2033 return true;
2034 }
2035 unsigned Opc = CmpInstr.getOpcode();
2036 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
// No distinct non-flag-setting opcode was returned: leave CmpInstr alone.
2037 if (NewOpc == Opc)
2038 return false;
2039 const MCInstrDesc &MCID = get(NewOpc);
2040 CmpInstr.setDesc(MCID);
// Drop the now-unneeded NZCV def operand found above.
2041 CmpInstr.removeOperand(DeadNZCVIdx);
2042 bool succeeded = UpdateOperandRegClass(CmpInstr);
2043 (void)succeeded;
2044 assert(succeeded && "Some operands reg class are incompatible!");
2045 return true;
2046 }
2047
// For the PTEST forms SrcReg and SrcReg2 are forwarded as optimizePTestInstr's
// MaskReg and PredReg arguments.
2048 if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
2049 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY ||
2050 CmpInstr.getOpcode() == AArch64::PTEST_PP_FIRST)
2051 return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
2052
// Register-register compares are not handled past this point.
2053 if (SrcReg2 != 0)
2054 return false;
2055
2056 // CmpInstr is a Compare instruction if destination register is not used.
2057 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
2058 return false;
2059
2060 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
2061 return true;
2062 return (CmpValue == 0 || CmpValue == 1) &&
2063 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
2064}
2065
2066/// Get opcode of S version of Instr.
2067/// If Instr is S version its opcode is returned.
2068/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
2069/// or we are not interested in it.
2070static unsigned sForm(MachineInstr &Instr) {
2071 switch (Instr.getOpcode()) {
2072 default:
2073 return AArch64::INSTRUCTION_LIST_END;
2074
2075 case AArch64::ADDSWrr:
2076 case AArch64::ADDSWri:
2077 case AArch64::ADDSXrr:
2078 case AArch64::ADDSXri:
2079 case AArch64::ADDSWrx:
2080 case AArch64::ADDSXrx:
2081 case AArch64::SUBSWrr:
2082 case AArch64::SUBSWri:
2083 case AArch64::SUBSWrx:
2084 case AArch64::SUBSXrr:
2085 case AArch64::SUBSXri:
2086 case AArch64::SUBSXrx:
2087 case AArch64::ANDSWri:
2088 case AArch64::ANDSWrr:
2089 case AArch64::ANDSWrs:
2090 case AArch64::ANDSXri:
2091 case AArch64::ANDSXrr:
2092 case AArch64::ANDSXrs:
2093 case AArch64::BICSWrr:
2094 case AArch64::BICSXrr:
2095 case AArch64::BICSWrs:
2096 case AArch64::BICSXrs:
2097 return Instr.getOpcode();
2098
2099 case AArch64::ADDWrr:
2100 return AArch64::ADDSWrr;
2101 case AArch64::ADDWri:
2102 return AArch64::ADDSWri;
2103 case AArch64::ADDXrr:
2104 return AArch64::ADDSXrr;
2105 case AArch64::ADDXri:
2106 return AArch64::ADDSXri;
2107 case AArch64::ADDWrx:
2108 return AArch64::ADDSWrx;
2109 case AArch64::ADDXrx:
2110 return AArch64::ADDSXrx;
2111 case AArch64::ADCWr:
2112 return AArch64::ADCSWr;
2113 case AArch64::ADCXr:
2114 return AArch64::ADCSXr;
2115 case AArch64::SUBWrr:
2116 return AArch64::SUBSWrr;
2117 case AArch64::SUBWri:
2118 return AArch64::SUBSWri;
2119 case AArch64::SUBXrr:
2120 return AArch64::SUBSXrr;
2121 case AArch64::SUBXri:
2122 return AArch64::SUBSXri;
2123 case AArch64::SUBWrx:
2124 return AArch64::SUBSWrx;
2125 case AArch64::SUBXrx:
2126 return AArch64::SUBSXrx;
2127 case AArch64::SBCWr:
2128 return AArch64::SBCSWr;
2129 case AArch64::SBCXr:
2130 return AArch64::SBCSXr;
2131 case AArch64::ANDWri:
2132 return AArch64::ANDSWri;
2133 case AArch64::ANDXri:
2134 return AArch64::ANDSXri;
2135 case AArch64::ANDWrr:
2136 return AArch64::ANDSWrr;
2137 case AArch64::ANDWrs:
2138 return AArch64::ANDSWrs;
2139 case AArch64::ANDXrr:
2140 return AArch64::ANDSXrr;
2141 case AArch64::ANDXrs:
2142 return AArch64::ANDSXrs;
2143 case AArch64::BICWrr:
2144 return AArch64::BICSWrr;
2145 case AArch64::BICXrr:
2146 return AArch64::BICSXrr;
2147 case AArch64::BICWrs:
2148 return AArch64::BICSWrs;
2149 case AArch64::BICXrs:
2150 return AArch64::BICSXrs;
2151 }
2152}
2153
2154/// Check if AArch64::NZCV should be alive in successors of MBB.
/// Returns true as soon as one successor lists AArch64::NZCV as a live-in,
/// false if none does.
// NOTE(review): the declarator line is not present in this listing; the body
// dereferences a block pointer named MBB.
2156 for (auto *BB : MBB->successors())
2157 if (BB->isLiveIn(AArch64::NZCV))
2158 return true;
2159 return false;
2160}
2161
2162/// \returns The condition code operand index for \p Instr if it is a branch
2163/// or select and -1 otherwise.
2164int AArch64InstrInfo::findCondCodeUseOperandIdxForBranchOrSelect(
2165 const MachineInstr &Instr) {
2166 switch (Instr.getOpcode()) {
2167 default:
2168 return -1;
2169
2170 case AArch64::Bcc: {
2171 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2172 assert(Idx >= 2);
2173 return Idx - 2;
2174 }
2175
2176 case AArch64::CSINVWr:
2177 case AArch64::CSINVXr:
2178 case AArch64::CSINCWr:
2179 case AArch64::CSINCXr:
2180 case AArch64::CSELWr:
2181 case AArch64::CSELXr:
2182 case AArch64::CSNEGWr:
2183 case AArch64::CSNEGXr:
2184 case AArch64::FCSELSrrr:
2185 case AArch64::FCSELDrrr: {
2186 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2187 assert(Idx >= 1);
2188 return Idx - 1;
2189 }
2190 }
2191}
2192
2193/// Find a condition code used by the instruction.
2194/// Returns AArch64CC::Invalid if either the instruction does not use condition
2195/// codes or we don't optimize CmpInstr in the presence of such instructions.
// NOTE(review): the declarator line and the else-arm of the conditional
// expression below are not present in this listing.
2197 int CCIdx =
2198 AArch64InstrInfo::findCondCodeUseOperandIdxForBranchOrSelect(Instr);
// A non-negative index names the immediate operand holding the condition code.
2199 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
2200 Instr.getOperand(CCIdx).getImm())
2202}
2203
// Translate the condition code CC into the set of N/Z/C/V flags it reads.
// Condition codes without a case below leave UsedFlags as it was constructed.
// NOTE(review): the declarator line (and possibly a statement before the
// first line below) is not present in this listing.
2206 UsedNZCV UsedFlags;
2207 switch (CC) {
2208 default:
2209 break;
2210
2211 case AArch64CC::EQ: // Z set
2212 case AArch64CC::NE: // Z clear
2213 UsedFlags.Z = true;
2214 break;
2215
2216 case AArch64CC::HI: // Z clear and C set
2217 case AArch64CC::LS: // Z set or C clear
2218 UsedFlags.Z = true;
// HI/LS read C as well, so share the C handling below.
2219 [[fallthrough]];
2220 case AArch64CC::HS: // C set
2221 case AArch64CC::LO: // C clear
2222 UsedFlags.C = true;
2223 break;
2224
2225 case AArch64CC::MI: // N set
2226 case AArch64CC::PL: // N clear
2227 UsedFlags.N = true;
2228 break;
2229
2230 case AArch64CC::VS: // V set
2231 case AArch64CC::VC: // V clear
2232 UsedFlags.V = true;
2233 break;
2234
2235 case AArch64CC::GT: // Z clear, N and V the same
2236 case AArch64CC::LE: // Z set, N and V differ
2237 UsedFlags.Z = true;
// GT/LE read N and V as well, so share the N/V handling below.
2238 [[fallthrough]];
2239 case AArch64CC::GE: // N and V the same
2240 case AArch64CC::LT: // N and V differ
2241 UsedFlags.N = true;
2242 UsedFlags.V = true;
2243 break;
2244 }
2245 return UsedFlags;
2246}
2247
2248/// \returns The condition flags used after \p CmpInstr in its MachineBB if NZCV
2249/// flags are not alive in successors of the same \p CmpInstr and \p MI parent.
2250/// \returns std::nullopt otherwise.
2251///
2252/// Collects the instructions using those flags in \p CCUseInstrs if provided.
///
/// std::nullopt is also returned when a flag-reading instruction is one for
/// which findCondCodeUsedByInstr yields AArch64CC::Invalid.
2253std::optional<UsedNZCV>
// NOTE(review): the line carrying the function name and its first parameters
// is not present in this listing.
2255 const TargetRegisterInfo &TRI,
2256 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
2257 MachineBasicBlock *CmpParent = CmpInstr.getParent();
// MI and CmpInstr must live in the same basic block.
2258 if (MI.getParent() != CmpParent)
2259 return std::nullopt;
2260
2261 if (areCFlagsAliveInSuccessors(CmpParent))
2262 return std::nullopt;
2263
2264 UsedNZCV NZCVUsedAfterCmp;
// NOTE(review): the loop header that introduces Instr is not present in this
// listing; the visible range argument runs from the instruction after CmpInstr
// to the end of the block.
2266 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
2267 if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
// NOTE(review): the statement defining CC is not present in this listing.
2269 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
2270 return std::nullopt;
2271 NZCVUsedAfterCmp |= getUsedNZCV(CC);
2272 if (CCUseInstrs)
2273 CCUseInstrs->push_back(&Instr);
2274 }
// Stop at the first instruction that modifies NZCV; an instruction that both
// reads and modifies NZCV has its read recorded above first.
2275 if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
2276 break;
2277 }
2278 return NZCVUsedAfterCmp;
2279}
2280
2281static bool isADDSRegImm(unsigned Opcode) {
2282 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
2283}
2284
2285static bool isSUBSRegImm(unsigned Opcode) {
2286 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
2287}
2288
// Returns true if the flag-setting form of MI, as computed by sForm, is one of
// the ANDS or BICS opcodes listed below; false for everything else, including
// instructions for which sForm returns AArch64::INSTRUCTION_LIST_END.
// NOTE(review): the declarator line is not present in this listing.
2290 unsigned Opc = sForm(MI);
2291 switch (Opc) {
2292 case AArch64::ANDSWri:
2293 case AArch64::ANDSWrr:
2294 case AArch64::ANDSWrs:
2295 case AArch64::ANDSXri:
2296 case AArch64::ANDSXrr:
2297 case AArch64::ANDSXrs:
2298 case AArch64::BICSWrr:
2299 case AArch64::BICSXrr:
2300 case AArch64::BICSWrs:
2301 case AArch64::BICSXrs:
2302 return true;
2303 default:
2304 return false;
2305 }
2306}
2307
2308/// Check if CmpInstr can be substituted by MI.
2309///
2310/// CmpInstr can be substituted:
2311/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2312/// - and, MI and CmpInstr are from the same MachineBB
2313/// - and, condition flags are not alive in successors of the CmpInstr parent
2314/// - and, if MI opcode is the S form there must be no defs of flags between
2315/// MI and CmpInstr
2316/// or if MI opcode is not the S form there must be neither defs of flags
2317/// nor uses of flags between MI and CmpInstr.
2318/// - and, the C flag is not used after CmpInstr
2319/// - and, the V flag is either not used after CmpInstr, or MI is marked
2320/// no-signed-wrap, or MI is an AND/BIC opcode.
// NOTE(review): the declarator line is not present in this listing.
2322 const TargetRegisterInfo &TRI) {
2323 // NOTE this assertion guarantees that MI.getOpcode() is one of the opcodes
2324 // sForm() recognizes (add, adc, sub, sbc, and, bic; flag-setting or not).
2325 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
2326
2327 const unsigned CmpOpcode = CmpInstr.getOpcode();
2328 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
2329 return false;
2330
2331 assert((CmpInstr.getOperand(2).isImm() &&
2332 CmpInstr.getOperand(2).getImm() == 0) &&
2333 "Caller guarantees that CmpInstr compares with constant 0");
2334
// A missing result means examineCFlagsUse could not analyse the flag users.
2335 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
2336 if (!NZVCUsed || NZVCUsed->C)
2337 return false;
2338
2339 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
2340 // '%vreg = add ...' or '%vreg = sub ...'.
2341 // Condition flag V is used to indicate signed overflow.
2342 // 1) MI and CmpInstr set N and V to the same value.
2343 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
2344 // signed overflow occurs, so CmpInstr could still be simplified away.
2345 // Note that Ands and Bics instructions always clear the V flag.
2346 if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI))
2347 return false;
2348
// If MI is already in its S form only flag writes in between matter; otherwise
// any flag access between MI and CmpInstr blocks the substitution.
2349 AccessKind AccessToCheck = AK_Write;
2350 if (sForm(MI) != MI.getOpcode())
2351 AccessToCheck = AK_All;
2352 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
2353}
2354
2355/// Substitute an instruction comparing to zero with another instruction
2356/// which produces needed condition flags.
2357///
/// The unique definition of SrcReg is switched to its flag-setting opcode (see
/// sForm) and CmpInstr is erased.
///
2358/// Return true on success.
2359bool AArch64InstrInfo::substituteCmpToZero(
2360 MachineInstr &CmpInstr, unsigned SrcReg,
2361 const MachineRegisterInfo &MRI) const {
2362 // Get the unique definition of SrcReg.
2363 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2364 if (!MI)
2365 return false;
2366
2367 const TargetRegisterInfo &TRI = getRegisterInfo();
2368
// Only definitions that sForm knows a flag-setting opcode for are candidates.
2369 unsigned NewOpc = sForm(*MI);
2370 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
2371 return false;
2372
2373 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
2374 return false;
2375
2376 // Update the instruction to set NZCV.
2377 MI->setDesc(get(NewOpc));
2378 CmpInstr.eraseFromParent();
// NOTE(review): the statement that defines `succeeded` is not present in this
// listing.
2380 (void)succeeded;
2381 assert(succeeded && "Some operands reg class are incompatible!");
2382 MI->addRegisterDefined(AArch64::NZCV, &TRI);
2383 return true;
2384}
2385
2386/// \returns True if \p CmpInstr can be removed.
2387///
2388/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
2389/// codes used in \p CCUseInstrs must be inverted.
// NOTE(review): the declarator line and the line declaring the CCUseInstrs
// parameter are not present in this listing.
2391 int CmpValue, const TargetRegisterInfo &TRI,
2393 bool &IsInvertCC) {
2394 assert((CmpValue == 0 || CmpValue == 1) &&
2395 "Only comparisons to 0 or 1 considered for removal!");
2396
2397 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
2398 unsigned MIOpc = MI.getOpcode();
2399 if (MIOpc == AArch64::CSINCWr) {
2400 if (MI.getOperand(1).getReg() != AArch64::WZR ||
2401 MI.getOperand(2).getReg() != AArch64::WZR)
2402 return false;
2403 } else if (MIOpc == AArch64::CSINCXr) {
2404 if (MI.getOperand(1).getReg() != AArch64::XZR ||
2405 MI.getOperand(2).getReg() != AArch64::XZR)
2406 return false;
2407 } else {
2408 return false;
2409 }
// NOTE(review): the statement defining MICC (the condition code of MI) is not
// present in this listing.
2411 if (MICC == AArch64CC::Invalid)
2412 return false;
2413
2414 // NZCV needs to be defined
// NOTE(review): presumably the trailing `true` restricts the search to dead
// defs, so this rejects an MI carrying a dead NZCV def - TODO confirm.
2415 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
2416 return false;
2417
2418 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
2419 const unsigned CmpOpcode = CmpInstr.getOpcode();
2420 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
2421 if (CmpValue && !IsSubsRegImm)
2422 return false;
2423 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
2424 return false;
2425
2426 // MI conditions allowed: eq, ne, mi, pl
2427 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
2428 if (MIUsedNZCV.C || MIUsedNZCV.V)
2429 return false;
2430
// The instructions reading the flags after CmpInstr are collected into
// CCUseInstrs as a side effect of this call.
2431 std::optional<UsedNZCV> NZCVUsedAfterCmp =
2432 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
2433 // Condition flags are not used in CmpInstr basic block successors and only
2434 // Z or N flags allowed to be used after CmpInstr within its basic block
2435 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
2436 return false;
2437 // Z or N flag used after CmpInstr must correspond to the flag used in MI
2438 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
2439 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
2440 return false;
2441 // If CmpInstr is comparison to zero MI conditions are limited to eq, ne
2442 if (MIUsedNZCV.N && !CmpValue)
2443 return false;
2444
2445 // There must be no defs of flags between MI and CmpInstr
2446 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
2447 return false;
2448
2449 // Condition code is inverted in the following cases:
2450 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2451 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
2452 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
2453 (!CmpValue && MICC == AArch64CC::NE);
2454 return true;
2455}
2456
2457/// Remove comparison in csinc-cmp sequence
2458///
2459/// Examples:
2460/// 1. \code
2461/// csinc w9, wzr, wzr, ne
2462/// cmp w9, #0
2463/// b.eq
2464/// \endcode
2465/// to
2466/// \code
2467/// csinc w9, wzr, wzr, ne
2468/// b.ne
2469/// \endcode
2470///
2471/// 2. \code
2472/// csinc x2, xzr, xzr, mi
2473/// cmp x2, #1
2474/// b.pl
2475/// \endcode
2476/// to
2477/// \code
2478/// csinc x2, xzr, xzr, mi
2479/// b.pl
2480/// \endcode
2481///
2482/// \param CmpInstr comparison instruction
/// \param SrcReg register compared by \p CmpInstr; its unique definition is
/// the candidate csinc
/// \param CmpValue immediate compared against (canCmpInstrBeRemoved asserts it
/// is 0 or 1)
/// \param MRI register info used to look up the definition of \p SrcReg
2483/// \return True when comparison removed
2484bool AArch64InstrInfo::removeCmpToZeroOrOne(
2485 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
2486 const MachineRegisterInfo &MRI) const {
2487 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2488 if (!MI)
2489 return false;
2490 const TargetRegisterInfo &TRI = getRegisterInfo();
// Filled by canCmpInstrBeRemoved with the users of the flags set by CmpInstr.
2491 SmallVector<MachineInstr *, 4> CCUseInstrs;
2492 bool IsInvertCC = false;
2493 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
2494 IsInvertCC))
2495 return false;
2496 // Make transformation
2497 CmpInstr.eraseFromParent();
2498 if (IsInvertCC) {
2499 // Invert condition codes in CmpInstr CC users
2500 for (MachineInstr *CCUseInstr : CCUseInstrs) {
2501 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
2502 assert(Idx >= 0 && "Unexpected instruction using CC.");
2503 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
// NOTE(review): the line that begins the definition of CCUse is not present in
// this listing; only its argument is visible below.
2505 static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
2506 CCOperand.setImm(CCUse);
2507 }
2508 }
2509 return true;
2510}
2511
// Expand the LOAD_STACK_GUARD and CATCHRET pseudo instructions into real
// instructions. Returns false without touching MI for any other opcode and
// true once an expansion has been emitted.
// NOTE(review): several lines of this function (mostly operand-builder calls
// and a few local definitions) are not present in this listing.
2512bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2513 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
2514 MI.getOpcode() != AArch64::CATCHRET)
2515 return false;
2516
2517 MachineBasicBlock &MBB = *MI.getParent();
2518 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
2519 auto TRI = Subtarget.getRegisterInfo();
2520 DebugLoc DL = MI.getDebugLoc();
2521
2522 if (MI.getOpcode() == AArch64::CATCHRET) {
2523 // Skip to the first instruction before the epilog.
2524 const TargetInstrInfo *TII =
2526 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
// Walk backwards over instructions flagged FrameDestroy, stopping at the start
// of the block.
2528 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
2529 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
2530 FirstEpilogSEH != MBB.begin())
2531 FirstEpilogSEH = std::prev(FirstEpilogSEH);
2532 if (FirstEpilogSEH != MBB.begin())
2533 FirstEpilogSEH = std::next(FirstEpilogSEH);
// Build the address of the target block in X0 with an ADRP + ADDXri pair.
2534 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
2535 .addReg(AArch64::X0, RegState::Define)
2536 .addMBB(TargetMBB);
2537 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
2538 .addReg(AArch64::X0, RegState::Define)
2539 .addReg(AArch64::X0)
2540 .addMBB(TargetMBB)
2541 .addImm(0);
2542 TargetMBB->setMachineBlockAddressTaken();
2543 return true;
2544 }
2545
// From here on MI is LOAD_STACK_GUARD; Reg is its destination register.
2546 Register Reg = MI.getOperand(0).getReg();
2548 if (M.getStackProtectorGuard() == "sysreg") {
2549 const AArch64SysReg::SysReg *SrcReg =
2550 AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
2551 if (!SrcReg)
2552 report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
2553
2554 // mrs xN, sysreg
2555 BuildMI(MBB, MI, DL, get(AArch64::MRS))
2557 .addImm(SrcReg->Encoding);
// Pick the load sequence according to how the guard offset can be encoded.
2558 int Offset = M.getStackProtectorGuardOffset();
2559 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2560 // ldr xN, [xN, #offset]
2561 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2562 .addDef(Reg)
2564 .addImm(Offset / 8);
2565 } else if (Offset >= -256 && Offset <= 255) {
2566 // ldur xN, [xN, #offset]
2567 BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2568 .addDef(Reg)
2570 .addImm(Offset);
2571 } else if (Offset >= -4095 && Offset <= 4095) {
2572 if (Offset > 0) {
2573 // add xN, xN, #offset
2574 BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2575 .addDef(Reg)
2577 .addImm(Offset)
2578 .addImm(0);
2579 } else {
2580 // sub xN, xN, #-offset (the immediate is the negated, i.e. positive, offset)
2581 BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2582 .addDef(Reg)
2584 .addImm(-Offset)
2585 .addImm(0);
2586 }
2587 // ldr xN, [xN]
2588 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2589 .addDef(Reg)
2591 .addImm(0);
2592 } else {
2593 // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2594 // than 32760.
2595 // It might be nice to use AArch64::MOVi32imm here, which would get
2596 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2597 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2598 // AArch64FrameLowering might help us find such a scratch register
2599 // though. If we failed to find a scratch register, we could emit a
2600 // stream of add instructions to build up the immediate. Or, we could try
2601 // to insert a AArch64::MOVi32imm before register allocation so that we
2602 // didn't need to scavenge for a scratch register.
2603 report_fatal_error("Unable to encode Stack Protector Guard Offset");
2604 }
2605 MBB.erase(MI);
2606 return true;
2607 }
2608
// Not the "sysreg" guard: the guard is the global value attached to MI's first
// memory operand.
2609 const GlobalValue *GV =
2610 cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2611 const TargetMachine &TM = MBB.getParent()->getTarget();
2612 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2613 const unsigned char MO_NC = AArch64II::MO_NC;
2614
// Guard width in bytes; when the module does not specify one it defaults to 4
// for ILP32 targets and 8 otherwise.
2615 unsigned GuardWidth = M.getStackProtectorGuardValueWidth().value_or(
2616 Subtarget.isTargetILP32() ? 4 : 8);
2617 if (GuardWidth != 4 && GuardWidth != 8)
2618 report_fatal_error("Unsupported stack protector value width");
2619 if ((OpFlags & AArch64II::MO_GOT) != 0) {
// GOT reference: load the address with LOADgot, then load the guard value.
2620 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2621 .addGlobalAddress(GV, 0, OpFlags);
2622 if (GuardWidth == 4) {
2623 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2624 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2625 .addDef(Reg32, RegState::Dead)
2627 .addImm(0)
2628 .addMemOperand(*MI.memoperands_begin())
2630 } else {
2631 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2633 .addImm(0)
2634 .addMemOperand(*MI.memoperands_begin());
2635 }
2636 } else if (TM.getCodeModel() == CodeModel::Large) {
2637 if (GuardWidth == 4)
2638 report_fatal_error("Large code model with 4-byte stack protector not yet "
2639 "supported");
// Large code model: assemble the address 16 bits at a time with MOVZ/MOVK,
// then load through it.
2640 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2641 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2642 .addImm(0);
2643 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2645 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2646 .addImm(16);
2647 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2649 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2650 .addImm(32);
2651 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2654 .addImm(48);
2655 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2657 .addImm(0)
2658 .addMemOperand(*MI.memoperands_begin());
2659 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2660 // FIXME: This is computing the stack protector value as a constant
2661 // pc-relative offset, not loading it from memory. Which is maybe
2662 // an interesting compromise in some environments, but it looks like it
2663 // was done accidentally. And it probably shouldn't be tied to the
2664 // code model.
2665 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2666 .addGlobalAddress(GV, 0, OpFlags);
2667 } else {
// Remaining code models: ADRP for the page, then a load using the page offset.
2668 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2669 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2670 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2671 if (GuardWidth == 4) {
2672 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2673 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2674 .addDef(Reg32, RegState::Dead)
2676 .addGlobalAddress(GV, 0, LoFlags)
2677 .addMemOperand(*MI.memoperands_begin())
2679 } else {
2680 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2682 .addGlobalAddress(GV, 0, LoFlags)
2683 .addMemOperand(*MI.memoperands_begin());
2684 }
2685 }
2686
// The pseudo has been replaced by the sequence built above.
2687 MBB.erase(MI);
2688
2689 return true;
2690}
2691
2692// Return true if this instruction simply sets its single destination register
2693// to zero. This is equivalent to a register rename of the zero-register.
// NOTE(review): the declarator line is not present in this listing.
2695 switch (MI.getOpcode()) {
2696 default:
2697 break;
2698 case AArch64::MOVZWi:
2699 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2700 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2701 assert(MI.getDesc().getNumOperands() == 3 &&
2702 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2703 return true;
2704 }
2705 break;
2706 case AArch64::ANDWri: // and Rd, Rzr, #imm
2707 return MI.getOperand(1).getReg() == AArch64::WZR;
2708 case AArch64::ANDXri:
2709 return MI.getOperand(1).getReg() == AArch64::XZR;
// NOTE(review): only a COPY whose source is WZR is recognised here; a COPY
// from XZR is not - confirm this is intended.
2710 case TargetOpcode::COPY:
2711 return MI.getOperand(1).getReg() == AArch64::WZR;
2712 }
2713 return false;
2714}
2715
2716// Return true if this instruction simply renames a general register without
2717// modifying bits.
// NOTE(review): the declarator line is not present in this listing.
2719 switch (MI.getOpcode()) {
2720 default:
2721 break;
2722 case TargetOpcode::COPY: {
2723 // GPR32 copies will be lowered to ORRXrs
// A COPY qualifies when its destination is in GPR32 or GPR64.
2724 Register DstReg = MI.getOperand(0).getReg();
2725 return (AArch64::GPR32RegClass.contains(DstReg) ||
2726 AArch64::GPR64RegClass.contains(DstReg));
2727 }
2728 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2729 if (MI.getOperand(1).getReg() == AArch64::XZR) {
2730 assert(MI.getDesc().getNumOperands() == 4 &&
2731 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2732 return true;
2733 }
2734 break;
2735 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2736 if (MI.getOperand(2).getImm() == 0) {
2737 assert(MI.getDesc().getNumOperands() == 4 &&
2738 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2739 return true;
2740 }
2741 break;
2742 }
2743 return false;
2744}
2745
2746// Return true if this instruction simply renames an FPR128 register without
2747// modifying bits.
// NOTE(review): the declarator line is not present in this listing.
2749 switch (MI.getOpcode()) {
2750 default:
2751 break;
2752 case TargetOpcode::COPY: {
// A COPY qualifies when its destination is in FPR128.
2753 Register DstReg = MI.getOperand(0).getReg();
2754 return AArch64::FPR128RegClass.contains(DstReg);
2755 }
// An ORR of a register with itself leaves the value unchanged.
2756 case AArch64::ORRv16i8:
2757 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2758 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2759 "invalid ORRv16i8 operands");
2760 return true;
2761 }
2762 break;
2763 }
2764 return false;
2765}
2766
2767static bool isFrameLoadOpcode(int Opcode) {
2768 switch (Opcode) {
2769 default:
2770 return false;
2771 case AArch64::LDRWui:
2772 case AArch64::LDRXui:
2773 case AArch64::LDRBui:
2774 case AArch64::LDRHui:
2775 case AArch64::LDRSui:
2776 case AArch64::LDRDui:
2777 case AArch64::LDRQui:
2778 case AArch64::LDR_PXI:
2779 return true;
2780 }
2781}
2782
// Returns the destination register of MI and stores the frame index in
// FrameIndex when MI is a frame load opcode whose operand 0 has no
// subregister, whose operand 1 is a frame index and whose operand 2 is the
// immediate 0. Returns an empty Register otherwise (FrameIndex untouched).
// NOTE(review): the declarator line is not present in this listing.
2784 int &FrameIndex) const {
2785 if (!isFrameLoadOpcode(MI.getOpcode()))
2786 return Register();
2787
2788 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2789 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2790 FrameIndex = MI.getOperand(1).getIndex();
2791 return MI.getOperand(0).getReg();
2792 }
2793 return Register();
2794}
2795
2796static bool isFrameStoreOpcode(int Opcode) {
2797 switch (Opcode) {
2798 default:
2799 return false;
2800 case AArch64::STRWui:
2801 case AArch64::STRXui:
2802 case AArch64::STRBui:
2803 case AArch64::STRHui:
2804 case AArch64::STRSui:
2805 case AArch64::STRDui:
2806 case AArch64::STRQui:
2807 case AArch64::STR_PXI:
2808 return true;
2809 }
2810}
2811
// Returns the register in operand 0 of MI and stores the frame index in
// FrameIndex when MI is a frame store opcode whose operand 0 has no
// subregister, whose operand 1 is a frame index and whose operand 2 is the
// immediate 0. Returns an empty Register otherwise (FrameIndex untouched).
// NOTE(review): the declarator line is not present in this listing.
2813 int &FrameIndex) const {
2814 if (!isFrameStoreOpcode(MI.getOpcode()))
2815 return Register();
2816
2817 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2818 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2819 FrameIndex = MI.getOperand(1).getIndex();
2820 return MI.getOperand(0).getReg();
2821 }
2822 return Register();
2823}
2824
// Like isStoreToStackSlot, but when that direct check fails it falls back to
// the stack accesses reported by hasStoreToStackSlot. The fallback succeeds
// only when exactly one access is reported; FrameIndex is then taken from that
// access's fixed-stack pseudo source value.
// NOTE(review): the declarator line and the declaration of `Accesses` are not
// present in this listing.
2826 int &FrameIndex) const {
2827 if (!isFrameStoreOpcode(MI.getOpcode()))
2828 return Register();
2829
2830 if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
2831 return Reg;
2832
2834 if (hasStoreToStackSlot(MI, Accesses)) {
// More than one stack access: no single frame index can be reported.
2835 if (Accesses.size() > 1)
2836 return Register();
2837
2838 FrameIndex =
2839 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2840 ->getFrameIndex();
2841 return MI.getOperand(0).getReg();
2842 }
2843 return Register();
2844}
2845
// Like isLoadFromStackSlot, but when that direct check fails it falls back to
// the stack accesses reported by hasLoadFromStackSlot. The fallback succeeds
// only when exactly one access is reported; FrameIndex is then taken from that
// access's fixed-stack pseudo source value.
// NOTE(review): the declarator line and the declaration of `Accesses` are not
// present in this listing.
2847 int &FrameIndex) const {
2848 if (!isFrameLoadOpcode(MI.getOpcode()))
2849 return Register();
2850
2851 if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
2852 return Reg;
2853
2855 if (hasLoadFromStackSlot(MI, Accesses)) {
// More than one stack access: no single frame index can be reported.
2856 if (Accesses.size() > 1)
2857 return Register();
2858
2859 FrameIndex =
2860 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2861 ->getFrameIndex();
2862 return MI.getOperand(0).getReg();
2863 }
2864 return Register();
2865}
2866
2867/// Check all MachineMemOperands for a hint to suppress pairing.
/// Returns true if any memory operand of MI has the MOSuppressPair flag set.
// NOTE(review): the declarator line is not present in this listing.
2869 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2870 return MMO->getFlags() & MOSuppressPair;
2871 });
2872}
2873
2874/// Set a flag on the first MachineMemOperand to suppress pairing.
/// Does nothing when MI has no memory operands.
// NOTE(review): the declarator line is not present in this listing.
2876 if (MI.memoperands_empty())
2877 return;
2878 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2879}
2880
2881/// Check all MachineMemOperands for a hint that the load/store is strided.
/// Returns true if any memory operand of MI has the MOStridedAccess flag set.
// NOTE(review): the declarator line is not present in this listing.
2883 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2884 return MMO->getFlags() & MOStridedAccess;
2885 });
2886}
2887
// Returns true for the load/store opcodes listed below (the STUR*/LDUR* forms
// and the pre-indexed STR*pre/LDR*pre forms) and false for any other opcode.
// NOTE(review): the declarator line is not present in this listing; presumably
// the listed opcodes are those taking an unscaled immediate offset - confirm.
2889 switch (Opc) {
2890 default:
2891 return false;
2892 case AArch64::STURSi:
2893 case AArch64::STRSpre:
2894 case AArch64::STURDi:
2895 case AArch64::STRDpre:
2896 case AArch64::STURQi:
2897 case AArch64::STRQpre:
2898 case AArch64::STURBBi:
2899 case AArch64::STURHHi:
2900 case AArch64::STURWi:
2901 case AArch64::STRWpre:
2902 case AArch64::STURXi:
2903 case AArch64::STRXpre:
2904 case AArch64::LDURSi:
2905 case AArch64::LDRSpre:
2906 case AArch64::LDURDi:
2907 case AArch64::LDRDpre:
2908 case AArch64::LDURQi:
2909 case AArch64::LDRQpre:
2910 case AArch64::LDURWi:
2911 case AArch64::LDRWpre:
2912 case AArch64::LDURXi:
2913 case AArch64::LDRXpre:
2914 case AArch64::LDRSWpre:
2915 case AArch64::LDURSWi:
2916 case AArch64::LDURHHi:
2917 case AArch64::LDURBBi:
2918 case AArch64::LDURSBWi:
2919 case AArch64::LDURSHWi:
2920 return true;
2921 }
2922}
2923
2924std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2925 switch (Opc) {
2926 default: return {};
2927 case AArch64::PRFMui: return AArch64::PRFUMi;
2928 case AArch64::LDRXui: return AArch64::LDURXi;
2929 case AArch64::LDRWui: return AArch64::LDURWi;
2930 case AArch64::LDRBui: return AArch64::LDURBi;
2931 case AArch64::LDRHui: return AArch64::LDURHi;
2932 case AArch64::LDRSui: return AArch64::LDURSi;
2933 case AArch64::LDRDui: return AArch64::LDURDi;
2934 case AArch64::LDRQui: return AArch64::LDURQi;
2935 case AArch64::LDRBBui: return AArch64::LDURBBi;
2936 case AArch64::LDRHHui: return AArch64::LDURHHi;
2937 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2938 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2939 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2940 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2941 case AArch64::LDRSWui: return AArch64::LDURSWi;
2942 case AArch64::STRXui: return AArch64::STURXi;
2943 case AArch64::STRWui: return AArch64::STURWi;
2944 case AArch64::STRBui: return AArch64::STURBi;
2945 case AArch64::STRHui: return AArch64::STURHi;
2946 case AArch64::STRSui: return AArch64::STURSi;
2947 case AArch64::STRDui: return AArch64::STURDi;
2948 case AArch64::STRQui: return AArch64::STURQi;
2949 case AArch64::STRBBui: return AArch64::STURBBi;
2950 case AArch64::STRHHui: return AArch64::STURHHi;
2951 }
2952}
2953
// Body of the opcode -> operand-index lookup that the unreachable message
// below names "getLoadStoreImmIdx" (its signature line is not present in this
// listing). The switch returns 2, 3 or 4 depending on Opc and reaches
// llvm_unreachable for every opcode that is not listed.
// NOTE(review): presumably the returned value is the position of the
// immediate-offset operand of the instruction - confirm against the
// declaration.
2955 switch (Opc) {
2956 default:
2957 llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx");
// Opcodes that map to index 2.
2958 case AArch64::ADDG:
2959 case AArch64::LDAPURBi:
2960 case AArch64::LDAPURHi:
2961 case AArch64::LDAPURi:
2962 case AArch64::LDAPURSBWi:
2963 case AArch64::LDAPURSBXi:
2964 case AArch64::LDAPURSHWi:
2965 case AArch64::LDAPURSHXi:
2966 case AArch64::LDAPURSWi:
2967 case AArch64::LDAPURXi:
2968 case AArch64::LDR_PPXI:
2969 case AArch64::LDR_PXI:
2970 case AArch64::LDR_ZXI:
2971 case AArch64::LDR_ZZXI:
2972 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
2973 case AArch64::LDR_ZZZXI:
2974 case AArch64::LDR_ZZZZXI:
2975 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
2976 case AArch64::LDRBBui:
2977 case AArch64::LDRBui:
2978 case AArch64::LDRDui:
2979 case AArch64::LDRHHui:
2980 case AArch64::LDRHui:
2981 case AArch64::LDRQui:
2982 case AArch64::LDRSBWui:
2983 case AArch64::LDRSBXui:
2984 case AArch64::LDRSHWui:
2985 case AArch64::LDRSHXui:
2986 case AArch64::LDRSui:
2987 case AArch64::LDRSWui:
2988 case AArch64::LDRWui:
2989 case AArch64::LDRXui:
2990 case AArch64::LDURBBi:
2991 case AArch64::LDURBi:
2992 case AArch64::LDURDi:
2993 case AArch64::LDURHHi:
2994 case AArch64::LDURHi:
2995 case AArch64::LDURQi:
2996 case AArch64::LDURSBWi:
2997 case AArch64::LDURSBXi:
2998 case AArch64::LDURSHWi:
2999 case AArch64::LDURSHXi:
3000 case AArch64::LDURSi:
3001 case AArch64::LDURSWi:
3002 case AArch64::LDURWi:
3003 case AArch64::LDURXi:
3004 case AArch64::PRFMui:
3005 case AArch64::PRFUMi:
3006 case AArch64::ST2Gi:
3007 case AArch64::STGi:
3008 case AArch64::STLURBi:
3009 case AArch64::STLURHi:
3010 case AArch64::STLURWi:
3011 case AArch64::STLURXi:
3012 case AArch64::StoreSwiftAsyncContext:
3013 case AArch64::STR_PPXI:
3014 case AArch64::STR_PXI:
3015 case AArch64::STR_ZXI:
3016 case AArch64::STR_ZZXI:
3017 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
3018 case AArch64::STR_ZZZXI:
3019 case AArch64::STR_ZZZZXI:
3020 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
3021 case AArch64::STRBBui:
3022 case AArch64::STRBui:
3023 case AArch64::STRDui:
3024 case AArch64::STRHHui:
3025 case AArch64::STRHui:
3026 case AArch64::STRQui:
3027 case AArch64::STRSui:
3028 case AArch64::STRWui:
3029 case AArch64::STRXui:
3030 case AArch64::STURBBi:
3031 case AArch64::STURBi:
3032 case AArch64::STURDi:
3033 case AArch64::STURHHi:
3034 case AArch64::STURHi:
3035 case AArch64::STURQi:
3036 case AArch64::STURSi:
3037 case AArch64::STURWi:
3038 case AArch64::STURXi:
3039 case AArch64::STZ2Gi:
3040 case AArch64::STZGi:
3041 case AArch64::TAGPstack:
3042 return 2;
// Opcodes that map to index 3.
3043 case AArch64::LD1B_D_IMM:
3044 case AArch64::LD1B_H_IMM:
3045 case AArch64::LD1B_IMM:
3046 case AArch64::LD1B_S_IMM:
3047 case AArch64::LD1D_IMM:
3048 case AArch64::LD1H_D_IMM:
3049 case AArch64::LD1H_IMM:
3050 case AArch64::LD1H_S_IMM:
3051 case AArch64::LD1RB_D_IMM:
3052 case AArch64::LD1RB_H_IMM:
3053 case AArch64::LD1RB_IMM:
3054 case AArch64::LD1RB_S_IMM:
3055 case AArch64::LD1RD_IMM:
3056 case AArch64::LD1RH_D_IMM:
3057 case AArch64::LD1RH_IMM:
3058 case AArch64::LD1RH_S_IMM:
3059 case AArch64::LD1RSB_D_IMM:
3060 case AArch64::LD1RSB_H_IMM:
3061 case AArch64::LD1RSB_S_IMM:
3062 case AArch64::LD1RSH_D_IMM:
3063 case AArch64::LD1RSH_S_IMM:
3064 case AArch64::LD1RSW_IMM:
3065 case AArch64::LD1RW_D_IMM:
3066 case AArch64::LD1RW_IMM:
3067 case AArch64::LD1SB_D_IMM:
3068 case AArch64::LD1SB_H_IMM:
3069 case AArch64::LD1SB_S_IMM:
3070 case AArch64::LD1SH_D_IMM:
3071 case AArch64::LD1SH_S_IMM:
3072 case AArch64::LD1SW_D_IMM:
3073 case AArch64::LD1W_D_IMM:
3074 case AArch64::LD1W_IMM:
3075 case AArch64::LD2B_IMM:
3076 case AArch64::LD2D_IMM:
3077 case AArch64::LD2H_IMM:
3078 case AArch64::LD2W_IMM:
3079 case AArch64::LD3B_IMM:
3080 case AArch64::LD3D_IMM:
3081 case AArch64::LD3H_IMM:
3082 case AArch64::LD3W_IMM:
3083 case AArch64::LD4B_IMM:
3084 case AArch64::LD4D_IMM:
3085 case AArch64::LD4H_IMM:
3086 case AArch64::LD4W_IMM:
3087 case AArch64::LDG:
3088 case AArch64::LDNF1B_D_IMM:
3089 case AArch64::LDNF1B_H_IMM:
3090 case AArch64::LDNF1B_IMM:
3091 case AArch64::LDNF1B_S_IMM:
3092 case AArch64::LDNF1D_IMM:
3093 case AArch64::LDNF1H_D_IMM:
3094 case AArch64::LDNF1H_IMM:
3095 case AArch64::LDNF1H_S_IMM:
3096 case AArch64::LDNF1SB_D_IMM:
3097 case AArch64::LDNF1SB_H_IMM:
3098 case AArch64::LDNF1SB_S_IMM:
3099 case AArch64::LDNF1SH_D_IMM:
3100 case AArch64::LDNF1SH_S_IMM:
3101 case AArch64::LDNF1SW_D_IMM:
3102 case AArch64::LDNF1W_D_IMM:
3103 case AArch64::LDNF1W_IMM:
3104 case AArch64::LDNPDi:
3105 case AArch64::LDNPQi:
3106 case AArch64::LDNPSi:
3107 case AArch64::LDNPWi:
3108 case AArch64::LDNPXi:
3109 case AArch64::LDNT1B_ZRI:
3110 case AArch64::LDNT1D_ZRI:
3111 case AArch64::LDNT1H_ZRI:
3112 case AArch64::LDNT1W_ZRI:
3113 case AArch64::LDPDi:
3114 case AArch64::LDPQi:
3115 case AArch64::LDPSi:
3116 case AArch64::LDPWi:
3117 case AArch64::LDPXi:
3118 case AArch64::LDRBBpost:
3119 case AArch64::LDRBBpre:
3120 case AArch64::LDRBpost:
3121 case AArch64::LDRBpre:
3122 case AArch64::LDRDpost:
3123 case AArch64::LDRDpre:
3124 case AArch64::LDRHHpost:
3125 case AArch64::LDRHHpre:
3126 case AArch64::LDRHpost:
3127 case AArch64::LDRHpre:
3128 case AArch64::LDRQpost:
3129 case AArch64::LDRQpre:
3130 case AArch64::LDRSpost:
3131 case AArch64::LDRSpre:
3132 case AArch64::LDRWpost:
3133 case AArch64::LDRWpre:
3134 case AArch64::LDRXpost:
3135 case AArch64::LDRXpre:
3136 case AArch64::ST1B_D_IMM:
3137 case AArch64::ST1B_H_IMM:
3138 case AArch64::ST1B_IMM:
3139 case AArch64::ST1B_S_IMM:
3140 case AArch64::ST1D_IMM:
3141 case AArch64::ST1H_D_IMM:
3142 case AArch64::ST1H_IMM:
3143 case AArch64::ST1H_S_IMM:
3144 case AArch64::ST1W_D_IMM:
3145 case AArch64::ST1W_IMM:
3146 case AArch64::ST2B_IMM:
3147 case AArch64::ST2D_IMM:
3148 case AArch64::ST2H_IMM:
3149 case AArch64::ST2W_IMM:
3150 case AArch64::ST3B_IMM:
3151 case AArch64::ST3D_IMM:
3152 case AArch64::ST3H_IMM:
3153 case AArch64::ST3W_IMM:
3154 case AArch64::ST4B_IMM:
3155 case AArch64::ST4D_IMM:
3156 case AArch64::ST4H_IMM:
3157 case AArch64::ST4W_IMM:
3158 case AArch64::STGPi:
3159 case AArch64::STGPreIndex:
3160 case AArch64::STZGPreIndex:
3161 case AArch64::ST2GPreIndex:
3162 case AArch64::STZ2GPreIndex:
3163 case AArch64::STGPostIndex:
3164 case AArch64::STZGPostIndex:
3165 case AArch64::ST2GPostIndex:
3166 case AArch64::STZ2GPostIndex:
3167 case AArch64::STNPDi:
3168 case AArch64::STNPQi:
3169 case AArch64::STNPSi:
3170 case AArch64::STNPWi:
3171 case AArch64::STNPXi:
3172 case AArch64::STNT1B_ZRI:
3173 case AArch64::STNT1D_ZRI:
3174 case AArch64::STNT1H_ZRI:
3175 case AArch64::STNT1W_ZRI:
3176 case AArch64::STPDi:
3177 case AArch64::STPQi:
3178 case AArch64::STPSi:
3179 case AArch64::STPWi:
3180 case AArch64::STPXi:
3181 case AArch64::STRBBpost:
3182 case AArch64::STRBBpre:
3183 case AArch64::STRBpost:
3184 case AArch64::STRBpre:
3185 case AArch64::STRDpost:
3186 case AArch64::STRDpre:
3187 case AArch64::STRHHpost:
3188 case AArch64::STRHHpre:
3189 case AArch64::STRHpost:
3190 case AArch64::STRHpre:
3191 case AArch64::STRQpost:
3192 case AArch64::STRQpre:
3193 case AArch64::STRSpost:
3194 case AArch64::STRSpre:
3195 case AArch64::STRWpost:
3196 case AArch64::STRWpre:
3197 case AArch64::STRXpost:
3198 case AArch64::STRXpre:
3199 case AArch64::LD1B_2Z_IMM:
3200 case AArch64::LD1B_2Z_STRIDED_IMM:
3201 case AArch64::LD1H_2Z_IMM:
3202 case AArch64::LD1H_2Z_STRIDED_IMM:
3203 case AArch64::LD1W_2Z_IMM:
3204 case AArch64::LD1W_2Z_STRIDED_IMM:
3205 case AArch64::LD1D_2Z_IMM:
3206 case AArch64::LD1D_2Z_STRIDED_IMM:
3207 case AArch64::LD1B_4Z_IMM:
3208 case AArch64::LD1B_4Z_STRIDED_IMM:
3209 case AArch64::LD1H_4Z_IMM:
3210 case AArch64::LD1H_4Z_STRIDED_IMM:
3211 case AArch64::LD1W_4Z_IMM:
3212 case AArch64::LD1W_4Z_STRIDED_IMM:
3213 case AArch64::LD1D_4Z_IMM:
3214 case AArch64::LD1D_4Z_STRIDED_IMM:
3215 case AArch64::LD1B_2Z_IMM_PSEUDO:
3216 case AArch64::LD1H_2Z_IMM_PSEUDO:
3217 case AArch64::LD1W_2Z_IMM_PSEUDO:
3218 case AArch64::LD1D_2Z_IMM_PSEUDO:
3219 case AArch64::LD1B_4Z_IMM_PSEUDO:
3220 case AArch64::LD1H_4Z_IMM_PSEUDO:
3221 case AArch64::LD1W_4Z_IMM_PSEUDO:
3222 case AArch64::LD1D_4Z_IMM_PSEUDO:
3223 case AArch64::ST1B_2Z_IMM:
3224 case AArch64::ST1B_2Z_STRIDED_IMM:
3225 case AArch64::ST1H_2Z_IMM:
3226 case AArch64::ST1H_2Z_STRIDED_IMM:
3227 case AArch64::ST1W_2Z_IMM:
3228 case AArch64::ST1W_2Z_STRIDED_IMM:
3229 case AArch64::ST1D_2Z_IMM:
3230 case AArch64::ST1D_2Z_STRIDED_IMM:
3231 case AArch64::LDNT1B_2Z_IMM_PSEUDO:
3232 case AArch64::LDNT1B_2Z_IMM:
3233 case AArch64::LDNT1B_2Z_STRIDED_IMM:
3234 case AArch64::LDNT1H_2Z_IMM_PSEUDO:
3235 case AArch64::LDNT1H_2Z_IMM:
3236 case AArch64::LDNT1H_2Z_STRIDED_IMM:
3237 case AArch64::LDNT1W_2Z_IMM_PSEUDO:
3238 case AArch64::LDNT1W_2Z_IMM:
3239 case AArch64::LDNT1W_2Z_STRIDED_IMM:
3240 case AArch64::LDNT1D_2Z_IMM_PSEUDO:
3241 case AArch64::LDNT1D_2Z_IMM:
3242 case AArch64::LDNT1D_2Z_STRIDED_IMM:
3243 case AArch64::STNT1B_2Z_IMM:
3244 case AArch64::STNT1B_2Z_STRIDED_IMM:
3245 case AArch64::STNT1H_2Z_IMM:
3246 case AArch64::STNT1H_2Z_STRIDED_IMM:
3247 case AArch64::STNT1W_2Z_IMM:
3248 case AArch64::STNT1W_2Z_STRIDED_IMM:
3249 case AArch64::STNT1D_2Z_IMM:
3250 case AArch64::STNT1D_2Z_STRIDED_IMM:
3251 case AArch64::ST1B_4Z_IMM:
3252 case AArch64::ST1B_4Z_STRIDED_IMM:
3253 case AArch64::ST1H_4Z_IMM:
3254 case AArch64::ST1H_4Z_STRIDED_IMM:
3255 case AArch64::ST1W_4Z_IMM:
3256 case AArch64::ST1W_4Z_STRIDED_IMM:
3257 case AArch64::ST1D_4Z_IMM:
3258 case AArch64::ST1D_4Z_STRIDED_IMM:
3259 case AArch64::LDNT1B_4Z_IMM_PSEUDO:
3260 case AArch64::LDNT1B_4Z_IMM:
3261 case AArch64::LDNT1B_4Z_STRIDED_IMM:
3262 case AArch64::LDNT1H_4Z_IMM_PSEUDO:
3263 case AArch64::LDNT1H_4Z_IMM:
3264 case AArch64::LDNT1H_4Z_STRIDED_IMM:
3265 case AArch64::LDNT1W_4Z_IMM_PSEUDO:
3266 case AArch64::LDNT1W_4Z_IMM:
3267 case AArch64::LDNT1W_4Z_STRIDED_IMM:
3268 case AArch64::LDNT1D_4Z_IMM_PSEUDO:
3269 case AArch64::LDNT1D_4Z_IMM:
3270 case AArch64::LDNT1D_4Z_STRIDED_IMM:
3271 case AArch64::STNT1B_4Z_IMM:
3272 case AArch64::STNT1B_4Z_STRIDED_IMM:
3273 case AArch64::STNT1H_4Z_IMM:
3274 case AArch64::STNT1H_4Z_STRIDED_IMM:
3275 case AArch64::STNT1W_4Z_IMM:
3276 case AArch64::STNT1W_4Z_STRIDED_IMM:
3277 case AArch64::STNT1D_4Z_IMM:
3278 case AArch64::STNT1D_4Z_STRIDED_IMM:
3279 return 3;
// Pre-/post-indexed pair opcodes (including STGPpre/STGPpost) map to index 4.
3280 case AArch64::LDPDpost:
3281 case AArch64::LDPDpre:
3282 case AArch64::LDPQpost:
3283 case AArch64::LDPQpre:
3284 case AArch64::LDPSpost:
3285 case AArch64::LDPSpre:
3286 case AArch64::LDPWpost:
3287 case AArch64::LDPWpre:
3288 case AArch64::LDPXpost:
3289 case AArch64::LDPXpre:
3290 case AArch64::STGPpre:
3291 case AArch64::STGPpost:
3292 case AArch64::STPDpost:
3293 case AArch64::STPDpre:
3294 case AArch64::STPQpost:
3295 case AArch64::STPQpre:
3296 case AArch64::STPSpost:
3297 case AArch64::STPSpre:
3298 case AArch64::STPWpost:
3299 case AArch64::STPWpre:
3300 case AArch64::STPXpost:
3301 case AArch64::STPXpre:
3302 return 4;
3303 }
3304}
3305
// Opcode predicate over MI (the signature line is not present in this
// listing): returns true when MI's opcode is one of the load/store opcodes
// enumerated below and false for everything else.
// NOTE(review): presumably this is the "can take part in LDP/STP pairing"
// whitelist - confirm against the declaration.
3307 switch (MI.getOpcode()) {
3308 default:
3309 return false;
3310 // Scaled instructions.
3311 case AArch64::STRSui:
3312 case AArch64::STRDui:
3313 case AArch64::STRQui:
3314 case AArch64::STRXui:
3315 case AArch64::STRWui:
3316 case AArch64::LDRSui:
3317 case AArch64::LDRDui:
3318 case AArch64::LDRQui:
3319 case AArch64::LDRXui:
3320 case AArch64::LDRWui:
3321 case AArch64::LDRSWui:
3322 // Unscaled instructions.
// (Each unscaled opcode is listed next to the pre-indexed opcode of the same
// register class, e.g. STURSi / STRSpre.)
3323 case AArch64::STURSi:
3324 case AArch64::STRSpre:
3325 case AArch64::STURDi:
3326 case AArch64::STRDpre:
3327 case AArch64::STURQi:
3328 case AArch64::STRQpre:
3329 case AArch64::STURWi:
3330 case AArch64::STRWpre:
3331 case AArch64::STURXi:
3332 case AArch64::STRXpre:
3333 case AArch64::LDURSi:
3334 case AArch64::LDRSpre:
3335 case AArch64::LDURDi:
3336 case AArch64::LDRDpre:
3337 case AArch64::LDURQi:
3338 case AArch64::LDRQpre:
3339 case AArch64::LDURWi:
3340 case AArch64::LDRWpre:
3341 case AArch64::LDURXi:
3342 case AArch64::LDRXpre:
3343 case AArch64::LDURSWi:
3344 case AArch64::LDRSWpre:
3345 // SVE instructions.
3346 case AArch64::LDR_ZXI:
3347 case AArch64::STR_ZXI:
3348 return true;
3349 }
3350}
3351
// Opcode predicate over MI (the signature line is not present in this
// listing): returns true for the TCRETURN* / AUTH_TCRETURN* opcodes listed
// below and false for every other opcode.
3353 switch (MI.getOpcode()) {
3354 default:
// An unlisted instruction that is both a call and a return trips the assert:
// it most likely is a tail-call opcode that has to be added to the list.
3355 assert((!MI.isCall() || !MI.isReturn()) &&
3356 "Unexpected instruction - was a new tail call opcode introduced?");
3357 return false;
3358 case AArch64::TCRETURNdi:
3359 case AArch64::TCRETURNri:
3360 case AArch64::TCRETURNrix16x17:
3361 case AArch64::TCRETURNrix17:
3362 case AArch64::TCRETURNrinotx16:
3363 case AArch64::TCRETURNriALL:
3364 case AArch64::AUTH_TCRETURN:
3365 case AArch64::AUTH_TCRETURN_BTI:
3366 return true;
3367 }
3368}
3369
// Opcode translation over Opc (the signature line is not present in this
// listing): returns the flag-setting counterpart of Opc, i.e. the opcode whose
// mnemonic carries the extra 'S' (ADDWri -> ADDSWri, AND_PPzPP -> ANDS_PPzPP,
// ...). Opcodes without an entry reach llvm_unreachable.
3371 switch (Opc) {
3372 default:
3373 llvm_unreachable("Opcode has no flag setting equivalent!");
3374 // 32-bit cases:
3375 case AArch64::ADDWri:
3376 return AArch64::ADDSWri;
3377 case AArch64::ADDWrr:
3378 return AArch64::ADDSWrr;
3379 case AArch64::ADDWrs:
3380 return AArch64::ADDSWrs;
3381 case AArch64::ADDWrx:
3382 return AArch64::ADDSWrx;
3383 case AArch64::ANDWri:
3384 return AArch64::ANDSWri;
3385 case AArch64::ANDWrr:
3386 return AArch64::ANDSWrr;
3387 case AArch64::ANDWrs:
3388 return AArch64::ANDSWrs;
3389 case AArch64::BICWrr:
3390 return AArch64::BICSWrr;
3391 case AArch64::BICWrs:
3392 return AArch64::BICSWrs;
3393 case AArch64::SUBWri:
3394 return AArch64::SUBSWri;
3395 case AArch64::SUBWrr:
3396 return AArch64::SUBSWrr;
3397 case AArch64::SUBWrs:
3398 return AArch64::SUBSWrs;
3399 case AArch64::SUBWrx:
3400 return AArch64::SUBSWrx;
3401 // 64-bit cases:
3402 case AArch64::ADDXri:
3403 return AArch64::ADDSXri;
3404 case AArch64::ADDXrr:
3405 return AArch64::ADDSXrr;
3406 case AArch64::ADDXrs:
3407 return AArch64::ADDSXrs;
3408 case AArch64::ADDXrx:
3409 return AArch64::ADDSXrx;
3410 case AArch64::ANDXri:
3411 return AArch64::ANDSXri;
3412 case AArch64::ANDXrr:
3413 return AArch64::ANDSXrr;
3414 case AArch64::ANDXrs:
3415 return AArch64::ANDSXrs;
3416 case AArch64::BICXrr:
3417 return AArch64::BICSXrr;
3418 case AArch64::BICXrs:
3419 return AArch64::BICSXrs;
3420 case AArch64::SUBXri:
3421 return AArch64::SUBSXri;
3422 case AArch64::SUBXrr:
3423 return AArch64::SUBSXrr;
3424 case AArch64::SUBXrs:
3425 return AArch64::SUBSXrs;
3426 case AArch64::SUBXrx:
3427 return AArch64::SUBSXrx;
3428 // SVE instructions:
3429 case AArch64::AND_PPzPP:
3430 return AArch64::ANDS_PPzPP;
3431 case AArch64::BIC_PPzPP:
3432 return AArch64::BICS_PPzPP;
3433 case AArch64::EOR_PPzPP:
3434 return AArch64::EORS_PPzPP;
3435 case AArch64::NAND_PPzPP:
3436 return AArch64::NANDS_PPzPP;
3437 case AArch64::NOR_PPzPP:
3438 return AArch64::NORS_PPzPP;
3439 case AArch64::ORN_PPzPP:
3440 return AArch64::ORNS_PPzPP;
3441 case AArch64::ORR_PPzPP:
3442 return AArch64::ORRS_PPzPP;
3443 case AArch64::BRKA_PPzP:
3444 return AArch64::BRKAS_PPzP;
3445 case AArch64::BRKPA_PPzPP:
3446 return AArch64::BRKPAS_PPzPP;
3447 case AArch64::BRKB_PPzP:
3448 return AArch64::BRKBS_PPzP;
3449 case AArch64::BRKPB_PPzPP:
3450 return AArch64::BRKPBS_PPzPP;
3451 case AArch64::BRKN_PPzP:
3452 return AArch64::BRKNS_PPzP;
3453 case AArch64::RDFFR_PPz:
3454 return AArch64::RDFFRS_PPz;
3455 case AArch64::PTRUE_B:
3456 return AArch64::PTRUES_B;
3457 }
3458}
3459
3460// Is this a candidate for ld/st merging or pairing? For example, we don't
3461// touch volatiles or load/stores that have a hint to avoid pair formation.
// Returns false as soon as one of the checks below rejects MI and true when
// all of them pass.
// NOTE(review): the signature (listing line 3462) and listing lines 3495, 3514
// and 3525 are absent from this excerpt; the affected statements are marked
// below.
3463
3464 bool IsPreLdSt = isPreLdSt(MI);
3465
3466 // If this is a volatile load/store, don't mess with it.
3467 if (MI.hasOrderedMemoryRef())
3468 return false;
3469
3470 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
3471 // For Pre-inc LD/ST, the operand is shifted by one.
3472 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
3473 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
3474 "Expected a reg or frame index operand.");
3475
3476 // For Pre-indexed addressing quadword instructions, the third operand is the
3477 // immediate value.
3478 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
3479
// Reject anything whose offset operand is not an immediate (operand 2, or
// operand 3 for the pre-indexed forms).
3480 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
3481 return false;
3482
3483 // Can't merge/pair if the instruction modifies the base register.
3484 // e.g., ldr x0, [x0]
3485 // This case will never occur with an FI base.
3486 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
3487 // STR<S,D,Q,W,X>pre, it can be merged.
3488 // For example:
3489 // ldr q0, [x11, #32]!
3490 // ldr q1, [x11, #16]
3491 // to
3492 // ldp q0, q1, [x11, #32]!
3493 if (MI.getOperand(1).isReg() && !IsPreLdSt) {
3494 Register BaseReg = MI.getOperand(1).getReg();
// NOTE(review): listing line 3495 (presumably the declaration of TRI) is
// missing here.
3496 if (MI.modifiesRegister(BaseReg, TRI))
3497 return false;
3498 }
3499
3500 // Pairing SVE fills/spills is only valid for little-endian targets that
3501 // implement VLS 128.
3502 switch (MI.getOpcode()) {
3503 default:
3504 break;
3505 case AArch64::LDR_ZXI:
3506 case AArch64::STR_ZXI:
3507 if (!Subtarget.isLittleEndian() ||
3508 Subtarget.getSVEVectorSizeInBits() != 128)
3509 return false;
3510 }
3511
3512 // Check if this load/store has a hint to avoid pair formation.
3513 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
// NOTE(review): listing line 3514 (the condition guarding this return) is
// missing here.
3515 return false;
3516
3517 // Do not pair any callee-save store/reload instructions in the
3518 // prologue/epilogue if the CFI information encoded the operations as separate
3519 // instructions, as that will cause the size of the actual prologue to mismatch
3520 // with the prologue size recorded in the Windows CFI.
3521 const MCAsmInfo &MAI = MI.getMF()->getTarget().getMCAsmInfo();
3522 bool NeedsWinCFI =
3523 MAI.usesWindowsCFI() && MI.getMF()->getFunction().needsUnwindTableEntry();
// NOTE(review): listing line 3525 (the rest of this condition) is missing
// here.
3524 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
3526 return false;
3527
3528 // On some CPUs quad load/store pairs are slower than two single load/stores.
3529 if (Subtarget.isPaired128Slow()) {
3530 switch (MI.getOpcode()) {
3531 default:
3532 break;
3533 case AArch64::LDURQi:
3534 case AArch64::STURQi:
3535 case AArch64::LDRQui:
3536 case AArch64::STRQui:
3537 return false;
3538 }
3539 }
3540
3541 return true;
3542}
3543
// Collects the base operand, offset and access width of the memory instruction
// LdSt by delegating to getMemOperandWithOffsetWidth. On success the single
// base operand is appended to BaseOps, Offset / OffsetIsScalable are filled in
// by the helper, Width is set, and true is returned. Returns false when LdSt
// neither loads nor stores, or when the helper fails.
// NOTE(review): the first lines of the signature (listing lines 3544-3545) are
// absent from this excerpt.
3546 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
3547 const TargetRegisterInfo *TRI) const {
3548 if (!LdSt.mayLoadOrStore())
3549 return false;
3550
3551 const MachineOperand *BaseOp;
3552 TypeSize WidthN(0, false);
3553 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
3554 WidthN, TRI))
3555 return false;
3556 // The maximum vscale is 16 under AArch64, return the maximal extent for the
3557 // vector.
// NOTE(review): the statement below stores WidthN through
// LocationSize::precise(); no vscale-based upper bound is computed here, so
// the "maximal extent" wording above looks stale - confirm.
3558 Width = LocationSize::precise(WidthN);
3559 BaseOps.push_back(BaseOp);
3560 return true;
3561}
3562
// Describes the address of MemI as an ExtAddrMode made of a base register plus
// a constant displacement (ScaledReg and Scale are both set to 0). Returns
// std::nullopt when getMemOperandWithOffset fails or when the base operand is
// not a register.
// NOTE(review): the line carrying the function name and first parameter
// (listing line 3564) is absent from this excerpt.
3563std::optional<ExtAddrMode>
3565 const TargetRegisterInfo *TRI) const {
3566 const MachineOperand *Base; // Filled with the base operand of MI.
3567 int64_t Offset; // Filled with the offset of MI.
3568 bool OffsetIsScalable;
3569 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
3570 return std::nullopt;
3571
3572 if (!Base->isReg())
3573 return std::nullopt;
3574 ExtAddrMode AM;
3575 AM.BaseReg = Base->getReg();
3576 AM.Displacement = Offset;
3577 AM.ScaledReg = 0;
3578 AM.Scale = 0;
3579 return AM;
3580}
3581
// Decides whether the address computation AddrI can be folded into the
// addressing mode of the memory instruction MemI, where Reg is the register
// being replaced ("the fold operand"). On success AM is filled with the new
// base register, scaled register, scale and displacement and true is returned;
// otherwise false is returned.
//
// Two shapes of MemI are handled:
//  * [Reg, Reg] forms: an SBFMXri sign-extend or a SUBREG_TO_REG zero-extend
//    feeding the address is merged into the extend of the offset register.
//  * [Reg, #Imm] forms: ADDXri/SUBXri are merged into the displacement, and
//    ADDXrs/ADDXrr/ADDXrx are turned into a register offset.
//
// NOTE(review): the first signature line (listing line 3582) and listing lines
// 3729, 3759, 3803, 3809, 3861, 3898 and 3904-3905 are absent from this
// excerpt; the gaps are marked below.
3583 Register Reg,
3584 const MachineInstr &AddrI,
3585 ExtAddrMode &AM) const {
3586 // Filter out instructions into which we cannot fold.
// NumBytes is the access size of MemI; OffsetScale is the factor by which its
// immediate is multiplied to obtain a byte offset (1 for unscaled forms).
3587 unsigned NumBytes;
3588 int64_t OffsetScale = 1;
3589 switch (MemI.getOpcode()) {
3590 default:
3591 return false;
3592
3593 case AArch64::LDURQi:
3594 case AArch64::STURQi:
3595 NumBytes = 16;
3596 break;
3597
3598 case AArch64::LDURDi:
3599 case AArch64::STURDi:
3600 case AArch64::LDURXi:
3601 case AArch64::STURXi:
3602 NumBytes = 8;
3603 break;
3604
3605 case AArch64::LDURWi:
3606 case AArch64::LDURSWi:
3607 case AArch64::STURWi:
3608 NumBytes = 4;
3609 break;
3610
3611 case AArch64::LDURHi:
3612 case AArch64::STURHi:
3613 case AArch64::LDURHHi:
3614 case AArch64::STURHHi:
3615 case AArch64::LDURSHXi:
3616 case AArch64::LDURSHWi:
3617 NumBytes = 2;
3618 break;
3619
3620 case AArch64::LDRBroX:
3621 case AArch64::LDRBBroX:
3622 case AArch64::LDRSBXroX:
3623 case AArch64::LDRSBWroX:
3624 case AArch64::STRBroX:
3625 case AArch64::STRBBroX:
3626 case AArch64::LDURBi:
3627 case AArch64::LDURBBi:
3628 case AArch64::LDURSBXi:
3629 case AArch64::LDURSBWi:
3630 case AArch64::STURBi:
3631 case AArch64::STURBBi:
3632 case AArch64::LDRBui:
3633 case AArch64::LDRBBui:
3634 case AArch64::LDRSBXui:
3635 case AArch64::LDRSBWui:
3636 case AArch64::STRBui:
3637 case AArch64::STRBBui:
3638 NumBytes = 1;
3639 break;
3640
3641 case AArch64::LDRQroX:
3642 case AArch64::STRQroX:
3643 case AArch64::LDRQui:
3644 case AArch64::STRQui:
3645 NumBytes = 16;
3646 OffsetScale = 16;
3647 break;
3648
3649 case AArch64::LDRDroX:
3650 case AArch64::STRDroX:
3651 case AArch64::LDRXroX:
3652 case AArch64::STRXroX:
3653 case AArch64::LDRDui:
3654 case AArch64::STRDui:
3655 case AArch64::LDRXui:
3656 case AArch64::STRXui:
3657 NumBytes = 8;
3658 OffsetScale = 8;
3659 break;
3660
3661 case AArch64::LDRWroX:
3662 case AArch64::LDRSWroX:
3663 case AArch64::STRWroX:
3664 case AArch64::LDRWui:
3665 case AArch64::LDRSWui:
3666 case AArch64::STRWui:
3667 NumBytes = 4;
3668 OffsetScale = 4;
3669 break;
3670
3671 case AArch64::LDRHroX:
3672 case AArch64::STRHroX:
3673 case AArch64::LDRHHroX:
3674 case AArch64::STRHHroX:
3675 case AArch64::LDRSHXroX:
3676 case AArch64::LDRSHWroX:
3677 case AArch64::LDRHui:
3678 case AArch64::STRHui:
3679 case AArch64::LDRHHui:
3680 case AArch64::STRHHui:
3681 case AArch64::LDRSHXui:
3682 case AArch64::LDRSHWui:
3683 NumBytes = 2;
3684 OffsetScale = 2;
3685 break;
3686 }
3687
3688 // Check the fold operand is not the loaded/stored value.
// (Despite its name, BaseRegOp is operand 0 of MemI, i.e. the value operand.)
3689 const MachineOperand &BaseRegOp = MemI.getOperand(0);
3690 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
3691 return false;
3692
3693 // Handle memory instructions with a [Reg, Reg] addressing mode.
3694 if (MemI.getOperand(2).isReg()) {
3695 // Bail if the addressing mode already includes extension of the offset
3696 // register.
3697 if (MemI.getOperand(3).getImm())
3698 return false;
3699
3700 // Check if we actually have a scaled offset.
3701 if (MemI.getOperand(4).getImm() == 0)
3702 OffsetScale = 1;
3703
3704 // If the address instruction is folded into the base register, then the
3705 // addressing mode must not have a scale. Then we can swap the base and the
3706 // scaled registers.
3707 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
3708 return false;
3709
3710 switch (AddrI.getOpcode()) {
3711 default:
3712 return false;
3713
3714 case AArch64::SBFMXri:
3715 // sxtw Xa, Wm
3716 // ldr Xd, [Xn, Xa, lsl #N]
3717 // ->
3718 // ldr Xd, [Xn, Wm, sxtw #N]
// Only the SBFM whose two immediates are 0 and 31 (the sxtw shown above) is
// accepted.
3719 if (AddrI.getOperand(2).getImm() != 0 ||
3720 AddrI.getOperand(3).getImm() != 31)
3721 return false;
3722
// The register that is not being folded stays as the base; the extend's source
// becomes the scaled register.
3723 AM.BaseReg = MemI.getOperand(1).getReg();
3724 if (AM.BaseReg == Reg)
3725 AM.BaseReg = MemI.getOperand(2).getReg();
3726 AM.ScaledReg = AddrI.getOperand(1).getReg();
3727 AM.Scale = OffsetScale;
3728 AM.Displacement = 0;
// NOTE(review): listing line 3729 is missing here (presumably the assignment
// of AM.Form).
3730 return true;
3731
3732 case TargetOpcode::SUBREG_TO_REG: {
3733 // mov Wa, Wm
3734 // ldr Xd, [Xn, Xa, lsl #N]
3735 // ->
3736 // ldr Xd, [Xn, Wm, uxtw #N]
3737
3738 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
3739 if (AddrI.getOperand(2).getImm() != AArch64::sub_32)
3740 return false;
3741
// The 32-bit source must be a virtual register with exactly one non-debug use.
3742 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
3743 Register OffsetReg = AddrI.getOperand(1).getReg();
3744 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
3745 return false;
3746
// Its definition must be "ORRWrs Wa, WZR, Wm" with shift operand 0, i.e. the
// "mov Wa, Wm" shown above.
3747 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
3748 if (DefMI.getOpcode() != AArch64::ORRWrs ||
3749 DefMI.getOperand(1).getReg() != AArch64::WZR ||
3750 DefMI.getOperand(3).getImm() != 0)
3751 return false;
3752
3753 AM.BaseReg = MemI.getOperand(1).getReg();
3754 if (AM.BaseReg == Reg)
3755 AM.BaseReg = MemI.getOperand(2).getReg();
3756 AM.ScaledReg = DefMI.getOperand(2).getReg();
3757 AM.Scale = OffsetScale;
3758 AM.Displacement = 0;
// NOTE(review): listing line 3759 is missing here (presumably the assignment
// of AM.Form).
3760 return true;
3761 }
3762 }
3763 }
3764
3765 // Handle memory instructions with a [Reg, #Imm] addressing mode.
3766
3767 // Check we are not breaking a potential conversion to an LDP.
// Returns true when the fold is acceptable: either OldOffset was already
// outside [MinOffset, MaxOffset] for this access size, or NewOffset is still
// inside it. Access sizes other than 4, 8 and 16 bytes are always accepted.
3768 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
3769 int64_t NewOffset) -> bool {
3770 int64_t MinOffset, MaxOffset;
3771 switch (NumBytes) {
3772 default:
3773 return true;
3774 case 4:
3775 MinOffset = -256;
3776 MaxOffset = 252;
3777 break;
3778 case 8:
3779 MinOffset = -512;
3780 MaxOffset = 504;
3781 break;
3782 case 16:
3783 MinOffset = -1024;
3784 MaxOffset = 1008;
3785 break;
3786 }
3787 return OldOffset < MinOffset || OldOffset > MaxOffset ||
3788 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
3789 };
// Tries to absorb the constant Disp (in bytes) into MemI's displacement. Fills
// AM and returns true when the combined offset is legal and does not spoil an
// LDP opportunity.
3790 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
// The existing immediate is converted to bytes before Disp is added.
3791 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
3792 int64_t NewOffset = OldOffset + Disp;
3793 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
3794 return false;
3795 // If the old offset would fit into an LDP, but the new offset wouldn't,
3796 // bail out.
3797 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
3798 return false;
3799 AM.BaseReg = AddrI.getOperand(1).getReg();
3800 AM.ScaledReg = 0;
3801 AM.Scale = 0;
3802 AM.Displacement = NewOffset;
// NOTE(review): listing line 3803 is missing here (presumably the assignment
// of AM.Form).
3804 return true;
3805 };
3806
// Tries to turn "add base, offset-reg" into a [Reg, Reg] addressing mode with
// the given Scale. Requires MemI's current immediate to be 0 and the
// base+scaled-register mode to be legal for this access size.
// NOTE(review): listing line 3809 (the rest of the lambda's parameter list,
// which must declare the Form used below) is missing here.
3807 auto canFoldAddRegIntoAddrMode =
3808 [&](int64_t Scale,
3810 if (MemI.getOperand(2).getImm() != 0)
3811 return false;
// Reject scales that do not round-trip through unsigned.
3812 if ((unsigned)Scale != Scale)
3813 return false;
3814 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
3815 return false;
3816 AM.BaseReg = AddrI.getOperand(1).getReg();
3817 AM.ScaledReg = AddrI.getOperand(2).getReg();
3818 AM.Scale = Scale;
3819 AM.Displacement = 0;
3820 AM.Form = Form;
3821 return true;
3822 };
3823
// True for a Q-register store (STURQi/STRQui) on subtargets that report
// register-offset STRQ as slow.
3824 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
3825 unsigned Opcode = MemI.getOpcode();
3826 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
3827 Subtarget.isSTRQroSlow();
3828 };
3829
3830 int64_t Disp = 0;
3831 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
3832 switch (AddrI.getOpcode()) {
3833 default:
3834 return false;
3835
3836 case AArch64::ADDXri:
3837 // add Xa, Xn, #N
3838 // ldr Xd, [Xa, #M]
3839 // ->
3840 // ldr Xd, [Xn, #N'+M]
// The displacement is operand 2 shifted left by operand 3.
3841 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3842 return canFoldAddSubImmIntoAddrMode(Disp);
3843
3844 case AArch64::SUBXri:
3845 // sub Xa, Xn, #N
3846 // ldr Xd, [Xa, #M]
3847 // ->
3848 // ldr Xd, [Xn, #N'+M]
// Same as ADDXri, with the displacement negated.
3849 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3850 return canFoldAddSubImmIntoAddrMode(-Disp);
3851
3852 case AArch64::ADDXrs: {
3853 // add Xa, Xn, Xm, lsl #N
3854 // ldr Xd, [Xa]
3855 // ->
3856 // ldr Xd, [Xn, Xm, lsl #N]
3857
3858 // Don't fold the add if the result would be slower, unless optimising for
3859 // size.
3860 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
// NOTE(review): listing line 3861 (the condition guarding this return) is
// missing here.
3862 return false;
3863 Shift = AArch64_AM::getShiftValue(Shift);
3864 if (!OptSize) {
3865 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3866 return false;
3867 if (avoidSlowSTRQ(MemI))
3868 return false;
3869 }
3870 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3871 }
3872
3873 case AArch64::ADDXrr:
3874 // add Xa, Xn, Xm
3875 // ldr Xd, [Xa]
3876 // ->
3877 // ldr Xd, [Xn, Xm, lsl #0]
3878
3879 // Don't fold the add if the result would be slower, unless optimising for
3880 // size.
3881 if (!OptSize && avoidSlowSTRQ(MemI))
3882 return false;
3883 return canFoldAddRegIntoAddrMode(1);
3884
3885 case AArch64::ADDXrx:
3886 // add Xa, Xn, Wm, {s,u}xtw #N
3887 // ldr Xd, [Xa]
3888 // ->
3889 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3890
3891 // Don't fold the add if the result would be slower, unless optimising for
3892 // size.
3893 if (!OptSize && avoidSlowSTRQ(MemI))
3894 return false;
3895
3896 // Can fold only sign-/zero-extend of a word.
3897 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
// NOTE(review): listing line 3898 (the declaration of Extend) is missing here.
3899 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3900 return false;
3901
// NOTE(review): listing lines 3904-3905 (the remaining argument of this call)
// are missing below.
3902 return canFoldAddRegIntoAddrMode(
3903 1ULL << AArch64_AM::getArithShiftValue(Imm),
3906 }
3907}
3908
3909// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3910// return the opcode of an instruction performing the same operation, but using
3911// the [Reg, Reg] addressing mode.
3912static unsigned regOffsetOpcode(unsigned Opcode) {
3913 switch (Opcode) {
3914 default:
3915 llvm_unreachable("Address folding not implemented for instruction");
3916
3917 case AArch64::LDURQi:
3918 case AArch64::LDRQui:
3919 return AArch64::LDRQroX;
3920 case AArch64::STURQi:
3921 case AArch64::STRQui:
3922 return AArch64::STRQroX;
3923 case AArch64::LDURDi:
3924 case AArch64::LDRDui:
3925 return AArch64::LDRDroX;
3926 case AArch64::STURDi:
3927 case AArch64::STRDui:
3928 return AArch64::STRDroX;
3929 case AArch64::LDURXi:
3930 case AArch64::LDRXui:
3931 return AArch64::LDRXroX;
3932 case AArch64::STURXi:
3933 case AArch64::STRXui:
3934 return AArch64::STRXroX;
3935 case AArch64::LDURWi:
3936 case AArch64::LDRWui:
3937 return AArch64::LDRWroX;
3938 case AArch64::LDURSWi:
3939 case AArch64::LDRSWui:
3940 return AArch64::LDRSWroX;
3941 case AArch64::STURWi:
3942 case AArch64::STRWui:
3943 return AArch64::STRWroX;
3944 case AArch64::LDURHi:
3945 case AArch64::LDRHui:
3946 return AArch64::LDRHroX;
3947 case AArch64::STURHi:
3948 case AArch64::STRHui:
3949 return AArch64::STRHroX;
3950 case AArch64::LDURHHi:
3951 case AArch64::LDRHHui:
3952 return AArch64::LDRHHroX;
3953 case AArch64::STURHHi:
3954 case AArch64::STRHHui:
3955 return AArch64::STRHHroX;
3956 case AArch64::LDURSHXi:
3957 case AArch64::LDRSHXui:
3958 return AArch64::LDRSHXroX;
3959 case AArch64::LDURSHWi:
3960 case AArch64::LDRSHWui:
3961 return AArch64::LDRSHWroX;
3962 case AArch64::LDURBi:
3963 case AArch64::LDRBui:
3964 return AArch64::LDRBroX;
3965 case AArch64::LDURBBi:
3966 case AArch64::LDRBBui:
3967 return AArch64::LDRBBroX;
3968 case AArch64::LDURSBXi:
3969 case AArch64::LDRSBXui:
3970 return AArch64::LDRSBXroX;
3971 case AArch64::LDURSBWi:
3972 case AArch64::LDRSBWui:
3973 return AArch64::LDRSBWroX;
3974 case AArch64::STURBi:
3975 case AArch64::STRBui:
3976 return AArch64::STRBroX;
3977 case AArch64::STURBBi:
3978 case AArch64::STRBBui:
3979 return AArch64::STRBBroX;
3980 }
3981}
3982
3983// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3984// the opcode of an instruction performing the same operation, but using the
3985// [Reg, #Imm] addressing mode with scaled offset.
3986unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3987 switch (Opcode) {
3988 default:
3989 llvm_unreachable("Address folding not implemented for instruction");
3990
3991 case AArch64::LDURQi:
3992 Scale = 16;
3993 return AArch64::LDRQui;
3994 case AArch64::STURQi:
3995 Scale = 16;
3996 return AArch64::STRQui;
3997 case AArch64::LDURDi:
3998 Scale = 8;
3999 return AArch64::LDRDui;
4000 case AArch64::STURDi:
4001 Scale = 8;
4002 return AArch64::STRDui;
4003 case AArch64::LDURXi:
4004 Scale = 8;
4005 return AArch64::LDRXui;
4006 case AArch64::STURXi:
4007 Scale = 8;
4008 return AArch64::STRXui;
4009 case AArch64::LDURWi:
4010 Scale = 4;
4011 return AArch64::LDRWui;
4012 case AArch64::LDURSWi:
4013 Scale = 4;
4014 return AArch64::LDRSWui;
4015 case AArch64::STURWi:
4016 Scale = 4;
4017 return AArch64::STRWui;
4018 case AArch64::LDURHi:
4019 Scale = 2;
4020 return AArch64::LDRHui;
4021 case AArch64::STURHi:
4022 Scale = 2;
4023 return AArch64::STRHui;
4024 case AArch64::LDURHHi:
4025 Scale = 2;
4026 return AArch64::LDRHHui;
4027 case AArch64::STURHHi:
4028 Scale = 2;
4029 return AArch64::STRHHui;
4030 case AArch64::LDURSHXi:
4031 Scale = 2;
4032 return AArch64::LDRSHXui;
4033 case AArch64::LDURSHWi:
4034 Scale = 2;
4035 return AArch64::LDRSHWui;
4036 case AArch64::LDURBi:
4037 Scale = 1;
4038 return AArch64::LDRBui;
4039 case AArch64::LDURBBi:
4040 Scale = 1;
4041 return AArch64::LDRBBui;
4042 case AArch64::LDURSBXi:
4043 Scale = 1;
4044 return AArch64::LDRSBXui;
4045 case AArch64::LDURSBWi:
4046 Scale = 1;
4047 return AArch64::LDRSBWui;
4048 case AArch64::STURBi:
4049 Scale = 1;
4050 return AArch64::STRBui;
4051 case AArch64::STURBBi:
4052 Scale = 1;
4053 return AArch64::STRBBui;
4054 case AArch64::LDRQui:
4055 case AArch64::STRQui:
4056 Scale = 16;
4057 return Opcode;
4058 case AArch64::LDRDui:
4059 case AArch64::STRDui:
4060 case AArch64::LDRXui:
4061 case AArch64::STRXui:
4062 Scale = 8;
4063 return Opcode;
4064 case AArch64::LDRWui:
4065 case AArch64::LDRSWui:
4066 case AArch64::STRWui:
4067 Scale = 4;
4068 return Opcode;
4069 case AArch64::LDRHui:
4070 case AArch64::STRHui:
4071 case AArch64::LDRHHui:
4072 case AArch64::STRHHui:
4073 case AArch64::LDRSHXui:
4074 case AArch64::LDRSHWui:
4075 Scale = 2;
4076 return Opcode;
4077 case AArch64::LDRBui:
4078 case AArch64::LDRBBui:
4079 case AArch64::LDRSBXui:
4080 case AArch64::LDRSBWui:
4081 case AArch64::STRBui:
4082 case AArch64::STRBBui:
4083 Scale = 1;
4084 return Opcode;
4085 }
4086}
4087
4088// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
4089// the opcode of an instruction performing the same operation, but using the
4090// [Reg, #Imm] addressing mode with unscaled offset.
4091unsigned unscaledOffsetOpcode(unsigned Opcode) {
4092 switch (Opcode) {
4093 default:
4094 llvm_unreachable("Address folding not implemented for instruction");
4095
4096 case AArch64::LDURQi:
4097 case AArch64::STURQi:
4098 case AArch64::LDURDi:
4099 case AArch64::STURDi:
4100 case AArch64::LDURXi:
4101 case AArch64::STURXi:
4102 case AArch64::LDURWi:
4103 case AArch64::LDURSWi:
4104 case AArch64::STURWi:
4105 case AArch64::LDURHi:
4106 case AArch64::STURHi:
4107 case AArch64::LDURHHi:
4108 case AArch64::STURHHi:
4109 case AArch64::LDURSHXi:
4110 case AArch64::LDURSHWi:
4111 case AArch64::LDURBi:
4112 case AArch64::STURBi:
4113 case AArch64::LDURBBi:
4114 case AArch64::STURBBi:
4115 case AArch64::LDURSBWi:
4116 case AArch64::LDURSBXi:
4117 return Opcode;
4118 case AArch64::LDRQui:
4119 return AArch64::LDURQi;
4120 case AArch64::STRQui:
4121 return AArch64::STURQi;
4122 case AArch64::LDRDui:
4123 return AArch64::LDURDi;
4124 case AArch64::STRDui:
4125 return AArch64::STURDi;
4126 case AArch64::LDRXui:
4127 return AArch64::LDURXi;
4128 case AArch64::STRXui:
4129 return AArch64::STURXi;
4130 case AArch64::LDRWui:
4131 return AArch64::LDURWi;
4132 case AArch64::LDRSWui:
4133 return AArch64::LDURSWi;
4134 case AArch64::STRWui:
4135 return AArch64::STURWi;
4136 case AArch64::LDRHui:
4137 return AArch64::LDURHi;
4138 case AArch64::STRHui:
4139 return AArch64::STURHi;
4140 case AArch64::LDRHHui:
4141 return AArch64::LDURHHi;
4142 case AArch64::STRHHui:
4143 return AArch64::STURHHi;
4144 case AArch64::LDRSHXui:
4145 return AArch64::LDURSHXi;
4146 case AArch64::LDRSHWui:
4147 return AArch64::LDURSHWi;
4148 case AArch64::LDRBBui:
4149 return AArch64::LDURBBi;
4150 case AArch64::LDRBui:
4151 return AArch64::LDURBi;
4152 case AArch64::STRBBui:
4153 return AArch64::STURBBi;
4154 case AArch64::STRBui:
4155 return AArch64::STURBi;
4156 case AArch64::LDRSBWui:
4157 return AArch64::LDURSBWi;
4158 case AArch64::LDRSBXui:
4159 return AArch64::LDURSBXi;
4160 }
4161}
4162
4163// Given the opcode of a memory load/store instruction, return the opcode of an
4164// instruction performing the same operation, but using
4165// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
4166// offset register.
4167static unsigned offsetExtendOpcode(unsigned Opcode) {
4168 switch (Opcode) {
4169 default:
4170 llvm_unreachable("Address folding not implemented for instruction");
4171
4172 case AArch64::LDRQroX:
4173 case AArch64::LDURQi:
4174 case AArch64::LDRQui:
4175 return AArch64::LDRQroW;
4176 case AArch64::STRQroX:
4177 case AArch64::STURQi:
4178 case AArch64::STRQui:
4179 return AArch64::STRQroW;
4180 case AArch64::LDRDroX:
4181 case AArch64::LDURDi:
4182 case AArch64::LDRDui:
4183 return AArch64::LDRDroW;
4184 case AArch64::STRDroX:
4185 case AArch64::STURDi:
4186 case AArch64::STRDui:
4187 return AArch64::STRDroW;
4188 case AArch64::LDRXroX:
4189 case AArch64::LDURXi:
4190 case AArch64::LDRXui:
4191 return AArch64::LDRXroW;
4192 case AArch64::STRXroX:
4193 case AArch64::STURXi:
4194 case AArch64::STRXui:
4195 return AArch64::STRXroW;
4196 case AArch64::LDRWroX:
4197 case AArch64::LDURWi:
4198 case AArch64::LDRWui:
4199 return AArch64::LDRWroW;
4200 case AArch64::LDRSWroX:
4201 case AArch64::LDURSWi:
4202 case AArch64::LDRSWui:
4203 return AArch64::LDRSWroW;
4204 case AArch64::STRWroX:
4205 case AArch64::STURWi:
4206 case AArch64::STRWui:
4207 return AArch64::STRWroW;
4208 case AArch64::LDRHroX:
4209 case AArch64::LDURHi:
4210 case AArch64::LDRHui:
4211 return AArch64::LDRHroW;
4212 case AArch64::STRHroX:
4213 case AArch64::STURHi:
4214 case AArch64::STRHui:
4215 return AArch64::STRHroW;
4216 case AArch64::LDRHHroX:
4217 case AArch64::LDURHHi:
4218 case AArch64::LDRHHui:
4219 return AArch64::LDRHHroW;
4220 case AArch64::STRHHroX:
4221 case AArch64::STURHHi:
4222 case AArch64::STRHHui:
4223 return AArch64::STRHHroW;
4224 case AArch64::LDRSHXroX:
4225 case AArch64::LDURSHXi:
4226 case AArch64::LDRSHXui:
4227 return AArch64::LDRSHXroW;
4228 case AArch64::LDRSHWroX:
4229 case AArch64::LDURSHWi:
4230 case AArch64::LDRSHWui:
4231 return AArch64::LDRSHWroW;
4232 case AArch64::LDRBroX:
4233 case AArch64::LDURBi:
4234 case AArch64::LDRBui:
4235 return AArch64::LDRBroW;
4236 case AArch64::LDRBBroX:
4237 case AArch64::LDURBBi:
4238 case AArch64::LDRBBui:
4239 return AArch64::LDRBBroW;
4240 case AArch64::LDRSBXroX:
4241 case AArch64::LDURSBXi:
4242 case AArch64::LDRSBXui:
4243 return AArch64::LDRSBXroW;
4244 case AArch64::LDRSBWroX:
4245 case AArch64::LDURSBWi:
4246 case AArch64::LDRSBWui:
4247 return AArch64::LDRSBWroW;
4248 case AArch64::STRBroX:
4249 case AArch64::STURBi:
4250 case AArch64::STRBui:
4251 return AArch64::STRBroW;
4252 case AArch64::STRBBroX:
4253 case AArch64::STURBBi:
4254 case AArch64::STRBBui:
4255 return AArch64::STRBBroW;
4256 }
4257}
4258
4260 const ExtAddrMode &AM) const {
4261
4262 const DebugLoc &DL = MemI.getDebugLoc();
4263 MachineBasicBlock &MBB = *MemI.getParent();
4264 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
4265
4267 if (AM.ScaledReg) {
4268 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
4269 unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
4270 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4271 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
4272 .addReg(MemI.getOperand(0).getReg(),
4273 getDefRegState(MemI.mayLoad()))
4274 .addReg(AM.BaseReg)
4275 .addReg(AM.ScaledReg)
4276 .addImm(0)
4277 .addImm(AM.Scale > 1)
4278 .setMemRefs(MemI.memoperands())
4279 .setMIFlags(MemI.getFlags());
4280 return B.getInstr();
4281 }
4282
4283 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
4284 "Addressing mode not supported for folding");
4285
4286 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
4287 unsigned Scale = 1;
4288 unsigned Opcode = MemI.getOpcode();
4289 if (isInt<9>(AM.Displacement))
4290 Opcode = unscaledOffsetOpcode(Opcode);
4291 else
4292 Opcode = scaledOffsetOpcode(Opcode, Scale);
4293
4294 auto B =
4295 BuildMI(MBB, MemI, DL, get(Opcode))
4296 .addReg(MemI.getOperand(0).getReg(), getDefRegState(MemI.mayLoad()))
4297 .addReg(AM.BaseReg)
4298 .addImm(AM.Displacement / Scale)
4299 .setMemRefs(MemI.memoperands())
4300 .setMIFlags(MemI.getFlags());
4301 return B.getInstr();
4302 }
4303
4306 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
4307 assert(AM.ScaledReg && !AM.Displacement &&
4308 "Address offset can be a register or an immediate, but not both");
4309 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
4310 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4311 // Make sure the offset register is in the correct register class.
4312 Register OffsetReg = AM.ScaledReg;
4313 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
4314 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
4315 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
4316 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
4317 .addReg(AM.ScaledReg, {}, AArch64::sub_32);
4318 }
4319 auto B =
4320 BuildMI(MBB, MemI, DL, get(Opcode))
4321 .addReg(MemI.getOperand(0).getReg(), getDefRegState(MemI.mayLoad()))
4322 .addReg(AM.BaseReg)
4323 .addReg(OffsetReg)
4325 .addImm(AM.Scale != 1)
4326 .setMemRefs(MemI.memoperands())
4327 .setMIFlags(MemI.getFlags());
4328
4329 return B.getInstr();
4330 }
4331
4333 "Function must not be called with an addressing mode it can't handle");
4334}
4335
4336/// Return true if the opcode is a post-index ld/st instruction, which really
4337/// loads from base+0.
4338static bool isPostIndexLdStOpcode(unsigned Opcode) {
4339 switch (Opcode) {
4340 default:
4341 return false;
4342 case AArch64::LD1Fourv16b_POST:
4343 case AArch64::LD1Fourv1d_POST:
4344 case AArch64::LD1Fourv2d_POST:
4345 case AArch64::LD1Fourv2s_POST:
4346 case AArch64::LD1Fourv4h_POST:
4347 case AArch64::LD1Fourv4s_POST:
4348 case AArch64::LD1Fourv8b_POST:
4349 case AArch64::LD1Fourv8h_POST:
4350 case AArch64::LD1Onev16b_POST:
4351 case AArch64::LD1Onev1d_POST:
4352 case AArch64::LD1Onev2d_POST:
4353 case AArch64::LD1Onev2s_POST:
4354 case AArch64::LD1Onev4h_POST:
4355 case AArch64::LD1Onev4s_POST:
4356 case AArch64::LD1Onev8b_POST:
4357 case AArch64::LD1Onev8h_POST:
4358 case AArch64::LD1Rv16b_POST:
4359 case AArch64::LD1Rv1d_POST:
4360 case AArch64::LD1Rv2d_POST:
4361 case AArch64::LD1Rv2s_POST:
4362 case AArch64::LD1Rv4h_POST:
4363 case AArch64::LD1Rv4s_POST:
4364 case AArch64::LD1Rv8b_POST:
4365 case AArch64::LD1Rv8h_POST:
4366 case AArch64::LD1Threev16b_POST:
4367 case AArch64::LD1Threev1d_POST:
4368 case AArch64::LD1Threev2d_POST:
4369 case AArch64::LD1Threev2s_POST:
4370 case AArch64::LD1Threev4h_POST:
4371 case AArch64::LD1Threev4s_POST:
4372 case AArch64::LD1Threev8b_POST:
4373 case AArch64::LD1Threev8h_POST:
4374 case AArch64::LD1Twov16b_POST:
4375 case AArch64::LD1Twov1d_POST:
4376 case AArch64::LD1Twov2d_POST:
4377 case AArch64::LD1Twov2s_POST:
4378 case AArch64::LD1Twov4h_POST:
4379 case AArch64::LD1Twov4s_POST:
4380 case AArch64::LD1Twov8b_POST:
4381 case AArch64::LD1Twov8h_POST:
4382 case AArch64::LD1i16_POST:
4383 case AArch64::LD1i32_POST:
4384 case AArch64::LD1i64_POST:
4385 case AArch64::LD1i8_POST:
4386 case AArch64::LD2Rv16b_POST:
4387 case AArch64::LD2Rv1d_POST:
4388 case AArch64::LD2Rv2d_POST:
4389 case AArch64::LD2Rv2s_POST:
4390 case AArch64::LD2Rv4h_POST:
4391 case AArch64::LD2Rv4s_POST:
4392 case AArch64::LD2Rv8b_POST:
4393 case AArch64::LD2Rv8h_POST:
4394 case AArch64::LD2Twov16b_POST:
4395 case AArch64::LD2Twov2d_POST:
4396 case AArch64::LD2Twov2s_POST:
4397 case AArch64::LD2Twov4h_POST:
4398 case AArch64::LD2Twov4s_POST:
4399 case AArch64::LD2Twov8b_POST:
4400 case AArch64::LD2Twov8h_POST:
4401 case AArch64::LD2i16_POST:
4402 case AArch64::LD2i32_POST:
4403 case AArch64::LD2i64_POST:
4404 case AArch64::LD2i8_POST:
4405 case AArch64::LD3Rv16b_POST:
4406 case AArch64::LD3Rv1d_POST:
4407 case AArch64::LD3Rv2d_POST:
4408 case AArch64::LD3Rv2s_POST:
4409 case AArch64::LD3Rv4h_POST:
4410 case AArch64::LD3Rv4s_POST:
4411 case AArch64::LD3Rv8b_POST:
4412 case AArch64::LD3Rv8h_POST:
4413 case AArch64::LD3Threev16b_POST:
4414 case AArch64::LD3Threev2d_POST:
4415 case AArch64::LD3Threev2s_POST:
4416 case AArch64::LD3Threev4h_POST:
4417 case AArch64::LD3Threev4s_POST:
4418 case AArch64::LD3Threev8b_POST:
4419 case AArch64::LD3Threev8h_POST:
4420 case AArch64::LD3i16_POST:
4421 case AArch64::LD3i32_POST:
4422 case AArch64::LD3i64_POST:
4423 case AArch64::LD3i8_POST:
4424 case AArch64::LD4Fourv16b_POST:
4425 case AArch64::LD4Fourv2d_POST:
4426 case AArch64::LD4Fourv2s_POST:
4427 case AArch64::LD4Fourv4h_POST:
4428 case AArch64::LD4Fourv4s_POST:
4429 case AArch64::LD4Fourv8b_POST:
4430 case AArch64::LD4Fourv8h_POST:
4431 case AArch64::LD4Rv16b_POST:
4432 case AArch64::LD4Rv1d_POST:
4433 case AArch64::LD4Rv2d_POST:
4434 case AArch64::LD4Rv2s_POST:
4435 case AArch64::LD4Rv4h_POST:
4436 case AArch64::LD4Rv4s_POST:
4437 case AArch64::LD4Rv8b_POST:
4438 case AArch64::LD4Rv8h_POST:
4439 case AArch64::LD4i16_POST:
4440 case AArch64::LD4i32_POST:
4441 case AArch64::LD4i64_POST:
4442 case AArch64::LD4i8_POST:
4443 case AArch64::LDAPRWpost:
4444 case AArch64::LDAPRXpost:
4445 case AArch64::LDIAPPWpost:
4446 case AArch64::LDIAPPXpost:
4447 case AArch64::LDPDpost:
4448 case AArch64::LDPQpost:
4449 case AArch64::LDPSWpost:
4450 case AArch64::LDPSpost:
4451 case AArch64::LDPWpost:
4452 case AArch64::LDPXpost:
4453 case AArch64::LDRBBpost:
4454 case AArch64::LDRBpost:
4455 case AArch64::LDRDpost:
4456 case AArch64::LDRHHpost:
4457 case AArch64::LDRHpost:
4458 case AArch64::LDRQpost:
4459 case AArch64::LDRSBWpost:
4460 case AArch64::LDRSBXpost:
4461 case AArch64::LDRSHWpost:
4462 case AArch64::LDRSHXpost:
4463 case AArch64::LDRSWpost:
4464 case AArch64::LDRSpost:
4465 case AArch64::LDRWpost:
4466 case AArch64::LDRXpost:
4467 case AArch64::ST1Fourv16b_POST:
4468 case AArch64::ST1Fourv1d_POST:
4469 case AArch64::ST1Fourv2d_POST:
4470 case AArch64::ST1Fourv2s_POST:
4471 case AArch64::ST1Fourv4h_POST:
4472 case AArch64::ST1Fourv4s_POST:
4473 case AArch64::ST1Fourv8b_POST:
4474 case AArch64::ST1Fourv8h_POST:
4475 case AArch64::ST1Onev16b_POST:
4476 case AArch64::ST1Onev1d_POST:
4477 case AArch64::ST1Onev2d_POST:
4478 case AArch64::ST1Onev2s_POST:
4479 case AArch64::ST1Onev4h_POST:
4480 case AArch64::ST1Onev4s_POST:
4481 case AArch64::ST1Onev8b_POST:
4482 case AArch64::ST1Onev8h_POST:
4483 case AArch64::ST1Threev16b_POST:
4484 case AArch64::ST1Threev1d_POST:
4485 case AArch64::ST1Threev2d_POST:
4486 case AArch64::ST1Threev2s_POST:
4487 case AArch64::ST1Threev4h_POST:
4488 case AArch64::ST1Threev4s_POST:
4489 case AArch64::ST1Threev8b_POST:
4490 case AArch64::ST1Threev8h_POST:
4491 case AArch64::ST1Twov16b_POST:
4492 case AArch64::ST1Twov1d_POST:
4493 case AArch64::ST1Twov2d_POST:
4494 case AArch64::ST1Twov2s_POST:
4495 case AArch64::ST1Twov4h_POST:
4496 case AArch64::ST1Twov4s_POST:
4497 case AArch64::ST1Twov8b_POST:
4498 case AArch64::ST1Twov8h_POST:
4499 case AArch64::ST1i16_POST:
4500 case AArch64::ST1i32_POST:
4501 case AArch64::ST1i64_POST:
4502 case AArch64::ST1i8_POST:
4503 case AArch64::ST2GPostIndex:
4504 case AArch64::ST2Twov16b_POST:
4505 case AArch64::ST2Twov2d_POST:
4506 case AArch64::ST2Twov2s_POST:
4507 case AArch64::ST2Twov4h_POST:
4508 case AArch64::ST2Twov4s_POST:
4509 case AArch64::ST2Twov8b_POST:
4510 case AArch64::ST2Twov8h_POST:
4511 case AArch64::ST2i16_POST:
4512 case AArch64::ST2i32_POST:
4513 case AArch64::ST2i64_POST:
4514 case AArch64::ST2i8_POST:
4515 case AArch64::ST3Threev16b_POST:
4516 case AArch64::ST3Threev2d_POST:
4517 case AArch64::ST3Threev2s_POST:
4518 case AArch64::ST3Threev4h_POST:
4519 case AArch64::ST3Threev4s_POST:
4520 case AArch64::ST3Threev8b_POST:
4521 case AArch64::ST3Threev8h_POST:
4522 case AArch64::ST3i16_POST:
4523 case AArch64::ST3i32_POST:
4524 case AArch64::ST3i64_POST:
4525 case AArch64::ST3i8_POST:
4526 case AArch64::ST4Fourv16b_POST:
4527 case AArch64::ST4Fourv2d_POST:
4528 case AArch64::ST4Fourv2s_POST:
4529 case AArch64::ST4Fourv4h_POST:
4530 case AArch64::ST4Fourv4s_POST:
4531 case AArch64::ST4Fourv8b_POST:
4532 case AArch64::ST4Fourv8h_POST:
4533 case AArch64::ST4i16_POST:
4534 case AArch64::ST4i32_POST:
4535 case AArch64::ST4i64_POST:
4536 case AArch64::ST4i8_POST:
4537 case AArch64::STGPostIndex:
4538 case AArch64::STGPpost:
4539 case AArch64::STPDpost:
4540 case AArch64::STPQpost:
4541 case AArch64::STPSpost:
4542 case AArch64::STPWpost:
4543 case AArch64::STPXpost:
4544 case AArch64::STRBBpost:
4545 case AArch64::STRBpost:
4546 case AArch64::STRDpost:
4547 case AArch64::STRHHpost:
4548 case AArch64::STRHpost:
4549 case AArch64::STRQpost:
4550 case AArch64::STRSpost:
4551 case AArch64::STRWpost:
4552 case AArch64::STRXpost:
4553 case AArch64::STZ2GPostIndex:
4554 case AArch64::STZGPostIndex:
4555 return true;
4556 }
4557}
4558
4560 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
4561 bool &OffsetIsScalable, TypeSize &Width,
4562 const TargetRegisterInfo *TRI) const {
4563 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4564 // Handle only loads/stores with base register followed by immediate offset.
4565 if (LdSt.getNumExplicitOperands() == 3) {
4566 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
4567 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
4568 !LdSt.getOperand(2).isImm())
4569 return false;
4570 } else if (LdSt.getNumExplicitOperands() == 4) {
4571 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
4572 if (!LdSt.getOperand(1).isReg() ||
4573 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
4574 !LdSt.getOperand(3).isImm())
4575 return false;
4576 } else
4577 return false;
4578
4579 // Get the scaling factor for the instruction and set the width for the
4580 // instruction.
4581 TypeSize Scale(0U, false);
4582 int64_t Dummy1, Dummy2;
4583
4584 // If this returns false, then it's an instruction we don't want to handle.
4585 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
4586 return false;
4587
4588 // Compute the offset. Offset is calculated as the immediate operand
4589 // multiplied by the scaling factor. Unscaled instructions have scaling factor
4590 // set to 1. Postindex are a special case which have an offset of 0.
4591 if (isPostIndexLdStOpcode(LdSt.getOpcode())) {
4592 BaseOp = &LdSt.getOperand(2);
4593 Offset = 0;
4594 } else if (LdSt.getNumExplicitOperands() == 3) {
4595 BaseOp = &LdSt.getOperand(1);
4596 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
4597 } else {
4598 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
4599 BaseOp = &LdSt.getOperand(2);
4600 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
4601 }
4602 OffsetIsScalable = Scale.isScalable();
4603
4604 return BaseOp->isReg() || BaseOp->isFI();
4605}
4606
4609 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4610 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
4611 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
4612 return OfsOp;
4613}
4614
4615bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
4616 TypeSize &Width, int64_t &MinOffset,
4617 int64_t &MaxOffset) {
4618 switch (Opcode) {
4619 // Not a memory operation or something we want to handle.
4620 default:
4621 Scale = Width = TypeSize::getFixed(0);
4622 MinOffset = MaxOffset = 0;
4623 return false;
4624 // LDR / STR
4625 case AArch64::LDRQui:
4626 case AArch64::STRQui:
4627 Scale = Width = TypeSize::getFixed(16);
4628 MinOffset = 0;
4629 MaxOffset = 4095;
4630 break;
4631 case AArch64::LDRXui:
4632 case AArch64::LDRDui:
4633 case AArch64::STRXui:
4634 case AArch64::STRDui:
4635 case AArch64::PRFMui:
4636 Scale = Width = TypeSize::getFixed(8);
4637 MinOffset = 0;
4638 MaxOffset = 4095;
4639 break;
4640 case AArch64::LDRWui:
4641 case AArch64::LDRSui:
4642 case AArch64::LDRSWui:
4643 case AArch64::STRWui:
4644 case AArch64::STRSui:
4645 Scale = Width = TypeSize::getFixed(4);
4646 MinOffset = 0;
4647 MaxOffset = 4095;
4648 break;
4649 case AArch64::LDRHui:
4650 case AArch64::LDRHHui:
4651 case AArch64::LDRSHWui:
4652 case AArch64::LDRSHXui:
4653 case AArch64::STRHui:
4654 case AArch64::STRHHui:
4655 Scale = Width = TypeSize::getFixed(2);
4656 MinOffset = 0;
4657 MaxOffset = 4095;
4658 break;
4659 case AArch64::LDRBui:
4660 case AArch64::LDRBBui:
4661 case AArch64::LDRSBWui:
4662 case AArch64::LDRSBXui:
4663 case AArch64::STRBui:
4664 case AArch64::STRBBui:
4665 Scale = Width = TypeSize::getFixed(1);
4666 MinOffset = 0;
4667 MaxOffset = 4095;
4668 break;
4669 // post/pre inc
4670 case AArch64::STRQpre:
4671 case AArch64::LDRQpost:
4672 Scale = TypeSize::getFixed(1);
4673 Width = TypeSize::getFixed(16);
4674 MinOffset = -256;
4675 MaxOffset = 255;
4676 break;
4677 case AArch64::LDRDpost:
4678 case AArch64::LDRDpre:
4679 case AArch64::LDRXpost:
4680 case AArch64::LDRXpre:
4681 case AArch64::STRDpost:
4682 case AArch64::STRDpre:
4683 case AArch64::STRXpost:
4684 case AArch64::STRXpre:
4685 Scale = TypeSize::getFixed(1);
4686 Width = TypeSize::getFixed(8);
4687 MinOffset = -256;
4688 MaxOffset = 255;
4689 break;
4690 case AArch64::STRWpost:
4691 case AArch64::STRWpre:
4692 case AArch64::LDRWpost:
4693 case AArch64::LDRWpre:
4694 case AArch64::STRSpost:
4695 case AArch64::STRSpre:
4696 case AArch64::LDRSpost:
4697 case AArch64::LDRSpre:
4698 Scale = TypeSize::getFixed(1);
4699 Width = TypeSize::getFixed(4);
4700 MinOffset = -256;
4701 MaxOffset = 255;
4702 break;
4703 case AArch64::LDRHpost:
4704 case AArch64::LDRHpre:
4705 case AArch64::STRHpost:
4706 case AArch64::STRHpre:
4707 case AArch64::LDRHHpost:
4708 case AArch64::LDRHHpre:
4709 case AArch64::STRHHpost:
4710 case AArch64::STRHHpre:
4711 Scale = TypeSize::getFixed(1);
4712 Width = TypeSize::getFixed(2);
4713 MinOffset = -256;
4714 MaxOffset = 255;
4715 break;
4716 case AArch64::LDRBpost:
4717 case AArch64::LDRBpre:
4718 case AArch64::STRBpost:
4719 case AArch64::STRBpre:
4720 case AArch64::LDRBBpost:
4721 case AArch64::LDRBBpre:
4722 case AArch64::STRBBpost:
4723 case AArch64::STRBBpre:
4724 Scale = Width = TypeSize::getFixed(1);
4725 MinOffset = -256;
4726 MaxOffset = 255;
4727 break;
4728 // Unscaled
4729 case AArch64::LDURQi:
4730 case AArch64::STURQi:
4731 Scale = TypeSize::getFixed(1);
4732 Width = TypeSize::getFixed(16);
4733 MinOffset = -256;
4734 MaxOffset = 255;
4735 break;
4736 case AArch64::LDURXi:
4737 case AArch64::LDURDi:
4738 case AArch64::LDAPURXi:
4739 case AArch64::STURXi:
4740 case AArch64::STURDi:
4741 case AArch64::STLURXi:
4742 case AArch64::PRFUMi:
4743 Scale = TypeSize::getFixed(1);
4744 Width = TypeSize::getFixed(8);
4745 MinOffset = -256;
4746 MaxOffset = 255;
4747 break;
4748 case AArch64::LDURWi:
4749 case AArch64::LDURSi:
4750 case AArch64::LDURSWi:
4751 case AArch64::LDAPURi:
4752 case AArch64::LDAPURSWi:
4753 case AArch64::STURWi:
4754 case AArch64::STURSi:
4755 case AArch64::STLURWi:
4756 Scale = TypeSize::getFixed(1);
4757 Width = TypeSize::getFixed(4);
4758 MinOffset = -256;
4759 MaxOffset = 255;
4760 break;
4761 case AArch64::LDURHi:
4762 case AArch64::LDURHHi:
4763 case AArch64::LDURSHXi:
4764 case AArch64::LDURSHWi:
4765 case AArch64::LDAPURHi:
4766 case AArch64::LDAPURSHWi:
4767 case AArch64::LDAPURSHXi:
4768 case AArch64::STURHi:
4769 case AArch64::STURHHi:
4770 case AArch64::STLURHi:
4771 Scale = TypeSize::getFixed(1);
4772 Width = TypeSize::getFixed(2);
4773 MinOffset = -256;
4774 MaxOffset = 255;
4775 break;
4776 case AArch64::LDURBi:
4777 case AArch64::LDURBBi:
4778 case AArch64::LDURSBXi:
4779 case AArch64::LDURSBWi:
4780 case AArch64::LDAPURBi:
4781 case AArch64::LDAPURSBWi:
4782 case AArch64::LDAPURSBXi:
4783 case AArch64::STURBi:
4784 case AArch64::STURBBi:
4785 case AArch64::STLURBi:
4786 Scale = Width = TypeSize::getFixed(1);
4787 MinOffset = -256;
4788 MaxOffset = 255;
4789 break;
4790 // LDP / STP (including pre/post inc)
4791 case AArch64::LDPQi:
4792 case AArch64::LDNPQi:
4793 case AArch64::STPQi:
4794 case AArch64::STNPQi:
4795 case AArch64::LDPQpost:
4796 case AArch64::LDPQpre:
4797 case AArch64::STPQpost:
4798 case AArch64::STPQpre:
4799 Scale = TypeSize::getFixed(16);
4800 Width = TypeSize::getFixed(16 * 2);
4801 MinOffset = -64;
4802 MaxOffset = 63;
4803 break;
4804 case AArch64::LDPXi:
4805 case AArch64::LDPDi:
4806 case AArch64::LDNPXi:
4807 case AArch64::LDNPDi:
4808 case AArch64::STPXi:
4809 case AArch64::STPDi:
4810 case AArch64::STNPXi:
4811 case AArch64::STNPDi:
4812 case AArch64::LDPDpost:
4813 case AArch64::LDPDpre:
4814 case AArch64::LDPXpost:
4815 case AArch64::LDPXpre:
4816 case AArch64::STPDpost:
4817 case AArch64::STPDpre:
4818 case AArch64::STPXpost:
4819 case AArch64::STPXpre:
4820 Scale = TypeSize::getFixed(8);
4821 Width = TypeSize::getFixed(8 * 2);
4822 MinOffset = -64;
4823 MaxOffset = 63;
4824 break;
4825 case AArch64::LDPWi:
4826 case AArch64::LDPSi:
4827 case AArch64::LDNPWi:
4828 case AArch64::LDNPSi:
4829 case AArch64::STPWi:
4830 case AArch64::STPSi:
4831 case AArch64::STNPWi:
4832 case AArch64::STNPSi:
4833 case AArch64::LDPSpost:
4834 case AArch64::LDPSpre:
4835 case AArch64::LDPWpost:
4836 case AArch64::LDPWpre:
4837 case AArch64::STPSpost:
4838 case AArch64::STPSpre:
4839 case AArch64::STPWpost:
4840 case AArch64::STPWpre:
4841 Scale = TypeSize::getFixed(4);
4842 Width = TypeSize::getFixed(4 * 2);
4843 MinOffset = -64;
4844 MaxOffset = 63;
4845 break;
4846 case AArch64::StoreSwiftAsyncContext:
4847 // Store is an STRXui, but there might be an ADDXri in the expansion too.
4848 Scale = TypeSize::getFixed(1);
4849 Width = TypeSize::getFixed(8);
4850 MinOffset = 0;
4851 MaxOffset = 4095;
4852 break;
4853 case AArch64::ADDG:
4854 Scale = TypeSize::getFixed(16);
4855 Width = TypeSize::getFixed(0);
4856 MinOffset = 0;
4857 MaxOffset = 63;
4858 break;
4859 case AArch64::TAGPstack:
4860 Scale = TypeSize::getFixed(16);
4861 Width = TypeSize::getFixed(0);
4862 // TAGP with a negative offset turns into SUBP, which has a maximum offset
4863 // of 63 (not 64!).
4864 MinOffset = -63;
4865 MaxOffset = 63;
4866 break;
4867 case AArch64::LDG:
4868 case AArch64::STGi:
4869 case AArch64::STGPreIndex:
4870 case AArch64::STGPostIndex:
4871 case AArch64::STZGi:
4872 case AArch64::STZGPreIndex:
4873 case AArch64::STZGPostIndex:
4874 Scale = Width = TypeSize::getFixed(16);
4875 MinOffset = -256;
4876 MaxOffset = 255;
4877 break;
4878 // SVE
4879 case AArch64::STR_ZZZZXI:
4880 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
4881 case AArch64::LDR_ZZZZXI:
4882 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
4883 Scale = TypeSize::getScalable(16);
4884 Width = TypeSize::getScalable(16 * 4);
4885 MinOffset = -256;
4886 MaxOffset = 252;
4887 break;
4888 case AArch64::STR_ZZZXI:
4889 case AArch64::LDR_ZZZXI:
4890 Scale = TypeSize::getScalable(16);
4891 Width = TypeSize::getScalable(16 * 3);
4892 MinOffset = -256;
4893 MaxOffset = 253;
4894 break;
4895 case AArch64::STR_ZZXI:
4896 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
4897 case AArch64::LDR_ZZXI:
4898 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
4899 Scale = TypeSize::getScalable(16);
4900 Width = TypeSize::getScalable(16 * 2);
4901 MinOffset = -256;
4902 MaxOffset = 254;
4903 break;
4904 case AArch64::LDR_PXI:
4905 case AArch64::STR_PXI:
4906 Scale = Width = TypeSize::getScalable(2);
4907 MinOffset = -256;
4908 MaxOffset = 255;
4909 break;
4910 case AArch64::LDR_PPXI:
4911 case AArch64::STR_PPXI:
4912 Scale = TypeSize::getScalable(2);
4913 Width = TypeSize::getScalable(2 * 2);
4914 MinOffset = -256;
4915 MaxOffset = 254;
4916 break;
4917 case AArch64::LDR_ZXI:
4918 case AArch64::STR_ZXI:
4919 Scale = Width = TypeSize::getScalable(16);
4920 MinOffset = -256;
4921 MaxOffset = 255;
4922 break;
4923 case AArch64::LD1B_IMM:
4924 case AArch64::LD1H_IMM:
4925 case AArch64::LD1W_IMM:
4926 case AArch64::LD1D_IMM:
4927 case AArch64::LDNT1B_ZRI:
4928 case AArch64::LDNT1H_ZRI:
4929 case AArch64::LDNT1W_ZRI:
4930 case AArch64::LDNT1D_ZRI:
4931 case AArch64::ST1B_IMM:
4932 case AArch64::ST1H_IMM:
4933 case AArch64::ST1W_IMM:
4934 case AArch64::ST1D_IMM:
4935 case AArch64::STNT1B_ZRI:
4936 case AArch64::STNT1H_ZRI:
4937 case AArch64::STNT1W_ZRI:
4938 case AArch64::STNT1D_ZRI:
4939 case AArch64::LDNF1B_IMM:
4940 case AArch64::LDNF1H_IMM:
4941 case AArch64::LDNF1W_IMM:
4942 case AArch64::LDNF1D_IMM:
4943 // A full vectors worth of data
4944 // Width = mbytes * elements
4945 Scale = Width = TypeSize::getScalable(16);
4946 MinOffset = -8;
4947 MaxOffset = 7;
4948 break;
4949 case AArch64::LD2B_IMM:
4950 case AArch64::LD2H_IMM:
4951 case AArch64::LD2W_IMM:
4952 case AArch64::LD2D_IMM:
4953 case AArch64::ST2B_IMM:
4954 case AArch64::ST2H_IMM:
4955 case AArch64::ST2W_IMM:
4956 case AArch64::ST2D_IMM:
4957 case AArch64::LD1B_2Z_IMM:
4958 case AArch64::LD1B_2Z_STRIDED_IMM:
4959 case AArch64::LD1H_2Z_IMM:
4960 case AArch64::LD1H_2Z_STRIDED_IMM:
4961 case AArch64::LD1W_2Z_IMM:
4962 case AArch64::LD1W_2Z_STRIDED_IMM:
4963 case AArch64::LD1D_2Z_IMM:
4964 case AArch64::LD1D_2Z_STRIDED_IMM:
4965 case AArch64::LD1B_2Z_IMM_PSEUDO:
4966 case AArch64::LD1H_2Z_IMM_PSEUDO:
4967 case AArch64::LD1W_2Z_IMM_PSEUDO:
4968 case AArch64::LD1D_2Z_IMM_PSEUDO:
4969 case AArch64::ST1B_2Z_IMM:
4970 case AArch64::ST1B_2Z_STRIDED_IMM:
4971 case AArch64::ST1H_2Z_IMM:
4972 case AArch64::ST1H_2Z_STRIDED_IMM:
4973 case AArch64::ST1W_2Z_IMM:
4974 case AArch64::ST1W_2Z_STRIDED_IMM:
4975 case AArch64::ST1D_2Z_IMM:
4976 case AArch64::ST1D_2Z_STRIDED_IMM:
4977 case AArch64::LDNT1B_2Z_IMM_PSEUDO:
4978 case AArch64::LDNT1B_2Z_IMM:
4979 case AArch64::LDNT1B_2Z_STRIDED_IMM:
4980 case AArch64::LDNT1H_2Z_IMM_PSEUDO:
4981 case AArch64::LDNT1H_2Z_IMM:
4982 case AArch64::LDNT1H_2Z_STRIDED_IMM:
4983 case AArch64::LDNT1W_2Z_IMM_PSEUDO:
4984 case AArch64::LDNT1W_2Z_IMM:
4985 case AArch64::LDNT1W_2Z_STRIDED_IMM:
4986 case AArch64::LDNT1D_2Z_IMM_PSEUDO:
4987 case AArch64::LDNT1D_2Z_IMM:
4988 case AArch64::LDNT1D_2Z_STRIDED_IMM:
4989 case AArch64::STNT1B_2Z_IMM:
4990 case AArch64::STNT1B_2Z_STRIDED_IMM:
4991 case AArch64::STNT1H_2Z_IMM:
4992 case AArch64::STNT1H_2Z_STRIDED_IMM:
4993 case AArch64::STNT1W_2Z_IMM:
4994 case AArch64::STNT1W_2Z_STRIDED_IMM:
4995 case AArch64::STNT1D_2Z_IMM:
4996 case AArch64::STNT1D_2Z_STRIDED_IMM:
4997 Scale = Width = TypeSize::getScalable(16 * 2);
4998 MinOffset = -8;
4999 MaxOffset = 7;
5000 break;
5001 case AArch64::LD3B_IMM:
5002 case AArch64::LD3H_IMM:
5003 case AArch64::LD3W_IMM:
5004 case AArch64::LD3D_IMM:
5005 case AArch64::ST3B_IMM:
5006 case AArch64::ST3H_IMM:
5007 case AArch64::ST3W_IMM:
5008 case AArch64::ST3D_IMM:
5009 Scale = Width = TypeSize::getScalable(16 * 3);
5010 MinOffset = -8;
5011 MaxOffset = 7;
5012 break;
5013 case AArch64::LD4B_IMM:
5014 case AArch64::LD4H_IMM:
5015 case AArch64::LD4W_IMM:
5016 case AArch64::LD4D_IMM:
5017 case AArch64::ST4B_IMM:
5018 case AArch64::ST4H_IMM:
5019 case AArch64::ST4W_IMM:
5020 case AArch64::ST4D_IMM:
5021 case AArch64::LD1B_4Z_IMM:
5022 case AArch64::LD1B_4Z_STRIDED_IMM:
5023 case AArch64::LD1H_4Z_IMM:
5024 case AArch64::LD1H_4Z_STRIDED_IMM:
5025 case AArch64::LD1W_4Z_IMM:
5026 case AArch64::LD1W_4Z_STRIDED_IMM:
5027 case AArch64::LD1D_4Z_IMM:
5028 case AArch64::LD1D_4Z_STRIDED_IMM:
5029 case AArch64::LD1B_4Z_IMM_PSEUDO:
5030 case AArch64::LD1H_4Z_IMM_PSEUDO:
5031 case AArch64::LD1W_4Z_IMM_PSEUDO:
5032 case AArch64::LD1D_4Z_IMM_PSEUDO:
5033 case AArch64::ST1B_4Z_IMM:
5034 case AArch64::ST1B_4Z_STRIDED_IMM:
5035 case AArch64::ST1H_4Z_IMM:
5036 case AArch64::ST1H_4Z_STRIDED_IMM:
5037 case AArch64::ST1W_4Z_IMM:
5038 case AArch64::ST1W_4Z_STRIDED_IMM:
5039 case AArch64::ST1D_4Z_IMM:
5040 case AArch64::ST1D_4Z_STRIDED_IMM:
5041 case AArch64::LDNT1B_4Z_IMM_PSEUDO:
5042 case AArch64::LDNT1B_4Z_IMM:
5043 case AArch64::LDNT1B_4Z_STRIDED_IMM:
5044 case AArch64::LDNT1H_4Z_IMM_PSEUDO:
5045 case AArch64::LDNT1H_4Z_IMM:
5046 case AArch64::LDNT1H_4Z_STRIDED_IMM:
5047 case AArch64::LDNT1W_4Z_IMM_PSEUDO:
5048 case AArch64::LDNT1W_4Z_IMM:
5049 case AArch64::LDNT1W_4Z_STRIDED_IMM:
5050 case AArch64::LDNT1D_4Z_IMM_PSEUDO:
5051 case AArch64::LDNT1D_4Z_IMM:
5052 case AArch64::LDNT1D_4Z_STRIDED_IMM:
5053 case AArch64::STNT1B_4Z_IMM:
5054 case AArch64::STNT1B_4Z_STRIDED_IMM:
5055 case AArch64::STNT1H_4Z_IMM:
5056 case AArch64::STNT1H_4Z_STRIDED_IMM:
5057 case AArch64::STNT1W_4Z_IMM:
5058 case AArch64::STNT1W_4Z_STRIDED_IMM:
5059 case AArch64::STNT1D_4Z_IMM:
5060 case AArch64::STNT1D_4Z_STRIDED_IMM:
5061 Scale = Width = TypeSize::getScalable(16 * 4);
5062 MinOffset = -8;
5063 MaxOffset = 7;
5064 break;
5065 case AArch64::LD1B_H_IMM:
5066 case AArch64::LD1SB_H_IMM:
5067 case AArch64::LD1H_S_IMM:
5068 case AArch64::LD1SH_S_IMM:
5069 case AArch64::LD1W_D_IMM:
5070 case AArch64::LD1SW_D_IMM:
5071 case AArch64::ST1B_H_IMM:
5072 case AArch64::ST1H_S_IMM:
5073 case AArch64::ST1W_D_IMM:
5074 case AArch64::LDNF1B_H_IMM:
5075 case AArch64::LDNF1SB_H_IMM:
5076 case AArch64::LDNF1H_S_IMM:
5077 case AArch64::LDNF1SH_S_IMM:
5078 case AArch64::LDNF1W_D_IMM:
5079 case AArch64::LDNF1SW_D_IMM:
5080 // A half vector worth of data
5081 // Width = mbytes * elements
5082 Scale = Width = TypeSize::getScalable(8);
5083 MinOffset = -8;
5084 MaxOffset = 7;
5085 break;
5086 case AArch64::LD1B_S_IMM:
5087 case AArch64::LD1SB_S_IMM:
5088 case AArch64::LD1H_D_IMM:
5089 case AArch64::LD1SH_D_IMM:
5090 case AArch64::ST1B_S_IMM:
5091 case AArch64::ST1H_D_IMM:
5092 case AArch64::LDNF1B_S_IMM:
5093 case AArch64::LDNF1SB_S_IMM:
5094 case AArch64::LDNF1H_D_IMM:
5095 case AArch64::LDNF1SH_D_IMM:
5096 // A quarter vector worth of data
5097 // Width = mbytes * elements
5098 Scale = Width = TypeSize::getScalable(4);
5099 MinOffset = -8;
5100 MaxOffset = 7;
5101 break;
5102 case AArch64::LD1B_D_IMM:
5103 case AArch64::LD1SB_D_IMM:
5104 case AArch64::ST1B_D_IMM:
5105 case AArch64::LDNF1B_D_IMM:
5106 case AArch64::LDNF1SB_D_IMM:
5107 // A eighth vector worth of data
5108 // Width = mbytes * elements
5109 Scale = Width = TypeSize::getScalable(2);
5110 MinOffset = -8;
5111 MaxOffset = 7;
5112 break;
5113 case AArch64::ST2Gi:
5114 case AArch64::ST2GPreIndex:
5115 case AArch64::ST2GPostIndex:
5116 case AArch64::STZ2Gi:
5117 case AArch64::STZ2GPreIndex:
5118 case AArch64::STZ2GPostIndex:
5119 Scale = TypeSize::getFixed(16);
5120 Width = TypeSize::getFixed(32);
5121 MinOffset = -256;
5122 MaxOffset = 255;
5123 break;
5124 case AArch64::STGPi:
5125 case AArch64::STGPpost:
5126 case AArch64::STGPpre:
5127 Scale = Width = TypeSize::getFixed(16);
5128 MinOffset = -64;
5129 MaxOffset = 63;
5130 break;
5131 case AArch64::LD1RB_IMM:
5132 case AArch64::LD1RB_H_IMM:
5133 case AArch64::LD1RB_S_IMM:
5134 case AArch64::LD1RB_D_IMM:
5135 case AArch64::LD1RSB_H_IMM:
5136 case AArch64::LD1RSB_S_IMM:
5137 case AArch64::LD1RSB_D_IMM:
5138 Scale = Width = TypeSize::getFixed(1);
5139 MinOffset = 0;
5140 MaxOffset = 63;
5141 break;
5142 case AArch64::LD1RH_IMM:
5143 case AArch64::LD1RH_S_IMM:
5144 case AArch64::LD1RH_D_IMM:
5145 case AArch64::LD1RSH_S_IMM:
5146 case AArch64::LD1RSH_D_IMM:
5147 Scale = Width = TypeSize::getFixed(2);
5148 MinOffset = 0;
5149 MaxOffset = 63;
5150 break;
5151 case AArch64::LD1RW_IMM:
5152 case AArch64::LD1RW_D_IMM:
5153 case AArch64::LD1RSW_IMM:
5154 Scale = Width = TypeSize::getFixed(4);
5155 MinOffset = 0;
5156 MaxOffset = 63;
5157 break;
5158 case AArch64::LD1RD_IMM:
5159 Scale = Width = TypeSize::getFixed(8);
5160 MinOffset = 0;
5161 MaxOffset = 63;
5162 break;
5163 }
5164
5165 return true;
5166}
5167
// Maps a load/store opcode to a scale factor of 1, 2, 4, 8 or 16, depending on
// which case group below lists it. An opcode that is not listed reaches
// llvm_unreachable.
// NOTE(review): the declaring line (listing line 5169) is absent from this
// extract. shouldClusterFI calls AArch64InstrInfo::getMemScale(Opcode), which
// is presumably this function -- confirm against the upstream file.
5168// Scaling factor for unscaled load or store.
5170 switch (Opc) {
5171 default:
5172 llvm_unreachable("Opcode has unknown scale!");
// Scale 1.
5173 case AArch64::LDRBui:
5174 case AArch64::LDRBBui:
5175 case AArch64::LDURBBi:
5176 case AArch64::LDRSBWui:
5177 case AArch64::LDURSBWi:
5178 case AArch64::STRBui:
5179 case AArch64::STRBBui:
5180 case AArch64::STURBBi:
5181 return 1;
// Scale 2.
5182 case AArch64::LDRHui:
5183 case AArch64::LDRHHui:
5184 case AArch64::LDURHHi:
5185 case AArch64::LDRSHWui:
5186 case AArch64::LDURSHWi:
5187 case AArch64::STRHui:
5188 case AArch64::STRHHui:
5189 case AArch64::STURHHi:
5190 return 2;
// Scale 4; this group also contains pair opcodes (LDPSi, LDPSWi, LDPWi, STPSi,
// STPWi).
5191 case AArch64::LDRSui:
5192 case AArch64::LDURSi:
5193 case AArch64::LDRSpre:
5194 case AArch64::LDRSWui:
5195 case AArch64::LDURSWi:
5196 case AArch64::LDRSWpre:
5197 case AArch64::LDRWpre:
5198 case AArch64::LDRWui:
5199 case AArch64::LDURWi:
5200 case AArch64::STRSui:
5201 case AArch64::STURSi:
5202 case AArch64::STRSpre:
5203 case AArch64::STRWui:
5204 case AArch64::STURWi:
5205 case AArch64::STRWpre:
5206 case AArch64::LDPSi:
5207 case AArch64::LDPSWi:
5208 case AArch64::LDPWi:
5209 case AArch64::STPSi:
5210 case AArch64::STPWi:
5211 return 4;
// Scale 8; this group also contains pair opcodes (LDPDi, LDPXi, STPDi, STPXi).
5212 case AArch64::LDRDui:
5213 case AArch64::LDURDi:
5214 case AArch64::LDRDpre:
5215 case AArch64::LDRXui:
5216 case AArch64::LDURXi:
5217 case AArch64::LDRXpre:
5218 case AArch64::STRDui:
5219 case AArch64::STURDi:
5220 case AArch64::STRDpre:
5221 case AArch64::STRXui:
5222 case AArch64::STURXi:
5223 case AArch64::STRXpre:
5224 case AArch64::LDPDi:
5225 case AArch64::LDPXi:
5226 case AArch64::STPDi:
5227 case AArch64::STPXi:
5228 return 8;
// Scale 16; this group also contains the STG/STZG/ST2G/STZ2G/STGP opcodes.
5229 case AArch64::LDRQui:
5230 case AArch64::LDURQi:
5231 case AArch64::STRQui:
5232 case AArch64::STURQi:
5233 case AArch64::STRQpre:
5234 case AArch64::LDPQi:
5235 case AArch64::LDRQpre:
5236 case AArch64::STPQi:
5237 case AArch64::STGi:
5238 case AArch64::STZGi:
5239 case AArch64::ST2Gi:
5240 case AArch64::STZ2Gi:
5241 case AArch64::STGPi:
5242 return 16;
5243 }
5244}
5245
// Returns true only when MI's opcode is one of the "*pre" load opcodes listed
// below (LDRWpre, LDRXpre, LDRSWpre, LDRSpre, LDRDpre, LDRQpre); every other
// opcode yields false.
// NOTE(review): the declaring line (listing line 5246) is missing from this
// extract; the name isPreLd is inferred from the call a few lines further down
// -- confirm.
5247 switch (MI.getOpcode()) {
5248 default:
5249 return false;
5250 case AArch64::LDRWpre:
5251 case AArch64::LDRXpre:
5252 case AArch64::LDRSWpre:
5253 case AArch64::LDRSpre:
5254 case AArch64::LDRDpre:
5255 case AArch64::LDRQpre:
5256 return true;
5257 }
5258}
5259
// Returns true only when MI's opcode is one of the "*pre" store opcodes listed
// below (STRWpre, STRXpre, STRSpre, STRDpre, STRQpre); every other opcode
// yields false.
// NOTE(review): the declaring line (listing line 5260) is missing from this
// extract; the name isPreSt is inferred from the call a few lines further down
// -- confirm.
5261 switch (MI.getOpcode()) {
5262 default:
5263 return false;
5264 case AArch64::STRWpre:
5265 case AArch64::STRXpre:
5266 case AArch64::STRSpre:
5267 case AArch64::STRDpre:
5268 case AArch64::STRQpre:
5269 return true;
5270 }
5271}
5272
// True when MI is accepted by either isPreLd or isPreSt.
// NOTE(review): the declaring line (listing line 5273) is missing from this
// extract.
5274 return isPreLd(MI) || isPreSt(MI);
5275}
5276
// Returns true when MI's opcode is one of the listed byte (BB), halfword (HH)
// or word (W) loads in their LDUR*i, LDR*ui, LDR*roX or LDR*roW forms; every
// other opcode yields false.
// NOTE(review): the declaring line (listing line 5297) is missing from this
// extract. Presumably this is the zero-extending counterpart of the LDRS*/LDURS*
// predicate that follows -- confirm the function name upstream.
5278 switch (MI.getOpcode()) {
5279 default:
5280 return false;
5281 case AArch64::LDURBBi:
5282 case AArch64::LDURHHi:
5283 case AArch64::LDURWi:
5284 case AArch64::LDRBBui:
5285 case AArch64::LDRHHui:
5286 case AArch64::LDRWui:
5287 case AArch64::LDRBBroX:
5288 case AArch64::LDRHHroX:
5289 case AArch64::LDRWroX:
5290 case AArch64::LDRBBroW:
5291 case AArch64::LDRHHroW:
5292 case AArch64::LDRWroW:
5293 return true;
5294 }
5295}
5296
// Returns true when MI's opcode is one of the listed LDRS*/LDURS* loads (byte
// and halfword into W or X, and word via LDRSW/LDURSW) in their LDURS*i,
// LDRS*ui, LDRS*roX or LDRS*roW forms; every other opcode yields false.
// NOTE(review): the declaring line (listing line 5325) is missing from this
// extract. Presumably this identifies sign-extending loads -- confirm the
// function name upstream.
5298 switch (MI.getOpcode()) {
5299 default:
5300 return false;
5301 case AArch64::LDURSBWi:
5302 case AArch64::LDURSHWi:
5303 case AArch64::LDURSBXi:
5304 case AArch64::LDURSHXi:
5305 case AArch64::LDURSWi:
5306 case AArch64::LDRSBWui:
5307 case AArch64::LDRSHWui:
5308 case AArch64::LDRSBXui:
5309 case AArch64::LDRSHXui:
5310 case AArch64::LDRSWui:
5311 case AArch64::LDRSBWroX:
5312 case AArch64::LDRSHWroX:
5313 case AArch64::LDRSBXroX:
5314 case AArch64::LDRSHXroX:
5315 case AArch64::LDRSWroX:
5316 case AArch64::LDRSBWroW:
5317 case AArch64::LDRSHWroW:
5318 case AArch64::LDRSBXroW:
5319 case AArch64::LDRSHXroW:
5320 case AArch64::LDRSWroW:
5321 return true;
5322 }
5323}
5324
// Returns true when MI's opcode is one of the listed LDP*i / STP*i opcodes or
// STGPi; every other opcode yields false.
// NOTE(review): the declaring line (listing line 5345) is missing from this
// extract; the function name cannot be confirmed from here.
5326 switch (MI.getOpcode()) {
5327 default:
5328 return false;
5329 case AArch64::LDPSi:
5330 case AArch64::LDPSWi:
5331 case AArch64::LDPDi:
5332 case AArch64::LDPQi:
5333 case AArch64::LDPWi:
5334 case AArch64::LDPXi:
5335 case AArch64::STPSi:
5336 case AArch64::STPDi:
5337 case AArch64::STPQi:
5338 case AArch64::STPWi:
5339 case AArch64::STPXi:
5340 case AArch64::STGPi:
5341 return true;
5342 }
5343}
5344
// Returns a reference to the operand of MI at index Idx, after asserting that
// MI may load or store.
// NOTE(review): the signature and the condition that selects Idx (listing line
// 5348) are missing from this extract; only the fallback index 1 is visible.
// Presumably this returns the base-address operand -- confirm upstream.
5346 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5347 unsigned Idx =
5349 : 1;
5350 return MI.getOperand(Idx);
5351}
5352
// Returns a reference to the operand of MI at index Idx, after asserting that
// MI may load or store.
// NOTE(review): the function-name line (listing line 5354) and the condition
// that selects Idx (listing line 5357) are missing from this extract; only the
// fallback index 2 is visible. Presumably this returns the offset operand --
// confirm upstream.
5353const MachineOperand &
5355 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5356 unsigned Idx =
5358 : 2;
5359 return MI.getOperand(Idx);
5360}
5361
// Returns a reference to operand 4 of MI for the listed LDR*roX opcodes. Any
// other opcode reaches llvm_unreachable.
// NOTE(review): the function-name line (listing line 5363) is missing from this
// extract. Presumably operand 4 is the shift/extend amount of the
// register-offset form -- confirm against the instruction definitions.
5362const MachineOperand &
5364 switch (MI.getOpcode()) {
5365 default:
5366 llvm_unreachable("Unexpected opcode");
5367 case AArch64::LDRBroX:
5368 case AArch64::LDRBBroX:
5369 case AArch64::LDRSBXroX:
5370 case AArch64::LDRSBWroX:
5371 case AArch64::LDRHroX:
5372 case AArch64::LDRHHroX:
5373 case AArch64::LDRSHXroX:
5374 case AArch64::LDRSHWroX:
5375 case AArch64::LDRWroX:
5376 case AArch64::LDRSroX:
5377 case AArch64::LDRSWroX:
5378 case AArch64::LDRDroX:
5379 case AArch64::LDRXroX:
5380 case AArch64::LDRQroX:
5381 return MI.getOperand(4);
5382 }
5383}
5384
// Looks up the register class recorded for Reg in the register info of the
// MachineFunction that owns MI. Returns nullptr when MI is not inserted in a
// basic block, when that block has no parent function, or when
// getRegClassOrNull itself returns null.
// NOTE(review): the first line of the signature (listing line 5385) is missing
// from this extract; later code calls this as ::getRegClass(MI, Reg).
5386 Register Reg) {
5387 if (MI.getParent() == nullptr)
5388 return nullptr;
5389 const MachineFunction *MF = MI.getParent()->getParent();
5390 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
5391}
5392
// Returns true if any operand of MI is a register in the FPR16 class: for a
// physical register, FPR16RegClass must contain it; for a virtual register, its
// register class must be exactly FPR16 or FPR16_lo.
// NOTE(review): the declaring line (listing line 5393) is missing from this
// extract; the function name cannot be confirmed from here.
5394 auto IsHFPR = [&](const MachineOperand &Op) {
// Non-register operands never match.
5395 if (!Op.isReg())
5396 return false;
5397 auto Reg = Op.getReg();
5398 if (Reg.isPhysical())
5399 return AArch64::FPR16RegClass.contains(Reg);
// Virtual register: compare the class pointer (may be null, which matches
// neither class).
5400 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5401 return TRC == &AArch64::FPR16RegClass ||
5402 TRC == &AArch64::FPR16_loRegClass;
5403 };
5404 return llvm::any_of(MI.operands(), IsHFPR);
5405}
5406
// Returns true if any operand of MI is a register in the FPR128 class: for a
// physical register, FPR128RegClass must contain it; for a virtual register,
// its register class must be exactly FPR128 or FPR128_lo.
// NOTE(review): the declaring line (listing line 5407) is missing from this
// extract; the function name cannot be confirmed from here.
5408 auto IsQFPR = [&](const MachineOperand &Op) {
// Non-register operands never match.
5409 if (!Op.isReg())
5410 return false;
5411 auto Reg = Op.getReg();
5412 if (Reg.isPhysical())
5413 return AArch64::FPR128RegClass.contains(Reg);
// Virtual register: compare the class pointer (may be null, which matches
// neither class).
5414 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5415 return TRC == &AArch64::FPR128RegClass ||
5416 TRC == &AArch64::FPR128_loRegClass;
5417 };
5418 return llvm::any_of(MI.operands(), IsQFPR);
5419}
5420
// Returns true when MI is one of the opcodes treated below as having BTI
// behaviour: BRK, HLT, PACIASP, PACIBSP, PAUTH_PROLOGUE, or a HINT whose
// immediate is 32/34/36/38 or 25/27. All other instructions yield false.
// NOTE(review): the declaring line (listing line 5421) is missing from this
// extract; the function name cannot be confirmed from here.
5422 switch (MI.getOpcode()) {
5423 case AArch64::BRK:
5424 case AArch64::HLT:
5425 case AArch64::PACIASP:
5426 case AArch64::PACIBSP:
5427 // Implicit BTI behavior.
5428 return true;
5429 case AArch64::PAUTH_PROLOGUE:
5430 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
5431 return true;
5432 case AArch64::HINT: {
// The hint number is carried in operand 0.
5433 unsigned Imm = MI.getOperand(0).getImm();
5434 // Explicit BTI instruction.
5435 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
5436 return true;
5437 // PACI(A|B)SP instructions.
5438 if (Imm == 25 || Imm == 27)
5439 return true;
5440 return false;
5441 }
5442 default:
5443 return false;
5444 }
5445}
5446
// Returns true if the physical register Reg belongs to any of the FPR128,
// FPR64, FPR32, FPR16 or FPR8 register classes. A zero register number yields
// false; any other non-physical register trips the assertion.
// NOTE(review): the declaring line (listing line 5447) is missing from this
// extract; the name isFpOrNEON is taken from the assertion message.
5448 if (Reg == 0)
5449 return false;
5450 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
5451 return AArch64::FPR128RegClass.contains(Reg) ||
5452 AArch64::FPR64RegClass.contains(Reg) ||
5453 AArch64::FPR32RegClass.contains(Reg) ||
5454 AArch64::FPR16RegClass.contains(Reg) ||
5455 AArch64::FPR8RegClass.contains(Reg);
5456}
5457
// Returns true if any operand of MI is a floating-point/vector register:
// physical registers are tested with isFpOrNEON(Reg); virtual registers match
// when their register class is exactly one of FPR128, FPR128_lo, FPR64,
// FPR64_lo, FPR32, FPR16 or FPR8.
// NOTE(review): the declaring line (listing line 5458) is missing from this
// extract; presumably this is the MachineInstr overload of isFpOrNEON --
// confirm upstream.
5459 auto IsFPR = [&](const MachineOperand &Op) {
// Non-register operands never match.
5460 if (!Op.isReg())
5461 return false;
5462 auto Reg = Op.getReg();
5463 if (Reg.isPhysical())
5464 return isFpOrNEON(Reg);
5465
// Virtual register: compare the class pointer (may be null, which matches no
// class).
5466 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5467 return TRC == &AArch64::FPR128RegClass ||
5468 TRC == &AArch64::FPR128_loRegClass ||
5469 TRC == &AArch64::FPR64RegClass ||
5470 TRC == &AArch64::FPR64_loRegClass ||
5471 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
5472 TRC == &AArch64::FPR8RegClass;
5473 };
5474 return llvm::any_of(MI.operands(), IsFPR);
5475}
5476
5477// Scale the unscaled offsets. Returns false if the unscaled offset can't be
5478// scaled.
// On success Offset is divided in place by Scale; on failure Offset is left
// untouched.
// NOTE(review): the line that defines Scale (listing line 5480) is missing from
// this extract; presumably it is derived from Opc -- confirm upstream.
5479static bool scaleOffset(unsigned Opc, int64_t &Offset) {
5481
5482 // If the byte-offset isn't a multiple of the stride, we can't scale this
5483 // offset.
5484 if (Offset % Scale != 0)
5485 return false;
5486
5487 // Convert the byte-offset used by unscaled into an "element" offset used
5488 // by the scaled pair load/store instructions.
5489 Offset /= Scale;
5490 return true;
5491}
5492
5493static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
5494 if (FirstOpc == SecondOpc)
5495 return true;
5496 // We can also pair sign-ext and zero-ext instructions.
5497 switch (FirstOpc) {
5498 default:
5499 return false;
5500 case AArch64::STRSui:
5501 case AArch64::STURSi:
5502 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
5503 case AArch64::STRDui:
5504 case AArch64::STURDi:
5505 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
5506 case AArch64::STRQui:
5507 case AArch64::STURQi:
5508 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
5509 case AArch64::STRWui:
5510 case AArch64::STURWi:
5511 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
5512 case AArch64::STRXui:
5513 case AArch64::STURXi:
5514 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
5515 case AArch64::LDRSui:
5516 case AArch64::LDURSi:
5517 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
5518 case AArch64::LDRDui:
5519 case AArch64::LDURDi:
5520 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
5521 case AArch64::LDRQui:
5522 case AArch64::LDURQi:
5523 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
5524 case AArch64::LDRWui:
5525 case AArch64::LDURWi:
5526 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
5527 case AArch64::LDRSWui:
5528 case AArch64::LDURSWi:
5529 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
5530 case AArch64::LDRXui:
5531 case AArch64::LDURXi:
5532 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
5533 }
5534 // These instructions can't be paired based on their opcodes.
5535 return false;
5536}
5537
5538static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
5539 int64_t Offset1, unsigned Opcode1, int FI2,
5540 int64_t Offset2, unsigned Opcode2) {
5541 // Accesses through fixed stack object frame indices may access a different
5542 // fixed stack slot. Check that the object offsets + offsets match.
5543 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
5544 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
5545 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
5546 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
5547 // Convert to scaled object offsets.
5548 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
5549 if (ObjectOffset1 % Scale1 != 0)
5550 return false;
5551 ObjectOffset1 /= Scale1;
5552 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
5553 if (ObjectOffset2 % Scale2 != 0)
5554 return false;
5555 ObjectOffset2 /= Scale2;
5556 ObjectOffset1 += Offset1;
5557 ObjectOffset2 += Offset2;
5558 return ObjectOffset1 + 1 == ObjectOffset2;
5559 }
5560
5561 return FI1 == FI2;
5562}
5563
5564/// Detect opportunities for ldp/stp formation.
5565///
5566/// Only called for LdSt for which getMemOperandWithOffset returns true.
///
/// Returns true only when the two accesses share a base (same register, or
/// frame indices accepted by shouldClusterFI), both are pairable candidates
/// with compatible opcodes, the first scaled offset fits in [-64, 63], and the
/// second scaled offset is exactly one past the first.
/// NOTE(review): the line carrying the function name (listing line 5567) is
/// missing from this extract. OpOffset1/2, OffsetIsScalable1/2 and NumBytes
/// are not read in the visible body.
5568 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
5569 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
5570 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
5571 unsigned NumBytes) const {
5572 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
5573 const MachineOperand &BaseOp1 = *BaseOps1.front();
5574 const MachineOperand &BaseOp2 = *BaseOps2.front();
// The instructions being compared are the parents of the base operands.
5575 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
5576 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
5577 if (BaseOp1.getType() != BaseOp2.getType())
5578 return false;
5579
5580 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
5581 "Only base registers and frame indices are supported.");
5582
5583 // Check for both base regs and base FI.
5584 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
5585 return false;
5586
5587 // Only cluster up to a single pair.
5588 if (ClusterSize > 2)
5589 return false;
5590
5591 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
5592 return false;
5593
5594 // Can we pair these instructions based on their opcodes?
5595 unsigned FirstOpc = FirstLdSt.getOpcode();
5596 unsigned SecondOpc = SecondLdSt.getOpcode();
5597 if (!canPairLdStOpc(FirstOpc, SecondOpc))
5598 return false;
5599
5600 // Can't merge volatiles or load/stores that have a hint to avoid pair
5601 // formation, for example.
5602 if (!isCandidateToMergeOrPair(FirstLdSt) ||
5603 !isCandidateToMergeOrPair(SecondLdSt))
5604 return false;
5605
5606 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
// Unscaled opcodes carry byte offsets; convert them to element units so both
// offsets are comparable, and give up if the conversion is not exact.
5607 int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
5608 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
5609 return false;
5610
5611 int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
5612 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
5613 return false;
5614
5615 // Pairwise instructions have a 7-bit signed offset field.
5616 if (Offset1 > 63 || Offset1 < -64)
5617 return false;
5618
5619 // The caller should already have ordered First/SecondLdSt by offset.
5620 // Note: except for non-equal frame index bases
5621 if (BaseOp1.isFI()) {
5622 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
5623 "Caller should have ordered offsets.");
5624
5625 const MachineFrameInfo &MFI =
5626 FirstLdSt.getParent()->getParent()->getFrameInfo();
5627 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
5628 BaseOp2.getIndex(), Offset2, SecondOpc);
5629 }
5630
5631 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
5632
// Register bases: the accesses must be adjacent in scaled units.
5633 return Offset1 + 1 == Offset2;
5634}
5635
// Appends a register operand for Reg (optionally narrowed by SubIdx) to MIB
// and returns the result of the addReg call.
//  - SubIdx == 0: Reg is added as-is.
//  - Reg physical: the concrete sub-register is resolved through TRI and added.
//  - otherwise: Reg is added together with SubIdx as a sub-register index.
// NOTE(review): the first line of the signature (listing line 5636) is missing
// from this extract; callers below invoke this as AddSubReg(MIB, ...).
5637 MCRegister Reg, unsigned SubIdx,
5638 RegState State,
5639 const TargetRegisterInfo *TRI) {
5640 if (!SubIdx)
5641 return MIB.addReg(Reg, State);
5642
5643 if (Reg.isPhysical())
5644 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
5645 return MIB.addReg(Reg, State, SubIdx);
5646}
5647
/// Returns true when copying a register tuple in ascending sub-register order
/// would overwrite a source register before it has been read.
///
/// \p DestReg and \p SrcReg are register encodings and \p NumRegs is the tuple
/// length. The distance from source to destination is taken modulo 32; the
/// copy clobbers exactly when that distance is smaller than the tuple length.
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
                                        unsigned NumRegs) {
  // Masking the unsigned difference with 0x1f yields the positive remainder
  // mod 32, even when DestReg < SrcReg and the subtraction wraps.
  constexpr unsigned EncodingMask = 0x1f;
  const unsigned ForwardDistance = (DestReg - SrcReg) & EncodingMask;
  return ForwardDistance < NumRegs;
}
5654
// Copies a register tuple from SrcReg to DestReg by emitting one `Opcode`
// instruction per entry of Indices, inserted before I in MBB. Each instruction
// defines the destination sub-register and reads the matching source
// sub-register twice; only the second read carries the kill flag derived from
// KillSrc.
// NOTE(review): the first lines of the signature (listing lines 5655-5656) and
// the line defining TRI (listing line 5662) are missing from this extract.
5657 const DebugLoc &DL, MCRegister DestReg,
5658 MCRegister SrcReg, bool KillSrc,
5659 unsigned Opcode,
5660 ArrayRef<unsigned> Indices) const {
5661 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
5663 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
5664 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
5665 unsigned NumRegs = Indices.size();
5666
// Default to ascending order; switch to descending order when an ascending
// copy would overwrite a source sub-register that has not been copied yet.
5667 int SubReg = 0, End = NumRegs, Incr = 1;
5668 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
5669 SubReg = NumRegs - 1;
5670 End = -1;
5671 Incr = -1;
5672 }
5673
5674 for (; SubReg != End; SubReg += Incr) {
5675 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
5676 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
5677 AddSubReg(MIB, SrcReg, Indices[SubReg], {}, TRI);
5678 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
5679 }
5680}
5681
// Copies a GPR register tuple from SrcReg to DestReg by emitting one `Opcode`
// instruction per entry of Indices, inserted before I in MBB. Each instruction
// has the operands: destination sub-register (def), ZeroReg, source
// sub-register (with the kill flag derived from KillSrc), immediate 0.
// Sub-registers are always copied in ascending order; the debug-only assertion
// checks that both encodings are multiples of the tuple length.
// NOTE(review): the first lines of the signature (listing lines 5682-5683) and
// the line defining TRI (listing line 5688) are missing from this extract.
5684 const DebugLoc &DL, MCRegister DestReg,
5685 MCRegister SrcReg, bool KillSrc,
5686 unsigned Opcode, unsigned ZeroReg,
5687 llvm::ArrayRef<unsigned> Indices) const {
5689 unsigned NumRegs = Indices.size();
5690
5691#ifndef NDEBUG
5692 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
5693 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
5694 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
5695 "GPR reg sequences should not be able to overlap");
5696#endif
5697
5698 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
5699 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
5700 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
5701 MIB.addReg(ZeroReg);
5702 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
5703 MIB.addImm(0);
5704 }
5705}
5706
5707/// Returns true if the instruction at I is in a streaming call site region,
5708/// within a single basic block.
5709/// A "call site streaming region" starts after smstart and ends at smstop
5710/// around a call to a streaming function. This walks backward from I.
///
/// The nearest preceding MSRpstatesvcrImm1 / MSRpstatePseudo in MBB whose
/// operand 0 is SVCRSM or SVCRSMZA decides the result: true if its operand 1
/// is 1, false otherwise. Returns false when the function has no streaming
/// mode changes or no such instruction precedes I in this block.
/// NOTE(review): the signature (listing lines 5711-5712) and the line defining
/// AFI (listing line 5714) are missing from this extract; the function name is
/// taken from the call in mustAvoidNeonAtMBBI.
5713 MachineFunction &MF = *MBB.getParent();
5715 if (!AFI->hasStreamingModeChanges())
5716 return false;
5717 // Walk backwards to find smstart/smstop
5718 for (MachineInstr &MI : reverse(make_range(MBB.begin(), I))) {
5719 unsigned Opc = MI.getOpcode();
5720 if (Opc == AArch64::MSRpstatesvcrImm1 || Opc == AArch64::MSRpstatePseudo) {
5721 // Check if this is SM change (not ZA)
5722 int64_t PState = MI.getOperand(0).getImm();
5723 if (PState == AArch64SVCR::SVCRSM || PState == AArch64SVCR::SVCRSMZA) {
5724 // Operand 1 is 1 for start, 0 for stop
5725 return MI.getOperand(1).getImm() == 1;
5726 }
5727 }
5728 }
5729 return false;
5730}
5731
5732/// Returns true if in a streaming call site region without SME-FA64.
/// The subtarget check comes first, so the backward walk performed by
/// isInStreamingCallSiteRegion is skipped when SME-FA64 is available.
/// NOTE(review): the remaining parameter lines (listing lines 5734-5735) are
/// missing from this extract; the body uses MBB and I.
5733static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget,
5736 return !Subtarget.hasSMEFA64() && isInStreamingCallSiteRegion(MBB, I);
5737}
5738
5741 const DebugLoc &DL, Register DestReg,
5742 Register SrcReg, bool KillSrc,
5743 bool RenamableDest,
5744 bool RenamableSrc) const {
5745 ++NumCopyInstrs;
5746 if (AArch64::GPR32spRegClass.contains(DestReg) &&
5747 AArch64::GPR32spRegClass.contains(SrcReg)) {
5748 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
5749 // If either operand is WSP, expand to ADD #0.
5750 if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5751 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5752 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
5753 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5754 &AArch64::GPR64spRegClass);
5755 MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
5756 &AArch64::GPR64spRegClass);
5757 // This instruction is reading and writing X registers. This may upset
5758 // the register scavenger and machine verifier, so we need to indicate
5759 // that we are reading an undefined value from SrcRegX, but a proper
5760 // value from SrcReg.
5761 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
5762 .addReg(SrcRegX, RegState::Undef)
5763 .addImm(0)
5765 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5766 ++NumZCRegMoveInstrsGPR;
5767 } else {
5768 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
5769 .addReg(SrcReg, getKillRegState(KillSrc))
5770 .addImm(0)
5772 if (Subtarget.hasZeroCycleRegMoveGPR32())
5773 ++NumZCRegMoveInstrsGPR;
5774 }
5775 } else if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5776 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5777 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
5778 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5779 &AArch64::GPR64spRegClass);
5780 assert(DestRegX.isValid() && "Destination super-reg not valid");
5781 MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
5782 &AArch64::GPR64spRegClass);
5783 assert(SrcRegX.isValid() && "Source super-reg not valid");
5784 // This instruction is reading and writing X registers. This may upset
5785 // the register scavenger and machine verifier, so we need to indicate
5786 // that we are reading an undefined value from SrcRegX, but a proper
5787 // value from SrcReg.
5788 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
5789 .addReg(AArch64::XZR)
5790 .addReg(SrcRegX, RegState::Undef)
5791 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5792 ++NumZCRegMoveInstrsGPR;
5793 } else {
5794 // Otherwise, expand to ORR WZR.
5795 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
5796 .addReg(AArch64::WZR)
5797 .addReg(SrcReg, getKillRegState(KillSrc));
5798 if (Subtarget.hasZeroCycleRegMoveGPR32())
5799 ++NumZCRegMoveInstrsGPR;
5800 }
5801 return;
5802 }
5803
5804 // GPR32 zeroing
5805 if (AArch64::GPR32spRegClass.contains(DestReg) && SrcReg == AArch64::WZR) {
5806 if (Subtarget.hasZeroCycleZeroingGPR64() &&
5807 !Subtarget.hasZeroCycleZeroingGPR32()) {
5808 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5809 &AArch64::GPR64spRegClass);
5810 assert(DestRegX.isValid() && "Destination super-reg not valid");
5811 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestRegX)
5812 .addImm(0)
5814 ++NumZCZeroingInstrsGPR;
5815 } else if (Subtarget.hasZeroCycleZeroingGPR32()) {
5816 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
5817 .addImm(0)
5819 ++NumZCZeroingInstrsGPR;
5820 } else {
5821 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
5822 .addReg(AArch64::WZR)
5823 .addReg(AArch64::WZR);
5824 }
5825 return;
5826 }
5827
5828 if (AArch64::GPR64spRegClass.contains(DestReg) &&
5829 AArch64::GPR64spRegClass.contains(SrcReg)) {
5830 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
5831 // If either operand is SP, expand to ADD #0.
5832 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
5833 .addReg(SrcReg, getKillRegState(KillSrc))
5834 .addImm(0)
5836 if (Subtarget.hasZeroCycleRegMoveGPR64())
5837 ++NumZCRegMoveInstrsGPR;
5838 } else {
5839 // Otherwise, expand to ORR XZR.
5840 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
5841 .addReg(AArch64::XZR)
5842 .addReg(SrcReg, getKillRegState(KillSrc));
5843 if (Subtarget.hasZeroCycleRegMoveGPR64())
5844 ++NumZCRegMoveInstrsGPR;
5845 }
5846 return;
5847 }
5848
5849 // GPR64 zeroing
5850 if (AArch64::GPR64spRegClass.contains(DestReg) && SrcReg == AArch64::XZR) {
5851 if (Subtarget.hasZeroCycleZeroingGPR64()) {
5852 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
5853 .addImm(0)
5855 ++NumZCZeroingInstrsGPR;
5856 } else {
5857 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
5858 .addReg(AArch64::XZR)
5859 .addReg(AArch64::XZR);
5860 }
5861 return;
5862 }
5863
5864 // Copy a Predicate register by ORRing with itself.
5865 if (AArch64::PPRRegClass.contains(DestReg) &&
5866 AArch64::PPRRegClass.contains(SrcReg)) {
5867 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5868 "Unexpected SVE register.");
5869 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
5870 .addReg(SrcReg) // Pg
5871 .addReg(SrcReg)
5872 .addReg(SrcReg, getKillRegState(KillSrc));
5873 return;
5874 }
5875
5876 // Copy a predicate-as-counter register by ORRing with itself as if it
5877 // were a regular predicate (mask) register.
5878 bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
5879 bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
5880 if (DestIsPNR || SrcIsPNR) {
5881 auto ToPPR = [](MCRegister R) -> MCRegister {
5882 return (R - AArch64::PN0) + AArch64::P0;
5883 };
5884 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg.asMCReg();
5885 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg.asMCReg();
5886
5887 if (PPRSrcReg != PPRDestReg) {
5888 auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
5889 .addReg(PPRSrcReg) // Pg
5890 .addReg(PPRSrcReg)
5891 .addReg(PPRSrcReg, getKillRegState(KillSrc));
5892 if (DestIsPNR)
5893 NewMI.addDef(DestReg, RegState::Implicit);
5894 }
5895 return;
5896 }
5897
5898 // Copy a Z register by ORRing with itself.
5899 if (AArch64::ZPRRegClass.contains(DestReg) &&
5900 AArch64::ZPRRegClass.contains(SrcReg)) {
5901 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5902 "Unexpected SVE register.");
5903 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
5904 .addReg(SrcReg)
5905 .addReg(SrcReg, getKillRegState(KillSrc));
5906 return;
5907 }
5908
5909 // Copy a Z register pair by copying the individual sub-registers.
5910 if ((AArch64::ZPR2RegClass.contains(DestReg) ||
5911 AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
5912 (AArch64::ZPR2RegClass.contains(SrcReg) ||
5913 AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
5914 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5915 "Unexpected SVE register.");
5916 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
5917 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5918 Indices);
5919 return;
5920 }
5921
5922 // Copy a Z register triple by copying the individual sub-registers.
5923 if (AArch64::ZPR3RegClass.contains(DestReg) &&
5924 AArch64::ZPR3RegClass.contains(SrcReg)) {
5925 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5926 "Unexpected SVE register.");
5927 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5928 AArch64::zsub2};
5929 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5930 Indices);
5931 return;
5932 }
5933
5934 // Copy a Z register quad by copying the individual sub-registers.
5935 if ((AArch64::ZPR4RegClass.contains(DestReg) ||
5936 AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
5937 (AArch64::ZPR4RegClass.contains(SrcReg) ||
5938 AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
5939 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5940 "Unexpected SVE register.");
5941 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5942 AArch64::zsub2, AArch64::zsub3};
5943 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5944 Indices);
5945 return;
5946 }
5947
5948 // Copy a DDDD register quad by copying the individual sub-registers.
5949 if (AArch64::DDDDRegClass.contains(DestReg) &&
5950 AArch64::DDDDRegClass.contains(SrcReg)) {
5951 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5952 AArch64::dsub2, AArch64::dsub3};
5953 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5954 Indices);
5955 return;
5956 }
5957
5958 // Copy a DDD register triple by copying the individual sub-registers.
5959 if (AArch64::DDDRegClass.contains(DestReg) &&
5960 AArch64::DDDRegClass.contains(SrcReg)) {
5961 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5962 AArch64::dsub2};
5963 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5964 Indices);
5965 return;
5966 }
5967
5968 // Copy a DD register pair by copying the individual sub-registers.
5969 if (AArch64::DDRegClass.contains(DestReg) &&
5970 AArch64::DDRegClass.contains(SrcReg)) {
5971 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
5972 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5973 Indices);
5974 return;
5975 }
5976
5977 // Copy a QQQQ register quad by copying the individual sub-registers.
5978 if (AArch64::QQQQRegClass.contains(DestReg) &&
5979 AArch64::QQQQRegClass.contains(SrcReg)) {
5980 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5981 AArch64::qsub2, AArch64::qsub3};
5982 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5983 Indices);
5984 return;
5985 }
5986
5987 // Copy a QQQ register triple by copying the individual sub-registers.
5988 if (AArch64::QQQRegClass.contains(DestReg) &&
5989 AArch64::QQQRegClass.contains(SrcReg)) {
5990 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5991 AArch64::qsub2};
5992 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5993 Indices);
5994 return;
5995 }
5996
5997 // Copy a QQ register pair by copying the individual sub-registers.
5998 if (AArch64::QQRegClass.contains(DestReg) &&
5999 AArch64::QQRegClass.contains(SrcReg)) {
6000 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
6001 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
6002 Indices);
6003 return;
6004 }
6005
6006 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
6007 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
6008 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
6009 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
6010 AArch64::XZR, Indices);
6011 return;
6012 }
6013
6014 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
6015 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
6016 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
6017 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
6018 AArch64::WZR, Indices);
6019 return;
6020 }
6021
6022 if (AArch64::FPR128RegClass.contains(DestReg) &&
6023 AArch64::FPR128RegClass.contains(SrcReg)) {
6024 // In streaming regions, NEON is illegal but streaming-SVE is available.
6025 // Use SVE for copies if we're in a streaming region and SME is available.
6026 // With +sme-fa64, NEON is legal in streaming mode so we can use it.
6027 if ((Subtarget.isSVEorStreamingSVEAvailable() &&
6028 !Subtarget.isNeonAvailable()) ||
6029 mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
6030 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
6031 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
6032 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
6033 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
6034 } else if (Subtarget.isNeonAvailable()) {
6035 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
6036 .addReg(SrcReg)
6037 .addReg(SrcReg, getKillRegState(KillSrc));
6038 if (Subtarget.hasZeroCycleRegMoveFPR128())
6039 ++NumZCRegMoveInstrsFPR;
6040 } else {
6041 BuildMI(MBB, I, DL, get(AArch64::STRQpre))
6042 .addReg(AArch64::SP, RegState::Define)
6043 .addReg(SrcReg, getKillRegState(KillSrc))
6044 .addReg(AArch64::SP)
6045 .addImm(-16);
6046 BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
6047 .addReg(AArch64::SP, RegState::Define)
6048 .addReg(DestReg, RegState::Define)
6049 .addReg(AArch64::SP)
6050 .addImm(16);
6051 }
6052 return;
6053 }
6054
6055 if (AArch64::FPR64RegClass.contains(DestReg) &&
6056 AArch64::FPR64RegClass.contains(SrcReg)) {
6057 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
6058 !Subtarget.hasZeroCycleRegMoveFPR64() &&
6059 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
6060 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
6061 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
6062 &AArch64::FPR128RegClass);
6063 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
6064 &AArch64::FPR128RegClass);
6065 // This instruction is reading and writing Q registers. This may upset
6066 // the register scavenger and machine verifier, so we need to indicate
6067 // that we are reading an undefined value from SrcRegQ, but a proper
6068 // value from SrcReg.
6069 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
6070 .addReg(SrcRegQ, RegState::Undef)
6071 .addReg(SrcRegQ, RegState::Undef)
6072 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6073 ++NumZCRegMoveInstrsFPR;
6074 } else {
6075 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
6076 .addReg(SrcReg, getKillRegState(KillSrc));
6077 if (Subtarget.hasZeroCycleRegMoveFPR64())
6078 ++NumZCRegMoveInstrsFPR;
6079 }
6080 return;
6081 }
6082
6083 if (AArch64::FPR32RegClass.contains(DestReg) &&
6084 AArch64::FPR32RegClass.contains(SrcReg)) {
6085 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
6086 !Subtarget.hasZeroCycleRegMoveFPR64() &&
6087 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
6088 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
6089 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
6090 &AArch64::FPR128RegClass);
6091 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
6092 &AArch64::FPR128RegClass);
6093 // This instruction is reading and writing Q registers. This may upset
6094 // the register scavenger and machine verifier, so we need to indicate
6095 // that we are reading an undefined value from SrcRegQ, but a proper
6096 // value from SrcReg.
6097 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
6098 .addReg(SrcRegQ, RegState::Undef)
6099 .addReg(SrcRegQ, RegState::Undef)
6100 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6101 ++NumZCRegMoveInstrsFPR;
6102 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
6103 !Subtarget.hasZeroCycleRegMoveFPR32()) {
6104 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
6105 &AArch64::FPR64RegClass);
6106 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
6107 &AArch64::FPR64RegClass);
6108 // This instruction is reading and writing D registers. This may upset
6109 // the register scavenger and machine verifier, so we need to indicate
6110 // that we are reading an undefined value from SrcRegD, but a proper
6111 // value from SrcReg.
6112 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
6113 .addReg(SrcRegD, RegState::Undef)
6114 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6115 ++NumZCRegMoveInstrsFPR;
6116 } else {
6117 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
6118 .addReg(SrcReg, getKillRegState(KillSrc));
6119 if (Subtarget.hasZeroCycleRegMoveFPR32())
6120 ++NumZCRegMoveInstrsFPR;
6121 }
6122 return;
6123 }
6124
6125 if (AArch64::FPR16RegClass.contains(DestReg) &&
6126 AArch64::FPR16RegClass.contains(SrcReg)) {
6127 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
6128 !Subtarget.hasZeroCycleRegMoveFPR64() &&
6129 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
6130 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
6131 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
6132 &AArch64::FPR128RegClass);
6133 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
6134 &AArch64::FPR128RegClass);
6135 // This instruction is reading and writing Q registers. This may upset
6136 // the register scavenger and machine verifier, so we need to indicate
6137 // that we are reading an undefined value from SrcRegQ, but a proper
6138 // value from SrcReg.
6139 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
6140 .addReg(SrcRegQ, RegState::Undef)
6141 .addReg(SrcRegQ, RegState::Undef)
6142 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6143 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
6144 !Subtarget.hasZeroCycleRegMoveFPR32()) {
6145 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
6146 &AArch64::FPR64RegClass);
6147 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
6148 &AArch64::FPR64RegClass);
6149 // This instruction is reading and writing D registers. This may upset
6150 // the register scavenger and machine verifier, so we need to indicate
6151 // that we are reading an undefined value from SrcRegD, but a proper
6152 // value from SrcReg.
6153 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
6154 .addReg(SrcRegD, RegState::Undef)
6155 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6156 } else {
6157 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
6158 &AArch64::FPR32RegClass);
6159 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
6160 &AArch64::FPR32RegClass);
6161 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
6162 .addReg(SrcReg, getKillRegState(KillSrc));
6163 }
6164 return;
6165 }
6166
6167 if (AArch64::FPR8RegClass.contains(DestReg) &&
6168 AArch64::FPR8RegClass.contains(SrcReg)) {
6169 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
6170 !Subtarget.hasZeroCycleRegMoveFPR64() &&
6171 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
6172 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
6173 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
6174 &AArch64::FPR128RegClass);
6175 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
6176 &AArch64::FPR128RegClass);
6177 // This instruction is reading and writing Q registers. This may upset
6178 // the register scavenger and machine verifier, so we need to indicate
6179 // that we are reading an undefined value from SrcRegQ, but a proper
6180 // value from SrcReg.
6181 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
6182 .addReg(SrcRegQ, RegState::Undef)
6183 .addReg(SrcRegQ, RegState::Undef)
6184 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6185 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
6186 !Subtarget.hasZeroCycleRegMoveFPR32()) {
6187 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
6188 &AArch64::FPR64RegClass);
6189 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
6190 &AArch64::FPR64RegClass);
6191 // This instruction is reading and writing D registers. This may upset
6192 // the register scavenger and machine verifier, so we need to indicate
6193 // that we are reading an undefined value from SrcRegD, but a proper
6194 // value from SrcReg.
6195 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
6196 .addReg(SrcRegD, RegState::Undef)
6197 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
6198 } else {
6199 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
6200 &AArch64::FPR32RegClass);
6201 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
6202 &AArch64::FPR32RegClass);
6203 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
6204 .addReg(SrcReg, getKillRegState(KillSrc));
6205 }
6206 return;
6207 }
6208
6209 // Copies between GPR64 and FPR64.
6210 if (AArch64::FPR64RegClass.contains(DestReg) &&
6211 AArch64::GPR64RegClass.contains(SrcReg)) {
6212 if (AArch64::XZR == SrcReg) {
6213 BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg);
6214 } else {
6215 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
6216 .addReg(SrcReg, getKillRegState(KillSrc));
6217 }
6218 return;
6219 }
6220 if (AArch64::GPR64RegClass.contains(DestReg) &&
6221 AArch64::FPR64RegClass.contains(SrcReg)) {
6222 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
6223 .addReg(SrcReg, getKillRegState(KillSrc));
6224 return;
6225 }
6226 // Copies between GPR32 and FPR32.
6227 if (AArch64::FPR32RegClass.contains(DestReg) &&
6228 AArch64::GPR32RegClass.contains(SrcReg)) {
6229 if (AArch64::WZR == SrcReg) {
6230 BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg);
6231 } else {
6232 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
6233 .addReg(SrcReg, getKillRegState(KillSrc));
6234 }
6235 return;
6236 }
6237 if (AArch64::GPR32RegClass.contains(DestReg) &&
6238 AArch64::FPR32RegClass.contains(SrcReg)) {
6239 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
6240 .addReg(SrcReg, getKillRegState(KillSrc));
6241 return;
6242 }
6243
6244 if (DestReg == AArch64::NZCV) {
6245 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
6246 BuildMI(MBB, I, DL, get(AArch64::MSR))
6247 .addImm(AArch64SysReg::NZCV)
6248 .addReg(SrcReg, getKillRegState(KillSrc))
6249 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
6250 return;
6251 }
6252
6253 if (SrcReg == AArch64::NZCV) {
6254 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
6255 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
6256 .addImm(AArch64SysReg::NZCV)
6257 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
6258 return;
6259 }
6260
6261#ifndef NDEBUG
6262 errs() << RI.getRegAsmName(DestReg) << " = COPY " << RI.getRegAsmName(SrcReg)
6263 << "\n";
6264#endif
6265 llvm_unreachable("unimplemented reg-to-reg copy");
6266}
6267
6270 MachineBasicBlock::iterator InsertBefore,
6271 const MCInstrDesc &MCID,
6272 Register SrcReg, bool IsKill,
6273 unsigned SubIdx0, unsigned SubIdx1, int FI,
6274 MachineMemOperand *MMO) {
6275 Register SrcReg0 = SrcReg;
6276 Register SrcReg1 = SrcReg;
6277 if (SrcReg.isPhysical()) {
6278 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
6279 SubIdx0 = 0;
6280 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
6281 SubIdx1 = 0;
6282 }
6283 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
6284 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
6285 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
6286 .addFrameIndex(FI)
6287 .addImm(0)
6288 .addMemOperand(MMO);
6289}
6290
6293 Register SrcReg, bool isKill, int FI,
6294 const TargetRegisterClass *RC,
6295 Register VReg,
6296 MachineInstr::MIFlag Flags) const {
6297 MachineFunction &MF = *MBB.getParent();
6298 MachineFrameInfo &MFI = MF.getFrameInfo();
6299
6301 MachineMemOperand *MMO =
6303 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
6304 unsigned Opc = 0;
6305 bool Offset = true;
6307 unsigned StackID = TargetStackID::Default;
6308 switch (RI.getSpillSize(*RC)) {
6309 case 1:
6310 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6311 Opc = AArch64::STRBui;
6312 break;
6313 case 2: {
6314 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6315 Opc = AArch64::STRHui;
6316 else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
6317 AArch64::PPRRegClass.hasSubClassEq(RC)) {
6318 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6319 "Unexpected register store without SVE store instructions");
6320 Opc = AArch64::STR_PXI;
6322 }
6323 break;
6324 }
6325 case 4:
6326 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6327 Opc = AArch64::STRWui;
6328 if (SrcReg.isVirtual())
6329 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
6330 else
6331 assert(SrcReg != AArch64::WSP);
6332 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6333 Opc = AArch64::STRSui;
6334 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6335 Opc = AArch64::STR_PPXI;
6337 }
6338 break;
6339 case 8:
6340 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6341 Opc = AArch64::STRXui;
6342 if (SrcReg.isVirtual())
6343 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
6344 else
6345 assert(SrcReg != AArch64::SP);
6346 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6347 Opc = AArch64::STRDui;
6348 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
6350 get(AArch64::STPWi), SrcReg, isKill,
6351 AArch64::sube32, AArch64::subo32, FI, MMO);
6352 return;
6353 }
6354 break;
6355 case 16:
6356 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6357 Opc = AArch64::STRQui;
6358 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6359 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6360 Opc = AArch64::ST1Twov1d;
6361 Offset = false;
6362 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
6364 get(AArch64::STPXi), SrcReg, isKill,
6365 AArch64::sube64, AArch64::subo64, FI, MMO);
6366 return;
6367 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6368 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6369 "Unexpected register store without SVE store instructions");
6370 Opc = AArch64::STR_ZXI;
6372 }
6373 break;
6374 case 24:
6375 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6376 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6377 Opc = AArch64::ST1Threev1d;
6378 Offset = false;
6379 }
6380 break;
6381 case 32:
6382 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6383 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6384 Opc = AArch64::ST1Fourv1d;
6385 Offset = false;
6386 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6387 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6388 Opc = AArch64::ST1Twov2d;
6389 Offset = false;
6390 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6391 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6392 "Unexpected register store without SVE store instructions");
6393 Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
6395 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6396 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6397 "Unexpected register store without SVE store instructions");
6398 Opc = AArch64::STR_ZZXI;
6400 }
6401 break;
6402 case 48:
6403 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6404 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6405 Opc = AArch64::ST1Threev2d;
6406 Offset = false;
6407 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6408 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6409 "Unexpected register store without SVE store instructions");
6410 Opc = AArch64::STR_ZZZXI;
6412 }
6413 break;
6414 case 64:
6415 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6416 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6417 Opc = AArch64::ST1Fourv2d;
6418 Offset = false;
6419 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6420 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6421 "Unexpected register store without SVE store instructions");
6422 Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
6424 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6425 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6426 "Unexpected register store without SVE store instructions");
6427 Opc = AArch64::STR_ZZZZXI;
6429 }
6430 break;
6431 }
6432 assert(Opc && "Unknown register class");
6433 MFI.setStackID(FI, StackID);
6434
6436 .addReg(SrcReg, getKillRegState(isKill))
6437 .addFrameIndex(FI);
6438
6439 if (Offset)
6440 MI.addImm(0);
6441 if (PNRReg.isValid())
6442 MI.addDef(PNRReg, RegState::Implicit);
6443 MI.addMemOperand(MMO);
6444}
6445
6448 MachineBasicBlock::iterator InsertBefore,
6449 const MCInstrDesc &MCID,
6450 Register DestReg, unsigned SubIdx0,
6451 unsigned SubIdx1, int FI,
6452 MachineMemOperand *MMO) {
6453 Register DestReg0 = DestReg;
6454 Register DestReg1 = DestReg;
6455 bool IsUndef = true;
6456 if (DestReg.isPhysical()) {
6457 DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
6458 SubIdx0 = 0;
6459 DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
6460 SubIdx1 = 0;
6461 IsUndef = false;
6462 }
6463 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
6464 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
6465 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
6466 .addFrameIndex(FI)
6467 .addImm(0)
6468 .addMemOperand(MMO);
6469}
6470
6473 Register DestReg, int FI,
6474 const TargetRegisterClass *RC,
6475 Register VReg, unsigned SubReg,
6476 MachineInstr::MIFlag Flags) const {
6477 MachineFunction &MF = *MBB.getParent();
6478 MachineFrameInfo &MFI = MF.getFrameInfo();
6480 MachineMemOperand *MMO =
6482 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
6483
6484 unsigned Opc = 0;
6485 bool Offset = true;
6486 unsigned StackID = TargetStackID::Default;
6488 switch (TRI.getSpillSize(*RC)) {
6489 case 1:
6490 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6491 Opc = AArch64::LDRBui;
6492 break;
6493 case 2: {
6494 bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
6495 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6496 Opc = AArch64::LDRHui;
6497 else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
6498 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6499 "Unexpected register load without SVE load instructions");
6500 if (IsPNR)
6501 PNRReg = DestReg;
6502 Opc = AArch64::LDR_PXI;
6504 }
6505 break;
6506 }
6507 case 4:
6508 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6509 Opc = AArch64::LDRWui;
6510 if (DestReg.isVirtual())
6511 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
6512 else
6513 assert(DestReg != AArch64::WSP);
6514 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6515 Opc = AArch64::LDRSui;
6516 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6517 Opc = AArch64::LDR_PPXI;
6519 }
6520 break;
6521 case 8:
6522 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6523 Opc = AArch64::LDRXui;
6524 if (DestReg.isVirtual())
6525 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
6526 else
6527 assert(DestReg != AArch64::SP);
6528 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6529 Opc = AArch64::LDRDui;
6530 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
6532 get(AArch64::LDPWi), DestReg, AArch64::sube32,
6533 AArch64::subo32, FI, MMO);
6534 return;
6535 }
6536 break;
6537 case 16:
6538 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6539 Opc = AArch64::LDRQui;
6540 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6541 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6542 Opc = AArch64::LD1Twov1d;
6543 Offset = false;
6544 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
6546 get(AArch64::LDPXi), DestReg, AArch64::sube64,
6547 AArch64::subo64, FI, MMO);
6548 return;
6549 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6550 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6551 "Unexpected register load without SVE load instructions");
6552 Opc = AArch64::LDR_ZXI;
6554 }
6555 break;
6556 case 24:
6557 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6558 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6559 Opc = AArch64::LD1Threev1d;
6560 Offset = false;
6561 }
6562 break;
6563 case 32:
6564 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6565 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6566 Opc = AArch64::LD1Fourv1d;
6567 Offset = false;
6568 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6569 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6570 Opc = AArch64::LD1Twov2d;
6571 Offset = false;
6572 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6573 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6574 "Unexpected register load without SVE load instructions");
6575 Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
6577 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6578 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6579 "Unexpected register load without SVE load instructions");
6580 Opc = AArch64::LDR_ZZXI;
6582 }
6583 break;
6584 case 48:
6585 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6586 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6587 Opc = AArch64::LD1Threev2d;
6588 Offset = false;
6589 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6590 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6591 "Unexpected register load without SVE load instructions");
6592 Opc = AArch64::LDR_ZZZXI;
6594 }
6595 break;
6596 case 64:
6597 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6598 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6599 Opc = AArch64::LD1Fourv2d;
6600 Offset = false;
6601 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6602 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6603 "Unexpected register load without SVE load instructions");
6604 Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
6606 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6607 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6608 "Unexpected register load without SVE load instructions");
6609 Opc = AArch64::LDR_ZZZZXI;
6611 }
6612 break;
6613 }
6614
6615 assert(Opc && "Unknown register class");
6616 MFI.setStackID(FI, StackID);
6617
6619 .addReg(DestReg, getDefRegState(true))
6620 .addFrameIndex(FI);
6621 if (Offset)
6622 MI.addImm(0);
6623 if (PNRReg.isValid() && !PNRReg.isVirtual())
6624 MI.addDef(PNRReg, RegState::Implicit);
6625 MI.addMemOperand(MMO);
6626}
6627
6629 const MachineInstr &UseMI,
6630 const TargetRegisterInfo *TRI) {
6631 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
6632 UseMI.getIterator()),
6633 [TRI](const MachineInstr &I) {
6634 return I.modifiesRegister(AArch64::NZCV, TRI) ||
6635 I.readsRegister(AArch64::NZCV, TRI);
6636 });
6637}
6638
6639void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6640 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
6641 // The smallest scalable element supported by scaled SVE addressing
6642 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6643 // byte offset must always be a multiple of 2.
6644 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6645
6646 // VGSized offsets are divided by '2', because the VG register is the
6647 // the number of 64bit granules as opposed to 128bit vector chunks,
6648 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
6649 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
6650 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
6651 ByteSized = Offset.getFixed();
6652 VGSized = Offset.getScalable() / 2;
6653}
6654
6655/// Returns the offset in parts to which this frame offset can be
6656/// decomposed for the purpose of describing a frame offset.
6657/// For non-scalable offsets this is simply its byte size.
6658void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6659 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
6660 int64_t &NumDataVectors) {
6661 // The smallest scalable element supported by scaled SVE addressing
6662 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6663 // byte offset must always be a multiple of 2.
6664 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6665
6666 NumBytes = Offset.getFixed();
6667 NumDataVectors = 0;
6668 NumPredicateVectors = Offset.getScalable() / 2;
6669 // This method is used to get the offsets to adjust the frame offset.
6670 // If the function requires ADDPL to be used and needs more than two ADDPL
6671 // instructions, part of the offset is folded into NumDataVectors so that it
6672 // uses ADDVL for part of it, reducing the number of ADDPL instructions.
6673 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
6674 NumPredicateVectors > 62) {
6675 NumDataVectors = NumPredicateVectors / 8;
6676 NumPredicateVectors -= NumDataVectors * 8;
6677 }
6678}
6679
6680// Convenience function to create a DWARF expression for: Constant `Operation`.
6681// This helper emits compact sequences for common cases. For example, for `-15
6682// DW_OP_plus`, this helper would create DW_OP_lit15 DW_OP_minus.
6685 if (Operation == dwarf::DW_OP_plus && Constant < 0 && -Constant <= 31) {
6686 // -Constant (1 to 31)
6687 Expr.push_back(dwarf::DW_OP_lit0 - Constant);
6688 Operation = dwarf::DW_OP_minus;
6689 } else if (Constant >= 0 && Constant <= 31) {
6690 // Literal value 0 to 31
6691 Expr.push_back(dwarf::DW_OP_lit0 + Constant);
6692 } else {
6693 // Signed constant
6694 Expr.push_back(dwarf::DW_OP_consts);
6696 }
6697 return Expr.push_back(Operation);
6698}
6699
6700// Convenience function to create a DWARF expression for a register.
// The statements visible here append the DW_OP_bregx opcode byte to Expr and
// then a literal 0 byte.
// NOTE(review): RegNum is not referenced by any statement visible in this
// listing; confirm how the register operand is encoded against the complete
// source before relying on this description.
6701static void appendReadRegExpr(SmallVectorImpl<char> &Expr, unsigned RegNum) {
6702 Expr.push_back((char)dwarf::DW_OP_bregx);
6704 Expr.push_back(0);
6705}
6706
6707// Convenience function to create a DWARF expression for loading a register from
6708// a CFA offset.
6710 int64_t OffsetFromDefCFA) {
6711 // This assumes the top of the DWARF stack contains the CFA.
6712 Expr.push_back(dwarf::DW_OP_dup);
6713 // Add the offset to the register.
6714 appendConstantExpr(Expr, OffsetFromDefCFA, dwarf::DW_OP_plus);
6715 // Dereference the address (loads a 64 bit value).
6716 Expr.push_back(dwarf::DW_OP_deref);
6717}
6718
6719// Convenience function to create a comment for
6720// (+/-) NumBytes (* RegScale)?
6721static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment,
6722 StringRef RegScale = {}) {
6723 if (NumBytes) {
6724 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
6725 if (!RegScale.empty())
6726 Comment << ' ' << RegScale;
6727 }
6728}
6729
6730// Creates an MCCFIInstruction:
6731// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
6733 unsigned Reg,
6734 const StackOffset &Offset) {
6735 int64_t NumBytes, NumVGScaledBytes;
6736 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
6737 NumVGScaledBytes);
6738 std::string CommentBuffer;
6739 llvm::raw_string_ostream Comment(CommentBuffer);
6740
6741 if (Reg == AArch64::SP)
6742 Comment << "sp";
6743 else if (Reg == AArch64::FP)
6744 Comment << "fp";
6745 else
6746 Comment << printReg(Reg, &TRI);
6747
6748 // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes)
6749 SmallString<64> Expr;
6750 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6751 assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
6752 // Reg + NumBytes
6753 Expr.push_back(dwarf::DW_OP_breg0 + DwarfReg);
6754 appendLEB128<LEB128Sign::Signed>(Expr, NumBytes);
6755 appendOffsetComment(NumBytes, Comment);
6756 if (NumVGScaledBytes) {
6757 // + VG * NumVGScaledBytes
6758 appendOffsetComment(NumVGScaledBytes, Comment, "* VG");
6759 appendReadRegExpr(Expr, TRI.getDwarfRegNum(AArch64::VG, true));
6760 appendConstantExpr(Expr, NumVGScaledBytes, dwarf::DW_OP_mul);
6761 Expr.push_back(dwarf::DW_OP_plus);
6762 }
6763
6764 // Wrap this into DW_CFA_def_cfa.
6765 SmallString<64> DefCfaExpr;
6766 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
6767 appendLEB128<LEB128Sign::Unsigned>(DefCfaExpr, Expr.size());
6768 DefCfaExpr.append(Expr.str());
6769 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
6770 Comment.str());
6771}
6772
6774 unsigned FrameReg, unsigned Reg,
6775 const StackOffset &Offset,
6776 bool LastAdjustmentWasScalable) {
6777 if (Offset.getScalable())
6778 return createDefCFAExpression(TRI, Reg, Offset);
6779
6780 if (FrameReg == Reg && !LastAdjustmentWasScalable)
6781 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
6782
6783 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6784 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
6785}
6786
6789 const StackOffset &OffsetFromDefCFA,
6790 std::optional<int64_t> IncomingVGOffsetFromDefCFA) {
6791 int64_t NumBytes, NumVGScaledBytes;
6792 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6793 OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
6794
6795 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6796
6797 // Non-scalable offsets can use DW_CFA_offset directly.
6798 if (!NumVGScaledBytes)
6799 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
6800
6801 std::string CommentBuffer;
6802 llvm::raw_string_ostream Comment(CommentBuffer);
6803 Comment << printReg(Reg, &TRI) << " @ cfa";
6804
6805 // Build up expression (CFA + VG * NumVGScaledBytes + NumBytes)
6806 assert(NumVGScaledBytes && "Expected scalable offset");
6807 SmallString<64> OffsetExpr;
6808 // + VG * NumVGScaledBytes
6809 StringRef VGRegScale;
6810 if (IncomingVGOffsetFromDefCFA) {
6811 appendLoadRegExpr(OffsetExpr, *IncomingVGOffsetFromDefCFA);
6812 VGRegScale = "* IncomingVG";
6813 } else {
6814 appendReadRegExpr(OffsetExpr, TRI.getDwarfRegNum(AArch64::VG, true));
6815 VGRegScale = "* VG";
6816 }
6817 appendConstantExpr(OffsetExpr, NumVGScaledBytes, dwarf::DW_OP_mul);
6818 appendOffsetComment(NumVGScaledBytes, Comment, VGRegScale);
6819 OffsetExpr.push_back(dwarf::DW_OP_plus);
6820 if (NumBytes) {
6821 // + NumBytes
6822 appendOffsetComment(NumBytes, Comment);
6823 appendConstantExpr(OffsetExpr, NumBytes, dwarf::DW_OP_plus);
6824 }
6825
6826 // Wrap this into DW_CFA_expression
6827 SmallString<64> CfaExpr;
6828 CfaExpr.push_back(dwarf::DW_CFA_expression);
6829 appendLEB128<LEB128Sign::Unsigned>(CfaExpr, DwarfReg);
6830 appendLEB128<LEB128Sign::Unsigned>(CfaExpr, OffsetExpr.size());
6831 CfaExpr.append(OffsetExpr.str());
6832
6833 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
6834 Comment.str());
6835}
6836
6837// Helper function to emit a frame offset adjustment from a given
6838// pointer (SrcReg), stored into DestReg. This function is explicit
6839// in that it requires the opcode.
6842 const DebugLoc &DL, unsigned DestReg,
6843 unsigned SrcReg, int64_t Offset, unsigned Opc,
6844 const TargetInstrInfo *TII,
6845 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
6846 bool *HasWinCFI, bool EmitCFAOffset,
6847 StackOffset CFAOffset, unsigned FrameReg) {
6848 int Sign = 1;
6849 unsigned MaxEncoding, ShiftSize;
6850 switch (Opc) {
6851 case AArch64::ADDXri:
6852 case AArch64::ADDSXri:
6853 case AArch64::SUBXri:
6854 case AArch64::SUBSXri:
6855 MaxEncoding = 0xfff;
6856 ShiftSize = 12;
6857 break;
6858 case AArch64::ADDVL_XXI:
6859 case AArch64::ADDPL_XXI:
6860 case AArch64::ADDSVL_XXI:
6861 case AArch64::ADDSPL_XXI:
6862 MaxEncoding = 31;
6863 ShiftSize = 0;
6864 if (Offset < 0) {
6865 MaxEncoding = 32;
6866 Sign = -1;
6867 Offset = -Offset;
6868 }
6869 break;
6870 default:
6871 llvm_unreachable("Unsupported opcode");
6872 }
6873
6874 // `Offset` can be in bytes or in "scalable bytes".
6875 int VScale = 1;
6876 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
6877 VScale = 16;
6878 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
6879 VScale = 2;
6880
6881 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
6882 // scratch register. If DestReg is a virtual register, use it as the
6883 // scratch register; otherwise, create a new virtual register (to be
6884 // replaced by the scavenger at the end of PEI). That case can be optimized
6885 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
6886 // register can be loaded with offset%8 and the add/sub can use an extending
6887 // instruction with LSL#3.
6888 // Currently the function handles any offsets but generates a poor sequence
6889 // of code.
6890 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
6891
6892 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
6893 Register TmpReg = DestReg;
6894 if (TmpReg == AArch64::XZR)
6895 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
6896 &AArch64::GPR64RegClass);
6897 do {
6898 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
6899 unsigned LocalShiftSize = 0;
6900 if (ThisVal > MaxEncoding) {
6901 ThisVal = ThisVal >> ShiftSize;
6902 LocalShiftSize = ShiftSize;
6903 }
6904 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
6905 "Encoding cannot handle value that big");
6906
6907 Offset -= ThisVal << LocalShiftSize;
6908 if (Offset == 0)
6909 TmpReg = DestReg;
6910 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
6911 .addReg(SrcReg)
6912 .addImm(Sign * (int)ThisVal);
6913 if (ShiftSize)
6914 MBI = MBI.addImm(
6916 MBI = MBI.setMIFlag(Flag);
6917
6918 auto Change =
6919 VScale == 1
6920 ? StackOffset::getFixed(ThisVal << LocalShiftSize)
6921 : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
6922 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
6923 CFAOffset += Change;
6924 else
6925 CFAOffset -= Change;
6926 if (EmitCFAOffset && DestReg == TmpReg) {
6927 MachineFunction &MF = *MBB.getParent();
6928 const TargetSubtargetInfo &STI = MF.getSubtarget();
6929 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
6930
6931 unsigned CFIIndex = MF.addFrameInst(
6932 createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
6933 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
6934 .addCFIIndex(CFIIndex)
6935 .setMIFlags(Flag);
6936 }
6937
6938 if (NeedsWinCFI) {
6939 int Imm = (int)(ThisVal << LocalShiftSize);
6940 if (VScale != 1 && DestReg == AArch64::SP) {
6941 if (HasWinCFI)
6942 *HasWinCFI = true;
6943 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AllocZ))
6944 .addImm(ThisVal)
6945 .setMIFlag(Flag);
6946 } else if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
6947 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
6948 assert(VScale == 1 && "Expected non-scalable operation");
6949 if (HasWinCFI)
6950 *HasWinCFI = true;
6951 if (Imm == 0)
6952 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
6953 else
6954 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
6955 .addImm(Imm)
6956 .setMIFlag(Flag);
6957 assert(Offset == 0 && "Expected remaining offset to be zero to "
6958 "emit a single SEH directive");
6959 } else if (DestReg == AArch64::SP) {
6960 assert(VScale == 1 && "Expected non-scalable operation");
6961 if (HasWinCFI)
6962 *HasWinCFI = true;
6963 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
6964 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
6965 .addImm(Imm)
6966 .setMIFlag(Flag);
6967 }
6968 }
6969
6970 SrcReg = TmpReg;
6971 } while (Offset);
6972}
6973
// NOTE(review): the opening lines of this signature are missing from this
// listing; a caller further down invokes the routine as emitFrameOffset with a
// matching argument list — confirm against the full file.
//
// Emits the instructions that compute DestReg = SrcReg + Offset at MBBI.
// Offset is decomposed into a fixed byte count, a number of predicate vectors
// and a number of data vectors; every non-zero part is emitted through
// emitFrameOffsetAdj. CFAOffset and FrameReg are updated between the parts so
// that each helper call sees the state left by the previous one. When SetNZCV
// is requested and a scalable part exists, a final `ADDSXri DestReg, DestReg,
// #0` is appended instead of using a flag-setting opcode for the fixed part.
6976 unsigned DestReg, unsigned SrcReg,
6978 MachineInstr::MIFlag Flag, bool SetNZCV,
6979 bool NeedsWinCFI, bool *HasWinCFI,
6980 bool EmitCFAOffset, StackOffset CFAOffset,
6981 unsigned FrameReg) {
6982 // If a function is marked as arm_locally_streaming, then the runtime value of
6983 // vscale in the prologue/epilogue is different from the runtime value of vscale
6984 // in the function's body. To avoid having to consider multiple vscales,
6985 // we can use `addsvl` to allocate any scalable stack-slots, which under
6986 // most circumstances will be only locals, not callee-save slots.
6987 const Function &F = MBB.getParent()->getFunction();
6988 bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
6989
6990 int64_t Bytes, NumPredicateVectors, NumDataVectors;
6991 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6992 Offset, Bytes, NumPredicateVectors, NumDataVectors);
6993
6994 // Insert ADDSXri for scalable offset at the end.
6995 bool NeedsFinalDefNZCV = SetNZCV && (NumPredicateVectors || NumDataVectors);
6996 if (NeedsFinalDefNZCV)
6997 SetNZCV = false;
6998
6999 // First emit non-scalable frame offsets, or a simple 'mov'.
7000 if (Bytes || (!Offset && SrcReg != DestReg)) {
7001 assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
7002 "SP increment/decrement not 8-byte aligned");
7003 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
     // The helper is handed a magnitude; a negative byte count is expressed by
     // switching to the matching SUB opcode.
7004 if (Bytes < 0) {
7005 Bytes = -Bytes;
7006 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
7007 }
7008 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
7009 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
7010 FrameReg);
     // Account for the fixed adjustment just emitted before handing CFAOffset
     // to the scalable adjustments below.
7011 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
7012 ? StackOffset::getFixed(-Bytes)
7013 : StackOffset::getFixed(Bytes);
     // Later parts continue from the partially adjusted DestReg.
7014 SrcReg = DestReg;
7015 FrameReg = DestReg;
7016 }
7017
7018 assert(!(NeedsWinCFI && NumPredicateVectors) &&
7019 "WinCFI can't allocate fractions of an SVE data vector");
7020
7021 if (NumDataVectors) {
7022 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
7023 UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, TII,
7024 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
7025 FrameReg);
7026 CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
7027 SrcReg = DestReg;
7028 }
7029
7030 if (NumPredicateVectors) {
7031 assert(DestReg != AArch64::SP && "Unaligned access to SP");
7032 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
7033 UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, TII,
7034 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
7035 FrameReg);
7036 }
7037
     // Adding zero with the flag-setting opcode leaves DestReg's value as is
     // while defining NZCV from the final result.
7038 if (NeedsFinalDefNZCV)
7039 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDSXri), DestReg)
7040 .addReg(DestReg)
7041 .addImm(0)
7042 .addImm(0);
7043}
7044
// NOTE(review): the signature lines naming this const member function are
// missing from this listing (only the trailing parameters survive), as is the
// declaration of `TRI` used below. Presumably this is the AArch64 override of
// foldMemoryOperandImpl — confirm against the full file.
//
// Tries to replace the COPY `MI` by a direct store to / load from the stack
// slot FrameIndex. Returns the newly inserted store or load when the fold
// succeeds and nullptr when nothing can be folded. Copies involving SP only
// get their virtual register class constrained, and copies involving NZCV are
// rejected outright.
7047 int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS,
7048 VirtRegMap *VRM) const {
7050 // This is a bit of a hack. Consider this instruction:
7051 //
7052 // %0 = COPY %sp; GPR64all:%0
7053 //
7054 // We explicitly chose GPR64all for the virtual register so such a copy might
7055 // be eliminated by RegisterCoalescer. However, that may not be possible, and
7056 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
7057 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
7058 //
7059 // To prevent that, we are going to constrain the %0 register class here.
7060 if (MI.isFullCopy()) {
7061 Register DstReg = MI.getOperand(0).getReg();
7062 Register SrcReg = MI.getOperand(1).getReg();
7063 if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
7064 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
7065 return nullptr;
7066 }
7067 if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
7068 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
7069 return nullptr;
7070 }
7071 // Nothing can be folded with a copy from/to NZCV.
7072 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
7073 return nullptr;
7074 }
7075
7076 // Handle the case where a copy is being spilled or filled but the source
7077 // and destination register class don't match. For example:
7078 //
7079 // %0 = COPY %xzr; GPR64common:%0
7080 //
7081 // In this case we can still safely fold away the COPY and generate the
7082 // following spill code:
7083 //
7084 // STRXui %xzr, %stack.0
7085 //
7086 // This also eliminates spilled cross register class COPYs (e.g. between x and
7087 // d regs) of the same size. For example:
7088 //
7089 // %0 = COPY %1; GPR64:%0, FPR64:%1
7090 //
7091 // will be filled as
7092 //
7093 // LDRDui %0, fi<#0>
7094 //
7095 // instead of
7096 //
7097 // LDRXui %Temp, fi<#0>
7098 // %0 = FMOV %Temp
7099 //
7100 if (MI.isCopy() && Ops.size() == 1 &&
7101 // Make sure we're only folding the explicit COPY defs/uses.
7102 (Ops[0] == 0 || Ops[0] == 1)) {
     // Folding operand 0 (the def) is treated as a spill of the copied value;
     // folding operand 1 (the use) is treated as a fill.
7103 bool IsSpill = Ops[0] == 0;
7104 bool IsFill = !IsSpill;
7106 const MachineRegisterInfo &MRI = MF.getRegInfo();
7107 MachineBasicBlock &MBB = *MI.getParent();
7108 const MachineOperand &DstMO = MI.getOperand(0);
7109 const MachineOperand &SrcMO = MI.getOperand(1);
7110 Register DstReg = DstMO.getReg();
7111 Register SrcReg = SrcMO.getReg();
7112 // This is slightly expensive to compute for physical regs since
7113 // getMinimalPhysRegClass is slow.
7114 auto getRegClass = [&](unsigned Reg) {
7115 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
7116 : TRI.getMinimalPhysRegClass(Reg);
7117 };
7118
     // Plain full-register COPY: store the source or load the destination
     // directly, using that register's own class.
7119 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
7120 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
7121 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
7122 "Mismatched register size in non subreg COPY");
7123 if (IsSpill)
7124 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
7125 getRegClass(SrcReg), Register());
7126 else
7127 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
7128 getRegClass(DstReg), Register());
     // The instruction just inserted sits immediately before InsertPt.
7129 return &*--InsertPt;
7130 }
7131
7132 // Handle cases like spilling def of:
7133 //
7134 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
7135 //
7136 // where the physical register source can be widened and stored to the full
7137 // virtual reg destination stack slot, in this case producing:
7138 //
7139 // STRXui %xzr, %stack.0
7140 //
7141 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
7142 TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
7143 assert(SrcMO.getSubReg() == 0 &&
7144 "Unexpected subreg on physical register");
7145 storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
7146 FrameIndex, &AArch64::GPR64RegClass, Register());
7147 return &*--InsertPt;
7148 }
7149
7150 // Handle cases like filling use of:
7151 //
7152 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
7153 //
7154 // where we can load the full virtual reg source stack slot, into the subreg
7155 // destination, in this case producing:
7156 //
7157 // LDRWui %0:sub_32<def,read-undef>, %stack.0
7158 //
7159 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
     // Choose the register class to load with from the destination
     // sub-register index; FillRC stays null for unsupported indices.
7160 const TargetRegisterClass *FillRC = nullptr;
7161 switch (DstMO.getSubReg()) {
7162 default:
7163 break;
7164 case AArch64::sub_32:
7165 if (AArch64::GPR64RegClass.hasSubClassEq(getRegClass(DstReg)))
7166 FillRC = &AArch64::GPR32RegClass;
7167 break;
7168 case AArch64::ssub:
7169 FillRC = &AArch64::FPR32RegClass;
7170 break;
7171 case AArch64::dsub:
7172 FillRC = &AArch64::FPR64RegClass;
7173 break;
7174 }
7175
7176 if (FillRC) {
7177 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
7178 TRI.getRegSizeInBits(*FillRC) &&
7179 "Mismatched regclass size on folded subreg COPY");
7180 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC,
7181 Register());
     // Re-apply the COPY's sub-register index and undef flag to the load's
     // destination operand.
7182 MachineInstr &LoadMI = *--InsertPt;
7183 MachineOperand &LoadDst = LoadMI.getOperand(0);
7184 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
7185 LoadDst.setSubReg(DstMO.getSubReg());
7186 LoadDst.setIsUndef();
7187 return &LoadMI;
7188 }
7189 }
7190 }
7191
7192 // Cannot fold.
7193 return nullptr;
7194}
7195
// NOTE(review): the first signature line, the statement ending the early-exit
// case list, part of the UnscaledOp initializer and the start of the final
// return statement are missing from this listing. The llvm_unreachable
// messages below name the routine isAArch64FrameOffsetLegal.
//
// Works out how much of SOffset can be folded into the immediate operand of
// the memory instruction MI. On the normal path:
//   - *EmittableOffset receives the immediate to encode, in units of the
//     instruction's scale;
//   - SOffset is rewritten to the part that could not be folded;
//   - *OutUseUnscaledOp / *OutUnscaledOp report whether, and to which opcode,
//     the instruction should be switched to its unscaled form.
// Each of the three out-pointers may be null.
7197 StackOffset &SOffset,
7198 bool *OutUseUnscaledOp,
7199 unsigned *OutUnscaledOp,
7200 int64_t *EmittableOffset) {
7201 // Set output values in case of early exit.
7202 if (EmittableOffset)
7203 *EmittableOffset = 0;
7204 if (OutUseUnscaledOp)
7205 *OutUseUnscaledOp = false;
7206 if (OutUnscaledOp)
7207 *OutUnscaledOp = 0;
7208
7209 // Exit early for structured vector spills/fills as they can't take an
7210 // immediate offset.
7211 switch (MI.getOpcode()) {
7212 default:
7213 break;
7214 case AArch64::LD1Rv1d:
7215 case AArch64::LD1Rv2s:
7216 case AArch64::LD1Rv2d:
7217 case AArch64::LD1Rv4h:
7218 case AArch64::LD1Rv4s:
7219 case AArch64::LD1Rv8b:
7220 case AArch64::LD1Rv8h:
7221 case AArch64::LD1Rv16b:
7222 case AArch64::LD1Twov2d:
7223 case AArch64::LD1Threev2d:
7224 case AArch64::LD1Fourv2d:
7225 case AArch64::LD1Twov1d:
7226 case AArch64::LD1Threev1d:
7227 case AArch64::LD1Fourv1d:
7228 case AArch64::ST1Twov2d:
7229 case AArch64::ST1Threev2d:
7230 case AArch64::ST1Fourv2d:
7231 case AArch64::ST1Twov1d:
7232 case AArch64::ST1Threev1d:
7233 case AArch64::ST1Fourv1d:
7234 case AArch64::ST1i8:
7235 case AArch64::ST1i16:
7236 case AArch64::ST1i32:
7237 case AArch64::ST1i64:
7238 case AArch64::IRG:
7239 case AArch64::IRGstack:
7240 case AArch64::STGloop:
7241 case AArch64::STZGloop:
7243 }
7244
7245 // Get the min/max offset and the scale.
7246 TypeSize ScaleValue(0U, false), Width(0U, false);
7247 int64_t MinOff, MaxOff;
7248 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
7249 MaxOff))
7250 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
7251
7252 // Construct the complete offset.
     // A scalable scale selects the scalable component of SOffset; otherwise
     // the fixed component is used.
7253 bool IsMulVL = ScaleValue.isScalable();
7254 unsigned Scale = ScaleValue.getKnownMinValue();
7255 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
7256
     // Add the immediate already present on the instruction, multiplied by
     // Scale.
7257 const MachineOperand &ImmOpnd =
7258 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
7259 Offset += ImmOpnd.getImm() * Scale;
7260
7261 // If the offset doesn't match the scale, we rewrite the instruction to
7262 // use the unscaled instruction instead. Likewise, if we have a negative
7263 // offset and there is an unscaled op to use.
7264 std::optional<unsigned> UnscaledOp =
7266 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
     // When switching opcode, re-query scale and range for the unscaled form.
7267 if (useUnscaledOp &&
7268 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
7269 MaxOff))
7270 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
7271
7272 Scale = ScaleValue.getKnownMinValue();
7273 assert(IsMulVL == ScaleValue.isScalable() &&
7274 "Unscaled opcode has different value for scalable");
7275
7276 int64_t Remainder = Offset % Scale;
7277 assert(!(Remainder && useUnscaledOp) &&
7278 "Cannot have remainder when using unscaled op");
7279
7280 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
7281 int64_t NewOffset = Offset / Scale;
     // In range: only the sub-scale remainder is left over. Out of range:
     // clamp the immediate to the nearest bound and leave the rest in Offset.
7282 if (MinOff <= NewOffset && NewOffset <= MaxOff)
7283 Offset = Remainder;
7284 else {
7285 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
7286 Offset = Offset - (NewOffset * Scale);
7287 }
7288
7289 if (EmittableOffset)
7290 *EmittableOffset = NewOffset;
7291 if (OutUseUnscaledOp)
7292 *OutUseUnscaledOp = useUnscaledOp;
7293 if (OutUnscaledOp && UnscaledOp)
7294 *OutUnscaledOp = *UnscaledOp;
7295
     // Write the leftover back into the component it was taken from, keeping
     // the other component of SOffset untouched.
7296 if (IsMulVL)
7297 SOffset = StackOffset::get(SOffset.getFixed(), Offset);
7298 else
7299 SOffset = StackOffset::get(Offset, SOffset.getScalable());
7301 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
7302}
7303
// NOTE(review): the first signature line and the status check that opens the
// block after the isAArch64FrameOffsetLegal call are missing from this
// listing; the routine is presumably rewriteAArch64FrameIndex — confirm.
//
// Rewrites the frame-index operand at FrameRegIdx of MI so that it refers to
// FrameReg. ADDXri/ADDSXri are replaced entirely by an emitFrameOffset
// sequence; for other opcodes the immediate (and, if needed, the opcode) is
// updated in place. Returns true when no offset remains to be materialized;
// Offset is updated to whatever is left over.
7305 unsigned FrameReg, StackOffset &Offset,
7306 const AArch64InstrInfo *TII) {
7307 unsigned Opcode = MI.getOpcode();
     // The immediate operand is taken to directly follow the frame-index
     // operand.
7308 unsigned ImmIdx = FrameRegIdx + 1;
7309
7310 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
     // Merge the instruction's own immediate into Offset, emit a fresh
     // sequence for the combined value and drop the original instruction.
7311 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
7312 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
7313 MI.getOperand(0).getReg(), FrameReg, Offset, TII,
7314 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
7315 MI.eraseFromParent();
7316 Offset = StackOffset();
7317 return true;
7318 }
7319
7320 int64_t NewOffset;
7321 unsigned UnscaledOp;
7322 bool UseUnscaledOp;
7323 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
7324 &UnscaledOp, &NewOffset);
7327 // Replace the FrameIndex with FrameReg.
7328 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
7329 if (UseUnscaledOp)
7330 MI.setDesc(TII->get(UnscaledOp));
7331
7332 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
     // True only if the legality check left nothing in Offset.
7333 return !Offset;
7334 }
7335
7336 return false;
7337}
7338
7344
7345MCInst AArch64InstrInfo::getNop() const { return MCInstBuilder(AArch64::NOP); }
7346
7347// AArch64 supports MachineCombiner.
7348bool AArch64InstrInfo::useMachineCombiner() const { return true; }
7349
7350// True when Opc sets flag
7351static bool isCombineInstrSettingFlag(unsigned Opc) {
7352 switch (Opc) {
7353 case AArch64::ADDSWrr:
7354 case AArch64::ADDSWri:
7355 case AArch64::ADDSXrr:
7356 case AArch64::ADDSXri:
7357 case AArch64::SUBSWrr:
7358 case AArch64::SUBSXrr:
7359 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7360 case AArch64::SUBSWri:
7361 case AArch64::SUBSXri:
7362 return true;
7363 default:
7364 break;
7365 }
7366 return false;
7367}
7368
7369// 32b Opcodes that can be combined with a MUL
7370static bool isCombineInstrCandidate32(unsigned Opc) {
7371 switch (Opc) {
7372 case AArch64::ADDWrr:
7373 case AArch64::ADDWri:
7374 case AArch64::SUBWrr:
7375 case AArch64::ADDSWrr:
7376 case AArch64::ADDSWri:
7377 case AArch64::SUBSWrr:
7378 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7379 case AArch64::SUBWri:
7380 case AArch64::SUBSWri:
7381 return true;
7382 default:
7383 break;
7384 }
7385 return false;
7386}
7387
7388// 64b Opcodes that can be combined with a MUL
7389static bool isCombineInstrCandidate64(unsigned Opc) {
7390 switch (Opc) {
7391 case AArch64::ADDXrr:
7392 case AArch64::ADDXri:
7393 case AArch64::SUBXrr:
7394 case AArch64::ADDSXrr:
7395 case AArch64::ADDSXri:
7396 case AArch64::SUBSXrr:
7397 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7398 case AArch64::SUBXri:
7399 case AArch64::SUBSXri:
7400 case AArch64::ADDv8i8:
7401 case AArch64::ADDv16i8:
7402 case AArch64::ADDv4i16:
7403 case AArch64::ADDv8i16:
7404 case AArch64::ADDv2i32:
7405 case AArch64::ADDv4i32:
7406 case AArch64::SUBv8i8:
7407 case AArch64::SUBv16i8:
7408 case AArch64::SUBv4i16:
7409 case AArch64::SUBv8i16:
7410 case AArch64::SUBv2i32:
7411 case AArch64::SUBv4i32:
7412 return true;
7413 default:
7414 break;
7415 }
7416 return false;
7417}
7418
7419// FP Opcodes that can be combined with a FMUL.
// NOTE(review): two lines of the FADD/FSUB case body are missing from this
// listing: the declaration of `Options` and the right-hand operand of the
// `||` in the return statement. Per the comment inside the case, that operand
// tests the instruction's contract fast-math flag.
//
// Returns false for any opcode other than the scalar and vector FADD/FSUB
// opcodes listed below.
7420static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
7421 switch (Inst.getOpcode()) {
7422 default:
7423 break;
7424 case AArch64::FADDHrr:
7425 case AArch64::FADDSrr:
7426 case AArch64::FADDDrr:
7427 case AArch64::FADDv4f16:
7428 case AArch64::FADDv8f16:
7429 case AArch64::FADDv2f32:
7430 case AArch64::FADDv2f64:
7431 case AArch64::FADDv4f32:
7432 case AArch64::FSUBHrr:
7433 case AArch64::FSUBSrr:
7434 case AArch64::FSUBDrr:
7435 case AArch64::FSUBv4f16:
7436 case AArch64::FSUBv8f16:
7437 case AArch64::FSUBv2f32:
7438 case AArch64::FSUBv2f64:
7439 case AArch64::FSUBv4f32:
7441 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
7442 // the target options or if FADD/FSUB has the contract fast-math flag.
7443 return Options.AllowFPOpFusion == FPOpFusion::Fast ||
7445 }
7446 return false;
7447}
7448
7449// Opcodes that can be combined with a MUL
7453
7454//
7455// Utility routine that checks if \param MO is defined by an
7456// \param CombineOpc instruction in the basic block \param MBB
// NOTE(review): the first signature line (return type, name, and the MBB and
// MO parameters) is missing from this listing; the wrappers below call this
// routine as canCombine.
//
// Returns true only if MO is a virtual register whose unique definition is a
// CombineOpc instruction located in MBB and whose result has exactly one
// non-debug use. With CheckZeroReg set, operand 3 of the defining instruction
// must additionally be ZeroReg.
7458 unsigned CombineOpc, unsigned ZeroReg = 0,
7459 bool CheckZeroReg = false) {
7460 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7461 MachineInstr *MI = nullptr;
7462
7463 if (MO.isReg() && MO.getReg().isVirtual())
7464 MI = MRI.getUniqueVRegDef(MO.getReg());
7465 // And it needs to be in the trace (otherwise, it won't have a depth).
7466 if (!MI || MI->getParent() != &MBB || MI->getOpcode() != CombineOpc)
7467 return false;
7468 // Must only be used by the user we combine with.
7469 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
7470 return false;
7471
7472 if (CheckZeroReg) {
7473 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
7474 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
7475 MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
7476 // The third input reg must be zero.
7477 if (MI->getOperand(3).getReg() != ZeroReg)
7478 return false;
7479 }
7480
     // For a flag-setting CombineOpc, refuse to combine when the NZCV lookup
     // (last argument `true`) finds no matching def on the instruction.
7481 if (isCombineInstrSettingFlag(CombineOpc) &&
7482 MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
7483 return false;
7484
7485 return true;
7486}
7487
7488//
7489// Is \param MO defined by an integer multiply and can be combined?
// NOTE(review): the first signature line is missing from this listing; callers
// below use the name canCombineWithMUL.
// Forwards to canCombine with the zero-register check enabled, so the
// defining MulOpc instruction must have ZeroReg as its operand 3.
7491 unsigned MulOpc, unsigned ZeroReg) {
7492 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
7493}
7494
7495//
7496// Is \param MO defined by a floating-point multiply and can be combined?
// NOTE(review): the first signature line is missing from this listing; callers
// below use the name canCombineWithFMUL.
// Forwards to canCombine with its default arguments, i.e. without the
// zero-register check.
7498 unsigned MulOpc) {
7499 return canCombine(MBB, MO, MulOpc);
7500}
7501
7502// TODO: There are many more machine instruction opcodes to match:
7503// 1. Other data types (integer, vectors)
7504// 2. Other math / logic operations (xor, or)
7505// 3. Other forms of the same operation (intrinsics and other variants)
//
// Reports whether Inst's opcode is treated as associative and commutative.
// The answer is always false when Invert is set and for any opcode not listed
// below. The integer opcodes listed below return true unconditionally.
7506bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
7507 bool Invert) const {
7508 if (Invert)
7509 return false;
7510 switch (Inst.getOpcode()) {
7511 // == Floating-point types ==
7512 // -- Floating-point instructions --
7513 case AArch64::FADDHrr:
7514 case AArch64::FADDSrr:
7515 case AArch64::FADDDrr:
7516 case AArch64::FMULHrr:
7517 case AArch64::FMULSrr:
7518 case AArch64::FMULDrr:
7519 case AArch64::FMULX16:
7520 case AArch64::FMULX32:
7521 case AArch64::FMULX64:
7522 // -- Advanced SIMD instructions --
7523 case AArch64::FADDv4f16:
7524 case AArch64::FADDv8f16:
7525 case AArch64::FADDv2f32:
7526 case AArch64::FADDv4f32:
7527 case AArch64::FADDv2f64:
7528 case AArch64::FMULv4f16:
7529 case AArch64::FMULv8f16:
7530 case AArch64::FMULv2f32:
7531 case AArch64::FMULv4f32:
7532 case AArch64::FMULv2f64:
7533 case AArch64::FMULXv4f16:
7534 case AArch64::FMULXv8f16:
7535 case AArch64::FMULXv2f32:
7536 case AArch64::FMULXv4f32:
7537 case AArch64::FMULXv2f64:
7538 // -- SVE instructions --
7539 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
7540 // in the SVE instruction set (though there are predicated ones).
7541 case AArch64::FADD_ZZZ_H:
7542 case AArch64::FADD_ZZZ_S:
7543 case AArch64::FADD_ZZZ_D:
7544 case AArch64::FMUL_ZZZ_H:
7545 case AArch64::FMUL_ZZZ_S:
7546 case AArch64::FMUL_ZZZ_D:
     // NOTE(review): the statement that terminates the floating-point case
     // group is missing from this listing (two listing lines are skipped
     // here); as shown, the FP cases would fall through into the integer
     // group. Confirm the FP-specific condition against the full file.
7549
7550 // == Integer types ==
7551 // -- Base instructions --
7552 // Opcodes MULWrr and MULXrr don't exist because
7553 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
7554 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
7555 // The machine-combiner does not support three-source-operands machine
7556 // instruction. So we cannot reassociate MULs.
7557 case AArch64::ADDWrr:
7558 case AArch64::ADDXrr:
7559 case AArch64::ANDWrr:
7560 case AArch64::ANDXrr:
7561 case AArch64::ORRWrr:
7562 case AArch64::ORRXrr:
7563 case AArch64::EORWrr:
7564 case AArch64::EORXrr:
7565 case AArch64::EONWrr:
7566 case AArch64::EONXrr:
7567 // -- Advanced SIMD instructions --
7568 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
7569 // in the Advanced SIMD instruction set.
7570 case AArch64::ADDv8i8:
7571 case AArch64::ADDv16i8:
7572 case AArch64::ADDv4i16:
7573 case AArch64::ADDv8i16:
7574 case AArch64::ADDv2i32:
7575 case AArch64::ADDv4i32:
7576 case AArch64::ADDv1i64:
7577 case AArch64::ADDv2i64:
7578 case AArch64::MULv8i8:
7579 case AArch64::MULv16i8:
7580 case AArch64::MULv4i16:
7581 case AArch64::MULv8i16:
7582 case AArch64::MULv2i32:
7583 case AArch64::MULv4i32:
7584 case AArch64::ANDv8i8:
7585 case AArch64::ANDv16i8:
7586 case AArch64::ORRv8i8:
7587 case AArch64::ORRv16i8:
7588 case AArch64::EORv8i8:
7589 case AArch64::EORv16i8:
7590 // -- SVE instructions --
7591 case AArch64::ADD_ZZZ_B:
7592 case AArch64::ADD_ZZZ_H:
7593 case AArch64::ADD_ZZZ_S:
7594 case AArch64::ADD_ZZZ_D:
7595 case AArch64::MUL_ZZZ_B:
7596 case AArch64::MUL_ZZZ_H:
7597 case AArch64::MUL_ZZZ_S:
7598 case AArch64::MUL_ZZZ_D:
7599 case AArch64::AND_ZZZ:
7600 case AArch64::ORR_ZZZ:
7601 case AArch64::EOR_ZZZ:
7602 return true;
7603
7604 default:
7605 return false;
7606 }
7607}
7608
7609/// Find instructions that can be turned into madd.
// NOTE(review): several lines are missing from this listing: the first
// signature line, the condition guarding the first `return false`, the
// condition opening the NZCV block, and the declaration that introduces the
// `MCP` alias used below. The routine is presumably getMaddPatterns — confirm.
//
// Appends to Patterns one entry for every operand of Root that is produced by
// a combinable multiply, and returns true if at least one entry was added.
7611 SmallVectorImpl<unsigned> &Patterns) {
7612 unsigned Opc = Root.getOpcode();
7613 MachineBasicBlock &MBB = *Root.getParent();
7614 bool Found = false;
7615
7617 return false;
7619 int Cmp_NZCV =
7620 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
7621 // When NZCV is live bail out.
7622 if (Cmp_NZCV == -1)
7623 return false;
7624 unsigned NewOpc = convertToNonFlagSettingOpc(Root);
7625 // When opcode can't change bail out.
7626 // CHECKME: do we miss any cases for opcode conversion?
7627 if (NewOpc == Opc)
7628 return false;
     // From here on, match against the non-flag-setting form of the opcode.
7629 Opc = NewOpc;
7630 }
7631
     // Scalar helper: records Pattern when operand `Operand` of Root is defined
     // by a combinable `Opcode` instruction whose operand 3 is ZeroReg.
7632 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
7633 unsigned Pattern) {
7634 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
7635 Patterns.push_back(Pattern);
7636 Found = true;
7637 }
7638 };
7639
     // Vector helper: same as above but without the zero-register check.
7640 auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
7641 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
7642 Patterns.push_back(Pattern);
7643 Found = true;
7644 }
7645 };
7646
7648
7649 switch (Opc) {
7650 default:
7651 break;
7652 case AArch64::ADDWrr:
7653 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7654 "ADDWrr does not have register operands");
7655 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
7656 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
7657 break;
7658 case AArch64::ADDXrr:
7659 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
7660 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
7661 break;
     // For the register-register subtractions, operand 2 is tried before
     // operand 1, which fixes the order in which patterns are appended.
7662 case AArch64::SUBWrr:
7663 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
7664 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
7665 break;
7666 case AArch64::SUBXrr:
7667 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
7668 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
7669 break;
     // Immediate forms: only operand 1 is checked.
7670 case AArch64::ADDWri:
7671 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
7672 break;
7673 case AArch64::ADDXri:
7674 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
7675 break;
7676 case AArch64::SUBWri:
7677 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
7678 break;
7679 case AArch64::SUBXri:
7680 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
7681 break;
     // Vector forms: the 16- and 32-bit element types also try the indexed
     // multiply opcodes; the 8-bit element types only try the plain multiply.
7682 case AArch64::ADDv8i8:
7683 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
7684 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
7685 break;
7686 case AArch64::ADDv16i8:
7687 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
7688 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
7689 break;
7690 case AArch64::ADDv4i16:
7691 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
7692 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
7693 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
7694 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
7695 break;
7696 case AArch64::ADDv8i16:
7697 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
7698 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
7699 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
7700 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
7701 break;
7702 case AArch64::ADDv2i32:
7703 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
7704 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
7705 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
7706 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
7707 break;
7708 case AArch64::ADDv4i32:
7709 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
7710 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
7711 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
7712 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
7713 break;
7714 case AArch64::SUBv8i8:
7715 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
7716 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
7717 break;
7718 case AArch64::SUBv16i8:
7719 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
7720 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
7721 break;
7722 case AArch64::SUBv4i16:
7723 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
7724 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
7725 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
7726 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
7727 break;
7728 case AArch64::SUBv8i16:
7729 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
7730 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
7731 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
7732 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
7733 break;
7734 case AArch64::SUBv2i32:
7735 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
7736 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
7737 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
7738 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
7739 break;
7740 case AArch64::SUBv4i32:
7741 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
7742 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
7743 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
7744 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
7745 break;
7746 }
7747 return Found;
7748}
7749
7750bool AArch64InstrInfo::isAccumulationOpcode(unsigned Opcode) const {
7751 switch (Opcode) {
7752 default:
7753 break;
7754 case AArch64::UABALB_ZZZ_D:
7755 case AArch64::UABALB_ZZZ_H:
7756 case AArch64::UABALB_ZZZ_S:
7757 case AArch64::UABALT_ZZZ_D:
7758 case AArch64::UABALT_ZZZ_H:
7759 case AArch64::UABALT_ZZZ_S:
7760 case AArch64::SABALB_ZZZ_D:
7761 case AArch64::SABALB_ZZZ_S:
7762 case AArch64::SABALB_ZZZ_H:
7763 case AArch64::SABALT_ZZZ_D:
7764 case AArch64::SABALT_ZZZ_S:
7765 case AArch64::SABALT_ZZZ_H:
7766 case AArch64::UABALv16i8_v8i16:
7767 case AArch64::UABALv2i32_v2i64:
7768 case AArch64::UABALv4i16_v4i32:
7769 case AArch64::UABALv4i32_v2i64:
7770 case AArch64::UABALv8i16_v4i32:
7771 case AArch64::UABALv8i8_v8i16:
7772 case AArch64::UABAv16i8:
7773 case AArch64::UABAv2i32:
7774 case AArch64::UABAv4i16:
7775 case AArch64::UABAv4i32:
7776 case AArch64::UABAv8i16:
7777 case AArch64::UABAv8i8:
7778 case AArch64::SABALv16i8_v8i16:
7779 case AArch64::SABALv2i32_v2i64:
7780 case AArch64::SABALv4i16_v4i32:
7781 case AArch64::SABALv4i32_v2i64:
7782 case AArch64::SABALv8i16_v4i32:
7783 case AArch64::SABALv8i8_v8i16:
7784 case AArch64::SABAv16i8:
7785 case AArch64::SABAv2i32:
7786 case AArch64::SABAv4i16:
7787 case AArch64::SABAv4i32:
7788 case AArch64::SABAv8i16:
7789 case AArch64::SABAv8i8:
7790 return true;
7791 }
7792
7793 return false;
7794}
7795
7796unsigned AArch64InstrInfo::getAccumulationStartOpcode(
7797 unsigned AccumulationOpcode) const {
7798 switch (AccumulationOpcode) {
7799 default:
7800 llvm_unreachable("Unsupported accumulation Opcode!");
7801 case AArch64::UABALB_ZZZ_D:
7802 return AArch64::UABDLB_ZZZ_D;
7803 case AArch64::UABALB_ZZZ_H:
7804 return AArch64::UABDLB_ZZZ_H;
7805 case AArch64::UABALB_ZZZ_S:
7806 return AArch64::UABDLB_ZZZ_S;
7807 case AArch64::UABALT_ZZZ_D:
7808 return AArch64::UABDLT_ZZZ_D;
7809 case AArch64::UABALT_ZZZ_H:
7810 return AArch64::UABDLT_ZZZ_H;
7811 case AArch64::UABALT_ZZZ_S:
7812 return AArch64::UABDLT_ZZZ_S;
7813 case AArch64::UABALv16i8_v8i16:
7814 return AArch64::UABDLv16i8_v8i16;
7815 case AArch64::UABALv2i32_v2i64:
7816 return AArch64::UABDLv2i32_v2i64;
7817 case AArch64::UABALv4i16_v4i32:
7818 return AArch64::UABDLv4i16_v4i32;
7819 case AArch64::UABALv4i32_v2i64:
7820 return AArch64::UABDLv4i32_v2i64;
7821 case AArch64::UABALv8i16_v4i32:
7822 return AArch64::UABDLv8i16_v4i32;
7823 case AArch64::UABALv8i8_v8i16:
7824 return AArch64::UABDLv8i8_v8i16;
7825 case AArch64::UABAv16i8:
7826 return AArch64::UABDv16i8;
7827 case AArch64::UABAv2i32:
7828 return AArch64::UABDv2i32;
7829 case AArch64::UABAv4i16:
7830 return AArch64::UABDv4i16;
7831 case AArch64::UABAv4i32:
7832 return AArch64::UABDv4i32;
7833 case AArch64::UABAv8i16:
7834 return AArch64::UABDv8i16;
7835 case AArch64::UABAv8i8:
7836 return AArch64::UABDv8i8;
7837 case AArch64::SABALB_ZZZ_D:
7838 return AArch64::SABDLB_ZZZ_D;
7839 case AArch64::SABALB_ZZZ_S:
7840 return AArch64::SABDLB_ZZZ_S;
7841 case AArch64::SABALB_ZZZ_H:
7842 return AArch64::SABDLB_ZZZ_H;
7843 case AArch64::SABALT_ZZZ_D:
7844 return AArch64::SABDLT_ZZZ_D;
7845 case AArch64::SABALT_ZZZ_S:
7846 return AArch64::SABDLT_ZZZ_S;
7847 case AArch64::SABALT_ZZZ_H:
7848 return AArch64::SABDLT_ZZZ_H;
7849 case AArch64::SABALv16i8_v8i16:
7850 return AArch64::SABDLv16i8_v8i16;
7851 case AArch64::SABALv2i32_v2i64:
7852 return AArch64::SABDLv2i32_v2i64;
7853 case AArch64::SABALv4i16_v4i32:
7854 return AArch64::SABDLv4i16_v4i32;
7855 case AArch64::SABALv4i32_v2i64:
7856 return AArch64::SABDLv4i32_v2i64;
7857 case AArch64::SABALv8i16_v4i32:
7858 return AArch64::SABDLv8i16_v4i32;
7859 case AArch64::SABALv8i8_v8i16:
7860 return AArch64::SABDLv8i8_v8i16;
7861 case AArch64::SABAv16i8:
7862 return AArch64::SABDv16i8;
7863 case AArch64::SABAv2i32:
7864 return AArch64::SABAv2i32;
7865 case AArch64::SABAv4i16:
7866 return AArch64::SABDv4i16;
7867 case AArch64::SABAv4i32:
7868 return AArch64::SABDv4i32;
7869 case AArch64::SABAv8i16:
7870 return AArch64::SABDv8i16;
7871 case AArch64::SABAv8i8:
7872 return AArch64::SABDv8i8;
7873 }
7874}
7875
7876/// Floating-Point Support
7877
7878/// Find instructions that can be turned into madd.
7880 SmallVectorImpl<unsigned> &Patterns) {
7881
7882 if (!isCombineInstrCandidateFP(Root))
7883 return false;
7884
7885 MachineBasicBlock &MBB = *Root.getParent();
7886 bool Found = false;
7887
7888 auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
7889 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
7890 Patterns.push_back(Pattern);
7891 return true;
7892 }
7893 return false;
7894 };
7895
7897
7898 switch (Root.getOpcode()) {
7899 default:
7900 assert(false && "Unsupported FP instruction in combiner\n");
7901 break;
7902 case AArch64::FADDHrr:
7903 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7904 "FADDHrr does not have register operands");
7905
7906 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
7907 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
7908 break;
7909 case AArch64::FADDSrr:
7910 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7911 "FADDSrr does not have register operands");
7912
7913 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
7914 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
7915
7916 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
7917 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
7918 break;
7919 case AArch64::FADDDrr:
7920 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
7921 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
7922
7923 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
7924 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
7925 break;
7926 case AArch64::FADDv4f16:
7927 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
7928 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
7929
7930 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
7931 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
7932 break;
7933 case AArch64::FADDv8f16:
7934 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
7935 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
7936
7937 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
7938 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
7939 break;
7940 case AArch64::FADDv2f32:
7941 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
7942 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
7943
7944 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
7945 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
7946 break;
7947 case AArch64::FADDv2f64:
7948 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
7949 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
7950
7951 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
7952 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
7953 break;
7954 case AArch64::FADDv4f32:
7955 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
7956 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
7957
7958 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
7959 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
7960 break;
7961 case AArch64::FSUBHrr:
7962 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
7963 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
7964 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
7965 break;
7966 case AArch64::FSUBSrr:
7967 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
7968
7969 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
7970 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
7971
7972 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
7973 break;
7974 case AArch64::FSUBDrr:
7975 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
7976
7977 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
7978 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
7979
7980 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
7981 break;
7982 case AArch64::FSUBv4f16:
7983 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
7984 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
7985
7986 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
7987 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
7988 break;
7989 case AArch64::FSUBv8f16:
7990 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
7991 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
7992
7993 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
7994 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
7995 break;
7996 case AArch64::FSUBv2f32:
7997 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
7998 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
7999
8000 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
8001 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
8002 break;
8003 case AArch64::FSUBv2f64:
8004 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
8005 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
8006
8007 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
8008 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
8009 break;
8010 case AArch64::FSUBv4f32:
8011 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
8012 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
8013
8014 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
8015 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
8016 break;
8017 }
8018 return Found;
8019}
8020
8022 SmallVectorImpl<unsigned> &Patterns) {
8023 MachineBasicBlock &MBB = *Root.getParent();
8024 bool Found = false;
8025
8026 auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
8027 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8028 MachineOperand &MO = Root.getOperand(Operand);
8029 MachineInstr *MI = nullptr;
8030 if (MO.isReg() && MO.getReg().isVirtual())
8031 MI = MRI.getUniqueVRegDef(MO.getReg());
8032 // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
8033 if (MI && MI->getOpcode() == TargetOpcode::COPY &&
8034 MI->getOperand(1).getReg().isVirtual())
8035 MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
8036 if (MI && MI->getOpcode() == Opcode) {
8037 Patterns.push_back(Pattern);
8038 return true;
8039 }
8040 return false;
8041 };
8042
8044
8045 switch (Root.getOpcode()) {
8046 default:
8047 return false;
8048 case AArch64::FMULv2f32:
8049 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
8050 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
8051 break;
8052 case AArch64::FMULv2f64:
8053 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
8054 Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
8055 break;
8056 case AArch64::FMULv4f16:
8057 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
8058 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
8059 break;
8060 case AArch64::FMULv4f32:
8061 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
8062 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
8063 break;
8064 case AArch64::FMULv8f16:
8065 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
8066 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
8067 break;
8068 }
8069
8070 return Found;
8071}
8072
8074 SmallVectorImpl<unsigned> &Patterns) {
8075 unsigned Opc = Root.getOpcode();
8076 MachineBasicBlock &MBB = *Root.getParent();
8077 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8078
8079 auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
8080 MachineOperand &MO = Root.getOperand(1);
8082 if (MI != nullptr && (MI->getOpcode() == Opcode) &&
8083 MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
8087 MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
8088 Patterns.push_back(Pattern);
8089 return true;
8090 }
8091 return false;
8092 };
8093
8094 switch (Opc) {
8095 default:
8096 break;
8097 case AArch64::FNEGDr:
8098 return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
8099 case AArch64::FNEGSr:
8100 return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
8101 }
8102
8103 return false;
8104}
8105
8106/// Return true when a code sequence can improve throughput. It
8107/// should be called only for instructions in loops.
8108/// \param Pattern - combiner pattern
8110 switch (Pattern) {
8111 default:
8112 break;
8218 return true;
8219 } // end switch (Pattern)
8220 return false;
8221}
8222
8223/// Find other MI combine patterns.
8225 SmallVectorImpl<unsigned> &Patterns) {
8226 // A - (B + C) ==> (A - B) - C or (A - C) - B
8227 unsigned Opc = Root.getOpcode();
8228 MachineBasicBlock &MBB = *Root.getParent();
8229
8230 switch (Opc) {
8231 case AArch64::SUBWrr:
8232 case AArch64::SUBSWrr:
8233 case AArch64::SUBXrr:
8234 case AArch64::SUBSXrr:
8235 // Found candidate root.
8236 break;
8237 default:
8238 return false;
8239 }
8240
8242 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
8243 -1)
8244 return false;
8245
8246 if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
8247 canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
8248 canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
8249 canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
8252 return true;
8253 }
8254
8255 return false;
8256}
8257
8258/// Check if the given instruction forms a gather load pattern that can be
8259/// optimized for better Memory-Level Parallelism (MLP). This function
8260/// identifies chains of NEON lane load instructions that load data from
8261/// different memory addresses into individual lanes of a 128-bit vector
8262/// register, then attempts to split the pattern into parallel loads to break
8263/// the serial dependency between instructions.
8264///
8265/// Pattern Matched:
8266/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
8267/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
8268///
8269/// Transformed Into:
8270/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
8271/// to combine the results, enabling better memory-level parallelism.
8272///
8273/// Supported Element Types:
8274/// - 32-bit elements (LD1i32, 4 lanes total)
8275/// - 16-bit elements (LD1i16, 8 lanes total)
8276/// - 8-bit elements (LD1i8, 16 lanes total)
8278 SmallVectorImpl<unsigned> &Patterns,
8279 unsigned LoadLaneOpCode, unsigned NumLanes) {
8280 const MachineFunction *MF = Root.getMF();
8281
8282 // Early exit if optimizing for size.
8283 if (MF->getFunction().hasMinSize())
8284 return false;
8285
8286 const MachineRegisterInfo &MRI = MF->getRegInfo();
8288
8289 // The root of the pattern must load into the last lane of the vector.
8290 if (Root.getOperand(2).getImm() != NumLanes - 1)
8291 return false;
8292
8293 // Check that we have load into all lanes except lane 0.
8294 // For each load we also want to check that:
8295 // 1. It has a single non-debug use (since we will be replacing the virtual
8296 // register)
8297 // 2. That the addressing mode only uses a single pointer operand
8298 auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
8299 auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
8300 SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
8302 while (!RemainingLanes.empty() && CurrInstr &&
8303 CurrInstr->getOpcode() == LoadLaneOpCode &&
8304 MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
8305 CurrInstr->getNumOperands() == 4) {
8306 RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
8307 LoadInstrs.push_back(CurrInstr);
8308 CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
8309 }
8310
8311 // Check that we have found a match for lanes N-1.. 1.
8312 if (!RemainingLanes.empty())
8313 return false;
8314
8315 // Match the SUBREG_TO_REG sequence.
8316 if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
8317 return false;
8318
8319 // Verify that the subreg to reg loads an integer into the first lane.
8320 auto Lane0LoadReg = CurrInstr->getOperand(1).getReg();
8321 unsigned SingleLaneSizeInBits = 128 / NumLanes;
8322 if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
8323 return false;
8324
8325 // Verify that it also has a single non debug use.
8326 if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
8327 return false;
8328
8329 LoadInstrs.push_back(MRI.getUniqueVRegDef(Lane0LoadReg));
8330
8331 // If there is any chance of aliasing, do not apply the pattern.
8332 // Walk backward through the MBB starting from Root.
8333 // Exit early if we've encountered all load instructions or hit the search
8334 // limit.
8335 auto MBBItr = Root.getIterator();
8336 unsigned RemainingSteps = GatherOptSearchLimit;
8337 SmallPtrSet<const MachineInstr *, 16> RemainingLoadInstrs;
8338 RemainingLoadInstrs.insert(LoadInstrs.begin(), LoadInstrs.end());
8339 const MachineBasicBlock *MBB = Root.getParent();
8340
8341 for (; MBBItr != MBB->begin() && RemainingSteps > 0 &&
8342 !RemainingLoadInstrs.empty();
8343 --MBBItr, --RemainingSteps) {
8344 const MachineInstr &CurrInstr = *MBBItr;
8345
8346 // Remove this instruction from remaining loads if it's one we're tracking.
8347 RemainingLoadInstrs.erase(&CurrInstr);
8348
8349 // Check for potential aliasing with any of the load instructions to
8350 // optimize.
8351 if (CurrInstr.isLoadFoldBarrier())
8352 return false;
8353 }
8354
8355 // If we hit the search limit without finding all load instructions,
8356 // don't match the pattern.
8357 if (RemainingSteps == 0 && !RemainingLoadInstrs.empty())
8358 return false;
8359
8360 switch (NumLanes) {
8361 case 4:
8363 break;
8364 case 8:
8366 break;
8367 case 16:
8369 break;
8370 default:
8371 llvm_unreachable("Got bad number of lanes for gather pattern.");
8372 }
8373
8374 return true;
8375}
8376
8377/// Search for patterns of LD instructions we can optimize.
8379 SmallVectorImpl<unsigned> &Patterns) {
8380
8381 // The pattern searches for loads into single lanes.
8382 switch (Root.getOpcode()) {
8383 case AArch64::LD1i32:
8384 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 4);
8385 case AArch64::LD1i16:
8386 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 8);
8387 case AArch64::LD1i8:
8388 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 16);
8389 default:
8390 return false;
8391 }
8392}
8393
8394/// Generate optimized instruction sequence for gather load patterns to improve
8395/// Memory-Level Parallelism (MLP). This function transforms a chain of
8396/// sequential NEON lane loads into parallel vector loads that can execute
8397/// concurrently.
8398static void
8402 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8403 unsigned Pattern, unsigned NumLanes) {
8404 MachineFunction &MF = *Root.getParent()->getParent();
8405 MachineRegisterInfo &MRI = MF.getRegInfo();
8407
8408 // Gather the initial load instructions to build the pattern.
8409 SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
8410 MachineInstr *CurrInstr = &Root;
8411 for (unsigned i = 0; i < NumLanes - 1; ++i) {
8412 LoadToLaneInstrs.push_back(CurrInstr);
8413 CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
8414 }
8415
8416 // Sort the load instructions according to the lane.
8417 llvm::sort(LoadToLaneInstrs,
8418 [](const MachineInstr *A, const MachineInstr *B) {
8419 return A->getOperand(2).getImm() > B->getOperand(2).getImm();
8420 });
8421
8422 MachineInstr *SubregToReg = CurrInstr;
8423 LoadToLaneInstrs.push_back(
8424 MRI.getUniqueVRegDef(SubregToReg->getOperand(1).getReg()));
8425 auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
8426
8427 const TargetRegisterClass *FPR128RegClass =
8428 MRI.getRegClass(Root.getOperand(0).getReg());
8429
8430 // Helper lambda to create a LD1 instruction.
8431 auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr,
8432 Register SrcRegister, unsigned Lane,
8433 Register OffsetRegister,
8434 bool OffsetRegisterKillState) {
8435 auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
8436 MachineInstrBuilder LoadIndexIntoRegister =
8437 BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
8438 NewRegister)
8439 .addReg(SrcRegister)
8440 .addImm(Lane)
8441 .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState))
8442 .setMemRefs(OriginalInstr->memoperands());
8443 InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
8444 InsInstrs.push_back(LoadIndexIntoRegister);
8445 return NewRegister;
8446 };
8447
8448 // Helper to create load instruction based on the NumLanes in the NEON
8449 // register we are rewriting.
8450 auto CreateLDRInstruction =
8451 [&](unsigned NumLanes, Register DestReg, Register OffsetReg,
8453 unsigned Opcode;
8454 switch (NumLanes) {
8455 case 4:
8456 Opcode = AArch64::LDRSui;
8457 break;
8458 case 8:
8459 Opcode = AArch64::LDRHui;
8460 break;
8461 case 16:
8462 Opcode = AArch64::LDRBui;
8463 break;
8464 default:
8466 "Got unsupported number of lanes in machine-combiner gather pattern");
8467 }
8468 // Immediate offset load
8469 return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
8470 .addReg(OffsetReg)
8471 .addImm(0)
8472 .setMemRefs(MMOs);
8473 };
8474
8475 // Load the remaining lanes into register 0.
8476 auto LanesToLoadToReg0 =
8477 llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
8478 LoadToLaneInstrsAscending.begin() + NumLanes / 2);
8479 Register PrevReg = SubregToReg->getOperand(0).getReg();
8480 for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
8481 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
8482 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8483 OffsetRegOperand.getReg(),
8484 OffsetRegOperand.isKill());
8485 DelInstrs.push_back(LoadInstr);
8486 }
8487 Register LastLoadReg0 = PrevReg;
8488
8489 // First load into register 1. Perform an integer load to zero out the upper
8490 // lanes in a single instruction.
8491 MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin();
8492 MachineInstr *OriginalSplitLoad =
8493 *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
8494 Register DestRegForMiddleIndex = MRI.createVirtualRegister(
8495 MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
8496
8497 const MachineOperand &OriginalSplitToLoadOffsetOperand =
8498 OriginalSplitLoad->getOperand(3);
8499 MachineInstrBuilder MiddleIndexLoadInstr =
8500 CreateLDRInstruction(NumLanes, DestRegForMiddleIndex,
8501 OriginalSplitToLoadOffsetOperand.getReg(),
8502 OriginalSplitLoad->memoperands());
8503
8504 InstrIdxForVirtReg.insert(
8505 std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
8506 InsInstrs.push_back(MiddleIndexLoadInstr);
8507 DelInstrs.push_back(OriginalSplitLoad);
8508
8509 // Subreg To Reg instruction for register 1.
8510 Register DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
8511 unsigned SubregType;
8512 switch (NumLanes) {
8513 case 4:
8514 SubregType = AArch64::ssub;
8515 break;
8516 case 8:
8517 SubregType = AArch64::hsub;
8518 break;
8519 case 16:
8520 SubregType = AArch64::bsub;
8521 break;
8522 default:
8524 "Got invalid NumLanes for machine-combiner gather pattern");
8525 }
8526
8527 auto SubRegToRegInstr =
8528 BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
8529 DestRegForSubregToReg)
8530 .addReg(DestRegForMiddleIndex, getKillRegState(true))
8531 .addImm(SubregType);
8532 InstrIdxForVirtReg.insert(
8533 std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
8534 InsInstrs.push_back(SubRegToRegInstr);
8535
8536 // Load remaining lanes into register 1.
8537 auto LanesToLoadToReg1 =
8538 llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
8539 LoadToLaneInstrsAscending.end());
8540 PrevReg = SubRegToRegInstr->getOperand(0).getReg();
8541 for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
8542 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
8543 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8544 OffsetRegOperand.getReg(),
8545 OffsetRegOperand.isKill());
8546
8547 // Do not add the last reg to DelInstrs - it will be removed later.
8548 if (Index == NumLanes / 2 - 2) {
8549 break;
8550 }
8551 DelInstrs.push_back(LoadInstr);
8552 }
8553 Register LastLoadReg1 = PrevReg;
8554
8555 // Create the final zip instruction to combine the results.
8556 MachineInstrBuilder ZipInstr =
8557 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
8558 Root.getOperand(0).getReg())
8559 .addReg(LastLoadReg0)
8560 .addReg(LastLoadReg1);
8561 InsInstrs.push_back(ZipInstr);
8562}
8563
8577
8578/// Return true when there is potentially a faster code sequence for an
8579/// instruction chain ending in \p Root. All potential patterns are listed in
8580/// the \p Patterns vector. Patterns should be sorted in priority order since the
8581/// pattern evaluator stops checking as soon as it finds a faster sequence.
///
/// The AArch64-specific matchers below are tried in a fixed order and the
/// first one that reports a match ends the search, so later matchers are not
/// consulted for this \p Root. \p DoRegPressureReduce is not inspected here; it
/// is only forwarded to the generic TargetInstrInfo implementation, which is
/// consulted when none of the AArch64-specific matchers reports a match.
8582
8583bool AArch64InstrInfo::getMachineCombinerPatterns(
8584 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
8585 bool DoRegPressureReduce) const {
8586 // Integer patterns
8587 if (getMaddPatterns(Root, Patterns))
8588 return true;
8589 // Floating point patterns (FMUL-by-DUP first, then FMA, then FNEG-of-FMADD)
8590 if (getFMULPatterns(Root, Patterns))
8591 return true;
8592 if (getFMAPatterns(Root, Patterns))
8593 return true;
8594 if (getFNEGPatterns(Root, Patterns))
8595 return true;
8596
8597 // Other patterns: A - (B + C) ==> (A - B) - C or (A - C) - B
8598 if (getMiscPatterns(Root, Patterns))
8599 return true;
8600
8601 // Load patterns: chains of single-lane LD1i32/LD1i16/LD1i8 loads
8602 if (getLoadPatterns(Root, Patterns))
8603 return true;
8604
// No AArch64-specific matcher reported a match: defer to the generic
// implementation, passing DoRegPressureReduce through unchanged.
8605 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
8606 DoRegPressureReduce);
8607}
8608
8610/// genFusedMultiply - Generate fused multiply instructions.
8611/// This function supports both integer and floating point instructions.
8612/// A typical example:
8613/// F|MUL I=A,B,0
8614/// F|ADD R,I,C
8615/// ==> F|MADD R,A,B,C
8616/// \param MF Containing MachineFunction
8617/// \param MRI Register information
8618/// \param TII Target information
8619/// \param Root is the F|ADD instruction
8620/// \param [out] InsInstrs is a vector of machine instructions and will
8621/// contain the generated madd instruction
8622/// \param IdxMulOpd is index of operand in Root that is the result of
8623/// the F|MUL. In the example above IdxMulOpd is 1.
8624/// \param MaddOpc the opcode of the f|madd instruction
8625/// \param RC Register class of operands
8626/// \param kind of fma instruction (addressing mode) to be generated
8627/// \param ReplacedAddend is the result register from the instruction
8628/// replacing the non-combined operand, if any.
8629static MachineInstr *
8631 const TargetInstrInfo *TII, MachineInstr &Root,
8632 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
8633 unsigned MaddOpc, const TargetRegisterClass *RC,
8635 const Register *ReplacedAddend = nullptr) {
8636 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8637
8638 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
8639 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
8640 Register ResultReg = Root.getOperand(0).getReg();
8641 Register SrcReg0 = MUL->getOperand(1).getReg();
8642 bool Src0IsKill = MUL->getOperand(1).isKill();
8643 Register SrcReg1 = MUL->getOperand(2).getReg();
8644 bool Src1IsKill = MUL->getOperand(2).isKill();
8645
8646 Register SrcReg2;
8647 bool Src2IsKill;
8648 if (ReplacedAddend) {
8649 // If we just generated a new addend, we must be its only use.
8650 SrcReg2 = *ReplacedAddend;
8651 Src2IsKill = true;
8652 } else {
8653 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
8654 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
8655 }
8656
8657 if (ResultReg.isVirtual())
8658 MRI.constrainRegClass(ResultReg, RC);
8659 if (SrcReg0.isVirtual())
8660 MRI.constrainRegClass(SrcReg0, RC);
8661 if (SrcReg1.isVirtual())
8662 MRI.constrainRegClass(SrcReg1, RC);
8663 if (SrcReg2.isVirtual())
8664 MRI.constrainRegClass(SrcReg2, RC);
8665
8667 if (kind == FMAInstKind::Default)
8668 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8669 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8670 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8671 .addReg(SrcReg2, getKillRegState(Src2IsKill));
8672 else if (kind == FMAInstKind::Indexed)
8673 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8674 .addReg(SrcReg2, getKillRegState(Src2IsKill))
8675 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8676 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8677 .addImm(MUL->getOperand(3).getImm());
8678 else if (kind == FMAInstKind::Accumulator)
8679 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8680 .addReg(SrcReg2, getKillRegState(Src2IsKill))
8681 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8682 .addReg(SrcReg1, getKillRegState(Src1IsKill));
8683 else
8684 assert(false && "Invalid FMA instruction kind \n");
8685 // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
8686 InsInstrs.push_back(MIB);
8687 return MUL;
8688}
8689
8690static MachineInstr *
8692 const TargetInstrInfo *TII, MachineInstr &Root,
8694 MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
8695
8696 unsigned Opc = 0;
8697 const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
8698 if (AArch64::FPR32RegClass.hasSubClassEq(RC))
8699 Opc = AArch64::FNMADDSrrr;
8700 else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
8701 Opc = AArch64::FNMADDDrrr;
8702 else
8703 return nullptr;
8704
8705 Register ResultReg = Root.getOperand(0).getReg();
8706 Register SrcReg0 = MAD->getOperand(1).getReg();
8707 Register SrcReg1 = MAD->getOperand(2).getReg();
8708 Register SrcReg2 = MAD->getOperand(3).getReg();
8709 bool Src0IsKill = MAD->getOperand(1).isKill();
8710 bool Src1IsKill = MAD->getOperand(2).isKill();
8711 bool Src2IsKill = MAD->getOperand(3).isKill();
8712 if (ResultReg.isVirtual())
8713 MRI.constrainRegClass(ResultReg, RC);
8714 if (SrcReg0.isVirtual())
8715 MRI.constrainRegClass(SrcReg0, RC);
8716 if (SrcReg1.isVirtual())
8717 MRI.constrainRegClass(SrcReg1, RC);
8718 if (SrcReg2.isVirtual())
8719 MRI.constrainRegClass(SrcReg2, RC);
8720
8722 BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
8723 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8724 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8725 .addReg(SrcReg2, getKillRegState(Src2IsKill));
8726 InsInstrs.push_back(MIB);
8727
8728 return MAD;
8729}
8730
8731/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
8732static MachineInstr *
8735 unsigned IdxDupOp, unsigned MulOpc,
8736 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
8737 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
8738 "Invalid index of FMUL operand");
8739
8740 MachineFunction &MF = *Root.getMF();
8742
8743 MachineInstr *Dup =
8744 MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
8745
8746 if (Dup->getOpcode() == TargetOpcode::COPY)
8747 Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());
8748
8749 Register DupSrcReg = Dup->getOperand(1).getReg();
8750 MRI.clearKillFlags(DupSrcReg);
8751 MRI.constrainRegClass(DupSrcReg, RC);
8752
8753 unsigned DupSrcLane = Dup->getOperand(2).getImm();
8754
8755 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
8756 MachineOperand &MulOp = Root.getOperand(IdxMulOp);
8757
8758 Register ResultReg = Root.getOperand(0).getReg();
8759
8761 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
8762 .add(MulOp)
8763 .addReg(DupSrcReg)
8764 .addImm(DupSrcLane);
8765
8766 InsInstrs.push_back(MIB);
8767 return &Root;
8768}
8769
8770/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
8771/// instructions.
8772///
8773/// \see genFusedMultiply
8777 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8778 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8780}
8781
8782/// genNeg - Helper to generate an intermediate negation of the second operand
8783/// of Root
8785 const TargetInstrInfo *TII, MachineInstr &Root,
8787 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8788 unsigned MnegOpc, const TargetRegisterClass *RC) {
8789 Register NewVR = MRI.createVirtualRegister(RC);
8791 BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
8792 .add(Root.getOperand(2));
8793 InsInstrs.push_back(MIB);
8794
8795 assert(InstrIdxForVirtReg.empty());
8796 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8797
8798 return NewVR;
8799}
8800
8801/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
8802/// instructions with an additional negation of the accumulator
8806 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8807 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8808 assert(IdxMulOpd == 1);
8809
8810 Register NewVR =
8811 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8812 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8813 FMAInstKind::Accumulator, &NewVR);
8814}
8815
8816/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
8817/// instructions.
8818///
8819/// \see genFusedMultiply
8823 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8824 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8826}
8827
8828/// genFusedMultiplyIdxNeg - Helper to generate indexed fused multiply
8829/// accumulate instructions with an additional negation of the accumulator
8833 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8834 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8835 assert(IdxMulOpd == 1);
8836
8837 Register NewVR =
8838 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8839
8840 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8841 FMAInstKind::Indexed, &NewVR);
8842}
8843
8844/// genMaddR - Generate madd instruction and combine mul and add using
8845/// an extra virtual register
8846/// Example - an ADD intermediate needs to be stored in a register:
8847/// MUL I=A,B,0
8848/// ADD R,I,Imm
8849/// ==> ORR V, ZR, Imm
8850/// ==> MADD R,A,B,V
8851/// \param MF Containing MachineFunction
8852/// \param MRI Register information
8853/// \param TII Target information
8854/// \param Root is the ADD instruction
8855/// \param [out] InsInstrs is a vector of machine instructions and will
8856/// contain the generated madd instruction
8857/// \param IdxMulOpd is index of operand in Root that is the result of
8858/// the MUL. In the example above IdxMulOpd is 1.
8859/// \param MaddOpc the opcode of the madd instruction
8860/// \param VR is a virtual register that holds the value of an ADD operand
8861/// (V in the example above).
8862/// \param RC Register class of operands
8864 const TargetInstrInfo *TII, MachineInstr &Root,
8866 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
8867 const TargetRegisterClass *RC) {
8868 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8869
8870 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
8871 Register ResultReg = Root.getOperand(0).getReg();
8872 Register SrcReg0 = MUL->getOperand(1).getReg();
8873 bool Src0IsKill = MUL->getOperand(1).isKill();
8874 Register SrcReg1 = MUL->getOperand(2).getReg();
8875 bool Src1IsKill = MUL->getOperand(2).isKill();
8876
8877 if (ResultReg.isVirtual())
8878 MRI.constrainRegClass(ResultReg, RC);
8879 if (SrcReg0.isVirtual())
8880 MRI.constrainRegClass(SrcReg0, RC);
8881 if (SrcReg1.isVirtual())
8882 MRI.constrainRegClass(SrcReg1, RC);
8884 MRI.constrainRegClass(VR, RC);
8885
8887 BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8888 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8889 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8890 .addReg(VR);
8891 // Insert the MADD
8892 InsInstrs.push_back(MIB);
8893 return MUL;
8894}
8895
8896/// Do the following transformation
8897/// A - (B + C) ==> (A - B) - C
8898/// A - (B + C) ==> (A - C) - B
///
/// \param Root the SUB/SUBS instruction; its operand 1 is A and its operand 2
///        is the register defined by the instruction computing B + C.
/// \param IdxOpd1 index (1 or 2) of the operand of the B + C instruction that
///        is subtracted first; the remaining operand is subtracted second.
/// \param InstrIdxForVirtReg receives an entry for the new intermediate
///        virtual register.
///
/// The two new subtract instructions are appended to InsInstrs; the B + C
/// instruction and Root are appended to DelInstrs.
8900 const TargetInstrInfo *TII, MachineInstr &Root,
8903 unsigned IdxOpd1,
8904 DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
8905 assert(IdxOpd1 == 1 || IdxOpd1 == 2);
// The operand of the add that is not selected by IdxOpd1.
8906 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
// The unique definition of Root's second source operand (the B + C value).
8907 MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
8908
8909 Register ResultReg = Root.getOperand(0).getReg();
8910 Register RegA = Root.getOperand(1).getReg();
8911 bool RegAIsKill = Root.getOperand(1).isKill();
8912 Register RegB = AddMI->getOperand(IdxOpd1).getReg();
8913 bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
8914 Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
8915 bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
8916 Register NewVR =
8918
// Both new instructions use the SUB form of Root's opcode: SUBS opcodes are
// mapped to the corresponding SUB opcode, anything else must already be SUB.
8919 unsigned Opcode = Root.getOpcode();
8920 if (Opcode == AArch64::SUBSWrr)
8921 Opcode = AArch64::SUBWrr;
8922 else if (Opcode == AArch64::SUBSXrr)
8923 Opcode = AArch64::SUBXrr;
8924 else
8925 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
8926 "Unexpected instruction opcode.");
8927
// Start from the flags merged across Root and the add, then drop the
// no-signed-wrap / no-unsigned-wrap flags (presumably because the
// reassociated intermediate value may wrap even if the originals did not).
8928 uint32_t Flags = Root.mergeFlagsWith(*AddMI);
8929 Flags &= ~MachineInstr::NoSWrap;
8930 Flags &= ~MachineInstr::NoUWrap;
8931
// NewVR = RegA - RegB
8932 MachineInstrBuilder MIB1 =
8933 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
8934 .addReg(RegA, getKillRegState(RegAIsKill))
8935 .addReg(RegB, getKillRegState(RegBIsKill))
8936 .setMIFlags(Flags);
// ResultReg = NewVR - RegC; NewVR is marked as killed by this instruction.
8937 MachineInstrBuilder MIB2 =
8938 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
8939 .addReg(NewVR, getKillRegState(true))
8940 .addReg(RegC, getKillRegState(RegCIsKill))
8941 .setMIFlags(Flags);
8942
// Associate NewVR with index 0; MIB1, which defines it, is pushed first below.
8943 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8944 InsInstrs.push_back(MIB1);
8945 InsInstrs.push_back(MIB2);
8946 DelInstrs.push_back(AddMI);
8947 DelInstrs.push_back(&Root);
8948}
8949
8950unsigned AArch64InstrInfo::getReduceOpcodeForAccumulator(
8951 unsigned int AccumulatorOpCode) const {
8952 switch (AccumulatorOpCode) {
8953 case AArch64::UABALB_ZZZ_D:
8954 case AArch64::SABALB_ZZZ_D:
8955 case AArch64::UABALT_ZZZ_D:
8956 case AArch64::SABALT_ZZZ_D:
8957 return AArch64::ADD_ZZZ_D;
8958 case AArch64::UABALB_ZZZ_H:
8959 case AArch64::SABALB_ZZZ_H:
8960 case AArch64::UABALT_ZZZ_H:
8961 case AArch64::SABALT_ZZZ_H:
8962 return AArch64::ADD_ZZZ_H;
8963 case AArch64::UABALB_ZZZ_S:
8964 case AArch64::SABALB_ZZZ_S:
8965 case AArch64::UABALT_ZZZ_S:
8966 case AArch64::SABALT_ZZZ_S:
8967 return AArch64::ADD_ZZZ_S;
8968 case AArch64::UABALv16i8_v8i16:
8969 case AArch64::SABALv8i8_v8i16:
8970 case AArch64::SABAv8i16:
8971 case AArch64::UABAv8i16:
8972 return AArch64::ADDv8i16;
8973 case AArch64::SABALv2i32_v2i64:
8974 case AArch64::UABALv2i32_v2i64:
8975 case AArch64::SABALv4i32_v2i64:
8976 return AArch64::ADDv2i64;
8977 case AArch64::UABALv4i16_v4i32:
8978 case AArch64::SABALv4i16_v4i32:
8979 case AArch64::SABALv8i16_v4i32:
8980 case AArch64::SABAv4i32:
8981 case AArch64::UABAv4i32:
8982 return AArch64::ADDv4i32;
8983 case AArch64::UABALv4i32_v2i64:
8984 return AArch64::ADDv2i64;
8985 case AArch64::UABALv8i16_v4i32:
8986 return AArch64::ADDv4i32;
8987 case AArch64::UABALv8i8_v8i16:
8988 case AArch64::SABALv16i8_v8i16:
8989 return AArch64::ADDv8i16;
8990 case AArch64::UABAv16i8:
8991 case AArch64::SABAv16i8:
8992 return AArch64::ADDv16i8;
8993 case AArch64::UABAv4i16:
8994 case AArch64::SABAv4i16:
8995 return AArch64::ADDv4i16;
8996 case AArch64::UABAv2i32:
8997 case AArch64::SABAv2i32:
8998 return AArch64::ADDv2i32;
8999 case AArch64::UABAv8i8:
9000 case AArch64::SABAv8i8:
9001 return AArch64::ADDv8i8;
9002 default:
9003 llvm_unreachable("Unknown accumulator opcode");
9004 }
9005}
9006
9007/// When getMachineCombinerPatterns() finds potential patterns,
9008/// this function generates the instructions that could replace the
9009/// original code sequence.
///
/// \param Root the instruction the pattern is anchored on. It is appended to
///        DelInstrs by every case that reaches the end of this function.
/// \param Pattern selects which replacement sequence to build. Values not
///        handled by this switch are forwarded to
///        TargetInstrInfo::genAlternativeCodeSequence.
/// \param InstrIdxForVirtReg updated by the cases that create a new
///        intermediate virtual register.
///
/// New instructions are appended to InsInstrs and replaced ones to DelInstrs.
9010void AArch64InstrInfo::genAlternativeCodeSequence(
9011 MachineInstr &Root, unsigned Pattern,
9014 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
9015 MachineBasicBlock &MBB = *Root.getParent();
9016 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9017 MachineFunction &MF = *MBB.getParent();
9018 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
9019
// MUL stays null unless a case records the multiply it folded; only a
// non-null MUL is deleted and has its flags merged after the switch.
9020 MachineInstr *MUL = nullptr;
9021 const TargetRegisterClass *RC;
9022 unsigned Opc;
9023 switch (Pattern) {
9024 default:
9025 // Reassociate instructions.
9026 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
9027 DelInstrs, InstrIdxForVirtReg);
9028 return;
9030 // A - (B + C)
9031 // ==> (A - B) - C
9032 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
9033 InstrIdxForVirtReg);
9034 return;
9036 // A - (B + C)
9037 // ==> (A - C) - B
9038 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
9039 InstrIdxForVirtReg);
9040 return;
9043 // MUL I=A,B,0
9044 // ADD R,I,C
9045 // ==> MADD R,A,B,C
9046 // --- Create(MADD);
9048 Opc = AArch64::MADDWrrr;
9049 RC = &AArch64::GPR32RegClass;
9050 } else {
9051 Opc = AArch64::MADDXrrr;
9052 RC = &AArch64::GPR64RegClass;
9053 }
9054 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9055 break;
9058 // MUL I=A,B,0
9059 // ADD R,C,I
9060 // ==> MADD R,A,B,C
9061 // --- Create(MADD);
9063 Opc = AArch64::MADDWrrr;
9064 RC = &AArch64::GPR32RegClass;
9065 } else {
9066 Opc = AArch64::MADDXrrr;
9067 RC = &AArch64::GPR64RegClass;
9068 }
9069 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9070 break;
9075 // MUL I=A,B,0
9076 // ADD/SUB R,I,Imm
9077 // ==> MOV V, Imm/-Imm
9078 // ==> MADD R,A,B,V
9079 // --- Create(MADD);
// NOTE(review): this RC shadows the function-level RC declared before the
// switch.
9080 const TargetRegisterClass *RC;
9081 unsigned BitSize, MovImm;
9084 MovImm = AArch64::MOVi32imm;
// NOTE(review): RC is assigned twice in this branch; this first value is
// overwritten below before it is read.
9085 RC = &AArch64::GPR32spRegClass;
9086 BitSize = 32;
9087 Opc = AArch64::MADDWrrr;
9088 RC = &AArch64::GPR32RegClass;
9089 } else {
9090 MovImm = AArch64::MOVi64imm;
// NOTE(review): same double assignment of RC as in the 32-bit branch.
9091 RC = &AArch64::GPR64spRegClass;
9092 BitSize = 64;
9093 Opc = AArch64::MADDXrrr;
9094 RC = &AArch64::GPR64RegClass;
9095 }
9096 Register NewVR = MRI.createVirtualRegister(RC);
9097 uint64_t Imm = Root.getOperand(2).getImm();
9098
// When operand 3 is an immediate it is applied as a left shift to Imm.
9099 if (Root.getOperand(3).isImm()) {
9100 unsigned Val = Root.getOperand(3).getImm();
9101 Imm = Imm << Val;
9102 }
9103 bool IsSub = Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1 ||
// For the SUB patterns the negated immediate is what gets materialized.
9105 uint64_t UImm = SignExtend64(IsSub ? -Imm : Imm, BitSize);
9106 // Check that the immediate can be composed via a single instruction.
9108 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
// Give up on this pattern otherwise; this return happens before anything is
// pushed to InsInstrs or DelInstrs by this function.
9109 if (Insn.size() != 1)
9110 return;
9111 MachineInstrBuilder MIB1 =
9112 BuildMI(MF, MIMetadata(Root), TII->get(MovImm), NewVR)
9113 .addImm(IsSub ? -Imm : Imm);
9114 InsInstrs.push_back(MIB1);
9115 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9116 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
9117 break;
9118 }
9121 // MUL I=A,B,0
9122 // SUB R,I, C
9123 // ==> SUB V, 0, C
9124 // ==> MADD R,A,B,V // = -C + A*B
9125 // --- Create(MADD);
9126 const TargetRegisterClass *SubRC;
9127 unsigned SubOpc, ZeroReg;
9129 SubOpc = AArch64::SUBWrr;
9130 SubRC = &AArch64::GPR32spRegClass;
9131 ZeroReg = AArch64::WZR;
9132 Opc = AArch64::MADDWrrr;
9133 RC = &AArch64::GPR32RegClass;
9134 } else {
9135 SubOpc = AArch64::SUBXrr;
9136 SubRC = &AArch64::GPR64spRegClass;
9137 ZeroReg = AArch64::XZR;
9138 Opc = AArch64::MADDXrrr;
9139 RC = &AArch64::GPR64RegClass;
9140 }
9141 Register NewVR = MRI.createVirtualRegister(SubRC);
9142 // SUB NewVR, 0, C
9143 MachineInstrBuilder MIB1 =
9144 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
9145 .addReg(ZeroReg)
9146 .add(Root.getOperand(2));
9147 InsInstrs.push_back(MIB1);
9148 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9149 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
9150 break;
9151 }
9154 // MUL I=A,B,0
9155 // SUB R,C,I
9156 // ==> MSUB R,A,B,C (computes C - A*B)
9157 // --- Create(MSUB);
9159 Opc = AArch64::MSUBWrrr;
9160 RC = &AArch64::GPR32RegClass;
9161 } else {
9162 Opc = AArch64::MSUBXrrr;
9163 RC = &AArch64::GPR64RegClass;
9164 }
9165 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9166 break;
9168 Opc = AArch64::MLAv8i8;
9169 RC = &AArch64::FPR64RegClass;
9170 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9171 break;
9173 Opc = AArch64::MLAv8i8;
9174 RC = &AArch64::FPR64RegClass;
9175 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9176 break;
9178 Opc = AArch64::MLAv16i8;
9179 RC = &AArch64::FPR128RegClass;
9180 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9181 break;
9183 Opc = AArch64::MLAv16i8;
9184 RC = &AArch64::FPR128RegClass;
9185 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9186 break;
9188 Opc = AArch64::MLAv4i16;
9189 RC = &AArch64::FPR64RegClass;
9190 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9191 break;
9193 Opc = AArch64::MLAv4i16;
9194 RC = &AArch64::FPR64RegClass;
9195 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9196 break;
9198 Opc = AArch64::MLAv8i16;
9199 RC = &AArch64::FPR128RegClass;
9200 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9201 break;
9203 Opc = AArch64::MLAv8i16;
9204 RC = &AArch64::FPR128RegClass;
9205 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9206 break;
9208 Opc = AArch64::MLAv2i32;
9209 RC = &AArch64::FPR64RegClass;
9210 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9211 break;
9213 Opc = AArch64::MLAv2i32;
9214 RC = &AArch64::FPR64RegClass;
9215 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9216 break;
9218 Opc = AArch64::MLAv4i32;
9219 RC = &AArch64::FPR128RegClass;
9220 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9221 break;
9223 Opc = AArch64::MLAv4i32;
9224 RC = &AArch64::FPR128RegClass;
9225 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9226 break;
9227
9229 Opc = AArch64::MLAv8i8;
9230 RC = &AArch64::FPR64RegClass;
9231 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9232 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
9233 RC);
9234 break;
9236 Opc = AArch64::MLSv8i8;
9237 RC = &AArch64::FPR64RegClass;
9238 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9239 break;
9241 Opc = AArch64::MLAv16i8;
9242 RC = &AArch64::FPR128RegClass;
9243 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9244 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
9245 RC);
9246 break;
9248 Opc = AArch64::MLSv16i8;
9249 RC = &AArch64::FPR128RegClass;
9250 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9251 break;
9253 Opc = AArch64::MLAv4i16;
9254 RC = &AArch64::FPR64RegClass;
9255 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9256 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
9257 RC);
9258 break;
9260 Opc = AArch64::MLSv4i16;
9261 RC = &AArch64::FPR64RegClass;
9262 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9263 break;
9265 Opc = AArch64::MLAv8i16;
9266 RC = &AArch64::FPR128RegClass;
9267 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9268 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
9269 RC);
9270 break;
9272 Opc = AArch64::MLSv8i16;
9273 RC = &AArch64::FPR128RegClass;
9274 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9275 break;
9277 Opc = AArch64::MLAv2i32;
9278 RC = &AArch64::FPR64RegClass;
9279 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9280 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
9281 RC);
9282 break;
9284 Opc = AArch64::MLSv2i32;
9285 RC = &AArch64::FPR64RegClass;
9286 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9287 break;
9289 Opc = AArch64::MLAv4i32;
9290 RC = &AArch64::FPR128RegClass;
9291 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9292 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
9293 RC);
9294 break;
9296 Opc = AArch64::MLSv4i32;
9297 RC = &AArch64::FPR128RegClass;
9298 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9299 break;
9300
9302 Opc = AArch64::MLAv4i16_indexed;
9303 RC = &AArch64::FPR64RegClass;
9304 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9305 break;
9307 Opc = AArch64::MLAv4i16_indexed;
9308 RC = &AArch64::FPR64RegClass;
9309 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9310 break;
9312 Opc = AArch64::MLAv8i16_indexed;
9313 RC = &AArch64::FPR128RegClass;
9314 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9315 break;
9317 Opc = AArch64::MLAv8i16_indexed;
9318 RC = &AArch64::FPR128RegClass;
9319 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9320 break;
9322 Opc = AArch64::MLAv2i32_indexed;
9323 RC = &AArch64::FPR64RegClass;
9324 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9325 break;
9327 Opc = AArch64::MLAv2i32_indexed;
9328 RC = &AArch64::FPR64RegClass;
9329 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9330 break;
9332 Opc = AArch64::MLAv4i32_indexed;
9333 RC = &AArch64::FPR128RegClass;
9334 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9335 break;
9337 Opc = AArch64::MLAv4i32_indexed;
9338 RC = &AArch64::FPR128RegClass;
9339 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9340 break;
9341
9343 Opc = AArch64::MLAv4i16_indexed;
9344 RC = &AArch64::FPR64RegClass;
9345 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9346 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
9347 RC);
9348 break;
9350 Opc = AArch64::MLSv4i16_indexed;
9351 RC = &AArch64::FPR64RegClass;
9352 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9353 break;
9355 Opc = AArch64::MLAv8i16_indexed;
9356 RC = &AArch64::FPR128RegClass;
9357 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9358 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
9359 RC);
9360 break;
9362 Opc = AArch64::MLSv8i16_indexed;
9363 RC = &AArch64::FPR128RegClass;
9364 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9365 break;
9367 Opc = AArch64::MLAv2i32_indexed;
9368 RC = &AArch64::FPR64RegClass;
9369 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9370 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
9371 RC);
9372 break;
9374 Opc = AArch64::MLSv2i32_indexed;
9375 RC = &AArch64::FPR64RegClass;
9376 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9377 break;
9379 Opc = AArch64::MLAv4i32_indexed;
9380 RC = &AArch64::FPR128RegClass;
9381 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9382 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
9383 RC);
9384 break;
9386 Opc = AArch64::MLSv4i32_indexed;
9387 RC = &AArch64::FPR128RegClass;
9388 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9389 break;
9390
9391 // Floating Point Support
9393 Opc = AArch64::FMADDHrrr;
9394 RC = &AArch64::FPR16RegClass;
9395 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9396 break;
9398 Opc = AArch64::FMADDSrrr;
9399 RC = &AArch64::FPR32RegClass;
9400 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9401 break;
9403 Opc = AArch64::FMADDDrrr;
9404 RC = &AArch64::FPR64RegClass;
9405 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9406 break;
9407
9409 Opc = AArch64::FMADDHrrr;
9410 RC = &AArch64::FPR16RegClass;
9411 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9412 break;
9414 Opc = AArch64::FMADDSrrr;
9415 RC = &AArch64::FPR32RegClass;
9416 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9417 break;
9419 Opc = AArch64::FMADDDrrr;
9420 RC = &AArch64::FPR64RegClass;
9421 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9422 break;
9423
9425 Opc = AArch64::FMLAv1i32_indexed;
9426 RC = &AArch64::FPR32RegClass;
9427 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9429 break;
9431 Opc = AArch64::FMLAv1i32_indexed;
9432 RC = &AArch64::FPR32RegClass;
9433 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9435 break;
9436
9438 Opc = AArch64::FMLAv1i64_indexed;
9439 RC = &AArch64::FPR64RegClass;
9440 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9442 break;
9444 Opc = AArch64::FMLAv1i64_indexed;
9445 RC = &AArch64::FPR64RegClass;
9446 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9448 break;
9449
9451 RC = &AArch64::FPR64RegClass;
9452 Opc = AArch64::FMLAv4i16_indexed;
9453 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9455 break;
9457 RC = &AArch64::FPR64RegClass;
9458 Opc = AArch64::FMLAv4f16;
9459 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9461 break;
9463 RC = &AArch64::FPR64RegClass;
9464 Opc = AArch64::FMLAv4i16_indexed;
9465 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9467 break;
9469 RC = &AArch64::FPR64RegClass;
9470 Opc = AArch64::FMLAv4f16;
9471 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9473 break;
9474
9477 RC = &AArch64::FPR64RegClass;
9479 Opc = AArch64::FMLAv2i32_indexed;
9480 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9482 } else {
9483 Opc = AArch64::FMLAv2f32;
9484 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9486 }
9487 break;
9490 RC = &AArch64::FPR64RegClass;
9492 Opc = AArch64::FMLAv2i32_indexed;
9493 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9495 } else {
9496 Opc = AArch64::FMLAv2f32;
9497 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9499 }
9500 break;
9501
9503 RC = &AArch64::FPR128RegClass;
9504 Opc = AArch64::FMLAv8i16_indexed;
9505 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9507 break;
9509 RC = &AArch64::FPR128RegClass;
9510 Opc = AArch64::FMLAv8f16;
9511 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9513 break;
9515 RC = &AArch64::FPR128RegClass;
9516 Opc = AArch64::FMLAv8i16_indexed;
9517 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9519 break;
9521 RC = &AArch64::FPR128RegClass;
9522 Opc = AArch64::FMLAv8f16;
9523 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9525 break;
9526
9529 RC = &AArch64::FPR128RegClass;
9531 Opc = AArch64::FMLAv2i64_indexed;
9532 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9534 } else {
9535 Opc = AArch64::FMLAv2f64;
9536 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9538 }
9539 break;
9542 RC = &AArch64::FPR128RegClass;
9544 Opc = AArch64::FMLAv2i64_indexed;
9545 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9547 } else {
9548 Opc = AArch64::FMLAv2f64;
9549 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9551 }
9552 break;
9553
9556 RC = &AArch64::FPR128RegClass;
9558 Opc = AArch64::FMLAv4i32_indexed;
9559 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9561 } else {
9562 Opc = AArch64::FMLAv4f32;
9563 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9565 }
9566 break;
9567
9570 RC = &AArch64::FPR128RegClass;
9572 Opc = AArch64::FMLAv4i32_indexed;
9573 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9575 } else {
9576 Opc = AArch64::FMLAv4f32;
9577 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9579 }
9580 break;
9581
9583 Opc = AArch64::FNMSUBHrrr;
9584 RC = &AArch64::FPR16RegClass;
9585 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9586 break;
9588 Opc = AArch64::FNMSUBSrrr;
9589 RC = &AArch64::FPR32RegClass;
9590 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9591 break;
9593 Opc = AArch64::FNMSUBDrrr;
9594 RC = &AArch64::FPR64RegClass;
9595 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9596 break;
9597
9599 Opc = AArch64::FNMADDHrrr;
9600 RC = &AArch64::FPR16RegClass;
9601 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9602 break;
9604 Opc = AArch64::FNMADDSrrr;
9605 RC = &AArch64::FPR32RegClass;
9606 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9607 break;
9609 Opc = AArch64::FNMADDDrrr;
9610 RC = &AArch64::FPR64RegClass;
9611 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9612 break;
9613
9615 Opc = AArch64::FMSUBHrrr;
9616 RC = &AArch64::FPR16RegClass;
9617 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9618 break;
9620 Opc = AArch64::FMSUBSrrr;
9621 RC = &AArch64::FPR32RegClass;
9622 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9623 break;
9625 Opc = AArch64::FMSUBDrrr;
9626 RC = &AArch64::FPR64RegClass;
9627 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9628 break;
9629
9631 Opc = AArch64::FMLSv1i32_indexed;
9632 RC = &AArch64::FPR32RegClass;
9633 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9635 break;
9636
9638 Opc = AArch64::FMLSv1i64_indexed;
9639 RC = &AArch64::FPR64RegClass;
9640 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9642 break;
9643
9646 RC = &AArch64::FPR64RegClass;
// Negate Root's operand 2 into NewVR, then hand NewVR to the FMLA builder.
9647 Register NewVR = MRI.createVirtualRegister(RC);
9648 MachineInstrBuilder MIB1 =
9649 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
9650 .add(Root.getOperand(2));
9651 InsInstrs.push_back(MIB1);
9652 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9654 Opc = AArch64::FMLAv4f16;
9655 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9656 FMAInstKind::Accumulator, &NewVR);
9657 } else {
9658 Opc = AArch64::FMLAv4i16_indexed;
9659 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9660 FMAInstKind::Indexed, &NewVR);
9661 }
9662 break;
9663 }
9665 RC = &AArch64::FPR64RegClass;
9666 Opc = AArch64::FMLSv4f16;
9667 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9669 break;
9671 RC = &AArch64::FPR64RegClass;
9672 Opc = AArch64::FMLSv4i16_indexed;
9673 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9675 break;
9676
9679 RC = &AArch64::FPR64RegClass;
9681 Opc = AArch64::FMLSv2i32_indexed;
9682 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9684 } else {
9685 Opc = AArch64::FMLSv2f32;
9686 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9688 }
9689 break;
9690
9693 RC = &AArch64::FPR128RegClass;
9694 Register NewVR = MRI.createVirtualRegister(RC);
9695 MachineInstrBuilder MIB1 =
9696 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
9697 .add(Root.getOperand(2));
9698 InsInstrs.push_back(MIB1);
9699 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9701 Opc = AArch64::FMLAv8f16;
9702 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9703 FMAInstKind::Accumulator, &NewVR);
9704 } else {
9705 Opc = AArch64::FMLAv8i16_indexed;
9706 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9707 FMAInstKind::Indexed, &NewVR);
9708 }
9709 break;
9710 }
9712 RC = &AArch64::FPR128RegClass;
9713 Opc = AArch64::FMLSv8f16;
9714 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9716 break;
9718 RC = &AArch64::FPR128RegClass;
9719 Opc = AArch64::FMLSv8i16_indexed;
9720 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9722 break;
9723
9726 RC = &AArch64::FPR128RegClass;
9728 Opc = AArch64::FMLSv2i64_indexed;
9729 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9731 } else {
9732 Opc = AArch64::FMLSv2f64;
9733 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9735 }
9736 break;
9737
9740 RC = &AArch64::FPR128RegClass;
9742 Opc = AArch64::FMLSv4i32_indexed;
9743 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9745 } else {
9746 Opc = AArch64::FMLSv4f32;
9747 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9749 }
9750 break;
9753 RC = &AArch64::FPR64RegClass;
9754 Register NewVR = MRI.createVirtualRegister(RC);
9755 MachineInstrBuilder MIB1 =
9756 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
9757 .add(Root.getOperand(2));
9758 InsInstrs.push_back(MIB1);
9759 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9761 Opc = AArch64::FMLAv2i32_indexed;
9762 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9763 FMAInstKind::Indexed, &NewVR);
9764 } else {
9765 Opc = AArch64::FMLAv2f32;
9766 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9767 FMAInstKind::Accumulator, &NewVR);
9768 }
9769 break;
9770 }
9773 RC = &AArch64::FPR128RegClass;
9774 Register NewVR = MRI.createVirtualRegister(RC);
9775 MachineInstrBuilder MIB1 =
9776 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
9777 .add(Root.getOperand(2));
9778 InsInstrs.push_back(MIB1);
9779 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9781 Opc = AArch64::FMLAv4i32_indexed;
9782 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9783 FMAInstKind::Indexed, &NewVR);
9784 } else {
9785 Opc = AArch64::FMLAv4f32;
9786 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9787 FMAInstKind::Accumulator, &NewVR);
9788 }
9789 break;
9790 }
9793 RC = &AArch64::FPR128RegClass;
9794 Register NewVR = MRI.createVirtualRegister(RC);
9795 MachineInstrBuilder MIB1 =
9796 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
9797 .add(Root.getOperand(2));
9798 InsInstrs.push_back(MIB1);
9799 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9801 Opc = AArch64::FMLAv2i64_indexed;
9802 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9803 FMAInstKind::Indexed, &NewVR);
9804 } else {
9805 Opc = AArch64::FMLAv2f64;
9806 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9807 FMAInstKind::Accumulator, &NewVR);
9808 }
9809 break;
9810 }
9813 unsigned IdxDupOp =
9815 : 2;
9816 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
9817 &AArch64::FPR128RegClass, MRI);
9818 break;
9819 }
9822 unsigned IdxDupOp =
9824 : 2;
9825 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
9826 &AArch64::FPR128RegClass, MRI);
9827 break;
9828 }
9831 unsigned IdxDupOp =
9833 : 2;
9834 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
9835 &AArch64::FPR128_loRegClass, MRI);
9836 break;
9837 }
9840 unsigned IdxDupOp =
9842 : 2;
9843 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
9844 &AArch64::FPR128RegClass, MRI);
9845 break;
9846 }
9849 unsigned IdxDupOp =
9851 : 2;
9852 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
9853 &AArch64::FPR128_loRegClass, MRI);
9854 break;
9855 }
9857 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
9858 break;
9859 }
9861 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9862 Pattern, 4);
9863 break;
9864 }
9866 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9867 Pattern, 8);
9868 break;
9869 }
9871 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9872 Pattern, 16);
9873 break;
9874 }
9875
9876 } // end switch (Pattern)
9877 // Record MUL and ADD/SUB for deletion
9878 if (MUL)
9879 DelInstrs.push_back(MUL);
9880 DelInstrs.push_back(&Root);
9881
9882 // Set the flags on the inserted instructions to be the merged flags of the
9883 // instructions that we have combined.
9884 uint32_t Flags = Root.getFlags();
9885 if (MUL)
9886 Flags = Root.mergeFlagsWith(*MUL);
9887 for (auto *MI : InsInstrs)
9888 MI->setFlags(Flags);
9889}
9890
9891/// Replace csinc-branch sequence by simple conditional branch
9892///
9893/// Examples:
9894/// 1. \code
9895/// csinc w9, wzr, wzr, <condition code>
9896/// tbnz w9, #0, 0x44
9897/// \endcode
9898/// to
9899/// \code
9900/// b.<inverted condition code>
9901/// \endcode
9902///
9903/// 2. \code
9904/// csinc w9, wzr, wzr, <condition code>
9905/// tbz w9, #0, 0x44
9906/// \endcode
9907/// to
9908/// \code
9909/// b.<condition code>
9910/// \endcode
9911///
9912/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
9913/// compare's constant operand is power of 2.
9914///
9915/// Examples:
9916/// \code
9917/// and w8, w8, #0x400
9918/// cbnz w8, L1
9919/// \endcode
9920/// to
9921/// \code
9922/// tbnz w8, #10, L1
9923/// \endcode
9924///
9925/// \param MI Conditional Branch
9926/// \return True when MI has been replaced by the new branch and erased
9927///
9929 bool IsNegativeBranch = false;
9930 bool IsTestAndBranch = false;
// Operand index of the branch target basic block within MI.
9931 unsigned TargetBBInMI = 0;
9932 switch (MI.getOpcode()) {
9933 default:
9934 llvm_unreachable("Unknown branch instruction?");
// These branch forms are left untouched.
9935 case AArch64::Bcc:
9936 case AArch64::CBWPri:
9937 case AArch64::CBXPri:
9938 case AArch64::CBBAssertExt:
9939 case AArch64::CBHAssertExt:
9940 case AArch64::CBWPrr:
9941 case AArch64::CBXPrr:
9942 return false;
9943 case AArch64::CBZW:
9944 case AArch64::CBZX:
9945 TargetBBInMI = 1;
9946 break;
9947 case AArch64::CBNZW:
9948 case AArch64::CBNZX:
9949 TargetBBInMI = 1;
9950 IsNegativeBranch = true;
9951 break;
9952 case AArch64::TBZW:
9953 case AArch64::TBZX:
9954 TargetBBInMI = 2;
9955 IsTestAndBranch = true;
9956 break;
9957 case AArch64::TBNZW:
9958 case AArch64::TBNZX:
9959 TargetBBInMI = 2;
9960 IsNegativeBranch = true;
9961 IsTestAndBranch = true;
9962 break;
9963 }
9964 // So we increment a zero register and test for bits other
9965 // than bit 0? Conservatively bail out in case the verifier
9966 // missed this case.
9967 if (IsTestAndBranch && MI.getOperand(1).getImm())
9968 return false;
9969
9970 // Find Definition.
9971 assert(MI.getParent() && "Incomplete machine instruction\n");
9972 MachineBasicBlock *MBB = MI.getParent();
9973 MachineFunction *MF = MBB->getParent();
9974 MachineRegisterInfo *MRI = &MF->getRegInfo();
9975 Register VReg = MI.getOperand(0).getReg();
9976 if (!VReg.isVirtual())
9977 return false;
9978
9979 MachineInstr *DefMI = MRI->getVRegDef(VReg);
9980
9981 // Look through COPY instructions to find definition.
// Give up unless each copied register has exactly one non-debug use and a
// single definition.
9982 while (DefMI->isCopy()) {
9983 Register CopyVReg = DefMI->getOperand(1).getReg();
9984 if (!MRI->hasOneNonDBGUse(CopyVReg))
9985 return false;
9986 if (!MRI->hasOneDef(CopyVReg))
9987 return false;
9988 DefMI = MRI->getVRegDef(CopyVReg);
9989 }
9990
9991 switch (DefMI->getOpcode()) {
9992 default:
9993 return false;
9994 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
9995 case AArch64::ANDWri:
9996 case AArch64::ANDXri: {
// Only CBZ/CBNZ branches are folded here, and only when the AND lives in the
// same block and its result has no other non-debug use.
9997 if (IsTestAndBranch)
9998 return false;
9999 if (DefMI->getParent() != MBB)
10000 return false;
10001 if (!MRI->hasOneNonDBGUse(VReg))
10002 return false;
10003
10004 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
10006 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
10007 if (!isPowerOf2_64(Mask))
10008 return false;
10009
10010 MachineOperand &MO = DefMI->getOperand(1);
10011 Register NewReg = MO.getReg();
10012 if (!NewReg.isVirtual())
10013 return false;
10014
10015 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
10016
10017 MachineBasicBlock &RefToMBB = *MBB;
10018 MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
10019 DebugLoc DL = MI.getDebugLoc();
// The tested bit is the position of the single set bit in Mask.
10020 unsigned Imm = Log2_64(Mask);
10021 unsigned Opc = (Imm < 32)
10022 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
10023 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
10024 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
10025 .addReg(NewReg)
10026 .addImm(Imm)
10027 .addMBB(TBB);
10028 // Register lives on to the CBZ now.
10029 MO.setIsKill(false);
10030
10031 // For immediates smaller than 32, we need to use the 32-bit
10032 // variant (W) in all cases, because the 64-bit variant cannot
10033 // encode them.
10034 // Therefore, if the input register is 64-bit, we need to take the
10035 // 32-bit sub-part.
10036 if (!Is32Bit && Imm < 32)
10037 NewMI->getOperand(0).setSubReg(AArch64::sub_32);
10038 MI.eraseFromParent();
10039 return true;
10040 }
10041 // Look for CSINC
10042 case AArch64::CSINCWr:
10043 case AArch64::CSINCXr: {
// Both source operands must be the zero register (WZR/WZR or XZR/XZR).
10044 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
10045 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
10046 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
10047 DefMI->getOperand(2).getReg() == AArch64::XZR))
10048 return false;
10049
// Bail out when findRegisterDefOperandIdx finds an NZCV def operand on the
// CSINC.
10050 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
10051 true) != -1)
10052 return false;
10053
10054 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
10055 // Convert only when the condition code is not modified between
10056 // the CSINC and the branch. The CC may be used by other
10057 // instructions in between.
10059 return false;
10060 MachineBasicBlock &RefToMBB = *MBB;
10061 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
10062 DebugLoc DL = MI.getDebugLoc();
10063 if (IsNegativeBranch)
// Emit the conditional branch on CC in place of MI, then remove MI.
10065 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
10066 MI.eraseFromParent();
10067 return true;
10068 }
10069 }
10070}
10071
10072std::pair<unsigned, unsigned>
10073AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
10074 const unsigned Mask = AArch64II::MO_FRAGMENT;
10075 return std::make_pair(TF & Mask, TF & ~Mask);
10076}
10077
10079AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
// Returns a view over a function-local static table that pairs each of the
// AArch64II flags listed below with a textual name. The table is built once
// and the returned ArrayRef refers to that static storage.
// NOTE(review): presumably these names are what the flags are serialized as
// (e.g. in MIR) - confirm against the callers of this hook.
10080 using namespace AArch64II;
10081
10082 static const std::pair<unsigned, const char *> TargetFlags[] = {
10083 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
10084 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
10085 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
10086 {MO_HI12, "aarch64-hi12"}};
10087 return ArrayRef(TargetFlags);
10088}
10089
10091AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
// Returns a view over a function-local static table that pairs each of the
// AArch64II flags listed below with a textual name. The returned ArrayRef
// refers to that static storage.
// NOTE(review): the function name suggests these are the flags that can be
// OR-ed together, as opposed to the "direct" ones - confirm against
// decomposeMachineOperandsTargetFlags and the callers of this hook.
10092 using namespace AArch64II;
10093
10094 static const std::pair<unsigned, const char *> TargetFlags[] = {
10095 {MO_COFFSTUB, "aarch64-coffstub"},
10096 {MO_GOT, "aarch64-got"},
10097 {MO_NC, "aarch64-nc"},
10098 {MO_S, "aarch64-s"},
10099 {MO_TLS, "aarch64-tls"},
10100 {MO_DLLIMPORT, "aarch64-dllimport"},
10101 {MO_PREL, "aarch64-prel"},
10102 {MO_TAGGED, "aarch64-tagged"},
10103 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
10104 };
10105 return ArrayRef(TargetFlags);
10106}
10107
10109AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
// Returns a view over a function-local static table that pairs the two
// target-specific MachineMemOperand flags below (MOSuppressPair and
// MOStridedAccess) with a textual name. The returned ArrayRef refers to that
// static storage.
10110 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
10111 {{MOSuppressPair, "aarch64-suppress-pair"},
10112 {MOStridedAccess, "aarch64-strided-access"}};
10113 return ArrayRef(TargetFlags);
10114}
10115
10116/// Constants defining how certain sequences should be outlined.
10117/// This encompasses how an outlined function should be called, and what kind of
10118/// frame should be emitted for that outlined function.
10119///
10120/// \p MachineOutlinerDefault implies that the function should be called with
10121/// a save and restore of LR to the stack.
10122///
10123/// That is,
10124///
10125/// I1 Save LR OUTLINED_FUNCTION:
10126/// I2 --> BL OUTLINED_FUNCTION I1
10127/// I3 Restore LR I2
10128/// I3
10129/// RET
10130///
10131/// * Call construction overhead: 3 (save + BL + restore)
10132/// * Frame construction overhead: 1 (ret)
10133/// * Requires stack fixups? Yes
10134///
10135/// \p MachineOutlinerTailCall implies that the function is being created from
10136/// a sequence of instructions ending in a return.
10137///
10138/// That is,
10139///
10140/// I1 OUTLINED_FUNCTION:
10141/// I2 --> B OUTLINED_FUNCTION I1
10142/// RET I2
10143/// RET
10144///
10145/// * Call construction overhead: 1 (B)
10146/// * Frame construction overhead: 0 (Return included in sequence)
10147/// * Requires stack fixups? No
10148///
10149/// \p MachineOutlinerNoLRSave implies that the function should be called using
10150/// a BL instruction, but doesn't require LR to be saved and restored. This
10151/// happens when LR is known to be dead.
10152///
10153/// That is,
10154///
10155/// I1 OUTLINED_FUNCTION:
10156/// I2 --> BL OUTLINED_FUNCTION I1
10157/// I3 I2
10158/// I3
10159/// RET
10160///
10161/// * Call construction overhead: 1 (BL)
10162/// * Frame construction overhead: 1 (RET)
10163/// * Requires stack fixups? No
10164///
10165/// \p MachineOutlinerThunk implies that the function is being created from
10166/// a sequence of instructions ending in a call. The outlined function is
10167/// called with a BL instruction, and the outlined function tail-calls the
10168/// original call destination.
10169///
10170/// That is,
10171///
10172/// I1 OUTLINED_FUNCTION:
10173/// I2 --> BL OUTLINED_FUNCTION I1
10174/// BL f I2
10175/// B f
10176/// * Call construction overhead: 1 (BL)
10177/// * Frame construction overhead: 0
10178/// * Requires stack fixups? No
10179///
10180/// \p MachineOutlinerRegSave implies that the function should be called with a
10181/// save and restore of LR to an available register. This allows us to avoid
10182/// stack fixups. Note that this outlining variant is compatible with the
10183/// NoLRSave case.
10184///
10185/// That is,
10186///
10187/// I1 Save LR OUTLINED_FUNCTION:
10188/// I2 --> BL OUTLINED_FUNCTION I1
10189/// I3 Restore LR I2
10190/// I3
10191/// RET
10192///
10193/// * Call construction overhead: 3 (save + BL + restore)
10194/// * Frame construction overhead: 1 (ret)
10195/// * Requires stack fixups? No
10197 MachineOutlinerDefault, ///< Emit a save, restore, call, and return.
10198 MachineOutlinerTailCall, ///< Only emit a branch.
10199 MachineOutlinerNoLRSave, ///< Emit a call and return.
10200 MachineOutlinerThunk, ///< Emit a call and tail-call.
10201 MachineOutlinerRegSave ///< Same as default, but save to a register.
10202};
10203
10209
10211AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
// Scans the GPR64 register class, in class order, for a register that can
// hold LR around the outlined call for candidate C. The first register that
// passes every filter below is returned; if none does, a default-constructed
// Register() is returned, which callers in this file test as "false".
10212 MachineFunction *MF = C.getMF();
10213 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
10214 const AArch64RegisterInfo *ARI =
10215 static_cast<const AArch64RegisterInfo *>(&TRI);
10216 // Check if there is an available register across the sequence that we can
10217 // use.
10218 for (unsigned Reg : AArch64::GPR64RegClass) {
// A register qualifies only if it is not reserved for this function, is not
// LR/X16/X17, and is available both inside the candidate sequence and
// across/out of it.
10219 if (!ARI->isReservedReg(*MF, Reg) &&
10220 Reg != AArch64::LR && // LR is not reserved, but don't use it.
10221 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
10222 Reg != AArch64::X17 && // Ditto for X17.
10223 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
10224 C.isAvailableInsideSeq(Reg, TRI))
10225 return Reg;
10226 }
10227 return Register();
10228}
10229
/// Compares two outlining candidates by the sign-return-address condition
/// recorded in the AArch64FunctionInfo of their parent functions.
/// NOTE(review): the right-hand side of the comparison is not visible in this
/// listing; presumably it is the same query on \p b's function info, so that
/// the result is true when both candidates agree - confirm.
10230static bool
10232 const outliner::Candidate &b) {
10233 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
10234 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
10235
10236 return MFIa->getSignReturnAddressCondition() ==
10238}
10239
/// Returns true when the parent functions of both candidates give the same
/// answer for shouldSignWithBKey(), i.e. they agree on the signing key.
10240static bool
10242 const outliner::Candidate &b) {
10243 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
10244 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
10245
10246 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
10247}
10248
10250 const outliner::Candidate &b) {
// Returns true when both subtargets give the same answer for hasV8_3aOps(),
// i.e. either both support the v8.3a operations or neither does.
// NOTE(review): the initializer of SubtargetA is not visible in this listing;
// presumably it mirrors SubtargetB's, using the first candidate - confirm.
10251 const AArch64Subtarget &SubtargetA =
10253 const AArch64Subtarget &SubtargetB =
10254 b.getMF()->getSubtarget<AArch64Subtarget>();
10255 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
10256}
10257
/// Decide whether, and with which call/frame variant, a repeated instruction
/// sequence can be outlined.
///
/// The function rejects sequences that would split a GOT-relative ADRP pair,
/// whose candidates disagree on return-address signing or v8.3a support, whose
/// CFI instruction count does not match the parent function, or for which too
/// few candidates remain after pruning. For the survivors it chooses a frame
/// kind (tail call, thunk, no-LR-save or default), records per-candidate call
/// information, and accumulates the frame-construction byte overhead.
///
/// \param MMI not referenced in the lines visible here.
/// \param RepeatedSequenceLocs the candidate occurrences; candidates may be
///        erased from this vector and have their call info set.
/// \param MinRepeats minimum number of candidates that must remain.
/// \returns std::nullopt when outlining is rejected, otherwise an
///          OutlinedFunction built from the remaining candidates, the
///          sequence size, the frame overhead and the chosen frame ID.
10258std::optional<std::unique_ptr<outliner::OutlinedFunction>>
10259AArch64InstrInfo::getOutliningCandidateInfo(
10260 const MachineModuleInfo &MMI,
10261 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
10262 unsigned MinRepeats) const {
// The sequence size in bytes is measured on the first candidate only.
10263 unsigned SequenceSize = 0;
10264 for (auto &MI : RepeatedSequenceLocs[0])
10265 SequenceSize += getInstSizeInBytes(MI);
10266
10267 unsigned NumBytesToCreateFrame = 0;
10268
10269 // Avoid splitting ADRP ADD/LDR pair into outlined functions.
10270 // These instructions are fused together by the scheduler.
10271 // Any candidate where ADRP is the last instruction should be rejected
10272 // as that will lead to splitting ADRP pair.
// Note: as written, the check only fires when operand 1 of the ADRP carries
// both the MO_PAGE and the MO_GOT target flags.
10273 MachineInstr &LastMI = RepeatedSequenceLocs[0].back();
10274 MachineInstr &FirstMI = RepeatedSequenceLocs[0].front();
10275 if (LastMI.getOpcode() == AArch64::ADRP &&
10276 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_PAGE) != 0 &&
10277 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10278 return std::nullopt;
10279 }
10280
10281 // Similarly any candidate where the first instruction is ADD/LDR with a
10282 // page offset should be rejected to avoid ADRP splitting.
// Note: as written, the check only fires when operand 2 carries both the
// MO_PAGEOFF and the MO_GOT target flags.
10283 if ((FirstMI.getOpcode() == AArch64::ADDXri ||
10284 FirstMI.getOpcode() == AArch64::LDRXui) &&
10285 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_PAGEOFF) != 0 &&
10286 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10287 return std::nullopt;
10288 }
10289
10290 // We only allow outlining for functions having exactly matching return
10291 // address signing attributes, i.e., all share the same value for the
10292 // attribute "sign-return-address" and all share the same type of key they
10293 // are signed with.
10294 // Additionally we require all functions to simultaneously either support
10295 // v8.3a features or not. Otherwise an outlined function could get signed
10296 // using dedicated v8.3 instructions and a call from a function that doesn't
10297 // support v8.3 instructions would therefore be invalid.
10298 if (std::adjacent_find(
10299 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
10300 [](const outliner::Candidate &a, const outliner::Candidate &b) {
10301 // Return true if a and b are non-equal w.r.t. return address
10302 // signing or support of v8.3a features
10303 if (outliningCandidatesSigningScopeConsensus(a, b) &&
10304 outliningCandidatesSigningKeyConsensus(a, b) &&
10305 outliningCandidatesV8_3OpsConsensus(a, b)) {
10306 return false;
10307 }
10308 return true;
10309 }) != RepeatedSequenceLocs.end()) {
10310 return std::nullopt;
10311 }
10312
10313 // Since at this point all candidates agree on their return address signing
10314 // picking just one is fine. If the candidate functions potentially sign their
10315 // return addresses, the outlined function should do the same. Note that in
10316 // the case of "sign-return-address"="non-leaf" this is an assumption: It is
10317 // not certainly true that the outlined function will have to sign its return
10318 // address but this decision is made later, when the decision to outline
10319 // has already been made.
10320 // The same holds for the number of additional instructions we need: On
10321 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
10322 // necessary. However, at this point we don't know if the outlined function
10323 // will have a RET instruction so we assume the worst.
10324 const TargetRegisterInfo &TRI = getRegisterInfo();
10325 // Performing a tail call may require extra checks when PAuth is enabled.
10326 // If PAuth is disabled, set it to zero for uniformity.
10327 unsigned NumBytesToCheckLRInTCEpilogue = 0;
10328 const auto RASignCondition = RepeatedSequenceLocs[0]
10329 .getMF()
10330 ->getInfo<AArch64FunctionInfo>()
10331 ->getSignReturnAddressCondition();
10332 if (RASignCondition != SignReturnAddress::None) {
10333 // One PAC and one AUT instructions
10334 NumBytesToCreateFrame += 8;
10335
10336 // PAuth is enabled - set extra tail call cost, if any.
10337 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
10338 *RepeatedSequenceLocs[0].getMF());
10339 NumBytesToCheckLRInTCEpilogue =
10341 // Checking the authenticated LR value may significantly impact
10342 // SequenceSize, so account for it for more precise results.
10343 if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
10344 SequenceSize += NumBytesToCheckLRInTCEpilogue;
10345
10346 // We have to check if sp modifying instructions would get outlined.
10347 // If so we only allow outlining if sp is unchanged overall, so matching
10348 // sub and add instructions are okay to outline, all other sp modifications
10349 // are not
10350 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
// SPValue accumulates the net immediate added to / subtracted from SP by the
// candidate; a non-zero total at the end makes the candidate illegal.
10351 int SPValue = 0;
10352 for (auto &MI : C) {
10353 if (MI.modifiesRegister(AArch64::SP, &TRI)) {
10354 switch (MI.getOpcode()) {
10355 case AArch64::ADDXri:
10356 case AArch64::ADDWri:
10357 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10358 assert(MI.getOperand(2).isImm() &&
10359 "Expected operand to be immediate");
10360 assert(MI.getOperand(1).isReg() &&
10361 "Expected operand to be a register");
10362 // Check if the add just increments sp. If so, we search for
10363 // matching sub instructions that decrement sp. If not, the
10364 // modification is illegal
10365 if (MI.getOperand(1).getReg() == AArch64::SP)
10366 SPValue += MI.getOperand(2).getImm();
10367 else
10368 return true;
10369 break;
10370 case AArch64::SUBXri:
10371 case AArch64::SUBWri:
10372 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10373 assert(MI.getOperand(2).isImm() &&
10374 "Expected operand to be immediate");
10375 assert(MI.getOperand(1).isReg() &&
10376 "Expected operand to be a register");
10377 // Check if the sub just decrements sp. If so, we search for
10378 // matching add instructions that increment sp. If not, the
10379 // modification is illegal
10380 if (MI.getOperand(1).getReg() == AArch64::SP)
10381 SPValue -= MI.getOperand(2).getImm();
10382 else
10383 return true;
10384 break;
10385 default:
10386 return true;
10387 }
10388 }
10389 }
10390 if (SPValue)
10391 return true;
10392 return false;
10393 };
10394 // Remove candidates with illegal stack modifying instructions
10395 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
10396
10397 // If the sequence doesn't have enough candidates left, then we're done.
10398 if (RepeatedSequenceLocs.size() < MinRepeats)
10399 return std::nullopt;
10400 }
10401
10402 // Properties about candidate MBBs that hold for all of them.
10403 unsigned FlagsSetInAll = 0xF;
10404
10405 // Intersect the flags of every candidate into FlagsSetInAll.
10406 for (outliner::Candidate &C : RepeatedSequenceLocs)
10407 FlagsSetInAll &= C.Flags;
10408
10409 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
10410
10411 // Helper lambda which sets call information for every candidate.
10412 auto SetCandidateCallInfo =
10413 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
10414 for (outliner::Candidate &C : RepeatedSequenceLocs)
10415 C.setCallInfo(CallID, NumBytesForCall);
10416 };
10417
// Start from the default frame kind and account for one more 4-byte
// instruction in the frame (presumably the return); the branches below
// override both values where a cheaper variant applies.
10418 unsigned FrameID = MachineOutlinerDefault;
10419 NumBytesToCreateFrame += 4;
10420
// True if any candidate's parent function has branch target enforcement on.
10421 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
10422 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
10423 });
10424
10425 // We check to see if CFI Instructions are present, and if they are
10426 // we find the number of CFI Instructions in the candidates.
10427 unsigned CFICount = 0;
10428 for (auto &I : RepeatedSequenceLocs[0]) {
10429 if (I.isCFIInstruction())
10430 CFICount++;
10431 }
10432
10433 // We compare the number of found CFI Instructions to the number of CFI
10434 // instructions in the parent function for each candidate. We must check this
10435 // since if we outline one of the CFI instructions in a function, we have to
10436 // outline them all for correctness. If we do not, the address offsets will be
10437 // incorrect between the two sections of the program.
10438 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10439 std::vector<MCCFIInstruction> CFIInstructions =
10440 C.getMF()->getFrameInstructions();
10441
10442 if (CFICount > 0 && CFICount != CFIInstructions.size())
10443 return std::nullopt;
10444 }
10445
10446 // Returns true if an instruction is safe to fix up, false otherwise.
10447 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
10448 if (MI.isCall())
10449 return true;
10450
10451 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
10452 !MI.readsRegister(AArch64::SP, &TRI))
10453 return true;
10454
10455 // Any modification of SP will break our code to save/restore LR.
10456 // FIXME: We could handle some instructions which add a constant
10457 // offset to SP, with a bit more work.
10458 if (MI.modifiesRegister(AArch64::SP, &TRI))
10459 return false;
10460
10461 // At this point, we have a stack instruction that we might need to
10462 // fix up. We'll handle it if it's a load or store.
10463 if (MI.mayLoadOrStore()) {
10464 const MachineOperand *Base; // Filled with the base operand of MI.
10465 int64_t Offset; // Filled with the offset of MI.
10466 bool OffsetIsScalable;
10467
10468 // Does it allow us to offset the base operand and is the base the
10469 // register SP?
10470 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
10471 !Base->isReg() || Base->getReg() != AArch64::SP)
10472 return false;
10473
10474 // Fix-up code below assumes bytes.
10475 if (OffsetIsScalable)
10476 return false;
10477
10478 // Find the minimum/maximum offset for this instruction and check
10479 // if fixing it up would be in range.
10480 int64_t MinOffset,
10481 MaxOffset; // Unscaled offsets for the instruction.
10482 // The scale to multiply the offsets by.
10483 TypeSize Scale(0U, false), DummyWidth(0U, false);
10484 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
10485
10486 Offset += 16; // Update the offset to what it would be if we outlined.
10487 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
10488 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
10489 return false;
10490
10491 // It's in range, so we can outline it.
10492 return true;
10493 }
10494
10495 // FIXME: Add handling for instructions like "add x0, sp, #8".
10496
10497 // We can't fix it up, so don't outline it.
10498 return false;
10499 };
10500
10501 // True if it's possible to fix up each stack instruction in this sequence.
10502 // Important for frames/call variants that modify the stack.
10503 bool AllStackInstrsSafe =
10504 llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);
10505
10506 // If the last instruction in any candidate is a terminator, then we should
10507 // tail call all of the candidates.
10508 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10509 FrameID = MachineOutlinerTailCall;
10510 NumBytesToCreateFrame = 0;
10511 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
10512 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
10513 }
10514
// A sequence ending in BL, or in BLR/BLRNoIP when no candidate uses BTI, is
// outlined as a thunk.
10515 else if (LastInstrOpcode == AArch64::BL ||
10516 ((LastInstrOpcode == AArch64::BLR ||
10517 LastInstrOpcode == AArch64::BLRNoIP) &&
10518 !HasBTI)) {
10519 // FIXME: Do we need to check if the code after this uses the value of LR?
10520 FrameID = MachineOutlinerThunk;
10521 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
10522 SetCandidateCallInfo(MachineOutlinerThunk, 4);
10523 }
10524
10525 else {
10526 // We need to decide how to emit calls + frames. We can always emit the same
10527 // frame if we don't need to save to the stack. If we have to save to the
10528 // stack, then we need a different frame.
10529 unsigned NumBytesNoStackCalls = 0;
10530 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
10531
10532 // Check if we have to save LR.
10533 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10534 bool LRAvailable =
10536 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
10537 : true;
10538 // If we have a noreturn caller, then we're going to be conservative and
10539 // say that we have to save LR. If we don't have a ret at the end of the
10540 // block, then we can't reason about liveness accurately.
10541 //
10542 // FIXME: We can probably do better than always disabling this in
10543 // noreturn functions by fixing up the liveness info.
10544 bool IsNoReturn =
10545 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
10546
10547 // Is LR available? If so, we don't need a save.
10548 if (LRAvailable && !IsNoReturn) {
10549 NumBytesNoStackCalls += 4;
10550 C.setCallInfo(MachineOutlinerNoLRSave, 4);
10551 CandidatesWithoutStackFixups.push_back(C);
10552 }
10553
10554 // Is an unused register available? If so, we won't modify the stack, so
10555 // we can outline with the same frame type as those that don't save LR.
10556 else if (findRegisterToSaveLRTo(C)) {
10557 NumBytesNoStackCalls += 12;
10558 C.setCallInfo(MachineOutlinerRegSave, 12);
10559 CandidatesWithoutStackFixups.push_back(C);
10560 }
10561
10562 // Is SP used in the sequence at all? If not, we don't have to modify
10563 // the stack, so we are guaranteed to get the same frame.
10564 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
10565 NumBytesNoStackCalls += 12;
10566 C.setCallInfo(MachineOutlinerDefault, 12);
10567 CandidatesWithoutStackFixups.push_back(C);
10568 }
10569
10570 // If we outline this, we need to modify the stack. Pretend we don't
10571 // outline this by saving all of its bytes.
10572 else {
10573 NumBytesNoStackCalls += SequenceSize;
10574 }
10575 }
10576
10577 // If there are no places where we have to save LR, then note that we
10578 // don't have to update the stack. Otherwise, give every candidate the
10579 // default call type, as long as it's safe to do so.
10580 if (!AllStackInstrsSafe ||
10581 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
10582 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
10583 FrameID = MachineOutlinerNoLRSave;
10584 if (RepeatedSequenceLocs.size() < MinRepeats)
10585 return std::nullopt;
10586 } else {
10587 SetCandidateCallInfo(MachineOutlinerDefault, 12);
10588
10589 // Bugzilla ID: 46767
10590 // TODO: Check if fixing up the stack more than once is safe so we can
10591 // outline these.
10592 //
10593 // An outline resulting in a caller that requires stack fixups at the
10594 // callsite to a callee that also requires stack fixups can happen when
10595 // there are no available registers at the candidate callsite for a
10596 // candidate that itself also has calls.
10597 //
10598 // In other words if function_containing_sequence in the following pseudo
10599 // assembly requires that we save LR at the point of the call, but there
10600 // are no available registers: in this case we save using SP and as a
10601 // result the SP offsets requires stack fixups by multiples of 16.
10602 //
10603 // function_containing_sequence:
10604 // ...
10605 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10606 // call OUTLINED_FUNCTION_N
10607 // restore LR from SP
10608 // ...
10609 //
10610 // OUTLINED_FUNCTION_N:
10611 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10612 // ...
10613 // bl foo
10614 // restore LR from SP
10615 // ret
10616 //
10617 // Because the code to handle more than one stack fixup does not
10618 // currently have the proper checks for legality, these cases will assert
10619 // in the AArch64 MachineOutliner. This is because the code to do this
10620 // needs more hardening, testing, better checks that generated code is
10621 // legal, etc and because it is only verified to handle a single pass of
10622 // stack fixup.
10623 //
10624 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
10625 // these cases until they are known to be handled. Bugzilla 46767 is
10626 // referenced in comments at the assert site.
10627 //
10628 // To avoid asserting (or generating non-legal code on noassert builds)
10629 // we remove all candidates which would need more than one stack fixup by
10630 // pruning the cases where the candidate has calls while also having no
10631 // available LR and having no available general purpose registers to copy
10632 // LR to (ie one extra stack save/restore).
10633 //
10634 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10635 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
10636 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
10637 return (llvm::any_of(C, IsCall)) &&
10638 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
10639 !findRegisterToSaveLRTo(C));
10640 });
10641 }
10642 }
10643
10644 // If we dropped all of the candidates, bail out here.
10645 if (RepeatedSequenceLocs.size() < MinRepeats)
10646 return std::nullopt;
10647 }
10648
10649 // Does every candidate's MBB contain a call? If so, then we might have a call
10650 // in the range.
10651 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10652 // Check if the range contains a call. These require a save + restore of the
10653 // link register.
10654 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
10655 bool ModStackToSaveLR = false;
10656 if (any_of(drop_end(FirstCand),
10657 [](const MachineInstr &MI) { return MI.isCall(); }))
10658 ModStackToSaveLR = true;
10659
10660 // Handle the last instruction separately. If this is a tail call, then the
10661 // last instruction is a call. We don't want to save + restore in this case.
10662 // However, it could be possible that the last instruction is a call without
10663 // it being valid to tail call this sequence. We should consider this as
10664 // well.
10665 else if (FrameID != MachineOutlinerThunk &&
10666 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
10667 ModStackToSaveLR = true;
10668
10669 if (ModStackToSaveLR) {
10670 // We can't fix up the stack. Bail out.
10671 if (!AllStackInstrsSafe)
10672 return std::nullopt;
10673
10674 // Save + restore LR.
10675 NumBytesToCreateFrame += 8;
10676 }
10677 }
10678
10679 // If we have CFI instructions, we can only outline if the outlined section
10680 // can be a tail call
10681 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
10682 return std::nullopt;
10683
10684 return std::make_unique<outliner::OutlinedFunction>(
10685 RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID);
10686}
10687
10688void AArch64InstrInfo::mergeOutliningCandidateAttributes(
10689 Function &F, std::vector<outliner::Candidate> &Candidates) const {
10690 // If a bunch of candidates reach this point they must agree on their return
10691 // address signing. It is therefore enough to just consider the signing
10692 // behaviour of one of them
10693 const auto &CFn = Candidates.front().getMF()->getFunction();
10694
10695 if (CFn.hasFnAttribute("ptrauth-returns"))
10696 F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
10697 if (CFn.hasFnAttribute("ptrauth-auth-traps"))
10698 F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
10699 // Since all candidates belong to the same module, just copy the
10700 // function-level attributes of an arbitrary function.
10701 if (CFn.hasFnAttribute("sign-return-address"))
10702 F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
10703 if (CFn.hasFnAttribute("sign-return-address-key"))
10704 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
10705
10706 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
10707}
10708
/// Decide whether the outliner may take candidates from \p MF.
///
/// Returns false for functions with linkonce_odr linkage (unless
/// \p OutlineFromLinkOnceODRs is set), functions placed in an explicit
/// section, functions without AArch64FunctionInfo or whose red-zone status is
/// true or unknown, and functions with streaming-mode changes. Otherwise the
/// function falls through to the remaining checks below.
10709bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
10710 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10711 const Function &F = MF.getFunction();
10712
10713 // Can F be deduplicated by the linker? If it can, don't outline from it.
10714 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10715 return false;
10716
10717 // Don't outline from functions with section markings; the program could
10718 // expect that all the code is in the named section.
10719 // FIXME: Allow outlining from multiple functions with the same section
10720 // marking.
10721 if (F.hasSection())
10722 return false;
10723
10724 // Outlining from functions with redzones is unsafe since the outliner may
10725 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
10726 // outline from it.
10727 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
10728 if (!AFI || AFI->hasRedZone().value_or(true))
10729 return false;
10730
10731 // FIXME: Determine whether it is safe to outline from functions which contain
10732 // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
10733 // outlined together and ensure it is safe to outline with async unwind info,
10734 // required for saving & restoring VG around calls.
10735 if (AFI->hasStreamingModeChanges())
10736 return false;
10737
10738 // FIXME: Teach the outliner to generate/handle Windows unwind info.
// NOTE(review): the condition guarding the return below is not visible in
// this listing; presumably it tests whether Windows unwind info is needed.
10740 return false;
10741
10742 // It's safe to outline from MF.
10743 return true;
10744}
10745
10747AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
10748 unsigned &Flags) const {
// Partitions MBB into the instruction ranges the outliner may draw candidates
// from. The block is walked bottom-up while tracking register-unit liveness;
// a range [RangeBegin, RangeEnd) is kept only while W16, W17 and NZCV are all
// available, is longer than one instruction, and does not start or end inside
// a bundle. The ranges are returned in top-down order. Flags is an in/out
// parameter captured by the helper lambdas below.
// NOTE(review): the statements that actually set bits in Flags are not
// visible in this listing - confirm against the full source.
10750 "Must track liveness!");
10752 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
10753 Ranges;
10754 // According to the AArch64 Procedure Call Standard, the following are
10755 // undefined on entry/exit from a function call:
10756 //
10757 // * Registers x16, x17, (and thus w16, w17)
10758 // * Condition codes (and thus the NZCV register)
10759 //
10760 // If any of these registers are used inside or live across an outlined
10761 // function, then they may be modified later, either by the compiler or
10762 // some other tool (like the linker).
10763 //
10764 // To avoid outlining in these situations, partition each block into ranges
10765 // where these registers are dead. We will only outline from those ranges.
10766 LiveRegUnits LRU(getRegisterInfo());
10767 auto AreAllUnsafeRegsDead = [&LRU]() {
10768 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
10769 LRU.available(AArch64::NZCV);
10770 };
10771
10772 // We need to know if LR is live across an outlining boundary later on in
10773 // order to decide how we'll create the outlined call, frame, etc.
10774 //
10775 // It's pretty expensive to check this for *every candidate* within a block.
10776 // That's some potentially n^2 behaviour, since in the worst case, we'd need
10777 // to compute liveness from the end of the block for O(n) candidates within
10778 // the block.
10779 //
10780 // So, to improve the average case, let's keep track of liveness from the end
10781 // of the block to the beginning of *every outlinable range*. If we know that
10782 // LR is available in every range we could outline from, then we know that
10783 // we don't need to check liveness for any candidate within that range.
10784 bool LRAvailableEverywhere = true;
10785 // Compute liveness bottom-up.
10786 LRU.addLiveOuts(MBB);
10787 // Update flags that require info about the entire MBB.
10788 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
10789 if (MI.isCall() && !MI.isTerminator())
10791 };
10792 // Range: [RangeBegin, RangeEnd)
10793 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
10794 unsigned RangeLen;
// Starts a fresh, empty range whose end is the instruction after NewBegin.
10795 auto CreateNewRangeStartingAt =
10796 [&RangeBegin, &RangeEnd,
10797 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
10798 RangeBegin = NewBegin;
10799 RangeEnd = std::next(RangeBegin);
10800 RangeLen = 0;
10801 };
10802 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
10803 // At least one unsafe register is not dead. We do not want to outline at
10804 // this point. If it is long enough to outline from and does not cross a
10805 // bundle boundary, save the range [RangeBegin, RangeEnd).
10806 if (RangeLen <= 1)
10807 return;
10808 if (!RangeBegin.isEnd() && RangeBegin->isBundledWithPred())
10809 return;
10810 if (!RangeEnd.isEnd() && RangeEnd->isBundledWithPred())
10811 return;
10812 Ranges.emplace_back(RangeBegin, RangeEnd);
10813 };
10814 // Find the first point where all unsafe registers are dead.
10815 // FIND: <safe instr> <-- end of first potential range
10816 // SKIP: <unsafe def>
10817 // SKIP: ... everything between ...
10818 // SKIP: <unsafe use>
10819 auto FirstPossibleEndPt = MBB.instr_rbegin();
10820 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
// Debug instructions do not take part in the liveness computation.
10821 if (!FirstPossibleEndPt->isDebugInstr())
10822 LRU.stepBackward(*FirstPossibleEndPt);
10823 // Update flags that impact how we outline across the entire block,
10824 // regardless of safety.
10825 UpdateWholeMBBFlags(*FirstPossibleEndPt);
10826 if (AreAllUnsafeRegsDead())
10827 break;
10828 }
10829 // If we exhausted the entire block, we have no safe ranges to outline.
10830 if (FirstPossibleEndPt == MBB.instr_rend())
10831 return Ranges;
10832 // Current range.
10833 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
10834 // FirstPossibleEndPt points to the first place where all unsafe registers
10835 // are dead (if there is any such point). Begin partitioning the MBB into
10836 // ranges.
10837 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
10838 if (!MI.isDebugInstr())
10839 LRU.stepBackward(MI);
10840 UpdateWholeMBBFlags(MI);
10841 if (!AreAllUnsafeRegsDead()) {
10842 SaveRangeIfNonEmpty();
10843 CreateNewRangeStartingAt(MI.getIterator());
10844 continue;
10845 }
// Still safe: extend the current range upwards to include MI.
10846 LRAvailableEverywhere &= LRU.available(AArch64::LR);
10847 RangeBegin = MI.getIterator();
10848 ++RangeLen;
10849 }
10850 // Above loop misses the last (or only) range. If we are still safe, then
10851 // let's save the range.
10852 if (AreAllUnsafeRegsDead())
10853 SaveRangeIfNonEmpty();
10854 if (Ranges.empty())
10855 return Ranges;
10856 // We found the ranges bottom-up. Mapping expects the top-down. Reverse
10857 // the order.
10858 std::reverse(Ranges.begin(), Ranges.end());
10859 // If there is at least one outlinable range where LR is unavailable
10860 // somewhere, remember that.
10861 if (!LRAvailableEverywhere)
10863 return Ranges;
10864}
10865
// Classifies the instruction *MIT for the machine outliner. The visible checks
// are, in order: return-address-signing opcodes, CFI instructions, terminators,
// explicit LR/W30 operands, ADRP, calls, any other read or write of W30, and
// instructions with BTI semantics.
// NOTE(review): this listing omits several source lines (the embedded line
// numbers skip 10866, 10868, 10895, 10905, 10911, 10922, 10929, 10958, 10990,
// 10996, 11001 and 11003). The return type, the MIT parameter and most of the
// `return` statements are therefore not visible here -- confirm upstream.
10867AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
10869 unsigned Flags) const {
10870 MachineInstr &MI = *MIT;
10871
10872 // Don't outline anything used for return address signing. The outlined
10873 // function will get signed later if needed
10874 switch (MI.getOpcode()) {
10875 case AArch64::PACM:
10876 case AArch64::PACIASP:
10877 case AArch64::PACIBSP:
10878 case AArch64::PACIASPPC:
10879 case AArch64::PACIBSPPC:
10880 case AArch64::AUTIASP:
10881 case AArch64::AUTIBSP:
10882 case AArch64::AUTIASPPCi:
10883 case AArch64::AUTIASPPCr:
10884 case AArch64::AUTIBSPPCi:
10885 case AArch64::AUTIBSPPCr:
10886 case AArch64::RETAA:
10887 case AArch64::RETAB:
10888 case AArch64::RETAASPPCi:
10889 case AArch64::RETAASPPCr:
10890 case AArch64::RETABSPPCi:
10891 case AArch64::RETABSPPCr:
10892 case AArch64::EMITBKEY:
10893 case AArch64::PAUTH_PROLOGUE:
10894 case AArch64::PAUTH_EPILOGUE:
// NOTE(review): listing line 10895 (the statement shared by all cases above)
// is missing here.
10896 }
10897
10898 // We can only outline these if we will tail call the outlined function, or
10899 // fix up the CFI offsets. Currently, CFI instructions are outlined only if
10900 // in a tail call.
10901 //
10902 // FIXME: If the proper fixups for the offset are implemented, this should be
10903 // possible.
10904 if (MI.isCFIInstruction())
// NOTE(review): listing line 10905 (the statement guarded by this `if`) is
// missing here.
10906
10907 // Is this a terminator for a basic block?
10908 if (MI.isTerminator())
10909 // TargetInstrInfo::getOutliningType has already filtered out anything
10910 // that would break this, so we can allow it here.
// NOTE(review): listing line 10911 (the statement guarded by this `if`) is
// missing here.
10912
10913 // Make sure none of the operands are un-outlinable.
10914 for (const MachineOperand &MOP : MI.operands()) {
10915 // A check preventing CFI indices was here before, but only CFI
10916 // instructions should have those.
10917 assert(!MOP.isCFIIndex());
10918
10919 // If it uses LR or W30 explicitly, then don't touch it.
10920 if (MOP.isReg() && !MOP.isImplicit() &&
10921 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
// NOTE(review): listing line 10922 (the statement guarded by this `if`) is
// missing here.
10923 }
10924
10925 // Special cases for instructions that can always be outlined, but will fail
10926 // the later tests. e.g, ADRPs, which are PC-relative use LR, but can always
10927 // be outlined because they don't require a *specific* value to be in LR.
10928 if (MI.getOpcode() == AArch64::ADRP)
// NOTE(review): listing line 10929 (the statement guarded by this `if`) is
// missing here.
10930
10931 // If MI is a call we might be able to outline it. We don't want to outline
10932 // any calls that rely on the position of items on the stack. When we outline
10933 // something containing a call, we have to emit a save and restore of LR in
10934 // the outlined function. Currently, this always happens by saving LR to the
10935 // stack. Thus, if we outline, say, half the parameters for a function call
10936 // plus the call, then we'll break the callee's expectations for the layout
10937 // of the stack.
10938 //
10939 // FIXME: Allow calls to functions which construct a stack frame, as long
10940 // as they don't access arguments on the stack.
10941 // FIXME: Figure out some way to analyze functions defined in other modules.
10942 // We should be able to compute the memory usage based on the IR calling
10943 // convention, even if we can't see the definition.
10944 if (MI.isCall()) {
10945 // Get the function associated with the call. Look at each operand and find
10946 // the one that represents the callee and get its name.
10947 const Function *Callee = nullptr;
10948 for (const MachineOperand &MOP : MI.operands()) {
10949 if (MOP.isGlobal()) {
// Only the first global operand is inspected; Callee stays null if that
// global is not a Function.
10950 Callee = dyn_cast<Function>(MOP.getGlobal());
10951 break;
10952 }
10953 }
10954
10955 // Never outline calls to mcount. There isn't any rule that would require
10956 // this, but the Linux kernel's "ftrace" feature depends on it.
10957 if (Callee && Callee->getName() == "\01_mcount")
// NOTE(review): listing line 10958 (the statement guarded by this `if`) is
// missing here.
10959
10960 // If we don't know anything about the callee, assume it depends on the
10961 // stack layout of the caller. In that case, it's only legal to outline
10962 // as a tail-call. Explicitly list the call instructions we know about so we
10963 // don't get unexpected results with call pseudo-instructions.
10964 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
10965 if (MI.getOpcode() == AArch64::BLR ||
10966 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
10967 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
10968
10969 if (!Callee)
10970 return UnknownCallOutlineType;
10971
10972 // We have a function we have information about. Check whether it's
10973 // something we can safely outline.
10974 MachineFunction *CalleeMF = MMI.getMachineFunction(*Callee);
10975
10976 // We don't know what's going on with the callee at all. Don't touch it.
10977 if (!CalleeMF)
10978 return UnknownCallOutlineType;
10979
10980 // Check if we know anything about the callee saves on the function. If we
10981 // don't, then don't touch it, since that implies that we haven't
10982 // computed anything about its stack frame yet.
10983 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
10984 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
10985 MFI.getNumObjects() > 0)
10986 return UnknownCallOutlineType;
10987
10988 // At this point, we can say that CalleeMF ought to not pass anything on the
10989 // stack. Therefore, we can outline it.
// NOTE(review): listing line 10990 (presumably the return for this case) is
// missing here.
10991 }
10992
10993 // Don't touch the link register or W30.
10994 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
10995 MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
// NOTE(review): listing line 10996 (the statement guarded by this `if`) is
// missing here.
10997
10998 // Don't outline BTI instructions, because that will prevent the outlining
10999 // site from being indirectly callable.
11000 if (hasBTISemantics(MI))
// NOTE(review): listing lines 11001 and 11003 (the guarded statement and the
// function's final statement) are missing here.
11002
11004}
11005
11006void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
11007 for (MachineInstr &MI : MBB) {
11008 const MachineOperand *Base;
11009 TypeSize Width(0, false);
11010 int64_t Offset;
11011 bool OffsetIsScalable;
11012
11013 // Is this a load or store with an immediate offset with SP as the base?
11014 if (!MI.mayLoadOrStore() ||
11015 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
11016 &RI) ||
11017 (Base->isReg() && Base->getReg() != AArch64::SP))
11018 continue;
11019
11020 // It is, so we have to fix it up.
11021 TypeSize Scale(0U, false);
11022 int64_t Dummy1, Dummy2;
11023
11024 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
11025 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
11026 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
11027 assert(Scale != 0 && "Unexpected opcode!");
11028 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
11029
11030 // We've pushed the return address to the stack, so add 16 to the offset.
11031 // This is safe, since we already checked if it would overflow when we
11032 // checked if this instruction was legal to outline.
11033 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
11034 StackOffsetOperand.setImm(NewImm);
11035 }
11036}
11037
// Tail of a helper that adds return-address-signing pseudos to an outlined
// function: it does nothing unless ShouldSignReturnAddr is set, otherwise it
// builds a PAUTH_PROLOGUE at the start of MBB and asks TII to create the
// matching epilogue instruction.
// NOTE(review): listing line 11038 (the start of the signature) is missing;
// judging by the call sites this is presumably signOutlinedFunction(MF, MBB,
// TII, ShouldSignReturnAddr) -- confirm upstream.
11039 const AArch64InstrInfo *TII,
11040 bool ShouldSignReturnAddr) {
11041 if (!ShouldSignReturnAddr)
11042 return;
11043
11044 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
// NOTE(review): listing line 11045 (the rest of the BuildMI expression and
// its terminating `;`) is missing here.
11046 TII->createPauthEpilogueInstr(MBB, DebugLoc());
11047}
11048
// Builds the frame of an outlined function according to
// OF.FrameConstructionID: records the outlining style on the function info,
// turns the trailing call of a thunk into a tail call, saves/restores LR
// around the body when it contains a non-tail call, optionally adds
// return-address signing, and appends a RET when the function is not a tail
// call or thunk.
// NOTE(review): listing lines 11050 (the leading parameters, presumably MBB
// and MF), 11073 and 11101-11102 are missing from this listing.
11049void AArch64InstrInfo::buildOutlinedFrame(
11051 const outliner::OutlinedFunction &OF) const {
11052
11053 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
11054
11055 if (OF.FrameConstructionID == MachineOutlinerTailCall)
11056 FI->setOutliningStyle("Tail Call");
11057 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
11058 // For thunk outlining, rewrite the last instruction from a call to a
11059 // tail-call.
11060 MachineInstr *Call = &*--MBB.instr_end();
11061 unsigned TailOpcode;
11062 if (Call->getOpcode() == AArch64::BL) {
11063 TailOpcode = AArch64::TCRETURNdi;
11064 } else {
11065 assert(Call->getOpcode() == AArch64::BLR ||
11066 Call->getOpcode() == AArch64::BLRNoIP);
11067 TailOpcode = AArch64::TCRETURNriALL;
11068 }
// The tail call reuses the original call's first operand as its target.
11069 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
11070 .add(Call->getOperand(0))
11071 .addImm(0);
11072 MBB.insert(MBB.end(), TC);
// NOTE(review): listing line 11073 is missing here; presumably it removes
// the original call -- confirm upstream.
11074
11075 FI->setOutliningStyle("Thunk");
11076 }
11077
11078 bool IsLeafFunction = true;
11079
11080 // Is there a call in the outlined range?
11081 auto IsNonTailCall = [](const MachineInstr &MI) {
11082 return MI.isCall() && !MI.isReturn();
11083 };
11084
11085 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
11086 // Fix up the instructions in the range, since we're going to modify the
11087 // stack.
11088
11089 // Bugzilla ID: 46767
11090 // TODO: Check if fixing up twice is safe so we can outline these.
11091 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
11092 "Can only fix up stack references once");
11093 fixupPostOutline(MBB);
11094
11095 IsLeafFunction = false;
11096
11097 // LR has to be a live in so that we can save it.
11098 if (!MBB.isLiveIn(AArch64::LR))
11099 MBB.addLiveIn(AArch64::LR);
11100
// NOTE(review): listing lines 11101-11102 are missing here; presumably they
// declare the iterators It and Et used below -- confirm upstream.
11103
// For tail calls and thunks the restore must go before the final
// instruction rather than at the very end of the block.
11104 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
11105 OF.FrameConstructionID == MachineOutlinerThunk)
11106 Et = std::prev(MBB.end());
11107
11108 // Insert a save before the outlined region
11109 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
11110 .addReg(AArch64::SP, RegState::Define)
11111 .addReg(AArch64::LR)
11112 .addReg(AArch64::SP)
11113 .addImm(-16);
11114 It = MBB.insert(It, STRXpre);
11115
11116 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
11117 CFIInstBuilder CFIBuilder(MBB, It, MachineInstr::FrameSetup);
11118
11119 // Add a CFI saying the stack was moved 16 B down.
11120 CFIBuilder.buildDefCFAOffset(16);
11121
11122 // Add a CFI saying that the LR that we want to find is now 16 B higher
11123 // than before.
11124 CFIBuilder.buildOffset(AArch64::LR, -16);
11125 }
11126
11127 // Insert a restore before the terminator for the function.
11128 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
11129 .addReg(AArch64::SP, RegState::Define)
11130 .addReg(AArch64::LR, RegState::Define)
11131 .addReg(AArch64::SP)
11132 .addImm(16);
11133 Et = MBB.insert(Et, LDRXpost);
11134 }
11135
// Whether to sign depends on the function's signing condition and on whether
// the outlined body is a non-leaf (i.e. contained a non-tail call).
11136 auto RASignCondition = FI->getSignReturnAddressCondition();
11137 bool ShouldSignReturnAddr = AArch64FunctionInfo::shouldSignReturnAddress(
11138 RASignCondition, !IsLeafFunction);
11139
11140 // If this is a tail call outlined function, then there's already a return.
11141 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
11142 OF.FrameConstructionID == MachineOutlinerThunk) {
11143 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
11144 return;
11145 }
11146
11147 // It's not a tail call, so we have to insert the return ourselves.
11148
11149 // LR has to be a live in so that we can return to it.
11150 if (!MBB.isLiveIn(AArch64::LR))
11151 MBB.addLiveIn(AArch64::LR);
11152
11153 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
11154 .addReg(AArch64::LR);
11155 MBB.insert(MBB.end(), ret);
11156
11157 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
11158
11159 FI->setOutliningStyle("Function");
11160
11161 // Did we have to modify the stack by saving the link register?
11162 if (OF.FrameConstructionID != MachineOutlinerDefault)
11163 return;
11164
11165 // We modified the stack.
11166 // Walk over the basic block and fix up all the stack accesses.
11167 fixupPostOutline(MBB);
11168}
11169
// Inserts the call to an outlined function at It and returns an iterator to
// the inserted call. Depending on C.CallConstructionID the call is a tail call
// (TCRETURNdi), a plain BL, or a BL bracketed by a save and restore of LR
// (to a register, or to the stack via SP).
// NOTE(review): listing lines 11171-11172 (the parameter list, which should
// introduce M, MBB, It, MF and C) and 11193 (presumably the declaration of
// CallPt) are missing from this listing -- confirm upstream.
11170MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
11173
11174 // Are we tail calling?
11175 if (C.CallConstructionID == MachineOutlinerTailCall) {
11176 // If yes, then we can just branch to the label.
11177 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
11178 .addGlobalAddress(M.getNamedValue(MF.getName()))
11179 .addImm(0));
11180 return It;
11181 }
11182
11183 // Are we saving the link register?
11184 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
11185 C.CallConstructionID == MachineOutlinerThunk) {
11186 // No, so just insert the call.
11187 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
11188 .addGlobalAddress(M.getNamedValue(MF.getName())));
11189 return It;
11190 }
11191
11192 // We want to return the spot where we inserted the call.
// NOTE(review): listing line 11193 is missing here.
11194
11195 // Instructions for saving and restoring LR around the call instruction we're
11196 // going to insert.
11197 MachineInstr *Save;
11198 MachineInstr *Restore;
11199 // Can we save to a register?
11200 if (C.CallConstructionID == MachineOutlinerRegSave) {
11201 // FIXME: This logic should be sunk into a target-specific interface so that
11202 // we don't have to recompute the register.
11203 Register Reg = findRegisterToSaveLRTo(C);
11204 assert(Reg && "No callee-saved register available?");
11205
11206 // LR has to be a live in so that we can save it.
11207 if (!MBB.isLiveIn(AArch64::LR))
11208 MBB.addLiveIn(AArch64::LR);
11209
11210 // Save and restore LR from Reg.
// Both moves are encoded as ORRXrs with XZR as the first source and a zero
// shift immediate.
11211 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
11212 .addReg(AArch64::XZR)
11213 .addReg(AArch64::LR)
11214 .addImm(0);
11215 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
11216 .addReg(AArch64::XZR)
11217 .addReg(Reg)
11218 .addImm(0);
11219 } else {
11220 // We have the default case. Save and restore from SP.
11221 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
11222 .addReg(AArch64::SP, RegState::Define)
11223 .addReg(AArch64::LR)
11224 .addReg(AArch64::SP)
11225 .addImm(-16);
11226 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
11227 .addReg(AArch64::SP, RegState::Define)
11228 .addReg(AArch64::LR, RegState::Define)
11229 .addReg(AArch64::SP)
11230 .addImm(16);
11231 }
11232
// Emit Save, then the BL, then Restore, advancing It past each inserted
// instruction so the three end up in that order.
11233 It = MBB.insert(It, Save);
11234 It++;
11235
11236 // Insert the call.
11237 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
11238 .addGlobalAddress(M.getNamedValue(MF.getName())));
11239 CallPt = It;
11240 It++;
11241
11242 It = MBB.insert(It, Restore);
11243 return CallPt;
11244}
11245
11246bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
11247 MachineFunction &MF) const {
11248 return MF.getFunction().hasMinSize();
11249}
11250
// Emits, at Iter in MBB, an instruction that zeroes Reg. The instruction is
// chosen from the register kind and subtarget features: MOVZXi for general
// purpose registers, DUP_ZI_D when SVE or streaming SVE is available,
// MOVIv2d_ns when Neon is available, and otherwise FMOVD0 on the dsub
// sub-register.
// NOTE(review): listing line 11252 (presumably the Iter parameter) is missing
// from this listing. AllowSideEffects is not read in the visible body.
11251void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
11253 DebugLoc &DL,
11254 bool AllowSideEffects) const {
11255 const MachineFunction &MF = *MBB.getParent();
11256 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
11257 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
11258
11259 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
11260 BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
11261 } else if (STI.isSVEorStreamingSVEAvailable()) {
11262 BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
11263 .addImm(0)
11264 .addImm(0);
11265 } else if (STI.isNeonAvailable()) {
11266 BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
11267 .addImm(0);
11268 } else {
11269 // This is a streaming-compatible function without SVE. We don't have full
11270 // Neon (just FPRs), so we can at most use the first 64-bit sub-register.
11271 // So given `movi v..` would be illegal use `fmov d..` instead.
11272 assert(STI.hasNEON() && "Expected to have NEON.");
11273 Register Reg64 = TRI.getSubReg(Reg, AArch64::dsub);
11274 BuildMI(MBB, Iter, DL, get(AArch64::FMOVD0), Reg64);
11275 }
11276}
11277
// Recognises ORR-with-zero-register instructions that act as plain register
// moves and returns their {destination, source} operands (operands 0 and 2),
// or std::nullopt for anything else.
// NOTE(review): listing line 11279 (the declarator; presumably
// AArch64InstrInfo::isCopyInstrImpl taking `const MachineInstr &MI`) is
// missing from this listing -- confirm upstream.
11278std::optional<DestSourcePair>
11280
11281 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
11282 // and zero immediate operands used as an alias for mov instruction.
11283 if (((MI.getOpcode() == AArch64::ORRWrs &&
11284 MI.getOperand(1).getReg() == AArch64::WZR &&
11285 MI.getOperand(3).getImm() == 0x0) ||
11286 (MI.getOpcode() == AArch64::ORRWrr &&
11287 MI.getOperand(1).getReg() == AArch64::WZR)) &&
11288 // Check that the w->w move is not a zero-extending w->x mov.
// For a virtual destination that means no sub-register index on operand 0;
// for a physical destination it means the instruction has no def operand for
// the corresponding X register.
11289 (!MI.getOperand(0).getReg().isVirtual() ||
11290 MI.getOperand(0).getSubReg() == 0) &&
11291 (!MI.getOperand(0).getReg().isPhysical() ||
11292 MI.findRegisterDefOperandIdx(getXRegFromWReg(MI.getOperand(0).getReg()),
11293 /*TRI=*/nullptr) == -1))
11294 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11295
// 64-bit form: ORRXrs with XZR and a zero shift immediate.
11296 if (MI.getOpcode() == AArch64::ORRXrs &&
11297 MI.getOperand(1).getReg() == AArch64::XZR &&
11298 MI.getOperand(3).getImm() == 0x0)
11299 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11300
11301 return std::nullopt;
11302}
11303
// Returns the {destination, source} operands (operands 0 and 2) of a 32-bit
// ORR whose first source is WZR -- ORRWrs with a zero shift immediate, or
// ORRWrr -- and std::nullopt otherwise. Unlike the stricter copy check, no
// test is made here for whether the move zero-extends into an X register.
// NOTE(review): listing line 11305 (the declarator; presumably
// AArch64InstrInfo::isCopyLikeInstrImpl taking `const MachineInstr &MI`) is
// missing from this listing -- confirm upstream.
11304std::optional<DestSourcePair>
11306 if ((MI.getOpcode() == AArch64::ORRWrs &&
11307 MI.getOperand(1).getReg() == AArch64::WZR &&
11308 MI.getOperand(3).getImm() == 0x0) ||
11309 (MI.getOpcode() == AArch64::ORRWrr &&
11310 MI.getOperand(1).getReg() == AArch64::WZR))
11311 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11312 return std::nullopt;
11313}
11314
11315std::optional<RegImmPair>
11316AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
11317 int Sign = 1;
11318 int64_t Offset = 0;
11319
11320 // TODO: Handle cases where Reg is a super- or sub-register of the
11321 // destination register.
11322 const MachineOperand &Op0 = MI.getOperand(0);
11323 if (!Op0.isReg() || Reg != Op0.getReg())
11324 return std::nullopt;
11325
11326 switch (MI.getOpcode()) {
11327 default:
11328 return std::nullopt;
11329 case AArch64::SUBWri:
11330 case AArch64::SUBXri:
11331 case AArch64::SUBSWri:
11332 case AArch64::SUBSXri:
11333 Sign *= -1;
11334 [[fallthrough]];
11335 case AArch64::ADDSWri:
11336 case AArch64::ADDSXri:
11337 case AArch64::ADDWri:
11338 case AArch64::ADDXri: {
11339 // TODO: Third operand can be global address (usually some string).
11340 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
11341 !MI.getOperand(2).isImm())
11342 return std::nullopt;
11343 int Shift = MI.getOperand(3).getImm();
11344 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
11345 Offset = Sign * (MI.getOperand(2).getImm() << Shift);
11346 }
11347 }
11348 return RegImmPair{MI.getOperand(1).getReg(), Offset};
11349}
11350
11351/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
11352/// the destination register then, if possible, describe the value in terms of
11353/// the source register.
///
/// Returns std::nullopt when \p MI is not copy-like, when either register of
/// the copy is invalid, or when \p DescribedReg is unrelated to the
/// destination.
// NOTE(review): listing line 11355 (the function name and its first two
// parameters, presumably MI and DescribedReg) is missing from this listing.
11354static std::optional<ParamLoadedValue>
11356 const TargetInstrInfo *TII,
11357 const TargetRegisterInfo *TRI) {
11358 auto DestSrc = TII->isCopyLikeInstr(MI);
11359 if (!DestSrc)
11360 return std::nullopt;
11361
11362 Register DestReg = DestSrc->Destination->getReg();
11363 Register SrcReg = DestSrc->Source->getReg();
11364
11365 if (!DestReg.isValid() || !SrcReg.isValid())
11366 return std::nullopt;
11367
// Every successful result carries an empty DIExpression.
11368 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
11369
11370 // If the described register is the destination, just return the source.
11371 if (DestReg == DescribedReg)
11372 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
11373
11374 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
11375 if (MI.getOpcode() == AArch64::ORRWrs &&
11376 TRI->isSuperRegister(DestReg, DescribedReg))
11377 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
11378
11379 // We may need to describe the lower part of a ORRXrs move.
11380 if (MI.getOpcode() == AArch64::ORRXrs &&
11381 TRI->isSubRegister(DestReg, DescribedReg)) {
11382 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
11383 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
11384 }
11385
// Any remaining overlap between the two registers would be a case the code
// above failed to handle.
11386 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
11387 "Unhandled ORR[XW]rs copy case");
11388
11389 return std::nullopt;
11390}
11391
// Reports whether MF may be split across sections. A function that has a red
// zone -- or whose red-zone status is unknown, since value_or(true) treats an
// unset value as "has one" -- is never safe to split.
11392bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
11393 // Functions cannot be split to different sections on AArch64 if they have
11394 // a red zone. This is because relaxing a cross-section branch may require
11395 // incrementing the stack pointer to spill a register, which would overwrite
11396 // the red zone.
11397 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
11398 return false;
11399
// NOTE(review): listing line 11400 (the final return; presumably a deferral
// to the generic TargetInstrInfo check) is missing here -- confirm upstream.
11401}
11402
11403bool AArch64InstrInfo::isMBBSafeToSplitToCold(
11404 const MachineBasicBlock &MBB) const {
11405 // Asm Goto blocks can contain conditional branches to goto labels, which can
11406 // get moved out of range of the branch instruction.
11407 auto isAsmGoto = [](const MachineInstr &MI) {
11408 return MI.getOpcode() == AArch64::INLINEASM_BR;
11409 };
11410 if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
11411 return false;
11412
11413 // Because jump tables are label-relative instead of table-relative, they all
11414 // must be in the same section or relocation fixup handling will fail.
11415
11416 // Check if MBB is a jump table target
11417 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
11418 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
11419 return llvm::is_contained(JTE.MBBs, &MBB);
11420 };
11421 if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
11422 return false;
11423
11424 // Check if MBB contains a jump table lookup
11425 for (const MachineInstr &MI : MBB) {
11426 switch (MI.getOpcode()) {
11427 case TargetOpcode::G_BRJT:
11428 case AArch64::JumpTableDest32:
11429 case AArch64::JumpTableDest16:
11430 case AArch64::JumpTableDest8:
11431 return false;
11432 default:
11433 continue;
11434 }
11435 }
11436
11437 // MBB isn't a special case, so it's safe to be split to the cold section.
11438 return true;
11439}
11440
// Describes the value that MI loads into Reg, for the opcodes handled here:
// MOVZWi/MOVZXi yield the shifted immediate, and ORRWrs/ORRXrs are delegated
// to describeORRLoadedValue.
11441std::optional<ParamLoadedValue>
11442AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
11443 Register Reg) const {
11444 const MachineFunction *MF = MI.getMF();
11445 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
11446 switch (MI.getOpcode()) {
11447 case AArch64::MOVZWi:
11448 case AArch64::MOVZXi: {
11449 // MOVZWi may be used for producing zero-extended 32-bit immediates in
11450 // 64-bit parameters, so we need to consider super-registers.
11451 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
11452 return std::nullopt;
11453
11454 if (!MI.getOperand(1).isImm())
11455 return std::nullopt;
// The described value is operand 1 shifted left by operand 2, returned with
// a null expression.
11456 int64_t Immediate = MI.getOperand(1).getImm();
11457 int Shift = MI.getOperand(2).getImm();
11458 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
11459 nullptr);
11460 }
11461 case AArch64::ORRWrs:
11462 case AArch64::ORRXrs:
11463 return describeORRLoadedValue(MI, Reg, this, TRI);
11464 }
11465
// NOTE(review): listing line 11466 (the fallback return for all other
// opcodes; presumably a deferral to the generic TargetInstrInfo
// implementation) is missing here -- confirm upstream.
11467}
11468
11469bool AArch64InstrInfo::isExtendLikelyToBeFolded(
11470 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
11471 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
11472 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
11473 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
11474
11475 // Anyexts are nops.
11476 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
11477 return true;
11478
11479 Register DefReg = ExtMI.getOperand(0).getReg();
11480 if (!MRI.hasOneNonDBGUse(DefReg))
11481 return false;
11482
11483 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
11484 // addressing mode.
11485 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
11486 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
11487}
11488
11489uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
11490 return get(Opc).TSFlags & AArch64::ElementSizeMask;
11491}
11492
11493bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
11494 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
11495}
11496
11497bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
11498 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
11499}
11500
11501unsigned int
11502AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
11503 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
11504}
11505
11506bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
11507 unsigned Scale) const {
11508 if (Offset && Scale)
11509 return false;
11510
11511 // Check Reg + Imm
11512 if (!Scale) {
11513 // 9-bit signed offset
11514 if (isInt<9>(Offset))
11515 return true;
11516
11517 // 12-bit unsigned offset
11518 unsigned Shift = Log2_64(NumBytes);
11519 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
11520 // Must be a multiple of NumBytes (NumBytes is a power of 2)
11521 (Offset >> Shift) << Shift == Offset)
11522 return true;
11523 return false;
11524 }
11525
11526 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
11527 return Scale == 1 || (Scale > 0 && Scale == NumBytes);
11528}
11529
// Selects the indirect-call opcode for MF: BLRNoIP when the subtarget reports
// hardenSlsBlr(), plain BLR otherwise.
// NOTE(review): listing line 11530 (the function signature, which introduces
// MF) is missing from this listing; the function name is not visible here.
11531 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
11532 return AArch64::BLRNoIP;
11533 else
11534 return AArch64::BLR;
11535}
11536
// Builds a PAUTH_EPILOGUE pseudo before the first terminator of MBB and adds
// the implicit register definitions it needs: X17 and X16 (plus X15 with
// PAuthLR) when there is argument stack to restore, otherwise X16 only when
// PAuthLR branch protection is requested but the subtarget lacks PAuthLR.
// NOTE(review): listing lines 11537 (the start of the signature, which
// introduces MBB) and 11541 (the end of the BuildMI statement) are missing
// from this listing; the name is taken from a call site elsewhere in the file.
11538 DebugLoc DL) const {
11539 MachineBasicBlock::iterator InsertPt = MBB.getFirstTerminator();
11540 auto Builder = BuildMI(MBB, InsertPt, DL, get(AArch64::PAUTH_EPILOGUE))
// NOTE(review): listing line 11541 is missing here.
11542
11543 MachineFunction &MF = *MBB.getParent();
11544 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
11545 auto &AFL = *static_cast<const AArch64FrameLowering *>(
11546 MF.getSubtarget().getFrameLowering());
11547 if (AFL.getArgumentStackToRestore(MF, MBB)) {
11548 Builder.addReg(AArch64::X17, RegState::ImplicitDefine);
11549 Builder.addReg(AArch64::X16, RegState::ImplicitDefine);
11550 if (Subtarget.hasPAuthLR())
11551 Builder.addReg(AArch64::X15, RegState::ImplicitDefine);
11552 return;
11553 }
11554
11555 if (AFI->branchProtectionPAuthLR() && !Subtarget.hasPAuthLR())
11556 Builder.addReg(AArch64::X16, RegState::ImplicitDefine);
11557}
11558
// Emits a stack-probing allocation loop after MBBI. Three new blocks are
// created after MBB: LoopTest (decrement SP by the probe size, compare with
// TargetReg, branch to Exit), LoopBody (load from [SP], branch back to
// LoopTest) and Exit (set SP to TargetReg and load from [SP]). The rest of the
// original block is spliced into Exit, and an iterator to Exit's first
// instruction is returned.
// NOTE(review): this listing omits lines 11559 (return type), 11581 (the
// initialiser of Flags), 11593, 11598, 11607-11609, 11623 (operands of the
// builders below) and 11634 -- confirm the missing pieces upstream.
11560AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
11561 Register TargetReg, bool FrameSetup) const {
11562 assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
11563
11564 MachineBasicBlock &MBB = *MBBI->getParent();
11565 MachineFunction &MF = *MBB.getParent();
11566 const AArch64InstrInfo *TII =
11567 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
11568 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
11569 DebugLoc DL = MBB.findDebugLoc(MBBI);
11570
// All three new blocks are inserted immediately after MBB, in the order
// LoopTest, LoopBody, Exit.
11571 MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
11572 MachineBasicBlock *LoopTestMBB =
11573 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11574 MF.insert(MBBInsertPoint, LoopTestMBB);
11575 MachineBasicBlock *LoopBodyMBB =
11576 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11577 MF.insert(MBBInsertPoint, LoopBodyMBB);
11578 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11579 MF.insert(MBBInsertPoint, ExitMBB);
11580 MachineInstr::MIFlag Flags =
// NOTE(review): listing line 11581 (the value assigned to Flags, presumably
// derived from FrameSetup) is missing here.
11582
11583 // LoopTest:
11584 // SUB SP, SP, #ProbeSize
11585 emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
11586 AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
11587
11588 // CMP SP, TargetReg
11589 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
11590 AArch64::XZR)
11591 .addReg(AArch64::SP)
11592 .addReg(TargetReg)
// NOTE(review): listing line 11593 (a further operand) is missing here.
11594 .setMIFlags(Flags);
11595
11596 // B.<Cond> LoopExit
11597 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
// NOTE(review): listing line 11598 (the condition-code operand) is missing
// here.
11599 .addMBB(ExitMBB)
11600 .setMIFlags(Flags);
11601
11602 // LDR XZR, [SP]
11603 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::LDRXui))
11604 .addDef(AArch64::XZR)
11605 .addReg(AArch64::SP)
11606 .addImm(0)
// NOTE(review): listing lines 11607-11609 (the start of the expression that
// ends in `Align(8)))`, presumably a memory operand) are missing here.
11610 Align(8)))
11611 .setMIFlags(Flags);
11612
11613 // B loop
11614 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
11615 .addMBB(LoopTestMBB)
11616 .setMIFlags(Flags);
11617
11618 // LoopExit:
11619 // MOV SP, TargetReg
11620 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
11621 .addReg(TargetReg)
11622 .addImm(0)
// NOTE(review): listing line 11623 (a further operand) is missing here.
11624 .setMIFlags(Flags);
11625
11626 // LDR XZR, [SP]
11627 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
11628 .addReg(AArch64::XZR, RegState::Define)
11629 .addReg(AArch64::SP)
11630 .addImm(0)
11631 .setMIFlags(Flags);
11632
// Everything after MBBI in the original block moves to the end of Exit.
11633 ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
// NOTE(review): listing line 11634 is missing here; presumably it transfers
// MBB's successors to ExitMBB -- confirm upstream.
11635
11636 LoopTestMBB->addSuccessor(ExitMBB);
11637 LoopTestMBB->addSuccessor(LoopBodyMBB);
11638 LoopBodyMBB->addSuccessor(LoopTestMBB);
11639 MBB.addSuccessor(LoopTestMBB);
11640
11641 // Update liveins.
11642 if (MF.getRegInfo().reservedRegsFrozen())
11643 fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});
11644
11645 return ExitMBB->begin();
11646}
11647
11648namespace {
11649class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
11650 MachineFunction *MF;
11651 const TargetInstrInfo *TII;
11652 const TargetRegisterInfo *TRI;
11653 MachineRegisterInfo &MRI;
11654
11655 /// The block of the loop
11656 MachineBasicBlock *LoopBB;
11657 /// The conditional branch of the loop
11658 MachineInstr *CondBranch;
11659 /// The compare instruction for loop control
11660 MachineInstr *Comp;
11661 /// The number of the operand of the loop counter value in Comp
11662 unsigned CompCounterOprNum;
11663 /// The instruction that updates the loop counter value
11664 MachineInstr *Update;
11665 /// The number of the operand of the loop counter value in Update
11666 unsigned UpdateCounterOprNum;
11667 /// The initial value of the loop counter
11668 Register Init;
11669 /// True iff Update is a predecessor of Comp
11670 bool IsUpdatePriorComp;
11671
11672 /// The normalized condition used by createTripCountGreaterCondition()
11674
11675public:
11676 AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
11677 MachineInstr *Comp, unsigned CompCounterOprNum,
11678 MachineInstr *Update, unsigned UpdateCounterOprNum,
11679 Register Init, bool IsUpdatePriorComp,
11680 const SmallVectorImpl<MachineOperand> &Cond)
11681 : MF(Comp->getParent()->getParent()),
11682 TII(MF->getSubtarget().getInstrInfo()),
11683 TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
11684 LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
11685 CompCounterOprNum(CompCounterOprNum), Update(Update),
11686 UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
11687 IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
11688
11689 bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
11690 // Make the instructions for loop control be placed in stage 0.
11691 // The predecessors of Comp are considered by the caller.
11692 return MI == Comp;
11693 }
11694
11695 std::optional<bool> createTripCountGreaterCondition(
11696 int TC, MachineBasicBlock &MBB,
11697 SmallVectorImpl<MachineOperand> &CondParam) override {
11698 // A branch instruction will be inserted as "if (Cond) goto epilogue".
11699 // Cond is normalized for such use.
11700 // The predecessors of the branch are assumed to have already been inserted.
11701 CondParam = Cond;
11702 return {};
11703 }
11704
11705 void createRemainingIterationsGreaterCondition(
11706 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11707 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
11708
11709 void setPreheader(MachineBasicBlock *NewPreheader) override {}
11710
/// Intentionally empty: \p TripCountAdjust is ignored by this
/// implementation.
11711 void adjustTripCount(int TripCountAdjust) override {}
11712
/// Always answers true for this loop-info implementation.
11713 bool isMVEExpanderSupported() override { return true; }
11714};
11715} // namespace
11716
11717/// Clone an instruction from MI. The register of ReplaceOprNum-th operand
11718/// is replaced by ReplaceReg. The output register is newly created.
11719/// The other operands are unchanged from MI.
11720static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
11721 Register ReplaceReg, MachineBasicBlock &MBB,
11722 MachineBasicBlock::iterator InsertTo) {
11723 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
11724 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
11725 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
11726 Register Result = 0;
11727 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
11728 if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
11729 Result = MRI.createVirtualRegister(
11730 MRI.getRegClass(NewMI->getOperand(0).getReg()));
11731 NewMI->getOperand(I).setReg(Result);
11732 } else if (I == ReplaceOprNum) {
11733 MRI.constrainRegClass(ReplaceReg, TII->getRegClass(NewMI->getDesc(), I));
11734 NewMI->getOperand(I).setReg(ReplaceReg);
11735 }
11736 }
11737 MBB.insert(InsertTo, NewMI);
11738 return Result;
11739}
11740
/// Append to the end of MBB the instructions that evaluate the loop exit
/// test for the coming iterations (the loops below run I = 0..TC), count how
/// often the exit condition holds with CSINCXr, and finally compare that
/// count against zero with SUBSXri. Cond is cleared before returning.
///
/// NOTE(review): this listing skips several source lines (its embedded
/// numbers jump 11741->11744, 11758->11760, 11761->11763, 11768->11770,
/// 11774->11776 and 11841->11843). Not shown here are: the parameter list
/// (the in-class declaration gives TC, MBB, Cond, LastStage0Insts), the
/// declaration of CC, the statement controlled by the `if` at 11761, the rest
/// of the AccumulateCond lambda header, the last CSINCXr operand, and the
/// statement that follows Cond.clear(). Consult the original file for them.
11741void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
11744 // Create and accumulate conditions for next TC iterations.
11745 // Example:
11746 // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
11747 // # iteration of the kernel
11748 //
11749 // # insert the following instructions
11750 // cond = CSINCXr 0, 0, C, implicit $nzcv
11751 // counter = ADDXri counter, 1 # clone from this->Update
11752 // SUBSXrr N, counter, implicit-def $nzcv # clone from this->Comp
11753 // cond = CSINCXr cond, cond, C, implicit $nzcv
11754 // ... (repeat TC times)
11755 // SUBSXri cond, 0, implicit-def $nzcv
11756
// Only Bcc-terminated loops are handled here.
11757 assert(CondBranch->getOpcode() == AArch64::Bcc);
11758 // CondCode to exit the loop
// NOTE(review): the left-hand side of this initialization (the declaration of
// CC) is on a line missing from the listing.
11760 (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
// NOTE(review): the statement guarded by this `if` is on a missing line;
// presumably it adjusts CC when the branch targets LoopBB - confirm.
11761 if (CondBranch->getOperand(1).getMBB() == LoopBB)
11763
11764 // Accumulate conditions to exit the loop
// The accumulator starts out as the zero register.
11765 Register AccCond = AArch64::XZR;
11766
11767 // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
// Each call appends one CSINCXr to MBB that defines a new virtual register.
11768 auto AccumulateCond = [&](Register CurCond,
11770 Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
11771 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
11772 .addReg(NewCond, RegState::Define)
11773 .addReg(CurCond)
11774 .addReg(CurCond)
11776 return NewCond;
11777 };
11778
// Case 1: the recorded stage-0 copy of Comp is located in MBB itself.
11779 if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
11780 // Update and Comp for I==0 already exist in MBB
11781 // (MBB is an unrolled kernel)
11782 Register Counter;
11783 for (int I = 0; I <= TC; ++I) {
11784 Register NextCounter;
// For I == 0 the existing compare is reused; later rounds clone Comp.
11785 if (I != 0)
11786 NextCounter =
11787 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11788
11789 AccCond = AccumulateCond(AccCond, CC);
11790
// No counter is needed after the final round.
11791 if (I != TC) {
11792 if (I == 0) {
11793 if (Update != Comp && IsUpdatePriorComp) {
11794 Counter =
11795 LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11796 NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
11797 MBB.end());
11798 } else {
11799 // can use already calculated value
11800 NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
11801 }
11802 } else if (Update != Comp) {
11803 NextCounter =
11804 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11805 }
11806 }
11807 Counter = NextCounter;
11808 }
11809 } else {
// Case 2: MBB does not contain the stage-0 compare.
11810 Register Counter;
11811 if (LastStage0Insts.empty()) {
11812 // use initial counter value (testing if the trip count is sufficient to
11813 // be executed by pipelined code)
11814 Counter = Init;
11815 if (IsUpdatePriorComp)
11816 Counter =
11817 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11818 } else {
11819 // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
11820 Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11821 }
11822
// Every round clones Comp; Update is cloned too, except after the last round
// or when Update and Comp are the same instruction.
11823 for (int I = 0; I <= TC; ++I) {
11824 Register NextCounter;
11825 NextCounter =
11826 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11827 AccCond = AccumulateCond(AccCond, CC);
11828 if (I != TC && Update != Comp)
11829 NextCounter =
11830 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11831 Counter = NextCounter;
11832 }
11833 }
11834
11835 // If AccCond == 0, the remainder is greater than TC.
// The SUBSXri result goes to XZR as a dead def; two zero immediates follow
// the register operand.
11836 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
11837 .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
11838 .addReg(AccCond)
11839 .addImm(0)
11840 .addImm(0);
// NOTE(review): the statement after clear() (listing line 11842) is missing
// from this listing.
11841 Cond.clear();
11843}
11844
11845static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
11846 Register &RegMBB, Register &RegOther) {
11847 assert(Phi.getNumOperands() == 5);
11848 if (Phi.getOperand(2).getMBB() == MBB) {
11849 RegMBB = Phi.getOperand(1).getReg();
11850 RegOther = Phi.getOperand(3).getReg();
11851 } else {
11852 assert(Phi.getOperand(4).getMBB() == MBB);
11853 RegMBB = Phi.getOperand(3).getReg();
11854 RegOther = Phi.getOperand(1).getReg();
11855 }
11856}
11857
11859 if (!Reg.isVirtual())
11860 return false;
11861 const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11862 return MRI.getVRegDef(Reg)->getParent() != BB;
11863}
11864
11865/// If Reg is an induction variable, return true and set some parameters
11866static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
11867 MachineInstr *&UpdateInst,
11868 unsigned &UpdateCounterOprNum, Register &InitReg,
11869 bool &IsUpdatePriorComp) {
11870 // Example:
11871 //
11872 // Preheader:
11873 // InitReg = ...
11874 // LoopBB:
11875 // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
11876 // Reg = COPY Reg0 ; COPY is ignored.
11877 // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
11878 // ; Reg is the value calculated in the previous
11879 // ; iteration, so IsUpdatePriorComp == false.
11880
11881 if (LoopBB->pred_size() != 2)
11882 return false;
11883 if (!Reg.isVirtual())
11884 return false;
11885 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
11886 UpdateInst = nullptr;
11887 UpdateCounterOprNum = 0;
11888 InitReg = 0;
11889 IsUpdatePriorComp = true;
11890 Register CurReg = Reg;
11891 while (true) {
11892 MachineInstr *Def = MRI.getVRegDef(CurReg);
11893 if (Def->getParent() != LoopBB)
11894 return false;
11895 if (Def->isCopy()) {
11896 // Ignore copy instructions unless they contain subregisters
11897 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
11898 return false;
11899 CurReg = Def->getOperand(1).getReg();
11900 } else if (Def->isPHI()) {
11901 if (InitReg != 0)
11902 return false;
11903 if (!UpdateInst)
11904 IsUpdatePriorComp = false;
11905 extractPhiReg(*Def, LoopBB, CurReg, InitReg);
11906 } else {
11907 if (UpdateInst)
11908 return false;
11909 switch (Def->getOpcode()) {
11910 case AArch64::ADDSXri:
11911 case AArch64::ADDSWri:
11912 case AArch64::SUBSXri:
11913 case AArch64::SUBSWri:
11914 case AArch64::ADDXri:
11915 case AArch64::ADDWri:
11916 case AArch64::SUBXri:
11917 case AArch64::SUBWri:
11918 UpdateInst = Def;
11919 UpdateCounterOprNum = 1;
11920 break;
11921 case AArch64::ADDSXrr:
11922 case AArch64::ADDSWrr:
11923 case AArch64::SUBSXrr:
11924 case AArch64::SUBSWrr:
11925 case AArch64::ADDXrr:
11926 case AArch64::ADDWrr:
11927 case AArch64::SUBXrr:
11928 case AArch64::SUBWrr:
11929 UpdateInst = Def;
11930 if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
11931 UpdateCounterOprNum = 1;
11932 else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
11933 UpdateCounterOprNum = 2;
11934 else
11935 return false;
11936 break;
11937 default:
11938 return false;
11939 }
11940 CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
11941 }
11942
11943 if (!CurReg.isVirtual())
11944 return false;
11945 if (Reg == CurReg)
11946 break;
11947 }
11948
11949 if (!UpdateInst)
11950 return false;
11951
11952 return true;
11953}
11954
/// Inspect LoopBB and, when it satisfies the conditions listed below, build
/// an AArch64PipelinerLoopInfo describing its branch, compare and counter
/// update; in every other case return nullptr.
///
/// NOTE(review): this listing skips several source lines (its embedded
/// numbers jump 11955->11957, 11971->11973, 11987->11989 and 11994->11996).
/// Not shown here are the function name and parameter list (presumably
/// AArch64InstrInfo::analyzeLoopForPipelining taking LoopBB - confirm), the
/// declarations of Cond and TRI, and the statement guarded by
/// `if (TBB == LoopBB)`. Consult the original file for them.
11955std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
11957 // Accept loops that meet the following conditions
11958 // * The conditional branch is BCC
11959 // * The compare instruction is ADDS/SUBS/WHILEXX
11960 // * One operand of the compare is an induction variable and the other is a
11961 // loop invariant value
11962 // * The induction variable is incremented/decremented by a single instruction
11963 // * Does not contain CALL or instructions which have unmodeled side effects
11964
11965 for (MachineInstr &MI : *LoopBB)
11966 if (MI.isCall() || MI.hasUnmodeledSideEffects())
11967 // This instruction may use NZCV, which interferes with the instruction to
11968 // be inserted for loop control.
11969 return nullptr;
11970
11971 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
// A true result from analyzeBranch is treated as "cannot handle this block".
11973 if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
11974 return nullptr;
11975
11976 // Infinite loops are not supported
11977 if (TBB == LoopBB && FBB == LoopBB)
11978 return nullptr;
11979
11980 // Must be conditional branch
11981 if (TBB != LoopBB && FBB == nullptr)
11982 return nullptr;
11983
11984 assert((TBB == LoopBB || FBB == LoopBB) &&
11985 "The Loop must be a single-basic-block loop");
11986
11987 MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
11989
11990 if (CondBranch->getOpcode() != AArch64::Bcc)
11991 return nullptr;
11992
11993 // Normalization for createTripCountGreaterCondition()
// NOTE(review): the statement controlled by this `if` is on a missing line;
// as printed, the `if` would wrongly appear to guard the declaration below.
11994 if (TBB == LoopBB)
11996
11997 MachineInstr *Comp = nullptr;
// 0 means "not decided yet"; see the operand check after the switch.
11998 unsigned CompCounterOprNum = 0;
// Scan backwards so that the last instruction writing NZCV is the one found.
11999 for (MachineInstr &MI : reverse(*LoopBB)) {
12000 if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
12001 // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
12002 // operands is a loop invariant value
12003
12004 switch (MI.getOpcode()) {
12005 case AArch64::SUBSXri:
12006 case AArch64::SUBSWri:
12007 case AArch64::ADDSXri:
12008 case AArch64::ADDSWri:
// Immediate forms: the counter is operand 1.
12009 Comp = &MI;
12010 CompCounterOprNum = 1;
12011 break;
12012 case AArch64::ADDSWrr:
12013 case AArch64::ADDSXrr:
12014 case AArch64::SUBSWrr:
12015 case AArch64::SUBSXrr:
12016 Comp = &MI;
12017 break;
12018 default:
12019 if (isWhileOpcode(MI.getOpcode())) {
12020 Comp = &MI;
12021 break;
12022 }
12023 return nullptr;
12024 }
12025
// Two-register compares: the counter is the operand that is not the one
// defined outside the loop block.
12026 if (CompCounterOprNum == 0) {
12027 if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
12028 CompCounterOprNum = 2;
12029 else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
12030 CompCounterOprNum = 1;
12031 else
12032 return nullptr;
12033 }
12034 break;
12035 }
12036 }
12037 if (!Comp)
12038 return nullptr;
12039
12040 MachineInstr *Update = nullptr;
12041 Register Init;
12042 bool IsUpdatePriorComp;
12043 unsigned UpdateCounterOprNum;
// The compare's counter operand must be an induction variable; getIndVarInfo
// fills in the locals declared above.
12044 if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
12045 Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
12046 return nullptr;
12047
12048 return std::make_unique<AArch64PipelinerLoopInfo>(
12049 LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
12050 Init, IsUpdatePriorComp, Cond);
12051}
12052
12053/// verifyInstruction - Perform target specific instruction verification.
///
/// Returns true when \p MI passes every check below. On failure, false is
/// returned and \p ErrInfo is set to a message naming the failed check.
///
/// NOTE(review): this listing skips source lines 12076, 12082 and 12084 (its
/// embedded numbers jump over them). Judging by the error strings they hold
/// the two `case` labels of the switch and the first part of the MSL shift
/// condition - consult the original file.
12054bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI,
12055 StringRef &ErrInfo) const {
12056 // Verify that immediate offsets on load/store instructions are within range.
12057 // Stack objects with an FI operand are excluded as they can be fixed up
12058 // during PEI.
12059 TypeSize Scale(0U, false), Width(0U, false);
12060 int64_t MinOffset, MaxOffset;
12061 if (getMemOpInfo(MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) {
12062 unsigned ImmIdx = getLoadStoreImmIdx(MI.getOpcode());
// The operand just before the immediate is checked for being a frame index.
12063 if (MI.getOperand(ImmIdx).isImm() && !MI.getOperand(ImmIdx - 1).isFI()) {
12064 int64_t Imm = MI.getOperand(ImmIdx).getImm();
12065 if (Imm < MinOffset || Imm > MaxOffset) {
12066 ErrInfo = "Unexpected immediate on load/store instruction";
12067 return false;
12068 }
12069 }
12070 }
12071
// Per-operand checks, driven by the operand types in the instruction
// description.
12072 const MCInstrDesc &MCID = MI.getDesc();
12073 for (unsigned Op = 0; Op < MCID.getNumOperands(); Op++) {
12074 const MachineOperand &MO = MI.getOperand(Op);
12075 switch (MCID.operands()[Op].OperandType) {
// NOTE(review): the `case` label for this check is on a missing line.
12077 if (!MO.isImm() || MO.getImm() != 0) {
12078 ErrInfo = "OPERAND_IMPLICIT_IMM_0 should be 0";
12079 return false;
12080 }
12081 break;
// NOTE(review): the `case` label and one clause of this condition are on
// missing lines.
12083 if (!MO.isImm() ||
12085 (AArch64_AM::getShiftValue(MO.getImm()) != 8 &&
12086 AArch64_AM::getShiftValue(MO.getImm()) != 16)) {
12087 ErrInfo = "OPERAND_SHIFT_MSL should be msl shift of 8 or 16";
12088 return false;
12089 }
12090 break;
12091 default:
12092 break;
12093 }
12094 }
12095 return true;
12096}
12097
12098#define GET_INSTRINFO_HELPERS
12099#define GET_INSTRMAP_INFO
12100#include "AArch64GenInstrInfo.inc"
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, unsigned NumRegs)
static cl::opt< unsigned > BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of Bcc instructions (DEBUG)"))
static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned MnegOpc, const TargetRegisterClass *RC)
genNeg - Helper to generate an intermediate negation of the second operand of Root
static bool isFrameStoreOpcode(int Opcode)
static cl::opt< unsigned > GatherOptSearchLimit("aarch64-search-limit", cl::Hidden, cl::init(2048), cl::desc("Restrict range of instructions to search for the " "machine-combiner gather pattern optimization"))
static bool getMaddPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find instructions that can be turned into madd.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr)
Find a condition code used by the instruction.
static MachineInstr * genFusedMultiplyAcc(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyAcc - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genFusedMultiplyAccNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate64(unsigned Opc)
static bool isFrameLoadOpcode(int Opcode)
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg)
static bool areCFlagsAccessedBetweenInstrs(MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, const TargetRegisterInfo *TRI, const AccessKind AccessToCheck=AK_All)
True when condition flags are accessed (either by writing or reading) on the instruction trace starti...
static bool getFMAPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Floating-Point Support.
static bool isADDSRegImm(unsigned Opcode)
static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI)
static bool isANDOpcode(MachineInstr &MI)
static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment, StringRef RegScale={})
static unsigned sForm(MachineInstr &Instr)
Get opcode of S version of Instr.
static bool isCombineInstrSettingFlag(unsigned Opc)
static bool getFNEGPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB, MachineInstr *&UpdateInst, unsigned &UpdateCounterOprNum, Register &InitReg, bool &IsUpdatePriorComp)
If Reg is an induction variable, return true and set some parameters.
static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc)
static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget, MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
Returns true if in a streaming call site region without SME-FA64.
static bool isPostIndexLdStOpcode(unsigned Opcode)
Return true if the opcode is a post-index ld/st instruction, which really loads from base+0.
static std::optional< unsigned > getLFIInstSizeInBytes(const MachineInstr &MI)
Return the maximum number of bytes of code the specified instruction may be after LFI rewriting.
static unsigned getBranchDisplacementBits(unsigned Opc)
static cl::opt< unsigned > CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9), cl::desc("Restrict range of CB instructions (DEBUG)"))
static std::optional< ParamLoadedValue > describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
If the given ORR instruction is a copy, and DescribedReg overlaps with the destination register then,...
static bool getFMULPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static void appendReadRegExpr(SmallVectorImpl< char > &Expr, unsigned RegNum)
static MachineInstr * genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, const TargetRegisterClass *RC)
genMaddR - Generate madd instruction and combine mul and add using an extra virtual register Example ...
static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, Register ReplaceReg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertTo)
Clone an instruction from MI.
static bool scaleOffset(unsigned Opc, int64_t &Offset)
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc)
unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale)
static MachineInstr * genFusedMultiplyIdx(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyIdx - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genIndexedMultiply(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxDupOp, unsigned MulOpc, const TargetRegisterClass *RC, MachineRegisterInfo &MRI)
Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
static bool isSUBSRegImm(unsigned Opcode)
static bool UpdateOperandRegClass(MachineInstr &Instr)
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
static bool isInStreamingCallSiteRegion(MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
Returns true if the instruction at I is in a streaming call site region, within a single basic block.
static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, int CmpValue, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > &CCUseInstrs, bool &IsInvertCC)
unsigned unscaledOffsetOpcode(unsigned Opcode)
static bool getLoadPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Search for patterns of LD instructions we can optimize.
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI)
Check if CmpInstr can be substituted by MI.
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC)
static bool isCombineInstrCandidateFP(const MachineInstr &Inst)
static void appendLoadRegExpr(SmallVectorImpl< char > &Expr, int64_t OffsetFromDefCFA)
static void appendConstantExpr(SmallVectorImpl< char > &Expr, int64_t Constant, dwarf::LocationAtom Operation)
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI)
Return the opcode that does not set flags when possible - otherwise return the original opcode.
static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool isCombineInstrCandidate32(unsigned Opc)
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, SmallVectorImpl< MachineOperand > &Cond)
static unsigned offsetExtendOpcode(unsigned Opcode)
MachineOutlinerMBBFlags
@ LRUnavailableSomewhere
@ UnsafeRegsDead
static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register DestReg, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
static void generateGatherLanePattern(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned Pattern, unsigned NumLanes)
Generate optimized instruction sequence for gather load patterns to improve Memory-Level Parallelism ...
static bool getMiscPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find other MI combine patterns.
static bool outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static const MachineInstrBuilder & AddSubReg(const MachineInstrBuilder &MIB, MCRegister Reg, unsigned SubIdx, RegState State, const TargetRegisterInfo *TRI)
static bool outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, int64_t Offset1, unsigned Opcode1, int FI2, int64_t Offset2, unsigned Opcode2)
static cl::opt< unsigned > TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"))
static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB, Register &RegMBB, Register &RegOther)
static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, unsigned Reg, const StackOffset &Offset)
static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB)
static MachineInstr * genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC, FMAInstKind kind=FMAInstKind::Default, const Register *ReplacedAddend=nullptr)
genFusedMultiply - Generate fused multiply instructions.
static bool getGatherLanePattern(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, unsigned LoadLaneOpCode, unsigned NumLanes)
Check if the given instruction forms a gather load pattern that can be optimized for better Memory-Le...
static MachineInstr * genFusedMultiplyIdxNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate(unsigned Opc)
static unsigned regOffsetOpcode(unsigned Opcode)
MachineOutlinerClass
Constants defining how certain sequences should be outlined.
@ MachineOutlinerTailCall
Emit a save, restore, call, and return.
@ MachineOutlinerRegSave
Emit a call and tail-call.
@ MachineOutlinerNoLRSave
Only emit a branch.
@ MachineOutlinerThunk
Emit a call and return.
@ MachineOutlinerDefault
static cl::opt< unsigned > BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26), cl::desc("Restrict range of B instructions (DEBUG)"))
static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB)
Check if AArch64::NZCV should be alive in successors of MBB.
static void emitFrameOffsetAdj(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, int64_t Offset, unsigned Opc, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFAOffset, StackOffset CFAOffset, unsigned FrameReg)
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize)
static cl::opt< unsigned > CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"))
static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, unsigned IdxOpd1, DenseMap< Register, unsigned > &InstrIdxForVirtReg)
Do the following transformation A - (B + C) ==> (A - B) - C A - (B + C) ==> (A - C) - B.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, unsigned *NewReg=nullptr)
static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, const AArch64InstrInfo *TII, bool ShouldSignReturnAddr)
static MachineInstr * genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs)
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc, unsigned ZeroReg)
static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register SrcReg, bool IsKill, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
DXIL Forward Handle Accesses
@ Default
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Machine Check Debug Module
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
PowerPC Reduce CR logical Operation
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:484
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition Debug.h:72
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, unsigned CombineOpc=0)
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
SignReturnAddress getSignReturnAddressCondition() const
void setOutliningStyle(const std::string &Style)
std::optional< bool > hasRedZone() const
static bool shouldSignReturnAddress(SignReturnAddress Condition, bool IsLRSpilled)
static bool isHForm(const MachineInstr &MI)
Returns whether the instruction is in H form (16 bit operands)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool hasBTISemantics(const MachineInstr &MI)
Returns whether the instruction can be compatible with non-zero BTYPE.
static bool isQForm(const MachineInstr &MI)
Returns whether the instruction is in Q form (128 bit operands)
static bool getMemOpInfo(unsigned Opcode, TypeSize &Scale, TypeSize &Width, int64_t &MinOffset, int64_t &MaxOffset)
Returns true if opcode Opc is a memory operation.
static bool isTailCallReturnInst(const MachineInstr &MI)
Returns true if MI is one of the TCRETURN* instructions.
static bool isFPRCopy(const MachineInstr &MI)
Does this instruction rename an FPR without modifying bits?
MachineInstr * emitLdStWithAddr(MachineInstr &MemI, const ExtAddrMode &AM) const override
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies value from one register to an...
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
GetInstSize - Return the number of bytes of code the specified instruction may be.
static bool isZExtLoad(const MachineInstr &MI)
Returns whether the instruction is a zero-extending load.
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
static bool isGPRCopy(const MachineInstr &MI)
Does this instruction rename a GPR without modifying bits?
static unsigned convertToFlagSettingOpc(unsigned Opc)
Return the opcode that set flags when possible.
void createPauthEpilogueInstr(MachineBasicBlock &MBB, DebugLoc DL) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
bool canInsertSelect(const MachineBasicBlock &, ArrayRef< MachineOperand > Cond, Register, Register, Register, int &, int &, int &) const override
Register isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
static const MachineOperand & getLdStOffsetOp(const MachineInstr &MI)
Returns the immediate offset operator of a load/store.
bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, unsigned &SubIdx) const override
static std::optional< unsigned > getUnscaledLdSt(unsigned Opc)
Returns the unscaled load/store for the scaled load/store opcode, if there is a corresponding unscale...
static bool hasUnscaledLdStOffset(unsigned Opc)
Return true if it has an unscaled load/store offset.
static const MachineOperand & getLdStAmountOp(const MachineInstr &MI)
Returns the shift amount operator of a load/store.
static bool isPreLdSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load/store.
std::optional< ExtAddrMode > getAddrModeFromMemoryOp(const MachineInstr &MemI, const TargetRegisterInfo *TRI) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &MI, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const override
bool analyzeBranchPredicate(MachineBasicBlock &MBB, MachineBranchPredicate &MBP, bool AllowModify) const override
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isPairableLdStInst(const MachineInstr &MI)
Return true if the given load or store may be paired with another.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSExtLoad(const MachineInstr &MI)
Returns whether the instruction is a sign-extending load.
const AArch64RegisterInfo & getRegisterInfo() const
getRegisterInfo - TargetInstrInfo is a superset of MRegister info.
static bool isPreSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed store.
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
AArch64InstrInfo(const AArch64Subtarget &STI)
static bool isPairedLdSt(const MachineInstr &MI)
Returns whether the instruction is a paired load/store.
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
bool getMemOperandWithOffsetWidth(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, TypeSize &Width, const TargetRegisterInfo *TRI) const
If OffsetIsScalable is set to 'true', the offset is scaled by vscale.
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isStridedAccess(const MachineInstr &MI)
Return true if the given load or store is a strided memory access.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
Detect opportunities for ldp/stp formation.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
bool isThroughputPattern(unsigned Pattern) const override
Return true when a code sequence can improve throughput.
MachineOperand & getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const
Return the immediate offset of the base register in a load/store LdSt.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg, const MachineInstr &AddrI, ExtAddrMode &AM) const override
static bool isLdStPairSuppressed(const MachineInstr &MI)
Return true if pairing the given load or store is hinted to be unprofitable.
Register isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
optimizeCompareInstr - Convert the instruction supplying the argument to the comparison into one that...
static unsigned getLoadStoreImmIdx(unsigned Opc)
Returns the index for the immediate for a given instruction.
static bool isGPRZero(const MachineInstr &MI)
Does this instruction set its full destination register to zero?
void copyGPRRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, unsigned ZeroReg, llvm::ArrayRef< unsigned > Indices) const
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
analyzeCompare - For a comparison instruction, return the source registers in SrcReg and SrcReg2,...
CombinerObjective getCombinerObjective(unsigned Pattern) const override
static bool isFpOrNEON(Register Reg)
Returns whether the physical register is FP or NEON.
bool isAsCheapAsAMove(const MachineInstr &MI) const override
std::optional< DestSourcePair > isCopyLikeInstrImpl(const MachineInstr &MI) const override
static void suppressLdStPair(MachineInstr &MI)
Hint that pairing the given load or store is unprofitable.
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static bool isPreLd(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load.
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef< unsigned > Indices) const
bool optimizeCondBranch(MachineInstr &MI) const override
Replace csincr-branch sequence by simple conditional branch.
static int getMemScale(unsigned Opc)
Scaling factor for (scaled or unscaled) load or store.
bool isCandidateToMergeOrPair(const MachineInstr &MI) const
Return true if this is a load/store that can be potentially paired/merged.
MCInst getNop() const override
static const MachineOperand & getLdStBaseOp(const MachineInstr &MI)
Returns the base register operand of a load/store.
bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
This is an important base class in LLVM.
Definition Constant.h:43
A debug info location.
Definition DebugLoc.h:124
bool empty() const
Definition DenseMap.h:173
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:286
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:714
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:711
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
static LocationSize precise(uint64_t Value)
This class is intended to be used as a base class for asm properties and features specific to the tar...
Definition MCAsmInfo.h:66
bool usesWindowsCFI() const
Definition MCAsmInfo.h:674
static MCCFIInstruction cfiDefCfa(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa defines a rule for computing CFA as: take address from Register and add Offset to it.
Definition MCDwarf.h:615
static MCCFIInstruction createOffset(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_offset Previous value of Register is saved at offset Offset from CFA.
Definition MCDwarf.h:657
static MCCFIInstruction cfiDefCfaOffset(MCSymbol *L, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa_offset modifies a rule for computing CFA.
Definition MCDwarf.h:630
static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals, SMLoc Loc={}, StringRef Comment="")
.cfi_escape Allows the user to add arbitrary bytes to the unwind info.
Definition MCDwarf.h:727
Instances of this class represent a single low-level machine instruction.
Definition MCInst.h:188
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isValid() const
Definition MCRegister.h:84
static constexpr unsigned NoRegister
Definition MCRegister.h:60
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1561
Set of metadata that should be preserved when using BuildMI().
bool isInlineAsmBrIndirectTarget() const
Returns true if this is the indirect dest of an INLINEASM_BR.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
reverse_instr_iterator instr_rbegin()
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
reverse_instr_iterator instr_rend()
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
void setStackID(int ObjectIdx, uint8_t ID)
bool isCalleeSavedInfoValid() const
Has the callee saved info been calculated yet?
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
unsigned getNumObjects() const
Return the number of objects.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
unsigned addFrameInst(const MCCFIInstruction &Inst)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addCFIIndex(unsigned CFIIndex) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
reverse_iterator getReverse() const
Get a reverse iterator to the same node.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
bool isCall(QueryType Type=AnyInBundle) const
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
LLVM_ABI uint32_t mergeFlagsWith(const MachineInstr &Other) const
Return the MIFlags which represent both MachineInstrs.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool registerDefIsDead(Register Reg, const TargetRegisterInfo *TRI) const
Returns true if the register is dead in this machine instruction.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI bool isLoadFoldBarrier() const
Returns true if it is illegal to fold a load across this instruction.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void addRegisterDefined(Register Reg, const TargetRegisterInfo *RegInfo=nullptr)
We have determined MI defines a register.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
A description of a memory reference used in the backend.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
This class contains meta information specific to a module.
LLVM_ABI MachineFunction * getMachineFunction(const Function &F) const
Returns the MachineFunction associated to IR function F if there is one, otherwise nullptr.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineBasicBlock * getMBB() const
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
bool tracksLiveness() const
tracksLiveness - Returns true when tracking register liveness accurately.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
bool def_empty(Register RegNo) const
def_empty - Return true if there are no instructions defining the specified register (it may be live-...
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
bool hasOneDef(Register RegNo) const
Return true if there is exactly one operand defining the specified register.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
MI-level patchpoint operands.
Definition StackMaps.h:77
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given patchpoint should emit.
Definition StackMaps.h:105
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static constexpr bool isVirtualRegister(unsigned Reg)
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:66
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents a location in source code.
Definition SMLoc.h:22
bool erase(PtrType Ptr)
Remove pointer from the set.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
bool empty() const
Definition SmallSet.h:169
bool erase(const T &V)
Definition SmallSet.h:200
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
MI-level stackmap operands.
Definition StackMaps.h:36
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given stackmap should emit.
Definition StackMaps.h:51
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
int64_t getFixed() const
Returns the fixed component of the stack.
Definition TypeSize.h:46
int64_t getScalable() const
Returns the scalable component of the stack.
Definition TypeSize.h:49
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:41
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
MI-level Statepoint operands.
Definition StackMaps.h:159
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given statepoint should emit.
Definition StackMaps.h:208
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
Object returned by analyzeLoopForPipelining.
TargetInstrInfo - Interface to description of machine instruction set.
virtual void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstIdxForVirtReg) const
When getMachineCombinerPatterns() finds patterns, this function generates the instructions that could...
virtual std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const
Produce the expression describing the MI loading a value into the physical register Reg.
virtual bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
virtual bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const
Test if the given instruction should be considered a scheduling boundary.
virtual CombinerObjective getCombinerObjective(unsigned Pattern) const
Return the objective of a combiner pattern.
virtual bool isFunctionSafeToSplit(const MachineFunction &MF) const
Return true if the function is a viable candidate for machine function splitting.
const MCAsmInfo & getMCAsmInfo() const
Return target specific asm information.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
Value * getOperand(unsigned i) const
Definition User.h:207
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
self_iterator getIterator()
Definition ilist_node.h:123
A raw_ostream that writes to an std::string.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_S
MO_S - Indicates that the bits of the symbol operand represented by MO_G0 etc are signed.
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_PREL
MO_PREL - Indicates that the bits of the symbol operand represented by MO_G0 etc are PC relative.
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_ARM64EC_CALLMANGLE
MO_ARM64EC_CALLMANGLE - Operand refers to the Arm64EC-mangled version of a symbol,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_TAGGED
MO_TAGGED - With MO_PAGE, indicates that the page includes a memory tag in bits 56-63.
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
unsigned getCheckerSizeInBytes(AuthCheckMethod Method)
Returns the number of bytes added by checkAuthenticatedRegister.
static uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize)
decodeLogicalImmediate - Decode a logical immediate value in the form "N:immr:imms" (where the immr a...
static unsigned getShiftValue(unsigned Imm)
getShiftValue - Extract the shift value.
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static unsigned getArithShiftValue(unsigned Imm)
getArithShiftValue - get the arithmetic shift value.
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static AArch64_AM::ShiftExtendType getExtendType(unsigned Imm)
getExtendType - Extract the extend type for operands of arithmetic ops.
static AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm)
static AArch64_AM::ShiftExtendType getShiftType(unsigned Imm)
getShiftType - Extract the shift type.
static unsigned getShifterImm(AArch64_AM::ShiftExtendType ST, unsigned Imm)
getShifterImm - Encode the shift type and amount: imm: 6-bit shift amount shifter: 000 ==> lsl 001 ==...
void expandMOVAddr(unsigned Opcode, unsigned TargetFlags, bool IsTargetMachO, SmallVectorImpl< AddrInsnModel > &Insn)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
static const uint64_t InstrFlagIsWhile
static const uint64_t InstrFlagIsPTestLike
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
constexpr double e
InstrType
Represents how an instruction should be mapped by the outliner.
NodeAddr< InstrNode * > Instr
Definition RDFGraph.h:389
iterator end() const
Definition BasicBlock.h:89
LLVM_ABI Instruction & back() const
LLVM_ABI iterator begin() const
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Offset
Definition DWP.cpp:558
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
static bool isCondBranchOpcode(int Opc)
MCCFIInstruction createDefCFA(const TargetRegisterInfo &TRI, unsigned FrameReg, unsigned Reg, const StackOffset &Offset, bool LastAdjustmentWasScalable=true)
static bool isPTrueOpcode(unsigned Opc)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool succeeded(LogicalResult Result)
Utility function that returns true if the provided LogicalResult corresponds to a success value.
int isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &Offset, bool *OutUseUnscaledOp=nullptr, unsigned *OutUnscaledOp=nullptr, int64_t *EmittableOffset=nullptr)
Check if the Offset is a valid frame offset for MI.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Renamable
Register that may be renamed.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
static bool isIndirectBranchOpcode(int Opc)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
unsigned getBLRCallOpcode(const MachineFunction &MF)
Return opcode to be used for indirect calls.
@ AArch64FrameOffsetIsLegal
Offset is legal.
@ AArch64FrameOffsetCanUpdate
Offset can apply, at least partly.
@ AArch64FrameOffsetCannotUpdate
Offset cannot apply.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
Op::Description Desc
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
static bool isSEHInstruction(const MachineInstr &MI)
bool isLFIPrePostMemAccess(unsigned Opcode)
Returns true if Opcode is a pre- or post-indexed memory access that the LFI rewriter expands with a b...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
AArch64MachineCombinerPattern
@ MULSUBv8i16_OP2
@ FMULv4i16_indexed_OP1
@ FMLSv1i32_indexed_OP2
@ MULSUBv2i32_indexed_OP1
@ FMLAv2i32_indexed_OP2
@ MULADDv4i16_indexed_OP2
@ FMLAv1i64_indexed_OP1
@ MULSUBv16i8_OP1
@ FMLAv8i16_indexed_OP2
@ FMULv2i32_indexed_OP1
@ MULSUBv8i16_indexed_OP2
@ FMLAv1i64_indexed_OP2
@ MULSUBv4i16_indexed_OP2
@ FMLAv1i32_indexed_OP1
@ FMLAv2i64_indexed_OP2
@ FMLSv8i16_indexed_OP1
@ MULSUBv2i32_OP1
@ FMULv4i16_indexed_OP2
@ MULSUBv4i32_indexed_OP2
@ FMULv2i64_indexed_OP2
@ FMLAv4i32_indexed_OP1
@ MULADDv4i16_OP2
@ FMULv8i16_indexed_OP2
@ MULSUBv4i16_OP1
@ MULADDv4i32_OP2
@ MULADDv2i32_OP2
@ MULADDv16i8_OP2
@ FMLSv4i16_indexed_OP1
@ MULADDv16i8_OP1
@ FMLAv2i64_indexed_OP1
@ FMLAv1i32_indexed_OP2
@ FMLSv2i64_indexed_OP2
@ MULADDv2i32_OP1
@ MULADDv4i32_OP1
@ MULADDv2i32_indexed_OP1
@ MULSUBv16i8_OP2
@ MULADDv4i32_indexed_OP1
@ MULADDv2i32_indexed_OP2
@ FMLAv4i16_indexed_OP2
@ MULSUBv8i16_OP1
@ FMULv2i32_indexed_OP2
@ FMLSv2i32_indexed_OP2
@ FMLSv4i32_indexed_OP1
@ FMULv2i64_indexed_OP1
@ MULSUBv4i16_OP2
@ FMLSv4i16_indexed_OP2
@ FMLAv2i32_indexed_OP1
@ FMLSv2i32_indexed_OP1
@ FMLAv8i16_indexed_OP1
@ MULSUBv4i16_indexed_OP1
@ FMLSv4i32_indexed_OP2
@ MULADDv4i32_indexed_OP2
@ MULSUBv4i32_OP2
@ MULSUBv8i16_indexed_OP1
@ MULADDv8i16_OP2
@ MULSUBv2i32_indexed_OP2
@ FMULv4i32_indexed_OP2
@ FMLSv2i64_indexed_OP1
@ MULADDv4i16_OP1
@ FMLAv4i32_indexed_OP2
@ MULADDv8i16_indexed_OP1
@ FMULv4i32_indexed_OP1
@ FMLAv4i16_indexed_OP1
@ FMULv8i16_indexed_OP1
@ MULADDv8i16_OP1
@ MULSUBv4i32_indexed_OP1
@ MULSUBv4i32_OP1
@ FMLSv8i16_indexed_OP2
@ MULADDv8i16_indexed_OP2
@ MULSUBv2i32_OP2
@ FMLSv1i64_indexed_OP2
@ MULADDv4i16_indexed_OP1
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag=MachineInstr::NoFlags, bool SetNZCV=false, bool NeedsWinCFI=false, bool *HasWinCFI=nullptr, bool EmitCFAOffset=false, StackOffset InitialOffset={}, unsigned FrameReg=AArch64::SP)
emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg plus Offset.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr RegState getDefRegState(bool B)
CombinerObjective
The combiner's goal may differ based on which pattern it is attempting to optimize.
std::optional< UsedNZCV > examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > *CCUseInstrs=nullptr)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
static MCRegister getXRegFromWReg(MCRegister Reg)
MCCFIInstruction createCFAOffset(const TargetRegisterInfo &MRI, unsigned Reg, const StackOffset &OffsetFromDefCFA, std::optional< int64_t > IncomingVGOffsetFromDefCFA)
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
static bool isUncondBranchOpcode(int Opc)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2191
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, StackOffset &Offset, const AArch64InstrInfo *TII)
rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the FP.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
static const MachineMemOperand::Flags MOSuppressPair
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
void appendLEB128(SmallVectorImpl< U > &Buffer, T Value)
Definition LEB128.h:236
bool optimizeTerminators(MachineBasicBlock *MBB, const TargetInstrInfo &TII)
std::pair< MachineOperand, DIExpression * > ParamLoadedValue
bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, const MachineInstr &UseMI, const TargetRegisterInfo *TRI)
Return true if there is an instruction /after/ DefMI and before UseMI which either reads or clobbers ...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
static const MachineMemOperand::Flags MOStridedAccess
constexpr RegState getUndefRegState(bool B)
void fullyRecomputeLiveIns(ArrayRef< MachineBasicBlock * > MBBs)
Convenience function for recomputing live-ins for a set of MBBs until the computation converges.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to describe addressing mode similar to ExtAddrMode in CodeGenPrepare.
LLVM_ABI static const MBBSectionID ColdSectionID
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
An individual sequence of instructions to be replaced with a call to an outlined function.
MachineFunction * getMF() const
The information necessary to create an outlined function for some class of candidate.