//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64PointerAuth.h"
#include "AArch64Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/LEB128.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>
using namespace llvm;

#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"

#define DEBUG_TYPE "AArch64InstrInfo"

STATISTIC(NumCopyInstrs, "Number of COPY instructions expanded");
STATISTIC(NumZCRegMoveInstrsGPR, "Number of zero-cycle GPR register move "
                                 "instructions expanded from canonical COPY");
STATISTIC(NumZCRegMoveInstrsFPR, "Number of zero-cycle FPR register move "
                                 "instructions expanded from canonical COPY");
STATISTIC(NumZCZeroingInstrsGPR, "Number of zero-cycle GPR zeroing "
                                 "instructions expanded from canonical COPY");
// NumZCZeroingInstrsFPR is counted in AArch64AsmPrinter.
static cl::opt<unsigned>
    CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9),
                       cl::desc("Restrict range of CB instructions (DEBUG)"));

static cl::opt<unsigned> TBZDisplacementBits(
    "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
    cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned> CBZDisplacementBits(
    "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
    cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned>
    BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
                        cl::desc("Restrict range of Bcc instructions (DEBUG)"));

static cl::opt<unsigned>
    BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
                      cl::desc("Restrict range of B instructions (DEBUG)"));

101 "aarch64-search-limit", cl::Hidden, cl::init(2048),
102 cl::desc("Restrict range of instructions to search for the "
103 "machine-combiner gather pattern optimization"));

AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
    : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN,
                          AArch64::ADJCALLSTACKUP, AArch64::CATCHRET),
      RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}

/// Return the maximum number of bytes of code the specified instruction may be
/// after LFI rewriting. If the instruction is not rewritten, std::nullopt is
/// returned (use default sizing).
///
/// NOTE: the size estimates here must be kept in sync with the rewrites in
/// AArch64MCLFIRewriter.cpp. Sizes may be overestimates of the rewritten
/// instruction sequences.
static std::optional<unsigned> getLFIInstSizeInBytes(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AArch64::SVC:
    // SVC expands to 4 instructions.
    return 16;
  case AArch64::BR:
  case AArch64::BLR:
    // Indirect branches/calls expand to 2 instructions (guard + br/blr).
    return 8;
  case AArch64::RET:
    // RET through LR is not rewritten, but RET through another register
    // expands to 2 instructions (guard + ret).
    if (MI.getOperand(0).getReg() != AArch64::LR)
      return 8;
    return 4;
  default:
    break;
  }

  // Instructions that explicitly modify LR expand to 2 instructions.
  for (const MachineOperand &MO : MI.explicit_operands())
    if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::LR)
      return 8;

  // Default case: instructions that don't cause expansion.
  // - TP accesses in LFI are a single load/store, so no expansion.
  // - All remaining instructions are not rewritten.
  return std::nullopt;
}
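
// Worked example of the sizing above (illustrative): an indirect call
//   blr x8
// is rewritten by LFI into a guard instruction followed by the call, i.e.
//   <guard on x8>
//   blr x8
// so BLR conservatively reports 2 * 4 = 8 bytes instead of the usual 4.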

/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be. This returns the maximum number of bytes.
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  const MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction *MF = MBB.getParent();
  const Function &F = MF->getFunction();
  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();

  {
    auto Op = MI.getOpcode();
    if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
      return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
  }

  // Meta-instructions emit no code.
  if (MI.isMetaInstruction())
    return 0;

  // FIXME: We currently only handle pseudoinstructions that don't get expanded
  // before the assembly printer.
  unsigned NumBytes = 0;
  const MCInstrDesc &Desc = MI.getDesc();

  // LFI rewriter expansions that supersede normal sizing.
  const auto &STI = MF->getSubtarget<AArch64Subtarget>();
  if (STI.isLFI())
    if (auto Size = getLFIInstSizeInBytes(MI))
      return *Size;

  if (!MI.isBundle() && isTailCallReturnInst(MI)) {
    NumBytes = Desc.getSize() ? Desc.getSize() : 4;

    const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
    if (!MFI->shouldSignReturnAddress(*MF))
      return NumBytes;

    auto Method = STI.getAuthenticatedLRCheckMethod(*MF);
    NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
    return NumBytes;
  }

  // Size should be preferably set in
  // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
  // Specific cases handle instructions of variable sizes.
  switch (Desc.getOpcode()) {
  default:
    if (Desc.getSize())
      return Desc.getSize();

    // Anything not explicitly designated otherwise (i.e. pseudo-instructions
    // with fixed constant size but not specified in .td file) is a normal
    // 4-byte insn.
    NumBytes = 4;
    break;
  case TargetOpcode::STACKMAP:
    // The upper bound for a stackmap intrinsic is the full length of its
    // shadow.
    NumBytes = StackMapOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::PATCHPOINT:
    // The size of the patchpoint intrinsic is the number of bytes requested.
    NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::STATEPOINT:
    NumBytes = StatepointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    // No patch bytes means a normal call inst is emitted.
    if (NumBytes == 0)
      NumBytes = 4;
    break;
  case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
    // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
    // instructions are expanded to the specified number of NOPs. Otherwise,
    // they are expanded to 36-byte XRay sleds.
    NumBytes =
        F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
    break;
  case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
  case TargetOpcode::PATCHABLE_TAIL_CALL:
  case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
    // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
    NumBytes = 36;
    break;
  case TargetOpcode::PATCHABLE_EVENT_CALL:
    // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
    NumBytes = 24;
    break;

  case AArch64::SPACE:
    NumBytes = MI.getOperand(1).getImm();
    break;
  case TargetOpcode::BUNDLE:
    NumBytes = getInstBundleSize(MI);
    break;
  }

  return NumBytes;
}

static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
                            SmallVectorImpl<MachineOperand> &Cond) {
  // Block ends with fall-through condbranch.
  switch (LastInst->getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    Target = LastInst->getOperand(2).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
    break;
  case AArch64::CBWPri:
  case AArch64::CBXPri:
  case AArch64::CBWPrr:
  case AArch64::CBXPrr:
    Target = LastInst->getOperand(3).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
    Cond.push_back(LastInst->getOperand(2));
    break;
  case AArch64::CBBAssertExt:
  case AArch64::CBHAssertExt:
    Target = LastInst->getOperand(3).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));                    // -1
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); // Opc
    Cond.push_back(LastInst->getOperand(0));                          // Cond
    Cond.push_back(LastInst->getOperand(1));                          // Op0
    Cond.push_back(LastInst->getOperand(2));                          // Op1
    Cond.push_back(LastInst->getOperand(4));                          // Ext0
    Cond.push_back(LastInst->getOperand(5));                          // Ext1
    break;
  }
}
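
// For illustration, the Cond vectors built above look like:
//   b.eq %bb             -> Cond = { EQ }
//   cbnz w0, %bb         -> Cond = { -1, CBNZW, w0 }
//   tbz w0, #3, %bb      -> Cond = { -1, TBZW, w0, 3 }
//   cb<cc> w0, w1, %bb   -> Cond = { -1, CBWPrr, <cc>, w0, w1 }
// The leading -1 marks a folded compare-and-branch, as opposed to a plain Bcc
// condition code; see reverseBranchCondition and instantiateCondBranch below.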

static unsigned getBranchDisplacementBits(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return BDisplacementBits;
  case AArch64::TBNZW:
  case AArch64::TBZW:
  case AArch64::TBNZX:
  case AArch64::TBZX:
    return TBZDisplacementBits;
  case AArch64::CBNZW:
  case AArch64::CBZW:
  case AArch64::CBNZX:
  case AArch64::CBZX:
    return CBZDisplacementBits;
  case AArch64::Bcc:
    return BCCDisplacementBits;
  case AArch64::CBWPri:
  case AArch64::CBXPri:
  case AArch64::CBBAssertExt:
  case AArch64::CBHAssertExt:
  case AArch64::CBWPrr:
  case AArch64::CBXPrr:
    return CBDisplacementBits;
  }
}

bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                             int64_t BrOffset) const {
  unsigned Bits = getBranchDisplacementBits(BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
  return isIntN(Bits, BrOffset / 4);
}
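
// For example, with the default aarch64-bcc-offset-bits=19, a Bcc displacement
// is a signed 19-bit count of 4-byte units (note the BrOffset / 4 above), so
// conditional branches reach roughly +/-1MiB from the branch itself.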

MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return MI.getOperand(0).getMBB();
  case AArch64::TBZW:
  case AArch64::TBNZW:
  case AArch64::TBZX:
  case AArch64::TBNZX:
    return MI.getOperand(2).getMBB();
  case AArch64::CBZW:
  case AArch64::CBNZW:
  case AArch64::CBZX:
  case AArch64::CBNZX:
  case AArch64::Bcc:
    return MI.getOperand(1).getMBB();
  case AArch64::CBWPri:
  case AArch64::CBXPri:
  case AArch64::CBBAssertExt:
  case AArch64::CBHAssertExt:
  case AArch64::CBWPrr:
  case AArch64::CBXPrr:
    return MI.getOperand(3).getMBB();
  }
}

void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
                                            MachineBasicBlock &NewDestBB,
                                            MachineBasicBlock &RestoreBB,
                                            const DebugLoc &DL,
                                            int64_t BrOffset,
                                            RegScavenger *RS) const {
  assert(RS && "RegScavenger required for long branching");
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);
  assert(RestoreBB.empty() &&
         "restore block should be inserted for restoring clobbered registers");

  auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
    // Offsets outside of the signed 33-bit range are not supported for ADRP +
    // ADD.
    if (!isInt<33>(BrOffset))
      report_fatal_error(
          "Branch offsets outside of the signed 33-bit range not supported");

    BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
        .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
    BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
        .addReg(Reg)
        .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
        .addImm(0);
    BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
  };

  RS->enterBasicBlockEnd(MBB);
  // If X16 is unused, we can rely on the linker to insert a range extension
  // thunk if NewDestBB is out of range of a single B instruction.
  constexpr Register Reg = AArch64::X16;
  if (!RS->isRegUsed(Reg)) {
    insertUnconditionalBranch(MBB, &NewDestBB, DL);
    RS->setRegUsed(Reg);
    return;
  }

  // In a cold block without BTI, insert the indirect branch if a register is
  // free. Skip this if BTI is enabled to avoid inserting a BTI at the target,
  // prioritizing a dynamic cost in cold code over a static cost in hot code.
  AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
  bool HasBTI = AFI && AFI->branchTargetEnforcement();
  if (MBB.getSectionID() == MBBSectionID::ColdSectionID && !HasBTI) {
    Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
    if (Scavenged != AArch64::NoRegister) {
      buildIndirectBranch(Scavenged, NewDestBB);
      RS->setRegUsed(Scavenged);
      return;
    }
  }

  // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
  // with red zones.
  if (!AFI || AFI->hasRedZone().value_or(true))
    report_fatal_error(
        "Unable to insert indirect branch inside function that has red zone");

  // Otherwise, spill X16 and defer range extension to the linker.
  BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
      .addReg(AArch64::SP, RegState::Define)
      .addReg(Reg)
      .addReg(AArch64::SP)
      .addImm(-16);

  BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);

  BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
      .addReg(AArch64::SP, RegState::Define)
      .addReg(Reg, RegState::Define)
      .addReg(AArch64::SP)
      .addImm(16);
}
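
// The spill-based fallback above materializes, in assembly form:
//   str x16, [sp, #-16]!  // in MBB; frees X16 for a potential linker thunk
//   b   RestoreBB         // linker-relaxable; may become an X16-clobbering
//                         // range-extension thunk
// RestoreBB:
//   ldr x16, [sp], #16    // reload X16 before continuing to NewDestBB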

// Branch analysis.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *&TBB,
                                     MachineBasicBlock *&FBB,
                                     SmallVectorImpl<MachineOperand> &Cond,
                                     bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return false;

  // Skip over SpeculationBarrierEndBB terminators
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;

  // If there is only one terminator instruction, process it.
  unsigned LastOpc = LastInst->getOpcode();
  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
    if (isUncondBranchOpcode(LastOpc)) {
      TBB = LastInst->getOperand(0).getMBB();
      return false;
    }
    if (isCondBranchOpcode(LastOpc)) {
      // Block ends with fall-through condbranch.
      parseCondBranch(LastInst, TBB, Cond);
      return false;
    }
    return true; // Can't handle indirect branch.
  }

  // Get the instruction before it if it is a terminator.
  MachineInstr *SecondLastInst = &*I;
  unsigned SecondLastOpc = SecondLastInst->getOpcode();

  // If AllowModify is true and the block ends with two or more unconditional
  // branches, delete all but the first unconditional branch.
  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
    while (isUncondBranchOpcode(SecondLastOpc)) {
      LastInst->eraseFromParent();
      LastInst = SecondLastInst;
      LastOpc = LastInst->getOpcode();
      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now; the only terminator is an unconditional branch.
        TBB = LastInst->getOperand(0).getMBB();
        return false;
      }
      SecondLastInst = &*I;
      SecondLastOpc = SecondLastInst->getOpcode();
    }
  }

  // If we're allowed to modify and the block ends in an unconditional branch
  // which could simply fall through, remove the branch. (Note: This case only
  // matters when we can't understand the whole sequence, otherwise it's also
  // handled by BranchFolding.cpp.)
  if (AllowModify && isUncondBranchOpcode(LastOpc) &&
      MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
    LastInst->eraseFromParent();
    LastInst = SecondLastInst;
    LastOpc = LastInst->getOpcode();
    if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
      assert(!isUncondBranchOpcode(LastOpc) &&
             "unreachable unconditional branches removed above");

      if (isCondBranchOpcode(LastOpc)) {
        // Block ends with fall-through condbranch.
        parseCondBranch(LastInst, TBB, Cond);
        return false;
      }
      return true; // Can't handle indirect branch.
    }
    SecondLastInst = &*I;
    SecondLastOpc = SecondLastInst->getOpcode();
  }

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
    return true;

  // If the block ends with a B and a Bcc, handle it.
  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    parseCondBranch(SecondLastInst, TBB, Cond);
    FBB = LastInst->getOperand(0).getMBB();
    return false;
  }

  // If the block ends with two unconditional branches, handle it. The second
  // one is not executed, so remove it.
  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    TBB = SecondLastInst->getOperand(0).getMBB();
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // ...likewise if it ends with an indirect branch followed by an
  // unconditional branch.
  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return true;
  }

  // Otherwise, can't handle this.
  return true;
}
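
// Illustrative terminator patterns and what analyzeBranch reports for them:
//   (no terminator)       -> fall through; returns false
//   b %bb1                -> TBB = %bb1; returns false
//   cbz w0, %bb1          -> TBB = %bb1, Cond filled; fall-through false dest
//   b.lt %bb1; b %bb2     -> TBB = %bb1, FBB = %bb2, Cond filled
//   br x0 (indirect)      -> not analyzable; returns true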

bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
                                              MachineBranchPredicate &MBP,
                                              bool AllowModify) const {
  // Use analyzeBranch to validate the branch pattern.
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  if (analyzeBranch(MBB, TBB, FBB, Cond, AllowModify))
    return true;

  // analyzeBranch returns success with empty Cond for unconditional branches.
  if (Cond.empty())
    return true;

  MBP.TrueDest = TBB;
  assert(MBP.TrueDest && "expected!");
  MBP.FalseDest = FBB ? FBB : MBB.getNextNode();

  MBP.ConditionDef = nullptr;
  MBP.SingleUseCondition = false;

  // Find the conditional branch. After analyzeBranch succeeds with non-empty
  // Cond, there's exactly one conditional branch - either last (fallthrough)
  // or second-to-last (followed by unconditional B).
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return true;

  if (isUncondBranchOpcode(I->getOpcode())) {
    if (I == MBB.begin())
      return true;
    --I;
  }

  MachineInstr *CondBranch = &*I;
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  switch (CondBranch->getOpcode()) {
  default:
    return true;

  case AArch64::Bcc:
    // Bcc takes the NZCV flag as the operand to branch on, walk up the
    // instruction stream to find the last instruction to define NZCV.
    for (MachineInstr &MI : reverse(make_range(MBB.begin(), I))) {
      if (MI.modifiesRegister(AArch64::NZCV, /*TRI=*/nullptr)) {
        MBP.ConditionDef = &MI;
        break;
      }
    }
    return false;

  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX: {
    MBP.LHS = CondBranch->getOperand(0);
    MBP.RHS = MachineOperand::CreateImm(0);
    unsigned Opc = CondBranch->getOpcode();
    MBP.Predicate = (Opc == AArch64::CBNZX || Opc == AArch64::CBNZW)
                        ? MachineBranchPredicate::PRED_NE
                        : MachineBranchPredicate::PRED_EQ;
    Register CondReg = MBP.LHS.getReg();
    if (CondReg.isVirtual())
      MBP.ConditionDef = MRI.getVRegDef(CondReg);
    return false;
  }

  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX: {
    Register CondReg = CondBranch->getOperand(0).getReg();
    if (CondReg.isVirtual())
      MBP.ConditionDef = MRI.getVRegDef(CondReg);
    return false;
  }
  }
}

bool AArch64InstrInfo::reverseBranchCondition(
    SmallVectorImpl<MachineOperand> &Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
    Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
  } else {
    // Folded compare-and-branch
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown conditional branch!");
    case AArch64::CBZW:
      Cond[1].setImm(AArch64::CBNZW);
      break;
    case AArch64::CBNZW:
      Cond[1].setImm(AArch64::CBZW);
      break;
    case AArch64::CBZX:
      Cond[1].setImm(AArch64::CBNZX);
      break;
    case AArch64::CBNZX:
      Cond[1].setImm(AArch64::CBZX);
      break;
    case AArch64::TBZW:
      Cond[1].setImm(AArch64::TBNZW);
      break;
    case AArch64::TBNZW:
      Cond[1].setImm(AArch64::TBZW);
      break;
    case AArch64::TBZX:
      Cond[1].setImm(AArch64::TBNZX);
      break;
    case AArch64::TBNZX:
      Cond[1].setImm(AArch64::TBZX);
      break;

    // Cond is { -1, Opcode, CC, Op0, Op1, ... }
    case AArch64::CBWPri:
    case AArch64::CBXPri:
    case AArch64::CBBAssertExt:
    case AArch64::CBHAssertExt:
    case AArch64::CBWPrr:
    case AArch64::CBXPrr: {
      // Pseudos using standard 4-bit Arm condition codes
      AArch64CC::CondCode CC =
          static_cast<AArch64CC::CondCode>(Cond[2].getImm());
      Cond[2].setImm(AArch64CC::getInvertedCondCode(CC));
    }
    }
  }

  return false;
}

unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
                                        int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return 0;

  if (!isUncondBranchOpcode(I->getOpcode()) &&
      !isCondBranchOpcode(I->getOpcode()))
    return 0;

  // Remove the branch.
  I->eraseFromParent();

  I = MBB.end();

  if (I == MBB.begin()) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }
  --I;
  if (!isCondBranchOpcode(I->getOpcode())) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }

  // Remove the branch.
  I->eraseFromParent();
  if (BytesRemoved)
    *BytesRemoved = 8;

  return 2;
}
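
// For instance, a block ending in
//   b.ne %bb1
//   b    %bb2
// has both terminators erased: removeBranch returns 2 and sets *BytesRemoved
// to 8 (two 4-byte instructions).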

void AArch64InstrInfo::instantiateCondBranch(
    MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
    ArrayRef<MachineOperand> Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
  } else {
    // Folded compare-and-branch
    // Note that we use addOperand instead of addReg to keep the flags.

    // cbz, cbnz
    const MachineInstrBuilder MIB =
        BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);

    // tbz/tbnz
    if (Cond.size() > 3)
      MIB.add(Cond[3]);

    // cb
    if (Cond.size() > 4)
      MIB.add(Cond[4]);

    MIB.addMBB(TBB);

    // cb[b,h]
    if (Cond.size() > 5) {
      MIB.addImm(Cond[5].getImm());
      MIB.addImm(Cond[6].getImm());
    }
  }
}

unsigned AArch64InstrInfo::insertBranch(
    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
    ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
  // Shouldn't be a fall through.
  assert(TBB && "insertBranch must not be told to insert a fallthrough");

  if (!FBB) {
    if (Cond.empty()) // Unconditional branch?
      BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
    else
      instantiateCondBranch(MBB, DL, TBB, Cond);

    if (BytesAdded)
      *BytesAdded = 4;

    return 1;
  }

  // Two-way conditional branch.
  instantiateCondBranch(MBB, DL, TBB, Cond);
  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);

  if (BytesAdded)
    *BytesAdded = 8;

  return 2;
}
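
// For instance, insertBranch with both TBB and FBB and a Bcc condition emits
//   b.cc TBB
//   b    FBB
// returning 2 with *BytesAdded == 8; with FBB == nullptr only the first
// branch is emitted (1 instruction, 4 bytes).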

static bool optimizeTerminators(MachineBasicBlock *MBB,
                                const TargetInstrInfo &TII) {
  for (MachineInstr &MI : MBB->terminators()) {
    unsigned Opc = MI.getOpcode();
    switch (Opc) {
    case AArch64::CBZW:
    case AArch64::CBZX:
    case AArch64::TBZW:
    case AArch64::TBZX:
      // CBZ/TBZ with WZR/XZR -> unconditional B
      if (MI.getOperand(0).getReg() == AArch64::WZR ||
          MI.getOperand(0).getReg() == AArch64::XZR) {
        DEBUG_WITH_TYPE("optimizeTerminators",
                        dbgs() << "Removing always taken branch: " << MI);
        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
        SmallVector<MachineBasicBlock *> Succs(MBB->successors());
        for (auto *S : Succs)
          if (S != Target)
            MBB->removeSuccessor(S);
        DebugLoc DL = MI.getDebugLoc();
        while (MBB->rbegin() != &MI)
          MBB->rbegin()->eraseFromParent();
        MI.eraseFromParent();
        BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
        return true;
      }
      break;
    case AArch64::CBNZW:
    case AArch64::CBNZX:
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
      if (MI.getOperand(0).getReg() == AArch64::WZR ||
          MI.getOperand(0).getReg() == AArch64::XZR) {
        DEBUG_WITH_TYPE("optimizeTerminators",
                        dbgs() << "Removing never taken branch: " << MI);
        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
        MI.getParent()->removeSuccessor(Target);
        MI.eraseFromParent();
        return true;
      }
      break;
    }
  }
  return false;
}

// Find the original register that VReg is copied from.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
  while (Register::isVirtualRegister(VReg)) {
    const MachineInstr *DefMI = MRI.getVRegDef(VReg);
    if (!DefMI->isFullCopy())
      return VReg;
    VReg = DefMI->getOperand(1).getReg();
  }
  return VReg;
}

// Determine if VReg is defined by an instruction that can be folded into a
// csel instruction. If so, return the folded opcode, and the replacement
// register.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
                                unsigned *NewReg = nullptr) {
  VReg = removeCopies(MRI, VReg);
  if (!Register::isVirtualRegister(VReg))
    return 0;

  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
  unsigned Opc = 0;
  unsigned SrcReg = 0;
  switch (DefMI->getOpcode()) {
  case AArch64::SUBREG_TO_REG:
    // Check for the following way to define a 64-bit immediate:
    //   %0:gpr32 = MOVi32imm 1
    //   %1:gpr64 = SUBREG_TO_REG %0:gpr32, %subreg.sub_32
    if (!DefMI->getOperand(1).isReg())
      return 0;
    if (!DefMI->getOperand(2).isImm() ||
        DefMI->getOperand(2).getImm() != AArch64::sub_32)
      return 0;
    DefMI = MRI.getVRegDef(DefMI->getOperand(1).getReg());
    if (DefMI->getOpcode() != AArch64::MOVi32imm)
      return 0;
    if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
      return 0;
    assert(Is64Bit);
    SrcReg = AArch64::XZR;
    Opc = AArch64::CSINCXr;
    break;

  case AArch64::MOVi32imm:
  case AArch64::MOVi64imm:
    if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
      return 0;
    SrcReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
    break;

  case AArch64::ADDSXri:
  case AArch64::ADDSWri:
    // If NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
                                         true) == -1)
      return 0;
    // Fall through to ADDXri and ADDWri.
    [[fallthrough]];
  case AArch64::ADDXri:
  case AArch64::ADDWri:
    // add x, 1 -> csinc.
    if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
        DefMI->getOperand(3).getImm() != 0)
      return 0;
    SrcReg = DefMI->getOperand(1).getReg();
    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
    break;

  case AArch64::ORNXrr:
  case AArch64::ORNWrr: {
    // not x -> csinv, represented as orn dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcReg = DefMI->getOperand(2).getReg();
    Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
    break;
  }

  case AArch64::SUBSXrr:
  case AArch64::SUBSWrr:
    // If NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
                                         true) == -1)
      return 0;
    // Fall through to SUBXrr and SUBWrr.
    [[fallthrough]];
  case AArch64::SUBXrr:
  case AArch64::SUBWrr: {
    // neg x -> csneg, represented as sub dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcReg = DefMI->getOperand(2).getReg();
    Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
    break;
  }
  default:
    return 0;
  }
  assert(Opc && SrcReg && "Missing parameters");

  if (NewReg)
    *NewReg = SrcReg;
  return Opc;
}
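
// For example, given
//   %1:gpr32 = ADDWri %0, 1, 0        ; %1 = %0 + 1
// canFoldIntoCSel(MRI, %1, &NewReg) returns CSINCWr with NewReg = %0, so a
// select of %1 can become a csinc of %0 (with the condition inverted by
// insertSelect below); the now-dead add is left for DCE.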

bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
                                       ArrayRef<MachineOperand> Cond,
                                       Register DstReg, Register TrueReg,
                                       Register FalseReg, int &CondCycles,
                                       int &TrueCycles,
                                       int &FalseCycles) const {
  // Check register classes.
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RC =
      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
  if (!RC)
    return false;

  // Also need to check the dest regclass, in case we're trying to optimize
  // something like:
  //   %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
  if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
    return false;

  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
  unsigned ExtraCondLat = Cond.size() != 1;

  // GPRs are handled by csel.
  // FIXME: Fold in x+1, -x, and ~x when applicable.
  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
      AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
    // Single-cycle csel, csinc, csinv, and csneg.
    CondCycles = 1 + ExtraCondLat;
    TrueCycles = FalseCycles = 1;
    if (canFoldIntoCSel(MRI, TrueReg))
      TrueCycles = 0;
    else if (canFoldIntoCSel(MRI, FalseReg))
      FalseCycles = 0;
    return true;
  }

  // Scalar floating point is handled by fcsel.
  // FIXME: Form fabs, fmin, and fmax when applicable.
  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
      AArch64::FPR32RegClass.hasSubClassEq(RC)) {
    CondCycles = 5 + ExtraCondLat;
    TrueCycles = FalseCycles = 2;
    return true;
  }

  // Can't do vectors.
  return false;
}

void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, Register DstReg,
                                    ArrayRef<MachineOperand> Cond,
                                    Register TrueReg, Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Parse the condition code, see parseCondBranch() above.
  AArch64CC::CondCode CC;
  switch (Cond.size()) {
  default:
    llvm_unreachable("Unknown condition opcode in Cond");
  case 1: // b.cc
    CC = AArch64CC::CondCode(Cond[0].getImm());
    break;
  case 3: { // cbz/cbnz
    // We must insert a compare against 0.
    bool Is64Bit;
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBZW:
      Is64Bit = false;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBZX:
      Is64Bit = true;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBNZW:
      Is64Bit = false;
      CC = AArch64CC::NE;
      break;
    case AArch64::CBNZX:
      Is64Bit = true;
      CC = AArch64CC::NE;
      break;
    }
    Register SrcReg = Cond[2].getReg();
    if (Is64Bit) {
      // cmp reg, #0 is actually subs xzr, reg, #0.
      MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    } else {
      MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    }
    break;
  }
  case 4: { // tbz/tbnz
    // We must insert a tst instruction.
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::TBZW:
    case AArch64::TBZX:
      CC = AArch64CC::EQ;
      break;
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      CC = AArch64CC::NE;
      break;
    }
    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  case 5: { // cb
    // We must insert a cmp, that is a subs
    //            0    1      2   3    4
    // Cond is { -1, Opcode, CC, Op0, Op1 }

    unsigned SubsOpc, SubsDestReg;
    bool IsImm = false;
    CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBWPri:
      SubsOpc = AArch64::SUBSWri;
      SubsDestReg = AArch64::WZR;
      IsImm = true;
      break;
    case AArch64::CBXPri:
      SubsOpc = AArch64::SUBSXri;
      SubsDestReg = AArch64::XZR;
      IsImm = true;
      break;
    case AArch64::CBWPrr:
      SubsOpc = AArch64::SUBSWrr;
      SubsDestReg = AArch64::WZR;
      IsImm = false;
      break;
    case AArch64::CBXPrr:
      SubsOpc = AArch64::SUBSXrr;
      SubsDestReg = AArch64::XZR;
      IsImm = false;
      break;
    }

    if (IsImm)
      BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
          .addReg(Cond[3].getReg())
          .addImm(Cond[4].getImm())
          .addImm(0);
    else
      BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
          .addReg(Cond[3].getReg())
          .addReg(Cond[4].getReg());
  } break;
  case 7: { // cb[b,h]
    // We must insert a cmp, that is a subs, but also zero- or sign-extensions
    // that have been folded. For the first operand we codegen an explicit
    // extension, for the second operand we fold the extension into cmp.
    //            0    1      2   3    4    5     6
    // Cond is { -1, Opcode, CC, Op0, Op1, Ext0, Ext1 }

    // We need a new register for the now explicitly extended register
    Register Reg = Cond[4].getReg();
    if (Cond[5].getImm() != AArch64_AM::InvalidShiftExtend) {
      unsigned ExtOpc;
      unsigned ExtBits;
      AArch64_AM::ShiftExtendType ExtendType =
          static_cast<AArch64_AM::ShiftExtendType>(Cond[5].getImm());
      switch (ExtendType) {
      default:
        llvm_unreachable("Unknown shift-extend for CB instruction");
      case AArch64_AM::SXTB:
        assert(
            Cond[1].getImm() == AArch64::CBBAssertExt &&
            "Unexpected compare-and-branch instruction for SXTB shift-extend");
        ExtOpc = AArch64::SBFMWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
        break;
      case AArch64_AM::SXTH:
        assert(
            Cond[1].getImm() == AArch64::CBHAssertExt &&
            "Unexpected compare-and-branch instruction for SXTH shift-extend");
        ExtOpc = AArch64::SBFMWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
        break;
      case AArch64_AM::UXTB:
        assert(
            Cond[1].getImm() == AArch64::CBBAssertExt &&
            "Unexpected compare-and-branch instruction for UXTB shift-extend");
        ExtOpc = AArch64::ANDWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
        break;
      case AArch64_AM::UXTH:
        assert(
            Cond[1].getImm() == AArch64::CBHAssertExt &&
            "Unexpected compare-and-branch instruction for UXTH shift-extend");
        ExtOpc = AArch64::ANDWri;
        ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
        break;
      }

      // Build the explicit extension of the first operand
      Reg = MRI.createVirtualRegister(&AArch64::GPR32spRegClass);
      MachineInstrBuilder MBBI =
          BuildMI(MBB, I, DL, get(ExtOpc), Reg).addReg(Cond[4].getReg());
      if (ExtOpc != AArch64::ANDWri)
        MBBI.addImm(0);
      MBBI.addImm(ExtBits);
    }

    // Now, subs with an extended second operand
    if (Cond[6].getImm() != AArch64_AM::InvalidShiftExtend) {
      AArch64_AM::ShiftExtendType ExtendType =
          static_cast<AArch64_AM::ShiftExtendType>(Cond[6].getImm());
      MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
      MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWrx), AArch64::WZR)
          .addReg(Cond[3].getReg())
          .addReg(Reg)
          .addImm(AArch64_AM::getArithExtendImm(ExtendType, 0));
    } // If no extension is needed, just a regular subs
    else {
      MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
      MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWrr), AArch64::WZR)
          .addReg(Cond[3].getReg())
          .addReg(Reg);
    }

    CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
  } break;
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewReg);
    if (FoldedOpc) {
      // The folded opcodes csinc, csinv, and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewReg;
      Opc = FoldedOpc;
      // Extend the live range of NewReg.
      MRI.clearKillFlags(NewReg);
    }
  }

  // Pull all virtual registers into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  // FalseReg might be WZR or XZR if the folded operand is a literal 1.
  assert(
      (FalseReg.isVirtual() || FalseReg == AArch64::WZR ||
       FalseReg == AArch64::XZR) &&
      "FalseReg was folded into a non-virtual register other than WZR or XZR");
  if (FalseReg.isVirtual())
    MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}

// Return true if Imm can be loaded into a register by a "cheap" sequence of
// instructions. For now, "cheap" means at most two instructions.
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
  if (BitSize == 32)
    return true;

  assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
  uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
  AArch64_IMM::expandMOVImm(Imm, BitSize, Is);

  return Is.size() <= 2;
}
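
// For example, 0x0000beef0000f00d expands to MOVZ + one MOVK and is cheap,
// while a 64-bit constant needing MOVZ plus two or three MOVKs is not; any
// valid logical-immediate value (a single ORRXri) is trivially cheap.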

// Check if a COPY instruction is cheap.
static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) {
  assert(MI.isCopy() && "Expected COPY instruction");
  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();

  // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64,
  // typically requiring an FMOV instruction with a 2-6 cycle latency.
  auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * {
    if (Reg.isVirtual())
      return MRI.getRegClass(Reg);
    if (Reg.isPhysical())
      return RI.getMinimalPhysRegClass(Reg);
    return nullptr;
  };
  const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg());
  const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg());
  if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC))
    return false;

  return MI.isAsCheapAsAMove();
}

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (Subtarget.hasExynosCheapAsMoveHandling()) {
    if (isExynosCheapAsMove(MI))
      return true;
    return MI.isAsCheapAsAMove();
  }

  switch (MI.getOpcode()) {
  default:
    return MI.isAsCheapAsAMove();

  case TargetOpcode::COPY:
    return isCheapCopy(MI, RI);

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::SUBWrs:
  case AArch64::SUBXrs:
    return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;

  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
  // ORRXri, it is as cheap as MOV.
  // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
  case AArch64::MOVi32imm:
    return isCheapImmediate(MI, 32);
  case AArch64::MOVi64imm:
    return isCheapImmediate(MI, 64);
  }
}

bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::ADDSWrs:
  case AArch64::ADDSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    if (ShiftVal == 0)
      return true;
    return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
  }

  case AArch64::ADDWrx:
  case AArch64::ADDXrx:
  case AArch64::ADDXrx64:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::ADDSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) <= 4;
    }
  }

  case AArch64::SUBWrs:
  case AArch64::SUBSWrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
  }

  case AArch64::SUBXrs:
  case AArch64::SUBSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
  }

  case AArch64::SUBWrx:
  case AArch64::SUBXrx:
  case AArch64::SUBXrx64:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrx:
  case AArch64::SUBSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) == 0;
    }
  }

  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::PRFMroW:
  case AArch64::PRFMroX:
  case AArch64::STRBBroW:
  case AArch64::STRBBroX:
  case AArch64::STRBroW:
  case AArch64::STRBroX:
  case AArch64::STRDroW:
  case AArch64::STRDroX:
  case AArch64::STRHHroW:
  case AArch64::STRHHroX:
  case AArch64::STRHroW:
  case AArch64::STRHroX:
  case AArch64::STRQroW:
  case AArch64::STRQroX:
  case AArch64::STRSroW:
  case AArch64::STRSroX:
  case AArch64::STRWroW:
  case AArch64::STRWroX:
  case AArch64::STRXroW:
  case AArch64::STRXroX: {
    unsigned IsSigned = MI.getOperand(3).getImm();
    return !IsSigned;
  }
  }
}

bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  case AArch64::SEH_StackAlloc:
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveFPLR_X:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveReg_X:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveRegP_X:
  case AArch64::SEH_SaveFReg:
  case AArch64::SEH_SaveFReg_X:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFRegP_X:
  case AArch64::SEH_SetFP:
  case AArch64::SEH_AddFP:
  case AArch64::SEH_Nop:
  case AArch64::SEH_PrologEnd:
  case AArch64::SEH_EpilogStart:
  case AArch64::SEH_EpilogEnd:
  case AArch64::SEH_PACSignLR:
  case AArch64::SEH_SaveAnyRegI:
  case AArch64::SEH_SaveAnyRegIP:
  case AArch64::SEH_SaveAnyRegQP:
  case AArch64::SEH_SaveAnyRegQPX:
  case AArch64::SEH_AllocZ:
  case AArch64::SEH_SaveZReg:
  case AArch64::SEH_SavePReg:
    return true;
  }
}

bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                             Register &SrcReg, Register &DstReg,
                                             unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::SBFMXri: // aka sxtw
  case AArch64::UBFMXri: // aka uxtw
    // Check for the 32 -> 64 bit extension case, these instructions can do
    // much more.
    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
      return false;
    // This is a signed or unsigned 32 -> 64 bit extension.
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    SubIdx = AArch64::sub_32;
    return true;
  }
}

bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
    const MachineInstr &MIa, const MachineInstr &MIb) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
  int64_t OffsetA = 0, OffsetB = 0;
  TypeSize WidthA(0, false), WidthB(0, false);
  bool OffsetAIsScalable = false, OffsetBIsScalable = false;

  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // Retrieve the base, offset from the base and width. Width
  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
  // the bases are identical, and the offset of a lower memory access +
  // the width doesn't overlap the offset of a higher memory access,
  // then the memory accesses are different.
  // If OffsetAIsScalable and OffsetBIsScalable are both true, they
  // are assumed to have the same scale (vscale).
  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
                                   WidthA, TRI) &&
      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
                                   WidthB, TRI)) {
    if (BaseOpA->isIdenticalTo(*BaseOpB) &&
        OffsetAIsScalable == OffsetBIsScalable) {
      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
      if (LowWidth.isScalable() == OffsetAIsScalable &&
          LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
        return true;
    }
  }
  return false;
}
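
// For example, `str x1, [x0]` and `str x2, [x0, #8]` share an identical base
// operand, and the lower access (offset 0, width 8) ends at or before the
// higher access's offset 8, so they are reported as trivially disjoint.
// Accesses off different base registers are never provably disjoint here.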

bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                            const MachineBasicBlock *MBB,
                                            const MachineFunction &MF) const {
  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
    return true;

  // Do not move an instruction that can be recognized as a branch target.
  if (hasBTISemantics(MI))
    return true;

  switch (MI.getOpcode()) {
  case AArch64::HINT:
    // CSDB hints are scheduling barriers.
    if (MI.getOperand(0).getImm() == 0x14)
      return true;
    break;
  case AArch64::DSB:
  case AArch64::ISB:
    // DSB and ISB are also scheduling barriers.
    return true;
  case AArch64::MSRpstatesvcrImm1:
    // SMSTART and SMSTOP are also scheduling barriers.
    return true;
  default:;
  }
  if (isSEHInstruction(MI))
    return true;
  auto Next = std::next(MI.getIterator());
  return Next != MBB->end() && Next->isCFIInstruction();
}

/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                      Register &SrcReg2, int64_t &CmpMask,
                                      int64_t &CmpValue) const {
  // The first operand can be a frame index where we'd normally expect a
  // register.
  // FIXME: Pass subregisters out of analyzeCompare
  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
  if (!MI.getOperand(1).isReg() || MI.getOperand(1).getSubReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::PTEST_PP:
  case AArch64::PTEST_PP_ANY:
  case AArch64::PTEST_PP_FIRST:
    SrcReg = MI.getOperand(0).getReg();
    SrcReg2 = MI.getOperand(1).getReg();
    if (MI.getOperand(2).getSubReg())
      return false;

    // Not sure about the mask and value for now...
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWrr:
  case AArch64::SUBSWrs:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXrs:
  case AArch64::SUBSXrx:
  case AArch64::ADDSWrr:
  case AArch64::ADDSWrs:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXrs:
  case AArch64::ADDSXrx:
    // Replace SUBSWrr with SUBWrr if NZCV is not used.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = MI.getOperand(2).getReg();

    // FIXME: Pass subregisters out of analyzeCompare
    if (MI.getOperand(2).getSubReg())
      return false;

    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWri:
  case AArch64::ADDSWri:
  case AArch64::SUBSXri:
  case AArch64::ADDSXri:
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = MI.getOperand(2).getImm();
    return true;
  case AArch64::ANDSWri:
  case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the other xxxS
    // instructions.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = AArch64_AM::decodeLogicalImmediate(
        MI.getOperand(2).getImm(),
        MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
    return true;
  }

  return false;
}
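
// For example, `SUBSWri $wzr, %5, 7, 0` (i.e. `cmp w5, #7`) is analyzed as
// SrcReg = %5, SrcReg2 = 0, CmpValue = 7, while `SUBSWrr $wzr, %5, %6` gives
// SrcReg = %5, SrcReg2 = %6, CmpValue = 0.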

static bool UpdateOperandRegClass(MachineInstr &Instr) {
  MachineBasicBlock *MBB = Instr.getParent();
  assert(MBB && "Can't get MachineBasicBlock here");
  MachineFunction *MF = MBB->getParent();
  assert(MF && "Can't get MachineFunction here");
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  MachineRegisterInfo *MRI = &MF->getRegInfo();

  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
       ++OpIdx) {
    MachineOperand &MO = Instr.getOperand(OpIdx);
    const TargetRegisterClass *OpRegCstraints =
        Instr.getRegClassConstraint(OpIdx, TII, TRI);

    // If there's no constraint, there's nothing to do.
    if (!OpRegCstraints)
      continue;
    // If the operand is a frame index, there's nothing to do here.
    // A frame index operand will resolve correctly during PEI.
    if (MO.isFI())
      continue;

    assert(MO.isReg() &&
           "Operand has register constraints without being a register!");

    Register Reg = MO.getReg();
    if (Reg.isPhysical()) {
      if (!OpRegCstraints->contains(Reg))
        return false;
    } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
               !MRI->constrainRegClass(Reg, OpRegCstraints))
      return false;
  }

  return true;
}

/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible for doing the actual
/// substitution and legality checking.
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
  // Don't convert all compare instructions, because for some the zero register
  // encoding becomes the sp register.
  bool MIDefinesZeroReg = false;
  if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
      MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
    MIDefinesZeroReg = true;

  switch (MI.getOpcode()) {
  default:
    return MI.getOpcode();
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSWri:
    return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
  case AArch64::ADDSWrs:
    return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
  case AArch64::ADDSWrx:
    return AArch64::ADDWrx;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSXri:
    return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
  case AArch64::ADDSXrs:
    return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
  case AArch64::ADDSXrx:
    return AArch64::ADDXrx;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSWri:
    return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
  case AArch64::SUBSWrs:
    return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
  case AArch64::SUBSWrx:
    return AArch64::SUBWrx;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSXri:
    return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
  case AArch64::SUBSXrs:
    return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
  case AArch64::SUBSXrx:
    return AArch64::SUBXrx;
  }
}
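
// For example, SUBSWri is rewritten to SUBWri when NZCV is unused, unless the
// instruction defines WZR/XZR: in the non-flag-setting encoding that operand
// would be reinterpreted as SP, so such compares keep their S form.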

enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };

/// True when condition flags are accessed (either by writing or reading)
/// on the instruction trace starting at From and ending at To.
///
/// Note: If From and To are from different blocks it's assumed CC are accessed
/// on the path.
static bool areCFlagsAccessedBetweenInstrs(
    MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
    const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
  // Early exit if To is at the beginning of the BB.
  if (To == To->getParent()->begin())
    return true;

  // Check whether the instructions are in the same basic block.
  // If not, assume the condition flags might get modified somewhere.
  if (To->getParent() != From->getParent())
    return true;

  // From must be above To.
  assert(std::any_of(
      ++To.getReverse(), To->getParent()->rend(),
      [From](MachineInstr &MI) { return MI.getIterator() == From; }));

  // We iterate backward starting at \p To until we hit \p From.
  for (const MachineInstr &Instr :
       instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
    if (((AccessToCheck & AK_Write) &&
         Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
        ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
      return true;
  }
  return false;
}
1763
1764std::optional<unsigned>
1765AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
1766 MachineInstr *Pred,
1767 const MachineRegisterInfo *MRI) const {
1768 unsigned MaskOpcode = Mask->getOpcode();
1769 unsigned PredOpcode = Pred->getOpcode();
1770 bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1771 bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1772
1773 if (PredIsWhileLike) {
1774 // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
1775 // instruction and the condition is "any" since WHILcc does an implicit
1776 // PTEST(ALL, PG) check and PG is always a subset of ALL.
1777 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1778 return PredOpcode;
1779
1780 // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1781 // redundant since WHILE performs an implicit PTEST with an all active
1782 // mask.
1783 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1784 getElementSizeForOpcode(MaskOpcode) ==
1785 getElementSizeForOpcode(PredOpcode))
1786 return PredOpcode;
1787
1788 // For PTEST_FIRST(PTRUE_ALL, WHILE), the PTEST_FIRST is redundant since
1789 // WHILEcc performs an implicit PTEST with an all active mask, setting
1790 // the N flag as the PTEST_FIRST would.
1791 if (PTest->getOpcode() == AArch64::PTEST_PP_FIRST &&
1792 isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31)
1793 return PredOpcode;
1794
1795 return {};
1796 }
1797
1798 if (PredIsPTestLike) {
1799 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1800 // instruction that sets the flags as PTEST would and the condition is
1801 // "any" since PG is always a subset of the governing predicate of the
1802 // ptest-like instruction.
1803 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1804 return PredOpcode;
1805
1806 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1807
1808 // If the PTEST like instruction's general predicate is not `Mask`, attempt
1809 // to look through a copy and try again. This is because some instructions
1810 // take a predicate whose register class is a subset of its result class.
1811 if (Mask != PTestLikeMask && PTestLikeMask->isFullCopy() &&
1812 PTestLikeMask->getOperand(1).getReg().isVirtual())
1813 PTestLikeMask =
1814 MRI->getUniqueVRegDef(PTestLikeMask->getOperand(1).getReg());
1815
1816 // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
1817 // element size matches and either the PTEST_LIKE instruction uses
1818 // the same all active mask or the condition is "any".
1819 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1820 getElementSizeForOpcode(MaskOpcode) ==
1821 getElementSizeForOpcode(PredOpcode)) {
1822 if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1823 return PredOpcode;
1824 }
1825
1826 // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1827 // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1828 // on 8-bit predicates like the PTEST. Otherwise, for instructions like
1829 // compare that also support 16/32/64-bit predicates, the implicit PTEST
1830 // performed by the compare could consider fewer lanes for these element
1831 // sizes.
1832 //
1833 // For example, consider
1834 //
1835 // ptrue p0.b ; P0=1111-1111-1111-1111
1836 // index z0.s, #0, #1 ; Z0=<0,1,2,3>
1837 // index z1.s, #1, #1 ; Z1=<1,2,3,4>
1838 // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001
1839 // ; ^ last active
1840 // ptest p0, p1.b ; P1=0001-0001-0001-0001
1841 // ; ^ last active
1842 //
1843 // where the compare generates a canonical all active 32-bit predicate
1844 // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1845 // active flag, whereas the PTEST instruction with the same mask doesn't.
1846 // For PTEST_ANY this doesn't apply as the flags in this case would be
1847 // identical regardless of element size.
1848 uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1849 if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
1850 PTest->getOpcode() == AArch64::PTEST_PP_ANY))
1851 return PredOpcode;
1852
1853 return {};
1854 }
1855
1856 // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1857 // opcode so the PTEST becomes redundant.
1858 switch (PredOpcode) {
1859 case AArch64::AND_PPzPP:
1860 case AArch64::BIC_PPzPP:
1861 case AArch64::EOR_PPzPP:
1862 case AArch64::NAND_PPzPP:
1863 case AArch64::NOR_PPzPP:
1864 case AArch64::ORN_PPzPP:
1865 case AArch64::ORR_PPzPP:
1866 case AArch64::BRKA_PPzP:
1867 case AArch64::BRKPA_PPzPP:
1868 case AArch64::BRKB_PPzP:
1869 case AArch64::BRKPB_PPzPP:
1870 case AArch64::RDFFR_PPz: {
1871 // Check to see if our mask is the same. If not the resulting flag bits
1872 // may be different and we can't remove the ptest.
1873 auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1874 if (Mask != PredMask)
1875 return {};
1876 break;
1877 }
1878 case AArch64::BRKN_PPzP: {
1879 // BRKN uses an all active implicit mask to set flags unlike the other
1880 // flag-setting instructions.
1881 // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1882 if ((MaskOpcode != AArch64::PTRUE_B) ||
1883 (Mask->getOperand(1).getImm() != 31))
1884 return {};
1885 break;
1886 }
1887 case AArch64::PTRUE_B:
1888 // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1889 break;
1890 default:
1891 // Bail out if we don't recognize the input
1892 return {};
1893 }
1894
1895 return convertToFlagSettingOpc(PredOpcode);
1896}
1897
1898/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1899/// operation which could set the flags in an identical manner
1900bool AArch64InstrInfo::optimizePTestInstr(
1901 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1902 const MachineRegisterInfo *MRI) const {
1903 auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1904 auto *Pred = MRI->getUniqueVRegDef(PredReg);
1905
1906 if (Pred->isCopy() && PTest->getOpcode() == AArch64::PTEST_PP_FIRST) {
1907 // Instructions which return a multi-vector (e.g. WHILECC_x2) require copies
1908 // before the branch to extract each subregister.
1909 auto Op = Pred->getOperand(1);
1910 if (Op.isReg() && Op.getReg().isVirtual() &&
1911 Op.getSubReg() == AArch64::psub0)
1912 Pred = MRI->getUniqueVRegDef(Op.getReg());
1913 }
1914
1915 unsigned PredOpcode = Pred->getOpcode();
1916 auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1917 if (!NewOp)
1918 return false;
1919
1920 const TargetRegisterInfo *TRI = &getRegisterInfo();
1921
1922 // If another instruction between Pred and PTest accesses flags, don't remove
1923 // the ptest or update the earlier instruction to modify them.
1924 if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1925 return false;
1926
1927 // If we pass all the checks, it's safe to remove the PTEST and use the flags
1928 // as they are prior to PTEST. Sometimes this requires the tested PTEST
1929 // operand to be replaced with an equivalent instruction that also sets the
1930 // flags.
1931 PTest->eraseFromParent();
1932 if (*NewOp != PredOpcode) {
1933 Pred->setDesc(get(*NewOp));
1934 bool succeeded = UpdateOperandRegClass(*Pred);
1935 (void)succeeded;
1936 assert(succeeded && "Operands have incompatible register classes!");
1937 Pred->addRegisterDefined(AArch64::NZCV, TRI);
1938 }
1939
1940 // Ensure that the flags def is live.
1941 if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1942 unsigned i = 0, e = Pred->getNumOperands();
1943 for (; i != e; ++i) {
1944 MachineOperand &MO = Pred->getOperand(i);
1945 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1946 MO.setIsDead(false);
1947 break;
1948 }
1949 }
1950 }
1951 return true;
1952}
1953
1954/// Try to optimize a compare instruction. A compare instruction is an
1955/// instruction which produces AArch64::NZCV. It is truly a compare
1956/// instruction only
1957/// when there are no uses of its destination register.
1958///
1959/// The following steps are tried in order:
1960/// 1. Convert CmpInstr into an unconditional version.
1961/// 2. Remove CmpInstr if above there is an instruction producing a needed
1962/// condition code or an instruction which can be converted into such an
1963/// instruction.
1964/// Only comparison with zero is supported.
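/// For example, step 1 rewrites "subs w0, w1, w2" whose NZCV def is dead into
/// "sub w0, w1, w2", and step 2 turns
/// \code
///   add w0, w1, w2
///   cmp w0, #0
///   b.eq ...
/// \endcode
/// into "adds w0, w1, w2; b.eq ...", letting the add set the flags itself.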
1965bool AArch64InstrInfo::optimizeCompareInstr(
1966 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1967 int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1968 assert(CmpInstr.getParent());
1969 assert(MRI);
1970
1971 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1972 int DeadNZCVIdx =
1973 CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
1974 if (DeadNZCVIdx != -1) {
1975 if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1976 CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
1977 CmpInstr.eraseFromParent();
1978 return true;
1979 }
1980 unsigned Opc = CmpInstr.getOpcode();
1981 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1982 if (NewOpc == Opc)
1983 return false;
1984 const MCInstrDesc &MCID = get(NewOpc);
1985 CmpInstr.setDesc(MCID);
1986 CmpInstr.removeOperand(DeadNZCVIdx);
1987 bool succeeded = UpdateOperandRegClass(CmpInstr);
1988 (void)succeeded;
1989 assert(succeeded && "Some operands reg class are incompatible!");
1990 return true;
1991 }
1992
1993 if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1994 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY ||
1995 CmpInstr.getOpcode() == AArch64::PTEST_PP_FIRST)
1996 return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
1997
1998 if (SrcReg2 != 0)
1999 return false;
2000
2001 // CmpInstr is a compare instruction if its destination register is not used.
2002 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
2003 return false;
2004
2005 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
2006 return true;
2007 return (CmpValue == 0 || CmpValue == 1) &&
2008 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
2009}
2010
2011/// Get opcode of S version of Instr.
2012/// If Instr is S version its opcode is returned.
2013/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
2014/// or we are not interested in it.
2015static unsigned sForm(MachineInstr &Instr) {
2016 switch (Instr.getOpcode()) {
2017 default:
2018 return AArch64::INSTRUCTION_LIST_END;
2019
2020 case AArch64::ADDSWrr:
2021 case AArch64::ADDSWri:
2022 case AArch64::ADDSXrr:
2023 case AArch64::ADDSXri:
2024 case AArch64::ADDSWrx:
2025 case AArch64::ADDSXrx:
2026 case AArch64::SUBSWrr:
2027 case AArch64::SUBSWri:
2028 case AArch64::SUBSWrx:
2029 case AArch64::SUBSXrr:
2030 case AArch64::SUBSXri:
2031 case AArch64::SUBSXrx:
2032 case AArch64::ANDSWri:
2033 case AArch64::ANDSWrr:
2034 case AArch64::ANDSWrs:
2035 case AArch64::ANDSXri:
2036 case AArch64::ANDSXrr:
2037 case AArch64::ANDSXrs:
2038 case AArch64::BICSWrr:
2039 case AArch64::BICSXrr:
2040 case AArch64::BICSWrs:
2041 case AArch64::BICSXrs:
2042 return Instr.getOpcode();
2043
2044 case AArch64::ADDWrr:
2045 return AArch64::ADDSWrr;
2046 case AArch64::ADDWri:
2047 return AArch64::ADDSWri;
2048 case AArch64::ADDXrr:
2049 return AArch64::ADDSXrr;
2050 case AArch64::ADDXri:
2051 return AArch64::ADDSXri;
2052 case AArch64::ADDWrx:
2053 return AArch64::ADDSWrx;
2054 case AArch64::ADDXrx:
2055 return AArch64::ADDSXrx;
2056 case AArch64::ADCWr:
2057 return AArch64::ADCSWr;
2058 case AArch64::ADCXr:
2059 return AArch64::ADCSXr;
2060 case AArch64::SUBWrr:
2061 return AArch64::SUBSWrr;
2062 case AArch64::SUBWri:
2063 return AArch64::SUBSWri;
2064 case AArch64::SUBXrr:
2065 return AArch64::SUBSXrr;
2066 case AArch64::SUBXri:
2067 return AArch64::SUBSXri;
2068 case AArch64::SUBWrx:
2069 return AArch64::SUBSWrx;
2070 case AArch64::SUBXrx:
2071 return AArch64::SUBSXrx;
2072 case AArch64::SBCWr:
2073 return AArch64::SBCSWr;
2074 case AArch64::SBCXr:
2075 return AArch64::SBCSXr;
2076 case AArch64::ANDWri:
2077 return AArch64::ANDSWri;
2078 case AArch64::ANDXri:
2079 return AArch64::ANDSXri;
2080 case AArch64::ANDWrr:
2081 return AArch64::ANDSWrr;
2082 case AArch64::ANDWrs:
2083 return AArch64::ANDSWrs;
2084 case AArch64::ANDXrr:
2085 return AArch64::ANDSXrr;
2086 case AArch64::ANDXrs:
2087 return AArch64::ANDSXrs;
2088 case AArch64::BICWrr:
2089 return AArch64::BICSWrr;
2090 case AArch64::BICXrr:
2091 return AArch64::BICSXrr;
2092 case AArch64::BICWrs:
2093 return AArch64::BICSWrs;
2094 case AArch64::BICXrs:
2095 return AArch64::BICSXrs;
2096 }
2097}
2098
2099/// Check if AArch64::NZCV should be alive in successors of MBB.
2100static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
2101 for (auto *BB : MBB->successors())
2102 if (BB->isLiveIn(AArch64::NZCV))
2103 return true;
2104 return false;
2105}
2106
2107/// \returns The condition code operand index for \p Instr if it is a branch
2108/// or select and -1 otherwise.
2109int AArch64InstrInfo::findCondCodeUseOperandIdxForBranchOrSelect(
2110 const MachineInstr &Instr) {
2111 switch (Instr.getOpcode()) {
2112 default:
2113 return -1;
2114
2115 case AArch64::Bcc: {
2116 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2117 assert(Idx >= 2);
2118 return Idx - 2;
2119 }
2120
2121 case AArch64::CSINVWr:
2122 case AArch64::CSINVXr:
2123 case AArch64::CSINCWr:
2124 case AArch64::CSINCXr:
2125 case AArch64::CSELWr:
2126 case AArch64::CSELXr:
2127 case AArch64::CSNEGWr:
2128 case AArch64::CSNEGXr:
2129 case AArch64::FCSELSrrr:
2130 case AArch64::FCSELDrrr: {
2131 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2132 assert(Idx >= 1);
2133 return Idx - 1;
2134 }
2135 }
2136}
2137
2138/// Find a condition code used by the instruction.
2139/// Returns AArch64CC::Invalid if either the instruction does not use condition
2140/// codes or we don't optimize CmpInstr in the presence of such instructions.
2141static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
2142 int CCIdx =
2143 AArch64InstrInfo::findCondCodeUseOperandIdxForBranchOrSelect(Instr);
2144 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
2145 Instr.getOperand(CCIdx).getImm())
2146 : AArch64CC::Invalid;
2147}
2148
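// Map a condition code to the NZCV flags it reads; e.g. getUsedNZCV(HI)
// reports Z and C as used, since HI means "C set and Z clear".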
2150UsedNZCV llvm::getUsedNZCV(AArch64CC::CondCode CC) {
2151 UsedNZCV UsedFlags;
2152 switch (CC) {
2153 default:
2154 break;
2155
2156 case AArch64CC::EQ: // Z set
2157 case AArch64CC::NE: // Z clear
2158 UsedFlags.Z = true;
2159 break;
2160
2161 case AArch64CC::HI: // Z clear and C set
2162 case AArch64CC::LS: // Z set or C clear
2163 UsedFlags.Z = true;
2164 [[fallthrough]];
2165 case AArch64CC::HS: // C set
2166 case AArch64CC::LO: // C clear
2167 UsedFlags.C = true;
2168 break;
2169
2170 case AArch64CC::MI: // N set
2171 case AArch64CC::PL: // N clear
2172 UsedFlags.N = true;
2173 break;
2174
2175 case AArch64CC::VS: // V set
2176 case AArch64CC::VC: // V clear
2177 UsedFlags.V = true;
2178 break;
2179
2180 case AArch64CC::GT: // Z clear, N and V the same
2181 case AArch64CC::LE: // Z set, N and V differ
2182 UsedFlags.Z = true;
2183 [[fallthrough]];
2184 case AArch64CC::GE: // N and V the same
2185 case AArch64CC::LT: // N and V differ
2186 UsedFlags.N = true;
2187 UsedFlags.V = true;
2188 break;
2189 }
2190 return UsedFlags;
2191}
2192
2193/// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV
2194/// flags are not alive in successors of the same \p CmpInstr and \p MI parent.
2195/// \returns std::nullopt otherwise.
2196///
2197/// Collect instructions using the flags in \p CCUseInstrs if provided.
2198std::optional<UsedNZCV>
2199llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
2200 const TargetRegisterInfo &TRI,
2201 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
2202 MachineBasicBlock *CmpParent = CmpInstr.getParent();
2203 if (MI.getParent() != CmpParent)
2204 return std::nullopt;
2205
2206 if (areCFlagsAliveInSuccessors(CmpParent))
2207 return std::nullopt;
2208
2209 UsedNZCV NZCVUsedAfterCmp;
2210 for (MachineInstr &Instr : instructionsWithoutDebug(
2211 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
2212 if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
2213 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
2214 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
2215 return std::nullopt;
2216 NZCVUsedAfterCmp |= getUsedNZCV(CC);
2217 if (CCUseInstrs)
2218 CCUseInstrs->push_back(&Instr);
2219 }
2220 if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
2221 break;
2222 }
2223 return NZCVUsedAfterCmp;
2224}
2225
2226static bool isADDSRegImm(unsigned Opcode) {
2227 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
2228}
2229
2230static bool isSUBSRegImm(unsigned Opcode) {
2231 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
2232}
2233
2234static bool isANDOpcode(MachineInstr &MI) {
2235 unsigned Opc = sForm(MI);
2236 switch (Opc) {
2237 case AArch64::ANDSWri:
2238 case AArch64::ANDSWrr:
2239 case AArch64::ANDSWrs:
2240 case AArch64::ANDSXri:
2241 case AArch64::ANDSXrr:
2242 case AArch64::ANDSXrs:
2243 case AArch64::BICSWrr:
2244 case AArch64::BICSXrr:
2245 case AArch64::BICSWrs:
2246 case AArch64::BICSXrs:
2247 return true;
2248 default:
2249 return false;
2250 }
2251}
2252
2253/// Check if CmpInstr can be substituted by MI.
2254///
2255/// CmpInstr can be substituted:
2256/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2257/// - and, MI and CmpInstr are from the same MachineBB
2258/// - and, condition flags are not alive in successors of the CmpInstr parent
2259/// - and, if MI opcode is the S form there must be no defs of flags between
2260/// MI and CmpInstr
2261/// or if MI opcode is not the S form there must be neither defs of flags
2262/// nor uses of flags between MI and CmpInstr.
2263/// - and, if C/V flags are not used after CmpInstr
2264/// or if N flag is used but MI produces poison value if signed overflow
2265/// occurs.
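/// For example, in
/// \code
///   %v = SUBWrr %a, %b
///   SUBSWri %v, 0        ; compare %v with zero
///   b.eq ...
/// \endcode
/// the compare can be removed by turning the subtract into SUBSWrr, because
/// only the Z flag is read afterwards.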
2266static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
2267 const TargetRegisterInfo &TRI) {
2268 // NOTE this assertion guarantees that MI.getOpcode() is an add or subtract
2269 // that may or may not set flags.
2270 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
2271
2272 const unsigned CmpOpcode = CmpInstr.getOpcode();
2273 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
2274 return false;
2275
2276 assert((CmpInstr.getOperand(2).isImm() &&
2277 CmpInstr.getOperand(2).getImm() == 0) &&
2278 "Caller guarantees that CmpInstr compares with constant 0");
2279
2280 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
2281 if (!NZVCUsed || NZVCUsed->C)
2282 return false;
2283
2284 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
2285 // '%vreg = add ...' or '%vreg = sub ...'.
2286 // Condition flag V is used to indicate signed overflow.
2287 // 1) MI and CmpInstr set N and V to the same value.
2288 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
2289 // signed overflow occurs, so CmpInstr could still be simplified away.
2290 // Note that Ands and Bics instructions always clear the V flag.
2291 if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI))
2292 return false;
2293
2294 AccessKind AccessToCheck = AK_Write;
2295 if (sForm(MI) != MI.getOpcode())
2296 AccessToCheck = AK_All;
2297 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
2298}
2299
2300/// Substitute an instruction comparing to zero with another instruction
2301/// which produces needed condition flags.
2302///
2303/// Return true on success.
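/// For example (schematic MIR):
/// \code
///   %2:gpr32 = SUBWrr %0, %1
///   %3:gpr32 = SUBSWri %2, 0, 0, implicit-def $nzcv   ; %3 unused
/// \endcode
/// becomes
/// \code
///   %2:gpr32 = SUBSWrr %0, %1, implicit-def $nzcv
/// \endcode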
2304bool AArch64InstrInfo::substituteCmpToZero(
2305 MachineInstr &CmpInstr, unsigned SrcReg,
2306 const MachineRegisterInfo &MRI) const {
2307 // Get the unique definition of SrcReg.
2308 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2309 if (!MI)
2310 return false;
2311
2312 const TargetRegisterInfo &TRI = getRegisterInfo();
2313
2314 unsigned NewOpc = sForm(*MI);
2315 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
2316 return false;
2317
2318 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
2319 return false;
2320
2321 // Update the instruction to set NZCV.
2322 MI->setDesc(get(NewOpc));
2323 CmpInstr.eraseFromParent();
2324 bool succeeded = UpdateOperandRegClass(*MI);
2325 (void)succeeded;
2326 assert(succeeded && "Some operands reg class are incompatible!");
2327 MI->addRegisterDefined(AArch64::NZCV, &TRI);
2328 return true;
2329}
2330
2331/// \returns True if \p CmpInstr can be removed.
2332///
2333/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
2334/// codes used in \p CCUseInstrs must be inverted.
2335static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
2336 int CmpValue, const TargetRegisterInfo &TRI,
2337 SmallVectorImpl<MachineInstr *> &CCUseInstrs,
2338 bool &IsInvertCC) {
2339 assert((CmpValue == 0 || CmpValue == 1) &&
2340 "Only comparisons to 0 or 1 considered for removal!");
2341
2342 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
2343 unsigned MIOpc = MI.getOpcode();
2344 if (MIOpc == AArch64::CSINCWr) {
2345 if (MI.getOperand(1).getReg() != AArch64::WZR ||
2346 MI.getOperand(2).getReg() != AArch64::WZR)
2347 return false;
2348 } else if (MIOpc == AArch64::CSINCXr) {
2349 if (MI.getOperand(1).getReg() != AArch64::XZR ||
2350 MI.getOperand(2).getReg() != AArch64::XZR)
2351 return false;
2352 } else {
2353 return false;
2354 }
2355 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
2356 if (MICC == AArch64CC::Invalid)
2357 return false;
2358
2359 // NZCV needs to be defined
2360 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
2361 return false;
2362
2363 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
2364 const unsigned CmpOpcode = CmpInstr.getOpcode();
2365 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
2366 if (CmpValue && !IsSubsRegImm)
2367 return false;
2368 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
2369 return false;
2370
2371 // MI conditions allowed: eq, ne, mi, pl
2372 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
2373 if (MIUsedNZCV.C || MIUsedNZCV.V)
2374 return false;
2375
2376 std::optional<UsedNZCV> NZCVUsedAfterCmp =
2377 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
2378 // Condition flags are not used in CmpInstr basic block successors and only
2379 // Z or N flags are allowed to be used after CmpInstr within its basic block
2380 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
2381 return false;
2382 // Z or N flag used after CmpInstr must correspond to the flag used in MI
2383 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
2384 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
2385 return false;
2386 // If CmpInstr is a comparison to zero, MI conditions are limited to eq, ne
2387 if (MIUsedNZCV.N && !CmpValue)
2388 return false;
2389
2390 // There must be no defs of flags between MI and CmpInstr
2391 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
2392 return false;
2393
2394 // Condition code is inverted in the following cases:
2395 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2396 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
2397 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
2398 (!CmpValue && MICC == AArch64CC::NE);
2399 return true;
2400}
2401
2402/// Remove comparison in csinc-cmp sequence
2403///
2404/// Examples:
2405/// 1. \code
2406/// csinc w9, wzr, wzr, ne
2407/// cmp w9, #0
2408/// b.eq
2409/// \endcode
2410/// to
2411/// \code
2412/// csinc w9, wzr, wzr, ne
2413/// b.ne
2414/// \endcode
2415///
2416/// 2. \code
2417/// csinc x2, xzr, xzr, mi
2418/// cmp x2, #1
2419/// b.pl
2420/// \endcode
2421/// to
2422/// \code
2423/// csinc x2, xzr, xzr, mi
2424/// b.pl
2425/// \endcode
2426///
2427/// \param CmpInstr comparison instruction
2428/// \return True when comparison removed
2429bool AArch64InstrInfo::removeCmpToZeroOrOne(
2430 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
2431 const MachineRegisterInfo &MRI) const {
2432 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2433 if (!MI)
2434 return false;
2435 const TargetRegisterInfo &TRI = getRegisterInfo();
2436 SmallVector<MachineInstr *, 4> CCUseInstrs;
2437 bool IsInvertCC = false;
2438 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
2439 IsInvertCC))
2440 return false;
2441 // Make transformation
2442 CmpInstr.eraseFromParent();
2443 if (IsInvertCC) {
2444 // Invert condition codes in CmpInstr CC users
2445 for (MachineInstr *CCUseInstr : CCUseInstrs) {
2446 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
2447 assert(Idx >= 0 && "Unexpected instruction using CC.");
2448 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
2449 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
2450 static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
2451 CCOperand.setImm(CCUse);
2452 }
2453 }
2454 return true;
2455}
2456
2457bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2458 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
2459 MI.getOpcode() != AArch64::CATCHRET)
2460 return false;
2461
2462 MachineBasicBlock &MBB = *MI.getParent();
2463 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
2464 auto TRI = Subtarget.getRegisterInfo();
2465 DebugLoc DL = MI.getDebugLoc();
2466
2467 if (MI.getOpcode() == AArch64::CATCHRET) {
2468 // Skip to the first instruction before the epilog.
2469 const TargetInstrInfo *TII =
2470 MBB.getParent()->getSubtarget().getInstrInfo();
2471 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
2472 MachineBasicBlock::iterator MBBI = MachineBasicBlock::iterator(MI);
2473 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
2474 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
2475 FirstEpilogSEH != MBB.begin())
2476 FirstEpilogSEH = std::prev(FirstEpilogSEH);
2477 if (FirstEpilogSEH != MBB.begin())
2478 FirstEpilogSEH = std::next(FirstEpilogSEH);
2479 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
2480 .addReg(AArch64::X0, RegState::Define)
2481 .addMBB(TargetMBB);
2482 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
2483 .addReg(AArch64::X0, RegState::Define)
2484 .addReg(AArch64::X0)
2485 .addMBB(TargetMBB)
2486 .addImm(0);
2487 TargetMBB->setMachineBlockAddressTaken();
2488 return true;
2489 }
2490
2491 Register Reg = MI.getOperand(0).getReg();
2492 const Module &M = *MBB.getParent()->getFunction().getParent();
2493 if (M.getStackProtectorGuard() == "sysreg") {
2494 const AArch64SysReg::SysReg *SrcReg =
2495 AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
2496 if (!SrcReg)
2497 report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
2498
2499 // mrs xN, sysreg
2500 BuildMI(MBB, MI, DL, get(AArch64::MRS))
2501 .addDef(Reg, RegState::Renamable)
2502 .addImm(SrcReg->Encoding);
2503 int Offset = M.getStackProtectorGuardOffset();
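 // Each branch below picks the addressing form that can encode the offset:
 // LDRXui takes an unsigned 12-bit immediate scaled by 8 (0..32760 in steps
 // of 8), LDURXi a signed 9-bit immediate (-256..255), and otherwise an
 // add/sub of an unsigned 12-bit immediate is emitted first, e.g. offset
 // 4095 becomes "add xN, xN, #4095; ldr xN, [xN]".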
2504 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2505 // ldr xN, [xN, #offset]
2506 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2507 .addDef(Reg)
2508 .addUse(Reg, RegState::Kill)
2509 .addImm(Offset / 8);
2510 } else if (Offset >= -256 && Offset <= 255) {
2511 // ldur xN, [xN, #offset]
2512 BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2513 .addDef(Reg)
2514 .addUse(Reg, RegState::Kill)
2515 .addImm(Offset);
2516 } else if (Offset >= -4095 && Offset <= 4095) {
2517 if (Offset > 0) {
2518 // add xN, xN, #offset
2519 BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2520 .addDef(Reg)
2521 .addUse(Reg, RegState::Kill)
2522 .addImm(Offset)
2523 .addImm(0);
2524 } else {
2525 // sub xN, xN, #offset
2526 BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2527 .addDef(Reg)
2528 .addUse(Reg, RegState::Kill)
2529 .addImm(-Offset)
2530 .addImm(0);
2531 }
2532 // ldr xN, [xN]
2533 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2534 .addDef(Reg)
2535 .addUse(Reg, RegState::Kill)
2536 .addImm(0);
2537 } else {
2538 // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2539 // than 32760.
2540 // It might be nice to use AArch64::MOVi32imm here, which would get
2541 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2542 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2543 // AArch64FrameLowering might help us find such a scratch register
2544 // though. If we failed to find a scratch register, we could emit a
2545 // stream of add instructions to build up the immediate. Or, we could try
2546 // to insert a AArch64::MOVi32imm before register allocation so that we
2547 // didn't need to scavenge for a scratch register.
2548 report_fatal_error("Unable to encode Stack Protector Guard Offset");
2549 }
2550 MBB.erase(MI);
2551 return true;
2552 }
2553
2554 const GlobalValue *GV =
2555 cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2556 const TargetMachine &TM = MBB.getParent()->getTarget();
2557 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2558 const unsigned char MO_NC = AArch64II::MO_NC;
2559
2560 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2561 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2562 .addGlobalAddress(GV, 0, OpFlags);
2563 if (Subtarget.isTargetILP32()) {
2564 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2565 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2566 .addDef(Reg32, RegState::Dead)
2567 .addUse(Reg, RegState::Kill)
2568 .addImm(0)
2569 .addMemOperand(*MI.memoperands_begin())
2570 .addDef(Reg, RegState::Implicit);
2571 } else {
2572 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2573 .addUse(Reg, RegState::Kill)
2574 .addImm(0)
2575 .addMemOperand(*MI.memoperands_begin());
2576 }
2577 } else if (TM.getCodeModel() == CodeModel::Large) {
2578 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2579 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2580 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2581 .addImm(0);
2582 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2583 .addReg(Reg, RegState::Kill)
2584 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2585 .addImm(16);
2586 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2587 .addReg(Reg, RegState::Kill)
2588 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2589 .addImm(32);
2590 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2591 .addReg(Reg, RegState::Kill)
2592 .addGlobalAddress(GV, 0, AArch64II::MO_G3)
2593 .addImm(48);
2594 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2595 .addUse(Reg, RegState::Kill)
2596 .addImm(0)
2597 .addMemOperand(*MI.memoperands_begin());
2598 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2599 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2600 .addGlobalAddress(GV, 0, OpFlags);
2601 } else {
2602 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2603 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2604 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2605 if (Subtarget.isTargetILP32()) {
2606 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2607 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2608 .addDef(Reg32, RegState::Dead)
2609 .addUse(Reg, RegState::Kill)
2610 .addGlobalAddress(GV, 0, LoFlags)
2611 .addMemOperand(*MI.memoperands_begin())
2612 .addDef(Reg, RegState::Implicit);
2613 } else {
2614 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2615 .addUse(Reg, RegState::Kill)
2616 .addGlobalAddress(GV, 0, LoFlags)
2617 .addMemOperand(*MI.memoperands_begin());
2618 }
2619 }
2620
2621 MBB.erase(MI);
2622
2623 return true;
2624}
2625
2626// Return true if this instruction simply sets its single destination register
2627// to zero. This is equivalent to a register rename of the zero-register.
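// For example, "movz w0, #0" and "and w0, wzr, #imm" both leave the
// destination equal to zero, as does a COPY from WZR.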
2628bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2629 switch (MI.getOpcode()) {
2630 default:
2631 break;
2632 case AArch64::MOVZWi:
2633 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2634 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2635 assert(MI.getDesc().getNumOperands() == 3 &&
2636 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2637 return true;
2638 }
2639 break;
2640 case AArch64::ANDWri: // and Rd, Rzr, #imm
2641 return MI.getOperand(1).getReg() == AArch64::WZR;
2642 case AArch64::ANDXri:
2643 return MI.getOperand(1).getReg() == AArch64::XZR;
2644 case TargetOpcode::COPY:
2645 return MI.getOperand(1).getReg() == AArch64::WZR;
2646 }
2647 return false;
2648}
2649
2650// Return true if this instruction simply renames a general register without
2651// modifying bits.
2652bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2653 switch (MI.getOpcode()) {
2654 default:
2655 break;
2656 case TargetOpcode::COPY: {
2657 // GPR32 copies will be lowered to ORRXrs
2658 Register DstReg = MI.getOperand(0).getReg();
2659 return (AArch64::GPR32RegClass.contains(DstReg) ||
2660 AArch64::GPR64RegClass.contains(DstReg));
2661 }
2662 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2663 if (MI.getOperand(1).getReg() == AArch64::XZR) {
2664 assert(MI.getDesc().getNumOperands() == 4 &&
2665 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2666 return true;
2667 }
2668 break;
2669 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2670 if (MI.getOperand(2).getImm() == 0) {
2671 assert(MI.getDesc().getNumOperands() == 4 &&
2672 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2673 return true;
2674 }
2675 break;
2676 }
2677 return false;
2678}
2679
2680// Return true if this instruction simply renames a floating-point register
2681// without modifying bits.
2682bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2683 switch (MI.getOpcode()) {
2684 default:
2685 break;
2686 case TargetOpcode::COPY: {
2687 Register DstReg = MI.getOperand(0).getReg();
2688 return AArch64::FPR128RegClass.contains(DstReg);
2689 }
2690 case AArch64::ORRv16i8:
2691 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2692 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2693 "invalid ORRv16i8 operands");
2694 return true;
2695 }
2696 break;
2697 }
2698 return false;
2699}
2700
2701static bool isFrameLoadOpcode(int Opcode) {
2702 switch (Opcode) {
2703 default:
2704 return false;
2705 case AArch64::LDRWui:
2706 case AArch64::LDRXui:
2707 case AArch64::LDRBui:
2708 case AArch64::LDRHui:
2709 case AArch64::LDRSui:
2710 case AArch64::LDRDui:
2711 case AArch64::LDRQui:
2712 case AArch64::LDR_PXI:
2713 return true;
2714 }
2715}
2716
2717Register AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2718 int &FrameIndex) const {
2719 if (!isFrameLoadOpcode(MI.getOpcode()))
2720 return Register();
2721
2722 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2723 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2724 FrameIndex = MI.getOperand(1).getIndex();
2725 return MI.getOperand(0).getReg();
2726 }
2727 return Register();
2728}
2729
2730static bool isFrameStoreOpcode(int Opcode) {
2731 switch (Opcode) {
2732 default:
2733 return false;
2734 case AArch64::STRWui:
2735 case AArch64::STRXui:
2736 case AArch64::STRBui:
2737 case AArch64::STRHui:
2738 case AArch64::STRSui:
2739 case AArch64::STRDui:
2740 case AArch64::STRQui:
2741 case AArch64::STR_PXI:
2742 return true;
2743 }
2744}
2745
2746Register AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2747 int &FrameIndex) const {
2748 if (!isFrameStoreOpcode(MI.getOpcode()))
2749 return Register();
2750
2751 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2752 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2753 FrameIndex = MI.getOperand(1).getIndex();
2754 return MI.getOperand(0).getReg();
2755 }
2756 return Register();
2757}
2758
2759Register AArch64InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
2760 int &FrameIndex) const {
2761 if (!isFrameStoreOpcode(MI.getOpcode()))
2762 return Register();
2763
2764 if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
2765 return Reg;
2766
2767 SmallVector<const MachineMemOperand *, 1> Accesses;
2768 if (hasStoreToStackSlot(MI, Accesses)) {
2769 if (Accesses.size() > 1)
2770 return Register();
2771
2772 FrameIndex =
2773 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2774 ->getFrameIndex();
2775 return MI.getOperand(0).getReg();
2776 }
2777 return Register();
2778}
2779
2780Register AArch64InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
2781 int &FrameIndex) const {
2782 if (!isFrameLoadOpcode(MI.getOpcode()))
2783 return Register();
2784
2785 if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
2786 return Reg;
2787
2788 SmallVector<const MachineMemOperand *, 1> Accesses;
2789 if (hasLoadFromStackSlot(MI, Accesses)) {
2790 if (Accesses.size() > 1)
2791 return Register();
2792
2793 FrameIndex =
2794 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2795 ->getFrameIndex();
2796 return MI.getOperand(0).getReg();
2797 }
2798 return Register();
2799}
2800
2801/// Check all MachineMemOperands for a hint to suppress pairing.
2802bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2803 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2804 return MMO->getFlags() & MOSuppressPair;
2805 });
2806}
2807
2808/// Set a flag on the first MachineMemOperand to suppress pairing.
2809void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2810 if (MI.memoperands_empty())
2811 return;
2812 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2813}
2814
2815/// Check all MachineMemOperands for a hint that the load/store is strided.
2816bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2817 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2818 return MMO->getFlags() & MOStridedAccess;
2819 });
2820}
2821
2822bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2823 switch (Opc) {
2824 default:
2825 return false;
2826 case AArch64::STURSi:
2827 case AArch64::STRSpre:
2828 case AArch64::STURDi:
2829 case AArch64::STRDpre:
2830 case AArch64::STURQi:
2831 case AArch64::STRQpre:
2832 case AArch64::STURBBi:
2833 case AArch64::STURHHi:
2834 case AArch64::STURWi:
2835 case AArch64::STRWpre:
2836 case AArch64::STURXi:
2837 case AArch64::STRXpre:
2838 case AArch64::LDURSi:
2839 case AArch64::LDRSpre:
2840 case AArch64::LDURDi:
2841 case AArch64::LDRDpre:
2842 case AArch64::LDURQi:
2843 case AArch64::LDRQpre:
2844 case AArch64::LDURWi:
2845 case AArch64::LDRWpre:
2846 case AArch64::LDURXi:
2847 case AArch64::LDRXpre:
2848 case AArch64::LDRSWpre:
2849 case AArch64::LDURSWi:
2850 case AArch64::LDURHHi:
2851 case AArch64::LDURBBi:
2852 case AArch64::LDURSBWi:
2853 case AArch64::LDURSHWi:
2854 return true;
2855 }
2856}
2857
2858std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2859 switch (Opc) {
2860 default: return {};
2861 case AArch64::PRFMui: return AArch64::PRFUMi;
2862 case AArch64::LDRXui: return AArch64::LDURXi;
2863 case AArch64::LDRWui: return AArch64::LDURWi;
2864 case AArch64::LDRBui: return AArch64::LDURBi;
2865 case AArch64::LDRHui: return AArch64::LDURHi;
2866 case AArch64::LDRSui: return AArch64::LDURSi;
2867 case AArch64::LDRDui: return AArch64::LDURDi;
2868 case AArch64::LDRQui: return AArch64::LDURQi;
2869 case AArch64::LDRBBui: return AArch64::LDURBBi;
2870 case AArch64::LDRHHui: return AArch64::LDURHHi;
2871 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2872 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2873 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2874 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2875 case AArch64::LDRSWui: return AArch64::LDURSWi;
2876 case AArch64::STRXui: return AArch64::STURXi;
2877 case AArch64::STRWui: return AArch64::STURWi;
2878 case AArch64::STRBui: return AArch64::STURBi;
2879 case AArch64::STRHui: return AArch64::STURHi;
2880 case AArch64::STRSui: return AArch64::STURSi;
2881 case AArch64::STRDui: return AArch64::STURDi;
2882 case AArch64::STRQui: return AArch64::STURQi;
2883 case AArch64::STRBBui: return AArch64::STURBBi;
2884 case AArch64::STRHHui: return AArch64::STURHHi;
2885 }
2886}
2887
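// Return the operand index of the immediate offset for a memory instruction,
// e.g. 2 for "LDRXui $Rt, $Rn, $imm", 3 for the pre/post-indexed
// "LDRXpre $Rn_wb, $Rt, $Rn, $imm", and 4 for the paired pre/post-indexed
// "LDPXpre $Rn_wb, $Rt, $Rt2, $Rn, $imm".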
2888unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2889 switch (Opc) {
2890 default:
2891 llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx");
2892 case AArch64::ADDG:
2893 case AArch64::LDAPURBi:
2894 case AArch64::LDAPURHi:
2895 case AArch64::LDAPURi:
2896 case AArch64::LDAPURSBWi:
2897 case AArch64::LDAPURSBXi:
2898 case AArch64::LDAPURSHWi:
2899 case AArch64::LDAPURSHXi:
2900 case AArch64::LDAPURSWi:
2901 case AArch64::LDAPURXi:
2902 case AArch64::LDR_PPXI:
2903 case AArch64::LDR_PXI:
2904 case AArch64::LDR_ZXI:
2905 case AArch64::LDR_ZZXI:
2906 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
2907 case AArch64::LDR_ZZZXI:
2908 case AArch64::LDR_ZZZZXI:
2909 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
2910 case AArch64::LDRBBui:
2911 case AArch64::LDRBui:
2912 case AArch64::LDRDui:
2913 case AArch64::LDRHHui:
2914 case AArch64::LDRHui:
2915 case AArch64::LDRQui:
2916 case AArch64::LDRSBWui:
2917 case AArch64::LDRSBXui:
2918 case AArch64::LDRSHWui:
2919 case AArch64::LDRSHXui:
2920 case AArch64::LDRSui:
2921 case AArch64::LDRSWui:
2922 case AArch64::LDRWui:
2923 case AArch64::LDRXui:
2924 case AArch64::LDURBBi:
2925 case AArch64::LDURBi:
2926 case AArch64::LDURDi:
2927 case AArch64::LDURHHi:
2928 case AArch64::LDURHi:
2929 case AArch64::LDURQi:
2930 case AArch64::LDURSBWi:
2931 case AArch64::LDURSBXi:
2932 case AArch64::LDURSHWi:
2933 case AArch64::LDURSHXi:
2934 case AArch64::LDURSi:
2935 case AArch64::LDURSWi:
2936 case AArch64::LDURWi:
2937 case AArch64::LDURXi:
2938 case AArch64::PRFMui:
2939 case AArch64::PRFUMi:
2940 case AArch64::ST2Gi:
2941 case AArch64::STGi:
2942 case AArch64::STLURBi:
2943 case AArch64::STLURHi:
2944 case AArch64::STLURWi:
2945 case AArch64::STLURXi:
2946 case AArch64::StoreSwiftAsyncContext:
2947 case AArch64::STR_PPXI:
2948 case AArch64::STR_PXI:
2949 case AArch64::STR_ZXI:
2950 case AArch64::STR_ZZXI:
2951 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
2952 case AArch64::STR_ZZZXI:
2953 case AArch64::STR_ZZZZXI:
2954 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
2955 case AArch64::STRBBui:
2956 case AArch64::STRBui:
2957 case AArch64::STRDui:
2958 case AArch64::STRHHui:
2959 case AArch64::STRHui:
2960 case AArch64::STRQui:
2961 case AArch64::STRSui:
2962 case AArch64::STRWui:
2963 case AArch64::STRXui:
2964 case AArch64::STURBBi:
2965 case AArch64::STURBi:
2966 case AArch64::STURDi:
2967 case AArch64::STURHHi:
2968 case AArch64::STURHi:
2969 case AArch64::STURQi:
2970 case AArch64::STURSi:
2971 case AArch64::STURWi:
2972 case AArch64::STURXi:
2973 case AArch64::STZ2Gi:
2974 case AArch64::STZGi:
2975 case AArch64::TAGPstack:
2976 return 2;
2977 case AArch64::LD1B_D_IMM:
2978 case AArch64::LD1B_H_IMM:
2979 case AArch64::LD1B_IMM:
2980 case AArch64::LD1B_S_IMM:
2981 case AArch64::LD1D_IMM:
2982 case AArch64::LD1H_D_IMM:
2983 case AArch64::LD1H_IMM:
2984 case AArch64::LD1H_S_IMM:
2985 case AArch64::LD1RB_D_IMM:
2986 case AArch64::LD1RB_H_IMM:
2987 case AArch64::LD1RB_IMM:
2988 case AArch64::LD1RB_S_IMM:
2989 case AArch64::LD1RD_IMM:
2990 case AArch64::LD1RH_D_IMM:
2991 case AArch64::LD1RH_IMM:
2992 case AArch64::LD1RH_S_IMM:
2993 case AArch64::LD1RSB_D_IMM:
2994 case AArch64::LD1RSB_H_IMM:
2995 case AArch64::LD1RSB_S_IMM:
2996 case AArch64::LD1RSH_D_IMM:
2997 case AArch64::LD1RSH_S_IMM:
2998 case AArch64::LD1RSW_IMM:
2999 case AArch64::LD1RW_D_IMM:
3000 case AArch64::LD1RW_IMM:
3001 case AArch64::LD1SB_D_IMM:
3002 case AArch64::LD1SB_H_IMM:
3003 case AArch64::LD1SB_S_IMM:
3004 case AArch64::LD1SH_D_IMM:
3005 case AArch64::LD1SH_S_IMM:
3006 case AArch64::LD1SW_D_IMM:
3007 case AArch64::LD1W_D_IMM:
3008 case AArch64::LD1W_IMM:
3009 case AArch64::LD2B_IMM:
3010 case AArch64::LD2D_IMM:
3011 case AArch64::LD2H_IMM:
3012 case AArch64::LD2W_IMM:
3013 case AArch64::LD3B_IMM:
3014 case AArch64::LD3D_IMM:
3015 case AArch64::LD3H_IMM:
3016 case AArch64::LD3W_IMM:
3017 case AArch64::LD4B_IMM:
3018 case AArch64::LD4D_IMM:
3019 case AArch64::LD4H_IMM:
3020 case AArch64::LD4W_IMM:
3021 case AArch64::LDG:
3022 case AArch64::LDNF1B_D_IMM:
3023 case AArch64::LDNF1B_H_IMM:
3024 case AArch64::LDNF1B_IMM:
3025 case AArch64::LDNF1B_S_IMM:
3026 case AArch64::LDNF1D_IMM:
3027 case AArch64::LDNF1H_D_IMM:
3028 case AArch64::LDNF1H_IMM:
3029 case AArch64::LDNF1H_S_IMM:
3030 case AArch64::LDNF1SB_D_IMM:
3031 case AArch64::LDNF1SB_H_IMM:
3032 case AArch64::LDNF1SB_S_IMM:
3033 case AArch64::LDNF1SH_D_IMM:
3034 case AArch64::LDNF1SH_S_IMM:
3035 case AArch64::LDNF1SW_D_IMM:
3036 case AArch64::LDNF1W_D_IMM:
3037 case AArch64::LDNF1W_IMM:
3038 case AArch64::LDNPDi:
3039 case AArch64::LDNPQi:
3040 case AArch64::LDNPSi:
3041 case AArch64::LDNPWi:
3042 case AArch64::LDNPXi:
3043 case AArch64::LDNT1B_ZRI:
3044 case AArch64::LDNT1D_ZRI:
3045 case AArch64::LDNT1H_ZRI:
3046 case AArch64::LDNT1W_ZRI:
3047 case AArch64::LDPDi:
3048 case AArch64::LDPQi:
3049 case AArch64::LDPSi:
3050 case AArch64::LDPWi:
3051 case AArch64::LDPXi:
3052 case AArch64::LDRBBpost:
3053 case AArch64::LDRBBpre:
3054 case AArch64::LDRBpost:
3055 case AArch64::LDRBpre:
3056 case AArch64::LDRDpost:
3057 case AArch64::LDRDpre:
3058 case AArch64::LDRHHpost:
3059 case AArch64::LDRHHpre:
3060 case AArch64::LDRHpost:
3061 case AArch64::LDRHpre:
3062 case AArch64::LDRQpost:
3063 case AArch64::LDRQpre:
3064 case AArch64::LDRSpost:
3065 case AArch64::LDRSpre:
3066 case AArch64::LDRWpost:
3067 case AArch64::LDRWpre:
3068 case AArch64::LDRXpost:
3069 case AArch64::LDRXpre:
3070 case AArch64::ST1B_D_IMM:
3071 case AArch64::ST1B_H_IMM:
3072 case AArch64::ST1B_IMM:
3073 case AArch64::ST1B_S_IMM:
3074 case AArch64::ST1D_IMM:
3075 case AArch64::ST1H_D_IMM:
3076 case AArch64::ST1H_IMM:
3077 case AArch64::ST1H_S_IMM:
3078 case AArch64::ST1W_D_IMM:
3079 case AArch64::ST1W_IMM:
3080 case AArch64::ST2B_IMM:
3081 case AArch64::ST2D_IMM:
3082 case AArch64::ST2H_IMM:
3083 case AArch64::ST2W_IMM:
3084 case AArch64::ST3B_IMM:
3085 case AArch64::ST3D_IMM:
3086 case AArch64::ST3H_IMM:
3087 case AArch64::ST3W_IMM:
3088 case AArch64::ST4B_IMM:
3089 case AArch64::ST4D_IMM:
3090 case AArch64::ST4H_IMM:
3091 case AArch64::ST4W_IMM:
3092 case AArch64::STGPi:
3093 case AArch64::STGPreIndex:
3094 case AArch64::STZGPreIndex:
3095 case AArch64::ST2GPreIndex:
3096 case AArch64::STZ2GPreIndex:
3097 case AArch64::STGPostIndex:
3098 case AArch64::STZGPostIndex:
3099 case AArch64::ST2GPostIndex:
3100 case AArch64::STZ2GPostIndex:
3101 case AArch64::STNPDi:
3102 case AArch64::STNPQi:
3103 case AArch64::STNPSi:
3104 case AArch64::STNPWi:
3105 case AArch64::STNPXi:
3106 case AArch64::STNT1B_ZRI:
3107 case AArch64::STNT1D_ZRI:
3108 case AArch64::STNT1H_ZRI:
3109 case AArch64::STNT1W_ZRI:
3110 case AArch64::STPDi:
3111 case AArch64::STPQi:
3112 case AArch64::STPSi:
3113 case AArch64::STPWi:
3114 case AArch64::STPXi:
3115 case AArch64::STRBBpost:
3116 case AArch64::STRBBpre:
3117 case AArch64::STRBpost:
3118 case AArch64::STRBpre:
3119 case AArch64::STRDpost:
3120 case AArch64::STRDpre:
3121 case AArch64::STRHHpost:
3122 case AArch64::STRHHpre:
3123 case AArch64::STRHpost:
3124 case AArch64::STRHpre:
3125 case AArch64::STRQpost:
3126 case AArch64::STRQpre:
3127 case AArch64::STRSpost:
3128 case AArch64::STRSpre:
3129 case AArch64::STRWpost:
3130 case AArch64::STRWpre:
3131 case AArch64::STRXpost:
3132 case AArch64::STRXpre:
3133 return 3;
3134 case AArch64::LDPDpost:
3135 case AArch64::LDPDpre:
3136 case AArch64::LDPQpost:
3137 case AArch64::LDPQpre:
3138 case AArch64::LDPSpost:
3139 case AArch64::LDPSpre:
3140 case AArch64::LDPWpost:
3141 case AArch64::LDPWpre:
3142 case AArch64::LDPXpost:
3143 case AArch64::LDPXpre:
3144 case AArch64::STGPpre:
3145 case AArch64::STGPpost:
3146 case AArch64::STPDpost:
3147 case AArch64::STPDpre:
3148 case AArch64::STPQpost:
3149 case AArch64::STPQpre:
3150 case AArch64::STPSpost:
3151 case AArch64::STPSpre:
3152 case AArch64::STPWpost:
3153 case AArch64::STPWpre:
3154 case AArch64::STPXpost:
3155 case AArch64::STPXpre:
3156 return 4;
3157 }
3158}
3159
3160bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
3161 switch (MI.getOpcode()) {
3162 default:
3163 return false;
3164 // Scaled instructions.
3165 case AArch64::STRSui:
3166 case AArch64::STRDui:
3167 case AArch64::STRQui:
3168 case AArch64::STRXui:
3169 case AArch64::STRWui:
3170 case AArch64::LDRSui:
3171 case AArch64::LDRDui:
3172 case AArch64::LDRQui:
3173 case AArch64::LDRXui:
3174 case AArch64::LDRWui:
3175 case AArch64::LDRSWui:
3176 // Unscaled instructions.
3177 case AArch64::STURSi:
3178 case AArch64::STRSpre:
3179 case AArch64::STURDi:
3180 case AArch64::STRDpre:
3181 case AArch64::STURQi:
3182 case AArch64::STRQpre:
3183 case AArch64::STURWi:
3184 case AArch64::STRWpre:
3185 case AArch64::STURXi:
3186 case AArch64::STRXpre:
3187 case AArch64::LDURSi:
3188 case AArch64::LDRSpre:
3189 case AArch64::LDURDi:
3190 case AArch64::LDRDpre:
3191 case AArch64::LDURQi:
3192 case AArch64::LDRQpre:
3193 case AArch64::LDURWi:
3194 case AArch64::LDRWpre:
3195 case AArch64::LDURXi:
3196 case AArch64::LDRXpre:
3197 case AArch64::LDURSWi:
3198 case AArch64::LDRSWpre:
3199 // SVE instructions.
3200 case AArch64::LDR_ZXI:
3201 case AArch64::STR_ZXI:
3202 return true;
3203 }
3204}
3205
3206bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
3207 switch (MI.getOpcode()) {
3208 default:
3209 assert((!MI.isCall() || !MI.isReturn()) &&
3210 "Unexpected instruction - was a new tail call opcode introduced?");
3211 return false;
3212 case AArch64::TCRETURNdi:
3213 case AArch64::TCRETURNri:
3214 case AArch64::TCRETURNrix16x17:
3215 case AArch64::TCRETURNrix17:
3216 case AArch64::TCRETURNrinotx16:
3217 case AArch64::TCRETURNriALL:
3218 case AArch64::AUTH_TCRETURN:
3219 case AArch64::AUTH_TCRETURN_BTI:
3220 return true;
3221 }
3222}
3223
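// Return the flag-setting form of an opcode, e.g. ADDWri -> ADDSWri or, for
// SVE predicates, AND_PPzPP -> ANDS_PPzPP; used when removing a following
// compare or ptest requires the producer to set NZCV itself.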
3224unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
3225 switch (Opc) {
3226 default:
3227 llvm_unreachable("Opcode has no flag setting equivalent!");
3228 // 32-bit cases:
3229 case AArch64::ADDWri:
3230 return AArch64::ADDSWri;
3231 case AArch64::ADDWrr:
3232 return AArch64::ADDSWrr;
3233 case AArch64::ADDWrs:
3234 return AArch64::ADDSWrs;
3235 case AArch64::ADDWrx:
3236 return AArch64::ADDSWrx;
3237 case AArch64::ANDWri:
3238 return AArch64::ANDSWri;
3239 case AArch64::ANDWrr:
3240 return AArch64::ANDSWrr;
3241 case AArch64::ANDWrs:
3242 return AArch64::ANDSWrs;
3243 case AArch64::BICWrr:
3244 return AArch64::BICSWrr;
3245 case AArch64::BICWrs:
3246 return AArch64::BICSWrs;
3247 case AArch64::SUBWri:
3248 return AArch64::SUBSWri;
3249 case AArch64::SUBWrr:
3250 return AArch64::SUBSWrr;
3251 case AArch64::SUBWrs:
3252 return AArch64::SUBSWrs;
3253 case AArch64::SUBWrx:
3254 return AArch64::SUBSWrx;
3255 // 64-bit cases:
3256 case AArch64::ADDXri:
3257 return AArch64::ADDSXri;
3258 case AArch64::ADDXrr:
3259 return AArch64::ADDSXrr;
3260 case AArch64::ADDXrs:
3261 return AArch64::ADDSXrs;
3262 case AArch64::ADDXrx:
3263 return AArch64::ADDSXrx;
3264 case AArch64::ANDXri:
3265 return AArch64::ANDSXri;
3266 case AArch64::ANDXrr:
3267 return AArch64::ANDSXrr;
3268 case AArch64::ANDXrs:
3269 return AArch64::ANDSXrs;
3270 case AArch64::BICXrr:
3271 return AArch64::BICSXrr;
3272 case AArch64::BICXrs:
3273 return AArch64::BICSXrs;
3274 case AArch64::SUBXri:
3275 return AArch64::SUBSXri;
3276 case AArch64::SUBXrr:
3277 return AArch64::SUBSXrr;
3278 case AArch64::SUBXrs:
3279 return AArch64::SUBSXrs;
3280 case AArch64::SUBXrx:
3281 return AArch64::SUBSXrx;
3282 // SVE instructions:
3283 case AArch64::AND_PPzPP:
3284 return AArch64::ANDS_PPzPP;
3285 case AArch64::BIC_PPzPP:
3286 return AArch64::BICS_PPzPP;
3287 case AArch64::EOR_PPzPP:
3288 return AArch64::EORS_PPzPP;
3289 case AArch64::NAND_PPzPP:
3290 return AArch64::NANDS_PPzPP;
3291 case AArch64::NOR_PPzPP:
3292 return AArch64::NORS_PPzPP;
3293 case AArch64::ORN_PPzPP:
3294 return AArch64::ORNS_PPzPP;
3295 case AArch64::ORR_PPzPP:
3296 return AArch64::ORRS_PPzPP;
3297 case AArch64::BRKA_PPzP:
3298 return AArch64::BRKAS_PPzP;
3299 case AArch64::BRKPA_PPzPP:
3300 return AArch64::BRKPAS_PPzPP;
3301 case AArch64::BRKB_PPzP:
3302 return AArch64::BRKBS_PPzP;
3303 case AArch64::BRKPB_PPzPP:
3304 return AArch64::BRKPBS_PPzPP;
3305 case AArch64::BRKN_PPzP:
3306 return AArch64::BRKNS_PPzP;
3307 case AArch64::RDFFR_PPz:
3308 return AArch64::RDFFRS_PPz;
3309 case AArch64::PTRUE_B:
3310 return AArch64::PTRUES_B;
3311 }
3312}
3313
3314// Is this a candidate for ld/st merging or pairing? For example, we don't
3315// touch volatiles or load/stores that have a hint to avoid pair formation.
3316bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
3317
3318 bool IsPreLdSt = isPreLdSt(MI);
3319
3320 // If this is a volatile load/store, don't mess with it.
3321 if (MI.hasOrderedMemoryRef())
3322 return false;
3323
3324 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
3325 // For Pre-inc LD/ST, the operand is shifted by one.
3326 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
3327 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
3328 "Expected a reg or frame index operand.");
3329
3330 // For Pre-indexed addressing quadword instructions, the third operand is the
3331 // immediate value.
3332 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
3333
3334 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
3335 return false;
3336
3337 // Can't merge/pair if the instruction modifies the base register.
3338 // e.g., ldr x0, [x0]
3339 // This case will never occur with an FI base.
3340 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
3341 // STR<S,D,Q,W,X>pre, it can be merged.
3342 // For example:
3343 // ldr q0, [x11, #32]!
3344 // ldr q1, [x11, #16]
3345 // to
3346 // ldp q0, q1, [x11, #32]!
3347 if (MI.getOperand(1).isReg() && !IsPreLdSt) {
3348 Register BaseReg = MI.getOperand(1).getReg();
3349 const TargetRegisterInfo *TRI = &getRegisterInfo();
3350 if (MI.modifiesRegister(BaseReg, TRI))
3351 return false;
3352 }
3353
3354 // Pairing SVE fills/spills is only valid for little-endian targets that
3355 // implement VLS 128.
3356 switch (MI.getOpcode()) {
3357 default:
3358 break;
3359 case AArch64::LDR_ZXI:
3360 case AArch64::STR_ZXI:
3361 if (!Subtarget.isLittleEndian() ||
3362 Subtarget.getSVEVectorSizeInBits() != 128)
3363 return false;
3364 }
3365
3366 // Check if this load/store has a hint to avoid pair formation.
3367 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
3368 if (isLdStPairSuppressed(MI))
3369 return false;
3370
3371 // Do not pair any callee-save store/reload instructions in the
3372 // prologue/epilogue if the CFI information encoded the operations as separate
3373 // instructions, as that will cause the size of the actual prologue to mismatch
3374 // with the prologue size recorded in the Windows CFI.
3375 const MCAsmInfo &MAI = MI.getMF()->getTarget().getMCAsmInfo();
3376 bool NeedsWinCFI =
3377 MAI.usesWindowsCFI() && MI.getMF()->getFunction().needsUnwindTableEntry();
3378 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
3379 MI.getFlag(MachineInstr::FrameDestroy)))
3380 return false;
3381
3382 // On some CPUs quad load/store pairs are slower than two single load/stores.
3383 if (Subtarget.isPaired128Slow()) {
3384 switch (MI.getOpcode()) {
3385 default:
3386 break;
3387 case AArch64::LDURQi:
3388 case AArch64::STURQi:
3389 case AArch64::LDRQui:
3390 case AArch64::STRQui:
3391 return false;
3392 }
3393 }
3394
3395 return true;
3396}
3397
3398bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
3399 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
3400 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
3401 const TargetRegisterInfo *TRI) const {
3402 if (!LdSt.mayLoadOrStore())
3403 return false;
3404
3405 const MachineOperand *BaseOp;
3406 TypeSize WidthN(0, false);
3407 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
3408 WidthN, TRI))
3409 return false;
3410 // The maximum vscale is 16 on AArch64; return the maximal extent for the
3411 // vector.
3412 Width = LocationSize::precise(WidthN);
3413 BaseOps.push_back(BaseOp);
3414 return true;
3415}
3416
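// Decompose the address of a load/store into an ExtAddrMode; e.g. for
// "ldr x0, [x1, #16]" this yields BaseReg = x1, Displacement = 16 and no
// scaled register.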
3417std::optional<ExtAddrMode>
3418AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
3419 const TargetRegisterInfo *TRI) const {
3420 const MachineOperand *Base; // Filled with the base operand of MI.
3421 int64_t Offset; // Filled with the offset of MI.
3422 bool OffsetIsScalable;
3423 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
3424 return std::nullopt;
3425
3426 if (!Base->isReg())
3427 return std::nullopt;
3428 ExtAddrMode AM;
3429 AM.BaseReg = Base->getReg();
3430 AM.Displacement = Offset;
3431 AM.ScaledReg = 0;
3432 AM.Scale = 0;
3433 return AM;
3434}
3435
3436bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
3437 Register Reg,
3438 const MachineInstr &AddrI,
3439 ExtAddrMode &AM) const {
3440 // Filter out instructions into which we cannot fold.
3441 unsigned NumBytes;
3442 int64_t OffsetScale = 1;
3443 switch (MemI.getOpcode()) {
3444 default:
3445 return false;
3446
3447 case AArch64::LDURQi:
3448 case AArch64::STURQi:
3449 NumBytes = 16;
3450 break;
3451
3452 case AArch64::LDURDi:
3453 case AArch64::STURDi:
3454 case AArch64::LDURXi:
3455 case AArch64::STURXi:
3456 NumBytes = 8;
3457 break;
3458
3459 case AArch64::LDURWi:
3460 case AArch64::LDURSWi:
3461 case AArch64::STURWi:
3462 NumBytes = 4;
3463 break;
3464
3465 case AArch64::LDURHi:
3466 case AArch64::STURHi:
3467 case AArch64::LDURHHi:
3468 case AArch64::STURHHi:
3469 case AArch64::LDURSHXi:
3470 case AArch64::LDURSHWi:
3471 NumBytes = 2;
3472 break;
3473
3474 case AArch64::LDRBroX:
3475 case AArch64::LDRBBroX:
3476 case AArch64::LDRSBXroX:
3477 case AArch64::LDRSBWroX:
3478 case AArch64::STRBroX:
3479 case AArch64::STRBBroX:
3480 case AArch64::LDURBi:
3481 case AArch64::LDURBBi:
3482 case AArch64::LDURSBXi:
3483 case AArch64::LDURSBWi:
3484 case AArch64::STURBi:
3485 case AArch64::STURBBi:
3486 case AArch64::LDRBui:
3487 case AArch64::LDRBBui:
3488 case AArch64::LDRSBXui:
3489 case AArch64::LDRSBWui:
3490 case AArch64::STRBui:
3491 case AArch64::STRBBui:
3492 NumBytes = 1;
3493 break;
3494
3495 case AArch64::LDRQroX:
3496 case AArch64::STRQroX:
3497 case AArch64::LDRQui:
3498 case AArch64::STRQui:
3499 NumBytes = 16;
3500 OffsetScale = 16;
3501 break;
3502
3503 case AArch64::LDRDroX:
3504 case AArch64::STRDroX:
3505 case AArch64::LDRXroX:
3506 case AArch64::STRXroX:
3507 case AArch64::LDRDui:
3508 case AArch64::STRDui:
3509 case AArch64::LDRXui:
3510 case AArch64::STRXui:
3511 NumBytes = 8;
3512 OffsetScale = 8;
3513 break;
3514
3515 case AArch64::LDRWroX:
3516 case AArch64::LDRSWroX:
3517 case AArch64::STRWroX:
3518 case AArch64::LDRWui:
3519 case AArch64::LDRSWui:
3520 case AArch64::STRWui:
3521 NumBytes = 4;
3522 OffsetScale = 4;
3523 break;
3524
3525 case AArch64::LDRHroX:
3526 case AArch64::STRHroX:
3527 case AArch64::LDRHHroX:
3528 case AArch64::STRHHroX:
3529 case AArch64::LDRSHXroX:
3530 case AArch64::LDRSHWroX:
3531 case AArch64::LDRHui:
3532 case AArch64::STRHui:
3533 case AArch64::LDRHHui:
3534 case AArch64::STRHHui:
3535 case AArch64::LDRSHXui:
3536 case AArch64::LDRSHWui:
3537 NumBytes = 2;
3538 OffsetScale = 2;
3539 break;
3540 }
3541
3542 // Check the fold operand is not the loaded/stored value.
3543 const MachineOperand &BaseRegOp = MemI.getOperand(0);
3544 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
3545 return false;
3546
3547 // Handle memory instructions with a [Reg, Reg] addressing mode.
3548 if (MemI.getOperand(2).isReg()) {
3549 // Bail if the addressing mode already includes extension of the offset
3550 // register.
3551 if (MemI.getOperand(3).getImm())
3552 return false;
3553
3554 // Check if we actually have a scaled offset.
3555 if (MemI.getOperand(4).getImm() == 0)
3556 OffsetScale = 1;
3557
3558 // If the address instruction is folded into the base register, then the
3559 // addressing mode must not have a scale. Then we can swap the base and the
3560 // scaled registers.
3561 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
3562 return false;
3563
3564 switch (AddrI.getOpcode()) {
3565 default:
3566 return false;
3567
3568 case AArch64::SBFMXri:
3569 // sxtw Xa, Wm
3570 // ldr Xd, [Xn, Xa, lsl #N]
3571 // ->
3572 // ldr Xd, [Xn, Wm, sxtw #N]
3573 if (AddrI.getOperand(2).getImm() != 0 ||
3574 AddrI.getOperand(3).getImm() != 31)
3575 return false;
3576
3577 AM.BaseReg = MemI.getOperand(1).getReg();
3578 if (AM.BaseReg == Reg)
3579 AM.BaseReg = MemI.getOperand(2).getReg();
3580 AM.ScaledReg = AddrI.getOperand(1).getReg();
3581 AM.Scale = OffsetScale;
3582 AM.Displacement = 0;
3583 AM.Form = ExtAddrMode::Formula::SExtScaledReg;
3584 return true;
3585
3586 case TargetOpcode::SUBREG_TO_REG: {
3587 // mov Wa, Wm
3588 // ldr Xd, [Xn, Xa, lsl #N]
3589 // ->
3590 // ldr Xd, [Xn, Wm, uxtw #N]
3591
3592 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
3593 if (AddrI.getOperand(2).getImm() != AArch64::sub_32)
3594 return false;
3595
3596 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
3597 Register OffsetReg = AddrI.getOperand(1).getReg();
3598 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
3599 return false;
3600
3601 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
3602 if (DefMI.getOpcode() != AArch64::ORRWrs ||
3603 DefMI.getOperand(1).getReg() != AArch64::WZR ||
3604 DefMI.getOperand(3).getImm() != 0)
3605 return false;
3606
3607 AM.BaseReg = MemI.getOperand(1).getReg();
3608 if (AM.BaseReg == Reg)
3609 AM.BaseReg = MemI.getOperand(2).getReg();
3610 AM.ScaledReg = DefMI.getOperand(2).getReg();
3611 AM.Scale = OffsetScale;
3612 AM.Displacement = 0;
3613 AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
3614 return true;
3615 }
3616 }
3617 }
3618
3619 // Handle memory instructions with a [Reg, #Imm] addressing mode.
3620
3621 // Check we are not breaking a potential conversion to an LDP.
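 // LDP/STP encode a signed 7-bit immediate scaled by the access size, so an
 // 8-byte pair can reach [-512, 504] in steps of 8 and a 16-byte pair
 // [-1024, 1008] in steps of 16, matching the ranges checked below.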
3622 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
3623 int64_t NewOffset) -> bool {
3624 int64_t MinOffset, MaxOffset;
3625 switch (NumBytes) {
3626 default:
3627 return true;
3628 case 4:
3629 MinOffset = -256;
3630 MaxOffset = 252;
3631 break;
3632 case 8:
3633 MinOffset = -512;
3634 MaxOffset = 504;
3635 break;
3636 case 16:
3637 MinOffset = -1024;
3638 MaxOffset = 1008;
3639 break;
3640 }
3641 return OldOffset < MinOffset || OldOffset > MaxOffset ||
3642 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
3643 };
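// These [MinOffset, MaxOffset] ranges are the LDP/STP signed imm7 field
// scaled by the access size, e.g. for 8-byte accesses [-64, 63] * 8 =
// [-512, 504].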
3644 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
3645 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
3646 int64_t NewOffset = OldOffset + Disp;
3647 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
3648 return false;
3649 // If the old offset would fit into an LDP, but the new offset wouldn't,
3650 // bail out.
3651 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
3652 return false;
3653 AM.BaseReg = AddrI.getOperand(1).getReg();
3654 AM.ScaledReg = 0;
3655 AM.Scale = 0;
3656 AM.Displacement = NewOffset;
3657 AM.Form = ExtAddrMode::Formula::Basic;
3658 return true;
3659 };
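// Worked example (illustrative):
//   add x8, x0, #16
//   ldr x1, [x8, #8]
// Here OffsetScale is 8, so OldOffset = 1 * 8 = 8, Disp = 16 and
// NewOffset = 24; both 8 and 24 fit the 8-byte LDP range [-512, 504], so
// the fold to `ldr x1, [x0, #24]` is accepted.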
3660
3661 auto canFoldAddRegIntoAddrMode =
3662 [&](int64_t Scale,
3663 ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
3664 if (MemI.getOperand(2).getImm() != 0)
3665 return false;
3666 if ((unsigned)Scale != Scale)
3667 return false;
3668 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
3669 return false;
3670 AM.BaseReg = AddrI.getOperand(1).getReg();
3671 AM.ScaledReg = AddrI.getOperand(2).getReg();
3672 AM.Scale = Scale;
3673 AM.Displacement = 0;
3674 AM.Form = Form;
3675 return true;
3676 };
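// For example (illustrative), with
//   add x8, x0, x1, lsl #3
//   ldr x2, [x8]
// the lambda is invoked with Scale = 8 and, if legal, records
// BaseReg = x0, ScaledReg = x1, Scale = 8, i.e. `ldr x2, [x0, x1, lsl #3]`.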
3677
3678 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
3679 unsigned Opcode = MemI.getOpcode();
3680 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
3681 Subtarget.isSTRQroSlow();
3682 };
3683
3684 int64_t Disp = 0;
3685 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
3686 switch (AddrI.getOpcode()) {
3687 default:
3688 return false;
3689
3690 case AArch64::ADDXri:
3691 // add Xa, Xn, #N
3692 // ldr Xd, [Xa, #M]
3693 // ->
3694 // ldr Xd, [Xn, #N'+M]
3695 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3696 return canFoldAddSubImmIntoAddrMode(Disp);
3697
3698 case AArch64::SUBXri:
3699 // sub Xa, Xn, #N
3700 // ldr Xd, [Xa, #M]
3701 // ->
3702 // ldr Xd, [Xn, #N'+M]
3703 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3704 return canFoldAddSubImmIntoAddrMode(-Disp);
3705
3706 case AArch64::ADDXrs: {
3707 // add Xa, Xn, Xm, lsl #N
3708 // ldr Xd, [Xa]
3709 // ->
3710 // ldr Xd, [Xn, Xm, lsl #N]
3711
3712 // Don't fold the add if the result would be slower, unless optimising for
3713 // size.
3714 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3715 if (AArch64_AM::getShiftType(Shift) != AArch64_AM::LSL)
3716 return false;
3717 Shift = AArch64_AM::getShiftValue(Shift);
3718 if (!OptSize) {
3719 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3720 return false;
3721 if (avoidSlowSTRQ(MemI))
3722 return false;
3723 }
3724 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3725 }
3726
3727 case AArch64::ADDXrr:
3728 // add Xa, Xn, Xm
3729 // ldr Xd, [Xa]
3730 // ->
3731 // ldr Xd, [Xn, Xm, lsl #0]
3732
3733 // Don't fold the add if the result would be slower, unless optimising for
3734 // size.
3735 if (!OptSize && avoidSlowSTRQ(MemI))
3736 return false;
3737 return canFoldAddRegIntoAddrMode(1);
3738
3739 case AArch64::ADDXrx:
3740 // add Xa, Xn, Wm, {s,u}xtw #N
3741 // ldr Xd, [Xa]
3742 // ->
3743 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3744
3745 // Don't fold the add if the result would be slower, unless optimising for
3746 // size.
3747 if (!OptSize && avoidSlowSTRQ(MemI))
3748 return false;
3749
3750 // Can fold only sign-/zero-extend of a word.
3751 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3752 AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
3753 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3754 return false;
3755
3756 return canFoldAddRegIntoAddrMode(
3757 1ULL << AArch64_AM::getArithShiftValue(Imm),
3758 (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
3759 : ExtAddrMode::Formula::ZExtScaledReg);
3760 }
3761}
3762
3763// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3764// return the opcode of an instruction performing the same operation, but using
3765// the [Reg, Reg] addressing mode.
3766static unsigned regOffsetOpcode(unsigned Opcode) {
3767 switch (Opcode) {
3768 default:
3769 llvm_unreachable("Address folding not implemented for instruction");
3770
3771 case AArch64::LDURQi:
3772 case AArch64::LDRQui:
3773 return AArch64::LDRQroX;
3774 case AArch64::STURQi:
3775 case AArch64::STRQui:
3776 return AArch64::STRQroX;
3777 case AArch64::LDURDi:
3778 case AArch64::LDRDui:
3779 return AArch64::LDRDroX;
3780 case AArch64::STURDi:
3781 case AArch64::STRDui:
3782 return AArch64::STRDroX;
3783 case AArch64::LDURXi:
3784 case AArch64::LDRXui:
3785 return AArch64::LDRXroX;
3786 case AArch64::STURXi:
3787 case AArch64::STRXui:
3788 return AArch64::STRXroX;
3789 case AArch64::LDURWi:
3790 case AArch64::LDRWui:
3791 return AArch64::LDRWroX;
3792 case AArch64::LDURSWi:
3793 case AArch64::LDRSWui:
3794 return AArch64::LDRSWroX;
3795 case AArch64::STURWi:
3796 case AArch64::STRWui:
3797 return AArch64::STRWroX;
3798 case AArch64::LDURHi:
3799 case AArch64::LDRHui:
3800 return AArch64::LDRHroX;
3801 case AArch64::STURHi:
3802 case AArch64::STRHui:
3803 return AArch64::STRHroX;
3804 case AArch64::LDURHHi:
3805 case AArch64::LDRHHui:
3806 return AArch64::LDRHHroX;
3807 case AArch64::STURHHi:
3808 case AArch64::STRHHui:
3809 return AArch64::STRHHroX;
3810 case AArch64::LDURSHXi:
3811 case AArch64::LDRSHXui:
3812 return AArch64::LDRSHXroX;
3813 case AArch64::LDURSHWi:
3814 case AArch64::LDRSHWui:
3815 return AArch64::LDRSHWroX;
3816 case AArch64::LDURBi:
3817 case AArch64::LDRBui:
3818 return AArch64::LDRBroX;
3819 case AArch64::LDURBBi:
3820 case AArch64::LDRBBui:
3821 return AArch64::LDRBBroX;
3822 case AArch64::LDURSBXi:
3823 case AArch64::LDRSBXui:
3824 return AArch64::LDRSBXroX;
3825 case AArch64::LDURSBWi:
3826 case AArch64::LDRSBWui:
3827 return AArch64::LDRSBWroX;
3828 case AArch64::STURBi:
3829 case AArch64::STRBui:
3830 return AArch64::STRBroX;
3831 case AArch64::STURBBi:
3832 case AArch64::STRBBui:
3833 return AArch64::STRBBroX;
3834 }
3835}
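// E.g. both the scaled LDRXui and the unscaled LDURXi [Reg, #Imm] forms map
// to LDRXroX, the [Reg, Reg, lsl #N] variant of the 64-bit load.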
3836
3837// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3838// the opcode of an instruction performing the same operation, but using the
3839// [Reg, #Imm] addressing mode with scaled offset.
3840unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3841 switch (Opcode) {
3842 default:
3843 llvm_unreachable("Address folding not implemented for instruction");
3844
3845 case AArch64::LDURQi:
3846 Scale = 16;
3847 return AArch64::LDRQui;
3848 case AArch64::STURQi:
3849 Scale = 16;
3850 return AArch64::STRQui;
3851 case AArch64::LDURDi:
3852 Scale = 8;
3853 return AArch64::LDRDui;
3854 case AArch64::STURDi:
3855 Scale = 8;
3856 return AArch64::STRDui;
3857 case AArch64::LDURXi:
3858 Scale = 8;
3859 return AArch64::LDRXui;
3860 case AArch64::STURXi:
3861 Scale = 8;
3862 return AArch64::STRXui;
3863 case AArch64::LDURWi:
3864 Scale = 4;
3865 return AArch64::LDRWui;
3866 case AArch64::LDURSWi:
3867 Scale = 4;
3868 return AArch64::LDRSWui;
3869 case AArch64::STURWi:
3870 Scale = 4;
3871 return AArch64::STRWui;
3872 case AArch64::LDURHi:
3873 Scale = 2;
3874 return AArch64::LDRHui;
3875 case AArch64::STURHi:
3876 Scale = 2;
3877 return AArch64::STRHui;
3878 case AArch64::LDURHHi:
3879 Scale = 2;
3880 return AArch64::LDRHHui;
3881 case AArch64::STURHHi:
3882 Scale = 2;
3883 return AArch64::STRHHui;
3884 case AArch64::LDURSHXi:
3885 Scale = 2;
3886 return AArch64::LDRSHXui;
3887 case AArch64::LDURSHWi:
3888 Scale = 2;
3889 return AArch64::LDRSHWui;
3890 case AArch64::LDURBi:
3891 Scale = 1;
3892 return AArch64::LDRBui;
3893 case AArch64::LDURBBi:
3894 Scale = 1;
3895 return AArch64::LDRBBui;
3896 case AArch64::LDURSBXi:
3897 Scale = 1;
3898 return AArch64::LDRSBXui;
3899 case AArch64::LDURSBWi:
3900 Scale = 1;
3901 return AArch64::LDRSBWui;
3902 case AArch64::STURBi:
3903 Scale = 1;
3904 return AArch64::STRBui;
3905 case AArch64::STURBBi:
3906 Scale = 1;
3907 return AArch64::STRBBui;
3908 case AArch64::LDRQui:
3909 case AArch64::STRQui:
3910 Scale = 16;
3911 return Opcode;
3912 case AArch64::LDRDui:
3913 case AArch64::STRDui:
3914 case AArch64::LDRXui:
3915 case AArch64::STRXui:
3916 Scale = 8;
3917 return Opcode;
3918 case AArch64::LDRWui:
3919 case AArch64::LDRSWui:
3920 case AArch64::STRWui:
3921 Scale = 4;
3922 return Opcode;
3923 case AArch64::LDRHui:
3924 case AArch64::STRHui:
3925 case AArch64::LDRHHui:
3926 case AArch64::STRHHui:
3927 case AArch64::LDRSHXui:
3928 case AArch64::LDRSHWui:
3929 Scale = 2;
3930 return Opcode;
3931 case AArch64::LDRBui:
3932 case AArch64::LDRBBui:
3933 case AArch64::LDRSBXui:
3934 case AArch64::LDRSBWui:
3935 case AArch64::STRBui:
3936 case AArch64::STRBBui:
3937 Scale = 1;
3938 return Opcode;
3939 }
3940}
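// E.g. LDURXi maps to LDRXui with Scale = 8; a byte displacement of 24 is
// then encoded as the scaled immediate 24 / 8 = 3 (see the division by
// Scale in emitLdStWithAddr below).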
3941
3942// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3943// the opcode of an instruction performing the same operation, but using the
3944// [Reg, #Imm] addressing mode with unscaled offset.
3945unsigned unscaledOffsetOpcode(unsigned Opcode) {
3946 switch (Opcode) {
3947 default:
3948 llvm_unreachable("Address folding not implemented for instruction");
3949
3950 case AArch64::LDURQi:
3951 case AArch64::STURQi:
3952 case AArch64::LDURDi:
3953 case AArch64::STURDi:
3954 case AArch64::LDURXi:
3955 case AArch64::STURXi:
3956 case AArch64::LDURWi:
3957 case AArch64::LDURSWi:
3958 case AArch64::STURWi:
3959 case AArch64::LDURHi:
3960 case AArch64::STURHi:
3961 case AArch64::LDURHHi:
3962 case AArch64::STURHHi:
3963 case AArch64::LDURSHXi:
3964 case AArch64::LDURSHWi:
3965 case AArch64::LDURBi:
3966 case AArch64::STURBi:
3967 case AArch64::LDURBBi:
3968 case AArch64::STURBBi:
3969 case AArch64::LDURSBWi:
3970 case AArch64::LDURSBXi:
3971 return Opcode;
3972 case AArch64::LDRQui:
3973 return AArch64::LDURQi;
3974 case AArch64::STRQui:
3975 return AArch64::STURQi;
3976 case AArch64::LDRDui:
3977 return AArch64::LDURDi;
3978 case AArch64::STRDui:
3979 return AArch64::STURDi;
3980 case AArch64::LDRXui:
3981 return AArch64::LDURXi;
3982 case AArch64::STRXui:
3983 return AArch64::STURXi;
3984 case AArch64::LDRWui:
3985 return AArch64::LDURWi;
3986 case AArch64::LDRSWui:
3987 return AArch64::LDURSWi;
3988 case AArch64::STRWui:
3989 return AArch64::STURWi;
3990 case AArch64::LDRHui:
3991 return AArch64::LDURHi;
3992 case AArch64::STRHui:
3993 return AArch64::STURHi;
3994 case AArch64::LDRHHui:
3995 return AArch64::LDURHHi;
3996 case AArch64::STRHHui:
3997 return AArch64::STURHHi;
3998 case AArch64::LDRSHXui:
3999 return AArch64::LDURSHXi;
4000 case AArch64::LDRSHWui:
4001 return AArch64::LDURSHWi;
4002 case AArch64::LDRBBui:
4003 return AArch64::LDURBBi;
4004 case AArch64::LDRBui:
4005 return AArch64::LDURBi;
4006 case AArch64::STRBBui:
4007 return AArch64::STURBBi;
4008 case AArch64::STRBui:
4009 return AArch64::STURBi;
4010 case AArch64::LDRSBWui:
4011 return AArch64::LDURSBWi;
4012 case AArch64::LDRSBXui:
4013 return AArch64::LDURSBXi;
4014 }
4015}
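// The LDUR/STUR forms take a signed 9-bit byte offset, which is why
// emitLdStWithAddr below picks them whenever isInt<9>(AM.Displacement)
// holds and falls back to the scaled forms otherwise.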
4016
4017// Given the opcode of a memory load/store instruction, return the opcode of an
4018// instruction performing the same operation, but using
4019// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
4020// offset register.
4021static unsigned offsetExtendOpcode(unsigned Opcode) {
4022 switch (Opcode) {
4023 default:
4024 llvm_unreachable("Address folding not implemented for instruction");
4025
4026 case AArch64::LDRQroX:
4027 case AArch64::LDURQi:
4028 case AArch64::LDRQui:
4029 return AArch64::LDRQroW;
4030 case AArch64::STRQroX:
4031 case AArch64::STURQi:
4032 case AArch64::STRQui:
4033 return AArch64::STRQroW;
4034 case AArch64::LDRDroX:
4035 case AArch64::LDURDi:
4036 case AArch64::LDRDui:
4037 return AArch64::LDRDroW;
4038 case AArch64::STRDroX:
4039 case AArch64::STURDi:
4040 case AArch64::STRDui:
4041 return AArch64::STRDroW;
4042 case AArch64::LDRXroX:
4043 case AArch64::LDURXi:
4044 case AArch64::LDRXui:
4045 return AArch64::LDRXroW;
4046 case AArch64::STRXroX:
4047 case AArch64::STURXi:
4048 case AArch64::STRXui:
4049 return AArch64::STRXroW;
4050 case AArch64::LDRWroX:
4051 case AArch64::LDURWi:
4052 case AArch64::LDRWui:
4053 return AArch64::LDRWroW;
4054 case AArch64::LDRSWroX:
4055 case AArch64::LDURSWi:
4056 case AArch64::LDRSWui:
4057 return AArch64::LDRSWroW;
4058 case AArch64::STRWroX:
4059 case AArch64::STURWi:
4060 case AArch64::STRWui:
4061 return AArch64::STRWroW;
4062 case AArch64::LDRHroX:
4063 case AArch64::LDURHi:
4064 case AArch64::LDRHui:
4065 return AArch64::LDRHroW;
4066 case AArch64::STRHroX:
4067 case AArch64::STURHi:
4068 case AArch64::STRHui:
4069 return AArch64::STRHroW;
4070 case AArch64::LDRHHroX:
4071 case AArch64::LDURHHi:
4072 case AArch64::LDRHHui:
4073 return AArch64::LDRHHroW;
4074 case AArch64::STRHHroX:
4075 case AArch64::STURHHi:
4076 case AArch64::STRHHui:
4077 return AArch64::STRHHroW;
4078 case AArch64::LDRSHXroX:
4079 case AArch64::LDURSHXi:
4080 case AArch64::LDRSHXui:
4081 return AArch64::LDRSHXroW;
4082 case AArch64::LDRSHWroX:
4083 case AArch64::LDURSHWi:
4084 case AArch64::LDRSHWui:
4085 return AArch64::LDRSHWroW;
4086 case AArch64::LDRBroX:
4087 case AArch64::LDURBi:
4088 case AArch64::LDRBui:
4089 return AArch64::LDRBroW;
4090 case AArch64::LDRBBroX:
4091 case AArch64::LDURBBi:
4092 case AArch64::LDRBBui:
4093 return AArch64::LDRBBroW;
4094 case AArch64::LDRSBXroX:
4095 case AArch64::LDURSBXi:
4096 case AArch64::LDRSBXui:
4097 return AArch64::LDRSBXroW;
4098 case AArch64::LDRSBWroX:
4099 case AArch64::LDURSBWi:
4100 case AArch64::LDRSBWui:
4101 return AArch64::LDRSBWroW;
4102 case AArch64::STRBroX:
4103 case AArch64::STURBi:
4104 case AArch64::STRBui:
4105 return AArch64::STRBroW;
4106 case AArch64::STRBBroX:
4107 case AArch64::STURBBi:
4108 case AArch64::STRBBui:
4109 return AArch64::STRBBroW;
4110 }
4111}
4112
4113 MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
4114 const ExtAddrMode &AM) const {
4115
4116 const DebugLoc &DL = MemI.getDebugLoc();
4117 MachineBasicBlock &MBB = *MemI.getParent();
4118 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
4119
4121 if (AM.ScaledReg) {
4122 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
4123 unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
4124 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4125 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
4126 .addReg(MemI.getOperand(0).getReg(),
4127 getDefRegState(MemI.mayLoad()))
4128 .addReg(AM.BaseReg)
4129 .addReg(AM.ScaledReg)
4130 .addImm(0)
4131 .addImm(AM.Scale > 1)
4132 .setMemRefs(MemI.memoperands())
4133 .setMIFlags(MemI.getFlags());
4134 return B.getInstr();
4135 }
4136
4137 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
4138 "Addressing mode not supported for folding");
4139
4140 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
4141 unsigned Scale = 1;
4142 unsigned Opcode = MemI.getOpcode();
4143 if (isInt<9>(AM.Displacement))
4144 Opcode = unscaledOffsetOpcode(Opcode);
4145 else
4146 Opcode = scaledOffsetOpcode(Opcode, Scale);
4147
4148 auto B =
4149 BuildMI(MBB, MemI, DL, get(Opcode))
4150 .addReg(MemI.getOperand(0).getReg(), getDefRegState(MemI.mayLoad()))
4151 .addReg(AM.BaseReg)
4152 .addImm(AM.Displacement / Scale)
4153 .setMemRefs(MemI.memoperands())
4154 .setMIFlags(MemI.getFlags());
4155 return B.getInstr();
4156 }
4157
4158 if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
4159 AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
4160 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
4161 assert(AM.ScaledReg && !AM.Displacement &&
4162 "Address offset can be a register or an immediate, but not both");
4163 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
4164 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4165 // Make sure the offset register is in the correct register class.
4166 Register OffsetReg = AM.ScaledReg;
4167 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
4168 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
4169 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
4170 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
4171 .addReg(AM.ScaledReg, {}, AArch64::sub_32);
4172 }
4173 auto B =
4174 BuildMI(MBB, MemI, DL, get(Opcode))
4175 .addReg(MemI.getOperand(0).getReg(), getDefRegState(MemI.mayLoad()))
4176 .addReg(AM.BaseReg)
4177 .addReg(OffsetReg)
4178 .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg)
4179 .addImm(AM.Scale != 1)
4180 .setMemRefs(MemI.memoperands())
4181 .setMIFlags(MemI.getFlags());
4182
4183 return B.getInstr();
4184 }
4185
4187 "Function must not be called with an addressing mode it can't handle");
4188}
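// End-to-end sketch (illustrative): folding `sxtw x9, w1` into
// `ldr x2, [x8, x9, lsl #3]` produces BaseReg = x8, ScaledReg = w1,
// Scale = 8 and Form = SExtScaledReg, and this function then emits
// `ldr x2, [x8, w1, sxtw #3]` via LDRXroW.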
4189
4190 /// Return true if the opcode is a post-index ld/st instruction, which really
4191 /// loads or stores from base+0.
4192static bool isPostIndexLdStOpcode(unsigned Opcode) {
4193 switch (Opcode) {
4194 default:
4195 return false;
4196 case AArch64::LD1Fourv16b_POST:
4197 case AArch64::LD1Fourv1d_POST:
4198 case AArch64::LD1Fourv2d_POST:
4199 case AArch64::LD1Fourv2s_POST:
4200 case AArch64::LD1Fourv4h_POST:
4201 case AArch64::LD1Fourv4s_POST:
4202 case AArch64::LD1Fourv8b_POST:
4203 case AArch64::LD1Fourv8h_POST:
4204 case AArch64::LD1Onev16b_POST:
4205 case AArch64::LD1Onev1d_POST:
4206 case AArch64::LD1Onev2d_POST:
4207 case AArch64::LD1Onev2s_POST:
4208 case AArch64::LD1Onev4h_POST:
4209 case AArch64::LD1Onev4s_POST:
4210 case AArch64::LD1Onev8b_POST:
4211 case AArch64::LD1Onev8h_POST:
4212 case AArch64::LD1Rv16b_POST:
4213 case AArch64::LD1Rv1d_POST:
4214 case AArch64::LD1Rv2d_POST:
4215 case AArch64::LD1Rv2s_POST:
4216 case AArch64::LD1Rv4h_POST:
4217 case AArch64::LD1Rv4s_POST:
4218 case AArch64::LD1Rv8b_POST:
4219 case AArch64::LD1Rv8h_POST:
4220 case AArch64::LD1Threev16b_POST:
4221 case AArch64::LD1Threev1d_POST:
4222 case AArch64::LD1Threev2d_POST:
4223 case AArch64::LD1Threev2s_POST:
4224 case AArch64::LD1Threev4h_POST:
4225 case AArch64::LD1Threev4s_POST:
4226 case AArch64::LD1Threev8b_POST:
4227 case AArch64::LD1Threev8h_POST:
4228 case AArch64::LD1Twov16b_POST:
4229 case AArch64::LD1Twov1d_POST:
4230 case AArch64::LD1Twov2d_POST:
4231 case AArch64::LD1Twov2s_POST:
4232 case AArch64::LD1Twov4h_POST:
4233 case AArch64::LD1Twov4s_POST:
4234 case AArch64::LD1Twov8b_POST:
4235 case AArch64::LD1Twov8h_POST:
4236 case AArch64::LD1i16_POST:
4237 case AArch64::LD1i32_POST:
4238 case AArch64::LD1i64_POST:
4239 case AArch64::LD1i8_POST:
4240 case AArch64::LD2Rv16b_POST:
4241 case AArch64::LD2Rv1d_POST:
4242 case AArch64::LD2Rv2d_POST:
4243 case AArch64::LD2Rv2s_POST:
4244 case AArch64::LD2Rv4h_POST:
4245 case AArch64::LD2Rv4s_POST:
4246 case AArch64::LD2Rv8b_POST:
4247 case AArch64::LD2Rv8h_POST:
4248 case AArch64::LD2Twov16b_POST:
4249 case AArch64::LD2Twov2d_POST:
4250 case AArch64::LD2Twov2s_POST:
4251 case AArch64::LD2Twov4h_POST:
4252 case AArch64::LD2Twov4s_POST:
4253 case AArch64::LD2Twov8b_POST:
4254 case AArch64::LD2Twov8h_POST:
4255 case AArch64::LD2i16_POST:
4256 case AArch64::LD2i32_POST:
4257 case AArch64::LD2i64_POST:
4258 case AArch64::LD2i8_POST:
4259 case AArch64::LD3Rv16b_POST:
4260 case AArch64::LD3Rv1d_POST:
4261 case AArch64::LD3Rv2d_POST:
4262 case AArch64::LD3Rv2s_POST:
4263 case AArch64::LD3Rv4h_POST:
4264 case AArch64::LD3Rv4s_POST:
4265 case AArch64::LD3Rv8b_POST:
4266 case AArch64::LD3Rv8h_POST:
4267 case AArch64::LD3Threev16b_POST:
4268 case AArch64::LD3Threev2d_POST:
4269 case AArch64::LD3Threev2s_POST:
4270 case AArch64::LD3Threev4h_POST:
4271 case AArch64::LD3Threev4s_POST:
4272 case AArch64::LD3Threev8b_POST:
4273 case AArch64::LD3Threev8h_POST:
4274 case AArch64::LD3i16_POST:
4275 case AArch64::LD3i32_POST:
4276 case AArch64::LD3i64_POST:
4277 case AArch64::LD3i8_POST:
4278 case AArch64::LD4Fourv16b_POST:
4279 case AArch64::LD4Fourv2d_POST:
4280 case AArch64::LD4Fourv2s_POST:
4281 case AArch64::LD4Fourv4h_POST:
4282 case AArch64::LD4Fourv4s_POST:
4283 case AArch64::LD4Fourv8b_POST:
4284 case AArch64::LD4Fourv8h_POST:
4285 case AArch64::LD4Rv16b_POST:
4286 case AArch64::LD4Rv1d_POST:
4287 case AArch64::LD4Rv2d_POST:
4288 case AArch64::LD4Rv2s_POST:
4289 case AArch64::LD4Rv4h_POST:
4290 case AArch64::LD4Rv4s_POST:
4291 case AArch64::LD4Rv8b_POST:
4292 case AArch64::LD4Rv8h_POST:
4293 case AArch64::LD4i16_POST:
4294 case AArch64::LD4i32_POST:
4295 case AArch64::LD4i64_POST:
4296 case AArch64::LD4i8_POST:
4297 case AArch64::LDAPRWpost:
4298 case AArch64::LDAPRXpost:
4299 case AArch64::LDIAPPWpost:
4300 case AArch64::LDIAPPXpost:
4301 case AArch64::LDPDpost:
4302 case AArch64::LDPQpost:
4303 case AArch64::LDPSWpost:
4304 case AArch64::LDPSpost:
4305 case AArch64::LDPWpost:
4306 case AArch64::LDPXpost:
4307 case AArch64::LDRBBpost:
4308 case AArch64::LDRBpost:
4309 case AArch64::LDRDpost:
4310 case AArch64::LDRHHpost:
4311 case AArch64::LDRHpost:
4312 case AArch64::LDRQpost:
4313 case AArch64::LDRSBWpost:
4314 case AArch64::LDRSBXpost:
4315 case AArch64::LDRSHWpost:
4316 case AArch64::LDRSHXpost:
4317 case AArch64::LDRSWpost:
4318 case AArch64::LDRSpost:
4319 case AArch64::LDRWpost:
4320 case AArch64::LDRXpost:
4321 case AArch64::ST1Fourv16b_POST:
4322 case AArch64::ST1Fourv1d_POST:
4323 case AArch64::ST1Fourv2d_POST:
4324 case AArch64::ST1Fourv2s_POST:
4325 case AArch64::ST1Fourv4h_POST:
4326 case AArch64::ST1Fourv4s_POST:
4327 case AArch64::ST1Fourv8b_POST:
4328 case AArch64::ST1Fourv8h_POST:
4329 case AArch64::ST1Onev16b_POST:
4330 case AArch64::ST1Onev1d_POST:
4331 case AArch64::ST1Onev2d_POST:
4332 case AArch64::ST1Onev2s_POST:
4333 case AArch64::ST1Onev4h_POST:
4334 case AArch64::ST1Onev4s_POST:
4335 case AArch64::ST1Onev8b_POST:
4336 case AArch64::ST1Onev8h_POST:
4337 case AArch64::ST1Threev16b_POST:
4338 case AArch64::ST1Threev1d_POST:
4339 case AArch64::ST1Threev2d_POST:
4340 case AArch64::ST1Threev2s_POST:
4341 case AArch64::ST1Threev4h_POST:
4342 case AArch64::ST1Threev4s_POST:
4343 case AArch64::ST1Threev8b_POST:
4344 case AArch64::ST1Threev8h_POST:
4345 case AArch64::ST1Twov16b_POST:
4346 case AArch64::ST1Twov1d_POST:
4347 case AArch64::ST1Twov2d_POST:
4348 case AArch64::ST1Twov2s_POST:
4349 case AArch64::ST1Twov4h_POST:
4350 case AArch64::ST1Twov4s_POST:
4351 case AArch64::ST1Twov8b_POST:
4352 case AArch64::ST1Twov8h_POST:
4353 case AArch64::ST1i16_POST:
4354 case AArch64::ST1i32_POST:
4355 case AArch64::ST1i64_POST:
4356 case AArch64::ST1i8_POST:
4357 case AArch64::ST2GPostIndex:
4358 case AArch64::ST2Twov16b_POST:
4359 case AArch64::ST2Twov2d_POST:
4360 case AArch64::ST2Twov2s_POST:
4361 case AArch64::ST2Twov4h_POST:
4362 case AArch64::ST2Twov4s_POST:
4363 case AArch64::ST2Twov8b_POST:
4364 case AArch64::ST2Twov8h_POST:
4365 case AArch64::ST2i16_POST:
4366 case AArch64::ST2i32_POST:
4367 case AArch64::ST2i64_POST:
4368 case AArch64::ST2i8_POST:
4369 case AArch64::ST3Threev16b_POST:
4370 case AArch64::ST3Threev2d_POST:
4371 case AArch64::ST3Threev2s_POST:
4372 case AArch64::ST3Threev4h_POST:
4373 case AArch64::ST3Threev4s_POST:
4374 case AArch64::ST3Threev8b_POST:
4375 case AArch64::ST3Threev8h_POST:
4376 case AArch64::ST3i16_POST:
4377 case AArch64::ST3i32_POST:
4378 case AArch64::ST3i64_POST:
4379 case AArch64::ST3i8_POST:
4380 case AArch64::ST4Fourv16b_POST:
4381 case AArch64::ST4Fourv2d_POST:
4382 case AArch64::ST4Fourv2s_POST:
4383 case AArch64::ST4Fourv4h_POST:
4384 case AArch64::ST4Fourv4s_POST:
4385 case AArch64::ST4Fourv8b_POST:
4386 case AArch64::ST4Fourv8h_POST:
4387 case AArch64::ST4i16_POST:
4388 case AArch64::ST4i32_POST:
4389 case AArch64::ST4i64_POST:
4390 case AArch64::ST4i8_POST:
4391 case AArch64::STGPostIndex:
4392 case AArch64::STGPpost:
4393 case AArch64::STPDpost:
4394 case AArch64::STPQpost:
4395 case AArch64::STPSpost:
4396 case AArch64::STPWpost:
4397 case AArch64::STPXpost:
4398 case AArch64::STRBBpost:
4399 case AArch64::STRBpost:
4400 case AArch64::STRDpost:
4401 case AArch64::STRHHpost:
4402 case AArch64::STRHpost:
4403 case AArch64::STRQpost:
4404 case AArch64::STRSpost:
4405 case AArch64::STRWpost:
4406 case AArch64::STRXpost:
4407 case AArch64::STZ2GPostIndex:
4408 case AArch64::STZGPostIndex:
4409 return true;
4410 }
4411}
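// E.g. `ldr x0, [x1], #8` (LDRXpost) reads memory at [x1, #0] and only then
// advances x1 by 8, so for address analysis its effective offset is 0.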
4412
4413 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
4414 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
4415 bool &OffsetIsScalable, TypeSize &Width,
4416 const TargetRegisterInfo *TRI) const {
4417 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4418 // Handle only loads/stores with base register followed by immediate offset.
4419 if (LdSt.getNumExplicitOperands() == 3) {
4420 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
4421 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
4422 !LdSt.getOperand(2).isImm())
4423 return false;
4424 } else if (LdSt.getNumExplicitOperands() == 4) {
4425 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
4426 if (!LdSt.getOperand(1).isReg() ||
4427 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
4428 !LdSt.getOperand(3).isImm())
4429 return false;
4430 } else
4431 return false;
4432
4433 // Get the scaling factor for the instruction and set the width for the
4434 // instruction.
4435 TypeSize Scale(0U, false);
4436 int64_t Dummy1, Dummy2;
4437
4438 // If this returns false, then it's an instruction we don't want to handle.
4439 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
4440 return false;
4441
4442 // Compute the offset. The offset is the immediate operand multiplied by the
4443 // scaling factor; unscaled instructions have a scaling factor of 1. Post-index
4444 // instructions are a special case and have an offset of 0.
4445 if (isPostIndexLdStOpcode(LdSt.getOpcode())) {
4446 BaseOp = &LdSt.getOperand(2);
4447 Offset = 0;
4448 } else if (LdSt.getNumExplicitOperands() == 3) {
4449 BaseOp = &LdSt.getOperand(1);
4450 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
4451 } else {
4452 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
4453 BaseOp = &LdSt.getOperand(2);
4454 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
4455 }
4456 OffsetIsScalable = Scale.isScalable();
4457
4458 return BaseOp->isReg() || BaseOp->isFI();
4459}
4460
4461 MachineOperand &
4462 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
4463 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4464 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
4465 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
4466 return OfsOp;
4467}
4468
4469bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
4470 TypeSize &Width, int64_t &MinOffset,
4471 int64_t &MaxOffset) {
4472 switch (Opcode) {
4473 // Not a memory operation or something we want to handle.
4474 default:
4475 Scale = TypeSize::getFixed(0);
4476 Width = TypeSize::getFixed(0);
4477 MinOffset = MaxOffset = 0;
4478 return false;
4479 // LDR / STR
4480 case AArch64::LDRQui:
4481 case AArch64::STRQui:
4482 Scale = TypeSize::getFixed(16);
4483 Width = TypeSize::getFixed(16);
4484 MinOffset = 0;
4485 MaxOffset = 4095;
4486 break;
4487 case AArch64::LDRXui:
4488 case AArch64::LDRDui:
4489 case AArch64::STRXui:
4490 case AArch64::STRDui:
4491 case AArch64::PRFMui:
4492 Scale = TypeSize::getFixed(8);
4493 Width = TypeSize::getFixed(8);
4494 MinOffset = 0;
4495 MaxOffset = 4095;
4496 break;
4497 case AArch64::LDRWui:
4498 case AArch64::LDRSui:
4499 case AArch64::LDRSWui:
4500 case AArch64::STRWui:
4501 case AArch64::STRSui:
4502 Scale = TypeSize::getFixed(4);
4503 Width = TypeSize::getFixed(4);
4504 MinOffset = 0;
4505 MaxOffset = 4095;
4506 break;
4507 case AArch64::LDRHui:
4508 case AArch64::LDRHHui:
4509 case AArch64::LDRSHWui:
4510 case AArch64::LDRSHXui:
4511 case AArch64::STRHui:
4512 case AArch64::STRHHui:
4513 Scale = TypeSize::getFixed(2);
4514 Width = TypeSize::getFixed(2);
4515 MinOffset = 0;
4516 MaxOffset = 4095;
4517 break;
4518 case AArch64::LDRBui:
4519 case AArch64::LDRBBui:
4520 case AArch64::LDRSBWui:
4521 case AArch64::LDRSBXui:
4522 case AArch64::STRBui:
4523 case AArch64::STRBBui:
4524 Scale = TypeSize::getFixed(1);
4525 Width = TypeSize::getFixed(1);
4526 MinOffset = 0;
4527 MaxOffset = 4095;
4528 break;
4529 // post/pre inc
4530 case AArch64::STRQpre:
4531 case AArch64::LDRQpost:
4532 Scale = TypeSize::getFixed(1);
4533 Width = TypeSize::getFixed(16);
4534 MinOffset = -256;
4535 MaxOffset = 255;
4536 break;
4537 case AArch64::LDRDpost:
4538 case AArch64::LDRDpre:
4539 case AArch64::LDRXpost:
4540 case AArch64::LDRXpre:
4541 case AArch64::STRDpost:
4542 case AArch64::STRDpre:
4543 case AArch64::STRXpost:
4544 case AArch64::STRXpre:
4545 Scale = TypeSize::getFixed(1);
4546 Width = TypeSize::getFixed(8);
4547 MinOffset = -256;
4548 MaxOffset = 255;
4549 break;
4550 case AArch64::STRWpost:
4551 case AArch64::STRWpre:
4552 case AArch64::LDRWpost:
4553 case AArch64::LDRWpre:
4554 case AArch64::STRSpost:
4555 case AArch64::STRSpre:
4556 case AArch64::LDRSpost:
4557 case AArch64::LDRSpre:
4558 Scale = TypeSize::getFixed(1);
4559 Width = TypeSize::getFixed(4);
4560 MinOffset = -256;
4561 MaxOffset = 255;
4562 break;
4563 case AArch64::LDRHpost:
4564 case AArch64::LDRHpre:
4565 case AArch64::STRHpost:
4566 case AArch64::STRHpre:
4567 case AArch64::LDRHHpost:
4568 case AArch64::LDRHHpre:
4569 case AArch64::STRHHpost:
4570 case AArch64::STRHHpre:
4571 Scale = TypeSize::getFixed(1);
4572 Width = TypeSize::getFixed(2);
4573 MinOffset = -256;
4574 MaxOffset = 255;
4575 break;
4576 case AArch64::LDRBpost:
4577 case AArch64::LDRBpre:
4578 case AArch64::STRBpost:
4579 case AArch64::STRBpre:
4580 case AArch64::LDRBBpost:
4581 case AArch64::LDRBBpre:
4582 case AArch64::STRBBpost:
4583 case AArch64::STRBBpre:
4584 Scale = TypeSize::getFixed(1);
4585 Width = TypeSize::getFixed(1);
4586 MinOffset = -256;
4587 MaxOffset = 255;
4588 break;
4589 // Unscaled
4590 case AArch64::LDURQi:
4591 case AArch64::STURQi:
4592 Scale = TypeSize::getFixed(1);
4593 Width = TypeSize::getFixed(16);
4594 MinOffset = -256;
4595 MaxOffset = 255;
4596 break;
4597 case AArch64::LDURXi:
4598 case AArch64::LDURDi:
4599 case AArch64::LDAPURXi:
4600 case AArch64::STURXi:
4601 case AArch64::STURDi:
4602 case AArch64::STLURXi:
4603 case AArch64::PRFUMi:
4604 Scale = TypeSize::getFixed(1);
4605 Width = TypeSize::getFixed(8);
4606 MinOffset = -256;
4607 MaxOffset = 255;
4608 break;
4609 case AArch64::LDURWi:
4610 case AArch64::LDURSi:
4611 case AArch64::LDURSWi:
4612 case AArch64::LDAPURi:
4613 case AArch64::LDAPURSWi:
4614 case AArch64::STURWi:
4615 case AArch64::STURSi:
4616 case AArch64::STLURWi:
4617 Scale = TypeSize::getFixed(1);
4618 Width = TypeSize::getFixed(4);
4619 MinOffset = -256;
4620 MaxOffset = 255;
4621 break;
4622 case AArch64::LDURHi:
4623 case AArch64::LDURHHi:
4624 case AArch64::LDURSHXi:
4625 case AArch64::LDURSHWi:
4626 case AArch64::LDAPURHi:
4627 case AArch64::LDAPURSHWi:
4628 case AArch64::LDAPURSHXi:
4629 case AArch64::STURHi:
4630 case AArch64::STURHHi:
4631 case AArch64::STLURHi:
4632 Scale = TypeSize::getFixed(1);
4633 Width = TypeSize::getFixed(2);
4634 MinOffset = -256;
4635 MaxOffset = 255;
4636 break;
4637 case AArch64::LDURBi:
4638 case AArch64::LDURBBi:
4639 case AArch64::LDURSBXi:
4640 case AArch64::LDURSBWi:
4641 case AArch64::LDAPURBi:
4642 case AArch64::LDAPURSBWi:
4643 case AArch64::LDAPURSBXi:
4644 case AArch64::STURBi:
4645 case AArch64::STURBBi:
4646 case AArch64::STLURBi:
4647 Scale = TypeSize::getFixed(1);
4648 Width = TypeSize::getFixed(1);
4649 MinOffset = -256;
4650 MaxOffset = 255;
4651 break;
4652 // LDP / STP (including pre/post inc)
4653 case AArch64::LDPQi:
4654 case AArch64::LDNPQi:
4655 case AArch64::STPQi:
4656 case AArch64::STNPQi:
4657 case AArch64::LDPQpost:
4658 case AArch64::LDPQpre:
4659 case AArch64::STPQpost:
4660 case AArch64::STPQpre:
4661 Scale = TypeSize::getFixed(16);
4662 Width = TypeSize::getFixed(16 * 2);
4663 MinOffset = -64;
4664 MaxOffset = 63;
4665 break;
4666 case AArch64::LDPXi:
4667 case AArch64::LDPDi:
4668 case AArch64::LDNPXi:
4669 case AArch64::LDNPDi:
4670 case AArch64::STPXi:
4671 case AArch64::STPDi:
4672 case AArch64::STNPXi:
4673 case AArch64::STNPDi:
4674 case AArch64::LDPDpost:
4675 case AArch64::LDPDpre:
4676 case AArch64::LDPXpost:
4677 case AArch64::LDPXpre:
4678 case AArch64::STPDpost:
4679 case AArch64::STPDpre:
4680 case AArch64::STPXpost:
4681 case AArch64::STPXpre:
4682 Scale = TypeSize::getFixed(8);
4683 Width = TypeSize::getFixed(8 * 2);
4684 MinOffset = -64;
4685 MaxOffset = 63;
4686 break;
4687 case AArch64::LDPWi:
4688 case AArch64::LDPSi:
4689 case AArch64::LDNPWi:
4690 case AArch64::LDNPSi:
4691 case AArch64::STPWi:
4692 case AArch64::STPSi:
4693 case AArch64::STNPWi:
4694 case AArch64::STNPSi:
4695 case AArch64::LDPSpost:
4696 case AArch64::LDPSpre:
4697 case AArch64::LDPWpost:
4698 case AArch64::LDPWpre:
4699 case AArch64::STPSpost:
4700 case AArch64::STPSpre:
4701 case AArch64::STPWpost:
4702 case AArch64::STPWpre:
4703 Scale = TypeSize::getFixed(4);
4704 Width = TypeSize::getFixed(4 * 2);
4705 MinOffset = -64;
4706 MaxOffset = 63;
4707 break;
4708 case AArch64::StoreSwiftAsyncContext:
4709 // Store is an STRXui, but there might be an ADDXri in the expansion too.
4710 Scale = TypeSize::getFixed(1);
4711 Width = TypeSize::getFixed(8);
4712 MinOffset = 0;
4713 MaxOffset = 4095;
4714 break;
4715 case AArch64::ADDG:
4716 Scale = TypeSize::getFixed(16);
4717 Width = TypeSize::getFixed(0);
4718 MinOffset = 0;
4719 MaxOffset = 63;
4720 break;
4721 case AArch64::TAGPstack:
4722 Scale = TypeSize::getFixed(16);
4723 Width = TypeSize::getFixed(0);
4724 // TAGP with a negative offset turns into SUBP, which has a maximum offset
4725 // of 63 (not 64!).
4726 MinOffset = -63;
4727 MaxOffset = 63;
4728 break;
4729 case AArch64::LDG:
4730 case AArch64::STGi:
4731 case AArch64::STGPreIndex:
4732 case AArch64::STGPostIndex:
4733 case AArch64::STZGi:
4734 case AArch64::STZGPreIndex:
4735 case AArch64::STZGPostIndex:
4736 Scale = TypeSize::getFixed(16);
4737 Width = TypeSize::getFixed(16);
4738 MinOffset = -256;
4739 MaxOffset = 255;
4740 break;
4741 // SVE
4742 case AArch64::STR_ZZZZXI:
4743 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
4744 case AArch64::LDR_ZZZZXI:
4745 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
4746 Scale = TypeSize::getScalable(16);
4747 Width = TypeSize::getScalable(16 * 4);
4748 MinOffset = -256;
4749 MaxOffset = 252;
4750 break;
4751 case AArch64::STR_ZZZXI:
4752 case AArch64::LDR_ZZZXI:
4753 Scale = TypeSize::getScalable(16);
4754 Width = TypeSize::getScalable(16 * 3);
4755 MinOffset = -256;
4756 MaxOffset = 253;
4757 break;
4758 case AArch64::STR_ZZXI:
4759 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
4760 case AArch64::LDR_ZZXI:
4761 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
4762 Scale = TypeSize::getScalable(16);
4763 Width = TypeSize::getScalable(16 * 2);
4764 MinOffset = -256;
4765 MaxOffset = 254;
4766 break;
4767 case AArch64::LDR_PXI:
4768 case AArch64::STR_PXI:
4769 Scale = TypeSize::getScalable(2);
4770 Width = TypeSize::getScalable(2);
4771 MinOffset = -256;
4772 MaxOffset = 255;
4773 break;
4774 case AArch64::LDR_PPXI:
4775 case AArch64::STR_PPXI:
4776 Scale = TypeSize::getScalable(2);
4777 Width = TypeSize::getScalable(2 * 2);
4778 MinOffset = -256;
4779 MaxOffset = 254;
4780 break;
4781 case AArch64::LDR_ZXI:
4782 case AArch64::STR_ZXI:
4783 Scale = TypeSize::getScalable(16);
4784 Width = TypeSize::getScalable(16);
4785 MinOffset = -256;
4786 MaxOffset = 255;
4787 break;
4788 case AArch64::LD1B_IMM:
4789 case AArch64::LD1H_IMM:
4790 case AArch64::LD1W_IMM:
4791 case AArch64::LD1D_IMM:
4792 case AArch64::LDNT1B_ZRI:
4793 case AArch64::LDNT1H_ZRI:
4794 case AArch64::LDNT1W_ZRI:
4795 case AArch64::LDNT1D_ZRI:
4796 case AArch64::ST1B_IMM:
4797 case AArch64::ST1H_IMM:
4798 case AArch64::ST1W_IMM:
4799 case AArch64::ST1D_IMM:
4800 case AArch64::STNT1B_ZRI:
4801 case AArch64::STNT1H_ZRI:
4802 case AArch64::STNT1W_ZRI:
4803 case AArch64::STNT1D_ZRI:
4804 case AArch64::LDNF1B_IMM:
4805 case AArch64::LDNF1H_IMM:
4806 case AArch64::LDNF1W_IMM:
4807 case AArch64::LDNF1D_IMM:
4808 // A full vector's worth of data
4809 // Width = mbytes * elements
4810 Scale = TypeSize::getScalable(16);
4811 Width = TypeSize::getScalable(16);
4812 MinOffset = -8;
4813 MaxOffset = 7;
4814 break;
4815 case AArch64::LD2B_IMM:
4816 case AArch64::LD2H_IMM:
4817 case AArch64::LD2W_IMM:
4818 case AArch64::LD2D_IMM:
4819 case AArch64::ST2B_IMM:
4820 case AArch64::ST2H_IMM:
4821 case AArch64::ST2W_IMM:
4822 case AArch64::ST2D_IMM:
4823 Scale = TypeSize::getScalable(32);
4824 Width = TypeSize::getScalable(16 * 2);
4825 MinOffset = -8;
4826 MaxOffset = 7;
4827 break;
4828 case AArch64::LD3B_IMM:
4829 case AArch64::LD3H_IMM:
4830 case AArch64::LD3W_IMM:
4831 case AArch64::LD3D_IMM:
4832 case AArch64::ST3B_IMM:
4833 case AArch64::ST3H_IMM:
4834 case AArch64::ST3W_IMM:
4835 case AArch64::ST3D_IMM:
4836 Scale = TypeSize::getScalable(48);
4837 Width = TypeSize::getScalable(16 * 3);
4838 MinOffset = -8;
4839 MaxOffset = 7;
4840 break;
4841 case AArch64::LD4B_IMM:
4842 case AArch64::LD4H_IMM:
4843 case AArch64::LD4W_IMM:
4844 case AArch64::LD4D_IMM:
4845 case AArch64::ST4B_IMM:
4846 case AArch64::ST4H_IMM:
4847 case AArch64::ST4W_IMM:
4848 case AArch64::ST4D_IMM:
4849 Scale = TypeSize::getScalable(64);
4850 Width = TypeSize::getScalable(16 * 4);
4851 MinOffset = -8;
4852 MaxOffset = 7;
4853 break;
4854 case AArch64::LD1B_H_IMM:
4855 case AArch64::LD1SB_H_IMM:
4856 case AArch64::LD1H_S_IMM:
4857 case AArch64::LD1SH_S_IMM:
4858 case AArch64::LD1W_D_IMM:
4859 case AArch64::LD1SW_D_IMM:
4860 case AArch64::ST1B_H_IMM:
4861 case AArch64::ST1H_S_IMM:
4862 case AArch64::ST1W_D_IMM:
4863 case AArch64::LDNF1B_H_IMM:
4864 case AArch64::LDNF1SB_H_IMM:
4865 case AArch64::LDNF1H_S_IMM:
4866 case AArch64::LDNF1SH_S_IMM:
4867 case AArch64::LDNF1W_D_IMM:
4868 case AArch64::LDNF1SW_D_IMM:
4869 // A half vector's worth of data
4870 // Width = mbytes * elements
4871 Scale = TypeSize::getScalable(8);
4872 Width = TypeSize::getScalable(8);
4873 MinOffset = -8;
4874 MaxOffset = 7;
4875 break;
4876 case AArch64::LD1B_S_IMM:
4877 case AArch64::LD1SB_S_IMM:
4878 case AArch64::LD1H_D_IMM:
4879 case AArch64::LD1SH_D_IMM:
4880 case AArch64::ST1B_S_IMM:
4881 case AArch64::ST1H_D_IMM:
4882 case AArch64::LDNF1B_S_IMM:
4883 case AArch64::LDNF1SB_S_IMM:
4884 case AArch64::LDNF1H_D_IMM:
4885 case AArch64::LDNF1SH_D_IMM:
4886 // A quarter vector's worth of data
4887 // Width = mbytes * elements
4888 Scale = TypeSize::getScalable(4);
4889 Width = TypeSize::getScalable(4);
4890 MinOffset = -8;
4891 MaxOffset = 7;
4892 break;
4893 case AArch64::LD1B_D_IMM:
4894 case AArch64::LD1SB_D_IMM:
4895 case AArch64::ST1B_D_IMM:
4896 case AArch64::LDNF1B_D_IMM:
4897 case AArch64::LDNF1SB_D_IMM:
4898 // An eighth vector's worth of data
4899 // Width = mbytes * elements
4900 Scale = TypeSize::getScalable(2);
4901 Width = TypeSize::getScalable(2);
4902 MinOffset = -8;
4903 MaxOffset = 7;
4904 break;
4905 case AArch64::ST2Gi:
4906 case AArch64::ST2GPreIndex:
4907 case AArch64::ST2GPostIndex:
4908 case AArch64::STZ2Gi:
4909 case AArch64::STZ2GPreIndex:
4910 case AArch64::STZ2GPostIndex:
4911 Scale = TypeSize::getFixed(16);
4912 Width = TypeSize::getFixed(32);
4913 MinOffset = -256;
4914 MaxOffset = 255;
4915 break;
4916 case AArch64::STGPi:
4917 case AArch64::STGPpost:
4918 case AArch64::STGPpre:
4919 Scale = TypeSize::getFixed(16);
4920 Width = TypeSize::getFixed(16);
4921 MinOffset = -64;
4922 MaxOffset = 63;
4923 break;
4924 case AArch64::LD1RB_IMM:
4925 case AArch64::LD1RB_H_IMM:
4926 case AArch64::LD1RB_S_IMM:
4927 case AArch64::LD1RB_D_IMM:
4928 case AArch64::LD1RSB_H_IMM:
4929 case AArch64::LD1RSB_S_IMM:
4930 case AArch64::LD1RSB_D_IMM:
4931 Scale = TypeSize::getFixed(1);
4932 Width = TypeSize::getFixed(1);
4933 MinOffset = 0;
4934 MaxOffset = 63;
4935 break;
4936 case AArch64::LD1RH_IMM:
4937 case AArch64::LD1RH_S_IMM:
4938 case AArch64::LD1RH_D_IMM:
4939 case AArch64::LD1RSH_S_IMM:
4940 case AArch64::LD1RSH_D_IMM:
4941 Scale = TypeSize::getFixed(2);
4942 Width = TypeSize::getFixed(2);
4943 MinOffset = 0;
4944 MaxOffset = 63;
4945 break;
4946 case AArch64::LD1RW_IMM:
4947 case AArch64::LD1RW_D_IMM:
4948 case AArch64::LD1RSW_IMM:
4949 Scale = TypeSize::getFixed(4);
4950 Width = TypeSize::getFixed(4);
4951 MinOffset = 0;
4952 MaxOffset = 63;
4953 break;
4954 case AArch64::LD1RD_IMM:
4955 Scale = TypeSize::getFixed(8);
4956 Width = TypeSize::getFixed(8);
4957 MinOffset = 0;
4958 MaxOffset = 63;
4959 break;
4960 }
4961
4962 return true;
4963}
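// Reading the table (illustrative): LDRXui reports Scale = Width = 8 with an
// immediate range of [0, 4095], i.e. legal byte offsets 0..32760 in
// multiples of 8, while the unscaled LDURXi covers bytes [-256, 255].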
4964
4965// Scaling factor for unscaled load or store.
4966 int AArch64InstrInfo::getMemScale(unsigned Opc) {
4967 switch (Opc) {
4968 default:
4969 llvm_unreachable("Opcode has unknown scale!");
4970 case AArch64::LDRBui:
4971 case AArch64::LDRBBui:
4972 case AArch64::LDURBBi:
4973 case AArch64::LDRSBWui:
4974 case AArch64::LDURSBWi:
4975 case AArch64::STRBui:
4976 case AArch64::STRBBui:
4977 case AArch64::STURBBi:
4978 return 1;
4979 case AArch64::LDRHui:
4980 case AArch64::LDRHHui:
4981 case AArch64::LDURHHi:
4982 case AArch64::LDRSHWui:
4983 case AArch64::LDURSHWi:
4984 case AArch64::STRHui:
4985 case AArch64::STRHHui:
4986 case AArch64::STURHHi:
4987 return 2;
4988 case AArch64::LDRSui:
4989 case AArch64::LDURSi:
4990 case AArch64::LDRSpre:
4991 case AArch64::LDRSWui:
4992 case AArch64::LDURSWi:
4993 case AArch64::LDRSWpre:
4994 case AArch64::LDRWpre:
4995 case AArch64::LDRWui:
4996 case AArch64::LDURWi:
4997 case AArch64::STRSui:
4998 case AArch64::STURSi:
4999 case AArch64::STRSpre:
5000 case AArch64::STRWui:
5001 case AArch64::STURWi:
5002 case AArch64::STRWpre:
5003 case AArch64::LDPSi:
5004 case AArch64::LDPSWi:
5005 case AArch64::LDPWi:
5006 case AArch64::STPSi:
5007 case AArch64::STPWi:
5008 return 4;
5009 case AArch64::LDRDui:
5010 case AArch64::LDURDi:
5011 case AArch64::LDRDpre:
5012 case AArch64::LDRXui:
5013 case AArch64::LDURXi:
5014 case AArch64::LDRXpre:
5015 case AArch64::STRDui:
5016 case AArch64::STURDi:
5017 case AArch64::STRDpre:
5018 case AArch64::STRXui:
5019 case AArch64::STURXi:
5020 case AArch64::STRXpre:
5021 case AArch64::LDPDi:
5022 case AArch64::LDPXi:
5023 case AArch64::STPDi:
5024 case AArch64::STPXi:
5025 return 8;
5026 case AArch64::LDRQui:
5027 case AArch64::LDURQi:
5028 case AArch64::STRQui:
5029 case AArch64::STURQi:
5030 case AArch64::STRQpre:
5031 case AArch64::LDPQi:
5032 case AArch64::LDRQpre:
5033 case AArch64::STPQi:
5034 case AArch64::STGi:
5035 case AArch64::STZGi:
5036 case AArch64::ST2Gi:
5037 case AArch64::STZ2Gi:
5038 case AArch64::STGPi:
5039 return 16;
5040 }
5041}
5042
5043 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
5044 switch (MI.getOpcode()) {
5045 default:
5046 return false;
5047 case AArch64::LDRWpre:
5048 case AArch64::LDRXpre:
5049 case AArch64::LDRSWpre:
5050 case AArch64::LDRSpre:
5051 case AArch64::LDRDpre:
5052 case AArch64::LDRQpre:
5053 return true;
5054 }
5055}
5056
5057 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
5058 switch (MI.getOpcode()) {
5059 default:
5060 return false;
5061 case AArch64::STRWpre:
5062 case AArch64::STRXpre:
5063 case AArch64::STRSpre:
5064 case AArch64::STRDpre:
5065 case AArch64::STRQpre:
5066 return true;
5067 }
5068}
5069
5070 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
5071 return isPreLd(MI) || isPreSt(MI);
5072}
5073
5075 switch (MI.getOpcode()) {
5076 default:
5077 return false;
5078 case AArch64::LDURBBi:
5079 case AArch64::LDURHHi:
5080 case AArch64::LDURWi:
5081 case AArch64::LDRBBui:
5082 case AArch64::LDRHHui:
5083 case AArch64::LDRWui:
5084 case AArch64::LDRBBroX:
5085 case AArch64::LDRHHroX:
5086 case AArch64::LDRWroX:
5087 case AArch64::LDRBBroW:
5088 case AArch64::LDRHHroW:
5089 case AArch64::LDRWroW:
5090 return true;
5091 }
5092}
5093
5095 switch (MI.getOpcode()) {
5096 default:
5097 return false;
5098 case AArch64::LDURSBWi:
5099 case AArch64::LDURSHWi:
5100 case AArch64::LDURSBXi:
5101 case AArch64::LDURSHXi:
5102 case AArch64::LDURSWi:
5103 case AArch64::LDRSBWui:
5104 case AArch64::LDRSHWui:
5105 case AArch64::LDRSBXui:
5106 case AArch64::LDRSHXui:
5107 case AArch64::LDRSWui:
5108 case AArch64::LDRSBWroX:
5109 case AArch64::LDRSHWroX:
5110 case AArch64::LDRSBXroX:
5111 case AArch64::LDRSHXroX:
5112 case AArch64::LDRSWroX:
5113 case AArch64::LDRSBWroW:
5114 case AArch64::LDRSHWroW:
5115 case AArch64::LDRSBXroW:
5116 case AArch64::LDRSHXroW:
5117 case AArch64::LDRSWroW:
5118 return true;
5119 }
5120}
5121
5122 bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
5123 switch (MI.getOpcode()) {
5124 default:
5125 return false;
5126 case AArch64::LDPSi:
5127 case AArch64::LDPSWi:
5128 case AArch64::LDPDi:
5129 case AArch64::LDPQi:
5130 case AArch64::LDPWi:
5131 case AArch64::LDPXi:
5132 case AArch64::STPSi:
5133 case AArch64::STPDi:
5134 case AArch64::STPQi:
5135 case AArch64::STPWi:
5136 case AArch64::STPXi:
5137 case AArch64::STGPi:
5138 return true;
5139 }
5140}
5141
5141 const MachineOperand &
5142 AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
5143 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5144 unsigned Idx =
5145 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
5146 : 1;
5147 return MI.getOperand(Idx);
5148}
5149
5150const MachineOperand &
5151 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
5152 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5153 unsigned Idx =
5154 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
5155 : 2;
5156 return MI.getOperand(Idx);
5157}
5158
5159const MachineOperand &
5160 AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
5161 switch (MI.getOpcode()) {
5162 default:
5163 llvm_unreachable("Unexpected opcode");
5164 case AArch64::LDRBroX:
5165 case AArch64::LDRBBroX:
5166 case AArch64::LDRSBXroX:
5167 case AArch64::LDRSBWroX:
5168 case AArch64::LDRHroX:
5169 case AArch64::LDRHHroX:
5170 case AArch64::LDRSHXroX:
5171 case AArch64::LDRSHWroX:
5172 case AArch64::LDRWroX:
5173 case AArch64::LDRSroX:
5174 case AArch64::LDRSWroX:
5175 case AArch64::LDRDroX:
5176 case AArch64::LDRXroX:
5177 case AArch64::LDRQroX:
5178 return MI.getOperand(4);
5179 }
5180}
5181
5182 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
5183 Register Reg) {
5184 if (MI.getParent() == nullptr)
5185 return nullptr;
5186 const MachineFunction *MF = MI.getParent()->getParent();
5187 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
5188}
5189
5190 bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
5191 auto IsHFPR = [&](const MachineOperand &Op) {
5192 if (!Op.isReg())
5193 return false;
5194 auto Reg = Op.getReg();
5195 if (Reg.isPhysical())
5196 return AArch64::FPR16RegClass.contains(Reg);
5197 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5198 return TRC == &AArch64::FPR16RegClass ||
5199 TRC == &AArch64::FPR16_loRegClass;
5200 };
5201 return llvm::any_of(MI.operands(), IsHFPR);
5202}
5203
5204 bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
5205 auto IsQFPR = [&](const MachineOperand &Op) {
5206 if (!Op.isReg())
5207 return false;
5208 auto Reg = Op.getReg();
5209 if (Reg.isPhysical())
5210 return AArch64::FPR128RegClass.contains(Reg);
5211 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5212 return TRC == &AArch64::FPR128RegClass ||
5213 TRC == &AArch64::FPR128_loRegClass;
5214 };
5215 return llvm::any_of(MI.operands(), IsQFPR);
5216}
5217
5218 bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
5219 switch (MI.getOpcode()) {
5220 case AArch64::BRK:
5221 case AArch64::HLT:
5222 case AArch64::PACIASP:
5223 case AArch64::PACIBSP:
5224 // Implicit BTI behavior.
5225 return true;
5226 case AArch64::PAUTH_PROLOGUE:
5227 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
5228 return true;
5229 case AArch64::HINT: {
5230 unsigned Imm = MI.getOperand(0).getImm();
5231 // Explicit BTI instruction.
5232 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
5233 return true;
5234 // PACI(A|B)SP instructions.
5235 if (Imm == 25 || Imm == 27)
5236 return true;
5237 return false;
5238 }
5239 default:
5240 return false;
5241 }
5242}
5243
5244 bool AArch64InstrInfo::isFpOrNEON(Register Reg) {
5245 if (Reg == 0)
5246 return false;
5247 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
5248 return AArch64::FPR128RegClass.contains(Reg) ||
5249 AArch64::FPR64RegClass.contains(Reg) ||
5250 AArch64::FPR32RegClass.contains(Reg) ||
5251 AArch64::FPR16RegClass.contains(Reg) ||
5252 AArch64::FPR8RegClass.contains(Reg);
5253}
5254
5255 bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
5256 auto IsFPR = [&](const MachineOperand &Op) {
5257 if (!Op.isReg())
5258 return false;
5259 auto Reg = Op.getReg();
5260 if (Reg.isPhysical())
5261 return isFpOrNEON(Reg);
5262
5263 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5264 return TRC == &AArch64::FPR128RegClass ||
5265 TRC == &AArch64::FPR128_loRegClass ||
5266 TRC == &AArch64::FPR64RegClass ||
5267 TRC == &AArch64::FPR64_loRegClass ||
5268 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
5269 TRC == &AArch64::FPR8RegClass;
5270 };
5271 return llvm::any_of(MI.operands(), IsFPR);
5272}
5273
5274// Scale the unscaled offsets. Returns false if the unscaled offset can't be
5275// scaled.
5276static bool scaleOffset(unsigned Opc, int64_t &Offset) {
5277 int Scale = AArch64InstrInfo::getMemScale(Opc);
5278
5279 // If the byte-offset isn't a multiple of the stride, we can't scale this
5280 // offset.
5281 if (Offset % Scale != 0)
5282 return false;
5283
5284 // Convert the byte-offset used by unscaled into an "element" offset used
5285 // by the scaled pair load/store instructions.
5286 Offset /= Scale;
5287 return true;
5288}
5289
5290static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
5291 if (FirstOpc == SecondOpc)
5292 return true;
5293 // We can also pair sign-ext and zero-ext instructions.
5294 switch (FirstOpc) {
5295 default:
5296 return false;
5297 case AArch64::STRSui:
5298 case AArch64::STURSi:
5299 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
5300 case AArch64::STRDui:
5301 case AArch64::STURDi:
5302 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
5303 case AArch64::STRQui:
5304 case AArch64::STURQi:
5305 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
5306 case AArch64::STRWui:
5307 case AArch64::STURWi:
5308 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
5309 case AArch64::STRXui:
5310 case AArch64::STURXi:
5311 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
5312 case AArch64::LDRSui:
5313 case AArch64::LDURSi:
5314 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
5315 case AArch64::LDRDui:
5316 case AArch64::LDURDi:
5317 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
5318 case AArch64::LDRQui:
5319 case AArch64::LDURQi:
5320 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
5321 case AArch64::LDRWui:
5322 case AArch64::LDURWi:
5323 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
5324 case AArch64::LDRSWui:
5325 case AArch64::LDURSWi:
5326 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
5327 case AArch64::LDRXui:
5328 case AArch64::LDURXi:
5329 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
5330 }
5331 // These instructions can't be paired based on their opcodes.
5332 return false;
5333}
5334
5335static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
5336 int64_t Offset1, unsigned Opcode1, int FI2,
5337 int64_t Offset2, unsigned Opcode2) {
5338 // Accesses through fixed stack object frame indices may access a different
5339 // fixed stack slot. Check that the object offsets + offsets match.
5340 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
5341 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
5342 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
5343 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
5344 // Convert to scaled object offsets.
5345 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
5346 if (ObjectOffset1 % Scale1 != 0)
5347 return false;
5348 ObjectOffset1 /= Scale1;
5349 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
5350 if (ObjectOffset2 % Scale2 != 0)
5351 return false;
5352 ObjectOffset2 /= Scale2;
5353 ObjectOffset1 += Offset1;
5354 ObjectOffset2 += Offset2;
5355 return ObjectOffset1 + 1 == ObjectOffset2;
5356 }
5357
5358 return FI1 == FI2;
5359}
5360
5361/// Detect opportunities for ldp/stp formation.
5362///
5363/// Only called for LdSt for which getMemOperandWithOffset returns true.
5364 bool AArch64InstrInfo::shouldClusterMemOps(
5365 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
5366 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
5367 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
5368 unsigned NumBytes) const {
5369 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
5370 const MachineOperand &BaseOp1 = *BaseOps1.front();
5371 const MachineOperand &BaseOp2 = *BaseOps2.front();
5372 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
5373 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
5374 if (BaseOp1.getType() != BaseOp2.getType())
5375 return false;
5376
5377 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
5378 "Only base registers and frame indices are supported.");
5379
5380 // Check for both base regs and base FI.
5381 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
5382 return false;
5383
5384 // Only cluster up to a single pair.
5385 if (ClusterSize > 2)
5386 return false;
5387
5388 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
5389 return false;
5390
5391 // Can we pair these instructions based on their opcodes?
5392 unsigned FirstOpc = FirstLdSt.getOpcode();
5393 unsigned SecondOpc = SecondLdSt.getOpcode();
5394 if (!canPairLdStOpc(FirstOpc, SecondOpc))
5395 return false;
5396
5397 // Can't merge volatiles or load/stores that have a hint to avoid pair
5398 // formation, for example.
5399 if (!isCandidateToMergeOrPair(FirstLdSt) ||
5400 !isCandidateToMergeOrPair(SecondLdSt))
5401 return false;
5402
5403 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
5404 int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
5405 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
5406 return false;
5407
5408 int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
5409 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
5410 return false;
5411
5412 // Pairwise instructions have a 7-bit signed offset field.
5413 if (Offset1 > 63 || Offset1 < -64)
5414 return false;
5415
5416 // The caller should already have ordered First/SecondLdSt by offset.
5417 // Note: except for non-equal frame index bases
5418 if (BaseOp1.isFI()) {
5419 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
5420 "Caller should have ordered offsets.");
5421
5422 const MachineFrameInfo &MFI =
5423 FirstLdSt.getParent()->getParent()->getFrameInfo();
5424 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
5425 BaseOp2.getIndex(), Offset2, SecondOpc);
5426 }
5427
5428 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
5429
5430 return Offset1 + 1 == Offset2;
5431}
5432
5433 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
5434 MCRegister Reg, unsigned SubIdx,
5435 RegState State,
5436 const TargetRegisterInfo *TRI) {
5437 if (!SubIdx)
5438 return MIB.addReg(Reg, State);
5439
5440 if (Reg.isPhysical())
5441 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
5442 return MIB.addReg(Reg, State, SubIdx);
5443}
5444
5445static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
5446 unsigned NumRegs) {
5447 // We really want the positive remainder mod 32 here; that happens to be
5448 // easily obtainable with a mask.
5449 return ((DestReg - SrcReg) & 0x1f) < NumRegs;
5450}
5451
5454 const DebugLoc &DL, MCRegister DestReg,
5455 MCRegister SrcReg, bool KillSrc,
5456 unsigned Opcode,
5457 ArrayRef<unsigned> Indices) const {
5458 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
5459 const TargetRegisterInfo *TRI = &getRegisterInfo();
5460 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
5461 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
5462 unsigned NumRegs = Indices.size();
5463
5464 int SubReg = 0, End = NumRegs, Incr = 1;
5465 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
5466 SubReg = NumRegs - 1;
5467 End = -1;
5468 Incr = -1;
5469 }
5470
5471 for (; SubReg != End; SubReg += Incr) {
5472 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
5473 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
5474 AddSubReg(MIB, SrcReg, Indices[SubReg], {}, TRI);
5475 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
5476 }
5477}
5478
5481 const DebugLoc &DL, MCRegister DestReg,
5482 MCRegister SrcReg, bool KillSrc,
5483 unsigned Opcode, unsigned ZeroReg,
5484 llvm::ArrayRef<unsigned> Indices) const {
5485 const TargetRegisterInfo *TRI = &getRegisterInfo();
5486 unsigned NumRegs = Indices.size();
5487
5488#ifndef NDEBUG
5489 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
5490 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
5491 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
5492 "GPR reg sequences should not be able to overlap");
5493#endif
5494
5495 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
5496 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
5497 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
5498 MIB.addReg(ZeroReg);
5499 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
5500 MIB.addImm(0);
5501 }
5502}
5503
5504/// Returns true if the instruction at I is in a streaming call site region,
5505/// within a single basic block.
5506/// A "call site streaming region" starts after smstart and ends at smstop
5507/// around a call to a streaming function. This walks backward from I.
5508 static bool isInStreamingCallSiteRegion(MachineBasicBlock &MBB,
5509 MachineBasicBlock::iterator I) {
5510 MachineFunction &MF = *MBB.getParent();
5511 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
5512 if (!AFI->hasStreamingModeChanges())
5513 return false;
5514 // Walk backwards to find smstart/smstop
5515 for (MachineInstr &MI : reverse(make_range(MBB.begin(), I))) {
5516 unsigned Opc = MI.getOpcode();
5517 if (Opc == AArch64::MSRpstatesvcrImm1 || Opc == AArch64::MSRpstatePseudo) {
5518 // Check if this is an SM change (not a ZA-only change)
5519 int64_t PState = MI.getOperand(0).getImm();
5520 if (PState == AArch64SVCR::SVCRSM || PState == AArch64SVCR::SVCRSMZA) {
5521 // Operand 1 is 1 for start, 0 for stop
5522 return MI.getOperand(1).getImm() == 1;
5523 }
5524 }
5525 }
5526 return false;
5527}
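// Illustration (hypothetical block): given
//   smstart sm; <I>; bl @streaming_callee; smstop sm
// the backward walk from <I> reaches the smstart first and returns true, so
// the copy expansion below avoids NEON between the smstart/smstop pair.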
5528
5529 /// Returns true if in a call site streaming region without SME-FA64.
5530 static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget,
5531 MachineBasicBlock &MBB,
5532 MachineBasicBlock::iterator I) {
5533 return !Subtarget.hasSMEFA64() && isInStreamingCallSiteRegion(MBB, I);
5534}
5535
5536 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
5537 MachineBasicBlock::iterator I,
5538 const DebugLoc &DL, Register DestReg,
5539 Register SrcReg, bool KillSrc,
5540 bool RenamableDest,
5541 bool RenamableSrc) const {
5542 ++NumCopyInstrs;
5543 if (AArch64::GPR32spRegClass.contains(DestReg) &&
5544 AArch64::GPR32spRegClass.contains(SrcReg)) {
5545 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
5546 // If either operand is WSP, expand to ADD #0.
5547 if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5548 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5549 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
5550 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5551 &AArch64::GPR64spRegClass);
5552 MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
5553 &AArch64::GPR64spRegClass);
5554 // This instruction is reading and writing X registers. This may upset
5555 // the register scavenger and machine verifier, so we need to indicate
5556 // that we are reading an undefined value from SrcRegX, but a proper
5557 // value from SrcReg.
5558 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
5559 .addReg(SrcRegX, RegState::Undef)
5560 .addImm(0)
5561 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
5562 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5563 ++NumZCRegMoveInstrsGPR;
5564 } else {
5565 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
5566 .addReg(SrcReg, getKillRegState(KillSrc))
5567 .addImm(0)
5568 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5569 if (Subtarget.hasZeroCycleRegMoveGPR32())
5570 ++NumZCRegMoveInstrsGPR;
5571 }
5572 } else if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5573 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5574 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
5575 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5576 &AArch64::GPR64spRegClass);
5577 assert(DestRegX.isValid() && "Destination super-reg not valid");
5578 MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
5579 &AArch64::GPR64spRegClass);
5580 assert(SrcRegX.isValid() && "Source super-reg not valid");
5581 // This instruction is reading and writing X registers. This may upset
5582 // the register scavenger and machine verifier, so we need to indicate
5583 // that we are reading an undefined value from SrcRegX, but a proper
5584 // value from SrcReg.
5585 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
5586 .addReg(AArch64::XZR)
5587 .addReg(SrcRegX, RegState::Undef)
5588 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5589 ++NumZCRegMoveInstrsGPR;
5590 } else {
5591 // Otherwise, expand to ORR WZR.
5592 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
5593 .addReg(AArch64::WZR)
5594 .addReg(SrcReg, getKillRegState(KillSrc));
5595 if (Subtarget.hasZeroCycleRegMoveGPR32())
5596 ++NumZCRegMoveInstrsGPR;
5597 }
5598 return;
5599 }
5600
5601 // GPR32 zeroing
5602 if (AArch64::GPR32spRegClass.contains(DestReg) && SrcReg == AArch64::WZR) {
5603 if (Subtarget.hasZeroCycleZeroingGPR64() &&
5604 !Subtarget.hasZeroCycleZeroingGPR32()) {
5605 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5606 &AArch64::GPR64spRegClass);
5607 assert(DestRegX.isValid() && "Destination super-reg not valid");
5608 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestRegX)
5609 .addImm(0)
5610 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5611 ++NumZCZeroingInstrsGPR;
5612 } else if (Subtarget.hasZeroCycleZeroingGPR32()) {
5613 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
5614 .addImm(0)
5615 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5616 ++NumZCZeroingInstrsGPR;
5617 } else {
5618 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
5619 .addReg(AArch64::WZR)
5620 .addReg(AArch64::WZR);
5621 }
5622 return;
5623 }
5624
5625 if (AArch64::GPR64spRegClass.contains(DestReg) &&
5626 AArch64::GPR64spRegClass.contains(SrcReg)) {
5627 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
5628 // If either operand is SP, expand to ADD #0.
5629 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
5630 .addReg(SrcReg, getKillRegState(KillSrc))
5631 .addImm(0)
5632 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5633 if (Subtarget.hasZeroCycleRegMoveGPR64())
5634 ++NumZCRegMoveInstrsGPR;
5635 } else {
5636 // Otherwise, expand to ORR XZR.
5637 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
5638 .addReg(AArch64::XZR)
5639 .addReg(SrcReg, getKillRegState(KillSrc));
5640 if (Subtarget.hasZeroCycleRegMoveGPR64())
5641 ++NumZCRegMoveInstrsGPR;
5642 }
5643 return;
5644 }
5645
5646 // GPR64 zeroing
5647 if (AArch64::GPR64spRegClass.contains(DestReg) && SrcReg == AArch64::XZR) {
5648 if (Subtarget.hasZeroCycleZeroingGPR64()) {
5649 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
5650 .addImm(0)
5651 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5652 ++NumZCZeroingInstrsGPR;
5653 } else {
5654 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
5655 .addReg(AArch64::XZR)
5656 .addReg(AArch64::XZR);
5657 }
5658 return;
5659 }
5660
5661 // Copy a Predicate register by ORRing with itself.
5662 if (AArch64::PPRRegClass.contains(DestReg) &&
5663 AArch64::PPRRegClass.contains(SrcReg)) {
5664 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5665 "Unexpected SVE register.");
5666 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
5667 .addReg(SrcReg) // Pg
5668 .addReg(SrcReg)
5669 .addReg(SrcReg, getKillRegState(KillSrc));
5670 return;
5671 }
5672
5673 // Copy a predicate-as-counter register by ORRing with itself as if it
5674 // were a regular predicate (mask) register.
5675 bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
5676 bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
5677 if (DestIsPNR || SrcIsPNR) {
5678 auto ToPPR = [](MCRegister R) -> MCRegister {
5679 return (R - AArch64::PN0) + AArch64::P0;
5680 };
5681 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg.asMCReg();
5682 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg.asMCReg();
5683
5684 if (PPRSrcReg != PPRDestReg) {
5685 auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
5686 .addReg(PPRSrcReg) // Pg
5687 .addReg(PPRSrcReg)
5688 .addReg(PPRSrcReg, getKillRegState(KillSrc));
5689 if (DestIsPNR)
5690 NewMI.addDef(DestReg, RegState::Implicit);
5691 }
5692 return;
5693 }
5694
5695 // Copy a Z register by ORRing with itself.
5696 if (AArch64::ZPRRegClass.contains(DestReg) &&
5697 AArch64::ZPRRegClass.contains(SrcReg)) {
5698 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5699 "Unexpected SVE register.");
5700 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
5701 .addReg(SrcReg)
5702 .addReg(SrcReg, getKillRegState(KillSrc));
5703 return;
5704 }
5705
5706 // Copy a Z register pair by copying the individual sub-registers.
5707 if ((AArch64::ZPR2RegClass.contains(DestReg) ||
5708 AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
5709 (AArch64::ZPR2RegClass.contains(SrcReg) ||
5710 AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
5711 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5712 "Unexpected SVE register.");
5713 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
5714 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5715 Indices);
5716 return;
5717 }
5718
5719 // Copy a Z register triple by copying the individual sub-registers.
5720 if (AArch64::ZPR3RegClass.contains(DestReg) &&
5721 AArch64::ZPR3RegClass.contains(SrcReg)) {
5722 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5723 "Unexpected SVE register.");
5724 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5725 AArch64::zsub2};
5726 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5727 Indices);
5728 return;
5729 }
5730
5731 // Copy a Z register quad by copying the individual sub-registers.
5732 if ((AArch64::ZPR4RegClass.contains(DestReg) ||
5733 AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
5734 (AArch64::ZPR4RegClass.contains(SrcReg) ||
5735 AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
5736 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5737 "Unexpected SVE register.");
5738 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5739 AArch64::zsub2, AArch64::zsub3};
5740 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5741 Indices);
5742 return;
5743 }
5744
5745 // Copy a DDDD register quad by copying the individual sub-registers.
5746 if (AArch64::DDDDRegClass.contains(DestReg) &&
5747 AArch64::DDDDRegClass.contains(SrcReg)) {
5748 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5749 AArch64::dsub2, AArch64::dsub3};
5750 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5751 Indices);
5752 return;
5753 }
5754
5755 // Copy a DDD register triple by copying the individual sub-registers.
5756 if (AArch64::DDDRegClass.contains(DestReg) &&
5757 AArch64::DDDRegClass.contains(SrcReg)) {
5758 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5759 AArch64::dsub2};
5760 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5761 Indices);
5762 return;
5763 }
5764
5765 // Copy a DD register pair by copying the individual sub-registers.
5766 if (AArch64::DDRegClass.contains(DestReg) &&
5767 AArch64::DDRegClass.contains(SrcReg)) {
5768 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
5769 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5770 Indices);
5771 return;
5772 }
5773
5774 // Copy a QQQQ register quad by copying the individual sub-registers.
5775 if (AArch64::QQQQRegClass.contains(DestReg) &&
5776 AArch64::QQQQRegClass.contains(SrcReg)) {
5777 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5778 AArch64::qsub2, AArch64::qsub3};
5779 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5780 Indices);
5781 return;
5782 }
5783
5784 // Copy a QQQ register triple by copying the individual sub-registers.
5785 if (AArch64::QQQRegClass.contains(DestReg) &&
5786 AArch64::QQQRegClass.contains(SrcReg)) {
5787 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5788 AArch64::qsub2};
5789 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5790 Indices);
5791 return;
5792 }
5793
5794 // Copy a QQ register pair by copying the individual sub-registers.
5795 if (AArch64::QQRegClass.contains(DestReg) &&
5796 AArch64::QQRegClass.contains(SrcReg)) {
5797 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
5798 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5799 Indices);
5800 return;
5801 }
5802
5803 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
5804 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
5805 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
5806 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
5807 AArch64::XZR, Indices);
5808 return;
5809 }
5810
5811 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
5812 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
5813 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
5814 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
5815 AArch64::WZR, Indices);
5816 return;
5817 }
5818
5819 if (AArch64::FPR128RegClass.contains(DestReg) &&
5820 AArch64::FPR128RegClass.contains(SrcReg)) {
5821 // In streaming regions, NEON is illegal but streaming-SVE is available.
5822 // Use SVE for copies if we're in a streaming region and SME is available.
5823 // With +sme-fa64, NEON is legal in streaming mode so we can use it.
5824 if ((Subtarget.isSVEorStreamingSVEAvailable() &&
5825 !Subtarget.isNeonAvailable()) ||
5826 mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5827 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
5828 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
5829 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
5830 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
5831 } else if (Subtarget.isNeonAvailable()) {
5832 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
5833 .addReg(SrcReg)
5834 .addReg(SrcReg, getKillRegState(KillSrc));
5835 if (Subtarget.hasZeroCycleRegMoveFPR128())
5836 ++NumZCRegMoveInstrsFPR;
5837 } else {
5838 BuildMI(MBB, I, DL, get(AArch64::STRQpre))
5839 .addReg(AArch64::SP, RegState::Define)
5840 .addReg(SrcReg, getKillRegState(KillSrc))
5841 .addReg(AArch64::SP)
5842 .addImm(-16);
5843 BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
5844 .addReg(AArch64::SP, RegState::Define)
5845 .addReg(DestReg, RegState::Define)
5846 .addReg(AArch64::SP)
5847 .addImm(16);
5848 }
5849 return;
5850 }
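// Note on the fallback above: when neither NEON nor SVE ORR is usable, the
// 128-bit value is bounced through the stack. STRQpre stores it at
// [sp, #-16]! and LDRQpost reloads it with a post-increment, leaving SP
// unchanged overall.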
5851
5852 if (AArch64::FPR64RegClass.contains(DestReg) &&
5853 AArch64::FPR64RegClass.contains(SrcReg)) {
5854 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5855 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5856 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
5857 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5858 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
5859 &AArch64::FPR128RegClass);
5860 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
5861 &AArch64::FPR128RegClass);
5862 // This instruction is reading and writing Q registers. This may upset
5863 // the register scavenger and machine verifier, so we need to indicate
5864 // that we are reading an undefined value from SrcRegQ, but a proper
5865 // value from SrcReg.
5866 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
5867 .addReg(SrcRegQ, RegState::Undef)
5868 .addReg(SrcRegQ, RegState::Undef)
5869 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5870 ++NumZCRegMoveInstrsFPR;
5871 } else {
5872 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
5873 .addReg(SrcReg, getKillRegState(KillSrc));
5874 if (Subtarget.hasZeroCycleRegMoveFPR64())
5875 ++NumZCRegMoveInstrsFPR;
5876 }
5877 return;
5878 }
5879
5880 if (AArch64::FPR32RegClass.contains(DestReg) &&
5881 AArch64::FPR32RegClass.contains(SrcReg)) {
5882 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5883 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5884 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
5885 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5886 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
5887 &AArch64::FPR128RegClass);
5888 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
5889 &AArch64::FPR128RegClass);
5890 // This instruction is reading and writing Q registers. This may upset
5891 // the register scavenger and machine verifier, so we need to indicate
5892 // that we are reading an undefined value from SrcRegQ, but a proper
5893 // value from SrcReg.
5894 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
5895 .addReg(SrcRegQ, RegState::Undef)
5896 .addReg(SrcRegQ, RegState::Undef)
5897 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5898 ++NumZCRegMoveInstrsFPR;
5899 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5900 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5901 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
5902 &AArch64::FPR64RegClass);
5903 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
5904 &AArch64::FPR64RegClass);
5905 // This instruction is reading and writing D registers. This may upset
5906 // the register scavenger and machine verifier, so we need to indicate
5907 // that we are reading an undefined value from SrcRegD, but a proper
5908 // value from SrcReg.
5909 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5910 .addReg(SrcRegD, RegState::Undef)
5911 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5912 ++NumZCRegMoveInstrsFPR;
5913 } else {
5914 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5915 .addReg(SrcReg, getKillRegState(KillSrc));
5916 if (Subtarget.hasZeroCycleRegMoveFPR32())
5917 ++NumZCRegMoveInstrsFPR;
5918 }
5919 return;
5920 }
5921
5922 if (AArch64::FPR16RegClass.contains(DestReg) &&
5923 AArch64::FPR16RegClass.contains(SrcReg)) {
5924 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5925 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5926 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
5927 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5928 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
5929 &AArch64::FPR128RegClass);
5930 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
5931 &AArch64::FPR128RegClass);
5932 // This instruction is reading and writing Q registers. This may upset
5933 // the register scavenger and machine verifier, so we need to indicate
5934 // that we are reading an undefined value from SrcRegQ, but a proper
5935 // value from SrcReg.
5936 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
5937 .addReg(SrcRegQ, RegState::Undef)
5938 .addReg(SrcRegQ, RegState::Undef)
5939 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5940 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5941 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5942 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
5943 &AArch64::FPR64RegClass);
5944 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
5945 &AArch64::FPR64RegClass);
5946 // This instruction is reading and writing D registers. This may upset
5947 // the register scavenger and machine verifier, so we need to indicate
5948 // that we are reading an undefined value from SrcRegD, but a proper
5949 // value from SrcReg.
5950 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5951 .addReg(SrcRegD, RegState::Undef)
5952 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5953 } else {
5954 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
5955 &AArch64::FPR32RegClass);
5956 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
5957 &AArch64::FPR32RegClass);
5958 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5959 .addReg(SrcReg, getKillRegState(KillSrc));
5960 }
5961 return;
5962 }
5963
5964 if (AArch64::FPR8RegClass.contains(DestReg) &&
5965 AArch64::FPR8RegClass.contains(SrcReg)) {
5966 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5967 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5968 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable() &&
5969 !mustAvoidNeonAtMBBI(Subtarget, MBB, I)) {
5970 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
5971 &AArch64::FPR128RegClass);
5972 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
5973 &AArch64::FPR128RegClass);
5974 // This instruction is reading and writing Q registers. This may upset
5975 // the register scavenger and machine verifier, so we need to indicate
5976 // that we are reading an undefined value from SrcRegQ, but a proper
5977 // value from SrcReg.
5978 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
5979 .addReg(SrcRegQ, RegState::Undef)
5980 .addReg(SrcRegQ, RegState::Undef)
5981 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5982 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5983 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5984 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
5985 &AArch64::FPR64RegClass);
5986 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
5987 &AArch64::FPR64RegClass);
5988 // This instruction is reading and writing D registers. This may upset
5989 // the register scavenger and machine verifier, so we need to indicate
5990 // that we are reading an undefined value from SrcRegD, but a proper
5991 // value from SrcReg.
5992 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5993 .addReg(SrcRegD, RegState::Undef)
5994 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5995 } else {
5996 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
5997 &AArch64::FPR32RegClass);
5998 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
5999 &AArch64::FPR32RegClass);
6000 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
6001 .addReg(SrcReg, getKillRegState(KillSrc));
6002 }
6003 return;
6004 }
6005
6006 // Copies between GPR64 and FPR64.
6007 if (AArch64::FPR64RegClass.contains(DestReg) &&
6008 AArch64::GPR64RegClass.contains(SrcReg)) {
6009 if (AArch64::XZR == SrcReg) {
6010 BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg);
6011 } else {
6012 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
6013 .addReg(SrcReg, getKillRegState(KillSrc));
6014 }
6015 return;
6016 }
6017 if (AArch64::GPR64RegClass.contains(DestReg) &&
6018 AArch64::FPR64RegClass.contains(SrcReg)) {
6019 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
6020 .addReg(SrcReg, getKillRegState(KillSrc));
6021 return;
6022 }
6023 // Copies between GPR32 and FPR32.
6024 if (AArch64::FPR32RegClass.contains(DestReg) &&
6025 AArch64::GPR32RegClass.contains(SrcReg)) {
6026 if (AArch64::WZR == SrcReg) {
6027 BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg);
6028 } else {
6029 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
6030 .addReg(SrcReg, getKillRegState(KillSrc));
6031 }
6032 return;
6033 }
6034 if (AArch64::GPR32RegClass.contains(DestReg) &&
6035 AArch64::FPR32RegClass.contains(SrcReg)) {
6036 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
6037 .addReg(SrcReg, getKillRegState(KillSrc));
6038 return;
6039 }
6040
6041 if (DestReg == AArch64::NZCV) {
6042 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
6043 BuildMI(MBB, I, DL, get(AArch64::MSR))
6044 .addImm(AArch64SysReg::NZCV)
6045 .addReg(SrcReg, getKillRegState(KillSrc))
6046 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
6047 return;
6048 }
6049
6050 if (SrcReg == AArch64::NZCV) {
6051 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
6052 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
6053 .addImm(AArch64SysReg::NZCV)
6054 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
6055 return;
6056 }
6057
6058#ifndef NDEBUG
6059 errs() << RI.getRegAsmName(DestReg) << " = COPY " << RI.getRegAsmName(SrcReg)
6060 << "\n";
6061#endif
6062 llvm_unreachable("unimplemented reg-to-reg copy");
6063}
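// Usage sketch (hypothetical call, not taken from this file): a post-RA COPY
// such as $q1 = COPY $q0 is lowered via
//   TII->copyPhysReg(MBB, MI, DL, AArch64::Q1, AArch64::Q0, /*KillSrc=*/true);
// which, on a NEON-capable target, selects the FPR128 case above and emits
// "orr v1.16b, v0.16b, v0.16b".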
6064
6065 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
6066 MachineBasicBlock &MBB,
6067 MachineBasicBlock::iterator InsertBefore,
6068 const MCInstrDesc &MCID,
6069 Register SrcReg, bool IsKill,
6070 unsigned SubIdx0, unsigned SubIdx1, int FI,
6071 MachineMemOperand *MMO) {
6072 Register SrcReg0 = SrcReg;
6073 Register SrcReg1 = SrcReg;
6074 if (SrcReg.isPhysical()) {
6075 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
6076 SubIdx0 = 0;
6077 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
6078 SubIdx1 = 0;
6079 }
6080 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
6081 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
6082 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
6083 .addFrameIndex(FI)
6084 .addImm(0)
6085 .addMemOperand(MMO);
6086}
6087
6088 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
6089 MachineBasicBlock::iterator MBBI,
6090 Register SrcReg, bool isKill, int FI,
6091 const TargetRegisterClass *RC,
6092 Register VReg,
6093 MachineInstr::MIFlag Flags) const {
6094 MachineFunction &MF = *MBB.getParent();
6095 MachineFrameInfo &MFI = MF.getFrameInfo();
6096
6097 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
6098 MachineMemOperand *MMO =
6099 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
6100 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
6101 unsigned Opc = 0;
6102 bool Offset = true;
6103 Register PNRReg = MCRegister::NoRegister;
6104 unsigned StackID = TargetStackID::Default;
6105 switch (RI.getSpillSize(*RC)) {
6106 case 1:
6107 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6108 Opc = AArch64::STRBui;
6109 break;
6110 case 2: {
6111 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6112 Opc = AArch64::STRHui;
6113 else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
6114 AArch64::PPRRegClass.hasSubClassEq(RC)) {
6115 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6116 "Unexpected register store without SVE store instructions");
6117 Opc = AArch64::STR_PXI;
6118 StackID = TargetStackID::ScalablePredicateVector;
6119 }
6120 break;
6121 }
6122 case 4:
6123 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6124 Opc = AArch64::STRWui;
6125 if (SrcReg.isVirtual())
6126 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
6127 else
6128 assert(SrcReg != AArch64::WSP);
6129 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6130 Opc = AArch64::STRSui;
6131 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6132 Opc = AArch64::STR_PPXI;
6133 StackID = TargetStackID::ScalablePredicateVector;
6134 }
6135 break;
6136 case 8:
6137 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6138 Opc = AArch64::STRXui;
6139 if (SrcReg.isVirtual())
6140 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
6141 else
6142 assert(SrcReg != AArch64::SP);
6143 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6144 Opc = AArch64::STRDui;
6145 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
6146 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
6147 get(AArch64::STPWi), SrcReg, isKill,
6148 AArch64::sube32, AArch64::subo32, FI, MMO);
6149 return;
6150 }
6151 break;
6152 case 16:
6153 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6154 Opc = AArch64::STRQui;
6155 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6156 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6157 Opc = AArch64::ST1Twov1d;
6158 Offset = false;
6159 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
6160 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
6161 get(AArch64::STPXi), SrcReg, isKill,
6162 AArch64::sube64, AArch64::subo64, FI, MMO);
6163 return;
6164 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6165 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6166 "Unexpected register store without SVE store instructions");
6167 Opc = AArch64::STR_ZXI;
6168 StackID = TargetStackID::ScalableVector;
6169 }
6170 break;
6171 case 24:
6172 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6173 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6174 Opc = AArch64::ST1Threev1d;
6175 Offset = false;
6176 }
6177 break;
6178 case 32:
6179 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6180 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6181 Opc = AArch64::ST1Fourv1d;
6182 Offset = false;
6183 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6184 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6185 Opc = AArch64::ST1Twov2d;
6186 Offset = false;
6187 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6188 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6189 "Unexpected register store without SVE store instructions");
6190 Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
6191 StackID = TargetStackID::ScalableVector;
6192 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6193 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6194 "Unexpected register store without SVE store instructions");
6195 Opc = AArch64::STR_ZZXI;
6196 StackID = TargetStackID::ScalableVector;
6197 }
6198 break;
6199 case 48:
6200 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6201 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6202 Opc = AArch64::ST1Threev2d;
6203 Offset = false;
6204 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6205 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6206 "Unexpected register store without SVE store instructions");
6207 Opc = AArch64::STR_ZZZXI;
6208 StackID = TargetStackID::ScalableVector;
6209 }
6210 break;
6211 case 64:
6212 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6213 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6214 Opc = AArch64::ST1Fourv2d;
6215 Offset = false;
6216 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6217 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6218 "Unexpected register store without SVE store instructions");
6219 Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
6220 StackID = TargetStackID::ScalableVector;
6221 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6222 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6223 "Unexpected register store without SVE store instructions");
6224 Opc = AArch64::STR_ZZZZXI;
6225 StackID = TargetStackID::ScalableVector;
6226 }
6227 break;
6228 }
6229 assert(Opc && "Unknown register class");
6230 MFI.setStackID(FI, StackID);
6231
6232 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
6233 .addReg(SrcReg, getKillRegState(isKill))
6234 .addFrameIndex(FI);
6235
6236 if (Offset)
6237 MI.addImm(0);
6238 if (PNRReg.isValid())
6239 MI.addDef(PNRReg, RegState::Implicit);
6240 MI.addMemOperand(MMO);
6241}
6242
6243 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
6244 MachineBasicBlock &MBB,
6245 MachineBasicBlock::iterator InsertBefore,
6246 const MCInstrDesc &MCID,
6247 Register DestReg, unsigned SubIdx0,
6248 unsigned SubIdx1, int FI,
6249 MachineMemOperand *MMO) {
6250 Register DestReg0 = DestReg;
6251 Register DestReg1 = DestReg;
6252 bool IsUndef = true;
6253 if (DestReg.isPhysical()) {
6254 DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
6255 SubIdx0 = 0;
6256 DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
6257 SubIdx1 = 0;
6258 IsUndef = false;
6259 }
6260 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
6261 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
6262 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
6263 .addFrameIndex(FI)
6264 .addImm(0)
6265 .addMemOperand(MMO);
6266}
6267
6268 void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
6269 MachineBasicBlock::iterator MBBI,
6270 Register DestReg, int FI,
6271 const TargetRegisterClass *RC,
6272 Register VReg, unsigned SubReg,
6273 MachineInstr::MIFlag Flags) const {
6274 MachineFunction &MF = *MBB.getParent();
6275 MachineFrameInfo &MFI = MF.getFrameInfo();
6276 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
6277 MachineMemOperand *MMO =
6278 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
6279 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
6280
6281 unsigned Opc = 0;
6282 bool Offset = true;
6283 unsigned StackID = TargetStackID::Default;
6284 Register PNRReg = MCRegister::NoRegister;
6285 switch (RI.getSpillSize(*RC)) {
6286 case 1:
6287 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6288 Opc = AArch64::LDRBui;
6289 break;
6290 case 2: {
6291 bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
6292 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6293 Opc = AArch64::LDRHui;
6294 else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
6295 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6296 "Unexpected register load without SVE load instructions");
6297 if (IsPNR)
6298 PNRReg = DestReg;
6299 Opc = AArch64::LDR_PXI;
6300 StackID = TargetStackID::ScalablePredicateVector;
6301 }
6302 break;
6303 }
6304 case 4:
6305 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6306 Opc = AArch64::LDRWui;
6307 if (DestReg.isVirtual())
6308 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
6309 else
6310 assert(DestReg != AArch64::WSP);
6311 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6312 Opc = AArch64::LDRSui;
6313 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6314 Opc = AArch64::LDR_PPXI;
6315 StackID = TargetStackID::ScalablePredicateVector;
6316 }
6317 break;
6318 case 8:
6319 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6320 Opc = AArch64::LDRXui;
6321 if (DestReg.isVirtual())
6322 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
6323 else
6324 assert(DestReg != AArch64::SP);
6325 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6326 Opc = AArch64::LDRDui;
6327 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
6328 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
6329 get(AArch64::LDPWi), DestReg, AArch64::sube32,
6330 AArch64::subo32, FI, MMO);
6331 return;
6332 }
6333 break;
6334 case 16:
6335 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6336 Opc = AArch64::LDRQui;
6337 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6338 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6339 Opc = AArch64::LD1Twov1d;
6340 Offset = false;
6341 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
6342 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
6343 get(AArch64::LDPXi), DestReg, AArch64::sube64,
6344 AArch64::subo64, FI, MMO);
6345 return;
6346 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6347 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6348 "Unexpected register load without SVE load instructions");
6349 Opc = AArch64::LDR_ZXI;
6350 StackID = TargetStackID::ScalableVector;
6351 }
6352 break;
6353 case 24:
6354 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6355 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6356 Opc = AArch64::LD1Threev1d;
6357 Offset = false;
6358 }
6359 break;
6360 case 32:
6361 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6362 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6363 Opc = AArch64::LD1Fourv1d;
6364 Offset = false;
6365 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6366 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6367 Opc = AArch64::LD1Twov2d;
6368 Offset = false;
6369 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6370 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6371 "Unexpected register load without SVE load instructions");
6372 Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
6373 StackID = TargetStackID::ScalableVector;
6374 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6375 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6376 "Unexpected register load without SVE load instructions");
6377 Opc = AArch64::LDR_ZZXI;
6378 StackID = TargetStackID::ScalableVector;
6379 }
6380 break;
6381 case 48:
6382 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6383 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6384 Opc = AArch64::LD1Threev2d;
6385 Offset = false;
6386 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6387 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6388 "Unexpected register load without SVE load instructions");
6389 Opc = AArch64::LDR_ZZZXI;
6390 StackID = TargetStackID::ScalableVector;
6391 }
6392 break;
6393 case 64:
6394 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6395 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6396 Opc = AArch64::LD1Fourv2d;
6397 Offset = false;
6398 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6399 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6400 "Unexpected register load without SVE load instructions");
6401 Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
6402 StackID = TargetStackID::ScalableVector;
6403 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6404 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6405 "Unexpected register load without SVE load instructions");
6406 Opc = AArch64::LDR_ZZZZXI;
6407 StackID = TargetStackID::ScalableVector;
6408 }
6409 break;
6410 }
6411
6412 assert(Opc && "Unknown register class");
6413 MFI.setStackID(FI, StackID);
6414
6415 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
6416 .addReg(DestReg, getDefRegState(true))
6417 .addFrameIndex(FI);
6418 if (Offset)
6419 MI.addImm(0);
6420 if (PNRReg.isValid() && !PNRReg.isVirtual())
6421 MI.addDef(PNRReg, RegState::Implicit);
6422 MI.addMemOperand(MMO);
6423}
6424
6425 static bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
6426 const MachineInstr &UseMI,
6427 const TargetRegisterInfo *TRI) {
6428 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
6429 UseMI.getIterator()),
6430 [TRI](const MachineInstr &I) {
6431 return I.modifiesRegister(AArch64::NZCV, TRI) ||
6432 I.readsRegister(AArch64::NZCV, TRI);
6433 });
6434}
6435
6436void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6437 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
6438 // The smallest scalable elements supported by scaled SVE addressing
6439 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6440 // byte offset must always be a multiple of 2.
6441 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6442
6443 // VGSized offsets are divided by '2', because the VG register is
6444 // the number of 64-bit granules as opposed to 128-bit vector chunks,
6445 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
6446 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
6447 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
6448 ByteSized = Offset.getFixed();
6449 VGSized = Offset.getScalable() / 2;
6450}
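// Worked example (hypothetical offset): StackOffset::get(/*Fixed=*/16,
// /*Scalable=*/32) yields ByteSized = 16 and VGSized = 16; the 32 scalable
// bytes are two SVE vectors of VG * 8 bytes each, i.e. 16 + VG * 16 in the
// DWARF expression.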
6451
6452/// Returns the offset in parts to which this frame offset can be
6453/// decomposed for the purpose of describing a frame offset.
6454/// For non-scalable offsets this is simply its byte size.
6455void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6456 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
6457 int64_t &NumDataVectors) {
6458 // The smallest scalable elements supported by scaled SVE addressing
6459 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6460 // byte offset must always be a multiple of 2.
6461 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6462
6463 NumBytes = Offset.getFixed();
6464 NumDataVectors = 0;
6465 NumPredicateVectors = Offset.getScalable() / 2;
6466 // This method is used to get the offsets to adjust the frame offset.
6467 // If materializing the offset would require more than two ADDPL
6468 // instructions, part of the offset is folded into NumDataVectors so that
6469 // ADDVL covers that part, reducing the number of ADDPL instructions.
6470 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
6471 NumPredicateVectors > 62) {
6472 NumDataVectors = NumPredicateVectors / 8;
6473 NumPredicateVectors -= NumDataVectors * 8;
6474 }
6475}
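// Worked examples (hypothetical offsets): a scalable offset of 6 bytes gives
// NumPredicateVectors = 3 (one ADDPL #3); a scalable offset of 144 bytes
// gives 72 predicate vectors, which is folded (72 % 8 == 0) into
// NumDataVectors = 9 and NumPredicateVectors = 0 (one ADDVL #9).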
6476
6477// Convenience function to create a DWARF expression for: Constant `Operation`.
6478 // This helper emits compact sequences for common cases. For example, for `-15
6479 // DW_OP_plus`, this helper would create DW_OP_lit15 DW_OP_minus.
6480 static void appendConstantExpr(SmallVectorImpl<char> &Expr, int64_t Constant,
6481 dwarf::LocationAtom Operation) {
6482 if (Operation == dwarf::DW_OP_plus && Constant < 0 && -Constant <= 31) {
6483 // -Constant (1 to 31)
6484 Expr.push_back(dwarf::DW_OP_lit0 - Constant);
6485 Operation = dwarf::DW_OP_minus;
6486 } else if (Constant >= 0 && Constant <= 31) {
6487 // Literal value 0 to 31
6488 Expr.push_back(dwarf::DW_OP_lit0 + Constant);
6489 } else {
6490 // Signed constant
6491 Expr.push_back(dwarf::DW_OP_consts);
6492 appendLEB128<LEB128Sign::Signed>(Expr, Constant);
6493 }
6494 return Expr.push_back(Operation);
6495}
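// Illustration: appendConstantExpr(Expr, 8, dwarf::DW_OP_mul) emits the
// compact DW_OP_lit8, DW_OP_mul, while appendConstantExpr(Expr, 100,
// dwarf::DW_OP_mul) falls back to DW_OP_consts 100, DW_OP_mul.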
6496
6497// Convenience function to create a DWARF expression for a register.
6498static void appendReadRegExpr(SmallVectorImpl<char> &Expr, unsigned RegNum) {
6499 Expr.push_back((char)dwarf::DW_OP_bregx);
6500 appendLEB128<LEB128Sign::Unsigned>(Expr, RegNum);
6501 Expr.push_back(0);
6502}
6503
6504// Convenience function to create a DWARF expression for loading a register from
6505// a CFA offset.
6506 static void appendLoadRegExpr(SmallVectorImpl<char> &Expr,
6507 int64_t OffsetFromDefCFA) {
6508 // This assumes the top of the DWARF stack contains the CFA.
6509 Expr.push_back(dwarf::DW_OP_dup);
6510 // Add the offset to the register.
6511 appendConstantExpr(Expr, OffsetFromDefCFA, dwarf::DW_OP_plus);
6512 // Dereference the address (loads a 64-bit value).
6513 Expr.push_back(dwarf::DW_OP_deref);
6514}
6515
6516// Convenience function to create a comment for
6517// (+/-) NumBytes (* RegScale)?
6518static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment,
6519 StringRef RegScale = {}) {
6520 if (NumBytes) {
6521 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
6522 if (!RegScale.empty())
6523 Comment << ' ' << RegScale;
6524 }
6525}
6526
6527// Creates an MCCFIInstruction:
6528// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
6529 static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
6530 unsigned Reg,
6531 const StackOffset &Offset) {
6532 int64_t NumBytes, NumVGScaledBytes;
6533 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
6534 NumVGScaledBytes);
6535 std::string CommentBuffer;
6536 llvm::raw_string_ostream Comment(CommentBuffer);
6537
6538 if (Reg == AArch64::SP)
6539 Comment << "sp";
6540 else if (Reg == AArch64::FP)
6541 Comment << "fp";
6542 else
6543 Comment << printReg(Reg, &TRI);
6544
6545 // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes)
6546 SmallString<64> Expr;
6547 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6548 assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
6549 // Reg + NumBytes
6550 Expr.push_back(dwarf::DW_OP_breg0 + DwarfReg);
6551 appendLEB128<LEB128Sign::Signed>(Expr, NumBytes);
6552 appendOffsetComment(NumBytes, Comment);
6553 if (NumVGScaledBytes) {
6554 // + VG * NumVGScaledBytes
6555 appendOffsetComment(NumVGScaledBytes, Comment, "* VG");
6556 appendReadRegExpr(Expr, TRI.getDwarfRegNum(AArch64::VG, true));
6557 appendConstantExpr(Expr, NumVGScaledBytes, dwarf::DW_OP_mul);
6558 Expr.push_back(dwarf::DW_OP_plus);
6559 }
6560
6561 // Wrap this into DW_CFA_def_cfa.
6562 SmallString<64> DefCfaExpr;
6563 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
6564 appendLEB128<LEB128Sign::Unsigned>(DefCfaExpr, Expr.size());
6565 DefCfaExpr.append(Expr.str());
6566 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
6567 Comment.str());
6568}
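// Illustration (hypothetical frame): for Reg = SP with a StackOffset of
// 16 fixed + 16 scalable bytes, the escape encodes approximately
//   DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 46 (VG) 0,
//   DW_OP_lit8, DW_OP_mul, DW_OP_plus
// with the comment "sp + 16 + 8 * VG". The rendering here is a sketch of the
// bytes built above, not captured compiler output.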
6569
6570 MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
6571 unsigned FrameReg, unsigned Reg,
6572 const StackOffset &Offset,
6573 bool LastAdjustmentWasScalable) {
6574 if (Offset.getScalable())
6575 return createDefCFAExpression(TRI, Reg, Offset);
6576
6577 if (FrameReg == Reg && !LastAdjustmentWasScalable)
6578 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
6579
6580 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6581 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
6582}
6583
6584 MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
6585 unsigned Reg,
6586 const StackOffset &OffsetFromDefCFA,
6587 std::optional<int64_t> IncomingVGOffsetFromDefCFA) {
6588 int64_t NumBytes, NumVGScaledBytes;
6589 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6590 OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
6591
6592 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6593
6594 // Non-scalable offsets can use DW_CFA_offset directly.
6595 if (!NumVGScaledBytes)
6596 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
6597
6598 std::string CommentBuffer;
6599 llvm::raw_string_ostream Comment(CommentBuffer);
6600 Comment << printReg(Reg, &TRI) << " @ cfa";
6601
6602 // Build up expression (CFA + VG * NumVGScaledBytes + NumBytes)
6603 assert(NumVGScaledBytes && "Expected scalable offset");
6604 SmallString<64> OffsetExpr;
6605 // + VG * NumVGScaledBytes
6606 StringRef VGRegScale;
6607 if (IncomingVGOffsetFromDefCFA) {
6608 appendLoadRegExpr(OffsetExpr, *IncomingVGOffsetFromDefCFA);
6609 VGRegScale = "* IncomingVG";
6610 } else {
6611 appendReadRegExpr(OffsetExpr, TRI.getDwarfRegNum(AArch64::VG, true));
6612 VGRegScale = "* VG";
6613 }
6614 appendConstantExpr(OffsetExpr, NumVGScaledBytes, dwarf::DW_OP_mul);
6615 appendOffsetComment(NumVGScaledBytes, Comment, VGRegScale);
6616 OffsetExpr.push_back(dwarf::DW_OP_plus);
6617 if (NumBytes) {
6618 // + NumBytes
6619 appendOffsetComment(NumBytes, Comment);
6620 appendConstantExpr(OffsetExpr, NumBytes, dwarf::DW_OP_plus);
6621 }
6622
6623 // Wrap this into DW_CFA_expression
6624 SmallString<64> CfaExpr;
6625 CfaExpr.push_back(dwarf::DW_CFA_expression);
6626 appendLEB128<LEB128Sign::Unsigned>(CfaExpr, DwarfReg);
6627 appendLEB128<LEB128Sign::Unsigned>(CfaExpr, OffsetExpr.size());
6628 CfaExpr.append(OffsetExpr.str());
6629
6630 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
6631 Comment.str());
6632}
6633
6634// Helper function to emit a frame offset adjustment from a given
6635 // pointer (SrcReg), stored into DestReg. This function is explicit in that
6636 // it requires the caller to supply the specific add/sub opcode.
6637 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
6638 MachineBasicBlock::iterator MBBI,
6640 unsigned SrcReg, int64_t Offset, unsigned Opc,
6641 const TargetInstrInfo *TII,
6642 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
6643 bool *HasWinCFI, bool EmitCFAOffset,
6644 StackOffset CFAOffset, unsigned FrameReg) {
6645 int Sign = 1;
6646 unsigned MaxEncoding, ShiftSize;
6647 switch (Opc) {
6648 case AArch64::ADDXri:
6649 case AArch64::ADDSXri:
6650 case AArch64::SUBXri:
6651 case AArch64::SUBSXri:
6652 MaxEncoding = 0xfff;
6653 ShiftSize = 12;
6654 break;
6655 case AArch64::ADDVL_XXI:
6656 case AArch64::ADDPL_XXI:
6657 case AArch64::ADDSVL_XXI:
6658 case AArch64::ADDSPL_XXI:
6659 MaxEncoding = 31;
6660 ShiftSize = 0;
6661 if (Offset < 0) {
6662 MaxEncoding = 32;
6663 Sign = -1;
6664 Offset = -Offset;
6665 }
6666 break;
6667 default:
6668 llvm_unreachable("Unsupported opcode");
6669 }
6670
6671 // `Offset` can be in bytes or in "scalable bytes".
6672 int VScale = 1;
6673 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
6674 VScale = 16;
6675 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
6676 VScale = 2;
6677
6678 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
6679 // scratch register. If DestReg is a virtual register, use it as the
6680 // scratch register; otherwise, create a new virtual register (to be
6681 // replaced by the scavenger at the end of PEI). That case can be optimized
6682 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
6683 // register can be loaded with offset%8 and the add/sub can use an extending
6684 // instruction with LSL#3.
6685 // Currently the function handles any offset but may generate a poor
6686 // sequence of code.
6687 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
6688
6689 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
6690 Register TmpReg = DestReg;
6691 if (TmpReg == AArch64::XZR)
6692 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
6693 &AArch64::GPR64RegClass);
6694 do {
6695 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
6696 unsigned LocalShiftSize = 0;
6697 if (ThisVal > MaxEncoding) {
6698 ThisVal = ThisVal >> ShiftSize;
6699 LocalShiftSize = ShiftSize;
6700 }
6701 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
6702 "Encoding cannot handle value that big");
6703
6704 Offset -= ThisVal << LocalShiftSize;
6705 if (Offset == 0)
6706 TmpReg = DestReg;
6707 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
6708 .addReg(SrcReg)
6709 .addImm(Sign * (int)ThisVal);
6710 if (ShiftSize)
6711 MBI = MBI.addImm(
6712 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
6713 MBI = MBI.setMIFlag(Flag);
6714
6715 auto Change =
6716 VScale == 1
6717 ? StackOffset::getFixed(ThisVal << LocalShiftSize)
6718 : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
6719 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
6720 CFAOffset += Change;
6721 else
6722 CFAOffset -= Change;
6723 if (EmitCFAOffset && DestReg == TmpReg) {
6724 MachineFunction &MF = *MBB.getParent();
6725 const TargetSubtargetInfo &STI = MF.getSubtarget();
6726 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
6727
6728 unsigned CFIIndex = MF.addFrameInst(
6729 createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
6730 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
6731 .addCFIIndex(CFIIndex)
6732 .setMIFlags(Flag);
6733 }
6734
6735 if (NeedsWinCFI) {
6736 int Imm = (int)(ThisVal << LocalShiftSize);
6737 if (VScale != 1 && DestReg == AArch64::SP) {
6738 if (HasWinCFI)
6739 *HasWinCFI = true;
6740 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AllocZ))
6741 .addImm(ThisVal)
6742 .setMIFlag(Flag);
6743 } else if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
6744 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
6745 assert(VScale == 1 && "Expected non-scalable operation");
6746 if (HasWinCFI)
6747 *HasWinCFI = true;
6748 if (Imm == 0)
6749 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
6750 else
6751 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
6752 .addImm(Imm)
6753 .setMIFlag(Flag);
6754 assert(Offset == 0 && "Expected remaining offset to be zero to "
6755 "emit a single SEH directive");
6756 } else if (DestReg == AArch64::SP) {
6757 assert(VScale == 1 && "Expected non-scalable operation");
6758 if (HasWinCFI)
6759 *HasWinCFI = true;
6760 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
6761 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
6762 .addImm(Imm)
6763 .setMIFlag(Flag);
6764 }
6765 }
6766
6767 SrcReg = TmpReg;
6768 } while (Offset);
6769}
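// Worked example (hypothetical offset): materializing SP -= 0x100100 with
// SUBXri takes two iterations of the loop above: "sub sp, sp, #0x100,
// lsl #12" (consuming 0x100000) followed by "sub sp, sp, #0x100"; an offset
// of 0x101000 fits a single shifted immediate ("sub sp, sp, #0x101, lsl #12").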
6770
6771 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
6772 MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
6773 unsigned DestReg, unsigned SrcReg,
6774 StackOffset Offset, const TargetInstrInfo *TII,
6775 MachineInstr::MIFlag Flag, bool SetNZCV,
6776 bool NeedsWinCFI, bool *HasWinCFI,
6777 bool EmitCFAOffset, StackOffset CFAOffset,
6778 unsigned FrameReg) {
6779 // If a function is marked as arm_locally_streaming, then the runtime value of
6780 // vscale in the prologue/epilogue is different from the runtime value of
6781 // vscale in the function's body. To avoid having to consider multiple vscales,
6782 // we can use `addsvl` to allocate any scalable stack-slots, which under
6783 // most circumstances will be only locals, not callee-save slots.
6784 const Function &F = MBB.getParent()->getFunction();
6785 bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
6786
6787 int64_t Bytes, NumPredicateVectors, NumDataVectors;
6788 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6789 Offset, Bytes, NumPredicateVectors, NumDataVectors);
6790
6791 // Insert ADDSXri for scalable offset at the end.
6792 bool NeedsFinalDefNZCV = SetNZCV && (NumPredicateVectors || NumDataVectors);
6793 if (NeedsFinalDefNZCV)
6794 SetNZCV = false;
6795
6796 // First emit non-scalable frame offsets, or a simple 'mov'.
6797 if (Bytes || (!Offset && SrcReg != DestReg)) {
6798 assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
6799 "SP increment/decrement not 8-byte aligned");
6800 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
6801 if (Bytes < 0) {
6802 Bytes = -Bytes;
6803 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
6804 }
6805 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
6806 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6807 FrameReg);
6808 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
6809 ? StackOffset::getFixed(-Bytes)
6810 : StackOffset::getFixed(Bytes);
6811 SrcReg = DestReg;
6812 FrameReg = DestReg;
6813 }
6814
6815 assert(!(NeedsWinCFI && NumPredicateVectors) &&
6816 "WinCFI can't allocate fractions of an SVE data vector");
6817
6818 if (NumDataVectors) {
6819 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
6820 UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, TII,
6821 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6822 FrameReg);
6823 CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
6824 SrcReg = DestReg;
6825 }
6826
6827 if (NumPredicateVectors) {
6828 assert(DestReg != AArch64::SP && "Unaligned access to SP");
6829 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
6830 UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, TII,
6831 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6832 FrameReg);
6833 }
6834
6835 if (NeedsFinalDefNZCV)
6836 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDSXri), DestReg)
6837 .addReg(DestReg)
6838 .addImm(0)
6839 .addImm(0);
6840}
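// Usage sketch (hypothetical call): allocating 32 fixed bytes plus one SVE
// data vector could be requested as
//   emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
//                   StackOffset::get(-32, -16), TII,
//                   MachineInstr::FrameSetup);
// which emits "sub sp, sp, #32" followed by "addvl sp, sp, #-1".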
6841
6842 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
6843 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, MachineBasicBlock::iterator InsertPt,
6844 int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS,
6845 VirtRegMap *VRM) const {
6847 // This is a bit of a hack. Consider this instruction:
6848 //
6849 // %0 = COPY %sp; GPR64all:%0
6850 //
6851 // We explicitly chose GPR64all for the virtual register so such a copy might
6852 // be eliminated by RegisterCoalescer. However, that may not be possible, and
6853 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
6854 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
6855 //
6856 // To prevent that, we are going to constrain the %0 register class here.
6857 if (MI.isFullCopy()) {
6858 Register DstReg = MI.getOperand(0).getReg();
6859 Register SrcReg = MI.getOperand(1).getReg();
6860 if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
6861 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
6862 return nullptr;
6863 }
6864 if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
6865 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
6866 return nullptr;
6867 }
6868 // Nothing can be folded with a copy from/to NZCV.
6869 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
6870 return nullptr;
6871 }
6872
6873 // Handle the case where a copy is being spilled or filled but the source
6874 // and destination register class don't match. For example:
6875 //
6876 // %0 = COPY %xzr; GPR64common:%0
6877 //
6878 // In this case we can still safely fold away the COPY and generate the
6879 // following spill code:
6880 //
6881 // STRXui %xzr, %stack.0
6882 //
6883 // This also eliminates spilled cross register class COPYs (e.g. between x and
6884 // d regs) of the same size. For example:
6885 //
6886 // %0 = COPY %1; GPR64:%0, FPR64:%1
6887 //
6888 // will be filled as
6889 //
6890 // LDRDui %0, fi<#0>
6891 //
6892 // instead of
6893 //
6894 // LDRXui %Temp, fi<#0>
6895 // %0 = FMOV %Temp
6896 //
6897 if (MI.isCopy() && Ops.size() == 1 &&
6898 // Make sure we're only folding the explicit COPY defs/uses.
6899 (Ops[0] == 0 || Ops[0] == 1)) {
6900 bool IsSpill = Ops[0] == 0;
6901 bool IsFill = !IsSpill;
6902 const TargetRegisterInfo &TRI = getRegisterInfo();
6903 const MachineRegisterInfo &MRI = MF.getRegInfo();
6904 MachineBasicBlock &MBB = *MI.getParent();
6905 const MachineOperand &DstMO = MI.getOperand(0);
6906 const MachineOperand &SrcMO = MI.getOperand(1);
6907 Register DstReg = DstMO.getReg();
6908 Register SrcReg = SrcMO.getReg();
6909 // This is slightly expensive to compute for physical regs since
6910 // getMinimalPhysRegClass is slow.
6911 auto getRegClass = [&](unsigned Reg) {
6912 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
6913 : TRI.getMinimalPhysRegClass(Reg);
6914 };
6915
6916 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
6917 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
6918 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
6919 "Mismatched register size in non subreg COPY");
6920 if (IsSpill)
6921 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
6922 getRegClass(SrcReg), Register());
6923 else
6924 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
6925 getRegClass(DstReg), Register());
6926 return &*--InsertPt;
6927 }
6928
6929 // Handle cases like spilling def of:
6930 //
6931 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
6932 //
6933 // where the physical register source can be widened and stored to the full
6934 // virtual reg destination stack slot, in this case producing:
6935 //
6936 // STRXui %xzr, %stack.0
6937 //
6938 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
6939 TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
6940 assert(SrcMO.getSubReg() == 0 &&
6941 "Unexpected subreg on physical register");
6942 storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
6943 FrameIndex, &AArch64::GPR64RegClass, Register());
6944 return &*--InsertPt;
6945 }
6946
6947 // Handle cases like filling use of:
6948 //
6949 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
6950 //
6951 // where we can load the full virtual reg source stack slot, into the subreg
6952 // destination, in this case producing:
6953 //
6954 // LDRWui %0:sub_32<def,read-undef>, %stack.0
6955 //
6956 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
6957 const TargetRegisterClass *FillRC = nullptr;
6958 switch (DstMO.getSubReg()) {
6959 default:
6960 break;
6961 case AArch64::sub_32:
6962 if (AArch64::GPR64RegClass.hasSubClassEq(getRegClass(DstReg)))
6963 FillRC = &AArch64::GPR32RegClass;
6964 break;
6965 case AArch64::ssub:
6966 FillRC = &AArch64::FPR32RegClass;
6967 break;
6968 case AArch64::dsub:
6969 FillRC = &AArch64::FPR64RegClass;
6970 break;
6971 }
6972
6973 if (FillRC) {
6974 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
6975 TRI.getRegSizeInBits(*FillRC) &&
6976 "Mismatched regclass size on folded subreg COPY");
6977 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC,
6978 Register());
6979 MachineInstr &LoadMI = *--InsertPt;
6980 MachineOperand &LoadDst = LoadMI.getOperand(0);
6981 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
6982 LoadDst.setSubReg(DstMO.getSubReg());
6983 LoadDst.setIsUndef();
6984 return &LoadMI;
6985 }
6986 }
6987 }
6988
6989 // Cannot fold.
6990 return nullptr;
6991}
6992
6993 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
6994 StackOffset &SOffset,
6995 bool *OutUseUnscaledOp,
6996 unsigned *OutUnscaledOp,
6997 int64_t *EmittableOffset) {
6998 // Set output values in case of early exit.
6999 if (EmittableOffset)
7000 *EmittableOffset = 0;
7001 if (OutUseUnscaledOp)
7002 *OutUseUnscaledOp = false;
7003 if (OutUnscaledOp)
7004 *OutUnscaledOp = 0;
7005
7006 // Exit early for structured vector spills/fills as they can't take an
7007 // immediate offset.
7008 switch (MI.getOpcode()) {
7009 default:
7010 break;
7011 case AArch64::LD1Rv1d:
7012 case AArch64::LD1Rv2s:
7013 case AArch64::LD1Rv2d:
7014 case AArch64::LD1Rv4h:
7015 case AArch64::LD1Rv4s:
7016 case AArch64::LD1Rv8b:
7017 case AArch64::LD1Rv8h:
7018 case AArch64::LD1Rv16b:
7019 case AArch64::LD1Twov2d:
7020 case AArch64::LD1Threev2d:
7021 case AArch64::LD1Fourv2d:
7022 case AArch64::LD1Twov1d:
7023 case AArch64::LD1Threev1d:
7024 case AArch64::LD1Fourv1d:
7025 case AArch64::ST1Twov2d:
7026 case AArch64::ST1Threev2d:
7027 case AArch64::ST1Fourv2d:
7028 case AArch64::ST1Twov1d:
7029 case AArch64::ST1Threev1d:
7030 case AArch64::ST1Fourv1d:
7031 case AArch64::ST1i8:
7032 case AArch64::ST1i16:
7033 case AArch64::ST1i32:
7034 case AArch64::ST1i64:
7035 case AArch64::IRG:
7036 case AArch64::IRGstack:
7037 case AArch64::STGloop:
7038 case AArch64::STZGloop:
7039 return AArch64FrameOffsetCannotUpdate;
7040 }
7041
7042 // Get the min/max offset and the scale.
7043 TypeSize ScaleValue(0U, false), Width(0U, false);
7044 int64_t MinOff, MaxOff;
7045 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
7046 MaxOff))
7047 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
7048
7049 // Construct the complete offset.
7050 bool IsMulVL = ScaleValue.isScalable();
7051 unsigned Scale = ScaleValue.getKnownMinValue();
7052 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
7053
7054 const MachineOperand &ImmOpnd =
7055 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
7056 Offset += ImmOpnd.getImm() * Scale;
7057
7058 // If the offset doesn't match the scale, we rewrite the instruction to
7059 // use the unscaled instruction instead. Likewise, if we have a negative
7060 // offset and there is an unscaled op to use.
7061 std::optional<unsigned> UnscaledOp =
7062 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
7063 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
7064 if (useUnscaledOp &&
7065 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
7066 MaxOff))
7067 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
7068
7069 Scale = ScaleValue.getKnownMinValue();
7070 assert(IsMulVL == ScaleValue.isScalable() &&
7071 "Unscaled opcode has different value for scalable");
7072
7073 int64_t Remainder = Offset % Scale;
7074 assert(!(Remainder && useUnscaledOp) &&
7075 "Cannot have remainder when using unscaled op");
7076
7077 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
7078 int64_t NewOffset = Offset / Scale;
7079 if (MinOff <= NewOffset && NewOffset <= MaxOff)
7080 Offset = Remainder;
7081 else {
7082 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
7083 Offset = Offset - (NewOffset * Scale);
7084 }
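// Illustrative sketch (not part of the original source): for a scaled
// LDRXui (Scale = 8, MinOff = 0, MaxOff = 4095) and a combined offset of
// 32768 bytes, NewOffset = 32768 / 8 = 4096 exceeds MaxOff, so NewOffset
// is clamped to 4095 and the residual 32768 - 4095 * 8 = 8 bytes are left
// in SOffset to be materialized by a separate frame-offset instruction.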
7085
7086 if (EmittableOffset)
7087 *EmittableOffset = NewOffset;
7088 if (OutUseUnscaledOp)
7089 *OutUseUnscaledOp = useUnscaledOp;
7090 if (OutUnscaledOp && UnscaledOp)
7091 *OutUnscaledOp = *UnscaledOp;
7092
7093 if (IsMulVL)
7094 SOffset = StackOffset::get(SOffset.getFixed(), Offset);
7095 else
7096 SOffset = StackOffset::get(Offset, SOffset.getScalable());
7097 return AArch64FrameOffsetCanUpdate |
7098 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
7099}
7100
7101 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
7102 unsigned FrameReg, StackOffset &Offset,
7103 const AArch64InstrInfo *TII) {
7104 unsigned Opcode = MI.getOpcode();
7105 unsigned ImmIdx = FrameRegIdx + 1;
7106
7107 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
7108 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
7109 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
7110 MI.getOperand(0).getReg(), FrameReg, Offset, TII,
7111 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
7112 MI.eraseFromParent();
7113 Offset = StackOffset();
7114 return true;
7115 }
7116
7117 int64_t NewOffset;
7118 unsigned UnscaledOp;
7119 bool UseUnscaledOp;
7120 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
7121 &UnscaledOp, &NewOffset);
7122 if (Status & AArch64FrameOffsetCanUpdate) {
7123 if (Status & AArch64FrameOffsetIsLegal)
7124 // Replace the FrameIndex with FrameReg.
7125 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
7126 if (UseUnscaledOp)
7127 MI.setDesc(TII->get(UnscaledOp));
7128
7129 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
7130 return !Offset;
7131 }
7132
7133 return false;
7134}
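// Usage sketch (illustrative, assuming an ADDXri rooted at a frame index):
//   $x0 = ADDXri %stack.0, 16, 0   with FrameReg = $sp and Offset = +32
// takes the ADDXri fast path above: the immediate is folded into Offset
// (48 total), emitFrameOffset materializes $x0 = $sp + 48, the original
// instruction is erased, and the function returns true.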
7135
7136 void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,
7137 MachineBasicBlock::iterator MI) const {
7138 DebugLoc DL;
7139 BuildMI(MBB, MI, DL, get(AArch64::HINT)).addImm(0);
7140 }
7141
7142MCInst AArch64InstrInfo::getNop() const { return MCInstBuilder(AArch64::NOP); }
7143
7144// AArch64 supports MachineCombiner.
7145bool AArch64InstrInfo::useMachineCombiner() const { return true; }
7146
7147 // True when Opc sets the NZCV flags.
7148static bool isCombineInstrSettingFlag(unsigned Opc) {
7149 switch (Opc) {
7150 case AArch64::ADDSWrr:
7151 case AArch64::ADDSWri:
7152 case AArch64::ADDSXrr:
7153 case AArch64::ADDSXri:
7154 case AArch64::SUBSWrr:
7155 case AArch64::SUBSXrr:
7156 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7157 case AArch64::SUBSWri:
7158 case AArch64::SUBSXri:
7159 return true;
7160 default:
7161 break;
7162 }
7163 return false;
7164}
7165
7166// 32b Opcodes that can be combined with a MUL
7167static bool isCombineInstrCandidate32(unsigned Opc) {
7168 switch (Opc) {
7169 case AArch64::ADDWrr:
7170 case AArch64::ADDWri:
7171 case AArch64::SUBWrr:
7172 case AArch64::ADDSWrr:
7173 case AArch64::ADDSWri:
7174 case AArch64::SUBSWrr:
7175 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7176 case AArch64::SUBWri:
7177 case AArch64::SUBSWri:
7178 return true;
7179 default:
7180 break;
7181 }
7182 return false;
7183}
7184
7185// 64b Opcodes that can be combined with a MUL
7186static bool isCombineInstrCandidate64(unsigned Opc) {
7187 switch (Opc) {
7188 case AArch64::ADDXrr:
7189 case AArch64::ADDXri:
7190 case AArch64::SUBXrr:
7191 case AArch64::ADDSXrr:
7192 case AArch64::ADDSXri:
7193 case AArch64::SUBSXrr:
7194 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7195 case AArch64::SUBXri:
7196 case AArch64::SUBSXri:
7197 case AArch64::ADDv8i8:
7198 case AArch64::ADDv16i8:
7199 case AArch64::ADDv4i16:
7200 case AArch64::ADDv8i16:
7201 case AArch64::ADDv2i32:
7202 case AArch64::ADDv4i32:
7203 case AArch64::SUBv8i8:
7204 case AArch64::SUBv16i8:
7205 case AArch64::SUBv4i16:
7206 case AArch64::SUBv8i16:
7207 case AArch64::SUBv2i32:
7208 case AArch64::SUBv4i32:
7209 return true;
7210 default:
7211 break;
7212 }
7213 return false;
7214}
7215
7216// FP Opcodes that can be combined with a FMUL.
7217static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
7218 switch (Inst.getOpcode()) {
7219 default:
7220 break;
7221 case AArch64::FADDHrr:
7222 case AArch64::FADDSrr:
7223 case AArch64::FADDDrr:
7224 case AArch64::FADDv4f16:
7225 case AArch64::FADDv8f16:
7226 case AArch64::FADDv2f32:
7227 case AArch64::FADDv2f64:
7228 case AArch64::FADDv4f32:
7229 case AArch64::FSUBHrr:
7230 case AArch64::FSUBSrr:
7231 case AArch64::FSUBDrr:
7232 case AArch64::FSUBv4f16:
7233 case AArch64::FSUBv8f16:
7234 case AArch64::FSUBv2f32:
7235 case AArch64::FSUBv2f64:
7236 case AArch64::FSUBv4f32:
7237 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
7238 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
7239 // the target options or if FADD/FSUB has the contract fast-math flag.
7240 return Options.AllowFPOpFusion == FPOpFusion::Fast ||
7241 Inst.getFlag(MachineInstr::FmContract);
7242 }
7243 return false;
7244}
7245
7246// Opcodes that can be combined with a MUL
7247 static bool isCombineInstrCandidate(unsigned Opc) {
7248 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
7249 }
7250
7251//
7252// Utility routine that checks if \param MO is defined by an
7253// \param CombineOpc instruction in the basic block \param MBB
7254 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
7255 unsigned CombineOpc, unsigned ZeroReg = 0,
7256 bool CheckZeroReg = false) {
7257 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7258 MachineInstr *MI = nullptr;
7259
7260 if (MO.isReg() && MO.getReg().isVirtual())
7261 MI = MRI.getUniqueVRegDef(MO.getReg());
7262 // And it needs to be in the trace (otherwise, it won't have a depth).
7263 if (!MI || MI->getParent() != &MBB || MI->getOpcode() != CombineOpc)
7264 return false;
7265 // Must only be used by the user we combine with.
7266 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
7267 return false;
7268
7269 if (CheckZeroReg) {
7270 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
7271 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
7272 MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
7273 // The third input reg must be zero.
7274 if (MI->getOperand(3).getReg() != ZeroReg)
7275 return false;
7276 }
7277
7278 if (isCombineInstrSettingFlag(CombineOpc) &&
7279 MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
7280 return false;
7281
7282 return true;
7283}
7284
7285//
7286// Is \param MO defined by an integer multiply and can be combined?
7287 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
7288 unsigned MulOpc, unsigned ZeroReg) {
7289 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
7290}
7291
7292//
7293// Is \param MO defined by a floating-point multiply and can be combined?
7294 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
7295 unsigned MulOpc) {
7296 return canCombine(MBB, MO, MulOpc);
7297}
7298
7299// TODO: There are many more machine instruction opcodes to match:
7300// 1. Other data types (integer, vectors)
7301// 2. Other math / logic operations (xor, or)
7302// 3. Other forms of the same operation (intrinsics and other variants)
7303bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
7304 bool Invert) const {
7305 if (Invert)
7306 return false;
7307 switch (Inst.getOpcode()) {
7308 // == Floating-point types ==
7309 // -- Floating-point instructions --
7310 case AArch64::FADDHrr:
7311 case AArch64::FADDSrr:
7312 case AArch64::FADDDrr:
7313 case AArch64::FMULHrr:
7314 case AArch64::FMULSrr:
7315 case AArch64::FMULDrr:
7316 case AArch64::FMULX16:
7317 case AArch64::FMULX32:
7318 case AArch64::FMULX64:
7319 // -- Advanced SIMD instructions --
7320 case AArch64::FADDv4f16:
7321 case AArch64::FADDv8f16:
7322 case AArch64::FADDv2f32:
7323 case AArch64::FADDv4f32:
7324 case AArch64::FADDv2f64:
7325 case AArch64::FMULv4f16:
7326 case AArch64::FMULv8f16:
7327 case AArch64::FMULv2f32:
7328 case AArch64::FMULv4f32:
7329 case AArch64::FMULv2f64:
7330 case AArch64::FMULXv4f16:
7331 case AArch64::FMULXv8f16:
7332 case AArch64::FMULXv2f32:
7333 case AArch64::FMULXv4f32:
7334 case AArch64::FMULXv2f64:
7335 // -- SVE instructions --
7336 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
7337 // in the SVE instruction set (though there are predicated ones).
7338 case AArch64::FADD_ZZZ_H:
7339 case AArch64::FADD_ZZZ_S:
7340 case AArch64::FADD_ZZZ_D:
7341 case AArch64::FMUL_ZZZ_H:
7342 case AArch64::FMUL_ZZZ_S:
7343 case AArch64::FMUL_ZZZ_D:
7344 return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
7345 Inst.getFlag(MachineInstr::MIFlag::FmNsz);
7346
7347 // == Integer types ==
7348 // -- Base instructions --
7349 // Opcodes MULWrr and MULXrr don't exist because
7350 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
7351 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
7352 // The machine-combiner does not support three-source-operand machine
7353 // instructions, so we cannot reassociate MULs.
7354 case AArch64::ADDWrr:
7355 case AArch64::ADDXrr:
7356 case AArch64::ANDWrr:
7357 case AArch64::ANDXrr:
7358 case AArch64::ORRWrr:
7359 case AArch64::ORRXrr:
7360 case AArch64::EORWrr:
7361 case AArch64::EORXrr:
7362 case AArch64::EONWrr:
7363 case AArch64::EONXrr:
7364 // -- Advanced SIMD instructions --
7365 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
7366 // in the Advanced SIMD instruction set.
7367 case AArch64::ADDv8i8:
7368 case AArch64::ADDv16i8:
7369 case AArch64::ADDv4i16:
7370 case AArch64::ADDv8i16:
7371 case AArch64::ADDv2i32:
7372 case AArch64::ADDv4i32:
7373 case AArch64::ADDv1i64:
7374 case AArch64::ADDv2i64:
7375 case AArch64::MULv8i8:
7376 case AArch64::MULv16i8:
7377 case AArch64::MULv4i16:
7378 case AArch64::MULv8i16:
7379 case AArch64::MULv2i32:
7380 case AArch64::MULv4i32:
7381 case AArch64::ANDv8i8:
7382 case AArch64::ANDv16i8:
7383 case AArch64::ORRv8i8:
7384 case AArch64::ORRv16i8:
7385 case AArch64::EORv8i8:
7386 case AArch64::EORv16i8:
7387 // -- SVE instructions --
7388 case AArch64::ADD_ZZZ_B:
7389 case AArch64::ADD_ZZZ_H:
7390 case AArch64::ADD_ZZZ_S:
7391 case AArch64::ADD_ZZZ_D:
7392 case AArch64::MUL_ZZZ_B:
7393 case AArch64::MUL_ZZZ_H:
7394 case AArch64::MUL_ZZZ_S:
7395 case AArch64::MUL_ZZZ_D:
7396 case AArch64::AND_ZZZ:
7397 case AArch64::ORR_ZZZ:
7398 case AArch64::EOR_ZZZ:
7399 return true;
7400
7401 default:
7402 return false;
7403 }
7404}
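// Illustrative sketch (not part of the original source): reporting FADDSrr
// as associative and commutative lets the machine-combiner rebalance
//   %t1 = FADDSrr %a, %b; %t2 = FADDSrr %t1, %c; %t3 = FADDSrr %t2, %d
// into %t1 = FADDSrr %a, %b; %u = FADDSrr %c, %d; %t3 = FADDSrr %t1, %u,
// shortening the critical path from three to two dependent adds.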
7405
7406/// Find instructions that can be turned into madd.
7407 static bool getMaddPatterns(MachineInstr &Root,
7408 SmallVectorImpl<unsigned> &Patterns) {
7409 unsigned Opc = Root.getOpcode();
7410 MachineBasicBlock &MBB = *Root.getParent();
7411 bool Found = false;
7412
7413 if (!isCombineInstrCandidate(Opc))
7414 return false;
7415 if (isCombineInstrSettingFlag(Opc)) {
7416 int Cmp_NZCV =
7417 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
7418 // When NZCV is live, bail out.
7419 if (Cmp_NZCV == -1)
7420 return false;
7421 unsigned NewOpc = convertToNonFlagSettingOpc(Root);
7422 // When the opcode can't change, bail out.
7423 // CHECKME: do we miss any cases for opcode conversion?
7424 if (NewOpc == Opc)
7425 return false;
7426 Opc = NewOpc;
7427 }
7428
7429 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
7430 unsigned Pattern) {
7431 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
7432 Patterns.push_back(Pattern);
7433 Found = true;
7434 }
7435 };
7436
7437 auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
7438 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
7439 Patterns.push_back(Pattern);
7440 Found = true;
7441 }
7442 };
7443
7444 typedef AArch64MachineCombinerPattern MCP;
7445
7446 switch (Opc) {
7447 default:
7448 break;
7449 case AArch64::ADDWrr:
7450 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7451 "ADDWrr does not have register operands");
7452 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
7453 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
7454 break;
7455 case AArch64::ADDXrr:
7456 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
7457 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
7458 break;
7459 case AArch64::SUBWrr:
7460 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
7461 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
7462 break;
7463 case AArch64::SUBXrr:
7464 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
7465 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
7466 break;
7467 case AArch64::ADDWri:
7468 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
7469 break;
7470 case AArch64::ADDXri:
7471 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
7472 break;
7473 case AArch64::SUBWri:
7474 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
7475 break;
7476 case AArch64::SUBXri:
7477 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
7478 break;
7479 case AArch64::ADDv8i8:
7480 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
7481 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
7482 break;
7483 case AArch64::ADDv16i8:
7484 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
7485 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
7486 break;
7487 case AArch64::ADDv4i16:
7488 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
7489 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
7490 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
7491 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
7492 break;
7493 case AArch64::ADDv8i16:
7494 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
7495 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
7496 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
7497 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
7498 break;
7499 case AArch64::ADDv2i32:
7500 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
7501 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
7502 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
7503 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
7504 break;
7505 case AArch64::ADDv4i32:
7506 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
7507 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
7508 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
7509 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
7510 break;
7511 case AArch64::SUBv8i8:
7512 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
7513 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
7514 break;
7515 case AArch64::SUBv16i8:
7516 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
7517 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
7518 break;
7519 case AArch64::SUBv4i16:
7520 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
7521 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
7522 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
7523 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
7524 break;
7525 case AArch64::SUBv8i16:
7526 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
7527 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
7528 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
7529 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
7530 break;
7531 case AArch64::SUBv2i32:
7532 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
7533 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
7534 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
7535 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
7536 break;
7537 case AArch64::SUBv4i32:
7538 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
7539 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
7540 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
7541 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
7542 break;
7543 }
7544 return Found;
7545}
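// Illustrative sketch (not part of the original source) of the MULADDW_OP1
// pattern matched above:
//   %2:gpr32 = MADDWrrr %0, %1, $wzr   ; MUL is an alias of MADD with WZR
//   %3:gpr32 = ADDWrr %2, %acc
// is later rewritten into the single instruction
//   %3:gpr32 = MADDWrrr %0, %1, %acc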
7546
7547bool AArch64InstrInfo::isAccumulationOpcode(unsigned Opcode) const {
7548 switch (Opcode) {
7549 default:
7550 break;
7551 case AArch64::UABALB_ZZZ_D:
7552 case AArch64::UABALB_ZZZ_H:
7553 case AArch64::UABALB_ZZZ_S:
7554 case AArch64::UABALT_ZZZ_D:
7555 case AArch64::UABALT_ZZZ_H:
7556 case AArch64::UABALT_ZZZ_S:
7557 case AArch64::SABALB_ZZZ_D:
7558 case AArch64::SABALB_ZZZ_S:
7559 case AArch64::SABALB_ZZZ_H:
7560 case AArch64::SABALT_ZZZ_D:
7561 case AArch64::SABALT_ZZZ_S:
7562 case AArch64::SABALT_ZZZ_H:
7563 case AArch64::UABALv16i8_v8i16:
7564 case AArch64::UABALv2i32_v2i64:
7565 case AArch64::UABALv4i16_v4i32:
7566 case AArch64::UABALv4i32_v2i64:
7567 case AArch64::UABALv8i16_v4i32:
7568 case AArch64::UABALv8i8_v8i16:
7569 case AArch64::UABAv16i8:
7570 case AArch64::UABAv2i32:
7571 case AArch64::UABAv4i16:
7572 case AArch64::UABAv4i32:
7573 case AArch64::UABAv8i16:
7574 case AArch64::UABAv8i8:
7575 case AArch64::SABALv16i8_v8i16:
7576 case AArch64::SABALv2i32_v2i64:
7577 case AArch64::SABALv4i16_v4i32:
7578 case AArch64::SABALv4i32_v2i64:
7579 case AArch64::SABALv8i16_v4i32:
7580 case AArch64::SABALv8i8_v8i16:
7581 case AArch64::SABAv16i8:
7582 case AArch64::SABAv2i32:
7583 case AArch64::SABAv4i16:
7584 case AArch64::SABAv4i32:
7585 case AArch64::SABAv8i16:
7586 case AArch64::SABAv8i8:
7587 return true;
7588 }
7589
7590 return false;
7591}
7592
7593unsigned AArch64InstrInfo::getAccumulationStartOpcode(
7594 unsigned AccumulationOpcode) const {
7595 switch (AccumulationOpcode) {
7596 default:
7597 llvm_unreachable("Unsupported accumulation Opcode!");
7598 case AArch64::UABALB_ZZZ_D:
7599 return AArch64::UABDLB_ZZZ_D;
7600 case AArch64::UABALB_ZZZ_H:
7601 return AArch64::UABDLB_ZZZ_H;
7602 case AArch64::UABALB_ZZZ_S:
7603 return AArch64::UABDLB_ZZZ_S;
7604 case AArch64::UABALT_ZZZ_D:
7605 return AArch64::UABDLT_ZZZ_D;
7606 case AArch64::UABALT_ZZZ_H:
7607 return AArch64::UABDLT_ZZZ_H;
7608 case AArch64::UABALT_ZZZ_S:
7609 return AArch64::UABDLT_ZZZ_S;
7610 case AArch64::UABALv16i8_v8i16:
7611 return AArch64::UABDLv16i8_v8i16;
7612 case AArch64::UABALv2i32_v2i64:
7613 return AArch64::UABDLv2i32_v2i64;
7614 case AArch64::UABALv4i16_v4i32:
7615 return AArch64::UABDLv4i16_v4i32;
7616 case AArch64::UABALv4i32_v2i64:
7617 return AArch64::UABDLv4i32_v2i64;
7618 case AArch64::UABALv8i16_v4i32:
7619 return AArch64::UABDLv8i16_v4i32;
7620 case AArch64::UABALv8i8_v8i16:
7621 return AArch64::UABDLv8i8_v8i16;
7622 case AArch64::UABAv16i8:
7623 return AArch64::UABDv16i8;
7624 case AArch64::UABAv2i32:
7625 return AArch64::UABDv2i32;
7626 case AArch64::UABAv4i16:
7627 return AArch64::UABDv4i16;
7628 case AArch64::UABAv4i32:
7629 return AArch64::UABDv4i32;
7630 case AArch64::UABAv8i16:
7631 return AArch64::UABDv8i16;
7632 case AArch64::UABAv8i8:
7633 return AArch64::UABDv8i8;
7634 case AArch64::SABALB_ZZZ_D:
7635 return AArch64::SABDLB_ZZZ_D;
7636 case AArch64::SABALB_ZZZ_S:
7637 return AArch64::SABDLB_ZZZ_S;
7638 case AArch64::SABALB_ZZZ_H:
7639 return AArch64::SABDLB_ZZZ_H;
7640 case AArch64::SABALT_ZZZ_D:
7641 return AArch64::SABDLT_ZZZ_D;
7642 case AArch64::SABALT_ZZZ_S:
7643 return AArch64::SABDLT_ZZZ_S;
7644 case AArch64::SABALT_ZZZ_H:
7645 return AArch64::SABDLT_ZZZ_H;
7646 case AArch64::SABALv16i8_v8i16:
7647 return AArch64::SABDLv16i8_v8i16;
7648 case AArch64::SABALv2i32_v2i64:
7649 return AArch64::SABDLv2i32_v2i64;
7650 case AArch64::SABALv4i16_v4i32:
7651 return AArch64::SABDLv4i16_v4i32;
7652 case AArch64::SABALv4i32_v2i64:
7653 return AArch64::SABDLv4i32_v2i64;
7654 case AArch64::SABALv8i16_v4i32:
7655 return AArch64::SABDLv8i16_v4i32;
7656 case AArch64::SABALv8i8_v8i16:
7657 return AArch64::SABDLv8i8_v8i16;
7658 case AArch64::SABAv16i8:
7659 return AArch64::SABDv16i8;
7660 case AArch64::SABAv2i32:
7661 return AArch64::SABDv2i32;
7662 case AArch64::SABAv4i16:
7663 return AArch64::SABDv4i16;
7664 case AArch64::SABAv4i32:
7665 return AArch64::SABDv4i32;
7666 case AArch64::SABAv8i16:
7667 return AArch64::SABDv8i16;
7668 case AArch64::SABAv8i8:
7669 return AArch64::SABDv8i8;
7670 }
7671}
7672
7673/// Floating-Point Support
7674
7675/// Find instructions that can be turned into madd.
7676 static bool getFMAPatterns(MachineInstr &Root,
7677 SmallVectorImpl<unsigned> &Patterns) {
7678
7679 if (!isCombineInstrCandidateFP(Root))
7680 return false;
7681
7682 MachineBasicBlock &MBB = *Root.getParent();
7683 bool Found = false;
7684
7685 auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
7686 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
7687 Patterns.push_back(Pattern);
7688 return true;
7689 }
7690 return false;
7691 };
7692
7693 typedef AArch64MachineCombinerPattern MCP;
7694
7695 switch (Root.getOpcode()) {
7696 default:
7697 assert(false && "Unsupported FP instruction in combiner\n");
7698 break;
7699 case AArch64::FADDHrr:
7700 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7701 "FADDHrr does not have register operands");
7702
7703 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
7704 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
7705 break;
7706 case AArch64::FADDSrr:
7707 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7708 "FADDSrr does not have register operands");
7709
7710 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
7711 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
7712
7713 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
7714 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
7715 break;
7716 case AArch64::FADDDrr:
7717 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
7718 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
7719
7720 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
7721 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
7722 break;
7723 case AArch64::FADDv4f16:
7724 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
7725 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
7726
7727 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
7728 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
7729 break;
7730 case AArch64::FADDv8f16:
7731 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
7732 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
7733
7734 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
7735 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
7736 break;
7737 case AArch64::FADDv2f32:
7738 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
7739 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
7740
7741 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
7742 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
7743 break;
7744 case AArch64::FADDv2f64:
7745 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
7746 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
7747
7748 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
7749 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
7750 break;
7751 case AArch64::FADDv4f32:
7752 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
7753 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
7754
7755 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
7756 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
7757 break;
7758 case AArch64::FSUBHrr:
7759 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
7760 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
7761 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
7762 break;
7763 case AArch64::FSUBSrr:
7764 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
7765
7766 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
7767 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
7768
7769 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
7770 break;
7771 case AArch64::FSUBDrr:
7772 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
7773
7774 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
7775 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
7776
7777 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
7778 break;
7779 case AArch64::FSUBv4f16:
7780 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
7781 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
7782
7783 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
7784 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
7785 break;
7786 case AArch64::FSUBv8f16:
7787 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
7788 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
7789
7790 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
7791 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
7792 break;
7793 case AArch64::FSUBv2f32:
7794 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
7795 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
7796
7797 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
7798 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
7799 break;
7800 case AArch64::FSUBv2f64:
7801 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
7802 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
7803
7804 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
7805 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
7806 break;
7807 case AArch64::FSUBv4f32:
7808 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
7809 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
7810
7811 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
7812 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
7813 break;
7814 }
7815 return Found;
7816}
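// Illustrative sketch (not part of the original source) of FMULADDS_OP1:
// given the contract fast-math flag (or FPOpFusion::Fast),
//   %2:fpr32 = FMULSrr %0, %1
//   %3:fpr32 = FADDSrr %2, %acc
// becomes %3:fpr32 = FMADDSrrr %0, %1, %acc.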
7817
7818 static bool getFMULPatterns(MachineInstr &Root,
7819 SmallVectorImpl<unsigned> &Patterns) {
7820 MachineBasicBlock &MBB = *Root.getParent();
7821 bool Found = false;
7822
7823 auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
7824 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7825 MachineOperand &MO = Root.getOperand(Operand);
7826 MachineInstr *MI = nullptr;
7827 if (MO.isReg() && MO.getReg().isVirtual())
7828 MI = MRI.getUniqueVRegDef(MO.getReg());
7829 // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
7830 if (MI && MI->getOpcode() == TargetOpcode::COPY &&
7831 MI->getOperand(1).getReg().isVirtual())
7832 MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
7833 if (MI && MI->getOpcode() == Opcode) {
7834 Patterns.push_back(Pattern);
7835 return true;
7836 }
7837 return false;
7838 };
7839
7840 typedef AArch64MachineCombinerPattern MCP;
7841
7842 switch (Root.getOpcode()) {
7843 default:
7844 return false;
7845 case AArch64::FMULv2f32:
7846 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
7847 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
7848 break;
7849 case AArch64::FMULv2f64:
7850 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
7851 Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
7852 break;
7853 case AArch64::FMULv4f16:
7854 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
7855 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
7856 break;
7857 case AArch64::FMULv4f32:
7858 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
7859 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
7860 break;
7861 case AArch64::FMULv8f16:
7862 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
7863 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
7864 break;
7865 }
7866
7867 return Found;
7868}
7869
7870 static bool getFNEGPatterns(MachineInstr &Root,
7871 SmallVectorImpl<unsigned> &Patterns) {
7872 unsigned Opc = Root.getOpcode();
7873 MachineBasicBlock &MBB = *Root.getParent();
7874 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7875
7876 auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
7877 MachineOperand &MO = Root.getOperand(1);
7878 MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());
7879 if (MI != nullptr && (MI->getOpcode() == Opcode) &&
7880 MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
7881 Root.getFlag(MachineInstr::MIFlag::FmContract) &&
7882 Root.getFlag(MachineInstr::MIFlag::FmNsz) &&
7883 MI->getFlag(MachineInstr::MIFlag::FmContract) &&
7884 MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
7885 Patterns.push_back(Pattern);
7886 return true;
7887 }
7888 return false;
7889 };
7890
7891 switch (Opc) {
7892 default:
7893 break;
7894 case AArch64::FNEGDr:
7895 return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
7896 case AArch64::FNEGSr:
7897 return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
7898 }
7899
7900 return false;
7901}
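// Illustrative sketch (not part of the original source): with contract and
// nsz set on both instructions,
//   %2:fpr64 = FMADDDrrr %a, %b, %c
//   %3:fpr64 = FNEGDr %2
// is replaced by %3:fpr64 = FNMADDDrrr %a, %b, %c (see genFNegatedMAD
// further down).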
7902
7903/// Return true when a code sequence can improve throughput. It
7904/// should be called only for instructions in loops.
7905/// \param Pattern - combiner pattern
7906 bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
7907 switch (Pattern) {
7908 default:
7909 break;
8015 return true;
8016 } // end switch (Pattern)
8017 return false;
8018}
8019
8020/// Find other MI combine patterns.
8021 static bool getMiscPatterns(MachineInstr &Root,
8022 SmallVectorImpl<unsigned> &Patterns) {
8023 // A - (B + C) ==> (A - B) - C or (A - C) - B
8024 unsigned Opc = Root.getOpcode();
8025 MachineBasicBlock &MBB = *Root.getParent();
8026
8027 switch (Opc) {
8028 case AArch64::SUBWrr:
8029 case AArch64::SUBSWrr:
8030 case AArch64::SUBXrr:
8031 case AArch64::SUBSXrr:
8032 // Found candidate root.
8033 break;
8034 default:
8035 return false;
8036 }
8037
8038 if (isCombineInstrSettingFlag(Opc) &&
8039 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
8040 -1)
8041 return false;
8042
8043 if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
8044 canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
8045 canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
8046 canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
8047 Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP1);
8048 Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP2);
8049 return true;
8050 }
8051
8052 return false;
8053}
8054
8055/// Check if the given instruction forms a gather load pattern that can be
8056/// optimized for better Memory-Level Parallelism (MLP). This function
8057/// identifies chains of NEON lane load instructions that load data from
8058/// different memory addresses into individual lanes of a 128-bit vector
8059/// register, then attempts to split the pattern into parallel loads to break
8060/// the serial dependency between instructions.
8061///
8062/// Pattern Matched:
8063/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
8064/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
8065///
8066/// Transformed Into:
8067/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
8068/// to combine the results, enabling better memory-level parallelism.
8069///
8070/// Supported Element Types:
8071/// - 32-bit elements (LD1i32, 4 lanes total)
8072/// - 16-bit elements (LD1i16, 8 lanes total)
8073/// - 8-bit elements (LD1i8, 16 lanes total)
8074 static bool getGatherLanePattern(MachineInstr &Root,
8075 SmallVectorImpl<unsigned> &Patterns,
8076 unsigned LoadLaneOpCode, unsigned NumLanes) {
8077 const MachineFunction *MF = Root.getMF();
8078
8079 // Early exit if optimizing for size.
8080 if (MF->getFunction().hasMinSize())
8081 return false;
8082
8083 const MachineRegisterInfo &MRI = MF->getRegInfo();
8084 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
8085
8086 // The root of the pattern must load into the last lane of the vector.
8087 if (Root.getOperand(2).getImm() != NumLanes - 1)
8088 return false;
8089
8090 // Check that we have a load into each lane except lane 0.
8091 // For each load we also want to check that:
8092 // 1. It has a single non-debug use (since we will be replacing the virtual
8093 // register)
8094 // 2. That the addressing mode only uses a single pointer operand
8095 auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
8096 auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
8097 SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
8098 SmallVector<const MachineInstr *, 16> LoadInstrs;
8099 while (!RemainingLanes.empty() && CurrInstr &&
8100 CurrInstr->getOpcode() == LoadLaneOpCode &&
8101 MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
8102 CurrInstr->getNumOperands() == 4) {
8103 RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
8104 LoadInstrs.push_back(CurrInstr);
8105 CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
8106 }
8107
8108 // Check that we have found a match for lanes N-1..1.
8109 if (!RemainingLanes.empty())
8110 return false;
8111
8112 // Match the SUBREG_TO_REG sequence.
8113 if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
8114 return false;
8115
8116 // Verify that the subreg to reg loads an integer into the first lane.
8117 auto Lane0LoadReg = CurrInstr->getOperand(1).getReg();
8118 unsigned SingleLaneSizeInBits = 128 / NumLanes;
8119 if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
8120 return false;
8121
8122 // Verify that it also has a single non debug use.
8123 if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
8124 return false;
8125
8126 LoadInstrs.push_back(MRI.getUniqueVRegDef(Lane0LoadReg));
8127
8128 // If there is any chance of aliasing, do not apply the pattern.
8129 // Walk backward through the MBB starting from Root.
8130 // Exit early if we've encountered all load instructions or hit the search
8131 // limit.
8132 auto MBBItr = Root.getIterator();
8133 unsigned RemainingSteps = GatherOptSearchLimit;
8134 SmallPtrSet<const MachineInstr *, 16> RemainingLoadInstrs;
8135 RemainingLoadInstrs.insert(LoadInstrs.begin(), LoadInstrs.end());
8136 const MachineBasicBlock *MBB = Root.getParent();
8137
8138 for (; MBBItr != MBB->begin() && RemainingSteps > 0 &&
8139 !RemainingLoadInstrs.empty();
8140 --MBBItr, --RemainingSteps) {
8141 const MachineInstr &CurrInstr = *MBBItr;
8142
8143 // Remove this instruction from remaining loads if it's one we're tracking.
8144 RemainingLoadInstrs.erase(&CurrInstr);
8145
8146 // Check for potential aliasing with any of the load instructions to
8147 // optimize.
8148 if (CurrInstr.isLoadFoldBarrier())
8149 return false;
8150 }
8151
8152 // If we hit the search limit without finding all load instructions,
8153 // don't match the pattern.
8154 if (RemainingSteps == 0 && !RemainingLoadInstrs.empty())
8155 return false;
8156
8157 switch (NumLanes) {
8158 case 4:
8159 Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32);
8160 break;
8161 case 8:
8162 Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16);
8163 break;
8164 case 16:
8165 Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8);
8166 break;
8167 default:
8168 llvm_unreachable("Got bad number of lanes for gather pattern.");
8169 }
8170
8171 return true;
8172}
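// Shape of the matched chain (illustrative sketch for NumLanes = 4;
// register and pointer names are hypothetical, not from the source):
//   %s:fpr32 = LDRSui %ptr0, 0                 ; scalar load of lane 0
//   %v0:fpr128 = SUBREG_TO_REG 0, %s, %subreg.ssub
//   %v1:fpr128 = LD1i32 %v0, 1, %ptr1
//   %v2:fpr128 = LD1i32 %v1, 2, %ptr2
//   %v3:fpr128 = LD1i32 %v2, 3, %ptr3          ; Root, loads lane N-1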
8173
8174/// Search for patterns of LD instructions we can optimize.
8175 static bool getLoadPatterns(MachineInstr &Root,
8176 SmallVectorImpl<unsigned> &Patterns) {
8177
8178 // The pattern searches for loads into single lanes.
8179 switch (Root.getOpcode()) {
8180 case AArch64::LD1i32:
8181 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 4);
8182 case AArch64::LD1i16:
8183 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 8);
8184 case AArch64::LD1i8:
8185 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 16);
8186 default:
8187 return false;
8188 }
8189}
8190
8191/// Generate optimized instruction sequence for gather load patterns to improve
8192/// Memory-Level Parallelism (MLP). This function transforms a chain of
8193/// sequential NEON lane loads into parallel vector loads that can execute
8194/// concurrently.
8195static void
8196 generateGatherLanePattern(MachineInstr &Root,
8197 SmallVectorImpl<MachineInstr *> &InsInstrs,
8198 SmallVectorImpl<MachineInstr *> &DelInstrs,
8199 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8200 unsigned Pattern, unsigned NumLanes) {
8201 MachineFunction &MF = *Root.getParent()->getParent();
8202 MachineRegisterInfo &MRI = MF.getRegInfo();
8203 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8204
8205 // Gather the initial load instructions to build the pattern.
8206 SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
8207 MachineInstr *CurrInstr = &Root;
8208 for (unsigned i = 0; i < NumLanes - 1; ++i) {
8209 LoadToLaneInstrs.push_back(CurrInstr);
8210 CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
8211 }
8212
8213 // Sort the load instructions according to the lane.
8214 llvm::sort(LoadToLaneInstrs,
8215 [](const MachineInstr *A, const MachineInstr *B) {
8216 return A->getOperand(2).getImm() > B->getOperand(2).getImm();
8217 });
8218
8219 MachineInstr *SubregToReg = CurrInstr;
8220 LoadToLaneInstrs.push_back(
8221 MRI.getUniqueVRegDef(SubregToReg->getOperand(1).getReg()));
8222 auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
8223
8224 const TargetRegisterClass *FPR128RegClass =
8225 MRI.getRegClass(Root.getOperand(0).getReg());
8226
8227 // Helper lambda to create a LD1 instruction.
8228 auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr,
8229 Register SrcRegister, unsigned Lane,
8230 Register OffsetRegister,
8231 bool OffsetRegisterKillState) {
8232 auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
8233 MachineInstrBuilder LoadIndexIntoRegister =
8234 BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
8235 NewRegister)
8236 .addReg(SrcRegister)
8237 .addImm(Lane)
8238 .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState));
8239 InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
8240 InsInstrs.push_back(LoadIndexIntoRegister);
8241 return NewRegister;
8242 };
8243
8244 // Helper to create load instruction based on the NumLanes in the NEON
8245 // register we are rewriting.
8246 auto CreateLDRInstruction = [&](unsigned NumLanes, Register DestReg,
8247 Register OffsetReg,
8248 bool KillState) -> MachineInstrBuilder {
8249 unsigned Opcode;
8250 switch (NumLanes) {
8251 case 4:
8252 Opcode = AArch64::LDRSui;
8253 break;
8254 case 8:
8255 Opcode = AArch64::LDRHui;
8256 break;
8257 case 16:
8258 Opcode = AArch64::LDRBui;
8259 break;
8260 default:
8262 "Got unsupported number of lanes in machine-combiner gather pattern");
8263 }
8264 // Immediate offset load
8265 return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
8266 .addReg(OffsetReg)
8267 .addImm(0);
8268 };
8269
8270 // Load the remaining lanes into register 0.
8271 auto LanesToLoadToReg0 =
8272 llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
8273 LoadToLaneInstrsAscending.begin() + NumLanes / 2);
8274 Register PrevReg = SubregToReg->getOperand(0).getReg();
8275 for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
8276 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
8277 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8278 OffsetRegOperand.getReg(),
8279 OffsetRegOperand.isKill());
8280 DelInstrs.push_back(LoadInstr);
8281 }
8282 Register LastLoadReg0 = PrevReg;
8283
8284 // First load into register 1. Perform an integer load to zero out the upper
8285 // lanes in a single instruction.
8286 MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin();
8287 MachineInstr *OriginalSplitLoad =
8288 *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
8289 Register DestRegForMiddleIndex = MRI.createVirtualRegister(
8290 MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
8291
8292 const MachineOperand &OriginalSplitToLoadOffsetOperand =
8293 OriginalSplitLoad->getOperand(3);
8294 MachineInstrBuilder MiddleIndexLoadInstr =
8295 CreateLDRInstruction(NumLanes, DestRegForMiddleIndex,
8296 OriginalSplitToLoadOffsetOperand.getReg(),
8297 OriginalSplitToLoadOffsetOperand.isKill());
8298
8299 InstrIdxForVirtReg.insert(
8300 std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
8301 InsInstrs.push_back(MiddleIndexLoadInstr);
8302 DelInstrs.push_back(OriginalSplitLoad);
8303
8304 // Subreg To Reg instruction for register 1.
8305 Register DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
8306 unsigned SubregType;
8307 switch (NumLanes) {
8308 case 4:
8309 SubregType = AArch64::ssub;
8310 break;
8311 case 8:
8312 SubregType = AArch64::hsub;
8313 break;
8314 case 16:
8315 SubregType = AArch64::bsub;
8316 break;
8317 default:
8319 "Got invalid NumLanes for machine-combiner gather pattern");
8320 }
8321
8322 auto SubRegToRegInstr =
8323 BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
8324 DestRegForSubregToReg)
8325 .addReg(DestRegForMiddleIndex, getKillRegState(true))
8326 .addImm(SubregType);
8327 InstrIdxForVirtReg.insert(
8328 std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
8329 InsInstrs.push_back(SubRegToRegInstr);
8330
8331 // Load remaining lanes into register 1.
8332 auto LanesToLoadToReg1 =
8333 llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
8334 LoadToLaneInstrsAscending.end());
8335 PrevReg = SubRegToRegInstr->getOperand(0).getReg();
8336 for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
8337 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
8338 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8339 OffsetRegOperand.getReg(),
8340 OffsetRegOperand.isKill());
8341
8342 // Do not add the last reg to DelInstrs - it will be removed later.
8343 if (Index == NumLanes / 2 - 2) {
8344 break;
8345 }
8346 DelInstrs.push_back(LoadInstr);
8347 }
8348 Register LastLoadReg1 = PrevReg;
8349
8350 // Create the final zip instruction to combine the results.
8351 MachineInstrBuilder ZipInstr =
8352 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
8353 Root.getOperand(0).getReg())
8354 .addReg(LastLoadReg0)
8355 .addReg(LastLoadReg1);
8356 InsInstrs.push_back(ZipInstr);
8357}
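// Resulting shape (illustrative sketch for NumLanes = 4, continuing the
// hypothetical registers above): the serial chain is split into two
// independent half-width chains joined by a zip, so the two lane-0 loads
// can issue in parallel:
//   %a0 = SUBREG_TO_REG 0, (LDRSui %ptr0, 0), %subreg.ssub
//   %a1 = LD1i32 %a0, 1, %ptr1
//   %b0 = SUBREG_TO_REG 0, (LDRSui %ptr2, 0), %subreg.ssub
//   %b1 = LD1i32 %b0, 1, %ptr3
//   %res = ZIP1v2i64 %a1, %b1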
8358
8372
8373/// Return true when there is potentially a faster code sequence for an
8374/// instruction chain ending in \p Root. All potential patterns are listed in
8375/// the \p Pattern vector. Pattern should be sorted in priority order since the
8376/// pattern evaluator stops checking as soon as it finds a faster sequence.
8377
8378bool AArch64InstrInfo::getMachineCombinerPatterns(
8379 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
8380 bool DoRegPressureReduce) const {
8381 // Integer patterns
8382 if (getMaddPatterns(Root, Patterns))
8383 return true;
8384 // Floating point patterns
8385 if (getFMULPatterns(Root, Patterns))
8386 return true;
8387 if (getFMAPatterns(Root, Patterns))
8388 return true;
8389 if (getFNEGPatterns(Root, Patterns))
8390 return true;
8391
8392 // Other patterns
8393 if (getMiscPatterns(Root, Patterns))
8394 return true;
8395
8396 // Load patterns
8397 if (getLoadPatterns(Root, Patterns))
8398 return true;
8399
8400 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
8401 DoRegPressureReduce);
8402}
8403
8404 enum class FMAInstKind { Default, Indexed, Accumulator };
8405 /// genFusedMultiply - Generate fused multiply instructions.
8406/// This function supports both integer and floating point instructions.
8407/// A typical example:
8408/// F|MUL I=A,B,0
8409/// F|ADD R,I,C
8410/// ==> F|MADD R,A,B,C
8411/// \param MF Containing MachineFunction
8412/// \param MRI Register information
8413/// \param TII Target information
8414/// \param Root is the F|ADD instruction
8415/// \param [out] InsInstrs is a vector of machine instructions and will
8416/// contain the generated madd instruction
8417/// \param IdxMulOpd is index of operand in Root that is the result of
8418/// the F|MUL. In the example above IdxMulOpd is 1.
8419 /// \param MaddOpc the opcode of the f|madd instruction
8420/// \param RC Register class of operands
8421 /// \param kind the kind of fma instruction (addressing mode) to be generated
8422/// \param ReplacedAddend is the result register from the instruction
8423/// replacing the non-combined operand, if any.
8424static MachineInstr *
8425 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
8426 const TargetInstrInfo *TII, MachineInstr &Root,
8427 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
8428 unsigned MaddOpc, const TargetRegisterClass *RC,
8429 FMAInstKind kind = FMAInstKind::Default,
8430 const Register *ReplacedAddend = nullptr) {
8431 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8432
8433 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
8434 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
8435 Register ResultReg = Root.getOperand(0).getReg();
8436 Register SrcReg0 = MUL->getOperand(1).getReg();
8437 bool Src0IsKill = MUL->getOperand(1).isKill();
8438 Register SrcReg1 = MUL->getOperand(2).getReg();
8439 bool Src1IsKill = MUL->getOperand(2).isKill();
8440
8441 Register SrcReg2;
8442 bool Src2IsKill;
8443 if (ReplacedAddend) {
8444 // If we just generated a new addend, we must be its only use.
8445 SrcReg2 = *ReplacedAddend;
8446 Src2IsKill = true;
8447 } else {
8448 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
8449 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
8450 }
8451
8452 if (ResultReg.isVirtual())
8453 MRI.constrainRegClass(ResultReg, RC);
8454 if (SrcReg0.isVirtual())
8455 MRI.constrainRegClass(SrcReg0, RC);
8456 if (SrcReg1.isVirtual())
8457 MRI.constrainRegClass(SrcReg1, RC);
8458 if (SrcReg2.isVirtual())
8459 MRI.constrainRegClass(SrcReg2, RC);
8460
8462 if (kind == FMAInstKind::Default)
8463 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8464 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8465 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8466 .addReg(SrcReg2, getKillRegState(Src2IsKill));
8467 else if (kind == FMAInstKind::Indexed)
8468 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8469 .addReg(SrcReg2, getKillRegState(Src2IsKill))
8470 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8471 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8472 .addImm(MUL->getOperand(3).getImm());
8473 else if (kind == FMAInstKind::Accumulator)
8474 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8475 .addReg(SrcReg2, getKillRegState(Src2IsKill))
8476 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8477 .addReg(SrcReg1, getKillRegState(Src1IsKill));
8478 else
8479 assert(false && "Invalid FMA instruction kind \n");
8480 // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL)
8481 InsInstrs.push_back(MIB);
8482 return MUL;
8483}
8484
8485static MachineInstr *
8486 genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
8487 const TargetInstrInfo *TII, MachineInstr &Root,
8488 SmallVectorImpl<MachineInstr *> &InsInstrs) {
8489 MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
8490
8491 unsigned Opc = 0;
8492 const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
8493 if (AArch64::FPR32RegClass.hasSubClassEq(RC))
8494 Opc = AArch64::FNMADDSrrr;
8495 else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
8496 Opc = AArch64::FNMADDDrrr;
8497 else
8498 return nullptr;
8499
8500 Register ResultReg = Root.getOperand(0).getReg();
8501 Register SrcReg0 = MAD->getOperand(1).getReg();
8502 Register SrcReg1 = MAD->getOperand(2).getReg();
8503 Register SrcReg2 = MAD->getOperand(3).getReg();
8504 bool Src0IsKill = MAD->getOperand(1).isKill();
8505 bool Src1IsKill = MAD->getOperand(2).isKill();
8506 bool Src2IsKill = MAD->getOperand(3).isKill();
8507 if (ResultReg.isVirtual())
8508 MRI.constrainRegClass(ResultReg, RC);
8509 if (SrcReg0.isVirtual())
8510 MRI.constrainRegClass(SrcReg0, RC);
8511 if (SrcReg1.isVirtual())
8512 MRI.constrainRegClass(SrcReg1, RC);
8513 if (SrcReg2.isVirtual())
8514 MRI.constrainRegClass(SrcReg2, RC);
8515
8516 MachineInstrBuilder MIB =
8517 BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
8518 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8519 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8520 .addReg(SrcReg2, getKillRegState(Src2IsKill));
8521 InsInstrs.push_back(MIB);
8522
8523 return MAD;
8524}
8525
8526/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
8527static MachineInstr *
8528 genIndexedMultiply(MachineInstr &Root,
8529 SmallVectorImpl<MachineInstr *> &InsInstrs,
8530 unsigned IdxDupOp, unsigned MulOpc,
8531 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
8532 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
8533 "Invalid index of FMUL operand");
8534
8535 MachineFunction &MF = *Root.getMF();
8536 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8537
8538 MachineInstr *Dup =
8539 MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
8540
8541 if (Dup->getOpcode() == TargetOpcode::COPY)
8542 Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());
8543
8544 Register DupSrcReg = Dup->getOperand(1).getReg();
8545 MRI.clearKillFlags(DupSrcReg);
8546 MRI.constrainRegClass(DupSrcReg, RC);
8547
8548 unsigned DupSrcLane = Dup->getOperand(2).getImm();
8549
8550 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
8551 MachineOperand &MulOp = Root.getOperand(IdxMulOp);
8552
8553 Register ResultReg = Root.getOperand(0).getReg();
8554
8555 MachineInstrBuilder MIB;
8556 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
8557 .add(MulOp)
8558 .addReg(DupSrcReg)
8559 .addImm(DupSrcLane);
8560
8561 InsInstrs.push_back(MIB);
8562 return &Root;
8563}
8564
8565/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
8566/// instructions.
8567///
8568/// \see genFusedMultiply
8569 static MachineInstr *genFusedMultiplyAcc(
8570 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8571 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8572 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8573 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8574 FMAInstKind::Accumulator);
8575 }
8576
8577/// genNeg - Helper to generate an intermediate negation of the second operand
8578/// of Root
8579 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
8580 const TargetInstrInfo *TII, MachineInstr &Root,
8582 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8583 unsigned MnegOpc, const TargetRegisterClass *RC) {
8584 Register NewVR = MRI.createVirtualRegister(RC);
8585 MachineInstrBuilder MIB =
8586 BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
8587 .add(Root.getOperand(2));
8588 InsInstrs.push_back(MIB);
8589
8590 assert(InstrIdxForVirtReg.empty());
8591 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8592
8593 return NewVR;
8594}
8595
8596/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
8597/// instructions with an additional negation of the accumulator
8598 static MachineInstr *genFusedMultiplyAccNeg(
8599 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8600 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8601 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8602 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8603 assert(IdxMulOpd == 1);
8604
8605 Register NewVR =
8606 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8607 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8608 FMAInstKind::Accumulator, &NewVR);
8609}
8610
8611/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
8612/// instructions.
8613///
8614/// \see genFusedMultiply
8615 static MachineInstr *genFusedMultiplyIdx(
8616 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8617 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8618 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8619 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8620 FMAInstKind::Indexed);
8621 }
8622
8623 /// genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate
8624 /// instructions with an additional negation of the accumulator
8625 static MachineInstr *genFusedMultiplyIdxNeg(
8626 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
8627 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
8628 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8629 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8630 assert(IdxMulOpd == 1);
8631
8632 Register NewVR =
8633 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8634
8635 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8636 FMAInstKind::Indexed, &NewVR);
8637}
8638
8639/// genMaddR - Generate madd instruction and combine mul and add using
8640/// an extra virtual register
8641/// Example - an ADD intermediate needs to be stored in a register:
8642/// MUL I=A,B,0
8643/// ADD R,I,Imm
8644/// ==> ORR V, ZR, Imm
8645/// ==> MADD R,A,B,V
8646/// \param MF Containing MachineFunction
8647/// \param MRI Register information
8648/// \param TII Target information
8649/// \param Root is the ADD instruction
8650/// \param [out] InsInstrs is a vector of machine instructions and will
8651/// contain the generated madd instruction
8652/// \param IdxMulOpd is index of operand in Root that is the result of
8653/// the MUL. In the example above IdxMulOpd is 1.
8654 /// \param MaddOpc the opcode of the madd instruction
8655/// \param VR is a virtual register that holds the value of an ADD operand
8656/// (V in the example above).
8657/// \param RC Register class of operands
8658 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
8659 const TargetInstrInfo *TII, MachineInstr &Root,
8661 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
8662 const TargetRegisterClass *RC) {
8663 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8664
8665 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
8666 Register ResultReg = Root.getOperand(0).getReg();
8667 Register SrcReg0 = MUL->getOperand(1).getReg();
8668 bool Src0IsKill = MUL->getOperand(1).isKill();
8669 Register SrcReg1 = MUL->getOperand(2).getReg();
8670 bool Src1IsKill = MUL->getOperand(2).isKill();
8671
8672 if (ResultReg.isVirtual())
8673 MRI.constrainRegClass(ResultReg, RC);
8674 if (SrcReg0.isVirtual())
8675 MRI.constrainRegClass(SrcReg0, RC);
8676 if (SrcReg1.isVirtual())
8677 MRI.constrainRegClass(SrcReg1, RC);
8678 if (Register(VR).isVirtual())
8679 MRI.constrainRegClass(VR, RC);
8680
8681 MachineInstrBuilder MIB =
8682 BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8683 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8684 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8685 .addReg(VR);
8686 // Insert the MADD
8687 InsInstrs.push_back(MIB);
8688 return MUL;
8689}
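// Illustrative sketch (not part of the original source), as used by the
// MULADDWI_OP1 case in genAlternativeCodeSequence below:
//   %2 = MADDWrrr %0, %1, $wzr    ; the MUL
//   %3 = ADDWri %2, 16, 0
// becomes
//   %v = MOVi32imm 16             ; the extra virtual register VR
//   %3 = MADDWrrr %0, %1, %v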
8690
8691/// Do the following transformation
8692/// A - (B + C) ==> (A - B) - C
8693/// A - (B + C) ==> (A - C) - B
8694 static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
8695 const TargetInstrInfo *TII, MachineInstr &Root,
8696 SmallVectorImpl<MachineInstr *> &InsInstrs,
8697 SmallVectorImpl<MachineInstr *> &DelInstrs,
8698 unsigned IdxOpd1,
8699 DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
8700 assert(IdxOpd1 == 1 || IdxOpd1 == 2);
8701 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
8702 MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
8703
8704 Register ResultReg = Root.getOperand(0).getReg();
8705 Register RegA = Root.getOperand(1).getReg();
8706 bool RegAIsKill = Root.getOperand(1).isKill();
8707 Register RegB = AddMI->getOperand(IdxOpd1).getReg();
8708 bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
8709 Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
8710 bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
8711 Register NewVR =
8712 MRI.createVirtualRegister(MRI.getRegClass(Root.getOperand(2).getReg()));
8713
8714 unsigned Opcode = Root.getOpcode();
8715 if (Opcode == AArch64::SUBSWrr)
8716 Opcode = AArch64::SUBWrr;
8717 else if (Opcode == AArch64::SUBSXrr)
8718 Opcode = AArch64::SUBXrr;
8719 else
8720 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
8721 "Unexpected instruction opcode.");
8722
8723 uint32_t Flags = Root.mergeFlagsWith(*AddMI);
8724 Flags &= ~MachineInstr::NoSWrap;
8725 Flags &= ~MachineInstr::NoUWrap;
8726
8727 MachineInstrBuilder MIB1 =
8728 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
8729 .addReg(RegA, getKillRegState(RegAIsKill))
8730 .addReg(RegB, getKillRegState(RegBIsKill))
8731 .setMIFlags(Flags);
8732 MachineInstrBuilder MIB2 =
8733 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
8734 .addReg(NewVR, getKillRegState(true))
8735 .addReg(RegC, getKillRegState(RegCIsKill))
8736 .setMIFlags(Flags);
8737
8738 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8739 InsInstrs.push_back(MIB1);
8740 InsInstrs.push_back(MIB2);
8741 DelInstrs.push_back(AddMI);
8742 DelInstrs.push_back(&Root);
8743}
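// Illustrative sketch (not part of the original source) for SUBADD_OP1:
//   %2 = ADDWrr %b, %c
//   %3 = SUBWrr %a, %2
// becomes %t = SUBWrr %a, %b; %3 = SUBWrr %t, %c. The nsw/nuw flags are
// cleared above because the intermediate difference can wrap even when the
// original expression does not.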
8744
8745unsigned AArch64InstrInfo::getReduceOpcodeForAccumulator(
8746 unsigned int AccumulatorOpCode) const {
8747 switch (AccumulatorOpCode) {
8748 case AArch64::UABALB_ZZZ_D:
8749 case AArch64::SABALB_ZZZ_D:
8750 case AArch64::UABALT_ZZZ_D:
8751 case AArch64::SABALT_ZZZ_D:
8752 return AArch64::ADD_ZZZ_D;
8753 case AArch64::UABALB_ZZZ_H:
8754 case AArch64::SABALB_ZZZ_H:
8755 case AArch64::UABALT_ZZZ_H:
8756 case AArch64::SABALT_ZZZ_H:
8757 return AArch64::ADD_ZZZ_H;
8758 case AArch64::UABALB_ZZZ_S:
8759 case AArch64::SABALB_ZZZ_S:
8760 case AArch64::UABALT_ZZZ_S:
8761 case AArch64::SABALT_ZZZ_S:
8762 return AArch64::ADD_ZZZ_S;
8763 case AArch64::UABALv16i8_v8i16:
8764 case AArch64::SABALv8i8_v8i16:
8765 case AArch64::SABAv8i16:
8766 case AArch64::UABAv8i16:
8767 return AArch64::ADDv8i16;
8768 case AArch64::SABALv2i32_v2i64:
8769 case AArch64::UABALv2i32_v2i64:
8770 case AArch64::SABALv4i32_v2i64:
8771 return AArch64::ADDv2i64;
8772 case AArch64::UABALv4i16_v4i32:
8773 case AArch64::SABALv4i16_v4i32:
8774 case AArch64::SABALv8i16_v4i32:
8775 case AArch64::SABAv4i32:
8776 case AArch64::UABAv4i32:
8777 return AArch64::ADDv4i32;
8778 case AArch64::UABALv4i32_v2i64:
8779 return AArch64::ADDv2i64;
8780 case AArch64::UABALv8i16_v4i32:
8781 return AArch64::ADDv4i32;
8782 case AArch64::UABALv8i8_v8i16:
8783 case AArch64::SABALv16i8_v8i16:
8784 return AArch64::ADDv8i16;
8785 case AArch64::UABAv16i8:
8786 case AArch64::SABAv16i8:
8787 return AArch64::ADDv16i8;
8788 case AArch64::UABAv4i16:
8789 case AArch64::SABAv4i16:
8790 return AArch64::ADDv4i16;
8791 case AArch64::UABAv2i32:
8792 case AArch64::SABAv2i32:
8793 return AArch64::ADDv2i32;
8794 case AArch64::UABAv8i8:
8795 case AArch64::SABAv8i8:
8796 return AArch64::ADDv8i8;
8797 default:
8798 llvm_unreachable("Unknown accumulator opcode");
8799 }
8800}
8801
8802/// When getMachineCombinerPatterns() finds potential patterns,
8803/// this function generates the instructions that could replace the
8804/// original code sequence
8805void AArch64InstrInfo::genAlternativeCodeSequence(
8806 MachineInstr &Root, unsigned Pattern,
8807 SmallVectorImpl<MachineInstr *> &InsInstrs,
8808 SmallVectorImpl<MachineInstr *> &DelInstrs,
8809 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
8810 MachineBasicBlock &MBB = *Root.getParent();
8811 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8812 MachineFunction &MF = *MBB.getParent();
8813 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8814
8815 MachineInstr *MUL = nullptr;
8816 const TargetRegisterClass *RC;
8817 unsigned Opc;
8818 switch (Pattern) {
8819 default:
8820 // Reassociate instructions.
8821 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
8822 DelInstrs, InstrIdxForVirtReg);
8823 return;
8824 case AArch64MachineCombinerPattern::SUBADD_OP1:
8825 // A - (B + C)
8826 // ==> (A - B) - C
8827 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
8828 InstrIdxForVirtReg);
8829 return;
8830 case AArch64MachineCombinerPattern::SUBADD_OP2:
8831 // A - (B + C)
8832 // ==> (A - C) - B
8833 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
8834 InstrIdxForVirtReg);
8835 return;
8836 case AArch64MachineCombinerPattern::MULADDW_OP1:
8837 case AArch64MachineCombinerPattern::MULADDX_OP1:
8838 // MUL I=A,B,0
8839 // ADD R,I,C
8840 // ==> MADD R,A,B,C
8841 // --- Create(MADD);
8842 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP1) {
8843 Opc = AArch64::MADDWrrr;
8844 RC = &AArch64::GPR32RegClass;
8845 } else {
8846 Opc = AArch64::MADDXrrr;
8847 RC = &AArch64::GPR64RegClass;
8848 }
8849 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8850 break;
8851 case AArch64MachineCombinerPattern::MULADDW_OP2:
8852 case AArch64MachineCombinerPattern::MULADDX_OP2:
8853 // MUL I=A,B,0
8854 // ADD R,C,I
8855 // ==> MADD R,A,B,C
8856 // --- Create(MADD);
8857 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP2) {
8858 Opc = AArch64::MADDWrrr;
8859 RC = &AArch64::GPR32RegClass;
8860 } else {
8861 Opc = AArch64::MADDXrrr;
8862 RC = &AArch64::GPR64RegClass;
8863 }
8864 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8865 break;
8866 case AArch64MachineCombinerPattern::MULADDWI_OP1:
8867 case AArch64MachineCombinerPattern::MULADDXI_OP1:
8868 case AArch64MachineCombinerPattern::MULSUBWI_OP1:
8869 case AArch64MachineCombinerPattern::MULSUBXI_OP1: {
8870 // MUL I=A,B,0
8871 // ADD/SUB R,I,Imm
8872 // ==> MOV V, Imm/-Imm
8873 // ==> MADD R,A,B,V
8874 // --- Create(MADD);
8875 const TargetRegisterClass *RC;
8876 unsigned BitSize, MovImm;
8877 if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1 ||
8878 Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) {
8879 MovImm = AArch64::MOVi32imm;
8880 RC = &AArch64::GPR32spRegClass;
8881 BitSize = 32;
8882 Opc = AArch64::MADDWrrr;
8883 RC = &AArch64::GPR32RegClass;
8884 } else {
8885 MovImm = AArch64::MOVi64imm;
8886 RC = &AArch64::GPR64spRegClass;
8887 BitSize = 64;
8888 Opc = AArch64::MADDXrrr;
8889 RC = &AArch64::GPR64RegClass;
8890 }
8891 Register NewVR = MRI.createVirtualRegister(RC);
8892 uint64_t Imm = Root.getOperand(2).getImm();
8893
8894 if (Root.getOperand(3).isImm()) {
8895 unsigned Val = Root.getOperand(3).getImm();
8896 Imm = Imm << Val;
8897 }
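// If the ADD/SUB root carries a shifted immediate, operand 3 holds the
// shift amount; fold it into Imm before negating/materializing it below.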
8898 bool IsSub = Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1 ||
8899 Pattern == AArch64MachineCombinerPattern::MULSUBXI_OP1;
8900 uint64_t UImm = SignExtend64(IsSub ? -Imm : Imm, BitSize);
8901 // Check that the immediate can be composed via a single instruction.
8902 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
8903 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
8904 if (Insn.size() != 1)
8905 return;
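// A single instruction suffices to materialize the immediate (checked
// above), so build it into NewVR and feed NewVR to the MADD as the addend.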
8906 MachineInstrBuilder MIB1 =
8907 BuildMI(MF, MIMetadata(Root), TII->get(MovImm), NewVR)
8908 .addImm(IsSub ? -Imm : Imm);
8909 InsInstrs.push_back(MIB1);
8910 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8911 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
8912 break;
8913 }
8914 case AArch64MachineCombinerPattern::MULSUBW_OP1:
8915 case AArch64MachineCombinerPattern::MULSUBX_OP1: {
8916 // MUL I=A,B,0
8917 // SUB R,I, C
8918 // ==> SUB V, 0, C
8919 // ==> MADD R,A,B,V // = -C + A*B
8920 // --- Create(MADD);
8921 const TargetRegisterClass *SubRC;
8922 unsigned SubOpc, ZeroReg;
8923 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP1) {
8924 SubOpc = AArch64::SUBWrr;
8925 SubRC = &AArch64::GPR32spRegClass;
8926 ZeroReg = AArch64::WZR;
8927 Opc = AArch64::MADDWrrr;
8928 RC = &AArch64::GPR32RegClass;
8929 } else {
8930 SubOpc = AArch64::SUBXrr;
8931 SubRC = &AArch64::GPR64spRegClass;
8932 ZeroReg = AArch64::XZR;
8933 Opc = AArch64::MADDXrrr;
8934 RC = &AArch64::GPR64RegClass;
8935 }
8936 Register NewVR = MRI.createVirtualRegister(SubRC);
8937 // SUB NewVR, 0, C
8938 MachineInstrBuilder MIB1 =
8939 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
8940 .addReg(ZeroReg)
8941 .add(Root.getOperand(2));
8942 InsInstrs.push_back(MIB1);
8943 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8944 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
8945 break;
8946 }
8947 case AArch64MachineCombinerPattern::MULSUBW_OP2:
8948 case AArch64MachineCombinerPattern::MULSUBX_OP2:
8949 // MUL I=A,B,0
8950 // SUB R,C,I
8951 // ==> MSUB R,A,B,C (computes C - A*B)
8952 // --- Create(MSUB);
8953 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP2) {
8954 Opc = AArch64::MSUBWrrr;
8955 RC = &AArch64::GPR32RegClass;
8956 } else {
8957 Opc = AArch64::MSUBXrrr;
8958 RC = &AArch64::GPR64RegClass;
8959 }
8960 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8961 break;
8962 case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
8963 Opc = AArch64::MLAv8i8;
8964 RC = &AArch64::FPR64RegClass;
8965 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8966 break;
8967 case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
8968 Opc = AArch64::MLAv8i8;
8969 RC = &AArch64::FPR64RegClass;
8970 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8971 break;
8972 case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
8973 Opc = AArch64::MLAv16i8;
8974 RC = &AArch64::FPR128RegClass;
8975 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8976 break;
8977 case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
8978 Opc = AArch64::MLAv16i8;
8979 RC = &AArch64::FPR128RegClass;
8980 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8981 break;
8982 case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
8983 Opc = AArch64::MLAv4i16;
8984 RC = &AArch64::FPR64RegClass;
8985 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8986 break;
8987 case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
8988 Opc = AArch64::MLAv4i16;
8989 RC = &AArch64::FPR64RegClass;
8990 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8991 break;
8992 case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
8993 Opc = AArch64::MLAv8i16;
8994 RC = &AArch64::FPR128RegClass;
8995 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8996 break;
8997 case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
8998 Opc = AArch64::MLAv8i16;
8999 RC = &AArch64::FPR128RegClass;
9000 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9001 break;
9002 case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
9003 Opc = AArch64::MLAv2i32;
9004 RC = &AArch64::FPR64RegClass;
9005 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9006 break;
9007 case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
9008 Opc = AArch64::MLAv2i32;
9009 RC = &AArch64::FPR64RegClass;
9010 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9011 break;
9012 case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
9013 Opc = AArch64::MLAv4i32;
9014 RC = &AArch64::FPR128RegClass;
9015 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9016 break;
9017 case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
9018 Opc = AArch64::MLAv4i32;
9019 RC = &AArch64::FPR128RegClass;
9020 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9021 break;
9022
9023 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
9024 Opc = AArch64::MLAv8i8;
9025 RC = &AArch64::FPR64RegClass;
9026 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9027 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
9028 RC);
9029 break;
9030 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
9031 Opc = AArch64::MLSv8i8;
9032 RC = &AArch64::FPR64RegClass;
9033 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9034 break;
9035 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
9036 Opc = AArch64::MLAv16i8;
9037 RC = &AArch64::FPR128RegClass;
9038 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9039 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
9040 RC);
9041 break;
9042 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
9043 Opc = AArch64::MLSv16i8;
9044 RC = &AArch64::FPR128RegClass;
9045 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9046 break;
9047 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
9048 Opc = AArch64::MLAv4i16;
9049 RC = &AArch64::FPR64RegClass;
9050 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9051 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
9052 RC);
9053 break;
9054 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
9055 Opc = AArch64::MLSv4i16;
9056 RC = &AArch64::FPR64RegClass;
9057 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9058 break;
9059 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
9060 Opc = AArch64::MLAv8i16;
9061 RC = &AArch64::FPR128RegClass;
9062 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9063 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
9064 RC);
9065 break;
9066 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
9067 Opc = AArch64::MLSv8i16;
9068 RC = &AArch64::FPR128RegClass;
9069 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9070 break;
9071 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
9072 Opc = AArch64::MLAv2i32;
9073 RC = &AArch64::FPR64RegClass;
9074 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9075 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
9076 RC);
9077 break;
9078 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
9079 Opc = AArch64::MLSv2i32;
9080 RC = &AArch64::FPR64RegClass;
9081 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9082 break;
9083 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
9084 Opc = AArch64::MLAv4i32;
9085 RC = &AArch64::FPR128RegClass;
9086 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
9087 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
9088 RC);
9089 break;
9090 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
9091 Opc = AArch64::MLSv4i32;
9092 RC = &AArch64::FPR128RegClass;
9093 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9094 break;
9095
9096 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
9097 Opc = AArch64::MLAv4i16_indexed;
9098 RC = &AArch64::FPR64RegClass;
9099 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9100 break;
9101 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
9102 Opc = AArch64::MLAv4i16_indexed;
9103 RC = &AArch64::FPR64RegClass;
9104 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9105 break;
9106 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
9107 Opc = AArch64::MLAv8i16_indexed;
9108 RC = &AArch64::FPR128RegClass;
9109 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9110 break;
9111 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
9112 Opc = AArch64::MLAv8i16_indexed;
9113 RC = &AArch64::FPR128RegClass;
9114 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9115 break;
9116 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
9117 Opc = AArch64::MLAv2i32_indexed;
9118 RC = &AArch64::FPR64RegClass;
9119 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9120 break;
9121 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
9122 Opc = AArch64::MLAv2i32_indexed;
9123 RC = &AArch64::FPR64RegClass;
9124 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9125 break;
9126 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
9127 Opc = AArch64::MLAv4i32_indexed;
9128 RC = &AArch64::FPR128RegClass;
9129 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9130 break;
9131 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
9132 Opc = AArch64::MLAv4i32_indexed;
9133 RC = &AArch64::FPR128RegClass;
9134 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9135 break;
9136
9137 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
9138 Opc = AArch64::MLAv4i16_indexed;
9139 RC = &AArch64::FPR64RegClass;
9140 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9141 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
9142 RC);
9143 break;
9144 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
9145 Opc = AArch64::MLSv4i16_indexed;
9146 RC = &AArch64::FPR64RegClass;
9147 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9148 break;
9149 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
9150 Opc = AArch64::MLAv8i16_indexed;
9151 RC = &AArch64::FPR128RegClass;
9152 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9153 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
9154 RC);
9155 break;
9156 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
9157 Opc = AArch64::MLSv8i16_indexed;
9158 RC = &AArch64::FPR128RegClass;
9159 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9160 break;
9161 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
9162 Opc = AArch64::MLAv2i32_indexed;
9163 RC = &AArch64::FPR64RegClass;
9164 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9165 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
9166 RC);
9167 break;
9168 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
9169 Opc = AArch64::MLSv2i32_indexed;
9170 RC = &AArch64::FPR64RegClass;
9171 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9172 break;
9173 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
9174 Opc = AArch64::MLAv4i32_indexed;
9175 RC = &AArch64::FPR128RegClass;
9176 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9177 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
9178 RC);
9179 break;
9180 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
9181 Opc = AArch64::MLSv4i32_indexed;
9182 RC = &AArch64::FPR128RegClass;
9183 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9184 break;
9185
9186 // Floating Point Support
9187 case AArch64MachineCombinerPattern::FMULADDH_OP1:
9188 Opc = AArch64::FMADDHrrr;
9189 RC = &AArch64::FPR16RegClass;
9190 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9191 break;
9192 case AArch64MachineCombinerPattern::FMULADDS_OP1:
9193 Opc = AArch64::FMADDSrrr;
9194 RC = &AArch64::FPR32RegClass;
9195 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9196 break;
9197 case AArch64MachineCombinerPattern::FMULADDD_OP1:
9198 Opc = AArch64::FMADDDrrr;
9199 RC = &AArch64::FPR64RegClass;
9200 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9201 break;
9202
9203 case AArch64MachineCombinerPattern::FMULADDH_OP2:
9204 Opc = AArch64::FMADDHrrr;
9205 RC = &AArch64::FPR16RegClass;
9206 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9207 break;
9208 case AArch64MachineCombinerPattern::FMULADDS_OP2:
9209 Opc = AArch64::FMADDSrrr;
9210 RC = &AArch64::FPR32RegClass;
9211 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9212 break;
9213 case AArch64MachineCombinerPattern::FMULADDD_OP2:
9214 Opc = AArch64::FMADDDrrr;
9215 RC = &AArch64::FPR64RegClass;
9216 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9217 break;
9218
9219 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
9220 Opc = AArch64::FMLAv1i32_indexed;
9221 RC = &AArch64::FPR32RegClass;
9222 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9223 FMAInstKind::Indexed);
9224 break;
9225 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
9226 Opc = AArch64::FMLAv1i32_indexed;
9227 RC = &AArch64::FPR32RegClass;
9228 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9229 FMAInstKind::Indexed);
9230 break;
9231
9232 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
9233 Opc = AArch64::FMLAv1i64_indexed;
9234 RC = &AArch64::FPR64RegClass;
9235 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9236 FMAInstKind::Indexed);
9237 break;
9238 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
9239 Opc = AArch64::FMLAv1i64_indexed;
9240 RC = &AArch64::FPR64RegClass;
9241 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9242 FMAInstKind::Indexed);
9243 break;
9244
9245 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
9246 RC = &AArch64::FPR64RegClass;
9247 Opc = AArch64::FMLAv4i16_indexed;
9248 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9249 FMAInstKind::Indexed);
9250 break;
9251 case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
9252 RC = &AArch64::FPR64RegClass;
9253 Opc = AArch64::FMLAv4f16;
9254 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9255 FMAInstKind::Accumulator);
9256 break;
9257 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
9258 RC = &AArch64::FPR64RegClass;
9259 Opc = AArch64::FMLAv4i16_indexed;
9260 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9261 FMAInstKind::Indexed);
9262 break;
9263 case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
9264 RC = &AArch64::FPR64RegClass;
9265 Opc = AArch64::FMLAv4f16;
9266 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9267 FMAInstKind::Accumulator);
9268 break;
9269
9270 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
9271 case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
9272 RC = &AArch64::FPR64RegClass;
9273 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
9274 Opc = AArch64::FMLAv2i32_indexed;
9275 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9276 FMAInstKind::Indexed);
9277 } else {
9278 Opc = AArch64::FMLAv2f32;
9279 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9280 FMAInstKind::Accumulator);
9281 }
9282 break;
9283 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
9284 case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
9285 RC = &AArch64::FPR64RegClass;
9286 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
9287 Opc = AArch64::FMLAv2i32_indexed;
9288 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9289 FMAInstKind::Indexed);
9290 } else {
9291 Opc = AArch64::FMLAv2f32;
9292 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9293 FMAInstKind::Accumulator);
9294 }
9295 break;
9296
9297 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
9298 RC = &AArch64::FPR128RegClass;
9299 Opc = AArch64::FMLAv8i16_indexed;
9300 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9301 FMAInstKind::Indexed);
9302 break;
9303 case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
9304 RC = &AArch64::FPR128RegClass;
9305 Opc = AArch64::FMLAv8f16;
9306 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9307 FMAInstKind::Accumulator);
9308 break;
9309 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
9310 RC = &AArch64::FPR128RegClass;
9311 Opc = AArch64::FMLAv8i16_indexed;
9312 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9313 FMAInstKind::Indexed);
9314 break;
9315 case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
9316 RC = &AArch64::FPR128RegClass;
9317 Opc = AArch64::FMLAv8f16;
9318 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9319 FMAInstKind::Accumulator);
9320 break;
9321
9322 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
9323 case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
9324 RC = &AArch64::FPR128RegClass;
9325 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
9326 Opc = AArch64::FMLAv2i64_indexed;
9327 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9328 FMAInstKind::Indexed);
9329 } else {
9330 Opc = AArch64::FMLAv2f64;
9331 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9332 FMAInstKind::Accumulator);
9333 }
9334 break;
9335 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
9336 case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
9337 RC = &AArch64::FPR128RegClass;
9338 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
9339 Opc = AArch64::FMLAv2i64_indexed;
9340 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9341 FMAInstKind::Indexed);
9342 } else {
9343 Opc = AArch64::FMLAv2f64;
9344 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9345 FMAInstKind::Accumulator);
9346 }
9347 break;
9348
9349 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
9350 case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
9351 RC = &AArch64::FPR128RegClass;
9352 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
9353 Opc = AArch64::FMLAv4i32_indexed;
9354 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9355 FMAInstKind::Indexed);
9356 } else {
9357 Opc = AArch64::FMLAv4f32;
9358 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9359 FMAInstKind::Accumulator);
9360 }
9361 break;
9362
9363 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
9364 case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
9365 RC = &AArch64::FPR128RegClass;
9366 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
9367 Opc = AArch64::FMLAv4i32_indexed;
9368 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9369 FMAInstKind::Indexed);
9370 } else {
9371 Opc = AArch64::FMLAv4f32;
9372 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9373 FMAInstKind::Accumulator);
9374 }
9375 break;
9376
9377 case AArch64MachineCombinerPattern::FMULSUBH_OP1:
9378 Opc = AArch64::FNMSUBHrrr;
9379 RC = &AArch64::FPR16RegClass;
9380 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9381 break;
9382 case AArch64MachineCombinerPattern::FMULSUBS_OP1:
9383 Opc = AArch64::FNMSUBSrrr;
9384 RC = &AArch64::FPR32RegClass;
9385 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9386 break;
9387 case AArch64MachineCombinerPattern::FMULSUBD_OP1:
9388 Opc = AArch64::FNMSUBDrrr;
9389 RC = &AArch64::FPR64RegClass;
9390 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9391 break;
9392
9393 case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
9394 Opc = AArch64::FNMADDHrrr;
9395 RC = &AArch64::FPR16RegClass;
9396 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9397 break;
9398 case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
9399 Opc = AArch64::FNMADDSrrr;
9400 RC = &AArch64::FPR32RegClass;
9401 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9402 break;
9403 case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
9404 Opc = AArch64::FNMADDDrrr;
9405 RC = &AArch64::FPR64RegClass;
9406 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9407 break;
9408
9409 case AArch64MachineCombinerPattern::FMULSUBH_OP2:
9410 Opc = AArch64::FMSUBHrrr;
9411 RC = &AArch64::FPR16RegClass;
9412 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9413 break;
9414 case AArch64MachineCombinerPattern::FMULSUBS_OP2:
9415 Opc = AArch64::FMSUBSrrr;
9416 RC = &AArch64::FPR32RegClass;
9417 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9418 break;
9419 case AArch64MachineCombinerPattern::FMULSUBD_OP2:
9420 Opc = AArch64::FMSUBDrrr;
9421 RC = &AArch64::FPR64RegClass;
9422 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9423 break;
9424
9425 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
9426 Opc = AArch64::FMLSv1i32_indexed;
9427 RC = &AArch64::FPR32RegClass;
9428 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9429 FMAInstKind::Indexed);
9430 break;
9431
9432 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
9433 Opc = AArch64::FMLSv1i64_indexed;
9434 RC = &AArch64::FPR64RegClass;
9435 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9436 FMAInstKind::Indexed);
9437 break;
9438
9439 case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
9440 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
9441 RC = &AArch64::FPR64RegClass;
9442 Register NewVR = MRI.createVirtualRegister(RC);
9443 MachineInstrBuilder MIB1 =
9444 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
9445 .add(Root.getOperand(2));
9446 InsInstrs.push_back(MIB1);
9447 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9448 if (Pattern == AArch64MachineCombinerPattern::FMLSv4f16_OP1) {
9449 Opc = AArch64::FMLAv4f16;
9450 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9451 FMAInstKind::Accumulator, &NewVR);
9452 } else {
9453 Opc = AArch64::FMLAv4i16_indexed;
9454 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9455 FMAInstKind::Indexed, &NewVR);
9456 }
9457 break;
9458 }
9459 case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
9460 RC = &AArch64::FPR64RegClass;
9461 Opc = AArch64::FMLSv4f16;
9462 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9463 FMAInstKind::Accumulator);
9464 break;
9465 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
9466 RC = &AArch64::FPR64RegClass;
9467 Opc = AArch64::FMLSv4i16_indexed;
9468 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9469 FMAInstKind::Indexed);
9470 break;
9471
9472 case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
9473 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
9474 RC = &AArch64::FPR64RegClass;
9475 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
9476 Opc = AArch64::FMLSv2i32_indexed;
9477 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9478 FMAInstKind::Indexed);
9479 } else {
9480 Opc = AArch64::FMLSv2f32;
9481 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9482 FMAInstKind::Accumulator);
9483 }
9484 break;
9485
9486 case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
9487 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
9488 RC = &AArch64::FPR128RegClass;
9489 Register NewVR = MRI.createVirtualRegister(RC);
9490 MachineInstrBuilder MIB1 =
9491 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
9492 .add(Root.getOperand(2));
9493 InsInstrs.push_back(MIB1);
9494 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9495 if (Pattern == AArch64MachineCombinerPattern::FMLSv8f16_OP1) {
9496 Opc = AArch64::FMLAv8f16;
9497 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9498 FMAInstKind::Accumulator, &NewVR);
9499 } else {
9500 Opc = AArch64::FMLAv8i16_indexed;
9501 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9502 FMAInstKind::Indexed, &NewVR);
9503 }
9504 break;
9505 }
9506 case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
9507 RC = &AArch64::FPR128RegClass;
9508 Opc = AArch64::FMLSv8f16;
9509 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9510 FMAInstKind::Accumulator);
9511 break;
9512 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
9513 RC = &AArch64::FPR128RegClass;
9514 Opc = AArch64::FMLSv8i16_indexed;
9515 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9516 FMAInstKind::Indexed);
9517 break;
9518
9519 case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
9520 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
9521 RC = &AArch64::FPR128RegClass;
9522 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
9523 Opc = AArch64::FMLSv2i64_indexed;
9524 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9525 FMAInstKind::Indexed);
9526 } else {
9527 Opc = AArch64::FMLSv2f64;
9528 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9529 FMAInstKind::Accumulator);
9530 }
9531 break;
9532
9533 case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
9534 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
9535 RC = &AArch64::FPR128RegClass;
9536 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
9537 Opc = AArch64::FMLSv4i32_indexed;
9538 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9539 FMAInstKind::Indexed);
9540 } else {
9541 Opc = AArch64::FMLSv4f32;
9542 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9543 FMAInstKind::Accumulator);
9544 }
9545 break;
9546 case AArch64MachineCombinerPattern::FMLSv2f32_OP1:
9547 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
9548 RC = &AArch64::FPR64RegClass;
9549 Register NewVR = MRI.createVirtualRegister(RC);
9550 MachineInstrBuilder MIB1 =
9551 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
9552 .add(Root.getOperand(2));
9553 InsInstrs.push_back(MIB1);
9554 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9555 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
9556 Opc = AArch64::FMLAv2i32_indexed;
9557 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9558 FMAInstKind::Indexed, &NewVR);
9559 } else {
9560 Opc = AArch64::FMLAv2f32;
9561 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9562 FMAInstKind::Accumulator, &NewVR);
9563 }
9564 break;
9565 }
9566 case AArch64MachineCombinerPattern::FMLSv4f32_OP1:
9567 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
9568 RC = &AArch64::FPR128RegClass;
9569 Register NewVR = MRI.createVirtualRegister(RC);
9570 MachineInstrBuilder MIB1 =
9571 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
9572 .add(Root.getOperand(2));
9573 InsInstrs.push_back(MIB1);
9574 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9575 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
9576 Opc = AArch64::FMLAv4i32_indexed;
9577 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9578 FMAInstKind::Indexed, &NewVR);
9579 } else {
9580 Opc = AArch64::FMLAv4f32;
9581 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9582 FMAInstKind::Accumulator, &NewVR);
9583 }
9584 break;
9585 }
9586 case AArch64MachineCombinerPattern::FMLSv2f64_OP1:
9587 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
9588 RC = &AArch64::FPR128RegClass;
9589 Register NewVR = MRI.createVirtualRegister(RC);
9590 MachineInstrBuilder MIB1 =
9591 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
9592 .add(Root.getOperand(2));
9593 InsInstrs.push_back(MIB1);
9594 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9595 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
9596 Opc = AArch64::FMLAv2i64_indexed;
9597 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9598 FMAInstKind::Indexed, &NewVR);
9599 } else {
9600 Opc = AArch64::FMLAv2f64;
9601 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9602 FMAInstKind::Accumulator, &NewVR);
9603 }
9604 break;
9605 }
9606 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
9607 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: {
9608 unsigned IdxDupOp =
9609 (Pattern == AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1
9610 : 2;
9611 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
9612 &AArch64::FPR128RegClass, MRI);
9613 break;
9614 }
9615 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
9616 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: {
9617 unsigned IdxDupOp =
9618 (Pattern == AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1
9619 : 2;
9620 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
9621 &AArch64::FPR128RegClass, MRI);
9622 break;
9623 }
9624 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
9625 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: {
9626 unsigned IdxDupOp =
9627 (Pattern == AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1
9628 : 2;
9629 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
9630 &AArch64::FPR128_loRegClass, MRI);
9631 break;
9632 }
9633 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
9634 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: {
9635 unsigned IdxDupOp =
9636 (Pattern == AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1
9637 : 2;
9638 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
9639 &AArch64::FPR128RegClass, MRI);
9640 break;
9641 }
9642 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
9643 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: {
9644 unsigned IdxDupOp =
9645 (Pattern == AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1
9646 : 2;
9647 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
9648 &AArch64::FPR128_loRegClass, MRI);
9649 break;
9650 }
9651 case AArch64MachineCombinerPattern::FNMADD: {
9652 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
9653 break;
9654 }
9655 case AArch64MachineCombinerPattern::GATHER_LANE_i32: {
9656 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9657 Pattern, 4);
9658 break;
9659 }
9660 case AArch64MachineCombinerPattern::GATHER_LANE_i16: {
9661 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9662 Pattern, 8);
9663 break;
9664 }
9665 case AArch64MachineCombinerPattern::GATHER_LANE_i8: {
9666 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9667 Pattern, 16);
9668 break;
9669 }
9670
9671 } // end switch (Pattern)
9672 // Record MUL and ADD/SUB for deletion
9673 if (MUL)
9674 DelInstrs.push_back(MUL);
9675 DelInstrs.push_back(&Root);
9676
9677 // Set the flags on the inserted instructions to be the merged flags of the
9678 // instructions that we have combined.
9679 uint32_t Flags = Root.getFlags();
9680 if (MUL)
9681 Flags = Root.mergeFlagsWith(*MUL);
9682 for (auto *MI : InsInstrs)
9683 MI->setFlags(Flags);
9684}
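// Illustrative summary of the simplest rewrite above (MULADDW_OP1, with
// hypothetical registers):
//   mul w8, w0, w1
//   add w9, w8, w2   ==>   madd w9, w0, w1, w2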
9685
9686 /// Replace csinc-branch sequence by a simple conditional branch
9687///
9688/// Examples:
9689/// 1. \code
9690/// csinc w9, wzr, wzr, <condition code>
9691/// tbnz w9, #0, 0x44
9692/// \endcode
9693/// to
9694/// \code
9695/// b.<inverted condition code>
9696/// \endcode
9697///
9698/// 2. \code
9699/// csinc w9, wzr, wzr, <condition code>
9700/// tbz w9, #0, 0x44
9701/// \endcode
9702/// to
9703/// \code
9704/// b.<condition code>
9705/// \endcode
9706///
9707/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
9708/// compare's constant operand is power of 2.
9709///
9710/// Examples:
9711/// \code
9712/// and w8, w8, #0x400
9713/// cbnz w8, L1
9714/// \endcode
9715/// to
9716/// \code
9717/// tbnz w8, #10, L1
9718/// \endcode
9719///
9720/// \param MI Conditional Branch
9721/// \return True when the simple conditional branch is generated
9722 ///
9723 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
9724 bool IsNegativeBranch = false;
9725 bool IsTestAndBranch = false;
9726 unsigned TargetBBInMI = 0;
9727 switch (MI.getOpcode()) {
9728 default:
9729 llvm_unreachable("Unknown branch instruction?");
9730 case AArch64::Bcc:
9731 case AArch64::CBWPri:
9732 case AArch64::CBXPri:
9733 case AArch64::CBBAssertExt:
9734 case AArch64::CBHAssertExt:
9735 case AArch64::CBWPrr:
9736 case AArch64::CBXPrr:
9737 return false;
9738 case AArch64::CBZW:
9739 case AArch64::CBZX:
9740 TargetBBInMI = 1;
9741 break;
9742 case AArch64::CBNZW:
9743 case AArch64::CBNZX:
9744 TargetBBInMI = 1;
9745 IsNegativeBranch = true;
9746 break;
9747 case AArch64::TBZW:
9748 case AArch64::TBZX:
9749 TargetBBInMI = 2;
9750 IsTestAndBranch = true;
9751 break;
9752 case AArch64::TBNZW:
9753 case AArch64::TBNZX:
9754 TargetBBInMI = 2;
9755 IsNegativeBranch = true;
9756 IsTestAndBranch = true;
9757 break;
9758 }
9759 // So we increment a zero register and test for bits other
9760 // than bit 0? Conservatively bail out in case the verifier
9761 // missed this case.
9762 if (IsTestAndBranch && MI.getOperand(1).getImm())
9763 return false;
9764
9765 // Find Definition.
9766 assert(MI.getParent() && "Incomplete machine instruction\n");
9767 MachineBasicBlock *MBB = MI.getParent();
9768 MachineFunction *MF = MBB->getParent();
9769 MachineRegisterInfo *MRI = &MF->getRegInfo();
9770 Register VReg = MI.getOperand(0).getReg();
9771 if (!VReg.isVirtual())
9772 return false;
9773
9774 MachineInstr *DefMI = MRI->getVRegDef(VReg);
9775
9776 // Look through COPY instructions to find definition.
9777 while (DefMI->isCopy()) {
9778 Register CopyVReg = DefMI->getOperand(1).getReg();
9779 if (!MRI->hasOneNonDBGUse(CopyVReg))
9780 return false;
9781 if (!MRI->hasOneDef(CopyVReg))
9782 return false;
9783 DefMI = MRI->getVRegDef(CopyVReg);
9784 }
9785
9786 switch (DefMI->getOpcode()) {
9787 default:
9788 return false;
9789 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
9790 case AArch64::ANDWri:
9791 case AArch64::ANDXri: {
9792 if (IsTestAndBranch)
9793 return false;
9794 if (DefMI->getParent() != MBB)
9795 return false;
9796 if (!MRI->hasOneNonDBGUse(VReg))
9797 return false;
9798
9799 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
9800 uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
9801 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
9802 if (!isPowerOf2_64(Mask))
9803 return false;
9804
9805 MachineOperand &MO = DefMI->getOperand(1);
9806 Register NewReg = MO.getReg();
9807 if (!NewReg.isVirtual())
9808 return false;
9809
9810 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
9811
9812 MachineBasicBlock &RefToMBB = *MBB;
9813 MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
9814 DebugLoc DL = MI.getDebugLoc();
9815 unsigned Imm = Log2_64(Mask);
9816 unsigned Opc = (Imm < 32)
9817 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
9818 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
9819 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
9820 .addReg(NewReg)
9821 .addImm(Imm)
9822 .addMBB(TBB);
9823 // Register lives on to the TB(N)Z now.
9824 MO.setIsKill(false);
9825
9826 // For immediates smaller than 32, we must use the 32-bit
9827 // variant (W) in all cases, since the 64-bit variant cannot
9828 // encode them.
9829 // Therefore, if the input register is 64-bit, we need to take its
9830 // 32-bit sub-register.
9831 if (!Is32Bit && Imm < 32)
9832 NewMI->getOperand(0).setSubReg(AArch64::sub_32);
9833 MI.eraseFromParent();
9834 return true;
9835 }
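// The non-negated form folds the same way (illustrative):
//   and w8, w8, #0x400
//   cbz w8, L1   ==>   tbz w8, #10, L1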
9836 // Look for CSINC
9837 case AArch64::CSINCWr:
9838 case AArch64::CSINCXr: {
9839 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
9840 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
9841 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
9842 DefMI->getOperand(2).getReg() == AArch64::XZR))
9843 return false;
9844
9845 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
9846 true) != -1)
9847 return false;
9848
9849 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
9850 // Convert only when the condition code is not modified between
9851 // the CSINC and the branch. The CC may be used by other
9852 // instructions in between.
9853 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo()))
9854 return false;
9855 MachineBasicBlock &RefToMBB = *MBB;
9856 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
9857 DebugLoc DL = MI.getDebugLoc();
9858 if (IsNegativeBranch)
9859 CC = AArch64CC::getInvertedCondCode(CC);
9860 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
9861 MI.eraseFromParent();
9862 return true;
9863 }
9864 }
9865}
9866
9867std::pair<unsigned, unsigned>
9868AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9869 const unsigned Mask = AArch64II::MO_FRAGMENT;
9870 return std::make_pair(TF & Mask, TF & ~Mask);
9871}
9872
9873 ArrayRef<std::pair<unsigned, const char *>>
9874 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9875 using namespace AArch64II;
9876
9877 static const std::pair<unsigned, const char *> TargetFlags[] = {
9878 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
9879 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
9880 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
9881 {MO_HI12, "aarch64-hi12"}};
9882 return ArrayRef(TargetFlags);
9883}
9884
9885 ArrayRef<std::pair<unsigned, const char *>>
9886 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
9887 using namespace AArch64II;
9888
9889 static const std::pair<unsigned, const char *> TargetFlags[] = {
9890 {MO_COFFSTUB, "aarch64-coffstub"},
9891 {MO_GOT, "aarch64-got"},
9892 {MO_NC, "aarch64-nc"},
9893 {MO_S, "aarch64-s"},
9894 {MO_TLS, "aarch64-tls"},
9895 {MO_DLLIMPORT, "aarch64-dllimport"},
9896 {MO_PREL, "aarch64-prel"},
9897 {MO_TAGGED, "aarch64-tagged"},
9898 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
9899 };
9900 return ArrayRef(TargetFlags);
9901}
9902
9903 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9904 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9905 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9906 {{MOSuppressPair, "aarch64-suppress-pair"},
9907 {MOStridedAccess, "aarch64-strided-access"}};
9908 return ArrayRef(TargetFlags);
9909}
9910
9911/// Constants defining how certain sequences should be outlined.
9912/// This encompasses how an outlined function should be called, and what kind of
9913/// frame should be emitted for that outlined function.
9914///
9915/// \p MachineOutlinerDefault implies that the function should be called with
9916/// a save and restore of LR to the stack.
9917///
9918/// That is,
9919///
9920/// I1 Save LR OUTLINED_FUNCTION:
9921/// I2 --> BL OUTLINED_FUNCTION I1
9922/// I3 Restore LR I2
9923/// I3
9924/// RET
9925///
9926/// * Call construction overhead: 3 (save + BL + restore)
9927/// * Frame construction overhead: 1 (ret)
9928/// * Requires stack fixups? Yes
9929///
9930/// \p MachineOutlinerTailCall implies that the function is being created from
9931/// a sequence of instructions ending in a return.
9932///
9933/// That is,
9934///
9935/// I1 OUTLINED_FUNCTION:
9936/// I2 --> B OUTLINED_FUNCTION I1
9937/// RET I2
9938/// RET
9939///
9940/// * Call construction overhead: 1 (B)
9941/// * Frame construction overhead: 0 (Return included in sequence)
9942/// * Requires stack fixups? No
9943///
9944/// \p MachineOutlinerNoLRSave implies that the function should be called using
9945/// a BL instruction, but doesn't require LR to be saved and restored. This
9946/// happens when LR is known to be dead.
9947///
9948/// That is,
9949///
9950/// I1 OUTLINED_FUNCTION:
9951/// I2 --> BL OUTLINED_FUNCTION I1
9952/// I3 I2
9953/// I3
9954/// RET
9955///
9956/// * Call construction overhead: 1 (BL)
9957/// * Frame construction overhead: 1 (RET)
9958/// * Requires stack fixups? No
9959///
9960/// \p MachineOutlinerThunk implies that the function is being created from
9961/// a sequence of instructions ending in a call. The outlined function is
9962/// called with a BL instruction, and the outlined function tail-calls the
9963/// original call destination.
9964///
9965/// That is,
9966///
9967/// I1 OUTLINED_FUNCTION:
9968/// I2 --> BL OUTLINED_FUNCTION I1
9969/// BL f I2
9970/// B f
9971/// * Call construction overhead: 1 (BL)
9972/// * Frame construction overhead: 0
9973/// * Requires stack fixups? No
9974///
9975/// \p MachineOutlinerRegSave implies that the function should be called with a
9976/// save and restore of LR to an available register. This allows us to avoid
9977/// stack fixups. Note that this outlining variant is compatible with the
9978/// NoLRSave case.
9979///
9980/// That is,
9981///
9982/// I1 Save LR OUTLINED_FUNCTION:
9983/// I2 --> BL OUTLINED_FUNCTION I1
9984/// I3 Restore LR I2
9985/// I3
9986/// RET
9987///
9988/// * Call construction overhead: 3 (save + BL + restore)
9989/// * Frame construction overhead: 1 (ret)
9990 /// * Requires stack fixups? No
9991 enum MachineOutlinerClass {
9992 MachineOutlinerDefault, /// Emit a save, restore, call, and return.
9993 MachineOutlinerTailCall, /// Only emit a branch.
9994 MachineOutlinerNoLRSave, /// Emit a call and return.
9995 MachineOutlinerThunk, /// Emit a call and tail-call.
9996 MachineOutlinerRegSave /// Same as default, but save to a register.
9997};
9998
9999 enum MachineOutlinerMBBFlags {
10000 LRUnavailableSomewhere = 0x2,
10001 HasCalls = 0x4,
10002 UnsafeRegsDead = 0x8
10003 };
10004
10005 Register
10006 AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
10007 MachineFunction *MF = C.getMF();
10008 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
10009 const AArch64RegisterInfo *ARI =
10010 static_cast<const AArch64RegisterInfo *>(&TRI);
10011 // Check if there is an available register across the sequence that we can
10012 // use.
10013 for (unsigned Reg : AArch64::GPR64RegClass) {
10014 if (!ARI->isReservedReg(*MF, Reg) &&
10015 Reg != AArch64::LR && // LR is not reserved, but don't use it.
10016 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
10017 Reg != AArch64::X17 && // Ditto for X17.
10018 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
10019 C.isAvailableInsideSeq(Reg, TRI))
10020 return Reg;
10021 }
10022 return Register();
10023}
10024
10025 static bool
10026 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
10027 const outliner::Candidate &b) {
10028 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
10029 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
10030
10031 return MFIa->getSignReturnAddressCondition() ==
10032 MFIb->getSignReturnAddressCondition();
10033 }
10034
10035 static bool
10036 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
10037 const outliner::Candidate &b) {
10038 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
10039 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
10040
10041 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
10042}
10043
10044 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
10045 const outliner::Candidate &b) {
10046 const AArch64Subtarget &SubtargetA =
10047 a.getMF()->getSubtarget<AArch64Subtarget>();
10048 const AArch64Subtarget &SubtargetB =
10049 b.getMF()->getSubtarget<AArch64Subtarget>();
10050 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
10051}
10052
10053std::optional<std::unique_ptr<outliner::OutlinedFunction>>
10054AArch64InstrInfo::getOutliningCandidateInfo(
10055 const MachineModuleInfo &MMI,
10056 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
10057 unsigned MinRepeats) const {
10058 unsigned SequenceSize = 0;
10059 for (auto &MI : RepeatedSequenceLocs[0])
10060 SequenceSize += getInstSizeInBytes(MI);
10061
10062 unsigned NumBytesToCreateFrame = 0;
10063
10064 // Avoid splitting ADRP ADD/LDR pair into outlined functions.
10065 // These instructions are fused together by the scheduler.
10066 // Any candidate where ADRP is the last instruction should be rejected
10067 // as that will lead to splitting ADRP pair.
10068 MachineInstr &LastMI = RepeatedSequenceLocs[0].back();
10069 MachineInstr &FirstMI = RepeatedSequenceLocs[0].front();
10070 if (LastMI.getOpcode() == AArch64::ADRP &&
10071 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_PAGE) != 0 &&
10072 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10073 return std::nullopt;
10074 }
10075
10076 // Similarly any candidate where the first instruction is ADD/LDR with a
10077 // page offset should be rejected to avoid ADRP splitting.
10078 if ((FirstMI.getOpcode() == AArch64::ADDXri ||
10079 FirstMI.getOpcode() == AArch64::LDRXui) &&
10080 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_PAGEOFF) != 0 &&
10081 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) != 0) {
10082 return std::nullopt;
10083 }
10084
10085 // We only allow outlining for functions having exactly matching return
10086 // address signing attributes, i.e., all share the same value for the
10087 // attribute "sign-return-address" and all share the same type of key they
10088 // are signed with.
10089 // Additionally we require all functions to simultaneously either support
10090 // v8.3a features or not. Otherwise an outlined function could get signed
10091 // using dedicated v8.3 instructions and a call from a function that doesn't
10092 // support v8.3 instructions would therefore be invalid.
10093 if (std::adjacent_find(
10094 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
10095 [](const outliner::Candidate &a, const outliner::Candidate &b) {
10096 // Return true if a and b are non-equal w.r.t. return address
10097 // signing or support of v8.3a features
10098 if (outliningCandidatesSigningScopeConsensus(a, b) &&
10099 outliningCandidatesSigningKeyConsensus(a, b) &&
10100 outliningCandidatesV8_3OpsConsensus(a, b)) {
10101 return false;
10102 }
10103 return true;
10104 }) != RepeatedSequenceLocs.end()) {
10105 return std::nullopt;
10106 }
10107
10108 // Since at this point all candidates agree on their return address signing
10109 // picking just one is fine. If the candidate functions potentially sign their
10110 // return addresses, the outlined function should do the same. Note that in
10111 // the case of "sign-return-address"="non-leaf" this is an assumption: It is
10112 // not certainly true that the outlined function will have to sign its return
10113 // address but this decision is made later, when the decision to outline
10114 // has already been made.
10115 // The same holds for the number of additional instructions we need: On
10116 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
10117 // necessary. However, at this point we don't know if the outlined function
10118 // will have a RET instruction so we assume the worst.
10119 const TargetRegisterInfo &TRI = getRegisterInfo();
10120 // Performing a tail call may require extra checks when PAuth is enabled.
10121 // If PAuth is disabled, set it to zero for uniformity.
10122 unsigned NumBytesToCheckLRInTCEpilogue = 0;
10123 const auto RASignCondition = RepeatedSequenceLocs[0]
10124 .getMF()
10125 ->getInfo<AArch64FunctionInfo>()
10126 ->getSignReturnAddressCondition();
10127 if (RASignCondition != SignReturnAddress::None) {
10128 // One PAC and one AUT instructions
10129 NumBytesToCreateFrame += 8;
10130
10131 // PAuth is enabled - set extra tail call cost, if any.
10132 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
10133 *RepeatedSequenceLocs[0].getMF());
10134 NumBytesToCheckLRInTCEpilogue =
10135 AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod);
10136 // Checking the authenticated LR value may significantly impact
10137 // SequenceSize, so account for it for more precise results.
10138 if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
10139 SequenceSize += NumBytesToCheckLRInTCEpilogue;
10140
10141 // We have to check if SP-modifying instructions would get outlined.
10142 // If so, we only allow outlining if SP is unchanged overall, so matching
10143 // sub and add instructions are okay to outline; all other SP modifications
10144 // are not.
10145 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
10146 int SPValue = 0;
10147 for (auto &MI : C) {
10148 if (MI.modifiesRegister(AArch64::SP, &TRI)) {
10149 switch (MI.getOpcode()) {
10150 case AArch64::ADDXri:
10151 case AArch64::ADDWri:
10152 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10153 assert(MI.getOperand(2).isImm() &&
10154 "Expected operand to be immediate");
10155 assert(MI.getOperand(1).isReg() &&
10156 "Expected operand to be a register");
10157 // Check if the add just increments sp. If so, we search for
10158 // matching sub instructions that decrement sp. If not, the
10159 // modification is illegal
10160 if (MI.getOperand(1).getReg() == AArch64::SP)
10161 SPValue += MI.getOperand(2).getImm();
10162 else
10163 return true;
10164 break;
10165 case AArch64::SUBXri:
10166 case AArch64::SUBWri:
10167 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10168 assert(MI.getOperand(2).isImm() &&
10169 "Expected operand to be immediate");
10170 assert(MI.getOperand(1).isReg() &&
10171 "Expected operand to be a register");
10172 // Check if the sub just decrements sp. If so, we search for
10173 // matching add instructions that increment sp. If not, the
10174 // modification is illegal
10175 if (MI.getOperand(1).getReg() == AArch64::SP)
10176 SPValue -= MI.getOperand(2).getImm();
10177 else
10178 return true;
10179 break;
10180 default:
10181 return true;
10182 }
10183 }
10184 }
10185 if (SPValue)
10186 return true;
10187 return false;
10188 };
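// E.g. a balanced "sub sp, sp, #16 ... add sp, sp, #16" pair nets SPValue
// back to zero and remains outlinable; any other SP writer, or an
// unbalanced adjustment, disqualifies the candidate.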
10189 // Remove candidates with illegal stack modifying instructions
10190 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
10191
10192 // If the sequence doesn't have enough candidates left, then we're done.
10193 if (RepeatedSequenceLocs.size() < MinRepeats)
10194 return std::nullopt;
10195 }
10196
10197 // Properties about candidate MBBs that hold for all of them.
10198 unsigned FlagsSetInAll = 0xF;
10199
10200 // Compute liveness information for each candidate, and set FlagsSetInAll.
10201 for (outliner::Candidate &C : RepeatedSequenceLocs)
10202 FlagsSetInAll &= C.Flags;
10203
10204 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
10205
10206 // Helper lambda which sets call information for every candidate.
10207 auto SetCandidateCallInfo =
10208 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
10209 for (outliner::Candidate &C : RepeatedSequenceLocs)
10210 C.setCallInfo(CallID, NumBytesForCall);
10211 };
10212
10213 unsigned FrameID = MachineOutlinerDefault;
10214 NumBytesToCreateFrame += 4;
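// (The 4 bytes added here account for the RET terminating the default
// outlined frame, per the frame-overhead notes above.)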
10215
10216 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
10217 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
10218 });
10219
10220 // We check to see if CFI Instructions are present, and if they are
10221 // we find the number of CFI Instructions in the candidates.
10222 unsigned CFICount = 0;
10223 for (auto &I : RepeatedSequenceLocs[0]) {
10224 if (I.isCFIInstruction())
10225 CFICount++;
10226 }
10227
10228 // We compare the number of found CFI Instructions to the number of CFI
10229 // instructions in the parent function for each candidate. We must check this
10230 // since if we outline one of the CFI instructions in a function, we have to
10231 // outline them all for correctness. If we do not, the address offsets will be
10232 // incorrect between the two sections of the program.
10233 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10234 std::vector<MCCFIInstruction> CFIInstructions =
10235 C.getMF()->getFrameInstructions();
10236
10237 if (CFICount > 0 && CFICount != CFIInstructions.size())
10238 return std::nullopt;
10239 }
10240
10241 // Returns true if an instruction is safe to fix up, false otherwise.
10242 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
10243 if (MI.isCall())
10244 return true;
10245
10246 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
10247 !MI.readsRegister(AArch64::SP, &TRI))
10248 return true;
10249
10250 // Any modification of SP will break our code to save/restore LR.
10251 // FIXME: We could handle some instructions which add a constant
10252 // offset to SP, with a bit more work.
10253 if (MI.modifiesRegister(AArch64::SP, &TRI))
10254 return false;
10255
10256 // At this point, we have a stack instruction that we might need to
10257 // fix up. We'll handle it if it's a load or store.
10258 if (MI.mayLoadOrStore()) {
10259 const MachineOperand *Base; // Filled with the base operand of MI.
10260 int64_t Offset; // Filled with the offset of MI.
10261 bool OffsetIsScalable;
10262
10263 // Does it allow us to offset the base operand and is the base the
10264 // register SP?
10265 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
10266 !Base->isReg() || Base->getReg() != AArch64::SP)
10267 return false;
10268
10269 // Fix-up code below assumes bytes.
10270 if (OffsetIsScalable)
10271 return false;
10272
10273 // Find the minimum/maximum offset for this instruction and check
10274 // if fixing it up would be in range.
10275 int64_t MinOffset,
10276 MaxOffset; // Unscaled offsets for the instruction.
10277 // The scale to multiply the offsets by.
10278 TypeSize Scale(0U, false), DummyWidth(0U, false);
10279 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
10280
10281 Offset += 16; // Update the offset to what it would be if we outlined.
10282 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
10283 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
10284 return false;
10285
10286 // It's in range, so we can outline it.
10287 return true;
10288 }
10289
10290 // FIXME: Add handling for instructions like "add x0, sp, #8".
10291
10292 // We can't fix it up, so don't outline it.
10293 return false;
10294 };
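// E.g. an "ldr x0, [sp, #8]" in the candidate must still encode as
// "ldr x0, [sp, #24]" once the outlined frame's 16-byte LR save shifts SP,
// which is what the MinOffset/MaxOffset range check above verifies.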
10295
10296 // True if it's possible to fix up each stack instruction in this sequence.
10297 // Important for frames/call variants that modify the stack.
10298 bool AllStackInstrsSafe =
10299 llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);
10300
10301 // If the last instruction in any candidate is a terminator, then we should
10302 // tail call all of the candidates.
10303 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10304 FrameID = MachineOutlinerTailCall;
10305 NumBytesToCreateFrame = 0;
10306 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
10307 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
10308 }
10309
10310 else if (LastInstrOpcode == AArch64::BL ||
10311 ((LastInstrOpcode == AArch64::BLR ||
10312 LastInstrOpcode == AArch64::BLRNoIP) &&
10313 !HasBTI)) {
10314 // FIXME: Do we need to check if the code after this uses the value of LR?
10315 FrameID = MachineOutlinerThunk;
10316 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
10317 SetCandidateCallInfo(MachineOutlinerThunk, 4);
10318 }
10319
10320 else {
10321 // We need to decide how to emit calls + frames. We can always emit the same
10322 // frame if we don't need to save to the stack. If we have to save to the
10323 // stack, then we need a different frame.
10324 unsigned NumBytesNoStackCalls = 0;
10325 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
10326
10327 // Check if we have to save LR.
10328 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10329 bool LRAvailable =
10330 (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
10331 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
10332 : true;
10333 // If we have a noreturn caller, then we're going to be conservative and
10334 // say that we have to save LR. If we don't have a ret at the end of the
10335 // block, then we can't reason about liveness accurately.
10336 //
10337 // FIXME: We can probably do better than always disabling this in
10338 // noreturn functions by fixing up the liveness info.
10339 bool IsNoReturn =
10340 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
10341
10342 // Is LR available? If so, we don't need a save.
10343 if (LRAvailable && !IsNoReturn) {
10344 NumBytesNoStackCalls += 4;
10345 C.setCallInfo(MachineOutlinerNoLRSave, 4);
10346 CandidatesWithoutStackFixups.push_back(C);
10347 }
10348
10349 // Is an unused register available? If so, we won't modify the stack, so
10350 // we can outline with the same frame type as those that don't save LR.
10351 else if (findRegisterToSaveLRTo(C)) {
10352 NumBytesNoStackCalls += 12;
10353 C.setCallInfo(MachineOutlinerRegSave, 12);
10354 CandidatesWithoutStackFixups.push_back(C);
10355 }
10356
10357 // Is SP used in the sequence at all? If not, we don't have to modify
10358 // the stack, so we are guaranteed to get the same frame.
10359 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
10360 NumBytesNoStackCalls += 12;
10361 C.setCallInfo(MachineOutlinerDefault, 12);
10362 CandidatesWithoutStackFixups.push_back(C);
10363 }
10364
10365 // If we outline this, we need to modify the stack. Pretend we don't
10366 // outline this by saving all of its bytes.
10367 else {
10368 NumBytesNoStackCalls += SequenceSize;
10369 }
10370 }
10371
10372 // If there are no places where we have to save LR, then note that we
10373 // don't have to update the stack. Otherwise, give every candidate the
10374 // default call type, as long as it's safe to do so.
10375 if (!AllStackInstrsSafe ||
10376 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
10377 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
10378 FrameID = MachineOutlinerNoLRSave;
10379 if (RepeatedSequenceLocs.size() < MinRepeats)
10380 return std::nullopt;
10381 } else {
10382 SetCandidateCallInfo(MachineOutlinerDefault, 12);
10383
10384 // Bugzilla ID: 46767
10385 // TODO: Check if fixing up the stack more than once is safe so we can
10386 // outline these.
10387 //
10388 // An outline resulting in a caller that requires stack fixups at the
10389 // callsite to a callee that also requires stack fixups can happen when
10390 // there are no available registers at the candidate callsite for a
10391 // candidate that itself also has calls.
10392 //
10393 // In other words if function_containing_sequence in the following pseudo
10394 // assembly requires that we save LR at the point of the call, but there
10395 // are no available registers: in this case we save using SP and as a
10396 // result the SP offsets require stack fixups by multiples of 16.
10397 //
10398 // function_containing_sequence:
10399 // ...
10400 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10401 // call OUTLINED_FUNCTION_N
10402 // restore LR from SP
10403 // ...
10404 //
10405 // OUTLINED_FUNCTION_N:
10406 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10407 // ...
10408 // bl foo
10409 // restore LR from SP
10410 // ret
10411 //
10412 // Because the code to handle more than one stack fixup does not
10413 // currently have the proper checks for legality, these cases will assert
10414 // in the AArch64 MachineOutliner. This is because the code to do this
10415 // needs more hardening, testing, better checks that generated code is
10416 // legal, etc and because it is only verified to handle a single pass of
10417 // stack fixup.
10418 //
10419 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
10420 // these cases until they are known to be handled. Bugzilla 46767 is
10421 // referenced in comments at the assert site.
10422 //
10423 // To avoid asserting (or generating non-legal code on noassert builds)
10424 // we remove all candidates which would need more than one stack fixup by
10425 // pruning the cases where the candidate has calls while also having no
10426 // available LR and having no available general purpose registers to copy
10427 // LR to (ie one extra stack save/restore).
10428 //
10429 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10430 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
10431 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
10432 return (llvm::any_of(C, IsCall)) &&
10433 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
10434 !findRegisterToSaveLRTo(C));
10435 });
10436 }
10437 }
10438
10439 // If we dropped all of the candidates, bail out here.
10440 if (RepeatedSequenceLocs.size() < MinRepeats)
10441 return std::nullopt;
10442 }
10443
10444 // Does every candidate's MBB contain a call? If so, then we might have a call
10445 // in the range.
10446 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10447 // Check if the range contains a call. These require a save + restore of the
10448 // link register.
10449 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
10450 bool ModStackToSaveLR = false;
10451 if (any_of(drop_end(FirstCand),
10452 [](const MachineInstr &MI) { return MI.isCall(); }))
10453 ModStackToSaveLR = true;
10454
10455 // Handle the last instruction separately. If this is a tail call, then the
10456 // last instruction is a call. We don't want to save + restore in this case.
10457 // However, it could be possible that the last instruction is a call without
10458 // it being valid to tail call this sequence. We should consider this as
10459 // well.
10460 else if (FrameID != MachineOutlinerThunk &&
10461 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
10462 ModStackToSaveLR = true;
10463
10464 if (ModStackToSaveLR) {
10465 // We can't fix up the stack. Bail out.
10466 if (!AllStackInstrsSafe)
10467 return std::nullopt;
10468
10469 // Save + restore LR.
10470 NumBytesToCreateFrame += 8;
10471 }
10472 }
10473
10474 // If we have CFI instructions, we can only outline if the outlined section
10475 // can be a tail call
10476 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
10477 return std::nullopt;
10478
10479 return std::make_unique<outliner::OutlinedFunction>(
10480 RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID);
10481}
10482
10483void AArch64InstrInfo::mergeOutliningCandidateAttributes(
10484 Function &F, std::vector<outliner::Candidate> &Candidates) const {
10485 // If a bunch of candidates reach this point they must agree on their return
10486 // address signing. It is therefore enough to just consider the signing
10487 // behaviour of one of them
10488 const auto &CFn = Candidates.front().getMF()->getFunction();
10489
10490 if (CFn.hasFnAttribute("ptrauth-returns"))
10491 F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
10492 if (CFn.hasFnAttribute("ptrauth-auth-traps"))
10493 F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
10494 // Since all candidates belong to the same module, just copy the
10495 // function-level attributes of an arbitrary function.
10496 if (CFn.hasFnAttribute("sign-return-address"))
10497 F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
10498 if (CFn.hasFnAttribute("sign-return-address-key"))
10499 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
10500
10501 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
10502}
10503
10504bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
10505 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10506 const Function &F = MF.getFunction();
10507
10508 // Can F be deduplicated by the linker? If it can, don't outline from it.
10509 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10510 return false;
10511
10512 // Don't outline from functions with section markings; the program could
10513 // expect that all the code is in the named section.
10514 // FIXME: Allow outlining from multiple functions with the same section
10515 // marking.
10516 if (F.hasSection())
10517 return false;
10518
10519 // Outlining from functions with redzones is unsafe since the outliner may
10520 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
10521 // outline from it.
10522 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
10523 if (!AFI || AFI->hasRedZone().value_or(true))
10524 return false;
10525
10526 // FIXME: Determine whether it is safe to outline from functions which contain
10527 // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
10528 // outlined together and ensure it is safe to outline with async unwind info,
10529 // required for saving & restoring VG around calls.
10530 if (AFI->hasStreamingModeChanges())
10531 return false;
10532
10533 // FIXME: Teach the outliner to generate/handle Windows unwind info.
10534 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
10535 return false;
10536
10537 // It's safe to outline from MF.
10538 return true;
10539}
10540
10541 SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
10542 AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
10543 unsigned &Flags) const {
10544 assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
10545 "Must track liveness!");
10546 SmallVector<
10547 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
10548 Ranges;
10549 // According to the AArch64 Procedure Call Standard, the following are
10550 // undefined on entry/exit from a function call:
10551 //
10552 // * Registers x16, x17, (and thus w16, w17)
10553 // * Condition codes (and thus the NZCV register)
10554 //
10555 // If any of these registers are used inside or live across an outlined
10556 // function, then they may be modified later, either by the compiler or
10557 // some other tool (like the linker).
10558 //
10559 // To avoid outlining in these situations, partition each block into ranges
10560 // where these registers are dead. We will only outline from those ranges.
10561 LiveRegUnits LRU(getRegisterInfo());
10562 auto AreAllUnsafeRegsDead = [&LRU]() {
10563 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
10564 LRU.available(AArch64::NZCV);
10565 };
10566
10567 // We need to know if LR is live across an outlining boundary later on in
10568 // order to decide how we'll create the outlined call, frame, etc.
10569 //
10570 // It's pretty expensive to check this for *every candidate* within a block.
10571 // That's some potentially n^2 behaviour, since in the worst case, we'd need
10572 // to compute liveness from the end of the block for O(n) candidates within
10573 // the block.
10574 //
10575 // So, to improve the average case, let's keep track of liveness from the end
10576 // of the block to the beginning of *every outlinable range*. If we know that
10577 // LR is available in every range we could outline from, then we know that
10578 // we don't need to check liveness for any candidate within that range.
10579 bool LRAvailableEverywhere = true;
10580 // Compute liveness bottom-up.
10581 LRU.addLiveOuts(MBB);
10582 // Update flags that require info about the entire MBB.
10583 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
10584 if (MI.isCall() && !MI.isTerminator())
10585 Flags |= MachineOutlinerMBBFlags::HasCalls;
10586 };
10587 // Range: [RangeBegin, RangeEnd)
10588 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
10589 unsigned RangeLen;
10590 auto CreateNewRangeStartingAt =
10591 [&RangeBegin, &RangeEnd,
10592 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
10593 RangeBegin = NewBegin;
10594 RangeEnd = std::next(RangeBegin);
10595 RangeLen = 0;
10596 };
10597 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
10598 // At least one unsafe register is not dead. We do not want to outline at
10599 // this point. If it is long enough to outline from and does not cross a
10600 // bundle boundary, save the range [RangeBegin, RangeEnd).
10601 if (RangeLen <= 1)
10602 return;
10603 if (!RangeBegin.isEnd() && RangeBegin->isBundledWithPred())
10604 return;
10605 if (!RangeEnd.isEnd() && RangeEnd->isBundledWithPred())
10606 return;
10607 Ranges.emplace_back(RangeBegin, RangeEnd);
10608 };
10609 // Find the first point where all unsafe registers are dead.
10610 // FIND: <safe instr> <-- end of first potential range
10611 // SKIP: <unsafe def>
10612 // SKIP: ... everything between ...
10613 // SKIP: <unsafe use>
10614 auto FirstPossibleEndPt = MBB.instr_rbegin();
10615 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
10616 LRU.stepBackward(*FirstPossibleEndPt);
10617 // Update flags that impact how we outline across the entire block,
10618 // regardless of safety.
10619 UpdateWholeMBBFlags(*FirstPossibleEndPt);
10620 if (AreAllUnsafeRegsDead())
10621 break;
10622 }
10623 // If we exhausted the entire block, we have no safe ranges to outline.
10624 if (FirstPossibleEndPt == MBB.instr_rend())
10625 return Ranges;
10626 // Current range.
10627 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
10628 // FirstPossibleEndPt points to the first place where all unsafe registers
10629 // are dead (if there is any such point). Begin partitioning the MBB into
10630 // ranges.
10631 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
10632 LRU.stepBackward(MI);
10633 UpdateWholeMBBFlags(MI);
10634 if (!AreAllUnsafeRegsDead()) {
10635 SaveRangeIfNonEmpty();
10636 CreateNewRangeStartingAt(MI.getIterator());
10637 continue;
10638 }
10639 LRAvailableEverywhere &= LRU.available(AArch64::LR);
10640 RangeBegin = MI.getIterator();
10641 ++RangeLen;
10642 }
10643 // Above loop misses the last (or only) range. If we are still safe, then
10644 // let's save the range.
10645 if (AreAllUnsafeRegsDead())
10646 SaveRangeIfNonEmpty();
10647 if (Ranges.empty())
10648 return Ranges;
10649 // We found the ranges bottom-up, but the mapping expects them top-down, so
10650 // reverse the order.
10651 std::reverse(Ranges.begin(), Ranges.end());
10652 // If there is at least one outlinable range where LR is unavailable
10653 // somewhere, remember that.
10654 if (!LRAvailableEverywhere)
10655 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
10656 return Ranges;
10657}
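
// Illustrative sketch (not in the LLVM source) of the partitioning performed
// above, reduced to a bool-per-instruction array ("true" = some unsafe
// register is live). A run survives only if it is longer than one
// instruction, mirroring the RangeLen <= 1 rejection; the bundle-boundary
// checks are omitted. Assumes <vector> and <utility>; names are hypothetical.
#if 0
static std::vector<std::pair<int, int>>
partitionSafeRuns(const std::vector<bool> &UnsafeRegAlive) {
  std::vector<std::pair<int, int>> Runs;
  for (int I = 0, N = (int)UnsafeRegAlive.size(); I < N;) {
    if (UnsafeRegAlive[I]) {
      ++I;
      continue;
    }
    int Begin = I;
    while (I < N && !UnsafeRegAlive[I])
      ++I;
    if (I - Begin > 1)
      Runs.emplace_back(Begin, I); // half-open range [Begin, I)
  }
  return Runs;
}
#endif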
10658
10659outliner::InstrType
10660AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
10661 MachineBasicBlock::iterator &MIT,
10662 unsigned Flags) const {
10663 MachineInstr &MI = *MIT;
10664
10665 // Don't outline anything used for return address signing. The outlined
10666 // function will get signed later if needed
10667 switch (MI.getOpcode()) {
10668 case AArch64::PACM:
10669 case AArch64::PACIASP:
10670 case AArch64::PACIBSP:
10671 case AArch64::PACIASPPC:
10672 case AArch64::PACIBSPPC:
10673 case AArch64::AUTIASP:
10674 case AArch64::AUTIBSP:
10675 case AArch64::AUTIASPPCi:
10676 case AArch64::AUTIASPPCr:
10677 case AArch64::AUTIBSPPCi:
10678 case AArch64::AUTIBSPPCr:
10679 case AArch64::RETAA:
10680 case AArch64::RETAB:
10681 case AArch64::RETAASPPCi:
10682 case AArch64::RETAASPPCr:
10683 case AArch64::RETABSPPCi:
10684 case AArch64::RETABSPPCr:
10685 case AArch64::EMITBKEY:
10686 case AArch64::PAUTH_PROLOGUE:
10687 case AArch64::PAUTH_EPILOGUE:
10688 return outliner::InstrType::Illegal;
10689 }
10690
10691 // We can only outline these if we will tail call the outlined function, or
10692 // fix up the CFI offsets. Currently, CFI instructions are outlined only if
10693 // in a tail call.
10694 //
10695 // FIXME: If the proper fixups for the offset are implemented, this should be
10696 // possible.
10697 if (MI.isCFIInstruction())
10698 return outliner::InstrType::Legal;
10699
10700 // Is this a terminator for a basic block?
10701 if (MI.isTerminator())
10702 // TargetInstrInfo::getOutliningType has already filtered out anything
10703 // that would break this, so we can allow it here.
10704 return outliner::InstrType::Legal;
10705
10706 // Make sure none of the operands are un-outlinable.
10707 for (const MachineOperand &MOP : MI.operands()) {
10708 // A check preventing CFI indices was here before, but only CFI
10709 // instructions should have those.
10710 assert(!MOP.isCFIIndex());
10711
10712 // If it uses LR or W30 explicitly, then don't touch it.
10713 if (MOP.isReg() && !MOP.isImplicit() &&
10714 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
10715 return outliner::InstrType::Illegal;
10716 }
10717
10718 // Special cases for instructions that can always be outlined, but will fail
10719 // the later tests. E.g., ADRPs, which are PC-relative and use LR, can always
10720 // be outlined because they don't require a *specific* value to be in LR.
10721 if (MI.getOpcode() == AArch64::ADRP)
10722 return outliner::InstrType::Legal;
10723
10724 // If MI is a call we might be able to outline it. We don't want to outline
10725 // any calls that rely on the position of items on the stack. When we outline
10726 // something containing a call, we have to emit a save and restore of LR in
10727 // the outlined function. Currently, this always happens by saving LR to the
10728 // stack. Thus, if we outline, say, half the parameters for a function call
10729 // plus the call, then we'll break the callee's expectations for the layout
10730 // of the stack.
10731 //
10732 // FIXME: Allow calls to functions which construct a stack frame, as long
10733 // as they don't access arguments on the stack.
10734 // FIXME: Figure out some way to analyze functions defined in other modules.
10735 // We should be able to compute the memory usage based on the IR calling
10736 // convention, even if we can't see the definition.
10737 if (MI.isCall()) {
10738 // Get the function associated with the call. Look at each operand and find
10739 // the one that represents the callee and get its name.
10740 const Function *Callee = nullptr;
10741 for (const MachineOperand &MOP : MI.operands()) {
10742 if (MOP.isGlobal()) {
10743 Callee = dyn_cast<Function>(MOP.getGlobal());
10744 break;
10745 }
10746 }
10747
10748 // Never outline calls to mcount. There isn't any rule that would require
10749 // this, but the Linux kernel's "ftrace" feature depends on it.
10750 if (Callee && Callee->getName() == "\01_mcount")
10751 return outliner::InstrType::Illegal;
10752
10753 // If we don't know anything about the callee, assume it depends on the
10754 // stack layout of the caller. In that case, it's only legal to outline
10755 // as a tail-call. Explicitly list the call instructions we know about so we
10756 // don't get unexpected results with call pseudo-instructions.
10757 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
10758 if (MI.getOpcode() == AArch64::BLR ||
10759 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
10760 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
10761
10762 if (!Callee)
10763 return UnknownCallOutlineType;
10764
10765 // We have a function we have information about. Check if it's something we
10766 // can safely outline.
10767 MachineFunction *CalleeMF = MMI.getMachineFunction(*Callee);
10768
10769 // We don't know what's going on with the callee at all. Don't touch it.
10770 if (!CalleeMF)
10771 return UnknownCallOutlineType;
10772
10773 // Check if we know anything about the callee saves on the function. If we
10774 // don't, then don't touch it, since that implies that we haven't
10775 // computed anything about its stack frame yet.
10776 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
10777 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
10778 MFI.getNumObjects() > 0)
10779 return UnknownCallOutlineType;
10780
10781 // At this point, we can say that CalleeMF ought not to pass anything on the
10782 // stack. Therefore, we can outline it.
10783 return outliner::InstrType::Legal;
10784 }
10785
10786 // Don't touch the link register or W30.
10787 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
10788 MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
10789 return outliner::InstrType::Illegal;
10790
10791 // Don't outline BTI instructions, because that will prevent the outlining
10792 // site from being indirectly callable.
10793 if (hasBTISemantics(MI))
10794 return outliner::InstrType::Illegal;
10795
10796 return outliner::InstrType::Legal;
10797}
10798
10799void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
10800 for (MachineInstr &MI : MBB) {
10801 const MachineOperand *Base;
10802 TypeSize Width(0, false);
10803 int64_t Offset;
10804 bool OffsetIsScalable;
10805
10806 // Is this a load or store with an immediate offset with SP as the base?
10807 if (!MI.mayLoadOrStore() ||
10808 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
10809 &RI) ||
10810 (Base->isReg() && Base->getReg() != AArch64::SP))
10811 continue;
10812
10813 // It is, so we have to fix it up.
10814 TypeSize Scale(0U, false);
10815 int64_t Dummy1, Dummy2;
10816
10817 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
10818 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
10819 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
10820 assert(Scale != 0 && "Unexpected opcode!");
10821 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
10822
10823 // We've pushed the return address to the stack, so add 16 to the offset.
10824 // This is safe, since we already checked if it would overflow when we
10825 // checked if this instruction was legal to outline.
10826 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
10827 StackOffsetOperand.setImm(NewImm);
10828 }
10829}
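
// Illustrative sketch (not in the LLVM source): the re-encoding above, reduced
// to plain integers. A scaled load/store encodes its immediate as
// ByteOffset / Scale, so after the 16-byte LR spill the encoded immediate
// becomes (ByteOffset + 16) / Scale. The helper name is hypothetical.
#if 0
static long long rescaleSPRelImm(long long ByteOffset, long long Scale) {
  // E.g. an 8-byte-scaled load at [SP, #24] becomes (24 + 16) / 8 = 5.
  return (ByteOffset + 16) / Scale;
}
#endif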
10830
10831static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
10832 const AArch64InstrInfo *TII,
10833 bool ShouldSignReturnAddr) {
10834 if (!ShouldSignReturnAddr)
10835 return;
10836
10837 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
10838 .setMIFlag(MachineInstr::FrameSetup);
10839 TII->createPauthEpilogueInstr(MBB, DebugLoc());
10840}
10841
10842void AArch64InstrInfo::buildOutlinedFrame(
10843 MachineBasicBlock &MBB, MachineFunction &MF,
10844 const outliner::OutlinedFunction &OF) const {
10845
10846 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
10847
10848 if (OF.FrameConstructionID == MachineOutlinerTailCall)
10849 FI->setOutliningStyle("Tail Call");
10850 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
10851 // For thunk outlining, rewrite the last instruction from a call to a
10852 // tail-call.
10853 MachineInstr *Call = &*--MBB.instr_end();
10854 unsigned TailOpcode;
10855 if (Call->getOpcode() == AArch64::BL) {
10856 TailOpcode = AArch64::TCRETURNdi;
10857 } else {
10858 assert(Call->getOpcode() == AArch64::BLR ||
10859 Call->getOpcode() == AArch64::BLRNoIP);
10860 TailOpcode = AArch64::TCRETURNriALL;
10861 }
10862 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
10863 .add(Call->getOperand(0))
10864 .addImm(0);
10865 MBB.insert(MBB.end(), TC);
10866 Call->eraseFromParent();
10867
10868 FI->setOutliningStyle("Thunk");
10869 }
10870
10871 bool IsLeafFunction = true;
10872
10873 // Is there a call in the outlined range?
10874 auto IsNonTailCall = [](const MachineInstr &MI) {
10875 return MI.isCall() && !MI.isReturn();
10876 };
10877
10878 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
10879 // Fix up the instructions in the range, since we're going to modify the
10880 // stack.
10881
10882 // Bugzilla ID: 46767
10883 // TODO: Check if fixing up twice is safe so we can outline these.
10884 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
10885 "Can only fix up stack references once");
10886 fixupPostOutline(MBB);
10887
10888 IsLeafFunction = false;
10889
10890 // LR has to be a live in so that we can save it.
10891 if (!MBB.isLiveIn(AArch64::LR))
10892 MBB.addLiveIn(AArch64::LR);
10893
10894 MachineBasicBlock::iterator It = MBB.begin();
10895 MachineBasicBlock::iterator Et = MBB.end();
10896
10897 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
10898 OF.FrameConstructionID == MachineOutlinerThunk)
10899 Et = std::prev(MBB.end());
10900
10901 // Insert a save before the outlined region
10902 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
10903 .addReg(AArch64::SP, RegState::Define)
10904 .addReg(AArch64::LR)
10905 .addReg(AArch64::SP)
10906 .addImm(-16);
10907 It = MBB.insert(It, STRXpre);
10908
10909 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
10910 CFIInstBuilder CFIBuilder(MBB, It, MachineInstr::FrameSetup);
10911
10912 // Add a CFI saying the stack was moved 16 B down.
10913 CFIBuilder.buildDefCFAOffset(16);
10914
10915 // Add a CFI saying that the LR that we want to find is now 16 B higher
10916 // than before.
10917 CFIBuilder.buildOffset(AArch64::LR, -16);
10918 }
10919
10920 // Insert a restore before the terminator for the function.
10921 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
10922 .addReg(AArch64::SP, RegState::Define)
10923 .addReg(AArch64::LR, RegState::Define)
10924 .addReg(AArch64::SP)
10925 .addImm(16);
10926 Et = MBB.insert(Et, LDRXpost);
10927 }
10928
10929 auto RASignCondition = FI->getSignReturnAddressCondition();
10930 bool ShouldSignReturnAddr = AArch64FunctionInfo::shouldSignReturnAddress(
10931 RASignCondition, !IsLeafFunction);
10932
10933 // If this is a tail call outlined function, then there's already a return.
10934 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
10935 OF.FrameConstructionID == MachineOutlinerThunk) {
10936 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
10937 return;
10938 }
10939
10940 // It's not a tail call, so we have to insert the return ourselves.
10941
10942 // LR has to be a live in so that we can return to it.
10943 if (!MBB.isLiveIn(AArch64::LR))
10944 MBB.addLiveIn(AArch64::LR);
10945
10946 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
10947 .addReg(AArch64::LR);
10948 MBB.insert(MBB.end(), ret);
10949
10950 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
10951
10952 FI->setOutliningStyle("Function");
10953
10954 // Did we have to modify the stack by saving the link register?
10955 if (OF.FrameConstructionID != MachineOutlinerDefault)
10956 return;
10957
10958 // We modified the stack.
10959 // Walk over the basic block and fix up all the stack accesses.
10960 fixupPostOutline(MBB);
10961}
10962
10963MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
10964 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
10965 MachineFunction &MF, outliner::Candidate &C) const {
10966
10967 // Are we tail calling?
10968 if (C.CallConstructionID == MachineOutlinerTailCall) {
10969 // If yes, then we can just branch to the label.
10970 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
10971 .addGlobalAddress(M.getNamedValue(MF.getName()))
10972 .addImm(0));
10973 return It;
10974 }
10975
10976 // Are we saving the link register?
10977 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
10978 C.CallConstructionID == MachineOutlinerThunk) {
10979 // No, so just insert the call.
10980 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
10981 .addGlobalAddress(M.getNamedValue(MF.getName())));
10982 return It;
10983 }
10984
10985 // We want to return the spot where we inserted the call.
10986 MachineBasicBlock::iterator CallPt;
10987
10988 // Instructions for saving and restoring LR around the call instruction we're
10989 // going to insert.
10990 MachineInstr *Save;
10991 MachineInstr *Restore;
10992 // Can we save to a register?
10993 if (C.CallConstructionID == MachineOutlinerRegSave) {
10994 // FIXME: This logic should be sunk into a target-specific interface so that
10995 // we don't have to recompute the register.
10996 Register Reg = findRegisterToSaveLRTo(C);
10997 assert(Reg && "No callee-saved register available?");
10998
10999 // LR has to be a live in so that we can save it.
11000 if (!MBB.isLiveIn(AArch64::LR))
11001 MBB.addLiveIn(AArch64::LR);
11002
11003 // Save and restore LR from Reg.
11004 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
11005 .addReg(AArch64::XZR)
11006 .addReg(AArch64::LR)
11007 .addImm(0);
11008 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
11009 .addReg(AArch64::XZR)
11010 .addReg(Reg)
11011 .addImm(0);
11012 } else {
11013 // We have the default case. Save and restore from SP.
11014 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
11015 .addReg(AArch64::SP, RegState::Define)
11016 .addReg(AArch64::LR)
11017 .addReg(AArch64::SP)
11018 .addImm(-16);
11019 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
11020 .addReg(AArch64::SP, RegState::Define)
11021 .addReg(AArch64::LR, RegState::Define)
11022 .addReg(AArch64::SP)
11023 .addImm(16);
11024 }
11025
11026 It = MBB.insert(It, Save);
11027 It++;
11028
11029 // Insert the call.
11030 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
11031 .addGlobalAddress(M.getNamedValue(MF.getName())));
11032 CallPt = It;
11033 It++;
11034
11035 It = MBB.insert(It, Restore);
11036 return CallPt;
11037}
11038
11039bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
11040 MachineFunction &MF) const {
11041 return MF.getFunction().hasMinSize();
11042}
11043
11044void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
11045 MachineBasicBlock::iterator Iter,
11046 DebugLoc &DL,
11047 bool AllowSideEffects) const {
11048 const MachineFunction &MF = *MBB.getParent();
11049 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
11050 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
11051
11052 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
11053 BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
11054 } else if (STI.isSVEorStreamingSVEAvailable()) {
11055 BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
11056 .addImm(0)
11057 .addImm(0);
11058 } else if (STI.isNeonAvailable()) {
11059 BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
11060 .addImm(0);
11061 } else {
11062 // This is a streaming-compatible function without SVE. We don't have full
11063 // Neon (just FPRs), so we can at most use the first 64-bit sub-register.
11064 // Since `movi v..` would be illegal, use `fmov d..` instead.
11065 assert(STI.hasNEON() && "Expected to have NEON.");
11066 Register Reg64 = TRI.getSubReg(Reg, AArch64::dsub);
11067 BuildMI(MBB, Iter, DL, get(AArch64::FMOVD0), Reg64);
11068 }
11069}
11070
11071std::optional<DestSourcePair>
11072AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
11073
11074 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
11075 // and zero immediate operands used as an alias for mov instruction.
11076 if (((MI.getOpcode() == AArch64::ORRWrs &&
11077 MI.getOperand(1).getReg() == AArch64::WZR &&
11078 MI.getOperand(3).getImm() == 0x0) ||
11079 (MI.getOpcode() == AArch64::ORRWrr &&
11080 MI.getOperand(1).getReg() == AArch64::WZR)) &&
11081 // Check that the w->w move is not a zero-extending w->x mov.
11082 (!MI.getOperand(0).getReg().isVirtual() ||
11083 MI.getOperand(0).getSubReg() == 0) &&
11084 (!MI.getOperand(0).getReg().isPhysical() ||
11085 MI.findRegisterDefOperandIdx(getXRegFromWReg(MI.getOperand(0).getReg()),
11086 /*TRI=*/nullptr) == -1))
11087 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11088
11089 if (MI.getOpcode() == AArch64::ORRXrs &&
11090 MI.getOperand(1).getReg() == AArch64::XZR &&
11091 MI.getOperand(3).getImm() == 0x0)
11092 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11093
11094 return std::nullopt;
11095}
11096
11097std::optional<DestSourcePair>
11098AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
11099 if ((MI.getOpcode() == AArch64::ORRWrs &&
11100 MI.getOperand(1).getReg() == AArch64::WZR &&
11101 MI.getOperand(3).getImm() == 0x0) ||
11102 (MI.getOpcode() == AArch64::ORRWrr &&
11103 MI.getOperand(1).getReg() == AArch64::WZR))
11104 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
11105 return std::nullopt;
11106}
11107
11108std::optional<RegImmPair>
11109AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
11110 int Sign = 1;
11111 int64_t Offset = 0;
11112
11113 // TODO: Handle cases where Reg is a super- or sub-register of the
11114 // destination register.
11115 const MachineOperand &Op0 = MI.getOperand(0);
11116 if (!Op0.isReg() || Reg != Op0.getReg())
11117 return std::nullopt;
11118
11119 switch (MI.getOpcode()) {
11120 default:
11121 return std::nullopt;
11122 case AArch64::SUBWri:
11123 case AArch64::SUBXri:
11124 case AArch64::SUBSWri:
11125 case AArch64::SUBSXri:
11126 Sign *= -1;
11127 [[fallthrough]];
11128 case AArch64::ADDSWri:
11129 case AArch64::ADDSXri:
11130 case AArch64::ADDWri:
11131 case AArch64::ADDXri: {
11132 // TODO: Third operand can be global address (usually some string).
11133 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
11134 !MI.getOperand(2).isImm())
11135 return std::nullopt;
11136 int Shift = MI.getOperand(3).getImm();
11137 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
11138 Offset = Sign * (MI.getOperand(2).getImm() << Shift);
11139 }
11140 }
11141 return RegImmPair{MI.getOperand(1).getReg(), Offset};
11142}
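
// Illustrative sketch (not in the LLVM source) of the decoding above. ADD/SUB
// immediates carry an optional LSL #12, so the described offset is
// +/-(Imm << Shift). Names are hypothetical.
#if 0
static int64_t decodeAddSubImmOffset(int64_t Imm, int Shift, bool IsSub) {
  // Shift is 0 or 12, per the encoding checked by the assert above.
  int64_t Off = Imm << Shift;
  return IsSub ? -Off : Off; // e.g. SUBXri x0, x1, #5, lsl #12 -> -(5 << 12)
}
#endif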
11143
11144/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
11145/// the destination register then, if possible, describe the value in terms of
11146/// the source register.
11147static std::optional<ParamLoadedValue>
11149 const TargetInstrInfo *TII,
11150 const TargetRegisterInfo *TRI) {
11151 auto DestSrc = TII->isCopyLikeInstr(MI);
11152 if (!DestSrc)
11153 return std::nullopt;
11154
11155 Register DestReg = DestSrc->Destination->getReg();
11156 Register SrcReg = DestSrc->Source->getReg();
11157
11158 if (!DestReg.isValid() || !SrcReg.isValid())
11159 return std::nullopt;
11160
11161 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
11162
11163 // If the described register is the destination, just return the source.
11164 if (DestReg == DescribedReg)
11165 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
11166
11167 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
11168 if (MI.getOpcode() == AArch64::ORRWrs &&
11169 TRI->isSuperRegister(DestReg, DescribedReg))
11170 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
11171
11172 // We may need to describe the lower part of a ORRXrs move.
11173 if (MI.getOpcode() == AArch64::ORRXrs &&
11174 TRI->isSubRegister(DestReg, DescribedReg)) {
11175 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
11176 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
11177 }
11178
11179 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
11180 "Unhandled ORR[XW]rs copy case");
11181
11182 return std::nullopt;
11183}
11184
11185bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
11186 // Functions cannot be split to different sections on AArch64 if they have
11187 // a red zone. This is because relaxing a cross-section branch may require
11188 // incrementing the stack pointer to spill a register, which would overwrite
11189 // the red zone.
11190 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
11191 return false;
11192
11193 return true;
11194}
11195
11196bool AArch64InstrInfo::isMBBSafeToSplitToCold(
11197 const MachineBasicBlock &MBB) const {
11198 // Asm Goto blocks can contain conditional branches to goto labels, which can
11199 // get moved out of range of the branch instruction.
11200 auto isAsmGoto = [](const MachineInstr &MI) {
11201 return MI.getOpcode() == AArch64::INLINEASM_BR;
11202 };
11203 if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
11204 return false;
11205
11206 // Because jump tables are label-relative instead of table-relative, they all
11207 // must be in the same section or relocation fixup handling will fail.
11208
11209 // Check if MBB is a jump table target
11210 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
11211 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
11212 return llvm::is_contained(JTE.MBBs, &MBB);
11213 };
11214 if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
11215 return false;
11216
11217 // Check if MBB contains a jump table lookup
11218 for (const MachineInstr &MI : MBB) {
11219 switch (MI.getOpcode()) {
11220 case TargetOpcode::G_BRJT:
11221 case AArch64::JumpTableDest32:
11222 case AArch64::JumpTableDest16:
11223 case AArch64::JumpTableDest8:
11224 return false;
11225 default:
11226 continue;
11227 }
11228 }
11229
11230 // MBB isn't a special case, so it's safe to be split to the cold section.
11231 return true;
11232}
11233
11234std::optional<ParamLoadedValue>
11235AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
11236 Register Reg) const {
11237 const MachineFunction *MF = MI.getMF();
11238 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
11239 switch (MI.getOpcode()) {
11240 case AArch64::MOVZWi:
11241 case AArch64::MOVZXi: {
11242 // MOVZWi may be used for producing zero-extended 32-bit immediates in
11243 // 64-bit parameters, so we need to consider super-registers.
11244 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
11245 return std::nullopt;
11246
11247 if (!MI.getOperand(1).isImm())
11248 return std::nullopt;
11249 int64_t Immediate = MI.getOperand(1).getImm();
11250 int Shift = MI.getOperand(2).getImm();
11251 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
11252 nullptr);
11253 }
11254 case AArch64::ORRWrs:
11255 case AArch64::ORRXrs:
11256 return describeORRLoadedValue(MI, Reg, this, TRI);
11257 }
11258
11259 return TargetInstrInfo::describeLoadedValue(MI, Reg);
11260}
11261
11262bool AArch64InstrInfo::isExtendLikelyToBeFolded(
11263 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
11264 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
11265 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
11266 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
11267
11268 // Anyexts are nops.
11269 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
11270 return true;
11271
11272 Register DefReg = ExtMI.getOperand(0).getReg();
11273 if (!MRI.hasOneNonDBGUse(DefReg))
11274 return false;
11275
11276 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
11277 // addressing mode.
11278 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
11279 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
11280}
11281
11282uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
11283 return get(Opc).TSFlags & AArch64::ElementSizeMask;
11284}
11285
11286bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
11287 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
11288}
11289
11290bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
11291 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
11292}
11293
11294unsigned int
11295AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
11296 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
11297}
11298
11299bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
11300 unsigned Scale) const {
11301 if (Offset && Scale)
11302 return false;
11303
11304 // Check Reg + Imm
11305 if (!Scale) {
11306 // 9-bit signed offset
11307 if (isInt<9>(Offset))
11308 return true;
11309
11310 // 12-bit unsigned offset
11311 unsigned Shift = Log2_64(NumBytes);
11312 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
11313 // Must be a multiple of NumBytes (NumBytes is a power of 2)
11314 (Offset >> Shift) << Shift == Offset)
11315 return true;
11316 return false;
11317 }
11318
11319 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
11320 return Scale == 1 || (Scale > 0 && Scale == NumBytes);
11321}
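
// Illustrative sketch (not in the LLVM source) of the legality rules above,
// using plain integers. NumBytes is the access size and is assumed to be a
// power of two, as in the callers; the function name is hypothetical.
#if 0
static bool isLegalAddrModeSketch(unsigned NumBytes, int64_t Offset,
                                  unsigned Scale) {
  if (Offset && Scale)
    return false; // reg+imm and reg+scaled-reg are mutually exclusive
  if (!Scale) {
    if (Offset >= -256 && Offset <= 255)
      return true; // 9-bit signed offset (LDUR/STUR-style)
    return NumBytes && Offset > 0 && Offset % NumBytes == 0 &&
           Offset / NumBytes <= 4095; // 12-bit unsigned scaled offset
  }
  return Scale == 1 || Scale == NumBytes; // reg+reg, optionally scaled by size
}
#endif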
11322
11323unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
11324 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
11325 return AArch64::BLRNoIP;
11326 else
11327 return AArch64::BLR;
11328}
11329
11330void AArch64InstrInfo::createPauthEpilogueInstr(MachineBasicBlock &MBB,
11331 DebugLoc DL) const {
11332 MachineBasicBlock::iterator InsertPt = MBB.getFirstTerminator();
11333 auto Builder = BuildMI(MBB, InsertPt, DL, get(AArch64::PAUTH_EPILOGUE))
11334 .setMIFlag(MachineInstr::FrameDestroy);
11335
11336 const auto *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
11337 if (AFI->branchProtectionPAuthLR() && !Subtarget.hasPAuthLR())
11338 Builder.addReg(AArch64::X16, RegState::ImplicitDefine);
11339}
11340
11341MachineBasicBlock::iterator
11342AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
11343 Register TargetReg, bool FrameSetup) const {
11344 assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
11345
11346 MachineBasicBlock &MBB = *MBBI->getParent();
11347 MachineFunction &MF = *MBB.getParent();
11348 const AArch64InstrInfo *TII =
11349 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
11350 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
11351 DebugLoc DL = MBB.findDebugLoc(MBBI);
11352
11353 MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
11354 MachineBasicBlock *LoopTestMBB =
11355 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11356 MF.insert(MBBInsertPoint, LoopTestMBB);
11357 MachineBasicBlock *LoopBodyMBB =
11358 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11359 MF.insert(MBBInsertPoint, LoopBodyMBB);
11360 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11361 MF.insert(MBBInsertPoint, ExitMBB);
11362 MachineInstr::MIFlag Flags =
11363 FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
11364
11365 // LoopTest:
11366 // SUB SP, SP, #ProbeSize
11367 emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
11368 AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
11369
11370 // CMP SP, TargetReg
11371 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
11372 AArch64::XZR)
11373 .addReg(AArch64::SP)
11374 .addReg(TargetReg)
11375 .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
11376 .setMIFlags(Flags);
11377
11378 // B.<Cond> LoopExit
11379 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
11380 .addImm(AArch64CC::LE)
11381 .addMBB(ExitMBB)
11382 .setMIFlags(Flags);
11383
11384 // LDR XZR, [SP]
11385 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::LDRXui))
11386 .addDef(AArch64::XZR)
11387 .addReg(AArch64::SP)
11388 .addImm(0)
11389 .addMemOperand(MF.getMachineMemOperand(
11390 MachinePointerInfo::getUnknownStack(MF),
11391 MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad, 8,
11392 Align(8)))
11393 .setMIFlags(Flags);
11394
11395 // B loop
11396 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
11397 .addMBB(LoopTestMBB)
11398 .setMIFlags(Flags);
11399
11400 // LoopExit:
11401 // MOV SP, TargetReg
11402 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
11403 .addReg(TargetReg)
11404 .addImm(0)
11405 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
11406 .setMIFlags(Flags);
11407
11408 // LDR XZR, [SP]
11409 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
11410 .addReg(AArch64::XZR, RegState::Define)
11411 .addReg(AArch64::SP)
11412 .addImm(0)
11413 .setMIFlags(Flags);
11414
11415 ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
11416 ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
11417
11418 LoopTestMBB->addSuccessor(ExitMBB);
11419 LoopTestMBB->addSuccessor(LoopBodyMBB);
11420 LoopBodyMBB->addSuccessor(LoopTestMBB);
11421 MBB.addSuccessor(LoopTestMBB);
11422
11423 // Update liveins.
11424 if (MF.getRegInfo().reservedRegsFrozen())
11425 fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});
11426
11427 return ExitMBB->begin();
11428}
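
// For reference, the probe loop emitted above has the following shape
// (illustrative pseudo-assembly; CFI, MI flags and liveness updates omitted):
//
//   LoopTest: sub  sp, sp, #ProbeSize
//             cmp  sp, <TargetReg>
//             b.le Exit
//   LoopBody: ldr  xzr, [sp]          ; probe the newly exposed stack page
//             b    LoopTest
//   Exit:     mov  sp, <TargetReg>
//             ldr  xzr, [sp]          ; probe at the final SP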
11429
11430namespace {
11431class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
11432 MachineFunction *MF;
11433 const TargetInstrInfo *TII;
11434 const TargetRegisterInfo *TRI;
11435 MachineRegisterInfo &MRI;
11436
11437 /// The block of the loop
11438 MachineBasicBlock *LoopBB;
11439 /// The conditional branch of the loop
11440 MachineInstr *CondBranch;
11441 /// The compare instruction for loop control
11442 MachineInstr *Comp;
11443 /// The number of the operand of the loop counter value in Comp
11444 unsigned CompCounterOprNum;
11445 /// The instruction that updates the loop counter value
11446 MachineInstr *Update;
11447 /// The number of the operand of the loop counter value in Update
11448 unsigned UpdateCounterOprNum;
11449 /// The initial value of the loop counter
11450 Register Init;
11451 /// True iff Update is a predecessor of Comp
11452 bool IsUpdatePriorComp;
11453
11454 /// The normalized condition used by createTripCountGreaterCondition()
11455 SmallVector<MachineOperand, 4> Cond;
11456
11457public:
11458 AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
11459 MachineInstr *Comp, unsigned CompCounterOprNum,
11460 MachineInstr *Update, unsigned UpdateCounterOprNum,
11461 Register Init, bool IsUpdatePriorComp,
11462 const SmallVectorImpl<MachineOperand> &Cond)
11463 : MF(Comp->getParent()->getParent()),
11464 TII(MF->getSubtarget().getInstrInfo()),
11465 TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
11466 LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
11467 CompCounterOprNum(CompCounterOprNum), Update(Update),
11468 UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
11469 IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
11470
11471 bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
11472 // Make the instructions for loop control be placed in stage 0.
11473 // The predecessors of Comp are considered by the caller.
11474 return MI == Comp;
11475 }
11476
11477 std::optional<bool> createTripCountGreaterCondition(
11478 int TC, MachineBasicBlock &MBB,
11479 SmallVectorImpl<MachineOperand> &CondParam) override {
11480 // A branch instruction will be inserted as "if (Cond) goto epilogue".
11481 // Cond is normalized for such use.
11482 // The predecessors of the branch are assumed to have already been inserted.
11483 CondParam = Cond;
11484 return {};
11485 }
11486
11487 void createRemainingIterationsGreaterCondition(
11488 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11489 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
11490
11491 void setPreheader(MachineBasicBlock *NewPreheader) override {}
11492
11493 void adjustTripCount(int TripCountAdjust) override {}
11494
11495 bool isMVEExpanderSupported() override { return true; }
11496};
11497} // namespace
11498
11499/// Clone an instruction from MI. The register of ReplaceOprNum-th operand
11500/// is replaced by ReplaceReg. The output register is newly created.
11501/// The other operands are unchanged from MI.
11502static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
11503 Register ReplaceReg, MachineBasicBlock &MBB,
11504 MachineBasicBlock::iterator InsertTo) {
11505 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
11506 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
11507 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
11508 Register Result = 0;
11509 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
11510 if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
11511 Result = MRI.createVirtualRegister(
11512 MRI.getRegClass(NewMI->getOperand(0).getReg()));
11513 NewMI->getOperand(I).setReg(Result);
11514 } else if (I == ReplaceOprNum) {
11515 MRI.constrainRegClass(ReplaceReg, TII->getRegClass(NewMI->getDesc(), I));
11516 NewMI->getOperand(I).setReg(ReplaceReg);
11517 }
11518 }
11519 MBB.insert(InsertTo, NewMI);
11520 return Result;
11521}
11522
11523void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
11524 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11525 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
11526 // Create and accumulate conditions for next TC iterations.
11527 // Example:
11528 // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
11529 // # iteration of the kernel
11530 //
11531 // # insert the following instructions
11532 // cond = CSINCXr 0, 0, C, implicit $nzcv
11533 // counter = ADDXri counter, 1 # clone from this->Update
11534 // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
11535 // cond = CSINCXr cond, cond, C, implicit $nzcv
11536 // ... (repeat TC times)
11537 // SUBSXri cond, 0, implicit-def $nzcv
11538
11539 assert(CondBranch->getOpcode() == AArch64::Bcc);
11540 // CondCode to exit the loop
11541 AArch64CC::CondCode CC =
11542 (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
11543 if (CondBranch->getOperand(1).getMBB() == LoopBB)
11544 CC = AArch64CC::getInvertedCondCode(CC);
11545
11546 // Accumulate conditions to exit the loop
11547 Register AccCond = AArch64::XZR;
11548
11549 // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
11550 auto AccumulateCond = [&](Register CurCond,
11551 AArch64CC::CondCode CC) {
11552 Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
11553 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
11554 .addReg(NewCond, RegState::Define)
11555 .addReg(CurCond)
11556 .addReg(CurCond)
11557 .addImm(AArch64CC::getInvertedCondCode(CC));
11558 return NewCond;
11559 };
11560
11561 if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
11562 // Update and Comp for I == 0 already exist in MBB
11563 // (MBB is an unrolled kernel)
11564 Register Counter;
11565 for (int I = 0; I <= TC; ++I) {
11566 Register NextCounter;
11567 if (I != 0)
11568 NextCounter =
11569 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11570
11571 AccCond = AccumulateCond(AccCond, CC);
11572
11573 if (I != TC) {
11574 if (I == 0) {
11575 if (Update != Comp && IsUpdatePriorComp) {
11576 Counter =
11577 LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11578 NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
11579 MBB.end());
11580 } else {
11581 // can use already calculated value
11582 NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
11583 }
11584 } else if (Update != Comp) {
11585 NextCounter =
11586 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11587 }
11588 }
11589 Counter = NextCounter;
11590 }
11591 } else {
11592 Register Counter;
11593 if (LastStage0Insts.empty()) {
11594 // Use the initial counter value (testing whether the trip count is
11595 // sufficient to be executed by the pipelined code).
11596 Counter = Init;
11597 if (IsUpdatePriorComp)
11598 Counter =
11599 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11600 } else {
11601 // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
11602 Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11603 }
11604
11605 for (int I = 0; I <= TC; ++I) {
11606 Register NextCounter;
11607 NextCounter =
11608 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11609 AccCond = AccumulateCond(AccCond, CC);
11610 if (I != TC && Update != Comp)
11611 NextCounter =
11612 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11613 Counter = NextCounter;
11614 }
11615 }
11616
11617 // If AccCond == 0, the remainder is greater than TC.
11618 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
11619 .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
11620 .addReg(AccCond)
11621 .addImm(0)
11622 .addImm(0);
11623 Cond.clear();
11624 Cond.push_back(MachineOperand::CreateImm(AArch64CC::EQ));
11625}
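
// Plain-integer model (not in the LLVM source) of the accumulation built
// above: starting from zero (XZR), each CSINC adds one when the exit
// condition held at that probe, so the accumulator is still zero after the
// TC+1 probes iff the loop would not exit within them, i.e. the remaining
// trip count is greater than TC. Names are hypothetical.
#if 0
static bool remainderGreaterThanSketch(int RemainingIters, int TC) {
  int AccCond = 0;
  for (int I = 0; I <= TC; ++I)
    if (RemainingIters <= I) // models the loop-exit condition at probe I
      ++AccCond;
  return AccCond == 0;
}
#endif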
11626
11627static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
11628 Register &RegMBB, Register &RegOther) {
11629 assert(Phi.getNumOperands() == 5);
11630 if (Phi.getOperand(2).getMBB() == MBB) {
11631 RegMBB = Phi.getOperand(1).getReg();
11632 RegOther = Phi.getOperand(3).getReg();
11633 } else {
11634 assert(Phi.getOperand(4).getMBB() == MBB);
11635 RegMBB = Phi.getOperand(3).getReg();
11636 RegOther = Phi.getOperand(1).getReg();
11637 }
11638}
11639
11639static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
11640 if (!Reg.isVirtual())
11642 return false;
11643 const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11644 return MRI.getVRegDef(Reg)->getParent() != BB;
11645}
11646
11647/// If Reg is an induction variable, return true and set some parameters
11648static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
11649 MachineInstr *&UpdateInst,
11650 unsigned &UpdateCounterOprNum, Register &InitReg,
11651 bool &IsUpdatePriorComp) {
11652 // Example:
11653 //
11654 // Preheader:
11655 // InitReg = ...
11656 // LoopBB:
11657 // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
11658 // Reg = COPY Reg0 ; COPY is ignored.
11659 // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
11660 // ; Reg is the value calculated in the previous
11661 // ; iteration, so IsUpdatePriorComp == false.
11662
11663 if (LoopBB->pred_size() != 2)
11664 return false;
11665 if (!Reg.isVirtual())
11666 return false;
11667 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
11668 UpdateInst = nullptr;
11669 UpdateCounterOprNum = 0;
11670 InitReg = 0;
11671 IsUpdatePriorComp = true;
11672 Register CurReg = Reg;
11673 while (true) {
11674 MachineInstr *Def = MRI.getVRegDef(CurReg);
11675 if (Def->getParent() != LoopBB)
11676 return false;
11677 if (Def->isCopy()) {
11678 // Ignore copy instructions unless they contain subregisters
11679 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
11680 return false;
11681 CurReg = Def->getOperand(1).getReg();
11682 } else if (Def->isPHI()) {
11683 if (InitReg != 0)
11684 return false;
11685 if (!UpdateInst)
11686 IsUpdatePriorComp = false;
11687 extractPhiReg(*Def, LoopBB, CurReg, InitReg);
11688 } else {
11689 if (UpdateInst)
11690 return false;
11691 switch (Def->getOpcode()) {
11692 case AArch64::ADDSXri:
11693 case AArch64::ADDSWri:
11694 case AArch64::SUBSXri:
11695 case AArch64::SUBSWri:
11696 case AArch64::ADDXri:
11697 case AArch64::ADDWri:
11698 case AArch64::SUBXri:
11699 case AArch64::SUBWri:
11700 UpdateInst = Def;
11701 UpdateCounterOprNum = 1;
11702 break;
11703 case AArch64::ADDSXrr:
11704 case AArch64::ADDSWrr:
11705 case AArch64::SUBSXrr:
11706 case AArch64::SUBSWrr:
11707 case AArch64::ADDXrr:
11708 case AArch64::ADDWrr:
11709 case AArch64::SUBXrr:
11710 case AArch64::SUBWrr:
11711 UpdateInst = Def;
11712 if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
11713 UpdateCounterOprNum = 1;
11714 else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
11715 UpdateCounterOprNum = 2;
11716 else
11717 return false;
11718 break;
11719 default:
11720 return false;
11721 }
11722 CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
11723 }
11724
11725 if (!CurReg.isVirtual())
11726 return false;
11727 if (Reg == CurReg)
11728 break;
11729 }
11730
11731 if (!UpdateInst)
11732 return false;
11733
11734 return true;
11735}
11736
11737std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
11738AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
11739 // Accept loops that meet the following conditions
11740 // * The conditional branch is BCC
11741 // * The compare instruction is ADDS/SUBS/WHILEXX
11742 // * One operand of the compare is an induction variable and the other is a
11743 // loop invariant value
11744 // * The induction variable is incremented/decremented by a single instruction
11745 // * Does not contain CALL or instructions which have unmodeled side effects
11746
11747 for (MachineInstr &MI : *LoopBB)
11748 if (MI.isCall() || MI.hasUnmodeledSideEffects())
11749 // This instruction may use NZCV, which interferes with the instruction to
11750 // be inserted for loop control.
11751 return nullptr;
11752
11753 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
11754 SmallVector<MachineOperand, 4> Cond;
11755 if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
11756 return nullptr;
11757
11758 // Infinite loops are not supported
11759 if (TBB == LoopBB && FBB == LoopBB)
11760 return nullptr;
11761
11762 // Must be conditional branch
11763 if (TBB != LoopBB && FBB == nullptr)
11764 return nullptr;
11765
11766 assert((TBB == LoopBB || FBB == LoopBB) &&
11767 "The Loop must be a single-basic-block loop");
11768
11769 MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
11770 const TargetRegisterInfo &TRI = getRegisterInfo();
11771
11772 if (CondBranch->getOpcode() != AArch64::Bcc)
11773 return nullptr;
11774
11775 // Normalization for createTripCountGreaterCondition()
11776 if (TBB == LoopBB)
11777 reverseBranchCondition(Cond);
11778
11779 MachineInstr *Comp = nullptr;
11780 unsigned CompCounterOprNum = 0;
11781 for (MachineInstr &MI : reverse(*LoopBB)) {
11782 if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
11783 // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
11784 // operands is a loop invariant value
11785
11786 switch (MI.getOpcode()) {
11787 case AArch64::SUBSXri:
11788 case AArch64::SUBSWri:
11789 case AArch64::ADDSXri:
11790 case AArch64::ADDSWri:
11791 Comp = &MI;
11792 CompCounterOprNum = 1;
11793 break;
11794 case AArch64::ADDSWrr:
11795 case AArch64::ADDSXrr:
11796 case AArch64::SUBSWrr:
11797 case AArch64::SUBSXrr:
11798 Comp = &MI;
11799 break;
11800 default:
11801 if (isWhileOpcode(MI.getOpcode())) {
11802 Comp = &MI;
11803 break;
11804 }
11805 return nullptr;
11806 }
11807
11808 if (CompCounterOprNum == 0) {
11809 if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
11810 CompCounterOprNum = 2;
11811 else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
11812 CompCounterOprNum = 1;
11813 else
11814 return nullptr;
11815 }
11816 break;
11817 }
11818 }
11819 if (!Comp)
11820 return nullptr;
11821
11822 MachineInstr *Update = nullptr;
11823 Register Init;
11824 bool IsUpdatePriorComp;
11825 unsigned UpdateCounterOprNum;
11826 if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
11827 Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
11828 return nullptr;
11829
11830 return std::make_unique<AArch64PipelinerLoopInfo>(
11831 LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
11832 Init, IsUpdatePriorComp, Cond);
11833}
11834
11835/// verifyInstruction - Perform target specific instruction verification.
11836bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI,
11837 StringRef &ErrInfo) const {
11838 // Verify that immediate offsets on load/store instructions are within range.
11839 // Stack objects with an FI operand are excluded as they can be fixed up
11840 // during PEI.
11841 TypeSize Scale(0U, false), Width(0U, false);
11842 int64_t MinOffset, MaxOffset;
11843 if (getMemOpInfo(MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) {
11844 unsigned ImmIdx = getLoadStoreImmIdx(MI.getOpcode());
11845 if (MI.getOperand(ImmIdx).isImm() && !MI.getOperand(ImmIdx - 1).isFI()) {
11846 int64_t Imm = MI.getOperand(ImmIdx).getImm();
11847 if (Imm < MinOffset || Imm > MaxOffset) {
11848 ErrInfo = "Unexpected immediate on load/store instruction";
11849 return false;
11850 }
11851 }
11852 }
11853
11854 const MCInstrDesc &MCID = MI.getDesc();
11855 for (unsigned Op = 0; Op < MCID.getNumOperands(); Op++) {
11856 const MachineOperand &MO = MI.getOperand(Op);
11857 switch (MCID.operands()[Op].OperandType) {
11858 case AArch64::OPERAND_IMPLICIT_IMM_0:
11859 if (!MO.isImm() || MO.getImm() != 0) {
11860 ErrInfo = "OPERAND_IMPLICIT_IMM_0 should be 0";
11861 return false;
11862 }
11863 break;
11864 case AArch64::OPERAND_SHIFT_MSL:
11865 if (!MO.isImm() ||
11866 AArch64_AM::getShiftType(MO.getImm()) != AArch64_AM::MSL ||
11867 (AArch64_AM::getShiftValue(MO.getImm()) != 8 &&
11868 AArch64_AM::getShiftValue(MO.getImm()) != 16)) {
11869 ErrInfo = "OPERAND_SHIFT_MSL should be msl shift of 8 or 16";
11870 return false;
11871 }
11872 break;
11873 default:
11874 break;
11875 }
11876 }
11877 return true;
11878}
11879
11880#define GET_INSTRINFO_HELPERS
11881#define GET_INSTRMAP_INFO
11882#include "AArch64GenInstrInfo.inc"
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, unsigned NumRegs)
static cl::opt< unsigned > BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of Bcc instructions (DEBUG)"))
static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned MnegOpc, const TargetRegisterClass *RC)
genNeg - Helper to generate an intermediate negation of the second operand of Root
static bool isFrameStoreOpcode(int Opcode)
static cl::opt< unsigned > GatherOptSearchLimit("aarch64-search-limit", cl::Hidden, cl::init(2048), cl::desc("Restrict range of instructions to search for the " "machine-combiner gather pattern optimization"))
static bool getMaddPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find instructions that can be turned into madd.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr)
Find a condition code used by the instruction.
static MachineInstr * genFusedMultiplyAcc(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyAcc - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genFusedMultiplyAccNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate64(unsigned Opc)
static bool isFrameLoadOpcode(int Opcode)
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg)
static bool areCFlagsAccessedBetweenInstrs(MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, const TargetRegisterInfo *TRI, const AccessKind AccessToCheck=AK_All)
True when condition flags are accessed (either by writing or reading) on the instruction trace starti...
static bool getFMAPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Floating-Point Support.
static bool isADDSRegImm(unsigned Opcode)
static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI)
static bool isANDOpcode(MachineInstr &MI)
static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment, StringRef RegScale={})
static unsigned sForm(MachineInstr &Instr)
Get opcode of S version of Instr.
static bool isCombineInstrSettingFlag(unsigned Opc)
static bool getFNEGPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB, MachineInstr *&UpdateInst, unsigned &UpdateCounterOprNum, Register &InitReg, bool &IsUpdatePriorComp)
If Reg is an induction variable, return true and set some parameters.
static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc)
static bool mustAvoidNeonAtMBBI(const AArch64Subtarget &Subtarget, MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
Returns true if in a streaming call site region without SME-FA64.
static bool isPostIndexLdStOpcode(unsigned Opcode)
Return true if the opcode is a post-index ld/st instruction, which really loads from base+0.
static std::optional< unsigned > getLFIInstSizeInBytes(const MachineInstr &MI)
Return the maximum number of bytes of code the specified instruction may be after LFI rewriting.
static unsigned getBranchDisplacementBits(unsigned Opc)
static cl::opt< unsigned > CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9), cl::desc("Restrict range of CB instructions (DEBUG)"))
static std::optional< ParamLoadedValue > describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
If the given ORR instruction is a copy, and DescribedReg overlaps with the destination register then,...
static bool getFMULPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static void appendReadRegExpr(SmallVectorImpl< char > &Expr, unsigned RegNum)
static MachineInstr * genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, const TargetRegisterClass *RC)
genMaddR - Generate madd instruction and combine mul and add using an extra virtual register Example ...
static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, Register ReplaceReg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertTo)
Clone an instruction from MI.
static bool scaleOffset(unsigned Opc, int64_t &Offset)
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc)
unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale)
static MachineInstr * genFusedMultiplyIdx(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyIdx - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genIndexedMultiply(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxDupOp, unsigned MulOpc, const TargetRegisterClass *RC, MachineRegisterInfo &MRI)
Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
static bool isSUBSRegImm(unsigned Opcode)
static bool UpdateOperandRegClass(MachineInstr &Instr)
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
static bool isInStreamingCallSiteRegion(MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
Returns true if the instruction at I is in a streaming call site region, within a single basic block.
static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, int CmpValue, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > &CCUseInstrs, bool &IsInvertCC)
unsigned unscaledOffsetOpcode(unsigned Opcode)
static bool getLoadPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Search for patterns of LD instructions we can optimize.
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI)
Check if CmpInstr can be substituted by MI.
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC)
static bool isCombineInstrCandidateFP(const MachineInstr &Inst)
static void appendLoadRegExpr(SmallVectorImpl< char > &Expr, int64_t OffsetFromDefCFA)
static void appendConstantExpr(SmallVectorImpl< char > &Expr, int64_t Constant, dwarf::LocationAtom Operation)
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI)
Return the opcode that does not set flags when possible - otherwise return the original opcode.
static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool isCombineInstrCandidate32(unsigned Opc)
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, SmallVectorImpl< MachineOperand > &Cond)
static unsigned offsetExtendOpcode(unsigned Opcode)
MachineOutlinerMBBFlags
@ LRUnavailableSomewhere
@ UnsafeRegsDead
static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register DestReg, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
static void generateGatherLanePattern(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned Pattern, unsigned NumLanes)
Generate optimized instruction sequence for gather load patterns to improve Memory-Level Parallelism ...
static bool getMiscPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find other MI combine patterns.
static bool outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static const MachineInstrBuilder & AddSubReg(const MachineInstrBuilder &MIB, MCRegister Reg, unsigned SubIdx, RegState State, const TargetRegisterInfo *TRI)
static bool outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, int64_t Offset1, unsigned Opcode1, int FI2, int64_t Offset2, unsigned Opcode2)
static cl::opt< unsigned > TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"))
static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB, Register &RegMBB, Register &RegOther)
static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, unsigned Reg, const StackOffset &Offset)
static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB)
static MachineInstr * genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC, FMAInstKind kind=FMAInstKind::Default, const Register *ReplacedAddend=nullptr)
genFusedMultiply - Generate fused multiply instructions.
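For instance (an illustrative AArch64 sequence, not taken from this file), a multiply whose result only feeds an accumulate can be fused into a single multiply-accumulate:

  mul  v0.4s, v1.4s, v2.4s   // v0 = v1 * v2
  add  v3.4s, v0.4s, v3.4s   // v3 += v0
  // fuses into:
  mla  v3.4s, v1.4s, v2.4s   // v3 += v1 * v2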
static bool getGatherLanePattern(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, unsigned LoadLaneOpCode, unsigned NumLanes)
Check if the given instruction forms a gather load pattern that can be optimized for better Memory-Level Parallelism.
static MachineInstr * genFusedMultiplyIdxNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate(unsigned Opc)
static unsigned regOffsetOpcode(unsigned Opcode)
MachineOutlinerClass
Constants defining how certain sequences should be outlined.
@ MachineOutlinerTailCall
Only emit a branch.
@ MachineOutlinerRegSave
Same as default, but save to a register.
@ MachineOutlinerNoLRSave
Emit a call and return.
@ MachineOutlinerThunk
Emit a call and tail-call.
@ MachineOutlinerDefault
Emit a save, restore, call, and return.
static cl::opt< unsigned > BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26), cl::desc("Restrict range of B instructions (DEBUG)"))
static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB)
Check if AArch64::NZCV should be alive in successors of MBB.
static void emitFrameOffsetAdj(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, int64_t Offset, unsigned Opc, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFAOffset, StackOffset CFAOffset, unsigned FrameReg)
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize)
static cl::opt< unsigned > CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"))
static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, unsigned IdxOpd1, DenseMap< Register, unsigned > &InstrIdxForVirtReg)
Do the following transformation: A - (B + C) ==> (A - B) - C, or A - (B + C) ==> (A - C) - B.
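A hedged illustration of why this reassociation can pay off: in A - (B + C) the final subtract waits on the add, which in turn waits on both B and C; after the rewrite, A - B can issue as soon as A and B are ready, so a late-arriving C only delays the second subtract. Schematically (not the exact MIR):

  before:        after:
  t = B + C      t = A - B
  r = A - t      r = t - C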
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, unsigned *NewReg=nullptr)
static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, const AArch64InstrInfo *TII, bool ShouldSignReturnAddr)
static MachineInstr * genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs)
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc, unsigned ZeroReg)
static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register SrcReg, bool IsKill, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!")
MachineBasicBlock & MBB
DebugLoc DL
MachineBasicBlock::iterator MBBI
@ Default
Module.h This file contains the declarations for the Module class.
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
Register Reg
const TargetRegisterInfo * TRI
MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition Debug.h:72
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, unsigned CombineOpc=0)
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
SignReturnAddress getSignReturnAddressCondition() const
void setOutliningStyle(const std::string &Style)
std::optional< bool > hasRedZone() const
static bool shouldSignReturnAddress(SignReturnAddress Condition, bool IsLRSpilled)
static bool isHForm(const MachineInstr &MI)
Returns whether the instruction is in H form (16-bit operands)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool hasBTISemantics(const MachineInstr &MI)
Returns whether the instruction can be compatible with non-zero BTYPE.
static bool isQForm(const MachineInstr &MI)
Returns whether the instruction is in Q form (128-bit operands)
static bool getMemOpInfo(unsigned Opcode, TypeSize &Scale, TypeSize &Width, int64_t &MinOffset, int64_t &MaxOffset)
Returns true if opcode Opc is a memory operation.
static bool isTailCallReturnInst(const MachineInstr &MI)
Returns true if MI is one of the TCRETURN* instructions.
static bool isFPRCopy(const MachineInstr &MI)
Does this instruction rename an FPR without modifying bits?
MachineInstr * emitLdStWithAddr(MachineInstr &MemI, const ExtAddrMode &AM) const override
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies value from one register to an...
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
GetInstSize - Return the number of bytes of code the specified instruction may be.
static bool isZExtLoad(const MachineInstr &MI)
Returns whether the instruction is a zero-extending load.
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
static bool isGPRCopy(const MachineInstr &MI)
Does this instruction rename a GPR without modifying bits?
static unsigned convertToFlagSettingOpc(unsigned Opc)
Return the opcode that sets flags when possible.
void createPauthEpilogueInstr(MachineBasicBlock &MBB, DebugLoc DL) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
bool canInsertSelect(const MachineBasicBlock &, ArrayRef< MachineOperand > Cond, Register, Register, Register, int &, int &, int &) const override
Register isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
static const MachineOperand & getLdStOffsetOp(const MachineInstr &MI)
Returns the immediate offset operand of a load/store.
bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, unsigned &SubIdx) const override
static std::optional< unsigned > getUnscaledLdSt(unsigned Opc)
Returns the unscaled load/store for the scaled load/store opcode, if there is a corresponding unscale...
static bool hasUnscaledLdStOffset(unsigned Opc)
Return true if it has an unscaled load/store offset.
static const MachineOperand & getLdStAmountOp(const MachineInstr &MI)
Returns the shift amount operand of a load/store.
static bool isPreLdSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load/store.
std::optional< ExtAddrMode > getAddrModeFromMemoryOp(const MachineInstr &MemI, const TargetRegisterInfo *TRI) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &MI, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const override
bool analyzeBranchPredicate(MachineBasicBlock &MBB, MachineBranchPredicate &MBP, bool AllowModify) const override
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, unsigned SubReg=0, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isPairableLdStInst(const MachineInstr &MI)
Return true if the given load or store may be paired with another.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isSExtLoad(const MachineInstr &MI)
Returns whether the instruction is a sign-extending load.
const AArch64RegisterInfo & getRegisterInfo() const
getRegisterInfo - TargetInstrInfo is a superset of MRegister info.
static bool isPreSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed store.
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
AArch64InstrInfo(const AArch64Subtarget &STI)
static bool isPairedLdSt(const MachineInstr &MI)
Returns whether the instruction is a paired load/store.
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, int FrameIndex, MachineInstr *&CopyMI, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
bool getMemOperandWithOffsetWidth(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, TypeSize &Width, const TargetRegisterInfo *TRI) const
If OffsetIsScalable is set to 'true', the offset is scaled by vscale.
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isStridedAccess(const MachineInstr &MI)
Return true if the given load or store is a strided memory access.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
Detect opportunities for ldp/stp formation.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
bool isThroughputPattern(unsigned Pattern) const override
Return true when a code sequence can improve throughput.
MachineOperand & getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const
Return the immediate offset of the base register in a load/store LdSt.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg, const MachineInstr &AddrI, ExtAddrMode &AM) const override
static bool isLdStPairSuppressed(const MachineInstr &MI)
Return true if pairing the given load or store is hinted to be unprofitable.
Register isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
optimizeCompareInstr - Convert the instruction supplying the argument to the comparison into one that...
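As a hedged illustration of the rewrite this enables (schematic assembly, not taken from this file): when the compared value is already produced by a subtract, a compare against zero becomes redundant once the producer is switched to its flag-setting form:

  sub  w8, w0, w1   // produces the value
  cmp  w8, #0       // wanted only for NZCV
  b.ge .Lnext
  // becomes:
  subs w8, w0, w1   // sets NZCV itself; the CMP is removed
  b.ge .Lnext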
static unsigned getLoadStoreImmIdx(unsigned Opc)
Returns the index for the immediate for a given instruction.
static bool isGPRZero(const MachineInstr &MI)
Does this instruction set its full destination register to zero?
void copyGPRRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, unsigned ZeroReg, llvm::ArrayRef< unsigned > Indices) const
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
analyzeCompare - For a comparison instruction, return the source registers in SrcReg and SrcReg2,...
CombinerObjective getCombinerObjective(unsigned Pattern) const override
static bool isFpOrNEON(Register Reg)
Returns whether the physical register is FP or NEON.
bool isAsCheapAsAMove(const MachineInstr &MI) const override
std::optional< DestSourcePair > isCopyLikeInstrImpl(const MachineInstr &MI) const override
static void suppressLdStPair(MachineInstr &MI)
Hint that pairing the given load or store is unprofitable.
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static bool isPreLd(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load.
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef< unsigned > Indices) const
bool optimizeCondBranch(MachineInstr &MI) const override
Replace a csinc-branch sequence with a simple conditional branch.
static int getMemScale(unsigned Opc)
Scaling factor for (scaled or unscaled) load or store.
bool isCandidateToMergeOrPair(const MachineInstr &MI) const
Return true if this is a load/store that can be potentially paired/merged.
MCInst getNop() const override
static const MachineOperand & getLdStBaseOp(const MachineInstr &MI)
Returns the base register operand of a load/store.
bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
This is an important base class in LLVM.
Definition Constant.h:43
A debug info location.
Definition DebugLoc.h:123
bool empty() const
Definition DenseMap.h:109
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:714
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:711
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
static LocationSize precise(uint64_t Value)
This class is intended to be used as a base class for asm properties and features specific to the tar...
Definition MCAsmInfo.h:64
bool usesWindowsCFI() const
Definition MCAsmInfo.h:664
static MCCFIInstruction cfiDefCfa(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa defines a rule for computing CFA as: take address from Register and add Offset to it.
Definition MCDwarf.h:576
static MCCFIInstruction createOffset(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_offset Previous value of Register is saved at offset Offset from CFA.
Definition MCDwarf.h:618
static MCCFIInstruction cfiDefCfaOffset(MCSymbol *L, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa_offset modifies a rule for computing CFA.
Definition MCDwarf.h:591
static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals, SMLoc Loc={}, StringRef Comment="")
.cfi_escape Allows the user to add arbitrary bytes to the unwind info.
Definition MCDwarf.h:688
Instances of this class represent a single low-level machine instruction.
Definition MCInst.h:188
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstr.
ArrayRef< MCOperandInfo > operands() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isValid() const
Definition MCRegister.h:84
static constexpr unsigned NoRegister
Definition MCRegister.h:60
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
Set of metadata that should be preserved when using BuildMI().
bool isInlineAsmBrIndirectTarget() const
Returns true if this is the indirect dest of an INLINEASM_BR.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
reverse_instr_iterator instr_rbegin()
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
reverse_instr_iterator instr_rend()
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
void setStackID(int ObjectIdx, uint8_t ID)
bool isCalleeSavedInfoValid() const
Has the callee saved info been calculated yet?
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
unsigned getNumObjects() const
Return the number of objects.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
unsigned addFrameInst(const MCCFIInstruction &Inst)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addCFIIndex(unsigned CFIIndex) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
reverse_iterator getReverse() const
Get a reverse iterator to the same node.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
bool isCall(QueryType Type=AnyInBundle) const
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
LLVM_ABI uint32_t mergeFlagsWith(const MachineInstr &Other) const
Return the MIFlags which represent both MachineInstrs.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool registerDefIsDead(Register Reg, const TargetRegisterInfo *TRI) const
Returns true if the register is dead in this machine instruction.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI bool isLoadFoldBarrier() const
Returns true if it is illegal to fold a load across this instruction.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void addRegisterDefined(Register Reg, const TargetRegisterInfo *RegInfo=nullptr)
We have determined MI defines a register.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
A description of a memory reference used in the backend.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
This class contains meta information specific to a module.
LLVM_ABI MachineFunction * getMachineFunction(const Function &F) const
Returns the MachineFunction associated to IR function F if there is one, otherwise nullptr.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineBasicBlock * getMBB() const
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
bool tracksLiveness() const
tracksLiveness - Returns true when tracking register liveness accurately.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
bool def_empty(Register RegNo) const
def_empty - Return true if there are no instructions defining the specified register (it may be live-...
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
bool hasOneDef(Register RegNo) const
Return true if there is exactly one operand defining the specified register.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
MI-level patchpoint operands.
Definition StackMaps.h:77
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given patchpoint should emit.
Definition StackMaps.h:105
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static constexpr bool isVirtualRegister(unsigned Reg)
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:66
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents a location in source code.
Definition SMLoc.h:22
bool erase(PtrType Ptr)
Remove pointer from the set.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
bool empty() const
Definition SmallSet.h:169
bool erase(const T &V)
Definition SmallSet.h:200
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
MI-level stackmap operands.
Definition StackMaps.h:36
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given stackmap should emit.
Definition StackMaps.h:51
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
int64_t getFixed() const
Returns the fixed component of the stack.
Definition TypeSize.h:46
int64_t getScalable() const
Returns the scalable component of the stack.
Definition TypeSize.h:49
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:41
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
MI-level Statepoint operands.
Definition StackMaps.h:159
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given statepoint should emit.
Definition StackMaps.h:208
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Object returned by analyzeLoopForPipelining.
TargetInstrInfo - Interface to description of machine instruction set.
virtual void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstIdxForVirtReg) const
When getMachineCombinerPatterns() finds patterns, this function generates the instructions that could...
virtual std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const
Produce the expression describing the MI loading a value into the physical register Reg.
virtual bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
virtual bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const
Test if the given instruction should be considered a scheduling boundary.
virtual CombinerObjective getCombinerObjective(unsigned Pattern) const
Return the objective of a combiner pattern.
virtual bool isFunctionSafeToSplit(const MachineFunction &MF) const
Return true if the function is a viable candidate for machine function splitting.
const MCAsmInfo & getMCAsmInfo() const
Return target specific asm information.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
Value * getOperand(unsigned i) const
Definition User.h:207
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
self_iterator getIterator()
Definition ilist_node.h:123
A raw_ostream that writes to an std::string.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_S
MO_S - Indicates that the bits of the symbol operand represented by MO_G0 etc are signed.
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_PREL
MO_PREL - Indicates that the bits of the symbol operand represented by MO_G0 etc are PC relative.
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_ARM64EC_CALLMANGLE
MO_ARM64EC_CALLMANGLE - Operand refers to the Arm64EC-mangled version of a symbol,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_TAGGED
MO_TAGGED - With MO_PAGE, indicates that the page includes a memory tag in bits 56-63.
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
unsigned getCheckerSizeInBytes(AuthCheckMethod Method)
Returns the number of bytes added by checkAuthenticatedRegister.
static uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize)
decodeLogicalImmediate - Decode a logical immediate value in the form "N:immr:imms" (where the immr a...
static unsigned getShiftValue(unsigned Imm)
getShiftValue - Extract the shift value.
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static unsigned getArithShiftValue(unsigned Imm)
getArithShiftValue - get the arithmetic shift value.
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static AArch64_AM::ShiftExtendType getExtendType(unsigned Imm)
getExtendType - Extract the extend type for operands of arithmetic ops.
static AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm)
static AArch64_AM::ShiftExtendType getShiftType(unsigned Imm)
getShiftType - Extract the shift type.
static unsigned getShifterImm(AArch64_AM::ShiftExtendType ST, unsigned Imm)
getShifterImm - Encode the shift type and amount. imm: 6-bit shift amount; shifter: 000 ==> lsl, 001 ==> lsr, 010 ==> asr, 011 ==> ror, 100 ==> msl.
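A minimal sketch of the packing this table describes, assuming the 3-bit shift-kind encoding above (packShifterImm is a hypothetical name; the real helper also maps the ShiftExtendType enum to that encoding):

  #include <cassert>

  // Pack bits [5:0] = 6-bit shift amount, bits [8:6] = shift-kind encoding.
  unsigned packShifterImm(unsigned KindEnc, unsigned Amount) {
    assert(KindEnc <= 4 && Amount <= 63 && "out-of-range fields");
    return (KindEnc << 6) | (Amount & 0x3f);
  }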
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
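A minimal sketch of the MOVZ/MOVK chunking idea behind this expansion (illustrative only; the real expansion also considers MOVN for mostly-ones values and ORR with logical immediates, and all names here are hypothetical):

  #include <cstdint>
  #include <vector>

  struct MovChunk { bool IsMovz; uint16_t Imm16; unsigned Shift; };

  // The first nonzero 16-bit chunk becomes a MOVZ (which zeroes the other
  // lanes); every later nonzero chunk is patched in with a MOVK.
  std::vector<MovChunk> expandImm64(uint64_t Imm) {
    std::vector<MovChunk> Seq;
    for (unsigned Shift = 0; Shift < 64; Shift += 16) {
      uint16_t Chunk = uint16_t(Imm >> Shift);
      if (Chunk == 0)
        continue; // Zero lanes need no MOVK; the MOVZ already cleared them.
      Seq.push_back({Seq.empty(), Chunk, Shift});
    }
    if (Seq.empty())
      Seq.push_back({true, 0, 0}); // Imm == 0 still needs one MOVZ #0.
    return Seq;
  }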
static const uint64_t InstrFlagIsWhile
static const uint64_t InstrFlagIsPTestLike
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
constexpr double e
InstrType
Represents how an instruction should be mapped by the outliner.
NodeAddr< InstrNode * > Instr
Definition RDFGraph.h:389
iterator end() const
Definition BasicBlock.h:89
LLVM_ABI Instruction & back() const
LLVM_ABI iterator begin() const
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Offset
Definition DWP.cpp:557
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
static bool isCondBranchOpcode(int Opc)
MCCFIInstruction createDefCFA(const TargetRegisterInfo &TRI, unsigned FrameReg, unsigned Reg, const StackOffset &Offset, bool LastAdjustmentWasScalable=true)
static bool isPTrueOpcode(unsigned Opc)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool succeeded(LogicalResult Result)
Utility function that returns true if the provided LogicalResult corresponds to a success value.
int isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &Offset, bool *OutUseUnscaledOp=nullptr, unsigned *OutUnscaledOp=nullptr, int64_t *EmittableOffset=nullptr)
Check if the Offset is a valid frame offset for MI.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Renamable
Register that may be renamed.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
static bool isIndirectBranchOpcode(int Opc)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
unsigned getBLRCallOpcode(const MachineFunction &MF)
Return opcode to be used for indirect calls.
@ AArch64FrameOffsetIsLegal
Offset is legal.
@ AArch64FrameOffsetCanUpdate
Offset can apply, at least partly.
@ AArch64FrameOffsetCannotUpdate
Offset cannot apply.
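A hedged sketch of how these three status bits are typically consumed together with isAArch64FrameOffsetLegal (local names are illustrative, and this is a fragment, not a complete function):

  StackOffset Remaining = Offset;
  int64_t Emittable = 0;
  int Status = isAArch64FrameOffsetLegal(MI, Remaining, nullptr, nullptr,
                                         &Emittable);
  if (Status & AArch64FrameOffsetCannotUpdate) {
    // No part of the offset folds into MI; materialize it in a scratch
    // register instead.
  } else if (Status & AArch64FrameOffsetIsLegal) {
    // Emittable is the slice MI can encode; Remaining holds any leftover
    // that still needs a separate ADD/SUB before the access.
  }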
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
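The standard bit trick this relies on (a sketch): a power of two has exactly one set bit, and V & (V - 1) clears the lowest set bit, so the result is zero exactly for powers of two, with V == 0 excluded explicitly:

  #include <cstdint>

  constexpr bool isPow2(uint64_t V) { return V && !(V & (V - 1)); }
  static_assert(isPow2(64) && !isPow2(0) && !isPow2(96), "one set bit exactly");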
Op::Description Desc
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
static bool isSEHInstruction(const MachineInstr &MI)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
AArch64MachineCombinerPattern
@ MULSUBv8i16_OP2
@ FMULv4i16_indexed_OP1
@ FMLSv1i32_indexed_OP2
@ MULSUBv2i32_indexed_OP1
@ FMLAv2i32_indexed_OP2
@ MULADDv4i16_indexed_OP2
@ FMLAv1i64_indexed_OP1
@ MULSUBv16i8_OP1
@ FMLAv8i16_indexed_OP2
@ FMULv2i32_indexed_OP1
@ MULSUBv8i16_indexed_OP2
@ FMLAv1i64_indexed_OP2
@ MULSUBv4i16_indexed_OP2
@ FMLAv1i32_indexed_OP1
@ FMLAv2i64_indexed_OP2
@ FMLSv8i16_indexed_OP1
@ MULSUBv2i32_OP1
@ FMULv4i16_indexed_OP2
@ MULSUBv4i32_indexed_OP2
@ FMULv2i64_indexed_OP2
@ FMLAv4i32_indexed_OP1
@ MULADDv4i16_OP2
@ FMULv8i16_indexed_OP2
@ MULSUBv4i16_OP1
@ MULADDv4i32_OP2
@ MULADDv2i32_OP2
@ MULADDv16i8_OP2
@ FMLSv4i16_indexed_OP1
@ MULADDv16i8_OP1
@ FMLAv2i64_indexed_OP1
@ FMLAv1i32_indexed_OP2
@ FMLSv2i64_indexed_OP2
@ MULADDv2i32_OP1
@ MULADDv4i32_OP1
@ MULADDv2i32_indexed_OP1
@ MULSUBv16i8_OP2
@ MULADDv4i32_indexed_OP1
@ MULADDv2i32_indexed_OP2
@ FMLAv4i16_indexed_OP2
@ MULSUBv8i16_OP1
@ FMULv2i32_indexed_OP2
@ FMLSv2i32_indexed_OP2
@ FMLSv4i32_indexed_OP1
@ FMULv2i64_indexed_OP1
@ MULSUBv4i16_OP2
@ FMLSv4i16_indexed_OP2
@ FMLAv2i32_indexed_OP1
@ FMLSv2i32_indexed_OP1
@ FMLAv8i16_indexed_OP1
@ MULSUBv4i16_indexed_OP1
@ FMLSv4i32_indexed_OP2
@ MULADDv4i32_indexed_OP2
@ MULSUBv4i32_OP2
@ MULSUBv8i16_indexed_OP1
@ MULADDv8i16_OP2
@ MULSUBv2i32_indexed_OP2
@ FMULv4i32_indexed_OP2
@ FMLSv2i64_indexed_OP1
@ MULADDv4i16_OP1
@ FMLAv4i32_indexed_OP2
@ MULADDv8i16_indexed_OP1
@ FMULv4i32_indexed_OP1
@ FMLAv4i16_indexed_OP1
@ FMULv8i16_indexed_OP1
@ MULADDv8i16_OP1
@ MULSUBv4i32_indexed_OP1
@ MULSUBv4i32_OP1
@ FMLSv8i16_indexed_OP2
@ MULADDv8i16_indexed_OP2
@ MULSUBv2i32_OP2
@ FMLSv1i64_indexed_OP2
@ MULADDv4i16_indexed_OP1
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag=MachineInstr::NoFlags, bool SetNZCV=false, bool NeedsWinCFI=false, bool *HasWinCFI=nullptr, bool EmitCFAOffset=false, StackOffset InitialOffset={}, unsigned FrameReg=AArch64::SP)
emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg plus Offset.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr RegState getDefRegState(bool B)
CombinerObjective
The combiner's goal may differ based on which pattern it is attempting to optimize.
std::optional< UsedNZCV > examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > *CCUseInstrs=nullptr)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:322
static MCRegister getXRegFromWReg(MCRegister Reg)
MCCFIInstruction createCFAOffset(const TargetRegisterInfo &MRI, unsigned Reg, const StackOffset &OffsetFromDefCFA, std::optional< int64_t > IncomingVGOffsetFromDefCFA)
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
static bool isUncondBranchOpcode(int Opc)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2191
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, StackOffset &Offset, const AArch64InstrInfo *TII)
rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the FP.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
static const MachineMemOperand::Flags MOSuppressPair
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
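A sketch of the classic shift trick this helper relies on (assuming C++20's guaranteed arithmetic right shift on signed types): move the field's sign bit up to bit 63, then shift back so it smears across the upper bits:

  #include <cstdint>

  constexpr int64_t signExtend64(uint64_t X, unsigned B) {
    return int64_t(X << (64 - B)) >> (64 - B);
  }
  static_assert(signExtend64(0x80, 8) == -128, "sign bit set");
  static_assert(signExtend64(0x7F, 8) == 127, "sign bit clear");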
void appendLEB128(SmallVectorImpl< U > &Buffer, T Value)
Definition LEB128.h:236
bool optimizeTerminators(MachineBasicBlock *MBB, const TargetInstrInfo &TII)
std::pair< MachineOperand, DIExpression * > ParamLoadedValue
bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, const MachineInstr &UseMI, const TargetRegisterInfo *TRI)
Return true if there is an instruction /after/ DefMI and before UseMI which either reads or clobbers ...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
static const MachineMemOperand::Flags MOStridedAccess
constexpr RegState getUndefRegState(bool B)
void fullyRecomputeLiveIns(ArrayRef< MachineBasicBlock * > MBBs)
Convenience function for recomputing live-in's for a set of MBBs until the computation converges.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to describe addressing mode similar to ExtAddrMode in CodeGenPrepare.
LLVM_ABI static const MBBSectionID ColdSectionID
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
An individual sequence of instructions to be replaced with a call to an outlined function.
MachineFunction * getMF() const
The information necessary to create an outlined function for some class of candidate.