1//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the AArch64 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64InstrInfo.h"
14#include "AArch64ExpandImm.h"
16#include "AArch64PointerAuth.h"
17#include "AArch64Subtarget.h"
21#include "llvm/ADT/ArrayRef.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/DebugLoc.h"
45#include "llvm/IR/GlobalValue.h"
46#include "llvm/IR/Module.h"
47#include "llvm/MC/MCAsmInfo.h"
48#include "llvm/MC/MCInst.h"
50#include "llvm/MC/MCInstrDesc.h"
55#include "llvm/Support/LEB128.h"
59#include <cassert>
60#include <cstdint>
61#include <iterator>
62#include <utility>
63
64using namespace llvm;
65
66#define GET_INSTRINFO_CTOR_DTOR
67#include "AArch64GenInstrInfo.inc"
68
69#define DEBUG_TYPE "AArch64InstrInfo"
70
71STATISTIC(NumCopyInstrs, "Number of COPY instructions expanded");
72STATISTIC(NumZCRegMoveInstrsGPR, "Number of zero-cycle GPR register move "
73 "instructions expanded from canonical COPY");
74STATISTIC(NumZCRegMoveInstrsFPR, "Number of zero-cycle FPR register move "
75 "instructions expanded from canonical COPY");
76STATISTIC(NumZCZeroingInstrsGPR, "Number of zero-cycle GPR zeroing "
77 "instructions expanded from canonical COPY");
78// NumZCZeroingInstrsFPR is counted at AArch64AsmPrinter
79
80static cl::opt<unsigned>
81 CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9),
82 cl::desc("Restrict range of CB instructions (DEBUG)"));
83
84static cl::opt<unsigned> TBZDisplacementBits(
85 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
86 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
87
88static cl::opt<unsigned> CBZDisplacementBits(
89 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
90 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
91
92static cl::opt<unsigned>
93 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
94 cl::desc("Restrict range of Bcc instructions (DEBUG)"));
95
96static cl::opt<unsigned>
97 BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
98 cl::desc("Restrict range of B instructions (DEBUG)"));
99
101 "aarch64-search-limit", cl::Hidden, cl::init(2048),
102 cl::desc("Restrict range of instructions to search for the "
103 "machine-combiner gather pattern optimization"));
104
105AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
106 : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN,
107 AArch64::ADJCALLSTACKUP, AArch64::CATCHRET),
108 RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}
109
110/// GetInstSize - Return the number of bytes of code the specified
111/// instruction may be. This returns the maximum number of bytes.
112unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
113 const MachineBasicBlock &MBB = *MI.getParent();
114 const MachineFunction *MF = MBB.getParent();
115 const Function &F = MF->getFunction();
116 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
117
118 {
119 auto Op = MI.getOpcode();
120 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
121 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
122 }
123
124 // Meta-instructions emit no code.
125 if (MI.isMetaInstruction())
126 return 0;
127
128 // FIXME: We currently only handle pseudoinstructions that don't get expanded
129 // before the assembly printer.
130 unsigned NumBytes = 0;
131 const MCInstrDesc &Desc = MI.getDesc();
132
133 if (!MI.isBundle() && isTailCallReturnInst(MI)) {
134 NumBytes = Desc.getSize() ? Desc.getSize() : 4;
135
136 const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
137 if (!MFI->shouldSignReturnAddress(*MF))
138 return NumBytes;
139
140 const auto &STI = MF->getSubtarget<AArch64Subtarget>();
141 auto Method = STI.getAuthenticatedLRCheckMethod(*MF);
142 NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
143 return NumBytes;
144 }
145
146 // Size should preferably be set in
147 // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
148 // Specific cases handle instructions of variable sizes
149 switch (Desc.getOpcode()) {
150 default:
151 if (Desc.getSize())
152 return Desc.getSize();
153
154 // Anything not explicitly designated otherwise (i.e. pseudo-instructions
155 // with fixed constant size but not specified in .td file) is a normal
156 // 4-byte insn.
157 NumBytes = 4;
158 break;
159 case TargetOpcode::STACKMAP:
160 // The upper bound for a stackmap intrinsic is the full length of its shadow
161 NumBytes = StackMapOpers(&MI).getNumPatchBytes();
162 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
163 break;
164 case TargetOpcode::PATCHPOINT:
165 // The size of the patchpoint intrinsic is the number of bytes requested
166 NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
167 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
168 break;
169 case TargetOpcode::STATEPOINT:
170 NumBytes = StatepointOpers(&MI).getNumPatchBytes();
171 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
172 // No patch bytes means a normal call inst is emitted
173 if (NumBytes == 0)
174 NumBytes = 4;
175 break;
176 case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
177 // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
178 // instructions are expanded to the specified number of NOPs. Otherwise,
179 // they are expanded to 36-byte XRay sleds.
180 NumBytes =
181 F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
182 break;
183 case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
184 case TargetOpcode::PATCHABLE_TAIL_CALL:
185 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
186 // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
187 NumBytes = 36;
188 break;
189 case TargetOpcode::PATCHABLE_EVENT_CALL:
190 // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
191 NumBytes = 24;
192 break;
193
194 case AArch64::SPACE:
195 NumBytes = MI.getOperand(1).getImm();
196 break;
197 case TargetOpcode::BUNDLE:
198 NumBytes = getInstBundleLength(MI);
199 break;
200 }
201
202 return NumBytes;
203}
204
205unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
206 unsigned Size = 0;
207 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
208 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
209 while (++I != E && I->isInsideBundle()) {
210 assert(!I->isBundle() && "No nested bundle!");
211 Size += getInstSizeInBytes(*I);
212 }
213 return Size;
214}
215
216static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
217 SmallVectorImpl<MachineOperand> &Cond) {
218 // Block ends with fall-through condbranch.
219 switch (LastInst->getOpcode()) {
220 default:
221 llvm_unreachable("Unknown branch instruction?");
222 case AArch64::Bcc:
223 Target = LastInst->getOperand(1).getMBB();
224 Cond.push_back(LastInst->getOperand(0));
225 break;
226 case AArch64::CBZW:
227 case AArch64::CBZX:
228 case AArch64::CBNZW:
229 case AArch64::CBNZX:
230 Target = LastInst->getOperand(1).getMBB();
231 Cond.push_back(MachineOperand::CreateImm(-1));
232 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
233 Cond.push_back(LastInst->getOperand(0));
234 break;
235 case AArch64::TBZW:
236 case AArch64::TBZX:
237 case AArch64::TBNZW:
238 case AArch64::TBNZX:
239 Target = LastInst->getOperand(2).getMBB();
240 Cond.push_back(MachineOperand::CreateImm(-1));
241 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
242 Cond.push_back(LastInst->getOperand(0));
243 Cond.push_back(LastInst->getOperand(1));
244 break;
245 case AArch64::CBWPri:
246 case AArch64::CBXPri:
247 case AArch64::CBWPrr:
248 case AArch64::CBXPrr:
249 Target = LastInst->getOperand(3).getMBB();
250 Cond.push_back(MachineOperand::CreateImm(-1));
251 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
252 Cond.push_back(LastInst->getOperand(0));
253 Cond.push_back(LastInst->getOperand(1));
254 Cond.push_back(LastInst->getOperand(2));
255 break;
256 case AArch64::CBBAssertExt:
257 case AArch64::CBHAssertExt:
258 Target = LastInst->getOperand(3).getMBB();
259 Cond.push_back(MachineOperand::CreateImm(-1)); // -1
260 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); // Opc
261 Cond.push_back(LastInst->getOperand(0)); // Cond
262 Cond.push_back(LastInst->getOperand(1)); // Op0
263 Cond.push_back(LastInst->getOperand(2)); // Op1
264 Cond.push_back(LastInst->getOperand(4)); // Ext0
265 Cond.push_back(LastInst->getOperand(5)); // Ext1
266 break;
267 }
268}
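// For reference, a summary of the Cond encodings built above (and consumed by
// reverseBranchCondition, instantiateCondBranch and insertSelect below):
//   Bcc:               { CC }
//   CB[N]Z[W|X]:       { -1, Opcode, Reg }
//   TB[N]Z[W|X]:       { -1, Opcode, Reg, BitImm }
//   CB[W|X]Pri/Prr:    { -1, Opcode, CC, Op0, Op1 }
//   CB[B|H]AssertExt:  { -1, Opcode, CC, Op0, Op1, Ext0, Ext1 }
// The leading -1 merely distinguishes the folded compare-and-branch forms
// from a plain Bcc condition code.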
269
270static unsigned getBranchDisplacementBits(unsigned Opc) {
271 switch (Opc) {
272 default:
273 llvm_unreachable("unexpected opcode!");
274 case AArch64::B:
275 return BDisplacementBits;
276 case AArch64::TBNZW:
277 case AArch64::TBZW:
278 case AArch64::TBNZX:
279 case AArch64::TBZX:
280 return TBZDisplacementBits;
281 case AArch64::CBNZW:
282 case AArch64::CBZW:
283 case AArch64::CBNZX:
284 case AArch64::CBZX:
285 return CBZDisplacementBits;
286 case AArch64::Bcc:
287 return BCCDisplacementBits;
288 case AArch64::CBWPri:
289 case AArch64::CBXPri:
290 case AArch64::CBBAssertExt:
291 case AArch64::CBHAssertExt:
292 case AArch64::CBWPrr:
293 case AArch64::CBXPrr:
294 return CBDisplacementBits;
295 }
296}
297
298bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
299 int64_t BrOffset) const {
300 unsigned Bits = getBranchDisplacementBits(BranchOp);
301 assert(Bits >= 3 && "max branch displacement must be enough to jump "
302 "over conditional branch expansion");
303 return isIntN(Bits, BrOffset / 4);
304}
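// Worked example (assuming the default option values above): the offset is
// divided by 4 because branch displacements are encoded in instruction words.
// Bcc carries a 19-bit signed word offset, so byte offsets of roughly +/-1 MiB
// are in range; TB[N]Z with 14 bits allows about +/-32 KiB, and B with 26 bits
// about +/-128 MiB.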
305
306MachineBasicBlock *
307AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
308 switch (MI.getOpcode()) {
309 default:
310 llvm_unreachable("unexpected opcode!");
311 case AArch64::B:
312 return MI.getOperand(0).getMBB();
313 case AArch64::TBZW:
314 case AArch64::TBNZW:
315 case AArch64::TBZX:
316 case AArch64::TBNZX:
317 return MI.getOperand(2).getMBB();
318 case AArch64::CBZW:
319 case AArch64::CBNZW:
320 case AArch64::CBZX:
321 case AArch64::CBNZX:
322 case AArch64::Bcc:
323 return MI.getOperand(1).getMBB();
324 case AArch64::CBWPri:
325 case AArch64::CBXPri:
326 case AArch64::CBBAssertExt:
327 case AArch64::CBHAssertExt:
328 case AArch64::CBWPrr:
329 case AArch64::CBXPrr:
330 return MI.getOperand(3).getMBB();
331 }
332}
333
334void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
335 MachineBasicBlock &NewDestBB,
336 MachineBasicBlock &RestoreBB,
337 const DebugLoc &DL,
338 int64_t BrOffset,
339 RegScavenger *RS) const {
340 assert(RS && "RegScavenger required for long branching");
341 assert(MBB.empty() &&
342 "new block should be inserted for expanding unconditional branch");
343 assert(MBB.pred_size() == 1);
344 assert(RestoreBB.empty() &&
345 "restore block should be inserted for restoring clobbered registers");
346
347 auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
348 // Offsets outside of the signed 33-bit range are not supported for ADRP +
349 // ADD.
350 if (!isInt<33>(BrOffset))
351 report_fatal_error(
352 "Branch offsets outside of the signed 33-bit range not supported");
353
354 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
355 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
356 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
357 .addReg(Reg)
358 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
359 .addImm(0);
360 BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
361 };
362
363 RS->enterBasicBlockEnd(MBB);
364 // If X16 is unused, we can rely on the linker to insert a range extension
365 // thunk if NewDestBB is out of range of a single B instruction.
366 constexpr Register Reg = AArch64::X16;
367 if (!RS->isRegUsed(Reg)) {
368 insertUnconditionalBranch(MBB, &NewDestBB, DL);
369 RS->setRegUsed(Reg);
370 return;
371 }
372
373 // If there's a free register and it's worth inflating the code size,
374 // manually insert the indirect branch.
375 Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
376 if (Scavenged != AArch64::NoRegister &&
377 MBB.getSectionID() == MBBSectionID::ColdSectionID) {
378 buildIndirectBranch(Scavenged, NewDestBB);
379 RS->setRegUsed(Scavenged);
380 return;
381 }
382
383 // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
384 // with red zones.
385 AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
386 if (!AFI || AFI->hasRedZone().value_or(true))
387 report_fatal_error(
388 "Unable to insert indirect branch inside function that has red zone");
389
390 // Otherwise, spill X16 and defer range extension to the linker.
391 BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
392 .addReg(AArch64::SP, RegState::Define)
393 .addReg(Reg)
394 .addReg(AArch64::SP)
395 .addImm(-16);
396
397 BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);
398
399 BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
400 .addReg(AArch64::SP, RegState::Define)
401 .addReg(Reg, RegState::Define)
402 .addReg(AArch64::SP)
403 .addImm(16);
404}
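// For illustration, the expansions built above correspond to roughly these
// sequences: if X16 is unused, a plain "b NewDestBB" is emitted and the linker
// may insert a range-extension thunk; if a scavenged GPR is available in a
// cold section,
//   adrp xN, DestBB            ; page of the destination
//   add  xN, xN, :lo12:DestBB  ; page offset
//   br   xN
// and otherwise X16 is briefly spilled:
//   str  x16, [sp, #-16]!      ; STRXpre
//   b    RestoreBB             ; range extension deferred to the linker
//   ...
// RestoreBB:
//   ldr  x16, [sp], #16        ; LDRXpost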
405
406// Branch analysis.
407bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
408 MachineBasicBlock *&TBB,
409 MachineBasicBlock *&FBB,
410 SmallVectorImpl<MachineOperand> &Cond,
411 bool AllowModify) const {
412 // If the block has no terminators, it just falls into the block after it.
413 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
414 if (I == MBB.end())
415 return false;
416
417 // Skip over SpeculationBarrierEndBB terminators
418 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
419 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
420 --I;
421 }
422
423 if (!isUnpredicatedTerminator(*I))
424 return false;
425
426 // Get the last instruction in the block.
427 MachineInstr *LastInst = &*I;
428
429 // If there is only one terminator instruction, process it.
430 unsigned LastOpc = LastInst->getOpcode();
431 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
432 if (isUncondBranchOpcode(LastOpc)) {
433 TBB = LastInst->getOperand(0).getMBB();
434 return false;
435 }
436 if (isCondBranchOpcode(LastOpc)) {
437 // Block ends with fall-through condbranch.
438 parseCondBranch(LastInst, TBB, Cond);
439 return false;
440 }
441 return true; // Can't handle indirect branch.
442 }
443
444 // Get the instruction before it if it is a terminator.
445 MachineInstr *SecondLastInst = &*I;
446 unsigned SecondLastOpc = SecondLastInst->getOpcode();
447
448 // If AllowModify is true and the block ends with two or more unconditional
449 // branches, delete all but the first unconditional branch.
450 if (AllowModify && isUncondBranchOpcode(LastOpc)) {
451 while (isUncondBranchOpcode(SecondLastOpc)) {
452 LastInst->eraseFromParent();
453 LastInst = SecondLastInst;
454 LastOpc = LastInst->getOpcode();
455 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
456 // Return now; the only terminator is an unconditional branch.
457 TBB = LastInst->getOperand(0).getMBB();
458 return false;
459 }
460 SecondLastInst = &*I;
461 SecondLastOpc = SecondLastInst->getOpcode();
462 }
463 }
464
465 // If we're allowed to modify and the block ends in an unconditional branch
466 // which could simply fall through, remove the branch. (Note: This case only
467 // matters when we can't understand the whole sequence; otherwise it's also
468 // handled by BranchFolding.cpp.)
469 if (AllowModify && isUncondBranchOpcode(LastOpc) &&
470 MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
471 LastInst->eraseFromParent();
472 LastInst = SecondLastInst;
473 LastOpc = LastInst->getOpcode();
474 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
475 assert(!isUncondBranchOpcode(LastOpc) &&
476 "unreachable unconditional branches removed above");
477
478 if (isCondBranchOpcode(LastOpc)) {
479 // Block ends with fall-through condbranch.
480 parseCondBranch(LastInst, TBB, Cond);
481 return false;
482 }
483 return true; // Can't handle indirect branch.
484 }
485 SecondLastInst = &*I;
486 SecondLastOpc = SecondLastInst->getOpcode();
487 }
488
489 // If there are three terminators, we don't know what sort of block this is.
490 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
491 return true;
492
493 // If the block ends with a B and a Bcc, handle it.
494 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
495 parseCondBranch(SecondLastInst, TBB, Cond);
496 FBB = LastInst->getOperand(0).getMBB();
497 return false;
498 }
499
500 // If the block ends with two unconditional branches, handle it. The second
501 // one is not executed, so remove it.
502 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
503 TBB = SecondLastInst->getOperand(0).getMBB();
504 I = LastInst;
505 if (AllowModify)
506 I->eraseFromParent();
507 return false;
508 }
509
510 // ...likewise if it ends with an indirect branch followed by an unconditional
511 // branch.
512 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
513 I = LastInst;
514 if (AllowModify)
515 I->eraseFromParent();
516 return true;
517 }
518
519 // Otherwise, can't handle this.
520 return true;
521}
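// In summary, the block shapes analyzeBranch recognizes (returning false) are:
//   ... fallthrough                    -> TBB/FBB left null, Cond empty
//   ... b TBB                          -> TBB set, Cond empty
//   ... cond-branch TBB (fallthrough)  -> TBB set, Cond filled in
//   ... cond-branch TBB; b FBB         -> TBB, FBB and Cond all set
// Indirect branches, three or more terminators, or anything else unrecognized
// make it return true ("cannot analyze").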
522
523bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
524 MachineBranchPredicate &MBP,
525 bool AllowModify) const {
526 // For the moment, handle only a block which ends with a cb(n)zx followed by
527 // a fallthrough. Why this? Because it is a common form.
528 // TODO: Should we handle b.cc?
529
530 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
531 if (I == MBB.end())
532 return true;
533
534 // Skip over SpeculationBarrierEndBB terminators
535 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
536 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
537 --I;
538 }
539
540 if (!isUnpredicatedTerminator(*I))
541 return true;
542
543 // Get the last instruction in the block.
544 MachineInstr *LastInst = &*I;
545 unsigned LastOpc = LastInst->getOpcode();
546 if (!isCondBranchOpcode(LastOpc))
547 return true;
548
549 switch (LastOpc) {
550 default:
551 return true;
552 case AArch64::CBZW:
553 case AArch64::CBZX:
554 case AArch64::CBNZW:
555 case AArch64::CBNZX:
556 break;
557 };
558
559 MBP.TrueDest = LastInst->getOperand(1).getMBB();
560 assert(MBP.TrueDest && "expected!");
561 MBP.FalseDest = MBB.getNextNode();
562
563 MBP.ConditionDef = nullptr;
564 MBP.SingleUseCondition = false;
565
566 MBP.LHS = LastInst->getOperand(0);
567 MBP.RHS = MachineOperand::CreateImm(0);
568 MBP.Predicate = (LastOpc == AArch64::CBNZX || LastOpc == AArch64::CBNZW)
569 ? MachineBranchPredicate::PRED_NE
570 : MachineBranchPredicate::PRED_EQ;
571 return false;
572}
573
574bool AArch64InstrInfo::reverseBranchCondition(
575 SmallVectorImpl<MachineOperand> &Cond) const {
576 if (Cond[0].getImm() != -1) {
577 // Regular Bcc
578 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
579 Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
580 } else {
581 // Folded compare-and-branch
582 switch (Cond[1].getImm()) {
583 default:
584 llvm_unreachable("Unknown conditional branch!");
585 case AArch64::CBZW:
586 Cond[1].setImm(AArch64::CBNZW);
587 break;
588 case AArch64::CBNZW:
589 Cond[1].setImm(AArch64::CBZW);
590 break;
591 case AArch64::CBZX:
592 Cond[1].setImm(AArch64::CBNZX);
593 break;
594 case AArch64::CBNZX:
595 Cond[1].setImm(AArch64::CBZX);
596 break;
597 case AArch64::TBZW:
598 Cond[1].setImm(AArch64::TBNZW);
599 break;
600 case AArch64::TBNZW:
601 Cond[1].setImm(AArch64::TBZW);
602 break;
603 case AArch64::TBZX:
604 Cond[1].setImm(AArch64::TBNZX);
605 break;
606 case AArch64::TBNZX:
607 Cond[1].setImm(AArch64::TBZX);
608 break;
609
610 // Cond is { -1, Opcode, CC, Op0, Op1, ... }
611 case AArch64::CBWPri:
612 case AArch64::CBXPri:
613 case AArch64::CBBAssertExt:
614 case AArch64::CBHAssertExt:
615 case AArch64::CBWPrr:
616 case AArch64::CBXPrr: {
617 // Pseudos using standard 4-bit Arm condition codes
618 AArch64CC::CondCode CC =
619 static_cast<AArch64CC::CondCode>(Cond[2].getImm());
620 Cond[2].setImm(AArch64CC::getInvertedCondCode(CC));
621 }
622 }
623 }
624
625 return false;
626}
627
628unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
629 int *BytesRemoved) const {
630 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
631 if (I == MBB.end())
632 return 0;
633
634 if (!isUncondBranchOpcode(I->getOpcode()) &&
635 !isCondBranchOpcode(I->getOpcode()))
636 return 0;
637
638 // Remove the branch.
639 I->eraseFromParent();
640
641 I = MBB.end();
642
643 if (I == MBB.begin()) {
644 if (BytesRemoved)
645 *BytesRemoved = 4;
646 return 1;
647 }
648 --I;
649 if (!isCondBranchOpcode(I->getOpcode())) {
650 if (BytesRemoved)
651 *BytesRemoved = 4;
652 return 1;
653 }
654
655 // Remove the branch.
656 I->eraseFromParent();
657 if (BytesRemoved)
658 *BytesRemoved = 8;
659
660 return 2;
661}
662
663void AArch64InstrInfo::instantiateCondBranch(
664 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
665 ArrayRef<MachineOperand> Cond) const {
666 if (Cond[0].getImm() != -1) {
667 // Regular Bcc
668 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
669 } else {
670 // Folded compare-and-branch
671 // Note that we use addOperand instead of addReg to keep the flags.
672
673 // cbz, cbnz
674 const MachineInstrBuilder MIB =
675 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
676
677 // tbz/tbnz
678 if (Cond.size() > 3)
679 MIB.add(Cond[3]);
680
681 // cb
682 if (Cond.size() > 4)
683 MIB.add(Cond[4]);
684
685 MIB.addMBB(TBB);
686
687 // cb[b,h]
688 if (Cond.size() > 5) {
689 MIB.addImm(Cond[5].getImm());
690 MIB.addImm(Cond[6].getImm());
691 }
692 }
693}
694
695unsigned AArch64InstrInfo::insertBranch(
696 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
697 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
698 // Shouldn't be a fall through.
699 assert(TBB && "insertBranch must not be told to insert a fallthrough");
700
701 if (!FBB) {
702 if (Cond.empty()) // Unconditional branch?
703 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
704 else
705 instantiateCondBranch(MBB, DL, TBB, Cond);
706
707 if (BytesAdded)
708 *BytesAdded = 4;
709
710 return 1;
711 }
712
713 // Two-way conditional branch.
714 instantiateCondBranch(MBB, DL, TBB, Cond);
715 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
716
717 if (BytesAdded)
718 *BytesAdded = 8;
719
720 return 2;
721}
722
724 const TargetInstrInfo &TII) {
725 for (MachineInstr &MI : MBB->terminators()) {
726 unsigned Opc = MI.getOpcode();
727 switch (Opc) {
728 case AArch64::CBZW:
729 case AArch64::CBZX:
730 case AArch64::TBZW:
731 case AArch64::TBZX:
732 // CBZ/TBZ with WZR/XZR -> unconditional B
733 if (MI.getOperand(0).getReg() == AArch64::WZR ||
734 MI.getOperand(0).getReg() == AArch64::XZR) {
735 DEBUG_WITH_TYPE("optimizeTerminators",
736 dbgs() << "Removing always taken branch: " << MI);
737 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
738 SmallVector<MachineBasicBlock *> Succs(MBB->successors());
739 for (auto *S : Succs)
740 if (S != Target)
741 MBB->removeSuccessor(S);
742 DebugLoc DL = MI.getDebugLoc();
743 while (MBB->rbegin() != &MI)
744 MBB->rbegin()->eraseFromParent();
745 MI.eraseFromParent();
746 BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
747 return true;
748 }
749 break;
750 case AArch64::CBNZW:
751 case AArch64::CBNZX:
752 case AArch64::TBNZW:
753 case AArch64::TBNZX:
754 // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
755 if (MI.getOperand(0).getReg() == AArch64::WZR ||
756 MI.getOperand(0).getReg() == AArch64::XZR) {
757 DEBUG_WITH_TYPE("optimizeTerminators",
758 dbgs() << "Removing never taken branch: " << MI);
759 MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
760 MI.getParent()->removeSuccessor(Target);
761 MI.eraseFromParent();
762 return true;
763 }
764 break;
765 }
766 }
767 return false;
768}
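// Illustrative effect of the folding above: a terminator such as
//   cbz wzr, %bb.target    ; always taken
// is rewritten into an unconditional "b %bb.target" (dropping the other
// successors), while
//   cbnz xzr, %bb.target   ; never taken
// is simply deleted along with its successor edge.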
769
770// Find the original register that VReg is copied from.
771static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
772 while (Register::isVirtualRegister(VReg)) {
773 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
774 if (!DefMI->isFullCopy())
775 return VReg;
776 VReg = DefMI->getOperand(1).getReg();
777 }
778 return VReg;
779}
780
781// Determine if VReg is defined by an instruction that can be folded into a
782// csel instruction. If so, return the folded opcode, and the replacement
783// register.
784static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
785 unsigned *NewReg = nullptr) {
786 VReg = removeCopies(MRI, VReg);
787 if (!Register::isVirtualRegister(VReg))
788 return 0;
789
790 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
791 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
792 unsigned Opc = 0;
793 unsigned SrcReg = 0;
794 switch (DefMI->getOpcode()) {
795 case AArch64::SUBREG_TO_REG:
796 // Check for the following way to define an 64-bit immediate:
797 // %0:gpr32 = MOVi32imm 1
798 // %1:gpr64 = SUBREG_TO_REG 0, %0:gpr32, %subreg.sub_32
799 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 0)
800 return 0;
801 if (!DefMI->getOperand(2).isReg())
802 return 0;
803 if (!DefMI->getOperand(3).isImm() ||
804 DefMI->getOperand(3).getImm() != AArch64::sub_32)
805 return 0;
806 DefMI = MRI.getVRegDef(DefMI->getOperand(2).getReg());
807 if (DefMI->getOpcode() != AArch64::MOVi32imm)
808 return 0;
809 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
810 return 0;
811 assert(Is64Bit);
812 SrcReg = AArch64::XZR;
813 Opc = AArch64::CSINCXr;
814 break;
815
816 case AArch64::MOVi32imm:
817 case AArch64::MOVi64imm:
818 if (!DefMI->getOperand(1).isImm() || DefMI->getOperand(1).getImm() != 1)
819 return 0;
820 SrcReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
821 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
822 break;
823
824 case AArch64::ADDSXri:
825 case AArch64::ADDSWri:
826 // if NZCV is used, do not fold.
827 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
828 true) == -1)
829 return 0;
830 // fall-through to ADDXri and ADDWri.
831 [[fallthrough]];
832 case AArch64::ADDXri:
833 case AArch64::ADDWri:
834 // add x, 1 -> csinc.
835 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
836 DefMI->getOperand(3).getImm() != 0)
837 return 0;
838 SrcReg = DefMI->getOperand(1).getReg();
839 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
840 break;
841
842 case AArch64::ORNXrr:
843 case AArch64::ORNWrr: {
844 // not x -> csinv, represented as orn dst, xzr, src.
845 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
846 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
847 return 0;
848 SrcReg = DefMI->getOperand(2).getReg();
849 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
850 break;
851 }
852
853 case AArch64::SUBSXrr:
854 case AArch64::SUBSWrr:
855 // if NZCV is used, do not fold.
856 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
857 true) == -1)
858 return 0;
859 // fall-through to SUBXrr and SUBWrr.
860 [[fallthrough]];
861 case AArch64::SUBXrr:
862 case AArch64::SUBWrr: {
863 // neg x -> csneg, represented as sub dst, xzr, src.
864 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
865 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
866 return 0;
867 SrcReg = DefMI->getOperand(2).getReg();
868 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
869 break;
870 }
871 default:
872 return 0;
873 }
874 assert(Opc && SrcReg && "Missing parameters");
875
876 if (NewReg)
877 *NewReg = SrcReg;
878 return Opc;
879}
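// For reference, the foldings recognized above are (NewReg is the register
// that replaces the folded value; the original def is left for DCE):
//   mov  #1 (MOVi32imm/MOVi64imm, or via SUBREG_TO_REG)  -> CSINC with WZR/XZR
//   add  dst, src, #1                                    -> CSINC with src
//   orn  dst, wzr/xzr, src   (i.e. ~src)                 -> CSINV with src
//   sub  dst, wzr/xzr, src   (i.e. -src)                 -> CSNEG with src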
880
881bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
882 ArrayRef<MachineOperand> Cond,
883 Register DstReg, Register TrueReg,
884 Register FalseReg, int &CondCycles,
885 int &TrueCycles,
886 int &FalseCycles) const {
887 // Check register classes.
888 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
889 const TargetRegisterClass *RC =
890 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
891 if (!RC)
892 return false;
893
894 // Also need to check the dest regclass, in case we're trying to optimize
895 // something like:
896 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
897 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
898 return false;
899
900 // Expanding cbz/tbz requires an extra cycle of latency on the condition.
901 unsigned ExtraCondLat = Cond.size() != 1;
902
903 // GPRs are handled by csel.
904 // FIXME: Fold in x+1, -x, and ~x when applicable.
905 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
906 AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
907 // Single-cycle csel, csinc, csinv, and csneg.
908 CondCycles = 1 + ExtraCondLat;
909 TrueCycles = FalseCycles = 1;
910 if (canFoldIntoCSel(MRI, TrueReg))
911 TrueCycles = 0;
912 else if (canFoldIntoCSel(MRI, FalseReg))
913 FalseCycles = 0;
914 return true;
915 }
916
917 // Scalar floating point is handled by fcsel.
918 // FIXME: Form fabs, fmin, and fmax when applicable.
919 if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
920 AArch64::FPR32RegClass.hasSubClassEq(RC)) {
921 CondCycles = 5 + ExtraCondLat;
922 TrueCycles = FalseCycles = 2;
923 return true;
924 }
925
926 // Can't do vectors.
927 return false;
928}
929
930void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
931 MachineBasicBlock::iterator I,
932 const DebugLoc &DL, Register DstReg,
933 ArrayRef<MachineOperand> Cond,
934 Register TrueReg, Register FalseReg) const {
935 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
936
937 // Parse the condition code, see parseCondBranch() above.
938 AArch64CC::CondCode CC;
939 switch (Cond.size()) {
940 default:
941 llvm_unreachable("Unknown condition opcode in Cond");
942 case 1: // b.cc
943 CC = AArch64CC::CondCode(Cond[0].getImm());
944 break;
945 case 3: { // cbz/cbnz
946 // We must insert a compare against 0.
947 bool Is64Bit;
948 switch (Cond[1].getImm()) {
949 default:
950 llvm_unreachable("Unknown branch opcode in Cond");
951 case AArch64::CBZW:
952 Is64Bit = false;
953 CC = AArch64CC::EQ;
954 break;
955 case AArch64::CBZX:
956 Is64Bit = true;
957 CC = AArch64CC::EQ;
958 break;
959 case AArch64::CBNZW:
960 Is64Bit = false;
961 CC = AArch64CC::NE;
962 break;
963 case AArch64::CBNZX:
964 Is64Bit = true;
965 CC = AArch64CC::NE;
966 break;
967 }
968 Register SrcReg = Cond[2].getReg();
969 if (Is64Bit) {
970 // cmp reg, #0 is actually subs xzr, reg, #0.
971 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
972 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
973 .addReg(SrcReg)
974 .addImm(0)
975 .addImm(0);
976 } else {
977 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
978 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
979 .addReg(SrcReg)
980 .addImm(0)
981 .addImm(0);
982 }
983 break;
984 }
985 case 4: { // tbz/tbnz
986 // We must insert a tst instruction.
987 switch (Cond[1].getImm()) {
988 default:
989 llvm_unreachable("Unknown branch opcode in Cond");
990 case AArch64::TBZW:
991 case AArch64::TBZX:
992 CC = AArch64CC::EQ;
993 break;
994 case AArch64::TBNZW:
995 case AArch64::TBNZX:
996 CC = AArch64CC::NE;
997 break;
998 }
999 // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
1000 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
1001 BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
1002 .addReg(Cond[2].getReg())
1003 .addImm(
1004 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
1005 else
1006 BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
1007 .addReg(Cond[2].getReg())
1008 .addImm(
1009 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
1010 break;
1011 }
1012 case 5: { // cb
1013 // We must insert a cmp, that is a subs
1014 // 0 1 2 3 4
1015 // Cond is { -1, Opcode, CC, Op0, Op1 }
1016
1017 unsigned SubsOpc, SubsDestReg;
1018 bool IsImm = false;
1019 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1020 switch (Cond[1].getImm()) {
1021 default:
1022 llvm_unreachable("Unknown branch opcode in Cond");
1023 case AArch64::CBWPri:
1024 SubsOpc = AArch64::SUBSWri;
1025 SubsDestReg = AArch64::WZR;
1026 IsImm = true;
1027 break;
1028 case AArch64::CBXPri:
1029 SubsOpc = AArch64::SUBSXri;
1030 SubsDestReg = AArch64::XZR;
1031 IsImm = true;
1032 break;
1033 case AArch64::CBWPrr:
1034 SubsOpc = AArch64::SUBSWrr;
1035 SubsDestReg = AArch64::WZR;
1036 IsImm = false;
1037 break;
1038 case AArch64::CBXPrr:
1039 SubsOpc = AArch64::SUBSXrr;
1040 SubsDestReg = AArch64::XZR;
1041 IsImm = false;
1042 break;
1043 }
1044
1045 if (IsImm)
1046 BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
1047 .addReg(Cond[3].getReg())
1048 .addImm(Cond[4].getImm())
1049 .addImm(0);
1050 else
1051 BuildMI(MBB, I, DL, get(SubsOpc), SubsDestReg)
1052 .addReg(Cond[3].getReg())
1053 .addReg(Cond[4].getReg());
1054 } break;
1055 case 7: { // cb[b,h]
1056 // We must insert a cmp, that is a subs, but also zero- or sign-extensions
1057 // that have been folded. For the first operand we codegen an explicit
1058 // extension, for the second operand we fold the extension into cmp.
1059 // 0 1 2 3 4 5 6
1060 // Cond is { -1, Opcode, CC, Op0, Op1, Ext0, Ext1 }
1061
1062 // We need a new register for the now explicitly extended register
1063 Register Reg = Cond[4].getReg();
1065 unsigned ExtOpc;
1066 unsigned ExtBits;
1067 AArch64_AM::ShiftExtendType ExtendType =
1068 static_cast<AArch64_AM::ShiftExtendType>(Cond[5].getImm());
1069 switch (ExtendType) {
1070 default:
1071 llvm_unreachable("Unknown shift-extend for CB instruction");
1072 case AArch64_AM::SXTB:
1073 assert(
1074 Cond[1].getImm() == AArch64::CBBAssertExt &&
1075 "Unexpected compare-and-branch instruction for SXTB shift-extend");
1076 ExtOpc = AArch64::SBFMWri;
1077 ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
1078 break;
1079 case AArch64_AM::SXTH:
1080 assert(
1081 Cond[1].getImm() == AArch64::CBHAssertExt &&
1082 "Unexpected compare-and-branch instruction for SXTH shift-extend");
1083 ExtOpc = AArch64::SBFMWri;
1084 ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
1085 break;
1086 case AArch64_AM::UXTB:
1087 assert(
1088 Cond[1].getImm() == AArch64::CBBAssertExt &&
1089 "Unexpected compare-and-branch instruction for UXTB shift-extend");
1090 ExtOpc = AArch64::ANDWri;
1091 ExtBits = AArch64_AM::encodeLogicalImmediate(0xff, 32);
1092 break;
1093 case AArch64_AM::UXTH:
1094 assert(
1095 Cond[1].getImm() == AArch64::CBHAssertExt &&
1096 "Unexpected compare-and-branch instruction for UXTH shift-extend");
1097 ExtOpc = AArch64::ANDWri;
1098 ExtBits = AArch64_AM::encodeLogicalImmediate(0xffff, 32);
1099 break;
1100 }
1101
1102 // Build the explicit extension of the first operand
1103 Reg = MRI.createVirtualRegister(&AArch64::GPR32spRegClass);
1104 MachineInstrBuilder MBBI =
1105 BuildMI(MBB, I, DL, get(ExtOpc), Reg).addReg(Cond[4].getReg());
1106 if (ExtOpc != AArch64::ANDWri)
1107 MBBI.addImm(0);
1108 MBBI.addImm(ExtBits);
1109 }
1110
1111 // Now, subs with an extended second operand
1113 AArch64_AM::ShiftExtendType ExtendType =
1114 static_cast<AArch64_AM::ShiftExtendType>(Cond[6].getImm());
1115 MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
1116 MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
1117 BuildMI(MBB, I, DL, get(AArch64::SUBSWrx), AArch64::WZR)
1118 .addReg(Cond[3].getReg())
1119 .addReg(Reg)
1120 .addImm(AArch64_AM::getArithExtendImm(ExtendType, 0));
1121 } // If no extension is needed, just a regular subs
1122 else {
1123 MRI.constrainRegClass(Reg, MRI.getRegClass(Cond[3].getReg()));
1124 MRI.constrainRegClass(Cond[3].getReg(), &AArch64::GPR32spRegClass);
1125 BuildMI(MBB, I, DL, get(AArch64::SUBSWrr), AArch64::WZR)
1126 .addReg(Cond[3].getReg())
1127 .addReg(Reg);
1128 }
1129
1130 CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
1131 } break;
1132 }
1133
1134 unsigned Opc = 0;
1135 const TargetRegisterClass *RC = nullptr;
1136 bool TryFold = false;
1137 if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
1138 RC = &AArch64::GPR64RegClass;
1139 Opc = AArch64::CSELXr;
1140 TryFold = true;
1141 } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
1142 RC = &AArch64::GPR32RegClass;
1143 Opc = AArch64::CSELWr;
1144 TryFold = true;
1145 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
1146 RC = &AArch64::FPR64RegClass;
1147 Opc = AArch64::FCSELDrrr;
1148 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
1149 RC = &AArch64::FPR32RegClass;
1150 Opc = AArch64::FCSELSrrr;
1151 }
1152 assert(RC && "Unsupported regclass");
1153
1154 // Try folding simple instructions into the csel.
1155 if (TryFold) {
1156 unsigned NewReg = 0;
1157 unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewReg);
1158 if (FoldedOpc) {
1159 // The folded opcodes csinc, csinv and csneg apply the operation to
1160 // FalseReg, so we need to invert the condition.
1161 CC = AArch64CC::getInvertedCondCode(CC);
1162 TrueReg = FalseReg;
1163 } else
1164 FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewReg);
1165
1166 // Fold the operation. Leave any dead instructions for DCE to clean up.
1167 if (FoldedOpc) {
1168 FalseReg = NewReg;
1169 Opc = FoldedOpc;
1170 // Extend the live range of NewReg.
1171 MRI.clearKillFlags(NewReg);
1172 }
1173 }
1174
1175 // Pull all virtual registers into the appropriate class.
1176 MRI.constrainRegClass(TrueReg, RC);
1177 // FalseReg might be WZR or XZR if the folded operand is a literal 1.
1178 assert(
1179 (FalseReg.isVirtual() || FalseReg == AArch64::WZR ||
1180 FalseReg == AArch64::XZR) &&
1181 "FalseReg was folded into a non-virtual register other than WZR or XZR");
1182 if (FalseReg.isVirtual())
1183 MRI.constrainRegClass(FalseReg, RC);
1184
1185 // Insert the csel.
1186 BuildMI(MBB, I, DL, get(Opc), DstReg)
1187 .addReg(TrueReg)
1188 .addReg(FalseReg)
1189 .addImm(CC);
1190}
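// A sketch of the expansion above (virtual register numbers are arbitrary):
// for Cond = { NE } (from a b.ne) this emits
//   csel x0, x1, x2, ne           ; x0 = NE ? x1 : x2
// and if x2 is defined by "add x2, x3, #1", the fold replaces it with
//   csinc x0, x1, x3, ne          ; x0 = NE ? x1 : x3 + 1  (== old x2)
// Folding the true operand instead additionally inverts the condition, as
// handled above.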
1191
1192// Return true if Imm can be loaded into a register by a "cheap" sequence of
1193// instructions. For now, "cheap" means at most two instructions.
1194static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
1195 if (BitSize == 32)
1196 return true;
1197
1198 assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
1199 uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
1200 SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
1201 AArch64_IMM::expandMOVImm(Imm, BitSize, Is);
1202
1203 return Is.size() <= 2;
1204}
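// Rough examples for the 64-bit case above (based on MOVZ/MOVK expansion of
// 16-bit chunks): 0x0000000012340000 expands to a single MOVZ and
// 0x0000567800001234 to MOVZ+MOVK, so both count as cheap; a constant such as
// 0x123456789ABCDEF0 needs MOVZ plus three MOVKs (four instructions) and does
// not.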
1205
1206// Check if a COPY instruction is cheap.
1207static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) {
1208 assert(MI.isCopy() && "Expected COPY instruction");
1209 const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
1210
1211 // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64,
1212 // typically requiring an FMOV instruction with a 2-6 cycle latency.
1213 auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * {
1214 if (Reg.isVirtual())
1215 return MRI.getRegClass(Reg);
1216 if (Reg.isPhysical())
1217 return RI.getMinimalPhysRegClass(Reg);
1218 return nullptr;
1219 };
1220 const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg());
1221 const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg());
1222 if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC))
1223 return false;
1224
1225 return MI.isAsCheapAsAMove();
1226}
1227
1228// FIXME: this implementation should be micro-architecture dependent, so a
1229// micro-architecture target hook should be introduced here in future.
1230bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
1231 if (Subtarget.hasExynosCheapAsMoveHandling()) {
1232 if (isExynosCheapAsMove(MI))
1233 return true;
1234 return MI.isAsCheapAsAMove();
1235 }
1236
1237 switch (MI.getOpcode()) {
1238 default:
1239 return MI.isAsCheapAsAMove();
1240
1241 case TargetOpcode::COPY:
1242 return isCheapCopy(MI, RI);
1243
1244 case AArch64::ADDWrs:
1245 case AArch64::ADDXrs:
1246 case AArch64::SUBWrs:
1247 case AArch64::SUBXrs:
1248 return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;
1249
1250 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
1251 // ORRXri, it is as cheap as MOV.
1252 // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
1253 case AArch64::MOVi32imm:
1254 return isCheapImmediate(MI, 32);
1255 case AArch64::MOVi64imm:
1256 return isCheapImmediate(MI, 64);
1257 }
1258}
1259
1260bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
1261 switch (MI.getOpcode()) {
1262 default:
1263 return false;
1264
1265 case AArch64::ADDWrs:
1266 case AArch64::ADDXrs:
1267 case AArch64::ADDSWrs:
1268 case AArch64::ADDSXrs: {
1269 unsigned Imm = MI.getOperand(3).getImm();
1270 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1271 if (ShiftVal == 0)
1272 return true;
1273 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
1274 }
1275
1276 case AArch64::ADDWrx:
1277 case AArch64::ADDXrx:
1278 case AArch64::ADDXrx64:
1279 case AArch64::ADDSWrx:
1280 case AArch64::ADDSXrx:
1281 case AArch64::ADDSXrx64: {
1282 unsigned Imm = MI.getOperand(3).getImm();
1283 switch (AArch64_AM::getArithExtendType(Imm)) {
1284 default:
1285 return false;
1286 case AArch64_AM::UXTB:
1287 case AArch64_AM::UXTH:
1288 case AArch64_AM::UXTW:
1289 case AArch64_AM::UXTX:
1290 return AArch64_AM::getArithShiftValue(Imm) <= 4;
1291 }
1292 }
1293
1294 case AArch64::SUBWrs:
1295 case AArch64::SUBSWrs: {
1296 unsigned Imm = MI.getOperand(3).getImm();
1297 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1298 return ShiftVal == 0 ||
1299 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
1300 }
1301
1302 case AArch64::SUBXrs:
1303 case AArch64::SUBSXrs: {
1304 unsigned Imm = MI.getOperand(3).getImm();
1305 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
1306 return ShiftVal == 0 ||
1307 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
1308 }
1309
1310 case AArch64::SUBWrx:
1311 case AArch64::SUBXrx:
1312 case AArch64::SUBXrx64:
1313 case AArch64::SUBSWrx:
1314 case AArch64::SUBSXrx:
1315 case AArch64::SUBSXrx64: {
1316 unsigned Imm = MI.getOperand(3).getImm();
1317 switch (AArch64_AM::getArithExtendType(Imm)) {
1318 default:
1319 return false;
1320 case AArch64_AM::UXTB:
1321 case AArch64_AM::UXTH:
1322 case AArch64_AM::UXTW:
1323 case AArch64_AM::UXTX:
1324 return AArch64_AM::getArithShiftValue(Imm) == 0;
1325 }
1326 }
1327
1328 case AArch64::LDRBBroW:
1329 case AArch64::LDRBBroX:
1330 case AArch64::LDRBroW:
1331 case AArch64::LDRBroX:
1332 case AArch64::LDRDroW:
1333 case AArch64::LDRDroX:
1334 case AArch64::LDRHHroW:
1335 case AArch64::LDRHHroX:
1336 case AArch64::LDRHroW:
1337 case AArch64::LDRHroX:
1338 case AArch64::LDRQroW:
1339 case AArch64::LDRQroX:
1340 case AArch64::LDRSBWroW:
1341 case AArch64::LDRSBWroX:
1342 case AArch64::LDRSBXroW:
1343 case AArch64::LDRSBXroX:
1344 case AArch64::LDRSHWroW:
1345 case AArch64::LDRSHWroX:
1346 case AArch64::LDRSHXroW:
1347 case AArch64::LDRSHXroX:
1348 case AArch64::LDRSWroW:
1349 case AArch64::LDRSWroX:
1350 case AArch64::LDRSroW:
1351 case AArch64::LDRSroX:
1352 case AArch64::LDRWroW:
1353 case AArch64::LDRWroX:
1354 case AArch64::LDRXroW:
1355 case AArch64::LDRXroX:
1356 case AArch64::PRFMroW:
1357 case AArch64::PRFMroX:
1358 case AArch64::STRBBroW:
1359 case AArch64::STRBBroX:
1360 case AArch64::STRBroW:
1361 case AArch64::STRBroX:
1362 case AArch64::STRDroW:
1363 case AArch64::STRDroX:
1364 case AArch64::STRHHroW:
1365 case AArch64::STRHHroX:
1366 case AArch64::STRHroW:
1367 case AArch64::STRHroX:
1368 case AArch64::STRQroW:
1369 case AArch64::STRQroX:
1370 case AArch64::STRSroW:
1371 case AArch64::STRSroX:
1372 case AArch64::STRWroW:
1373 case AArch64::STRWroX:
1374 case AArch64::STRXroW:
1375 case AArch64::STRXroX: {
1376 unsigned IsSigned = MI.getOperand(3).getImm();
1377 return !IsSigned;
1378 }
1379 }
1380}
1381
1382bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1383 unsigned Opc = MI.getOpcode();
1384 switch (Opc) {
1385 default:
1386 return false;
1387 case AArch64::SEH_StackAlloc:
1388 case AArch64::SEH_SaveFPLR:
1389 case AArch64::SEH_SaveFPLR_X:
1390 case AArch64::SEH_SaveReg:
1391 case AArch64::SEH_SaveReg_X:
1392 case AArch64::SEH_SaveRegP:
1393 case AArch64::SEH_SaveRegP_X:
1394 case AArch64::SEH_SaveFReg:
1395 case AArch64::SEH_SaveFReg_X:
1396 case AArch64::SEH_SaveFRegP:
1397 case AArch64::SEH_SaveFRegP_X:
1398 case AArch64::SEH_SetFP:
1399 case AArch64::SEH_AddFP:
1400 case AArch64::SEH_Nop:
1401 case AArch64::SEH_PrologEnd:
1402 case AArch64::SEH_EpilogStart:
1403 case AArch64::SEH_EpilogEnd:
1404 case AArch64::SEH_PACSignLR:
1405 case AArch64::SEH_SaveAnyRegI:
1406 case AArch64::SEH_SaveAnyRegIP:
1407 case AArch64::SEH_SaveAnyRegQP:
1408 case AArch64::SEH_SaveAnyRegQPX:
1409 case AArch64::SEH_AllocZ:
1410 case AArch64::SEH_SaveZReg:
1411 case AArch64::SEH_SavePReg:
1412 return true;
1413 }
1414}
1415
1416bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1417 Register &SrcReg, Register &DstReg,
1418 unsigned &SubIdx) const {
1419 switch (MI.getOpcode()) {
1420 default:
1421 return false;
1422 case AArch64::SBFMXri: // aka sxtw
1423 case AArch64::UBFMXri: // aka uxtw
1424 // Check for the 32 -> 64 bit extension case, these instructions can do
1425 // much more.
1426 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1427 return false;
1428 // This is a signed or unsigned 32 -> 64 bit extension.
1429 SrcReg = MI.getOperand(1).getReg();
1430 DstReg = MI.getOperand(0).getReg();
1431 SubIdx = AArch64::sub_32;
1432 return true;
1433 }
1434}
1435
1436bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1437 const MachineInstr &MIa, const MachineInstr &MIb) const {
1438 const TargetRegisterInfo *TRI = &getRegisterInfo();
1439 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1440 int64_t OffsetA = 0, OffsetB = 0;
1441 TypeSize WidthA(0, false), WidthB(0, false);
1442 bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1443
1444 assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1445 assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1446
1447 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1448 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1449 return false;
1450
1451 // Retrieve the base, offset from the base and width. Width
1452 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
1453 // the bases are identical and the offset of the lower memory access plus
1454 // its width does not overlap the offset of the higher memory access,
1455 // then the memory accesses are different.
1456 // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1457 // are assumed to have the same scale (vscale).
1458 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1459 WidthA, TRI) &&
1460 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1461 WidthB, TRI)) {
1462 if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1463 OffsetAIsScalable == OffsetBIsScalable) {
1464 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1465 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1466 TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1467 if (LowWidth.isScalable() == OffsetAIsScalable &&
1468 LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1469 return true;
1470 }
1471 }
1472 return false;
1473}
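// Worked example of the disjointness check above: for
//   ldr x0, [x1, #8]    ; base x1, offset 8,  width 8
//   str x2, [x1, #16]   ; base x1, offset 16, width 8
// the bases are identical and LowOffset (8) + LowWidth (8) <= HighOffset (16),
// so the accesses are reported as trivially disjoint; with the store at
// [x1, #12] they would overlap and the function would return false.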
1474
1475bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1476 const MachineBasicBlock *MBB,
1477 const MachineFunction &MF) const {
1478 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1479 return true;
1480
1481 // Do not move an instruction that can be recognized as a branch target.
1482 if (hasBTISemantics(MI))
1483 return true;
1484
1485 switch (MI.getOpcode()) {
1486 case AArch64::HINT:
1487 // CSDB hints are scheduling barriers.
1488 if (MI.getOperand(0).getImm() == 0x14)
1489 return true;
1490 break;
1491 case AArch64::DSB:
1492 case AArch64::ISB:
1493 // DSB and ISB also are scheduling barriers.
1494 return true;
1495 case AArch64::MSRpstatesvcrImm1:
1496 // SMSTART and SMSTOP are also scheduling barriers.
1497 return true;
1498 default:;
1499 }
1500 if (isSEHInstruction(MI))
1501 return true;
1502 auto Next = std::next(MI.getIterator());
1503 return Next != MBB->end() && Next->isCFIInstruction();
1504}
1505
1506/// analyzeCompare - For a comparison instruction, return the source registers
1507/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1508/// Return true if the comparison instruction can be analyzed.
1509bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1510 Register &SrcReg2, int64_t &CmpMask,
1511 int64_t &CmpValue) const {
1512 // The first operand can be a frame index where we'd normally expect a
1513 // register.
1514 // FIXME: Pass subregisters out of analyzeCompare
1515 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1516 if (!MI.getOperand(1).isReg() || MI.getOperand(1).getSubReg())
1517 return false;
1518
1519 switch (MI.getOpcode()) {
1520 default:
1521 break;
1522 case AArch64::PTEST_PP:
1523 case AArch64::PTEST_PP_ANY:
1524 case AArch64::PTEST_PP_FIRST:
1525 SrcReg = MI.getOperand(0).getReg();
1526 SrcReg2 = MI.getOperand(1).getReg();
1527 if (MI.getOperand(2).getSubReg())
1528 return false;
1529
1530 // Not sure about the mask and value for now...
1531 CmpMask = ~0;
1532 CmpValue = 0;
1533 return true;
1534 case AArch64::SUBSWrr:
1535 case AArch64::SUBSWrs:
1536 case AArch64::SUBSWrx:
1537 case AArch64::SUBSXrr:
1538 case AArch64::SUBSXrs:
1539 case AArch64::SUBSXrx:
1540 case AArch64::ADDSWrr:
1541 case AArch64::ADDSWrs:
1542 case AArch64::ADDSWrx:
1543 case AArch64::ADDSXrr:
1544 case AArch64::ADDSXrs:
1545 case AArch64::ADDSXrx:
1546 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1547 SrcReg = MI.getOperand(1).getReg();
1548 SrcReg2 = MI.getOperand(2).getReg();
1549
1550 // FIXME: Pass subregisters out of analyzeCompare
1551 if (MI.getOperand(2).getSubReg())
1552 return false;
1553
1554 CmpMask = ~0;
1555 CmpValue = 0;
1556 return true;
1557 case AArch64::SUBSWri:
1558 case AArch64::ADDSWri:
1559 case AArch64::SUBSXri:
1560 case AArch64::ADDSXri:
1561 SrcReg = MI.getOperand(1).getReg();
1562 SrcReg2 = 0;
1563 CmpMask = ~0;
1564 CmpValue = MI.getOperand(2).getImm();
1565 return true;
1566 case AArch64::ANDSWri:
1567 case AArch64::ANDSXri:
1568 // ANDS does not use the same encoding scheme as the other xxxS
1569 // instructions.
1570 SrcReg = MI.getOperand(1).getReg();
1571 SrcReg2 = 0;
1572 CmpMask = ~0;
1573 CmpValue = AArch64_AM::decodeLogicalImmediate(
1574 MI.getOperand(2).getImm(),
1575 MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1576 return true;
1577 }
1578
1579 return false;
1580}
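// A sketch of what analyzeCompare reports: for "subs wzr, w1, #5" (SUBSWri)
// it returns SrcReg = w1, SrcReg2 = 0, CmpMask = ~0, CmpValue = 5; for
// "ands wzr, w1, #imm" (ANDSWri) CmpValue is the decoded logical immediate,
// not the raw encoded value.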
1581
1582static bool UpdateOperandRegClass(MachineInstr &Instr) {
1583 MachineBasicBlock *MBB = Instr.getParent();
1584 assert(MBB && "Can't get MachineBasicBlock here");
1585 MachineFunction *MF = MBB->getParent();
1586 assert(MF && "Can't get MachineFunction here");
1587 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1588 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1589 MachineRegisterInfo *MRI = &MF->getRegInfo();
1590
1591 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1592 ++OpIdx) {
1593 MachineOperand &MO = Instr.getOperand(OpIdx);
1594 const TargetRegisterClass *OpRegCstraints =
1595 Instr.getRegClassConstraint(OpIdx, TII, TRI);
1596
1597 // If there's no constraint, there's nothing to do.
1598 if (!OpRegCstraints)
1599 continue;
1600 // If the operand is a frame index, there's nothing to do here.
1601 // A frame index operand will resolve correctly during PEI.
1602 if (MO.isFI())
1603 continue;
1604
1605 assert(MO.isReg() &&
1606 "Operand has register constraints without being a register!");
1607
1608 Register Reg = MO.getReg();
1609 if (Reg.isPhysical()) {
1610 if (!OpRegCstraints->contains(Reg))
1611 return false;
1612 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1613 !MRI->constrainRegClass(Reg, OpRegCstraints))
1614 return false;
1615 }
1616
1617 return true;
1618}
1619
1620/// Return the opcode that does not set flags when possible - otherwise
1621/// return the original opcode. The caller is responsible to do the actual
1622/// substitution and legality checking.
1623static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1624 // Don't convert all compare instructions, because for some the zero register
1625 // encoding becomes the sp register.
1626 bool MIDefinesZeroReg = false;
1627 if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1628 MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
1629 MIDefinesZeroReg = true;
1630
1631 switch (MI.getOpcode()) {
1632 default:
1633 return MI.getOpcode();
1634 case AArch64::ADDSWrr:
1635 return AArch64::ADDWrr;
1636 case AArch64::ADDSWri:
1637 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1638 case AArch64::ADDSWrs:
1639 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1640 case AArch64::ADDSWrx:
1641 return AArch64::ADDWrx;
1642 case AArch64::ADDSXrr:
1643 return AArch64::ADDXrr;
1644 case AArch64::ADDSXri:
1645 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1646 case AArch64::ADDSXrs:
1647 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1648 case AArch64::ADDSXrx:
1649 return AArch64::ADDXrx;
1650 case AArch64::SUBSWrr:
1651 return AArch64::SUBWrr;
1652 case AArch64::SUBSWri:
1653 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1654 case AArch64::SUBSWrs:
1655 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1656 case AArch64::SUBSWrx:
1657 return AArch64::SUBWrx;
1658 case AArch64::SUBSXrr:
1659 return AArch64::SUBXrr;
1660 case AArch64::SUBSXri:
1661 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1662 case AArch64::SUBSXrs:
1663 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1664 case AArch64::SUBSXrx:
1665 return AArch64::SUBXrx;
1666 }
1667}
1668
1669enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1670
1671/// True when condition flags are accessed (either by writing or reading)
1672/// on the instruction trace starting at From and ending at To.
1673///
1674/// Note: If From and To are from different blocks it's assumed CC are accessed
1675/// on the path.
1676static bool areCFlagsAccessedBetweenInstrs(
1677 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1678 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1679 // Early exit if To is at the beginning of the BB.
1680 if (To == To->getParent()->begin())
1681 return true;
1682
1683 // Check whether the instructions are in the same basic block
1684 // If not, assume the condition flags might get modified somewhere.
1685 if (To->getParent() != From->getParent())
1686 return true;
1687
1688 // From must be above To.
1689 assert(std::any_of(
1690 ++To.getReverse(), To->getParent()->rend(),
1691 [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1692
1693 // We iterate backward starting at \p To until we hit \p From.
1694 for (const MachineInstr &Instr :
1695 instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
1696 if (((AccessToCheck & AK_Write) &&
1697 Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1698 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1699 return true;
1700 }
1701 return false;
1702}
1703
1704std::optional<unsigned>
1705AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
1706 MachineInstr *Pred,
1707 const MachineRegisterInfo *MRI) const {
1708 unsigned MaskOpcode = Mask->getOpcode();
1709 unsigned PredOpcode = Pred->getOpcode();
1710 bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1711 bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1712
1713 if (PredIsWhileLike) {
1714 // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
1715 // instruction and the condition is "any" since WHILEcc does an implicit
1716 // PTEST(ALL, PG) check and PG is always a subset of ALL.
1717 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1718 return PredOpcode;
1719
1720 // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1721 // redundant since WHILE performs an implicit PTEST with an all active
1722 // mask.
1723 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1724 getElementSizeForOpcode(MaskOpcode) ==
1725 getElementSizeForOpcode(PredOpcode))
1726 return PredOpcode;
1727
1728 // For PTEST_FIRST(PTRUE_ALL, WHILE), the PTEST_FIRST is redundant since
1729 // WHILEcc performs an implicit PTEST with an all active mask, setting
1730 // the N flag as the PTEST_FIRST would.
1731 if (PTest->getOpcode() == AArch64::PTEST_PP_FIRST &&
1732 isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31)
1733 return PredOpcode;
1734
1735 return {};
1736 }
1737
1738 if (PredIsPTestLike) {
1739 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1740 // instruction that sets the flags as PTEST would and the condition is
1741 // "any" since PG is always a subset of the governing predicate of the
1742 // ptest-like instruction.
1743 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1744 return PredOpcode;
1745
1746 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1747
1748 // If the PTEST like instruction's general predicate is not `Mask`, attempt
1749 // to look through a copy and try again. This is because some instructions
1750 // take a predicate whose register class is a subset of its result class.
1751 if (Mask != PTestLikeMask && PTestLikeMask->isFullCopy() &&
1752 PTestLikeMask->getOperand(1).getReg().isVirtual())
1753 PTestLikeMask =
1754 MRI->getUniqueVRegDef(PTestLikeMask->getOperand(1).getReg());
1755
1756 // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
1757 // element size matches and either the PTEST_LIKE instruction uses
1758 // the same all active mask or the condition is "any".
1759 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1760 getElementSizeForOpcode(MaskOpcode) ==
1761 getElementSizeForOpcode(PredOpcode)) {
1762 if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1763 return PredOpcode;
1764 }
1765
1766 // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1767 // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1768 // on 8-bit predicates like the PTEST. Otherwise, for instructions like
1769 // compare that also support 16/32/64-bit predicates, the implicit PTEST
1770 // performed by the compare could consider fewer lanes for these element
1771 // sizes.
1772 //
1773 // For example, consider
1774 //
1775 // ptrue p0.b ; P0=1111-1111-1111-1111
1776 // index z0.s, #0, #1 ; Z0=<0,1,2,3>
1777 // index z1.s, #1, #1 ; Z1=<1,2,3,4>
1778 // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001
1779 // ; ^ last active
1780 // ptest p0, p1.b ; P1=0001-0001-0001-0001
1781 // ; ^ last active
1782 //
1783 // where the compare generates a canonical all active 32-bit predicate
1784 // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1785 // active flag, whereas the PTEST instruction with the same mask doesn't.
1786 // For PTEST_ANY this doesn't apply as the flags in this case would be
1787 // identical regardless of element size.
1788 uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1789 if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
1790 PTest->getOpcode() == AArch64::PTEST_PP_ANY))
1791 return PredOpcode;
1792
1793 return {};
1794 }
1795
1796 // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1797 // opcode so the PTEST becomes redundant.
1798 switch (PredOpcode) {
1799 case AArch64::AND_PPzPP:
1800 case AArch64::BIC_PPzPP:
1801 case AArch64::EOR_PPzPP:
1802 case AArch64::NAND_PPzPP:
1803 case AArch64::NOR_PPzPP:
1804 case AArch64::ORN_PPzPP:
1805 case AArch64::ORR_PPzPP:
1806 case AArch64::BRKA_PPzP:
1807 case AArch64::BRKPA_PPzPP:
1808 case AArch64::BRKB_PPzP:
1809 case AArch64::BRKPB_PPzPP:
1810 case AArch64::RDFFR_PPz: {
1811 // Check to see if our mask is the same. If not, the resulting flag bits
1812 // may be different and we can't remove the ptest.
1813 auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1814 if (Mask != PredMask)
1815 return {};
1816 break;
1817 }
1818 case AArch64::BRKN_PPzP: {
1819 // BRKN uses an all active implicit mask to set flags unlike the other
1820 // flag-setting instructions.
1821 // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1822 if ((MaskOpcode != AArch64::PTRUE_B) ||
1823 (Mask->getOperand(1).getImm() != 31))
1824 return {};
1825 break;
1826 }
1827 case AArch64::PTRUE_B:
1828 // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1829 break;
1830 default:
1831 // Bail out if we don't recognize the input
1832 return {};
1833 }
1834
1835 return convertToFlagSettingOpc(PredOpcode);
1836}
1837
1838/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1839/// operation which could set the flags in an identical manner
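///
/// For example (an illustrative sketch; register choices and the label are
/// hypothetical, and the branch is assumed to only test the "any" condition):
/// \code
///   whilelo p0.s, x0, x1
///   ptest   p0, p0.b
///   b.ne    loop
/// \endcode
/// Here the PTEST can be removed because WHILELO already performs an implicit
/// PTEST against an all active mask and sets NZCV accordingly.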
1840bool AArch64InstrInfo::optimizePTestInstr(
1841 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1842 const MachineRegisterInfo *MRI) const {
1843 auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1844 auto *Pred = MRI->getUniqueVRegDef(PredReg);
1845
1846 if (Pred->isCopy() && PTest->getOpcode() == AArch64::PTEST_PP_FIRST) {
1847 // Instructions which return a multi-vector (e.g. WHILECC_x2) require copies
1848 // before the branch to extract each subregister.
1849 auto Op = Pred->getOperand(1);
1850 if (Op.isReg() && Op.getReg().isVirtual() &&
1851 Op.getSubReg() == AArch64::psub0)
1852 Pred = MRI->getUniqueVRegDef(Op.getReg());
1853 }
1854
1855 unsigned PredOpcode = Pred->getOpcode();
1856 auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1857 if (!NewOp)
1858 return false;
1859
1860 const TargetRegisterInfo *TRI = &getRegisterInfo();
1861
1862 // If another instruction between Pred and PTest accesses flags, don't remove
1863 // the ptest or update the earlier instruction to modify them.
1864 if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1865 return false;
1866
1867 // If we pass all the checks, it's safe to remove the PTEST and use the flags
1868 // as they are prior to PTEST. Sometimes this requires the tested PTEST
1869 // operand to be replaced with an equivalent instruction that also sets the
1870 // flags.
1871 PTest->eraseFromParent();
1872 if (*NewOp != PredOpcode) {
1873 Pred->setDesc(get(*NewOp));
1874 bool succeeded = UpdateOperandRegClass(*Pred);
1875 (void)succeeded;
1876 assert(succeeded && "Operands have incompatible register classes!");
1877 Pred->addRegisterDefined(AArch64::NZCV, TRI);
1878 }
1879
1880 // Ensure that the flags def is live.
1881 if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1882 unsigned i = 0, e = Pred->getNumOperands();
1883 for (; i != e; ++i) {
1884 MachineOperand &MO = Pred->getOperand(i);
1885 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1886 MO.setIsDead(false);
1887 break;
1888 }
1889 }
1890 }
1891 return true;
1892}
1893
1894 /// Try to optimize a compare instruction. A compare instruction is an
1895 /// instruction which produces AArch64::NZCV. It is truly a compare
1896 /// instruction only when there are no uses of its destination
1897 /// register.
1898///
1899/// The following steps are tried in order:
1900 /// 1. Convert CmpInstr into a non-flag-setting version when NZCV is unused.
1901 /// 2. Remove CmpInstr if an earlier instruction produces the needed
1902 /// condition code, or if that instruction can be converted into one
1903 /// that does.
1904/// Only comparison with zero is supported.
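///
/// For example (an illustrative sketch), if the NZCV result of
/// \code
///   subs w8, w0, w1
/// \endcode
/// is never read, step 1 rewrites it to the non-flag-setting 'sub w8, w0, w1'.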
1906 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1907 int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1908 assert(CmpInstr.getParent());
1909 assert(MRI);
1910
1911 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1912 int DeadNZCVIdx =
1913 CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
1914 if (DeadNZCVIdx != -1) {
1915 if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1916 CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
1917 CmpInstr.eraseFromParent();
1918 return true;
1919 }
1920 unsigned Opc = CmpInstr.getOpcode();
1921 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1922 if (NewOpc == Opc)
1923 return false;
1924 const MCInstrDesc &MCID = get(NewOpc);
1925 CmpInstr.setDesc(MCID);
1926 CmpInstr.removeOperand(DeadNZCVIdx);
1927 bool succeeded = UpdateOperandRegClass(CmpInstr);
1928 (void)succeeded;
1929 assert(succeeded && "Some operands reg class are incompatible!");
1930 return true;
1931 }
1932
1933 if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1934 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY ||
1935 CmpInstr.getOpcode() == AArch64::PTEST_PP_FIRST)
1936 return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
1937
1938 if (SrcReg2 != 0)
1939 return false;
1940
1941 // CmpInstr is a compare instruction if its destination register is not used.
1942 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1943 return false;
1944
1945 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
1946 return true;
1947 return (CmpValue == 0 || CmpValue == 1) &&
1948 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
1949}
1950
1951 /// Get the opcode of the S (flag-setting) version of Instr.
1952 /// If Instr is already the S version, its opcode is returned.
1953 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
1954 /// version or we are not interested in it.
1955static unsigned sForm(MachineInstr &Instr) {
1956 switch (Instr.getOpcode()) {
1957 default:
1958 return AArch64::INSTRUCTION_LIST_END;
1959
1960 case AArch64::ADDSWrr:
1961 case AArch64::ADDSWri:
1962 case AArch64::ADDSXrr:
1963 case AArch64::ADDSXri:
1964 case AArch64::ADDSWrx:
1965 case AArch64::ADDSXrx:
1966 case AArch64::SUBSWrr:
1967 case AArch64::SUBSWri:
1968 case AArch64::SUBSWrx:
1969 case AArch64::SUBSXrr:
1970 case AArch64::SUBSXri:
1971 case AArch64::SUBSXrx:
1972 case AArch64::ANDSWri:
1973 case AArch64::ANDSWrr:
1974 case AArch64::ANDSWrs:
1975 case AArch64::ANDSXri:
1976 case AArch64::ANDSXrr:
1977 case AArch64::ANDSXrs:
1978 case AArch64::BICSWrr:
1979 case AArch64::BICSXrr:
1980 case AArch64::BICSWrs:
1981 case AArch64::BICSXrs:
1982 return Instr.getOpcode();
1983
1984 case AArch64::ADDWrr:
1985 return AArch64::ADDSWrr;
1986 case AArch64::ADDWri:
1987 return AArch64::ADDSWri;
1988 case AArch64::ADDXrr:
1989 return AArch64::ADDSXrr;
1990 case AArch64::ADDXri:
1991 return AArch64::ADDSXri;
1992 case AArch64::ADDWrx:
1993 return AArch64::ADDSWrx;
1994 case AArch64::ADDXrx:
1995 return AArch64::ADDSXrx;
1996 case AArch64::ADCWr:
1997 return AArch64::ADCSWr;
1998 case AArch64::ADCXr:
1999 return AArch64::ADCSXr;
2000 case AArch64::SUBWrr:
2001 return AArch64::SUBSWrr;
2002 case AArch64::SUBWri:
2003 return AArch64::SUBSWri;
2004 case AArch64::SUBXrr:
2005 return AArch64::SUBSXrr;
2006 case AArch64::SUBXri:
2007 return AArch64::SUBSXri;
2008 case AArch64::SUBWrx:
2009 return AArch64::SUBSWrx;
2010 case AArch64::SUBXrx:
2011 return AArch64::SUBSXrx;
2012 case AArch64::SBCWr:
2013 return AArch64::SBCSWr;
2014 case AArch64::SBCXr:
2015 return AArch64::SBCSXr;
2016 case AArch64::ANDWri:
2017 return AArch64::ANDSWri;
2018 case AArch64::ANDXri:
2019 return AArch64::ANDSXri;
2020 case AArch64::ANDWrr:
2021 return AArch64::ANDSWrr;
2022 case AArch64::ANDWrs:
2023 return AArch64::ANDSWrs;
2024 case AArch64::ANDXrr:
2025 return AArch64::ANDSXrr;
2026 case AArch64::ANDXrs:
2027 return AArch64::ANDSXrs;
2028 case AArch64::BICWrr:
2029 return AArch64::BICSWrr;
2030 case AArch64::BICXrr:
2031 return AArch64::BICSXrr;
2032 case AArch64::BICWrs:
2033 return AArch64::BICSWrs;
2034 case AArch64::BICXrs:
2035 return AArch64::BICSXrs;
2036 }
2037}
2038
2039/// Check if AArch64::NZCV should be alive in successors of MBB.
2041 for (auto *BB : MBB->successors())
2042 if (BB->isLiveIn(AArch64::NZCV))
2043 return true;
2044 return false;
2045}
2046
2047/// \returns The condition code operand index for \p Instr if it is a branch
2048/// or select and -1 otherwise.
2049static int
2051 switch (Instr.getOpcode()) {
2052 default:
2053 return -1;
2054
2055 case AArch64::Bcc: {
2056 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2057 assert(Idx >= 2);
2058 return Idx - 2;
2059 }
2060
2061 case AArch64::CSINVWr:
2062 case AArch64::CSINVXr:
2063 case AArch64::CSINCWr:
2064 case AArch64::CSINCXr:
2065 case AArch64::CSELWr:
2066 case AArch64::CSELXr:
2067 case AArch64::CSNEGWr:
2068 case AArch64::CSNEGXr:
2069 case AArch64::FCSELSrrr:
2070 case AArch64::FCSELDrrr: {
2071 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
2072 assert(Idx >= 1);
2073 return Idx - 1;
2074 }
2075 }
2076}
2077
2078/// Find a condition code used by the instruction.
2079/// Returns AArch64CC::Invalid if either the instruction does not use condition
2080/// codes or we don't optimize CmpInstr in the presence of such instructions.
2083 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
2084 Instr.getOperand(CCIdx).getImm())
2086}
2087
2090 UsedNZCV UsedFlags;
2091 switch (CC) {
2092 default:
2093 break;
2094
2095 case AArch64CC::EQ: // Z set
2096 case AArch64CC::NE: // Z clear
2097 UsedFlags.Z = true;
2098 break;
2099
2100 case AArch64CC::HI: // Z clear and C set
2101 case AArch64CC::LS: // Z set or C clear
2102 UsedFlags.Z = true;
2103 [[fallthrough]];
2104 case AArch64CC::HS: // C set
2105 case AArch64CC::LO: // C clear
2106 UsedFlags.C = true;
2107 break;
2108
2109 case AArch64CC::MI: // N set
2110 case AArch64CC::PL: // N clear
2111 UsedFlags.N = true;
2112 break;
2113
2114 case AArch64CC::VS: // V set
2115 case AArch64CC::VC: // V clear
2116 UsedFlags.V = true;
2117 break;
2118
2119 case AArch64CC::GT: // Z clear, N and V the same
2120 case AArch64CC::LE: // Z set, N and V differ
2121 UsedFlags.Z = true;
2122 [[fallthrough]];
2123 case AArch64CC::GE: // N and V the same
2124 case AArch64CC::LT: // N and V differ
2125 UsedFlags.N = true;
2126 UsedFlags.V = true;
2127 break;
2128 }
2129 return UsedFlags;
2130}
2131
2132 /// \returns The condition flags used after \p CmpInstr in its MachineBB if the
2133 /// NZCV flags are not alive in successors of the block that contains both
2134 /// \p CmpInstr and \p MI; returns std::nullopt otherwise.
2135 ///
2136 /// Collects instructions using those flags in \p CCUseInstrs if provided.
2137std::optional<UsedNZCV>
2139 const TargetRegisterInfo &TRI,
2140 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
2141 MachineBasicBlock *CmpParent = CmpInstr.getParent();
2142 if (MI.getParent() != CmpParent)
2143 return std::nullopt;
2144
2145 if (areCFlagsAliveInSuccessors(CmpParent))
2146 return std::nullopt;
2147
2148 UsedNZCV NZCVUsedAfterCmp;
2150 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
2151 if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
2153 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
2154 return std::nullopt;
2155 NZCVUsedAfterCmp |= getUsedNZCV(CC);
2156 if (CCUseInstrs)
2157 CCUseInstrs->push_back(&Instr);
2158 }
2159 if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
2160 break;
2161 }
2162 return NZCVUsedAfterCmp;
2163}
2164
2165static bool isADDSRegImm(unsigned Opcode) {
2166 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
2167}
2168
2169static bool isSUBSRegImm(unsigned Opcode) {
2170 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
2171}
2172
2174 unsigned Opc = sForm(MI);
2175 switch (Opc) {
2176 case AArch64::ANDSWri:
2177 case AArch64::ANDSWrr:
2178 case AArch64::ANDSWrs:
2179 case AArch64::ANDSXri:
2180 case AArch64::ANDSXrr:
2181 case AArch64::ANDSXrs:
2182 case AArch64::BICSWrr:
2183 case AArch64::BICSXrr:
2184 case AArch64::BICSWrs:
2185 case AArch64::BICSXrs:
2186 return true;
2187 default:
2188 return false;
2189 }
2190}
2191
2192/// Check if CmpInstr can be substituted by MI.
2193///
2194/// CmpInstr can be substituted:
2195/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2196/// - and, MI and CmpInstr are from the same MachineBB
2197/// - and, condition flags are not alive in successors of the CmpInstr parent
2198/// - and, if MI opcode is the S form there must be no defs of flags between
2199/// MI and CmpInstr
2200/// or if MI opcode is not the S form there must be neither defs of flags
2201/// nor uses of flags between MI and CmpInstr.
2202 /// - and, the C flag is not used after CmpInstr, and either the V flag is not
2203 ///        used after CmpInstr, or MI produces a poison value on signed
2204 ///        overflow (nsw), or MI is an AND/BIC which always clears V.
2206 const TargetRegisterInfo &TRI) {
2207 // NOTE: this assertion guarantees that MI.getOpcode() is an add, subtract or
2208 // logical (AND/BIC) operation that may or may not set flags.
2209 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
2210
2211 const unsigned CmpOpcode = CmpInstr.getOpcode();
2212 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
2213 return false;
2214
2215 assert((CmpInstr.getOperand(2).isImm() &&
2216 CmpInstr.getOperand(2).getImm() == 0) &&
2217 "Caller guarantees that CmpInstr compares with constant 0");
2218
2219 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
2220 if (!NZVCUsed || NZVCUsed->C)
2221 return false;
2222
2223 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
2224 // '%vreg = add ...' or '%vreg = sub ...'.
2225 // Condition flag V is used to indicate signed overflow.
2226 // 1) MI and CmpInstr set N and V to the same value.
2227 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
2228 // signed overflow occurs, so CmpInstr could still be simplified away.
2229 // Note that Ands and Bics instructions always clear the V flag.
2230 if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI))
2231 return false;
2232
2233 AccessKind AccessToCheck = AK_Write;
2234 if (sForm(MI) != MI.getOpcode())
2235 AccessToCheck = AK_All;
2236 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
2237}
2238
2239/// Substitute an instruction comparing to zero with another instruction
2240/// which produces needed condition flags.
2241///
2242/// Return true on success.
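///
/// For example (an illustrative sketch with hypothetical registers and label):
/// \code
///   sub  w8, w0, w1
///   cmp  w8, #0
///   b.eq skip
/// \endcode
/// becomes
/// \code
///   subs w8, w0, w1
///   b.eq skip
/// \endcode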
2243bool AArch64InstrInfo::substituteCmpToZero(
2244 MachineInstr &CmpInstr, unsigned SrcReg,
2245 const MachineRegisterInfo &MRI) const {
2246 // Get the unique definition of SrcReg.
2247 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2248 if (!MI)
2249 return false;
2250
2251 const TargetRegisterInfo &TRI = getRegisterInfo();
2252
2253 unsigned NewOpc = sForm(*MI);
2254 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
2255 return false;
2256
2257 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
2258 return false;
2259
2260 // Update the instruction to set NZCV.
2261 MI->setDesc(get(NewOpc));
2262 CmpInstr.eraseFromParent();
2264 (void)succeeded;
2265 assert(succeeded && "Some operands reg class are incompatible!");
2266 MI->addRegisterDefined(AArch64::NZCV, &TRI);
2267 return true;
2268}
2269
2270/// \returns True if \p CmpInstr can be removed.
2271///
2272/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
2273/// codes used in \p CCUseInstrs must be inverted.
2275 int CmpValue, const TargetRegisterInfo &TRI,
2277 bool &IsInvertCC) {
2278 assert((CmpValue == 0 || CmpValue == 1) &&
2279 "Only comparisons to 0 or 1 considered for removal!");
2280
2281 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
2282 unsigned MIOpc = MI.getOpcode();
2283 if (MIOpc == AArch64::CSINCWr) {
2284 if (MI.getOperand(1).getReg() != AArch64::WZR ||
2285 MI.getOperand(2).getReg() != AArch64::WZR)
2286 return false;
2287 } else if (MIOpc == AArch64::CSINCXr) {
2288 if (MI.getOperand(1).getReg() != AArch64::XZR ||
2289 MI.getOperand(2).getReg() != AArch64::XZR)
2290 return false;
2291 } else {
2292 return false;
2293 }
2295 if (MICC == AArch64CC::Invalid)
2296 return false;
2297
2298 // NZCV needs to be defined
2299 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
2300 return false;
2301
2302 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
2303 const unsigned CmpOpcode = CmpInstr.getOpcode();
2304 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
2305 if (CmpValue && !IsSubsRegImm)
2306 return false;
2307 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
2308 return false;
2309
2310 // MI conditions allowed: eq, ne, mi, pl
2311 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
2312 if (MIUsedNZCV.C || MIUsedNZCV.V)
2313 return false;
2314
2315 std::optional<UsedNZCV> NZCVUsedAfterCmp =
2316 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
2317 // Condition flags are not used in CmpInstr basic block successors, and only
2318 // the Z or N flags are allowed to be used after CmpInstr within its basic block.
2319 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
2320 return false;
2321 // Z or N flag used after CmpInstr must correspond to the flag used in MI
2322 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
2323 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
2324 return false;
2325 // If CmpInstr is a comparison to zero, MI conditions are limited to eq, ne
2326 if (MIUsedNZCV.N && !CmpValue)
2327 return false;
2328
2329 // There must be no defs of flags between MI and CmpInstr
2330 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
2331 return false;
2332
2333 // Condition code is inverted in the following cases:
2334 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2335 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
2336 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
2337 (!CmpValue && MICC == AArch64CC::NE);
2338 return true;
2339}
2340
2341/// Remove comparison in csinc-cmp sequence
2342///
2343/// Examples:
2344/// 1. \code
2345/// csinc w9, wzr, wzr, ne
2346/// cmp w9, #0
2347/// b.eq
2348/// \endcode
2349/// to
2350/// \code
2351/// csinc w9, wzr, wzr, ne
2352/// b.ne
2353/// \endcode
2354///
2355/// 2. \code
2356/// csinc x2, xzr, xzr, mi
2357/// cmp x2, #1
2358/// b.pl
2359/// \endcode
2360/// to
2361/// \code
2362/// csinc x2, xzr, xzr, mi
2363/// b.pl
2364/// \endcode
2365///
2366/// \param CmpInstr comparison instruction
2367/// \return True when comparison removed
2368bool AArch64InstrInfo::removeCmpToZeroOrOne(
2369 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
2370 const MachineRegisterInfo &MRI) const {
2371 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2372 if (!MI)
2373 return false;
2374 const TargetRegisterInfo &TRI = getRegisterInfo();
2375 SmallVector<MachineInstr *, 4> CCUseInstrs;
2376 bool IsInvertCC = false;
2377 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
2378 IsInvertCC))
2379 return false;
2380 // Make transformation
2381 CmpInstr.eraseFromParent();
2382 if (IsInvertCC) {
2383 // Invert condition codes in CmpInstr CC users
2384 for (MachineInstr *CCUseInstr : CCUseInstrs) {
2385 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
2386 assert(Idx >= 0 && "Unexpected instruction using CC.");
2387 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
2389 static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
2390 CCOperand.setImm(CCUse);
2391 }
2392 }
2393 return true;
2394}
2395
2396bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2397 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
2398 MI.getOpcode() != AArch64::CATCHRET)
2399 return false;
2400
2401 MachineBasicBlock &MBB = *MI.getParent();
2402 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
2403 auto TRI = Subtarget.getRegisterInfo();
2404 DebugLoc DL = MI.getDebugLoc();
2405
2406 if (MI.getOpcode() == AArch64::CATCHRET) {
2407 // Skip to the first instruction before the epilog.
2408 const TargetInstrInfo *TII =
2410 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
2412 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
2413 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
2414 FirstEpilogSEH != MBB.begin())
2415 FirstEpilogSEH = std::prev(FirstEpilogSEH);
2416 if (FirstEpilogSEH != MBB.begin())
2417 FirstEpilogSEH = std::next(FirstEpilogSEH);
2418 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
2419 .addReg(AArch64::X0, RegState::Define)
2420 .addMBB(TargetMBB);
2421 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
2422 .addReg(AArch64::X0, RegState::Define)
2423 .addReg(AArch64::X0)
2424 .addMBB(TargetMBB)
2425 .addImm(0);
2426 TargetMBB->setMachineBlockAddressTaken();
2427 return true;
2428 }
2429
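// Illustrative expansion of LOAD_STACK_GUARD with a "sysreg" guard (a sketch
// assuming the destination is x0, the guard register is SP_EL0 and the offset
// is 16; the actual values come from the module's stack-protector settings):
//   mrs x0, SP_EL0
//   ldr x0, [x0, #16]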
2430 Register Reg = MI.getOperand(0).getReg();
2432 if (M.getStackProtectorGuard() == "sysreg") {
2433 const AArch64SysReg::SysReg *SrcReg =
2434 AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
2435 if (!SrcReg)
2436 report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
2437
2438 // mrs xN, sysreg
2439 BuildMI(MBB, MI, DL, get(AArch64::MRS))
2441 .addImm(SrcReg->Encoding);
2442 int Offset = M.getStackProtectorGuardOffset();
2443 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2444 // ldr xN, [xN, #offset]
2445 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2446 .addDef(Reg)
2448 .addImm(Offset / 8);
2449 } else if (Offset >= -256 && Offset <= 255) {
2450 // ldur xN, [xN, #offset]
2451 BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2452 .addDef(Reg)
2454 .addImm(Offset);
2455 } else if (Offset >= -4095 && Offset <= 4095) {
2456 if (Offset > 0) {
2457 // add xN, xN, #offset
2458 BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2459 .addDef(Reg)
2461 .addImm(Offset)
2462 .addImm(0);
2463 } else {
2464 // sub xN, xN, #offset
2465 BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2466 .addDef(Reg)
2468 .addImm(-Offset)
2469 .addImm(0);
2470 }
2471 // ldr xN, [xN]
2472 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2473 .addDef(Reg)
2475 .addImm(0);
2476 } else {
2477 // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2478 // than 32760.
2479 // It might be nice to use AArch64::MOVi32imm here, which would get
2480 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2481 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2482 // AArch64FrameLowering might help us find such a scratch register
2483 // though. If we failed to find a scratch register, we could emit a
2484 // stream of add instructions to build up the immediate. Or, we could try
2485 // to insert a AArch64::MOVi32imm before register allocation so that we
2486 // didn't need to scavenge for a scratch register.
2487 report_fatal_error("Unable to encode Stack Protector Guard Offset");
2488 }
2489 MBB.erase(MI);
2490 return true;
2491 }
2492
2493 const GlobalValue *GV =
2494 cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2495 const TargetMachine &TM = MBB.getParent()->getTarget();
2496 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2497 const unsigned char MO_NC = AArch64II::MO_NC;
2498
2499 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2500 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2501 .addGlobalAddress(GV, 0, OpFlags);
2502 if (Subtarget.isTargetILP32()) {
2503 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2504 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2505 .addDef(Reg32, RegState::Dead)
2507 .addImm(0)
2508 .addMemOperand(*MI.memoperands_begin())
2510 } else {
2511 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2513 .addImm(0)
2514 .addMemOperand(*MI.memoperands_begin());
2515 }
2516 } else if (TM.getCodeModel() == CodeModel::Large) {
2517 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2518 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2519 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2520 .addImm(0);
2521 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2523 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2524 .addImm(16);
2525 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2527 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2528 .addImm(32);
2529 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2532 .addImm(48);
2533 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2535 .addImm(0)
2536 .addMemOperand(*MI.memoperands_begin());
2537 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2538 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2539 .addGlobalAddress(GV, 0, OpFlags);
2540 } else {
2541 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2542 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2543 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2544 if (Subtarget.isTargetILP32()) {
2545 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2546 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2547 .addDef(Reg32, RegState::Dead)
2549 .addGlobalAddress(GV, 0, LoFlags)
2550 .addMemOperand(*MI.memoperands_begin())
2552 } else {
2553 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2555 .addGlobalAddress(GV, 0, LoFlags)
2556 .addMemOperand(*MI.memoperands_begin());
2557 }
2558 }
2559
2560 MBB.erase(MI);
2561
2562 return true;
2563}
2564
2565// Return true if this instruction simply sets its single destination register
2566// to zero. This is equivalent to a register rename of the zero-register.
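// For example (illustrative): 'movz w0, #0', 'and w0, wzr, #1', or a COPY
// from WZR all qualify.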
2568 switch (MI.getOpcode()) {
2569 default:
2570 break;
2571 case AArch64::MOVZWi:
2572 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2573 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2574 assert(MI.getDesc().getNumOperands() == 3 &&
2575 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2576 return true;
2577 }
2578 break;
2579 case AArch64::ANDWri: // and Rd, Rzr, #imm
2580 return MI.getOperand(1).getReg() == AArch64::WZR;
2581 case AArch64::ANDXri:
2582 return MI.getOperand(1).getReg() == AArch64::XZR;
2583 case TargetOpcode::COPY:
2584 return MI.getOperand(1).getReg() == AArch64::WZR;
2585 }
2586 return false;
2587}
2588
2589// Return true if this instruction simply renames a general register without
2590// modifying bits.
2592 switch (MI.getOpcode()) {
2593 default:
2594 break;
2595 case TargetOpcode::COPY: {
2596 // GPR copies will be lowered to ORRWrs/ORRXrs
2597 Register DstReg = MI.getOperand(0).getReg();
2598 return (AArch64::GPR32RegClass.contains(DstReg) ||
2599 AArch64::GPR64RegClass.contains(DstReg));
2600 }
2601 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2602 if (MI.getOperand(1).getReg() == AArch64::XZR) {
2603 assert(MI.getDesc().getNumOperands() == 4 &&
2604 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2605 return true;
2606 }
2607 break;
2608 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2609 if (MI.getOperand(2).getImm() == 0) {
2610 assert(MI.getDesc().getNumOperands() == 4 &&
2611 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2612 return true;
2613 }
2614 break;
2615 }
2616 return false;
2617}
2618
2619 // Return true if this instruction simply renames an FPR128 (vector) register
2620 // without modifying bits.
2622 switch (MI.getOpcode()) {
2623 default:
2624 break;
2625 case TargetOpcode::COPY: {
2626 Register DstReg = MI.getOperand(0).getReg();
2627 return AArch64::FPR128RegClass.contains(DstReg);
2628 }
2629 case AArch64::ORRv16i8:
2630 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2631 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2632 "invalid ORRv16i8 operands");
2633 return true;
2634 }
2635 break;
2636 }
2637 return false;
2638}
2639
2640static bool isFrameLoadOpcode(int Opcode) {
2641 switch (Opcode) {
2642 default:
2643 return false;
2644 case AArch64::LDRWui:
2645 case AArch64::LDRXui:
2646 case AArch64::LDRBui:
2647 case AArch64::LDRHui:
2648 case AArch64::LDRSui:
2649 case AArch64::LDRDui:
2650 case AArch64::LDRQui:
2651 case AArch64::LDR_PXI:
2652 return true;
2653 }
2654}
2655
2657 int &FrameIndex) const {
2658 if (!isFrameLoadOpcode(MI.getOpcode()))
2659 return Register();
2660
2661 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2662 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2663 FrameIndex = MI.getOperand(1).getIndex();
2664 return MI.getOperand(0).getReg();
2665 }
2666 return Register();
2667}
2668
2669static bool isFrameStoreOpcode(int Opcode) {
2670 switch (Opcode) {
2671 default:
2672 return false;
2673 case AArch64::STRWui:
2674 case AArch64::STRXui:
2675 case AArch64::STRBui:
2676 case AArch64::STRHui:
2677 case AArch64::STRSui:
2678 case AArch64::STRDui:
2679 case AArch64::STRQui:
2680 case AArch64::STR_PXI:
2681 return true;
2682 }
2683}
2684
2686 int &FrameIndex) const {
2687 if (!isFrameStoreOpcode(MI.getOpcode()))
2688 return Register();
2689
2690 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2691 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2692 FrameIndex = MI.getOperand(1).getIndex();
2693 return MI.getOperand(0).getReg();
2694 }
2695 return Register();
2696}
2697
2699 int &FrameIndex) const {
2700 if (!isFrameStoreOpcode(MI.getOpcode()))
2701 return Register();
2702
2703 if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
2704 return Reg;
2705
2707 if (hasStoreToStackSlot(MI, Accesses)) {
2708 if (Accesses.size() > 1)
2709 return Register();
2710
2711 FrameIndex =
2712 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2713 ->getFrameIndex();
2714 return MI.getOperand(0).getReg();
2715 }
2716 return Register();
2717}
2718
2720 int &FrameIndex) const {
2721 if (!isFrameLoadOpcode(MI.getOpcode()))
2722 return Register();
2723
2724 if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
2725 return Reg;
2726
2728 if (hasLoadFromStackSlot(MI, Accesses)) {
2729 if (Accesses.size() > 1)
2730 return Register();
2731
2732 FrameIndex =
2733 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
2734 ->getFrameIndex();
2735 return MI.getOperand(0).getReg();
2736 }
2737 return Register();
2738}
2739
2740/// Check all MachineMemOperands for a hint to suppress pairing.
2742 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2743 return MMO->getFlags() & MOSuppressPair;
2744 });
2745}
2746
2747/// Set a flag on the first MachineMemOperand to suppress pairing.
2749 if (MI.memoperands_empty())
2750 return;
2751 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2752}
2753
2754/// Check all MachineMemOperands for a hint that the load/store is strided.
2756 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2757 return MMO->getFlags() & MOStridedAccess;
2758 });
2759}
2760
2762 switch (Opc) {
2763 default:
2764 return false;
2765 case AArch64::STURSi:
2766 case AArch64::STRSpre:
2767 case AArch64::STURDi:
2768 case AArch64::STRDpre:
2769 case AArch64::STURQi:
2770 case AArch64::STRQpre:
2771 case AArch64::STURBBi:
2772 case AArch64::STURHHi:
2773 case AArch64::STURWi:
2774 case AArch64::STRWpre:
2775 case AArch64::STURXi:
2776 case AArch64::STRXpre:
2777 case AArch64::LDURSi:
2778 case AArch64::LDRSpre:
2779 case AArch64::LDURDi:
2780 case AArch64::LDRDpre:
2781 case AArch64::LDURQi:
2782 case AArch64::LDRQpre:
2783 case AArch64::LDURWi:
2784 case AArch64::LDRWpre:
2785 case AArch64::LDURXi:
2786 case AArch64::LDRXpre:
2787 case AArch64::LDRSWpre:
2788 case AArch64::LDURSWi:
2789 case AArch64::LDURHHi:
2790 case AArch64::LDURBBi:
2791 case AArch64::LDURSBWi:
2792 case AArch64::LDURSHWi:
2793 return true;
2794 }
2795}
2796
2797std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2798 switch (Opc) {
2799 default: return {};
2800 case AArch64::PRFMui: return AArch64::PRFUMi;
2801 case AArch64::LDRXui: return AArch64::LDURXi;
2802 case AArch64::LDRWui: return AArch64::LDURWi;
2803 case AArch64::LDRBui: return AArch64::LDURBi;
2804 case AArch64::LDRHui: return AArch64::LDURHi;
2805 case AArch64::LDRSui: return AArch64::LDURSi;
2806 case AArch64::LDRDui: return AArch64::LDURDi;
2807 case AArch64::LDRQui: return AArch64::LDURQi;
2808 case AArch64::LDRBBui: return AArch64::LDURBBi;
2809 case AArch64::LDRHHui: return AArch64::LDURHHi;
2810 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2811 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2812 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2813 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2814 case AArch64::LDRSWui: return AArch64::LDURSWi;
2815 case AArch64::STRXui: return AArch64::STURXi;
2816 case AArch64::STRWui: return AArch64::STURWi;
2817 case AArch64::STRBui: return AArch64::STURBi;
2818 case AArch64::STRHui: return AArch64::STURHi;
2819 case AArch64::STRSui: return AArch64::STURSi;
2820 case AArch64::STRDui: return AArch64::STURDi;
2821 case AArch64::STRQui: return AArch64::STURQi;
2822 case AArch64::STRBBui: return AArch64::STURBBi;
2823 case AArch64::STRHHui: return AArch64::STURHHi;
2824 }
2825}
2826
2828 switch (Opc) {
2829 default:
2830 llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx");
2831 case AArch64::ADDG:
2832 case AArch64::LDAPURBi:
2833 case AArch64::LDAPURHi:
2834 case AArch64::LDAPURi:
2835 case AArch64::LDAPURSBWi:
2836 case AArch64::LDAPURSBXi:
2837 case AArch64::LDAPURSHWi:
2838 case AArch64::LDAPURSHXi:
2839 case AArch64::LDAPURSWi:
2840 case AArch64::LDAPURXi:
2841 case AArch64::LDR_PPXI:
2842 case AArch64::LDR_PXI:
2843 case AArch64::LDR_ZXI:
2844 case AArch64::LDR_ZZXI:
2845 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
2846 case AArch64::LDR_ZZZXI:
2847 case AArch64::LDR_ZZZZXI:
2848 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
2849 case AArch64::LDRBBui:
2850 case AArch64::LDRBui:
2851 case AArch64::LDRDui:
2852 case AArch64::LDRHHui:
2853 case AArch64::LDRHui:
2854 case AArch64::LDRQui:
2855 case AArch64::LDRSBWui:
2856 case AArch64::LDRSBXui:
2857 case AArch64::LDRSHWui:
2858 case AArch64::LDRSHXui:
2859 case AArch64::LDRSui:
2860 case AArch64::LDRSWui:
2861 case AArch64::LDRWui:
2862 case AArch64::LDRXui:
2863 case AArch64::LDURBBi:
2864 case AArch64::LDURBi:
2865 case AArch64::LDURDi:
2866 case AArch64::LDURHHi:
2867 case AArch64::LDURHi:
2868 case AArch64::LDURQi:
2869 case AArch64::LDURSBWi:
2870 case AArch64::LDURSBXi:
2871 case AArch64::LDURSHWi:
2872 case AArch64::LDURSHXi:
2873 case AArch64::LDURSi:
2874 case AArch64::LDURSWi:
2875 case AArch64::LDURWi:
2876 case AArch64::LDURXi:
2877 case AArch64::PRFMui:
2878 case AArch64::PRFUMi:
2879 case AArch64::ST2Gi:
2880 case AArch64::STGi:
2881 case AArch64::STLURBi:
2882 case AArch64::STLURHi:
2883 case AArch64::STLURWi:
2884 case AArch64::STLURXi:
2885 case AArch64::StoreSwiftAsyncContext:
2886 case AArch64::STR_PPXI:
2887 case AArch64::STR_PXI:
2888 case AArch64::STR_ZXI:
2889 case AArch64::STR_ZZXI:
2890 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
2891 case AArch64::STR_ZZZXI:
2892 case AArch64::STR_ZZZZXI:
2893 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
2894 case AArch64::STRBBui:
2895 case AArch64::STRBui:
2896 case AArch64::STRDui:
2897 case AArch64::STRHHui:
2898 case AArch64::STRHui:
2899 case AArch64::STRQui:
2900 case AArch64::STRSui:
2901 case AArch64::STRWui:
2902 case AArch64::STRXui:
2903 case AArch64::STURBBi:
2904 case AArch64::STURBi:
2905 case AArch64::STURDi:
2906 case AArch64::STURHHi:
2907 case AArch64::STURHi:
2908 case AArch64::STURQi:
2909 case AArch64::STURSi:
2910 case AArch64::STURWi:
2911 case AArch64::STURXi:
2912 case AArch64::STZ2Gi:
2913 case AArch64::STZGi:
2914 case AArch64::TAGPstack:
2915 return 2;
2916 case AArch64::LD1B_D_IMM:
2917 case AArch64::LD1B_H_IMM:
2918 case AArch64::LD1B_IMM:
2919 case AArch64::LD1B_S_IMM:
2920 case AArch64::LD1D_IMM:
2921 case AArch64::LD1H_D_IMM:
2922 case AArch64::LD1H_IMM:
2923 case AArch64::LD1H_S_IMM:
2924 case AArch64::LD1RB_D_IMM:
2925 case AArch64::LD1RB_H_IMM:
2926 case AArch64::LD1RB_IMM:
2927 case AArch64::LD1RB_S_IMM:
2928 case AArch64::LD1RD_IMM:
2929 case AArch64::LD1RH_D_IMM:
2930 case AArch64::LD1RH_IMM:
2931 case AArch64::LD1RH_S_IMM:
2932 case AArch64::LD1RSB_D_IMM:
2933 case AArch64::LD1RSB_H_IMM:
2934 case AArch64::LD1RSB_S_IMM:
2935 case AArch64::LD1RSH_D_IMM:
2936 case AArch64::LD1RSH_S_IMM:
2937 case AArch64::LD1RSW_IMM:
2938 case AArch64::LD1RW_D_IMM:
2939 case AArch64::LD1RW_IMM:
2940 case AArch64::LD1SB_D_IMM:
2941 case AArch64::LD1SB_H_IMM:
2942 case AArch64::LD1SB_S_IMM:
2943 case AArch64::LD1SH_D_IMM:
2944 case AArch64::LD1SH_S_IMM:
2945 case AArch64::LD1SW_D_IMM:
2946 case AArch64::LD1W_D_IMM:
2947 case AArch64::LD1W_IMM:
2948 case AArch64::LD2B_IMM:
2949 case AArch64::LD2D_IMM:
2950 case AArch64::LD2H_IMM:
2951 case AArch64::LD2W_IMM:
2952 case AArch64::LD3B_IMM:
2953 case AArch64::LD3D_IMM:
2954 case AArch64::LD3H_IMM:
2955 case AArch64::LD3W_IMM:
2956 case AArch64::LD4B_IMM:
2957 case AArch64::LD4D_IMM:
2958 case AArch64::LD4H_IMM:
2959 case AArch64::LD4W_IMM:
2960 case AArch64::LDG:
2961 case AArch64::LDNF1B_D_IMM:
2962 case AArch64::LDNF1B_H_IMM:
2963 case AArch64::LDNF1B_IMM:
2964 case AArch64::LDNF1B_S_IMM:
2965 case AArch64::LDNF1D_IMM:
2966 case AArch64::LDNF1H_D_IMM:
2967 case AArch64::LDNF1H_IMM:
2968 case AArch64::LDNF1H_S_IMM:
2969 case AArch64::LDNF1SB_D_IMM:
2970 case AArch64::LDNF1SB_H_IMM:
2971 case AArch64::LDNF1SB_S_IMM:
2972 case AArch64::LDNF1SH_D_IMM:
2973 case AArch64::LDNF1SH_S_IMM:
2974 case AArch64::LDNF1SW_D_IMM:
2975 case AArch64::LDNF1W_D_IMM:
2976 case AArch64::LDNF1W_IMM:
2977 case AArch64::LDNPDi:
2978 case AArch64::LDNPQi:
2979 case AArch64::LDNPSi:
2980 case AArch64::LDNPWi:
2981 case AArch64::LDNPXi:
2982 case AArch64::LDNT1B_ZRI:
2983 case AArch64::LDNT1D_ZRI:
2984 case AArch64::LDNT1H_ZRI:
2985 case AArch64::LDNT1W_ZRI:
2986 case AArch64::LDPDi:
2987 case AArch64::LDPQi:
2988 case AArch64::LDPSi:
2989 case AArch64::LDPWi:
2990 case AArch64::LDPXi:
2991 case AArch64::LDRBBpost:
2992 case AArch64::LDRBBpre:
2993 case AArch64::LDRBpost:
2994 case AArch64::LDRBpre:
2995 case AArch64::LDRDpost:
2996 case AArch64::LDRDpre:
2997 case AArch64::LDRHHpost:
2998 case AArch64::LDRHHpre:
2999 case AArch64::LDRHpost:
3000 case AArch64::LDRHpre:
3001 case AArch64::LDRQpost:
3002 case AArch64::LDRQpre:
3003 case AArch64::LDRSpost:
3004 case AArch64::LDRSpre:
3005 case AArch64::LDRWpost:
3006 case AArch64::LDRWpre:
3007 case AArch64::LDRXpost:
3008 case AArch64::LDRXpre:
3009 case AArch64::ST1B_D_IMM:
3010 case AArch64::ST1B_H_IMM:
3011 case AArch64::ST1B_IMM:
3012 case AArch64::ST1B_S_IMM:
3013 case AArch64::ST1D_IMM:
3014 case AArch64::ST1H_D_IMM:
3015 case AArch64::ST1H_IMM:
3016 case AArch64::ST1H_S_IMM:
3017 case AArch64::ST1W_D_IMM:
3018 case AArch64::ST1W_IMM:
3019 case AArch64::ST2B_IMM:
3020 case AArch64::ST2D_IMM:
3021 case AArch64::ST2H_IMM:
3022 case AArch64::ST2W_IMM:
3023 case AArch64::ST3B_IMM:
3024 case AArch64::ST3D_IMM:
3025 case AArch64::ST3H_IMM:
3026 case AArch64::ST3W_IMM:
3027 case AArch64::ST4B_IMM:
3028 case AArch64::ST4D_IMM:
3029 case AArch64::ST4H_IMM:
3030 case AArch64::ST4W_IMM:
3031 case AArch64::STGPi:
3032 case AArch64::STGPreIndex:
3033 case AArch64::STZGPreIndex:
3034 case AArch64::ST2GPreIndex:
3035 case AArch64::STZ2GPreIndex:
3036 case AArch64::STGPostIndex:
3037 case AArch64::STZGPostIndex:
3038 case AArch64::ST2GPostIndex:
3039 case AArch64::STZ2GPostIndex:
3040 case AArch64::STNPDi:
3041 case AArch64::STNPQi:
3042 case AArch64::STNPSi:
3043 case AArch64::STNPWi:
3044 case AArch64::STNPXi:
3045 case AArch64::STNT1B_ZRI:
3046 case AArch64::STNT1D_ZRI:
3047 case AArch64::STNT1H_ZRI:
3048 case AArch64::STNT1W_ZRI:
3049 case AArch64::STPDi:
3050 case AArch64::STPQi:
3051 case AArch64::STPSi:
3052 case AArch64::STPWi:
3053 case AArch64::STPXi:
3054 case AArch64::STRBBpost:
3055 case AArch64::STRBBpre:
3056 case AArch64::STRBpost:
3057 case AArch64::STRBpre:
3058 case AArch64::STRDpost:
3059 case AArch64::STRDpre:
3060 case AArch64::STRHHpost:
3061 case AArch64::STRHHpre:
3062 case AArch64::STRHpost:
3063 case AArch64::STRHpre:
3064 case AArch64::STRQpost:
3065 case AArch64::STRQpre:
3066 case AArch64::STRSpost:
3067 case AArch64::STRSpre:
3068 case AArch64::STRWpost:
3069 case AArch64::STRWpre:
3070 case AArch64::STRXpost:
3071 case AArch64::STRXpre:
3072 return 3;
3073 case AArch64::LDPDpost:
3074 case AArch64::LDPDpre:
3075 case AArch64::LDPQpost:
3076 case AArch64::LDPQpre:
3077 case AArch64::LDPSpost:
3078 case AArch64::LDPSpre:
3079 case AArch64::LDPWpost:
3080 case AArch64::LDPWpre:
3081 case AArch64::LDPXpost:
3082 case AArch64::LDPXpre:
3083 case AArch64::STGPpre:
3084 case AArch64::STGPpost:
3085 case AArch64::STPDpost:
3086 case AArch64::STPDpre:
3087 case AArch64::STPQpost:
3088 case AArch64::STPQpre:
3089 case AArch64::STPSpost:
3090 case AArch64::STPSpre:
3091 case AArch64::STPWpost:
3092 case AArch64::STPWpre:
3093 case AArch64::STPXpost:
3094 case AArch64::STPXpre:
3095 return 4;
3096 }
3097}
3098
3100 switch (MI.getOpcode()) {
3101 default:
3102 return false;
3103 // Scaled instructions.
3104 case AArch64::STRSui:
3105 case AArch64::STRDui:
3106 case AArch64::STRQui:
3107 case AArch64::STRXui:
3108 case AArch64::STRWui:
3109 case AArch64::LDRSui:
3110 case AArch64::LDRDui:
3111 case AArch64::LDRQui:
3112 case AArch64::LDRXui:
3113 case AArch64::LDRWui:
3114 case AArch64::LDRSWui:
3115 // Unscaled instructions.
3116 case AArch64::STURSi:
3117 case AArch64::STRSpre:
3118 case AArch64::STURDi:
3119 case AArch64::STRDpre:
3120 case AArch64::STURQi:
3121 case AArch64::STRQpre:
3122 case AArch64::STURWi:
3123 case AArch64::STRWpre:
3124 case AArch64::STURXi:
3125 case AArch64::STRXpre:
3126 case AArch64::LDURSi:
3127 case AArch64::LDRSpre:
3128 case AArch64::LDURDi:
3129 case AArch64::LDRDpre:
3130 case AArch64::LDURQi:
3131 case AArch64::LDRQpre:
3132 case AArch64::LDURWi:
3133 case AArch64::LDRWpre:
3134 case AArch64::LDURXi:
3135 case AArch64::LDRXpre:
3136 case AArch64::LDURSWi:
3137 case AArch64::LDRSWpre:
3138 // SVE instructions.
3139 case AArch64::LDR_ZXI:
3140 case AArch64::STR_ZXI:
3141 return true;
3142 }
3143}
3144
3146 switch (MI.getOpcode()) {
3147 default:
3148 assert((!MI.isCall() || !MI.isReturn()) &&
3149 "Unexpected instruction - was a new tail call opcode introduced?");
3150 return false;
3151 case AArch64::TCRETURNdi:
3152 case AArch64::TCRETURNri:
3153 case AArch64::TCRETURNrix16x17:
3154 case AArch64::TCRETURNrix17:
3155 case AArch64::TCRETURNrinotx16:
3156 case AArch64::TCRETURNriALL:
3157 case AArch64::AUTH_TCRETURN:
3158 case AArch64::AUTH_TCRETURN_BTI:
3159 return true;
3160 }
3161}
3162
3164 switch (Opc) {
3165 default:
3166 llvm_unreachable("Opcode has no flag setting equivalent!");
3167 // 32-bit cases:
3168 case AArch64::ADDWri:
3169 return AArch64::ADDSWri;
3170 case AArch64::ADDWrr:
3171 return AArch64::ADDSWrr;
3172 case AArch64::ADDWrs:
3173 return AArch64::ADDSWrs;
3174 case AArch64::ADDWrx:
3175 return AArch64::ADDSWrx;
3176 case AArch64::ANDWri:
3177 return AArch64::ANDSWri;
3178 case AArch64::ANDWrr:
3179 return AArch64::ANDSWrr;
3180 case AArch64::ANDWrs:
3181 return AArch64::ANDSWrs;
3182 case AArch64::BICWrr:
3183 return AArch64::BICSWrr;
3184 case AArch64::BICWrs:
3185 return AArch64::BICSWrs;
3186 case AArch64::SUBWri:
3187 return AArch64::SUBSWri;
3188 case AArch64::SUBWrr:
3189 return AArch64::SUBSWrr;
3190 case AArch64::SUBWrs:
3191 return AArch64::SUBSWrs;
3192 case AArch64::SUBWrx:
3193 return AArch64::SUBSWrx;
3194 // 64-bit cases:
3195 case AArch64::ADDXri:
3196 return AArch64::ADDSXri;
3197 case AArch64::ADDXrr:
3198 return AArch64::ADDSXrr;
3199 case AArch64::ADDXrs:
3200 return AArch64::ADDSXrs;
3201 case AArch64::ADDXrx:
3202 return AArch64::ADDSXrx;
3203 case AArch64::ANDXri:
3204 return AArch64::ANDSXri;
3205 case AArch64::ANDXrr:
3206 return AArch64::ANDSXrr;
3207 case AArch64::ANDXrs:
3208 return AArch64::ANDSXrs;
3209 case AArch64::BICXrr:
3210 return AArch64::BICSXrr;
3211 case AArch64::BICXrs:
3212 return AArch64::BICSXrs;
3213 case AArch64::SUBXri:
3214 return AArch64::SUBSXri;
3215 case AArch64::SUBXrr:
3216 return AArch64::SUBSXrr;
3217 case AArch64::SUBXrs:
3218 return AArch64::SUBSXrs;
3219 case AArch64::SUBXrx:
3220 return AArch64::SUBSXrx;
3221 // SVE instructions:
3222 case AArch64::AND_PPzPP:
3223 return AArch64::ANDS_PPzPP;
3224 case AArch64::BIC_PPzPP:
3225 return AArch64::BICS_PPzPP;
3226 case AArch64::EOR_PPzPP:
3227 return AArch64::EORS_PPzPP;
3228 case AArch64::NAND_PPzPP:
3229 return AArch64::NANDS_PPzPP;
3230 case AArch64::NOR_PPzPP:
3231 return AArch64::NORS_PPzPP;
3232 case AArch64::ORN_PPzPP:
3233 return AArch64::ORNS_PPzPP;
3234 case AArch64::ORR_PPzPP:
3235 return AArch64::ORRS_PPzPP;
3236 case AArch64::BRKA_PPzP:
3237 return AArch64::BRKAS_PPzP;
3238 case AArch64::BRKPA_PPzPP:
3239 return AArch64::BRKPAS_PPzPP;
3240 case AArch64::BRKB_PPzP:
3241 return AArch64::BRKBS_PPzP;
3242 case AArch64::BRKPB_PPzPP:
3243 return AArch64::BRKPBS_PPzPP;
3244 case AArch64::BRKN_PPzP:
3245 return AArch64::BRKNS_PPzP;
3246 case AArch64::RDFFR_PPz:
3247 return AArch64::RDFFRS_PPz;
3248 case AArch64::PTRUE_B:
3249 return AArch64::PTRUES_B;
3250 }
3251}
3252
3253// Is this a candidate for ld/st merging or pairing? For example, we don't
3254// touch volatiles or load/stores that have a hint to avoid pair formation.
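// For example (illustrative): two adjacent 'ldr x0, [sp, #16]' and
// 'ldr x1, [sp, #24]' are candidates that the load/store optimizer may later
// combine into 'ldp x0, x1, [sp, #16]'.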
3256
3257 bool IsPreLdSt = isPreLdSt(MI);
3258
3259 // If this is a volatile load/store, don't mess with it.
3260 if (MI.hasOrderedMemoryRef())
3261 return false;
3262
3263 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
3264 // For Pre-inc LD/ST, the operand is shifted by one.
3265 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
3266 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
3267 "Expected a reg or frame index operand.");
3268
3269 // For Pre-indexed addressing quadword instructions, the third operand is the
3270 // immediate value.
3271 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
3272
3273 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
3274 return false;
3275
3276 // Can't merge/pair if the instruction modifies the base register.
3277 // e.g., ldr x0, [x0]
3278 // This case will never occur with an FI base.
3279 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
3280 // STR<S,D,Q,W,X>pre, it can be merged.
3281 // For example:
3282 // ldr q0, [x11, #32]!
3283 // ldr q1, [x11, #16]
3284 // to
3285 // ldp q0, q1, [x11, #32]!
3286 if (MI.getOperand(1).isReg() && !IsPreLdSt) {
3287 Register BaseReg = MI.getOperand(1).getReg();
3289 if (MI.modifiesRegister(BaseReg, TRI))
3290 return false;
3291 }
3292
3293 // Pairing SVE fills/spills is only valid for little-endian targets that
3294 // implement VLS 128.
3295 switch (MI.getOpcode()) {
3296 default:
3297 break;
3298 case AArch64::LDR_ZXI:
3299 case AArch64::STR_ZXI:
3300 if (!Subtarget.isLittleEndian() ||
3301 Subtarget.getSVEVectorSizeInBits() != 128)
3302 return false;
3303 }
3304
3305 // Check if this load/store has a hint to avoid pair formation.
3306 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
3308 return false;
3309
3310 // Do not pair any callee-save store/reload instructions in the
3311 // prologue/epilogue if the CFI information encoded the operations as separate
3312 // instructions, as that will cause the size of the actual prologue to mismatch
3313 // with the prologue size recorded in the Windows CFI.
3314 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
3315 bool NeedsWinCFI = MAI->usesWindowsCFI() &&
3316 MI.getMF()->getFunction().needsUnwindTableEntry();
3317 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
3319 return false;
3320
3321 // On some CPUs quad load/store pairs are slower than two single load/stores.
3322 if (Subtarget.isPaired128Slow()) {
3323 switch (MI.getOpcode()) {
3324 default:
3325 break;
3326 case AArch64::LDURQi:
3327 case AArch64::STURQi:
3328 case AArch64::LDRQui:
3329 case AArch64::STRQui:
3330 return false;
3331 }
3332 }
3333
3334 return true;
3335}
3336
3339 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
3340 const TargetRegisterInfo *TRI) const {
3341 if (!LdSt.mayLoadOrStore())
3342 return false;
3343
3344 const MachineOperand *BaseOp;
3345 TypeSize WidthN(0, false);
3346 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
3347 WidthN, TRI))
3348 return false;
3349 // The maximum vscale is 16 under AArch64; return the maximal extent for the
3350 // vector.
3351 Width = LocationSize::precise(WidthN);
3352 BaseOps.push_back(BaseOp);
3353 return true;
3354}
3355
3356std::optional<ExtAddrMode>
3358 const TargetRegisterInfo *TRI) const {
3359 const MachineOperand *Base; // Filled with the base operand of MI.
3360 int64_t Offset; // Filled with the offset of MI.
3361 bool OffsetIsScalable;
3362 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
3363 return std::nullopt;
3364
3365 if (!Base->isReg())
3366 return std::nullopt;
3367 ExtAddrMode AM;
3368 AM.BaseReg = Base->getReg();
3369 AM.Displacement = Offset;
3370 AM.ScaledReg = 0;
3371 AM.Scale = 0;
3372 return AM;
3373}
3374
3376 Register Reg,
3377 const MachineInstr &AddrI,
3378 ExtAddrMode &AM) const {
3379 // Filter out instructions into which we cannot fold.
3380 unsigned NumBytes;
3381 int64_t OffsetScale = 1;
3382 switch (MemI.getOpcode()) {
3383 default:
3384 return false;
3385
3386 case AArch64::LDURQi:
3387 case AArch64::STURQi:
3388 NumBytes = 16;
3389 break;
3390
3391 case AArch64::LDURDi:
3392 case AArch64::STURDi:
3393 case AArch64::LDURXi:
3394 case AArch64::STURXi:
3395 NumBytes = 8;
3396 break;
3397
3398 case AArch64::LDURWi:
3399 case AArch64::LDURSWi:
3400 case AArch64::STURWi:
3401 NumBytes = 4;
3402 break;
3403
3404 case AArch64::LDURHi:
3405 case AArch64::STURHi:
3406 case AArch64::LDURHHi:
3407 case AArch64::STURHHi:
3408 case AArch64::LDURSHXi:
3409 case AArch64::LDURSHWi:
3410 NumBytes = 2;
3411 break;
3412
3413 case AArch64::LDRBroX:
3414 case AArch64::LDRBBroX:
3415 case AArch64::LDRSBXroX:
3416 case AArch64::LDRSBWroX:
3417 case AArch64::STRBroX:
3418 case AArch64::STRBBroX:
3419 case AArch64::LDURBi:
3420 case AArch64::LDURBBi:
3421 case AArch64::LDURSBXi:
3422 case AArch64::LDURSBWi:
3423 case AArch64::STURBi:
3424 case AArch64::STURBBi:
3425 case AArch64::LDRBui:
3426 case AArch64::LDRBBui:
3427 case AArch64::LDRSBXui:
3428 case AArch64::LDRSBWui:
3429 case AArch64::STRBui:
3430 case AArch64::STRBBui:
3431 NumBytes = 1;
3432 break;
3433
3434 case AArch64::LDRQroX:
3435 case AArch64::STRQroX:
3436 case AArch64::LDRQui:
3437 case AArch64::STRQui:
3438 NumBytes = 16;
3439 OffsetScale = 16;
3440 break;
3441
3442 case AArch64::LDRDroX:
3443 case AArch64::STRDroX:
3444 case AArch64::LDRXroX:
3445 case AArch64::STRXroX:
3446 case AArch64::LDRDui:
3447 case AArch64::STRDui:
3448 case AArch64::LDRXui:
3449 case AArch64::STRXui:
3450 NumBytes = 8;
3451 OffsetScale = 8;
3452 break;
3453
3454 case AArch64::LDRWroX:
3455 case AArch64::LDRSWroX:
3456 case AArch64::STRWroX:
3457 case AArch64::LDRWui:
3458 case AArch64::LDRSWui:
3459 case AArch64::STRWui:
3460 NumBytes = 4;
3461 OffsetScale = 4;
3462 break;
3463
3464 case AArch64::LDRHroX:
3465 case AArch64::STRHroX:
3466 case AArch64::LDRHHroX:
3467 case AArch64::STRHHroX:
3468 case AArch64::LDRSHXroX:
3469 case AArch64::LDRSHWroX:
3470 case AArch64::LDRHui:
3471 case AArch64::STRHui:
3472 case AArch64::LDRHHui:
3473 case AArch64::STRHHui:
3474 case AArch64::LDRSHXui:
3475 case AArch64::LDRSHWui:
3476 NumBytes = 2;
3477 OffsetScale = 2;
3478 break;
3479 }
3480
3481 // Check the fold operand is not the loaded/stored value.
3482 const MachineOperand &BaseRegOp = MemI.getOperand(0);
3483 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
3484 return false;
3485
3486 // Handle memory instructions with a [Reg, Reg] addressing mode.
3487 if (MemI.getOperand(2).isReg()) {
3488 // Bail if the addressing mode already includes extension of the offset
3489 // register.
3490 if (MemI.getOperand(3).getImm())
3491 return false;
3492
3493 // Check if we actually have a scaled offset.
3494 if (MemI.getOperand(4).getImm() == 0)
3495 OffsetScale = 1;
3496
3497 // If the address instruction is folded into the base register, then the
3498 // addressing mode must not have a scale. Then we can swap the base and the
3499 // scaled registers.
3500 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
3501 return false;
3502
3503 switch (AddrI.getOpcode()) {
3504 default:
3505 return false;
3506
3507 case AArch64::SBFMXri:
3508 // sxtw Xa, Wm
3509 // ldr Xd, [Xn, Xa, lsl #N]
3510 // ->
3511 // ldr Xd, [Xn, Wm, sxtw #N]
3512 if (AddrI.getOperand(2).getImm() != 0 ||
3513 AddrI.getOperand(3).getImm() != 31)
3514 return false;
3515
3516 AM.BaseReg = MemI.getOperand(1).getReg();
3517 if (AM.BaseReg == Reg)
3518 AM.BaseReg = MemI.getOperand(2).getReg();
3519 AM.ScaledReg = AddrI.getOperand(1).getReg();
3520 AM.Scale = OffsetScale;
3521 AM.Displacement = 0;
3523 return true;
3524
3525 case TargetOpcode::SUBREG_TO_REG: {
3526 // mov Wa, Wm
3527 // ldr Xd, [Xn, Xa, lsl #N]
3528 // ->
3529 // ldr Xd, [Xn, Wm, uxtw #N]
3530
3531 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
3532 if (AddrI.getOperand(1).getImm() != 0 ||
3533 AddrI.getOperand(3).getImm() != AArch64::sub_32)
3534 return false;
3535
3536 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
3537 Register OffsetReg = AddrI.getOperand(2).getReg();
3538 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
3539 return false;
3540
3541 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
3542 if (DefMI.getOpcode() != AArch64::ORRWrs ||
3543 DefMI.getOperand(1).getReg() != AArch64::WZR ||
3544 DefMI.getOperand(3).getImm() != 0)
3545 return false;
3546
3547 AM.BaseReg = MemI.getOperand(1).getReg();
3548 if (AM.BaseReg == Reg)
3549 AM.BaseReg = MemI.getOperand(2).getReg();
3550 AM.ScaledReg = DefMI.getOperand(2).getReg();
3551 AM.Scale = OffsetScale;
3552 AM.Displacement = 0;
3554 return true;
3555 }
3556 }
3557 }
3558
3559 // Handle memory instructions with a [Reg, #Imm] addressing mode.
3560
3561 // Check we are not breaking a potential conversion to an LDP.
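// (Illustrative note: LDP/STP use a 7-bit signed immediate scaled by the
// access size, so e.g. 8-byte accesses can only reach offsets -512..504 in
// steps of 8; the ranges below reflect this.)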
3562 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
3563 int64_t NewOffset) -> bool {
3564 int64_t MinOffset, MaxOffset;
3565 switch (NumBytes) {
3566 default:
3567 return true;
3568 case 4:
3569 MinOffset = -256;
3570 MaxOffset = 252;
3571 break;
3572 case 8:
3573 MinOffset = -512;
3574 MaxOffset = 504;
3575 break;
3576 case 16:
3577 MinOffset = -1024;
3578 MaxOffset = 1008;
3579 break;
3580 }
3581 return OldOffset < MinOffset || OldOffset > MaxOffset ||
3582 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
3583 };
3584 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
3585 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
3586 int64_t NewOffset = OldOffset + Disp;
3587 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
3588 return false;
3589 // If the old offset would fit into an LDP, but the new offset wouldn't,
3590 // bail out.
3591 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
3592 return false;
3593 AM.BaseReg = AddrI.getOperand(1).getReg();
3594 AM.ScaledReg = 0;
3595 AM.Scale = 0;
3596 AM.Displacement = NewOffset;
3598 return true;
3599 };
3600
3601 auto canFoldAddRegIntoAddrMode =
3602 [&](int64_t Scale,
3604 if (MemI.getOperand(2).getImm() != 0)
3605 return false;
3606 if ((unsigned)Scale != Scale)
3607 return false;
3608 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
3609 return false;
3610 AM.BaseReg = AddrI.getOperand(1).getReg();
3611 AM.ScaledReg = AddrI.getOperand(2).getReg();
3612 AM.Scale = Scale;
3613 AM.Displacement = 0;
3614 AM.Form = Form;
3615 return true;
3616 };
3617
3618 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
3619 unsigned Opcode = MemI.getOpcode();
3620 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
3621 Subtarget.isSTRQroSlow();
3622 };
3623
3624 int64_t Disp = 0;
3625 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
3626 switch (AddrI.getOpcode()) {
3627 default:
3628 return false;
3629
3630 case AArch64::ADDXri:
3631 // add Xa, Xn, #N
3632 // ldr Xd, [Xa, #M]
3633 // ->
3634 // ldr Xd, [Xn, #N'+M]
3635 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3636 return canFoldAddSubImmIntoAddrMode(Disp);
3637
3638 case AArch64::SUBXri:
3639 // sub Xa, Xn, #N
3640 // ldr Xd, [Xa, #M]
3641 // ->
3642 // ldr Xd, [Xn, #N'+M]
3643 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3644 return canFoldAddSubImmIntoAddrMode(-Disp);
3645
3646 case AArch64::ADDXrs: {
3647 // add Xa, Xn, Xm, lsl #N
3648 // ldr Xd, [Xa]
3649 // ->
3650 // ldr Xd, [Xn, Xm, lsl #N]
3651
3652 // Don't fold the add if the result would be slower, unless optimising for
3653 // size.
3654 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3655 if (AArch64_AM::getShiftType(Shift) != AArch64_AM::LSL)
3656 return false;
3657 Shift = AArch64_AM::getShiftValue(Shift);
3658 if (!OptSize) {
3659 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3660 return false;
3661 if (avoidSlowSTRQ(MemI))
3662 return false;
3663 }
3664 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3665 }
3666
3667 case AArch64::ADDXrr:
3668 // add Xa, Xn, Xm
3669 // ldr Xd, [Xa]
3670 // ->
3671 // ldr Xd, [Xn, Xm, lsl #0]
3672
3673 // Don't fold the add if the result would be slower, unless optimising for
3674 // size.
3675 if (!OptSize && avoidSlowSTRQ(MemI))
3676 return false;
3677 return canFoldAddRegIntoAddrMode(1);
3678
3679 case AArch64::ADDXrx:
3680 // add Xa, Xn, Wm, {s,u}xtw #N
3681 // ldr Xd, [Xa]
3682 // ->
3683 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3684
3685 // Don't fold the add if the result would be slower, unless optimising for
3686 // size.
3687 if (!OptSize && avoidSlowSTRQ(MemI))
3688 return false;
3689
3690 // Can fold only sign-/zero-extend of a word.
3691 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3692 AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
3693 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3694 return false;
3695
3696 return canFoldAddRegIntoAddrMode(
3697 1ULL << AArch64_AM::getArithShiftValue(Imm),
3698 (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
3699 : ExtAddrMode::Formula::ZExtScaledReg);
3700 }
3701}
3702
3703// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3704// return the opcode of an instruction performing the same operation, but using
3705// the [Reg, Reg] addressing mode.
3706static unsigned regOffsetOpcode(unsigned Opcode) {
3707 switch (Opcode) {
3708 default:
3709 llvm_unreachable("Address folding not implemented for instruction");
3710
3711 case AArch64::LDURQi:
3712 case AArch64::LDRQui:
3713 return AArch64::LDRQroX;
3714 case AArch64::STURQi:
3715 case AArch64::STRQui:
3716 return AArch64::STRQroX;
3717 case AArch64::LDURDi:
3718 case AArch64::LDRDui:
3719 return AArch64::LDRDroX;
3720 case AArch64::STURDi:
3721 case AArch64::STRDui:
3722 return AArch64::STRDroX;
3723 case AArch64::LDURXi:
3724 case AArch64::LDRXui:
3725 return AArch64::LDRXroX;
3726 case AArch64::STURXi:
3727 case AArch64::STRXui:
3728 return AArch64::STRXroX;
3729 case AArch64::LDURWi:
3730 case AArch64::LDRWui:
3731 return AArch64::LDRWroX;
3732 case AArch64::LDURSWi:
3733 case AArch64::LDRSWui:
3734 return AArch64::LDRSWroX;
3735 case AArch64::STURWi:
3736 case AArch64::STRWui:
3737 return AArch64::STRWroX;
3738 case AArch64::LDURHi:
3739 case AArch64::LDRHui:
3740 return AArch64::LDRHroX;
3741 case AArch64::STURHi:
3742 case AArch64::STRHui:
3743 return AArch64::STRHroX;
3744 case AArch64::LDURHHi:
3745 case AArch64::LDRHHui:
3746 return AArch64::LDRHHroX;
3747 case AArch64::STURHHi:
3748 case AArch64::STRHHui:
3749 return AArch64::STRHHroX;
3750 case AArch64::LDURSHXi:
3751 case AArch64::LDRSHXui:
3752 return AArch64::LDRSHXroX;
3753 case AArch64::LDURSHWi:
3754 case AArch64::LDRSHWui:
3755 return AArch64::LDRSHWroX;
3756 case AArch64::LDURBi:
3757 case AArch64::LDRBui:
3758 return AArch64::LDRBroX;
3759 case AArch64::LDURBBi:
3760 case AArch64::LDRBBui:
3761 return AArch64::LDRBBroX;
3762 case AArch64::LDURSBXi:
3763 case AArch64::LDRSBXui:
3764 return AArch64::LDRSBXroX;
3765 case AArch64::LDURSBWi:
3766 case AArch64::LDRSBWui:
3767 return AArch64::LDRSBWroX;
3768 case AArch64::STURBi:
3769 case AArch64::STRBui:
3770 return AArch64::STRBroX;
3771 case AArch64::STURBBi:
3772 case AArch64::STRBBui:
3773 return AArch64::STRBBroX;
3774 }
3775}
3776
3777// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3778// the opcode of an instruction performing the same operation, but using the
3779// [Reg, #Imm] addressing mode with scaled offset.
3780unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3781 switch (Opcode) {
3782 default:
3783 llvm_unreachable("Address folding not implemented for instruction");
3784
3785 case AArch64::LDURQi:
3786 Scale = 16;
3787 return AArch64::LDRQui;
3788 case AArch64::STURQi:
3789 Scale = 16;
3790 return AArch64::STRQui;
3791 case AArch64::LDURDi:
3792 Scale = 8;
3793 return AArch64::LDRDui;
3794 case AArch64::STURDi:
3795 Scale = 8;
3796 return AArch64::STRDui;
3797 case AArch64::LDURXi:
3798 Scale = 8;
3799 return AArch64::LDRXui;
3800 case AArch64::STURXi:
3801 Scale = 8;
3802 return AArch64::STRXui;
3803 case AArch64::LDURWi:
3804 Scale = 4;
3805 return AArch64::LDRWui;
3806 case AArch64::LDURSWi:
3807 Scale = 4;
3808 return AArch64::LDRSWui;
3809 case AArch64::STURWi:
3810 Scale = 4;
3811 return AArch64::STRWui;
3812 case AArch64::LDURHi:
3813 Scale = 2;
3814 return AArch64::LDRHui;
3815 case AArch64::STURHi:
3816 Scale = 2;
3817 return AArch64::STRHui;
3818 case AArch64::LDURHHi:
3819 Scale = 2;
3820 return AArch64::LDRHHui;
3821 case AArch64::STURHHi:
3822 Scale = 2;
3823 return AArch64::STRHHui;
3824 case AArch64::LDURSHXi:
3825 Scale = 2;
3826 return AArch64::LDRSHXui;
3827 case AArch64::LDURSHWi:
3828 Scale = 2;
3829 return AArch64::LDRSHWui;
3830 case AArch64::LDURBi:
3831 Scale = 1;
3832 return AArch64::LDRBui;
3833 case AArch64::LDURBBi:
3834 Scale = 1;
3835 return AArch64::LDRBBui;
3836 case AArch64::LDURSBXi:
3837 Scale = 1;
3838 return AArch64::LDRSBXui;
3839 case AArch64::LDURSBWi:
3840 Scale = 1;
3841 return AArch64::LDRSBWui;
3842 case AArch64::STURBi:
3843 Scale = 1;
3844 return AArch64::STRBui;
3845 case AArch64::STURBBi:
3846 Scale = 1;
3847 return AArch64::STRBBui;
3848 case AArch64::LDRQui:
3849 case AArch64::STRQui:
3850 Scale = 16;
3851 return Opcode;
3852 case AArch64::LDRDui:
3853 case AArch64::STRDui:
3854 case AArch64::LDRXui:
3855 case AArch64::STRXui:
3856 Scale = 8;
3857 return Opcode;
3858 case AArch64::LDRWui:
3859 case AArch64::LDRSWui:
3860 case AArch64::STRWui:
3861 Scale = 4;
3862 return Opcode;
3863 case AArch64::LDRHui:
3864 case AArch64::STRHui:
3865 case AArch64::LDRHHui:
3866 case AArch64::STRHHui:
3867 case AArch64::LDRSHXui:
3868 case AArch64::LDRSHWui:
3869 Scale = 2;
3870 return Opcode;
3871 case AArch64::LDRBui:
3872 case AArch64::LDRBBui:
3873 case AArch64::LDRSBXui:
3874 case AArch64::LDRSBWui:
3875 case AArch64::STRBui:
3876 case AArch64::STRBBui:
3877 Scale = 1;
3878 return Opcode;
3879 }
3880}
3881
3882// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3883// the opcode of an instruction performing the same operation, but using the
3884// [Reg, #Imm] addressing mode with unscaled offset.
3885unsigned unscaledOffsetOpcode(unsigned Opcode) {
3886 switch (Opcode) {
3887 default:
3888 llvm_unreachable("Address folding not implemented for instruction");
3889
3890 case AArch64::LDURQi:
3891 case AArch64::STURQi:
3892 case AArch64::LDURDi:
3893 case AArch64::STURDi:
3894 case AArch64::LDURXi:
3895 case AArch64::STURXi:
3896 case AArch64::LDURWi:
3897 case AArch64::LDURSWi:
3898 case AArch64::STURWi:
3899 case AArch64::LDURHi:
3900 case AArch64::STURHi:
3901 case AArch64::LDURHHi:
3902 case AArch64::STURHHi:
3903 case AArch64::LDURSHXi:
3904 case AArch64::LDURSHWi:
3905 case AArch64::LDURBi:
3906 case AArch64::STURBi:
3907 case AArch64::LDURBBi:
3908 case AArch64::STURBBi:
3909 case AArch64::LDURSBWi:
3910 case AArch64::LDURSBXi:
3911 return Opcode;
3912 case AArch64::LDRQui:
3913 return AArch64::LDURQi;
3914 case AArch64::STRQui:
3915 return AArch64::STURQi;
3916 case AArch64::LDRDui:
3917 return AArch64::LDURDi;
3918 case AArch64::STRDui:
3919 return AArch64::STURDi;
3920 case AArch64::LDRXui:
3921 return AArch64::LDURXi;
3922 case AArch64::STRXui:
3923 return AArch64::STURXi;
3924 case AArch64::LDRWui:
3925 return AArch64::LDURWi;
3926 case AArch64::LDRSWui:
3927 return AArch64::LDURSWi;
3928 case AArch64::STRWui:
3929 return AArch64::STURWi;
3930 case AArch64::LDRHui:
3931 return AArch64::LDURHi;
3932 case AArch64::STRHui:
3933 return AArch64::STURHi;
3934 case AArch64::LDRHHui:
3935 return AArch64::LDURHHi;
3936 case AArch64::STRHHui:
3937 return AArch64::STURHHi;
3938 case AArch64::LDRSHXui:
3939 return AArch64::LDURSHXi;
3940 case AArch64::LDRSHWui:
3941 return AArch64::LDURSHWi;
3942 case AArch64::LDRBBui:
3943 return AArch64::LDURBBi;
3944 case AArch64::LDRBui:
3945 return AArch64::LDURBi;
3946 case AArch64::STRBBui:
3947 return AArch64::STURBBi;
3948 case AArch64::STRBui:
3949 return AArch64::STURBi;
3950 case AArch64::LDRSBWui:
3951 return AArch64::LDURSBWi;
3952 case AArch64::LDRSBXui:
3953 return AArch64::LDURSBXi;
3954 }
3955}
3956
3957// Given the opcode of a memory load/store instruction, return the opcode of an
3958// instruction performing the same operation, but using
3959// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
3960// offset register.
3961static unsigned offsetExtendOpcode(unsigned Opcode) {
3962 switch (Opcode) {
3963 default:
3964 llvm_unreachable("Address folding not implemented for instruction");
3965
3966 case AArch64::LDRQroX:
3967 case AArch64::LDURQi:
3968 case AArch64::LDRQui:
3969 return AArch64::LDRQroW;
3970 case AArch64::STRQroX:
3971 case AArch64::STURQi:
3972 case AArch64::STRQui:
3973 return AArch64::STRQroW;
3974 case AArch64::LDRDroX:
3975 case AArch64::LDURDi:
3976 case AArch64::LDRDui:
3977 return AArch64::LDRDroW;
3978 case AArch64::STRDroX:
3979 case AArch64::STURDi:
3980 case AArch64::STRDui:
3981 return AArch64::STRDroW;
3982 case AArch64::LDRXroX:
3983 case AArch64::LDURXi:
3984 case AArch64::LDRXui:
3985 return AArch64::LDRXroW;
3986 case AArch64::STRXroX:
3987 case AArch64::STURXi:
3988 case AArch64::STRXui:
3989 return AArch64::STRXroW;
3990 case AArch64::LDRWroX:
3991 case AArch64::LDURWi:
3992 case AArch64::LDRWui:
3993 return AArch64::LDRWroW;
3994 case AArch64::LDRSWroX:
3995 case AArch64::LDURSWi:
3996 case AArch64::LDRSWui:
3997 return AArch64::LDRSWroW;
3998 case AArch64::STRWroX:
3999 case AArch64::STURWi:
4000 case AArch64::STRWui:
4001 return AArch64::STRWroW;
4002 case AArch64::LDRHroX:
4003 case AArch64::LDURHi:
4004 case AArch64::LDRHui:
4005 return AArch64::LDRHroW;
4006 case AArch64::STRHroX:
4007 case AArch64::STURHi:
4008 case AArch64::STRHui:
4009 return AArch64::STRHroW;
4010 case AArch64::LDRHHroX:
4011 case AArch64::LDURHHi:
4012 case AArch64::LDRHHui:
4013 return AArch64::LDRHHroW;
4014 case AArch64::STRHHroX:
4015 case AArch64::STURHHi:
4016 case AArch64::STRHHui:
4017 return AArch64::STRHHroW;
4018 case AArch64::LDRSHXroX:
4019 case AArch64::LDURSHXi:
4020 case AArch64::LDRSHXui:
4021 return AArch64::LDRSHXroW;
4022 case AArch64::LDRSHWroX:
4023 case AArch64::LDURSHWi:
4024 case AArch64::LDRSHWui:
4025 return AArch64::LDRSHWroW;
4026 case AArch64::LDRBroX:
4027 case AArch64::LDURBi:
4028 case AArch64::LDRBui:
4029 return AArch64::LDRBroW;
4030 case AArch64::LDRBBroX:
4031 case AArch64::LDURBBi:
4032 case AArch64::LDRBBui:
4033 return AArch64::LDRBBroW;
4034 case AArch64::LDRSBXroX:
4035 case AArch64::LDURSBXi:
4036 case AArch64::LDRSBXui:
4037 return AArch64::LDRSBXroW;
4038 case AArch64::LDRSBWroX:
4039 case AArch64::LDURSBWi:
4040 case AArch64::LDRSBWui:
4041 return AArch64::LDRSBWroW;
4042 case AArch64::STRBroX:
4043 case AArch64::STURBi:
4044 case AArch64::STRBui:
4045 return AArch64::STRBroW;
4046 case AArch64::STRBBroX:
4047 case AArch64::STURBBi:
4048 case AArch64::STRBBui:
4049 return AArch64::STRBBroW;
4050 }
4051}
4052
4053MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
4054 const ExtAddrMode &AM) const {
4055
4056 const DebugLoc &DL = MemI.getDebugLoc();
4057 MachineBasicBlock &MBB = *MemI.getParent();
4058 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
4059
4060 if (AM.Form == ExtAddrMode::Formula::Basic) {
4061 if (AM.ScaledReg) {
4062 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
4063 unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
4064 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4065 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
4066 .addReg(MemI.getOperand(0).getReg(),
4067 MemI.mayLoad() ? RegState::Define : 0)
4068 .addReg(AM.BaseReg)
4069 .addReg(AM.ScaledReg)
4070 .addImm(0)
4071 .addImm(AM.Scale > 1)
4072 .setMemRefs(MemI.memoperands())
4073 .setMIFlags(MemI.getFlags());
4074 return B.getInstr();
4075 }
4076
4077 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
4078 "Addressing mode not supported for folding");
4079
4080 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
4081 unsigned Scale = 1;
4082 unsigned Opcode = MemI.getOpcode();
4083 if (isInt<9>(AM.Displacement))
4084 Opcode = unscaledOffsetOpcode(Opcode);
4085 else
4086 Opcode = scaledOffsetOpcode(Opcode, Scale);
4087
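// Worked example (not in the original source): for an 8-byte load, a
// displacement of 40 fits the signed 9-bit unscaled range, so LDURXi with
// immediate 40 is used; a displacement of 4096 does not, so the scaled form
// LDRXui is chosen and the immediate becomes 4096 / 8 = 512.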
4088 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
4089 .addReg(MemI.getOperand(0).getReg(),
4090 MemI.mayLoad() ? RegState::Define : 0)
4091 .addReg(AM.BaseReg)
4092 .addImm(AM.Displacement / Scale)
4093 .setMemRefs(MemI.memoperands())
4094 .setMIFlags(MemI.getFlags());
4095 return B.getInstr();
4096 }
4097
4098 if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
4099 AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
4100 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
4101 assert(AM.ScaledReg && !AM.Displacement &&
4102 "Address offset can be a register or an immediate, but not both");
4103 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
4104 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
4105 // Make sure the offset register is in the correct register class.
4106 Register OffsetReg = AM.ScaledReg;
4107 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
4108 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
4109 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
4110 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
4111 .addReg(AM.ScaledReg, 0, AArch64::sub_32);
4112 }
4113 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
4114 .addReg(MemI.getOperand(0).getReg(),
4115 MemI.mayLoad() ? RegState::Define : 0)
4116 .addReg(AM.BaseReg)
4117 .addReg(OffsetReg)
4118 .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg)
4119 .addImm(AM.Scale != 1)
4120 .setMemRefs(MemI.memoperands())
4121 .setMIFlags(MemI.getFlags());
4122
4123 return B.getInstr();
4124 }
4125
4127 "Function must not be called with an addressing mode it can't handle");
4128}
4129
4130/// Return true if the opcode is a post-index ld/st instruction, which really
4131/// loads from base+0.
4132static bool isPostIndexLdStOpcode(unsigned Opcode) {
4133 switch (Opcode) {
4134 default:
4135 return false;
4136 case AArch64::LD1Fourv16b_POST:
4137 case AArch64::LD1Fourv1d_POST:
4138 case AArch64::LD1Fourv2d_POST:
4139 case AArch64::LD1Fourv2s_POST:
4140 case AArch64::LD1Fourv4h_POST:
4141 case AArch64::LD1Fourv4s_POST:
4142 case AArch64::LD1Fourv8b_POST:
4143 case AArch64::LD1Fourv8h_POST:
4144 case AArch64::LD1Onev16b_POST:
4145 case AArch64::LD1Onev1d_POST:
4146 case AArch64::LD1Onev2d_POST:
4147 case AArch64::LD1Onev2s_POST:
4148 case AArch64::LD1Onev4h_POST:
4149 case AArch64::LD1Onev4s_POST:
4150 case AArch64::LD1Onev8b_POST:
4151 case AArch64::LD1Onev8h_POST:
4152 case AArch64::LD1Rv16b_POST:
4153 case AArch64::LD1Rv1d_POST:
4154 case AArch64::LD1Rv2d_POST:
4155 case AArch64::LD1Rv2s_POST:
4156 case AArch64::LD1Rv4h_POST:
4157 case AArch64::LD1Rv4s_POST:
4158 case AArch64::LD1Rv8b_POST:
4159 case AArch64::LD1Rv8h_POST:
4160 case AArch64::LD1Threev16b_POST:
4161 case AArch64::LD1Threev1d_POST:
4162 case AArch64::LD1Threev2d_POST:
4163 case AArch64::LD1Threev2s_POST:
4164 case AArch64::LD1Threev4h_POST:
4165 case AArch64::LD1Threev4s_POST:
4166 case AArch64::LD1Threev8b_POST:
4167 case AArch64::LD1Threev8h_POST:
4168 case AArch64::LD1Twov16b_POST:
4169 case AArch64::LD1Twov1d_POST:
4170 case AArch64::LD1Twov2d_POST:
4171 case AArch64::LD1Twov2s_POST:
4172 case AArch64::LD1Twov4h_POST:
4173 case AArch64::LD1Twov4s_POST:
4174 case AArch64::LD1Twov8b_POST:
4175 case AArch64::LD1Twov8h_POST:
4176 case AArch64::LD1i16_POST:
4177 case AArch64::LD1i32_POST:
4178 case AArch64::LD1i64_POST:
4179 case AArch64::LD1i8_POST:
4180 case AArch64::LD2Rv16b_POST:
4181 case AArch64::LD2Rv1d_POST:
4182 case AArch64::LD2Rv2d_POST:
4183 case AArch64::LD2Rv2s_POST:
4184 case AArch64::LD2Rv4h_POST:
4185 case AArch64::LD2Rv4s_POST:
4186 case AArch64::LD2Rv8b_POST:
4187 case AArch64::LD2Rv8h_POST:
4188 case AArch64::LD2Twov16b_POST:
4189 case AArch64::LD2Twov2d_POST:
4190 case AArch64::LD2Twov2s_POST:
4191 case AArch64::LD2Twov4h_POST:
4192 case AArch64::LD2Twov4s_POST:
4193 case AArch64::LD2Twov8b_POST:
4194 case AArch64::LD2Twov8h_POST:
4195 case AArch64::LD2i16_POST:
4196 case AArch64::LD2i32_POST:
4197 case AArch64::LD2i64_POST:
4198 case AArch64::LD2i8_POST:
4199 case AArch64::LD3Rv16b_POST:
4200 case AArch64::LD3Rv1d_POST:
4201 case AArch64::LD3Rv2d_POST:
4202 case AArch64::LD3Rv2s_POST:
4203 case AArch64::LD3Rv4h_POST:
4204 case AArch64::LD3Rv4s_POST:
4205 case AArch64::LD3Rv8b_POST:
4206 case AArch64::LD3Rv8h_POST:
4207 case AArch64::LD3Threev16b_POST:
4208 case AArch64::LD3Threev2d_POST:
4209 case AArch64::LD3Threev2s_POST:
4210 case AArch64::LD3Threev4h_POST:
4211 case AArch64::LD3Threev4s_POST:
4212 case AArch64::LD3Threev8b_POST:
4213 case AArch64::LD3Threev8h_POST:
4214 case AArch64::LD3i16_POST:
4215 case AArch64::LD3i32_POST:
4216 case AArch64::LD3i64_POST:
4217 case AArch64::LD3i8_POST:
4218 case AArch64::LD4Fourv16b_POST:
4219 case AArch64::LD4Fourv2d_POST:
4220 case AArch64::LD4Fourv2s_POST:
4221 case AArch64::LD4Fourv4h_POST:
4222 case AArch64::LD4Fourv4s_POST:
4223 case AArch64::LD4Fourv8b_POST:
4224 case AArch64::LD4Fourv8h_POST:
4225 case AArch64::LD4Rv16b_POST:
4226 case AArch64::LD4Rv1d_POST:
4227 case AArch64::LD4Rv2d_POST:
4228 case AArch64::LD4Rv2s_POST:
4229 case AArch64::LD4Rv4h_POST:
4230 case AArch64::LD4Rv4s_POST:
4231 case AArch64::LD4Rv8b_POST:
4232 case AArch64::LD4Rv8h_POST:
4233 case AArch64::LD4i16_POST:
4234 case AArch64::LD4i32_POST:
4235 case AArch64::LD4i64_POST:
4236 case AArch64::LD4i8_POST:
4237 case AArch64::LDAPRWpost:
4238 case AArch64::LDAPRXpost:
4239 case AArch64::LDIAPPWpost:
4240 case AArch64::LDIAPPXpost:
4241 case AArch64::LDPDpost:
4242 case AArch64::LDPQpost:
4243 case AArch64::LDPSWpost:
4244 case AArch64::LDPSpost:
4245 case AArch64::LDPWpost:
4246 case AArch64::LDPXpost:
4247 case AArch64::LDRBBpost:
4248 case AArch64::LDRBpost:
4249 case AArch64::LDRDpost:
4250 case AArch64::LDRHHpost:
4251 case AArch64::LDRHpost:
4252 case AArch64::LDRQpost:
4253 case AArch64::LDRSBWpost:
4254 case AArch64::LDRSBXpost:
4255 case AArch64::LDRSHWpost:
4256 case AArch64::LDRSHXpost:
4257 case AArch64::LDRSWpost:
4258 case AArch64::LDRSpost:
4259 case AArch64::LDRWpost:
4260 case AArch64::LDRXpost:
4261 case AArch64::ST1Fourv16b_POST:
4262 case AArch64::ST1Fourv1d_POST:
4263 case AArch64::ST1Fourv2d_POST:
4264 case AArch64::ST1Fourv2s_POST:
4265 case AArch64::ST1Fourv4h_POST:
4266 case AArch64::ST1Fourv4s_POST:
4267 case AArch64::ST1Fourv8b_POST:
4268 case AArch64::ST1Fourv8h_POST:
4269 case AArch64::ST1Onev16b_POST:
4270 case AArch64::ST1Onev1d_POST:
4271 case AArch64::ST1Onev2d_POST:
4272 case AArch64::ST1Onev2s_POST:
4273 case AArch64::ST1Onev4h_POST:
4274 case AArch64::ST1Onev4s_POST:
4275 case AArch64::ST1Onev8b_POST:
4276 case AArch64::ST1Onev8h_POST:
4277 case AArch64::ST1Threev16b_POST:
4278 case AArch64::ST1Threev1d_POST:
4279 case AArch64::ST1Threev2d_POST:
4280 case AArch64::ST1Threev2s_POST:
4281 case AArch64::ST1Threev4h_POST:
4282 case AArch64::ST1Threev4s_POST:
4283 case AArch64::ST1Threev8b_POST:
4284 case AArch64::ST1Threev8h_POST:
4285 case AArch64::ST1Twov16b_POST:
4286 case AArch64::ST1Twov1d_POST:
4287 case AArch64::ST1Twov2d_POST:
4288 case AArch64::ST1Twov2s_POST:
4289 case AArch64::ST1Twov4h_POST:
4290 case AArch64::ST1Twov4s_POST:
4291 case AArch64::ST1Twov8b_POST:
4292 case AArch64::ST1Twov8h_POST:
4293 case AArch64::ST1i16_POST:
4294 case AArch64::ST1i32_POST:
4295 case AArch64::ST1i64_POST:
4296 case AArch64::ST1i8_POST:
4297 case AArch64::ST2GPostIndex:
4298 case AArch64::ST2Twov16b_POST:
4299 case AArch64::ST2Twov2d_POST:
4300 case AArch64::ST2Twov2s_POST:
4301 case AArch64::ST2Twov4h_POST:
4302 case AArch64::ST2Twov4s_POST:
4303 case AArch64::ST2Twov8b_POST:
4304 case AArch64::ST2Twov8h_POST:
4305 case AArch64::ST2i16_POST:
4306 case AArch64::ST2i32_POST:
4307 case AArch64::ST2i64_POST:
4308 case AArch64::ST2i8_POST:
4309 case AArch64::ST3Threev16b_POST:
4310 case AArch64::ST3Threev2d_POST:
4311 case AArch64::ST3Threev2s_POST:
4312 case AArch64::ST3Threev4h_POST:
4313 case AArch64::ST3Threev4s_POST:
4314 case AArch64::ST3Threev8b_POST:
4315 case AArch64::ST3Threev8h_POST:
4316 case AArch64::ST3i16_POST:
4317 case AArch64::ST3i32_POST:
4318 case AArch64::ST3i64_POST:
4319 case AArch64::ST3i8_POST:
4320 case AArch64::ST4Fourv16b_POST:
4321 case AArch64::ST4Fourv2d_POST:
4322 case AArch64::ST4Fourv2s_POST:
4323 case AArch64::ST4Fourv4h_POST:
4324 case AArch64::ST4Fourv4s_POST:
4325 case AArch64::ST4Fourv8b_POST:
4326 case AArch64::ST4Fourv8h_POST:
4327 case AArch64::ST4i16_POST:
4328 case AArch64::ST4i32_POST:
4329 case AArch64::ST4i64_POST:
4330 case AArch64::ST4i8_POST:
4331 case AArch64::STGPostIndex:
4332 case AArch64::STGPpost:
4333 case AArch64::STPDpost:
4334 case AArch64::STPQpost:
4335 case AArch64::STPSpost:
4336 case AArch64::STPWpost:
4337 case AArch64::STPXpost:
4338 case AArch64::STRBBpost:
4339 case AArch64::STRBpost:
4340 case AArch64::STRDpost:
4341 case AArch64::STRHHpost:
4342 case AArch64::STRHpost:
4343 case AArch64::STRQpost:
4344 case AArch64::STRSpost:
4345 case AArch64::STRWpost:
4346 case AArch64::STRXpost:
4347 case AArch64::STZ2GPostIndex:
4348 case AArch64::STZGPostIndex:
4349 return true;
4350 }
4351}
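// Illustrative note (not in the original source): a post-index access such as
// `ldr x0, [x1], #16` loads from [x1 + 0] and only afterwards adds 16 to x1,
// which is why the opcodes above are reported with an offset of 0.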
4352
4353bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
4354 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
4355 bool &OffsetIsScalable, TypeSize &Width,
4356 const TargetRegisterInfo *TRI) const {
4357 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4358 // Handle only loads/stores with base register followed by immediate offset.
4359 if (LdSt.getNumExplicitOperands() == 3) {
4360 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
4361 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
4362 !LdSt.getOperand(2).isImm())
4363 return false;
4364 } else if (LdSt.getNumExplicitOperands() == 4) {
4365 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
4366 if (!LdSt.getOperand(1).isReg() ||
4367 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
4368 !LdSt.getOperand(3).isImm())
4369 return false;
4370 } else
4371 return false;
4372
4373 // Get the scaling factor for the instruction and set the width for the
4374 // instruction.
4375 TypeSize Scale(0U, false);
4376 int64_t Dummy1, Dummy2;
4377
4378 // If this returns false, then it's an instruction we don't want to handle.
4379 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
4380 return false;
4381
4382 // Compute the offset. The offset is the immediate operand multiplied by the
4383 // scaling factor; unscaled instructions have a scaling factor of 1.
4384 // Post-index instructions are a special case and have an offset of 0.
4385 if (isPostIndexLdStOpcode(LdSt.getOpcode())) {
4386 BaseOp = &LdSt.getOperand(2);
4387 Offset = 0;
4388 } else if (LdSt.getNumExplicitOperands() == 3) {
4389 BaseOp = &LdSt.getOperand(1);
4390 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
4391 } else {
4392 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
4393 BaseOp = &LdSt.getOperand(2);
4394 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
4395 }
4396 OffsetIsScalable = Scale.isScalable();
4397
4398 return BaseOp->isReg() || BaseOp->isFI();
4399}
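// Worked example (not in the original source): for `ldr x1, [x0, #16]`
// (LDRXui) the immediate operand holds the scaled value 2, so BaseOp is x0,
// Offset is 2 * 8 = 16 and Width is 8 bytes; the unscaled LDURXi variant has
// Scale 1 and its immediate is already a byte offset.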
4400
4403 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4404 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
4405 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
4406 return OfsOp;
4407}
4408
4409bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
4410 TypeSize &Width, int64_t &MinOffset,
4411 int64_t &MaxOffset) {
4412 switch (Opcode) {
4413 // Not a memory operation or something we want to handle.
4414 default:
4415 Scale = TypeSize::getFixed(0);
4416 Width = TypeSize::getFixed(0);
4417 MinOffset = MaxOffset = 0;
4418 return false;
4419 // LDR / STR
4420 case AArch64::LDRQui:
4421 case AArch64::STRQui:
4422 Scale = TypeSize::getFixed(16);
4423 Width = TypeSize::getFixed(16);
4424 MinOffset = 0;
4425 MaxOffset = 4095;
4426 break;
4427 case AArch64::LDRXui:
4428 case AArch64::LDRDui:
4429 case AArch64::STRXui:
4430 case AArch64::STRDui:
4431 case AArch64::PRFMui:
4432 Scale = TypeSize::getFixed(8);
4433 Width = TypeSize::getFixed(8);
4434 MinOffset = 0;
4435 MaxOffset = 4095;
4436 break;
4437 case AArch64::LDRWui:
4438 case AArch64::LDRSui:
4439 case AArch64::LDRSWui:
4440 case AArch64::STRWui:
4441 case AArch64::STRSui:
4442 Scale = TypeSize::getFixed(4);
4443 Width = TypeSize::getFixed(4);
4444 MinOffset = 0;
4445 MaxOffset = 4095;
4446 break;
4447 case AArch64::LDRHui:
4448 case AArch64::LDRHHui:
4449 case AArch64::LDRSHWui:
4450 case AArch64::LDRSHXui:
4451 case AArch64::STRHui:
4452 case AArch64::STRHHui:
4453 Scale = TypeSize::getFixed(2);
4454 Width = TypeSize::getFixed(2);
4455 MinOffset = 0;
4456 MaxOffset = 4095;
4457 break;
4458 case AArch64::LDRBui:
4459 case AArch64::LDRBBui:
4460 case AArch64::LDRSBWui:
4461 case AArch64::LDRSBXui:
4462 case AArch64::STRBui:
4463 case AArch64::STRBBui:
4464 Scale = TypeSize::getFixed(1);
4465 Width = TypeSize::getFixed(1);
4466 MinOffset = 0;
4467 MaxOffset = 4095;
4468 break;
4469 // post/pre inc
4470 case AArch64::STRQpre:
4471 case AArch64::LDRQpost:
4472 Scale = TypeSize::getFixed(1);
4473 Width = TypeSize::getFixed(16);
4474 MinOffset = -256;
4475 MaxOffset = 255;
4476 break;
4477 case AArch64::LDRDpost:
4478 case AArch64::LDRDpre:
4479 case AArch64::LDRXpost:
4480 case AArch64::LDRXpre:
4481 case AArch64::STRDpost:
4482 case AArch64::STRDpre:
4483 case AArch64::STRXpost:
4484 case AArch64::STRXpre:
4485 Scale = TypeSize::getFixed(1);
4486 Width = TypeSize::getFixed(8);
4487 MinOffset = -256;
4488 MaxOffset = 255;
4489 break;
4490 case AArch64::STRWpost:
4491 case AArch64::STRWpre:
4492 case AArch64::LDRWpost:
4493 case AArch64::LDRWpre:
4494 case AArch64::STRSpost:
4495 case AArch64::STRSpre:
4496 case AArch64::LDRSpost:
4497 case AArch64::LDRSpre:
4498 Scale = TypeSize::getFixed(1);
4499 Width = TypeSize::getFixed(4);
4500 MinOffset = -256;
4501 MaxOffset = 255;
4502 break;
4503 case AArch64::LDRHpost:
4504 case AArch64::LDRHpre:
4505 case AArch64::STRHpost:
4506 case AArch64::STRHpre:
4507 case AArch64::LDRHHpost:
4508 case AArch64::LDRHHpre:
4509 case AArch64::STRHHpost:
4510 case AArch64::STRHHpre:
4511 Scale = TypeSize::getFixed(1);
4512 Width = TypeSize::getFixed(2);
4513 MinOffset = -256;
4514 MaxOffset = 255;
4515 break;
4516 case AArch64::LDRBpost:
4517 case AArch64::LDRBpre:
4518 case AArch64::STRBpost:
4519 case AArch64::STRBpre:
4520 case AArch64::LDRBBpost:
4521 case AArch64::LDRBBpre:
4522 case AArch64::STRBBpost:
4523 case AArch64::STRBBpre:
4524 Scale = TypeSize::getFixed(1);
4525 Width = TypeSize::getFixed(1);
4526 MinOffset = -256;
4527 MaxOffset = 255;
4528 break;
4529 // Unscaled
4530 case AArch64::LDURQi:
4531 case AArch64::STURQi:
4532 Scale = TypeSize::getFixed(1);
4533 Width = TypeSize::getFixed(16);
4534 MinOffset = -256;
4535 MaxOffset = 255;
4536 break;
4537 case AArch64::LDURXi:
4538 case AArch64::LDURDi:
4539 case AArch64::LDAPURXi:
4540 case AArch64::STURXi:
4541 case AArch64::STURDi:
4542 case AArch64::STLURXi:
4543 case AArch64::PRFUMi:
4544 Scale = TypeSize::getFixed(1);
4545 Width = TypeSize::getFixed(8);
4546 MinOffset = -256;
4547 MaxOffset = 255;
4548 break;
4549 case AArch64::LDURWi:
4550 case AArch64::LDURSi:
4551 case AArch64::LDURSWi:
4552 case AArch64::LDAPURi:
4553 case AArch64::LDAPURSWi:
4554 case AArch64::STURWi:
4555 case AArch64::STURSi:
4556 case AArch64::STLURWi:
4557 Scale = TypeSize::getFixed(1);
4558 Width = TypeSize::getFixed(4);
4559 MinOffset = -256;
4560 MaxOffset = 255;
4561 break;
4562 case AArch64::LDURHi:
4563 case AArch64::LDURHHi:
4564 case AArch64::LDURSHXi:
4565 case AArch64::LDURSHWi:
4566 case AArch64::LDAPURHi:
4567 case AArch64::LDAPURSHWi:
4568 case AArch64::LDAPURSHXi:
4569 case AArch64::STURHi:
4570 case AArch64::STURHHi:
4571 case AArch64::STLURHi:
4572 Scale = TypeSize::getFixed(1);
4573 Width = TypeSize::getFixed(2);
4574 MinOffset = -256;
4575 MaxOffset = 255;
4576 break;
4577 case AArch64::LDURBi:
4578 case AArch64::LDURBBi:
4579 case AArch64::LDURSBXi:
4580 case AArch64::LDURSBWi:
4581 case AArch64::LDAPURBi:
4582 case AArch64::LDAPURSBWi:
4583 case AArch64::LDAPURSBXi:
4584 case AArch64::STURBi:
4585 case AArch64::STURBBi:
4586 case AArch64::STLURBi:
4587 Scale = TypeSize::getFixed(1);
4588 Width = TypeSize::getFixed(1);
4589 MinOffset = -256;
4590 MaxOffset = 255;
4591 break;
4592 // LDP / STP (including pre/post inc)
4593 case AArch64::LDPQi:
4594 case AArch64::LDNPQi:
4595 case AArch64::STPQi:
4596 case AArch64::STNPQi:
4597 case AArch64::LDPQpost:
4598 case AArch64::LDPQpre:
4599 case AArch64::STPQpost:
4600 case AArch64::STPQpre:
4601 Scale = TypeSize::getFixed(16);
4602 Width = TypeSize::getFixed(16 * 2);
4603 MinOffset = -64;
4604 MaxOffset = 63;
4605 break;
4606 case AArch64::LDPXi:
4607 case AArch64::LDPDi:
4608 case AArch64::LDNPXi:
4609 case AArch64::LDNPDi:
4610 case AArch64::STPXi:
4611 case AArch64::STPDi:
4612 case AArch64::STNPXi:
4613 case AArch64::STNPDi:
4614 case AArch64::LDPDpost:
4615 case AArch64::LDPDpre:
4616 case AArch64::LDPXpost:
4617 case AArch64::LDPXpre:
4618 case AArch64::STPDpost:
4619 case AArch64::STPDpre:
4620 case AArch64::STPXpost:
4621 case AArch64::STPXpre:
4622 Scale = TypeSize::getFixed(8);
4623 Width = TypeSize::getFixed(8 * 2);
4624 MinOffset = -64;
4625 MaxOffset = 63;
4626 break;
4627 case AArch64::LDPWi:
4628 case AArch64::LDPSi:
4629 case AArch64::LDNPWi:
4630 case AArch64::LDNPSi:
4631 case AArch64::STPWi:
4632 case AArch64::STPSi:
4633 case AArch64::STNPWi:
4634 case AArch64::STNPSi:
4635 case AArch64::LDPSpost:
4636 case AArch64::LDPSpre:
4637 case AArch64::LDPWpost:
4638 case AArch64::LDPWpre:
4639 case AArch64::STPSpost:
4640 case AArch64::STPSpre:
4641 case AArch64::STPWpost:
4642 case AArch64::STPWpre:
4643 Scale = TypeSize::getFixed(4);
4644 Width = TypeSize::getFixed(4 * 2);
4645 MinOffset = -64;
4646 MaxOffset = 63;
4647 break;
4648 case AArch64::StoreSwiftAsyncContext:
4649 // Store is an STRXui, but there might be an ADDXri in the expansion too.
4650 Scale = TypeSize::getFixed(1);
4651 Width = TypeSize::getFixed(8);
4652 MinOffset = 0;
4653 MaxOffset = 4095;
4654 break;
4655 case AArch64::ADDG:
4656 Scale = TypeSize::getFixed(16);
4657 Width = TypeSize::getFixed(0);
4658 MinOffset = 0;
4659 MaxOffset = 63;
4660 break;
4661 case AArch64::TAGPstack:
4662 Scale = TypeSize::getFixed(16);
4663 Width = TypeSize::getFixed(0);
4664 // TAGP with a negative offset turns into SUBP, which has a maximum offset
4665 // of 63 (not 64!).
4666 MinOffset = -63;
4667 MaxOffset = 63;
4668 break;
4669 case AArch64::LDG:
4670 case AArch64::STGi:
4671 case AArch64::STGPreIndex:
4672 case AArch64::STGPostIndex:
4673 case AArch64::STZGi:
4674 case AArch64::STZGPreIndex:
4675 case AArch64::STZGPostIndex:
4676 Scale = TypeSize::getFixed(16);
4677 Width = TypeSize::getFixed(16);
4678 MinOffset = -256;
4679 MaxOffset = 255;
4680 break;
4681 // SVE
4682 case AArch64::STR_ZZZZXI:
4683 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
4684 case AArch64::LDR_ZZZZXI:
4685 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
4686 Scale = TypeSize::getScalable(16);
4687 Width = TypeSize::getScalable(16 * 4);
4688 MinOffset = -256;
4689 MaxOffset = 252;
4690 break;
4691 case AArch64::STR_ZZZXI:
4692 case AArch64::LDR_ZZZXI:
4693 Scale = TypeSize::getScalable(16);
4694 Width = TypeSize::getScalable(16 * 3);
4695 MinOffset = -256;
4696 MaxOffset = 253;
4697 break;
4698 case AArch64::STR_ZZXI:
4699 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
4700 case AArch64::LDR_ZZXI:
4701 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
4702 Scale = TypeSize::getScalable(16);
4703 Width = TypeSize::getScalable(16 * 2);
4704 MinOffset = -256;
4705 MaxOffset = 254;
4706 break;
4707 case AArch64::LDR_PXI:
4708 case AArch64::STR_PXI:
4709 Scale = TypeSize::getScalable(2);
4710 Width = TypeSize::getScalable(2);
4711 MinOffset = -256;
4712 MaxOffset = 255;
4713 break;
4714 case AArch64::LDR_PPXI:
4715 case AArch64::STR_PPXI:
4716 Scale = TypeSize::getScalable(2);
4717 Width = TypeSize::getScalable(2 * 2);
4718 MinOffset = -256;
4719 MaxOffset = 254;
4720 break;
4721 case AArch64::LDR_ZXI:
4722 case AArch64::STR_ZXI:
4723 Scale = TypeSize::getScalable(16);
4724 Width = TypeSize::getScalable(16);
4725 MinOffset = -256;
4726 MaxOffset = 255;
4727 break;
4728 case AArch64::LD1B_IMM:
4729 case AArch64::LD1H_IMM:
4730 case AArch64::LD1W_IMM:
4731 case AArch64::LD1D_IMM:
4732 case AArch64::LDNT1B_ZRI:
4733 case AArch64::LDNT1H_ZRI:
4734 case AArch64::LDNT1W_ZRI:
4735 case AArch64::LDNT1D_ZRI:
4736 case AArch64::ST1B_IMM:
4737 case AArch64::ST1H_IMM:
4738 case AArch64::ST1W_IMM:
4739 case AArch64::ST1D_IMM:
4740 case AArch64::STNT1B_ZRI:
4741 case AArch64::STNT1H_ZRI:
4742 case AArch64::STNT1W_ZRI:
4743 case AArch64::STNT1D_ZRI:
4744 case AArch64::LDNF1B_IMM:
4745 case AArch64::LDNF1H_IMM:
4746 case AArch64::LDNF1W_IMM:
4747 case AArch64::LDNF1D_IMM:
4748 // A full vector's worth of data
4749 // Width = mbytes * elements
4750 Scale = TypeSize::getScalable(16);
4751 Width = TypeSize::getScalable(16);
4752 MinOffset = -8;
4753 MaxOffset = 7;
4754 break;
4755 case AArch64::LD2B_IMM:
4756 case AArch64::LD2H_IMM:
4757 case AArch64::LD2W_IMM:
4758 case AArch64::LD2D_IMM:
4759 case AArch64::ST2B_IMM:
4760 case AArch64::ST2H_IMM:
4761 case AArch64::ST2W_IMM:
4762 case AArch64::ST2D_IMM:
4763 Scale = TypeSize::getScalable(32);
4764 Width = TypeSize::getScalable(16 * 2);
4765 MinOffset = -8;
4766 MaxOffset = 7;
4767 break;
4768 case AArch64::LD3B_IMM:
4769 case AArch64::LD3H_IMM:
4770 case AArch64::LD3W_IMM:
4771 case AArch64::LD3D_IMM:
4772 case AArch64::ST3B_IMM:
4773 case AArch64::ST3H_IMM:
4774 case AArch64::ST3W_IMM:
4775 case AArch64::ST3D_IMM:
4776 Scale = TypeSize::getScalable(48);
4777 Width = TypeSize::getScalable(16 * 3);
4778 MinOffset = -8;
4779 MaxOffset = 7;
4780 break;
4781 case AArch64::LD4B_IMM:
4782 case AArch64::LD4H_IMM:
4783 case AArch64::LD4W_IMM:
4784 case AArch64::LD4D_IMM:
4785 case AArch64::ST4B_IMM:
4786 case AArch64::ST4H_IMM:
4787 case AArch64::ST4W_IMM:
4788 case AArch64::ST4D_IMM:
4789 Scale = TypeSize::getScalable(64);
4790 Width = TypeSize::getScalable(16 * 4);
4791 MinOffset = -8;
4792 MaxOffset = 7;
4793 break;
4794 case AArch64::LD1B_H_IMM:
4795 case AArch64::LD1SB_H_IMM:
4796 case AArch64::LD1H_S_IMM:
4797 case AArch64::LD1SH_S_IMM:
4798 case AArch64::LD1W_D_IMM:
4799 case AArch64::LD1SW_D_IMM:
4800 case AArch64::ST1B_H_IMM:
4801 case AArch64::ST1H_S_IMM:
4802 case AArch64::ST1W_D_IMM:
4803 case AArch64::LDNF1B_H_IMM:
4804 case AArch64::LDNF1SB_H_IMM:
4805 case AArch64::LDNF1H_S_IMM:
4806 case AArch64::LDNF1SH_S_IMM:
4807 case AArch64::LDNF1W_D_IMM:
4808 case AArch64::LDNF1SW_D_IMM:
4809 // A half vector's worth of data
4810 // Width = mbytes * elements
4811 Scale = TypeSize::getScalable(8);
4812 Width = TypeSize::getScalable(8);
4813 MinOffset = -8;
4814 MaxOffset = 7;
4815 break;
4816 case AArch64::LD1B_S_IMM:
4817 case AArch64::LD1SB_S_IMM:
4818 case AArch64::LD1H_D_IMM:
4819 case AArch64::LD1SH_D_IMM:
4820 case AArch64::ST1B_S_IMM:
4821 case AArch64::ST1H_D_IMM:
4822 case AArch64::LDNF1B_S_IMM:
4823 case AArch64::LDNF1SB_S_IMM:
4824 case AArch64::LDNF1H_D_IMM:
4825 case AArch64::LDNF1SH_D_IMM:
4826 // A quarter vector's worth of data
4827 // Width = mbytes * elements
4828 Scale = TypeSize::getScalable(4);
4829 Width = TypeSize::getScalable(4);
4830 MinOffset = -8;
4831 MaxOffset = 7;
4832 break;
4833 case AArch64::LD1B_D_IMM:
4834 case AArch64::LD1SB_D_IMM:
4835 case AArch64::ST1B_D_IMM:
4836 case AArch64::LDNF1B_D_IMM:
4837 case AArch64::LDNF1SB_D_IMM:
4838 // An eighth vector's worth of data
4839 // Width = mbytes * elements
4840 Scale = TypeSize::getScalable(2);
4841 Width = TypeSize::getScalable(2);
4842 MinOffset = -8;
4843 MaxOffset = 7;
4844 break;
4845 case AArch64::ST2Gi:
4846 case AArch64::ST2GPreIndex:
4847 case AArch64::ST2GPostIndex:
4848 case AArch64::STZ2Gi:
4849 case AArch64::STZ2GPreIndex:
4850 case AArch64::STZ2GPostIndex:
4851 Scale = TypeSize::getFixed(16);
4852 Width = TypeSize::getFixed(32);
4853 MinOffset = -256;
4854 MaxOffset = 255;
4855 break;
4856 case AArch64::STGPi:
4857 case AArch64::STGPpost:
4858 case AArch64::STGPpre:
4859 Scale = TypeSize::getFixed(16);
4860 Width = TypeSize::getFixed(16);
4861 MinOffset = -64;
4862 MaxOffset = 63;
4863 break;
4864 case AArch64::LD1RB_IMM:
4865 case AArch64::LD1RB_H_IMM:
4866 case AArch64::LD1RB_S_IMM:
4867 case AArch64::LD1RB_D_IMM:
4868 case AArch64::LD1RSB_H_IMM:
4869 case AArch64::LD1RSB_S_IMM:
4870 case AArch64::LD1RSB_D_IMM:
4871 Scale = TypeSize::getFixed(1);
4872 Width = TypeSize::getFixed(1);
4873 MinOffset = 0;
4874 MaxOffset = 63;
4875 break;
4876 case AArch64::LD1RH_IMM:
4877 case AArch64::LD1RH_S_IMM:
4878 case AArch64::LD1RH_D_IMM:
4879 case AArch64::LD1RSH_S_IMM:
4880 case AArch64::LD1RSH_D_IMM:
4881 Scale = TypeSize::getFixed(2);
4882 Width = TypeSize::getFixed(2);
4883 MinOffset = 0;
4884 MaxOffset = 63;
4885 break;
4886 case AArch64::LD1RW_IMM:
4887 case AArch64::LD1RW_D_IMM:
4888 case AArch64::LD1RSW_IMM:
4889 Scale = TypeSize::getFixed(4);
4890 Width = TypeSize::getFixed(4);
4891 MinOffset = 0;
4892 MaxOffset = 63;
4893 break;
4894 case AArch64::LD1RD_IMM:
4895 Scale = TypeSize::getFixed(8);
4896 Width = TypeSize::getFixed(8);
4897 MinOffset = 0;
4898 MaxOffset = 63;
4899 break;
4900 }
4901
4902 return true;
4903}
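// Illustrative note (not in the original source): MinOffset/MaxOffset are in
// units of Scale rather than bytes. For LDPXi, Scale is 8 and the range is
// [-64, 63], i.e. byte offsets from -512 to 504 in steps of 8.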
4904
4905// Scaling factor for unscaled load or store.
4906int AArch64InstrInfo::getMemScale(unsigned Opc) {
4907 switch (Opc) {
4908 default:
4909 llvm_unreachable("Opcode has unknown scale!");
4910 case AArch64::LDRBBui:
4911 case AArch64::LDURBBi:
4912 case AArch64::LDRSBWui:
4913 case AArch64::LDURSBWi:
4914 case AArch64::STRBBui:
4915 case AArch64::STURBBi:
4916 return 1;
4917 case AArch64::LDRHHui:
4918 case AArch64::LDURHHi:
4919 case AArch64::LDRSHWui:
4920 case AArch64::LDURSHWi:
4921 case AArch64::STRHHui:
4922 case AArch64::STURHHi:
4923 return 2;
4924 case AArch64::LDRSui:
4925 case AArch64::LDURSi:
4926 case AArch64::LDRSpre:
4927 case AArch64::LDRSWui:
4928 case AArch64::LDURSWi:
4929 case AArch64::LDRSWpre:
4930 case AArch64::LDRWpre:
4931 case AArch64::LDRWui:
4932 case AArch64::LDURWi:
4933 case AArch64::STRSui:
4934 case AArch64::STURSi:
4935 case AArch64::STRSpre:
4936 case AArch64::STRWui:
4937 case AArch64::STURWi:
4938 case AArch64::STRWpre:
4939 case AArch64::LDPSi:
4940 case AArch64::LDPSWi:
4941 case AArch64::LDPWi:
4942 case AArch64::STPSi:
4943 case AArch64::STPWi:
4944 return 4;
4945 case AArch64::LDRDui:
4946 case AArch64::LDURDi:
4947 case AArch64::LDRDpre:
4948 case AArch64::LDRXui:
4949 case AArch64::LDURXi:
4950 case AArch64::LDRXpre:
4951 case AArch64::STRDui:
4952 case AArch64::STURDi:
4953 case AArch64::STRDpre:
4954 case AArch64::STRXui:
4955 case AArch64::STURXi:
4956 case AArch64::STRXpre:
4957 case AArch64::LDPDi:
4958 case AArch64::LDPXi:
4959 case AArch64::STPDi:
4960 case AArch64::STPXi:
4961 return 8;
4962 case AArch64::LDRQui:
4963 case AArch64::LDURQi:
4964 case AArch64::STRQui:
4965 case AArch64::STURQi:
4966 case AArch64::STRQpre:
4967 case AArch64::LDPQi:
4968 case AArch64::LDRQpre:
4969 case AArch64::STPQi:
4970 case AArch64::STGi:
4971 case AArch64::STZGi:
4972 case AArch64::ST2Gi:
4973 case AArch64::STZ2Gi:
4974 case AArch64::STGPi:
4975 return 16;
4976 }
4977}
4978
4979bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
4980 switch (MI.getOpcode()) {
4981 default:
4982 return false;
4983 case AArch64::LDRWpre:
4984 case AArch64::LDRXpre:
4985 case AArch64::LDRSWpre:
4986 case AArch64::LDRSpre:
4987 case AArch64::LDRDpre:
4988 case AArch64::LDRQpre:
4989 return true;
4990 }
4991}
4992
4993bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
4994 switch (MI.getOpcode()) {
4995 default:
4996 return false;
4997 case AArch64::STRWpre:
4998 case AArch64::STRXpre:
4999 case AArch64::STRSpre:
5000 case AArch64::STRDpre:
5001 case AArch64::STRQpre:
5002 return true;
5003 }
5004}
5005
5006bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
5007 return isPreLd(MI) || isPreSt(MI);
5008}
5009
5010bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
5011 switch (MI.getOpcode()) {
5012 default:
5013 return false;
5014 case AArch64::LDPSi:
5015 case AArch64::LDPSWi:
5016 case AArch64::LDPDi:
5017 case AArch64::LDPQi:
5018 case AArch64::LDPWi:
5019 case AArch64::LDPXi:
5020 case AArch64::STPSi:
5021 case AArch64::STPDi:
5022 case AArch64::STPQi:
5023 case AArch64::STPWi:
5024 case AArch64::STPXi:
5025 case AArch64::STGPi:
5026 return true;
5027 }
5028}
5029
5030const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
5031 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5032 unsigned Idx =
5033 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
5034 : 1;
5035 return MI.getOperand(Idx);
5036}
5037
5038const MachineOperand &
5039AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
5040 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
5041 unsigned Idx =
5042 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
5043 : 2;
5044 return MI.getOperand(Idx);
5045}
5046
5047const MachineOperand &
5048AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
5049 switch (MI.getOpcode()) {
5050 default:
5051 llvm_unreachable("Unexpected opcode");
5052 case AArch64::LDRBroX:
5053 case AArch64::LDRBBroX:
5054 case AArch64::LDRSBXroX:
5055 case AArch64::LDRSBWroX:
5056 case AArch64::LDRHroX:
5057 case AArch64::LDRHHroX:
5058 case AArch64::LDRSHXroX:
5059 case AArch64::LDRSHWroX:
5060 case AArch64::LDRWroX:
5061 case AArch64::LDRSroX:
5062 case AArch64::LDRSWroX:
5063 case AArch64::LDRDroX:
5064 case AArch64::LDRXroX:
5065 case AArch64::LDRQroX:
5066 return MI.getOperand(4);
5067 }
5068}
5069
5070static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
5071 Register Reg) {
5072 if (MI.getParent() == nullptr)
5073 return nullptr;
5074 const MachineFunction *MF = MI.getParent()->getParent();
5075 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
5076}
5077
5078bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
5079 auto IsHFPR = [&](const MachineOperand &Op) {
5080 if (!Op.isReg())
5081 return false;
5082 auto Reg = Op.getReg();
5083 if (Reg.isPhysical())
5084 return AArch64::FPR16RegClass.contains(Reg);
5085 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5086 return TRC == &AArch64::FPR16RegClass ||
5087 TRC == &AArch64::FPR16_loRegClass;
5088 };
5089 return llvm::any_of(MI.operands(), IsHFPR);
5090}
5091
5092bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
5093 auto IsQFPR = [&](const MachineOperand &Op) {
5094 if (!Op.isReg())
5095 return false;
5096 auto Reg = Op.getReg();
5097 if (Reg.isPhysical())
5098 return AArch64::FPR128RegClass.contains(Reg);
5099 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5100 return TRC == &AArch64::FPR128RegClass ||
5101 TRC == &AArch64::FPR128_loRegClass;
5102 };
5103 return llvm::any_of(MI.operands(), IsQFPR);
5104}
5105
5106bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
5107 switch (MI.getOpcode()) {
5108 case AArch64::BRK:
5109 case AArch64::HLT:
5110 case AArch64::PACIASP:
5111 case AArch64::PACIBSP:
5112 // Implicit BTI behavior.
5113 return true;
5114 case AArch64::PAUTH_PROLOGUE:
5115 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
5116 return true;
5117 case AArch64::HINT: {
5118 unsigned Imm = MI.getOperand(0).getImm();
5119 // Explicit BTI instruction.
5120 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
5121 return true;
5122 // PACI(A|B)SP instructions.
5123 if (Imm == 25 || Imm == 27)
5124 return true;
5125 return false;
5126 }
5127 default:
5128 return false;
5129 }
5130}
5131
5132bool AArch64InstrInfo::isFpOrNEON(Register Reg) {
5133 if (Reg == 0)
5134 return false;
5135 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
5136 return AArch64::FPR128RegClass.contains(Reg) ||
5137 AArch64::FPR64RegClass.contains(Reg) ||
5138 AArch64::FPR32RegClass.contains(Reg) ||
5139 AArch64::FPR16RegClass.contains(Reg) ||
5140 AArch64::FPR8RegClass.contains(Reg);
5141}
5142
5143bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
5144 auto IsFPR = [&](const MachineOperand &Op) {
5145 if (!Op.isReg())
5146 return false;
5147 auto Reg = Op.getReg();
5148 if (Reg.isPhysical())
5149 return isFpOrNEON(Reg);
5150
5151 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
5152 return TRC == &AArch64::FPR128RegClass ||
5153 TRC == &AArch64::FPR128_loRegClass ||
5154 TRC == &AArch64::FPR64RegClass ||
5155 TRC == &AArch64::FPR64_loRegClass ||
5156 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
5157 TRC == &AArch64::FPR8RegClass;
5158 };
5159 return llvm::any_of(MI.operands(), IsFPR);
5160}
5161
5162// Scale the unscaled offsets. Returns false if the unscaled offset can't be
5163// scaled.
5164static bool scaleOffset(unsigned Opc, int64_t &Offset) {
5165 int Scale = AArch64InstrInfo::getMemScale(Opc);
5166
5167 // If the byte-offset isn't a multiple of the stride, we can't scale this
5168 // offset.
5169 if (Offset % Scale != 0)
5170 return false;
5171
5172 // Convert the byte-offset used by unscaled into an "element" offset used
5173 // by the scaled pair load/store instructions.
5174 Offset /= Scale;
5175 return true;
5176}
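// Worked example (not in the original source): for STURXi (stride 8) a byte
// offset of 24 scales to 3 and can be considered for pairing, while a byte
// offset of 20 is not a multiple of 8 and is rejected.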
5177
5178static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
5179 if (FirstOpc == SecondOpc)
5180 return true;
5181 // We can also pair sign-ext and zero-ext instructions.
5182 switch (FirstOpc) {
5183 default:
5184 return false;
5185 case AArch64::STRSui:
5186 case AArch64::STURSi:
5187 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
5188 case AArch64::STRDui:
5189 case AArch64::STURDi:
5190 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
5191 case AArch64::STRQui:
5192 case AArch64::STURQi:
5193 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
5194 case AArch64::STRWui:
5195 case AArch64::STURWi:
5196 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
5197 case AArch64::STRXui:
5198 case AArch64::STURXi:
5199 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
5200 case AArch64::LDRSui:
5201 case AArch64::LDURSi:
5202 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
5203 case AArch64::LDRDui:
5204 case AArch64::LDURDi:
5205 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
5206 case AArch64::LDRQui:
5207 case AArch64::LDURQi:
5208 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
5209 case AArch64::LDRWui:
5210 case AArch64::LDURWi:
5211 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
5212 case AArch64::LDRSWui:
5213 case AArch64::LDURSWi:
5214 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
5215 case AArch64::LDRXui:
5216 case AArch64::LDURXi:
5217 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
5218 }
5219 // These instructions can't be paired based on their opcodes.
5220 return false;
5221}
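// Illustrative note (not in the original source): besides identical opcodes, a
// 32-bit load may be clustered with its sign-extending counterpart
// (LDRWui/LDURWi with LDRSWui/LDURSWi). This helper only informs the
// scheduler's clustering decision in shouldClusterMemOps; the actual ldp/stp
// rewrite happens later in the load/store optimizer.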
5222
5223static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
5224 int64_t Offset1, unsigned Opcode1, int FI2,
5225 int64_t Offset2, unsigned Opcode2) {
5226 // Accesses through fixed stack object frame indices may access a different
5227 // fixed stack slot. Check that the object offsets + offsets match.
5228 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
5229 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
5230 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
5231 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
5232 // Convert to scaled object offsets.
5233 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
5234 if (ObjectOffset1 % Scale1 != 0)
5235 return false;
5236 ObjectOffset1 /= Scale1;
5237 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
5238 if (ObjectOffset2 % Scale2 != 0)
5239 return false;
5240 ObjectOffset2 /= Scale2;
5241 ObjectOffset1 += Offset1;
5242 ObjectOffset2 += Offset2;
5243 return ObjectOffset1 + 1 == ObjectOffset2;
5244 }
5245
5246 return FI1 == FI2;
5247}
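// Worked example (not in the original source): two 8-byte fixed stack objects
// at object offsets 0 and 8, each accessed with an instruction offset of 0,
// give scaled offsets 0 and 1 and are treated as adjacent; for non-fixed
// objects the frame indices simply have to be identical.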
5248
5249/// Detect opportunities for ldp/stp formation.
5250///
5251/// Only called for LdSt for which getMemOperandWithOffset returns true.
5252bool AArch64InstrInfo::shouldClusterMemOps(
5253 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
5254 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
5255 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
5256 unsigned NumBytes) const {
5257 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
5258 const MachineOperand &BaseOp1 = *BaseOps1.front();
5259 const MachineOperand &BaseOp2 = *BaseOps2.front();
5260 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
5261 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
5262 if (BaseOp1.getType() != BaseOp2.getType())
5263 return false;
5264
5265 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
5266 "Only base registers and frame indices are supported.");
5267
5268 // Check for both base regs and base FI.
5269 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
5270 return false;
5271
5272 // Only cluster up to a single pair.
5273 if (ClusterSize > 2)
5274 return false;
5275
5276 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
5277 return false;
5278
5279 // Can we pair these instructions based on their opcodes?
5280 unsigned FirstOpc = FirstLdSt.getOpcode();
5281 unsigned SecondOpc = SecondLdSt.getOpcode();
5282 if (!canPairLdStOpc(FirstOpc, SecondOpc))
5283 return false;
5284
5285 // Can't merge volatiles or load/stores that have a hint to avoid pair
5286 // formation, for example.
5287 if (!isCandidateToMergeOrPair(FirstLdSt) ||
5288 !isCandidateToMergeOrPair(SecondLdSt))
5289 return false;
5290
5291 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
5292 int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
5293 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
5294 return false;
5295
5296 int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
5297 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
5298 return false;
5299
5300 // Pairwise instructions have a 7-bit signed offset field.
5301 if (Offset1 > 63 || Offset1 < -64)
5302 return false;
5303
5304 // The caller should already have ordered First/SecondLdSt by offset.
5305 // Note: except for non-equal frame index bases
5306 if (BaseOp1.isFI()) {
5307 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
5308 "Caller should have ordered offsets.");
5309
5310 const MachineFrameInfo &MFI =
5311 FirstLdSt.getParent()->getParent()->getFrameInfo();
5312 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
5313 BaseOp2.getIndex(), Offset2, SecondOpc);
5314 }
5315
5316 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
5317
5318 return Offset1 + 1 == Offset2;
5319}
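// Worked example (not in the original source): `ldr x1, [x0]` and
// `ldr x2, [x0, #8]` are both LDRXui with immediates 0 and 1 on the same base,
// and 0 + 1 == 1, so they are clustered and can later be rewritten as
// `ldp x1, x2, [x0]`.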
5320
5321static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
5322 MCRegister Reg, unsigned SubIdx,
5323 unsigned State,
5324 const TargetRegisterInfo *TRI) {
5325 if (!SubIdx)
5326 return MIB.addReg(Reg, State);
5327
5328 if (Reg.isPhysical())
5329 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
5330 return MIB.addReg(Reg, State, SubIdx);
5331}
5332
5333static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
5334 unsigned NumRegs) {
5335 // We really want the positive remainder mod 32 here, which happens to be
5336 // easily obtainable with a mask.
5337 return ((DestReg - SrcReg) & 0x1f) < NumRegs;
5338}
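// Worked example (not in the original source): copying a two-register tuple
// whose source starts at encoding 31 into a destination starting at encoding 0
// gives (0 - 31) & 0x1f == 1 < 2, so a forward sub-register copy would clobber
// the second source register before it is read; the tuple copy helpers below
// therefore iterate over the sub-registers in reverse in that case.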
5339
5340void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
5341 MachineBasicBlock::iterator I,
5342 const DebugLoc &DL, MCRegister DestReg,
5343 MCRegister SrcReg, bool KillSrc,
5344 unsigned Opcode,
5345 ArrayRef<unsigned> Indices) const {
5346 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
5347 const TargetRegisterInfo *TRI = &getRegisterInfo();
5348 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
5349 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
5350 unsigned NumRegs = Indices.size();
5351
5352 int SubReg = 0, End = NumRegs, Incr = 1;
5353 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
5354 SubReg = NumRegs - 1;
5355 End = -1;
5356 Incr = -1;
5357 }
5358
5359 for (; SubReg != End; SubReg += Incr) {
5360 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
5361 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
5362 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
5363 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
5364 }
5365}
5366
5367void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
5368 MachineBasicBlock::iterator I,
5369 const DebugLoc &DL, MCRegister DestReg,
5370 MCRegister SrcReg, bool KillSrc,
5371 unsigned Opcode, unsigned ZeroReg,
5372 llvm::ArrayRef<unsigned> Indices) const {
5373 const TargetRegisterInfo *TRI = &getRegisterInfo();
5374 unsigned NumRegs = Indices.size();
5375
5376#ifndef NDEBUG
5377 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
5378 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
5379 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
5380 "GPR reg sequences should not be able to overlap");
5381#endif
5382
5383 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
5384 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
5385 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
5386 MIB.addReg(ZeroReg);
5387 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
5388 MIB.addImm(0);
5389 }
5390}
5391
5392void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
5393 MachineBasicBlock::iterator I,
5394 const DebugLoc &DL, Register DestReg,
5395 Register SrcReg, bool KillSrc,
5396 bool RenamableDest,
5397 bool RenamableSrc) const {
5398 ++NumCopyInstrs;
5399 if (AArch64::GPR32spRegClass.contains(DestReg) &&
5400 AArch64::GPR32spRegClass.contains(SrcReg)) {
5401 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
5402 // If either operand is WSP, expand to ADD #0.
5403 if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5404 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5405 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
5406 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5407 &AArch64::GPR64spRegClass);
5408 MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
5409 &AArch64::GPR64spRegClass);
5410 // This instruction is reading and writing X registers. This may upset
5411 // the register scavenger and machine verifier, so we need to indicate
5412 // that we are reading an undefined value from SrcRegX, but a proper
5413 // value from SrcReg.
5414 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
5415 .addReg(SrcRegX, RegState::Undef)
5416 .addImm(0)
5417 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
5418 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5419 ++NumZCRegMoveInstrsGPR;
5420 } else {
5421 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
5422 .addReg(SrcReg, getKillRegState(KillSrc))
5423 .addImm(0)
5424 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5425 if (Subtarget.hasZeroCycleRegMoveGPR32())
5426 ++NumZCRegMoveInstrsGPR;
5427 }
5428 } else if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5429 !Subtarget.hasZeroCycleRegMoveGPR32()) {
5430 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
5431 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5432 &AArch64::GPR64spRegClass);
5433 assert(DestRegX.isValid() && "Destination super-reg not valid");
5434 MCRegister SrcRegX = RI.getMatchingSuperReg(SrcReg, AArch64::sub_32,
5435 &AArch64::GPR64spRegClass);
5436 assert(SrcRegX.isValid() && "Source super-reg not valid");
5437 // This instruction is reading and writing X registers. This may upset
5438 // the register scavenger and machine verifier, so we need to indicate
5439 // that we are reading an undefined value from SrcRegX, but a proper
5440 // value from SrcReg.
5441 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
5442 .addReg(AArch64::XZR)
5443 .addReg(SrcRegX, RegState::Undef)
5444 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5445 ++NumZCRegMoveInstrsGPR;
5446 } else {
5447 // Otherwise, expand to ORR WZR.
5448 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
5449 .addReg(AArch64::WZR)
5450 .addReg(SrcReg, getKillRegState(KillSrc));
5451 if (Subtarget.hasZeroCycleRegMoveGPR32())
5452 ++NumZCRegMoveInstrsGPR;
5453 }
5454 return;
5455 }
5456
5457 // GPR32 zeroing
5458 if (AArch64::GPR32spRegClass.contains(DestReg) && SrcReg == AArch64::WZR) {
5459 if (Subtarget.hasZeroCycleZeroingGPR64() &&
5460 !Subtarget.hasZeroCycleZeroingGPR32()) {
5461 MCRegister DestRegX = RI.getMatchingSuperReg(DestReg, AArch64::sub_32,
5462 &AArch64::GPR64spRegClass);
5463 assert(DestRegX.isValid() && "Destination super-reg not valid");
5464 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestRegX)
5465 .addImm(0)
5466 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5467 ++NumZCZeroingInstrsGPR;
5468 } else if (Subtarget.hasZeroCycleZeroingGPR32()) {
5469 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
5470 .addImm(0)
5471 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5472 ++NumZCZeroingInstrsGPR;
5473 } else {
5474 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
5475 .addReg(AArch64::WZR)
5476 .addReg(AArch64::WZR);
5477 }
5478 return;
5479 }
5480
5481 if (AArch64::GPR64spRegClass.contains(DestReg) &&
5482 AArch64::GPR64spRegClass.contains(SrcReg)) {
5483 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
5484 // If either operand is SP, expand to ADD #0.
5485 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
5486 .addReg(SrcReg, getKillRegState(KillSrc))
5487 .addImm(0)
5488 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5489 if (Subtarget.hasZeroCycleRegMoveGPR64())
5490 ++NumZCRegMoveInstrsGPR;
5491 } else {
5492 // Otherwise, expand to ORR XZR.
5493 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
5494 .addReg(AArch64::XZR)
5495 .addReg(SrcReg, getKillRegState(KillSrc));
5496 if (Subtarget.hasZeroCycleRegMoveGPR64())
5497 ++NumZCRegMoveInstrsGPR;
5498 }
5499 return;
5500 }
5501
5502 // GPR64 zeroing
5503 if (AArch64::GPR64spRegClass.contains(DestReg) && SrcReg == AArch64::XZR) {
5504 if (Subtarget.hasZeroCycleZeroingGPR64()) {
5505 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
5506 .addImm(0)
5507 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5508 ++NumZCZeroingInstrsGPR;
5509 } else {
5510 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
5511 .addReg(AArch64::XZR)
5512 .addReg(AArch64::XZR);
5513 }
5514 return;
5515 }
5516
5517 // Copy a Predicate register by ORRing with itself.
5518 if (AArch64::PPRRegClass.contains(DestReg) &&
5519 AArch64::PPRRegClass.contains(SrcReg)) {
5520 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5521 "Unexpected SVE register.");
5522 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
5523 .addReg(SrcReg) // Pg
5524 .addReg(SrcReg)
5525 .addReg(SrcReg, getKillRegState(KillSrc));
5526 return;
5527 }
5528
5529 // Copy a predicate-as-counter register by ORRing with itself as if it
5530 // were a regular predicate (mask) register.
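// Illustrative example (hypothetical registers): copying PN3 into P5 maps
// PN3 onto its aliased mask register P3 and emits
// "orr p5.b, p3/z, p3.b, p3.b"; when the destination is a PN register an
// implicit-def of it is added so its liveness is still tracked.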
5531 bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
5532 bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
5533 if (DestIsPNR || SrcIsPNR) {
5534 auto ToPPR = [](MCRegister R) -> MCRegister {
5535 return (R - AArch64::PN0) + AArch64::P0;
5536 };
5537 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg.asMCReg();
5538 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg.asMCReg();
5539
5540 if (PPRSrcReg != PPRDestReg) {
5541 auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
5542 .addReg(PPRSrcReg) // Pg
5543 .addReg(PPRSrcReg)
5544 .addReg(PPRSrcReg, getKillRegState(KillSrc));
5545 if (DestIsPNR)
5546 NewMI.addDef(DestReg, RegState::Implicit);
5547 }
5548 return;
5549 }
5550
5551 // Copy a Z register by ORRing with itself.
5552 if (AArch64::ZPRRegClass.contains(DestReg) &&
5553 AArch64::ZPRRegClass.contains(SrcReg)) {
5554 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5555 "Unexpected SVE register.");
5556 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
5557 .addReg(SrcReg)
5558 .addReg(SrcReg, getKillRegState(KillSrc));
5559 return;
5560 }
5561
5562 // Copy a Z register pair by copying the individual sub-registers.
5563 if ((AArch64::ZPR2RegClass.contains(DestReg) ||
5564 AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
5565 (AArch64::ZPR2RegClass.contains(SrcReg) ||
5566 AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
5567 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5568 "Unexpected SVE register.");
5569 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
5570 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5571 Indices);
5572 return;
5573 }
5574
5575 // Copy a Z register triple by copying the individual sub-registers.
5576 if (AArch64::ZPR3RegClass.contains(DestReg) &&
5577 AArch64::ZPR3RegClass.contains(SrcReg)) {
5578 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5579 "Unexpected SVE register.");
5580 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5581 AArch64::zsub2};
5582 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5583 Indices);
5584 return;
5585 }
5586
5587 // Copy a Z register quad by copying the individual sub-registers.
5588 if ((AArch64::ZPR4RegClass.contains(DestReg) ||
5589 AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
5590 (AArch64::ZPR4RegClass.contains(SrcReg) ||
5591 AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
5592 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5593 "Unexpected SVE register.");
5594 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5595 AArch64::zsub2, AArch64::zsub3};
5596 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5597 Indices);
5598 return;
5599 }
5600
5601 // Copy a DDDD register quad by copying the individual sub-registers.
5602 if (AArch64::DDDDRegClass.contains(DestReg) &&
5603 AArch64::DDDDRegClass.contains(SrcReg)) {
5604 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5605 AArch64::dsub2, AArch64::dsub3};
5606 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5607 Indices);
5608 return;
5609 }
5610
5611 // Copy a DDD register triple by copying the individual sub-registers.
5612 if (AArch64::DDDRegClass.contains(DestReg) &&
5613 AArch64::DDDRegClass.contains(SrcReg)) {
5614 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5615 AArch64::dsub2};
5616 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5617 Indices);
5618 return;
5619 }
5620
5621 // Copy a DD register pair by copying the individual sub-registers.
5622 if (AArch64::DDRegClass.contains(DestReg) &&
5623 AArch64::DDRegClass.contains(SrcReg)) {
5624 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
5625 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5626 Indices);
5627 return;
5628 }
5629
5630 // Copy a QQQQ register quad by copying the individual sub-registers.
5631 if (AArch64::QQQQRegClass.contains(DestReg) &&
5632 AArch64::QQQQRegClass.contains(SrcReg)) {
5633 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5634 AArch64::qsub2, AArch64::qsub3};
5635 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5636 Indices);
5637 return;
5638 }
5639
5640 // Copy a QQQ register triple by copying the individual sub-registers.
5641 if (AArch64::QQQRegClass.contains(DestReg) &&
5642 AArch64::QQQRegClass.contains(SrcReg)) {
5643 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5644 AArch64::qsub2};
5645 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5646 Indices);
5647 return;
5648 }
5649
5650 // Copy a QQ register pair by copying the individual sub-registers.
5651 if (AArch64::QQRegClass.contains(DestReg) &&
5652 AArch64::QQRegClass.contains(SrcReg)) {
5653 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
5654 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5655 Indices);
5656 return;
5657 }
5658
5659 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
5660 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
5661 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
5662 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
5663 AArch64::XZR, Indices);
5664 return;
5665 }
5666
5667 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
5668 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
5669 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
5670 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
5671 AArch64::WZR, Indices);
5672 return;
5673 }
5674
5675 if (AArch64::FPR128RegClass.contains(DestReg) &&
5676 AArch64::FPR128RegClass.contains(SrcReg)) {
5677 if (Subtarget.isSVEorStreamingSVEAvailable() &&
5678 !Subtarget.isNeonAvailable()) {
5679 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
5680 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
5681 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
5682 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
5683 } else if (Subtarget.isNeonAvailable()) {
5684 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
5685 .addReg(SrcReg)
5686 .addReg(SrcReg, getKillRegState(KillSrc));
5687 if (Subtarget.hasZeroCycleRegMoveFPR128())
5688 ++NumZCRegMoveInstrsFPR;
5689 } else {
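// Neither SVE nor NEON is available here, so bounce the 128-bit value
// through the stack: the pre-indexed store decrements SP by 16 and the
// post-indexed load restores it, leaving SP unchanged overall.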
5690 BuildMI(MBB, I, DL, get(AArch64::STRQpre))
5691 .addReg(AArch64::SP, RegState::Define)
5692 .addReg(SrcReg, getKillRegState(KillSrc))
5693 .addReg(AArch64::SP)
5694 .addImm(-16);
5695 BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
5696 .addReg(AArch64::SP, RegState::Define)
5697 .addReg(DestReg, RegState::Define)
5698 .addReg(AArch64::SP)
5699 .addImm(16);
5700 }
5701 return;
5702 }
5703
5704 if (AArch64::FPR64RegClass.contains(DestReg) &&
5705 AArch64::FPR64RegClass.contains(SrcReg)) {
5706 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5707 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5708 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) {
5709 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
5710 &AArch64::FPR128RegClass);
5711 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
5712 &AArch64::FPR128RegClass);
5713 // This instruction is reading and writing Q registers. This may upset
5714 // the register scavenger and machine verifier, so we need to indicate
5715 // that we are reading an undefined value from SrcRegQ, but a proper
5716 // value from SrcReg.
5717 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
5718 .addReg(SrcRegQ, RegState::Undef)
5719 .addReg(SrcRegQ, RegState::Undef)
5720 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5721 ++NumZCRegMoveInstrsFPR;
5722 } else {
5723 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
5724 .addReg(SrcReg, getKillRegState(KillSrc));
5725 if (Subtarget.hasZeroCycleRegMoveFPR64())
5726 ++NumZCRegMoveInstrsFPR;
5727 }
5728 return;
5729 }
5730
5731 if (AArch64::FPR32RegClass.contains(DestReg) &&
5732 AArch64::FPR32RegClass.contains(SrcReg)) {
5733 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5734 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5735 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) {
5736 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
5737 &AArch64::FPR128RegClass);
5738 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
5739 &AArch64::FPR128RegClass);
5740 // This instruction is reading and writing Q registers. This may upset
5741 // the register scavenger and machine verifier, so we need to indicate
5742 // that we are reading an undefined value from SrcRegQ, but a proper
5743 // value from SrcReg.
5744 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
5745 .addReg(SrcRegQ, RegState::Undef)
5746 .addReg(SrcRegQ, RegState::Undef)
5747 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5748 ++NumZCRegMoveInstrsFPR;
5749 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5750 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5751 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
5752 &AArch64::FPR64RegClass);
5753 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
5754 &AArch64::FPR64RegClass);
5755 // This instruction is reading and writing D registers. This may upset
5756 // the register scavenger and machine verifier, so we need to indicate
5757 // that we are reading an undefined value from SrcRegD, but a proper
5758 // value from SrcReg.
5759 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5760 .addReg(SrcRegD, RegState::Undef)
5761 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5762 ++NumZCRegMoveInstrsFPR;
5763 } else {
5764 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5765 .addReg(SrcReg, getKillRegState(KillSrc));
5766 if (Subtarget.hasZeroCycleRegMoveFPR32())
5767 ++NumZCRegMoveInstrsFPR;
5768 }
5769 return;
5770 }
5771
5772 if (AArch64::FPR16RegClass.contains(DestReg) &&
5773 AArch64::FPR16RegClass.contains(SrcReg)) {
5774 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5775 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5776 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) {
5777 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
5778 &AArch64::FPR128RegClass);
5779 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
5780 &AArch64::FPR128RegClass);
5781 // This instruction is reading and writing Q registers. This may upset
5782 // the register scavenger and machine verifier, so we need to indicate
5783 // that we are reading an undefined value from SrcRegQ, but a proper
5784 // value from SrcReg.
5785 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
5786 .addReg(SrcRegQ, RegState::Undef)
5787 .addReg(SrcRegQ, RegState::Undef)
5788 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5789 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5790 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5791 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
5792 &AArch64::FPR64RegClass);
5793 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
5794 &AArch64::FPR64RegClass);
5795 // This instruction is reading and writing D registers. This may upset
5796 // the register scavenger and machine verifier, so we need to indicate
5797 // that we are reading an undefined value from SrcRegD, but a proper
5798 // value from SrcReg.
5799 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5800 .addReg(SrcRegD, RegState::Undef)
5801 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5802 } else {
5803 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
5804 &AArch64::FPR32RegClass);
5805 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
5806 &AArch64::FPR32RegClass);
5807 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5808 .addReg(SrcReg, getKillRegState(KillSrc));
5809 }
5810 return;
5811 }
5812
5813 if (AArch64::FPR8RegClass.contains(DestReg) &&
5814 AArch64::FPR8RegClass.contains(SrcReg)) {
5815 if (Subtarget.hasZeroCycleRegMoveFPR128() &&
5816 !Subtarget.hasZeroCycleRegMoveFPR64() &&
5817 !Subtarget.hasZeroCycleRegMoveFPR32() && Subtarget.isNeonAvailable()) {
5818 MCRegister DestRegQ = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
5819 &AArch64::FPR128RegClass);
5820 MCRegister SrcRegQ = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
5821 &AArch64::FPR128RegClass);
5822 // This instruction is reading and writing Q registers. This may upset
5823 // the register scavenger and machine verifier, so we need to indicate
5824 // that we are reading an undefined value from SrcRegQ, but a proper
5825 // value from SrcReg.
5826 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestRegQ)
5827 .addReg(SrcRegQ, RegState::Undef)
5828 .addReg(SrcRegQ, RegState::Undef)
5829 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5830 } else if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5831 !Subtarget.hasZeroCycleRegMoveFPR32()) {
5832 MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
5833 &AArch64::FPR64RegClass);
5834 MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
5835 &AArch64::FPR64RegClass);
5836 // This instruction is reading and writing D registers. This may upset
5837 // the register scavenger and machine verifier, so we need to indicate
5838 // that we are reading an undefined value from SrcRegD, but a proper
5839 // value from SrcReg.
5840 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5841 .addReg(SrcRegD, RegState::Undef)
5842 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5843 } else {
5844 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
5845 &AArch64::FPR32RegClass);
5846 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
5847 &AArch64::FPR32RegClass);
5848 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5849 .addReg(SrcReg, getKillRegState(KillSrc));
5850 }
5851 return;
5852 }
5853
5854 // Copies between GPR64 and FPR64.
5855 if (AArch64::FPR64RegClass.contains(DestReg) &&
5856 AArch64::GPR64RegClass.contains(SrcReg)) {
5857 if (AArch64::XZR == SrcReg) {
5858 BuildMI(MBB, I, DL, get(AArch64::FMOVD0), DestReg);
5859 } else {
5860 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
5861 .addReg(SrcReg, getKillRegState(KillSrc));
5862 }
5863 return;
5864 }
5865 if (AArch64::GPR64RegClass.contains(DestReg) &&
5866 AArch64::FPR64RegClass.contains(SrcReg)) {
5867 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
5868 .addReg(SrcReg, getKillRegState(KillSrc));
5869 return;
5870 }
5871 // Copies between GPR32 and FPR32.
5872 if (AArch64::FPR32RegClass.contains(DestReg) &&
5873 AArch64::GPR32RegClass.contains(SrcReg)) {
5874 if (AArch64::WZR == SrcReg) {
5875 BuildMI(MBB, I, DL, get(AArch64::FMOVS0), DestReg);
5876 } else {
5877 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
5878 .addReg(SrcReg, getKillRegState(KillSrc));
5879 }
5880 return;
5881 }
5882 if (AArch64::GPR32RegClass.contains(DestReg) &&
5883 AArch64::FPR32RegClass.contains(SrcReg)) {
5884 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
5885 .addReg(SrcReg, getKillRegState(KillSrc));
5886 return;
5887 }
5888
5889 if (DestReg == AArch64::NZCV) {
5890 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
5891 BuildMI(MBB, I, DL, get(AArch64::MSR))
5892 .addImm(AArch64SysReg::NZCV)
5893 .addReg(SrcReg, getKillRegState(KillSrc))
5894 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
5895 return;
5896 }
5897
5898 if (SrcReg == AArch64::NZCV) {
5899 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
5900 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
5901 .addImm(AArch64SysReg::NZCV)
5902 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
5903 return;
5904 }
5905
5906#ifndef NDEBUG
5907 errs() << RI.getRegAsmName(DestReg) << " = COPY " << RI.getRegAsmName(SrcReg)
5908 << "\n";
5909#endif
5910 llvm_unreachable("unimplemented reg-to-reg copy");
5911}
5912
5915 MachineBasicBlock::iterator InsertBefore,
5916 const MCInstrDesc &MCID,
5917 Register SrcReg, bool IsKill,
5918 unsigned SubIdx0, unsigned SubIdx1, int FI,
5919 MachineMemOperand *MMO) {
5920 Register SrcReg0 = SrcReg;
5921 Register SrcReg1 = SrcReg;
5922 if (SrcReg.isPhysical()) {
5923 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
5924 SubIdx0 = 0;
5925 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
5926 SubIdx1 = 0;
5927 }
5928 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
5929 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
5930 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
5931 .addFrameIndex(FI)
5932 .addImm(0)
5933 .addMemOperand(MMO);
5934}
5935
5938 Register SrcReg, bool isKill, int FI,
5939 const TargetRegisterClass *RC,
5940 Register VReg,
5941 MachineInstr::MIFlag Flags) const {
5942 MachineFunction &MF = *MBB.getParent();
5943 MachineFrameInfo &MFI = MF.getFrameInfo();
5944
5946 MachineMemOperand *MMO =
5948 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
5949 unsigned Opc = 0;
5950 bool Offset = true;
5952 unsigned StackID = TargetStackID::Default;
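// The switch below picks a store opcode from the spill size of RC; for
// example, a 16-byte FPR128 spill uses STRQui, a scalable ZPR spill uses
// STR_ZXI, and W/X sequential register pairs are split into a single STP
// of their sub-registers.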
5953 switch (RI.getSpillSize(*RC)) {
5954 case 1:
5955 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
5956 Opc = AArch64::STRBui;
5957 break;
5958 case 2: {
5959 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
5960 Opc = AArch64::STRHui;
5961 else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
5962 AArch64::PPRRegClass.hasSubClassEq(RC)) {
5963 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5964 "Unexpected register store without SVE store instructions");
5965 Opc = AArch64::STR_PXI;
5967 }
5968 break;
5969 }
5970 case 4:
5971 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
5972 Opc = AArch64::STRWui;
5973 if (SrcReg.isVirtual())
5974 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
5975 else
5976 assert(SrcReg != AArch64::WSP);
5977 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
5978 Opc = AArch64::STRSui;
5979 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
5980 Opc = AArch64::STR_PPXI;
5982 }
5983 break;
5984 case 8:
5985 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
5986 Opc = AArch64::STRXui;
5987 if (SrcReg.isVirtual())
5988 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
5989 else
5990 assert(SrcReg != AArch64::SP);
5991 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
5992 Opc = AArch64::STRDui;
5993 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
5995 get(AArch64::STPWi), SrcReg, isKill,
5996 AArch64::sube32, AArch64::subo32, FI, MMO);
5997 return;
5998 }
5999 break;
6000 case 16:
6001 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6002 Opc = AArch64::STRQui;
6003 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6004 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6005 Opc = AArch64::ST1Twov1d;
6006 Offset = false;
6007 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
6009 get(AArch64::STPXi), SrcReg, isKill,
6010 AArch64::sube64, AArch64::subo64, FI, MMO);
6011 return;
6012 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6013 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6014 "Unexpected register store without SVE store instructions");
6015 Opc = AArch64::STR_ZXI;
6017 }
6018 break;
6019 case 24:
6020 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6021 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6022 Opc = AArch64::ST1Threev1d;
6023 Offset = false;
6024 }
6025 break;
6026 case 32:
6027 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6028 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6029 Opc = AArch64::ST1Fourv1d;
6030 Offset = false;
6031 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6032 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6033 Opc = AArch64::ST1Twov2d;
6034 Offset = false;
6035 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6036 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6037 "Unexpected register store without SVE store instructions");
6038 Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
6040 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6041 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6042 "Unexpected register store without SVE store instructions");
6043 Opc = AArch64::STR_ZZXI;
6045 }
6046 break;
6047 case 48:
6048 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6049 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6050 Opc = AArch64::ST1Threev2d;
6051 Offset = false;
6052 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6053 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6054 "Unexpected register store without SVE store instructions");
6055 Opc = AArch64::STR_ZZZXI;
6057 }
6058 break;
6059 case 64:
6060 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6061 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
6062 Opc = AArch64::ST1Fourv2d;
6063 Offset = false;
6064 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6065 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6066 "Unexpected register store without SVE store instructions");
6067 Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
6069 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6070 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6071 "Unexpected register store without SVE store instructions");
6072 Opc = AArch64::STR_ZZZZXI;
6074 }
6075 break;
6076 }
6077 assert(Opc && "Unknown register class");
6078 MFI.setStackID(FI, StackID);
6079
6081 .addReg(SrcReg, getKillRegState(isKill))
6082 .addFrameIndex(FI);
6083
6084 if (Offset)
6085 MI.addImm(0);
6086 if (PNRReg.isValid())
6087 MI.addDef(PNRReg, RegState::Implicit);
6088 MI.addMemOperand(MMO);
6089}
6090
6093 MachineBasicBlock::iterator InsertBefore,
6094 const MCInstrDesc &MCID,
6095 Register DestReg, unsigned SubIdx0,
6096 unsigned SubIdx1, int FI,
6097 MachineMemOperand *MMO) {
6098 Register DestReg0 = DestReg;
6099 Register DestReg1 = DestReg;
6100 bool IsUndef = true;
6101 if (DestReg.isPhysical()) {
6102 DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
6103 SubIdx0 = 0;
6104 DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
6105 SubIdx1 = 0;
6106 IsUndef = false;
6107 }
6108 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
6109 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
6110 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
6111 .addFrameIndex(FI)
6112 .addImm(0)
6113 .addMemOperand(MMO);
6114}
6115
6118 Register DestReg, int FI,
6119 const TargetRegisterClass *RC,
6120 Register VReg,
6121 MachineInstr::MIFlag Flags) const {
6122 MachineFunction &MF = *MBB.getParent();
6123 MachineFrameInfo &MFI = MF.getFrameInfo();
6125 MachineMemOperand *MMO =
6127 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
6128
6129 unsigned Opc = 0;
6130 bool Offset = true;
6131 unsigned StackID = TargetStackID::Default;
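// Mirror image of the store path above: the spill size of RC selects the
// reload opcode (e.g. LDRQui for FPR128, LDR_ZXI for a scalable ZPR, and
// an LDP of the sub-registers for W/X sequential register pairs).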
6133 switch (TRI.getSpillSize(*RC)) {
6134 case 1:
6135 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
6136 Opc = AArch64::LDRBui;
6137 break;
6138 case 2: {
6139 bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
6140 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
6141 Opc = AArch64::LDRHui;
6142 else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
6143 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6144 "Unexpected register load without SVE load instructions");
6145 if (IsPNR)
6146 PNRReg = DestReg;
6147 Opc = AArch64::LDR_PXI;
6149 }
6150 break;
6151 }
6152 case 4:
6153 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
6154 Opc = AArch64::LDRWui;
6155 if (DestReg.isVirtual())
6156 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
6157 else
6158 assert(DestReg != AArch64::WSP);
6159 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6160 Opc = AArch64::LDRSui;
6161 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
6162 Opc = AArch64::LDR_PPXI;
6164 }
6165 break;
6166 case 8:
6167 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
6168 Opc = AArch64::LDRXui;
6169 if (DestReg.isVirtual())
6170 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
6171 else
6172 assert(DestReg != AArch64::SP);
6173 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
6174 Opc = AArch64::LDRDui;
6175 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
6177 get(AArch64::LDPWi), DestReg, AArch64::sube32,
6178 AArch64::subo32, FI, MMO);
6179 return;
6180 }
6181 break;
6182 case 16:
6183 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
6184 Opc = AArch64::LDRQui;
6185 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
6186 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6187 Opc = AArch64::LD1Twov1d;
6188 Offset = false;
6189 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
6191 get(AArch64::LDPXi), DestReg, AArch64::sube64,
6192 AArch64::subo64, FI, MMO);
6193 return;
6194 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
6195 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6196 "Unexpected register load without SVE load instructions");
6197 Opc = AArch64::LDR_ZXI;
6199 }
6200 break;
6201 case 24:
6202 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
6203 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6204 Opc = AArch64::LD1Threev1d;
6205 Offset = false;
6206 }
6207 break;
6208 case 32:
6209 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
6210 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6211 Opc = AArch64::LD1Fourv1d;
6212 Offset = false;
6213 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
6214 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6215 Opc = AArch64::LD1Twov2d;
6216 Offset = false;
6217 } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6218 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6219 "Unexpected register load without SVE load instructions");
6220 Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
6222 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
6223 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6224 "Unexpected register load without SVE load instructions");
6225 Opc = AArch64::LDR_ZZXI;
6227 }
6228 break;
6229 case 48:
6230 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
6231 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6232 Opc = AArch64::LD1Threev2d;
6233 Offset = false;
6234 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
6235 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6236 "Unexpected register load without SVE load instructions");
6237 Opc = AArch64::LDR_ZZZXI;
6239 }
6240 break;
6241 case 64:
6242 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
6243 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
6244 Opc = AArch64::LD1Fourv2d;
6245 Offset = false;
6246 } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
6247 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6248 "Unexpected register load without SVE load instructions");
6249 Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
6251 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
6252 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
6253 "Unexpected register load without SVE load instructions");
6254 Opc = AArch64::LDR_ZZZZXI;
6256 }
6257 break;
6258 }
6259
6260 assert(Opc && "Unknown register class");
6261 MFI.setStackID(FI, StackID);
6262
6264 .addReg(DestReg, getDefRegState(true))
6265 .addFrameIndex(FI);
6266 if (Offset)
6267 MI.addImm(0);
6268 if (PNRReg.isValid() && !PNRReg.isVirtual())
6269 MI.addDef(PNRReg, RegState::Implicit);
6270 MI.addMemOperand(MMO);
6271}
6272
6274 const MachineInstr &UseMI,
6275 const TargetRegisterInfo *TRI) {
6276 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
6277 UseMI.getIterator()),
6278 [TRI](const MachineInstr &I) {
6279 return I.modifiesRegister(AArch64::NZCV, TRI) ||
6280 I.readsRegister(AArch64::NZCV, TRI);
6281 });
6282}
6283
6284void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6285 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
6286 // The smallest scalable elements supported by scaled SVE addressing
6287 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6288 // byte offset must always be a multiple of 2.
6289 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6290
6291 // VGSized offsets are divided by '2', because the VG register is the
6292 // number of 64bit granules as opposed to 128bit vector chunks,
6293 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
6294 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
6295 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
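// Worked example: a StackOffset of 32 fixed + 16 scalable bytes decomposes
// into ByteSized = 32 and VGSized = 16 / 2 = 8, i.e. 32 bytes plus 8 * VG.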
6296 ByteSized = Offset.getFixed();
6297 VGSized = Offset.getScalable() / 2;
6298}
6299
6300/// Returns the offset in parts to which this frame offset can be
6301/// decomposed for the purpose of describing a frame offset.
6302/// For non-scalable offsets this is simply its byte size.
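/// For example (illustrative values): 16 fixed + 144 scalable bytes gives
/// NumBytes = 16 and 72 predicate vectors; since 72 is a multiple of 8 the
/// predicate part is folded into 9 data vectors (a single ADDVL) with no
/// predicate vectors left over.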
6303void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6304 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
6305 int64_t &NumDataVectors) {
6306 // The smallest scalable elements supported by scaled SVE addressing
6307 // modes are predicates, which are 2 scalable bytes in size. So the scalable
6308 // byte offset must always be a multiple of 2.
6309 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
6310
6311 NumBytes = Offset.getFixed();
6312 NumDataVectors = 0;
6313 NumPredicateVectors = Offset.getScalable() / 2;
6314 // This method is used to get the offsets to adjust the frame offset.
6315 // If the function requires ADDPL to be used and needs more than two ADDPL
6316 // instructions, part of the offset is folded into NumDataVectors so that it
6317 // uses ADDVL for part of it, reducing the number of ADDPL instructions.
6318 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
6319 NumPredicateVectors > 62) {
6320 NumDataVectors = NumPredicateVectors / 8;
6321 NumPredicateVectors -= NumDataVectors * 8;
6322 }
6323}
6324
6325// Convenience function to create a DWARF expression for: Constant `Operation`.
6326 // This helper emits compact sequences for common cases. For example, for `-15
6327// DW_OP_plus`, this helper would create DW_OP_lit15 DW_OP_minus.
6330 if (Operation == dwarf::DW_OP_plus && Constant < 0 && -Constant <= 31) {
6331 // -Constant (1 to 31)
6332 Expr.push_back(dwarf::DW_OP_lit0 - Constant);
6333 Operation = dwarf::DW_OP_minus;
6334 } else if (Constant >= 0 && Constant <= 31) {
6335 // Literal value 0 to 31
6336 Expr.push_back(dwarf::DW_OP_lit0 + Constant);
6337 } else {
6338 // Signed constant
6339 Expr.push_back(dwarf::DW_OP_consts);
6341 }
6342 return Expr.push_back(Operation);
6343}
6344
6345// Convenience function to create a DWARF expression for a register.
6346static void appendReadRegExpr(SmallVectorImpl<char> &Expr, unsigned RegNum) {
6347 Expr.push_back((char)dwarf::DW_OP_bregx);
6349 Expr.push_back(0);
6350}
6351
6352// Convenience function to create a DWARF expression for loading a register from
6353// a CFA offset.
6355 int64_t OffsetFromDefCFA) {
6356 // This assumes the top of the DWARF stack contains the CFA.
6357 Expr.push_back(dwarf::DW_OP_dup);
6358 // Add the offset to the register.
6359 appendConstantExpr(Expr, OffsetFromDefCFA, dwarf::DW_OP_plus);
6360 // Dereference the address (loads a 64-bit value).
6361 Expr.push_back(dwarf::DW_OP_deref);
6362}
6363
6364// Convenience function to create a comment for
6365// (+/-) NumBytes (* RegScale)?
6366static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment,
6367 StringRef RegScale = {}) {
6368 if (NumBytes) {
6369 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
6370 if (!RegScale.empty())
6371 Comment << ' ' << RegScale;
6372 }
6373}
6374
6375// Creates an MCCFIInstruction:
6376// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
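// For example, assuming the CFA is SP + 16 + 8 * VG, the expression bytes
// are roughly: DW_OP_breg31 16, DW_OP_bregx <VG> 0, DW_OP_lit8, DW_OP_mul,
// DW_OP_plus, wrapped in DW_CFA_def_cfa_expression with its ULEB128 length.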
6378 unsigned Reg,
6379 const StackOffset &Offset) {
6380 int64_t NumBytes, NumVGScaledBytes;
6381 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
6382 NumVGScaledBytes);
6383 std::string CommentBuffer;
6384 llvm::raw_string_ostream Comment(CommentBuffer);
6385
6386 if (Reg == AArch64::SP)
6387 Comment << "sp";
6388 else if (Reg == AArch64::FP)
6389 Comment << "fp";
6390 else
6391 Comment << printReg(Reg, &TRI);
6392
6393 // Build up the expression (Reg + NumBytes + VG * NumVGScaledBytes)
6394 SmallString<64> Expr;
6395 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6396 assert(DwarfReg <= 31 && "DwarfReg out of bounds (0..31)");
6397 // Reg + NumBytes
6398 Expr.push_back(dwarf::DW_OP_breg0 + DwarfReg);
6399 appendLEB128<LEB128Sign::Signed>(Expr, NumBytes);
6400 appendOffsetComment(NumBytes, Comment);
6401 if (NumVGScaledBytes) {
6402 // + VG * NumVGScaledBytes
6403 appendOffsetComment(NumVGScaledBytes, Comment, "* VG");
6404 appendReadRegExpr(Expr, TRI.getDwarfRegNum(AArch64::VG, true));
6405 appendConstantExpr(Expr, NumVGScaledBytes, dwarf::DW_OP_mul);
6406 Expr.push_back(dwarf::DW_OP_plus);
6407 }
6408
6409 // Wrap this into DW_CFA_def_cfa.
6410 SmallString<64> DefCfaExpr;
6411 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
6412 appendLEB128<LEB128Sign::Unsigned>(DefCfaExpr, Expr.size());
6413 DefCfaExpr.append(Expr.str());
6414 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
6415 Comment.str());
6416}
6417
6419 unsigned FrameReg, unsigned Reg,
6420 const StackOffset &Offset,
6421 bool LastAdjustmentWasScalable) {
6422 if (Offset.getScalable())
6423 return createDefCFAExpression(TRI, Reg, Offset);
6424
6425 if (FrameReg == Reg && !LastAdjustmentWasScalable)
6426 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
6427
6428 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6429 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
6430}
6431
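// Describes where a callee-saved register lives relative to the CFA, either
// as a plain DW_CFA_offset (no scalable part) or as a DW_CFA_expression,
// with a comment such as "$d8 @ cfa - 8 * VG - 16" (illustrative register).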
6434 const StackOffset &OffsetFromDefCFA,
6435 std::optional<int64_t> IncomingVGOffsetFromDefCFA) {
6436 int64_t NumBytes, NumVGScaledBytes;
6437 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
6438 OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
6439
6440 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
6441
6442 // Non-scalable offsets can use DW_CFA_offset directly.
6443 if (!NumVGScaledBytes)
6444 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
6445
6446 std::string CommentBuffer;
6447 llvm::raw_string_ostream Comment(CommentBuffer);
6448 Comment << printReg(Reg, &TRI) << " @ cfa";
6449
6450 // Build up expression (CFA + VG * NumVGScaledBytes + NumBytes)
6451 assert(NumVGScaledBytes && "Expected scalable offset");
6452 SmallString<64> OffsetExpr;
6453 // + VG * NumVGScaledBytes
6454 StringRef VGRegScale;
6455 if (IncomingVGOffsetFromDefCFA) {
6456 appendLoadRegExpr(OffsetExpr, *IncomingVGOffsetFromDefCFA);
6457 VGRegScale = "* IncomingVG";
6458 } else {
6459 appendReadRegExpr(OffsetExpr, TRI.getDwarfRegNum(AArch64::VG, true));
6460 VGRegScale = "* VG";
6461 }
6462 appendConstantExpr(OffsetExpr, NumVGScaledBytes, dwarf::DW_OP_mul);
6463 appendOffsetComment(NumVGScaledBytes, Comment, VGRegScale);
6464 OffsetExpr.push_back(dwarf::DW_OP_plus);
6465 if (NumBytes) {
6466 // + NumBytes
6467 appendOffsetComment(NumBytes, Comment);
6468 appendConstantExpr(OffsetExpr, NumBytes, dwarf::DW_OP_plus);
6469 }
6470
6471 // Wrap this into DW_CFA_expression
6472 SmallString<64> CfaExpr;
6473 CfaExpr.push_back(dwarf::DW_CFA_expression);
6474 appendLEB128<LEB128Sign::Unsigned>(CfaExpr, DwarfReg);
6475 appendLEB128<LEB128Sign::Unsigned>(CfaExpr, OffsetExpr.size());
6476 CfaExpr.append(OffsetExpr.str());
6477
6478 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
6479 Comment.str());
6480}
6481
6482// Helper function to emit a frame offset adjustment from a given
6483// pointer (SrcReg), stored into DestReg. This function is explicit
6484// in that it requires the opcode.
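// For example (illustrative numbers), an ADDXri adjustment of 74565 bytes
// is emitted by the chunking loop below as two instructions: first
// "add dst, src, #18, lsl #12" (73728 bytes), then "add dst, dst, #837".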
6487 const DebugLoc &DL, unsigned DestReg,
6488 unsigned SrcReg, int64_t Offset, unsigned Opc,
6489 const TargetInstrInfo *TII,
6490 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
6491 bool *HasWinCFI, bool EmitCFAOffset,
6492 StackOffset CFAOffset, unsigned FrameReg) {
6493 int Sign = 1;
6494 unsigned MaxEncoding, ShiftSize;
6495 switch (Opc) {
6496 case AArch64::ADDXri:
6497 case AArch64::ADDSXri:
6498 case AArch64::SUBXri:
6499 case AArch64::SUBSXri:
6500 MaxEncoding = 0xfff;
6501 ShiftSize = 12;
6502 break;
6503 case AArch64::ADDVL_XXI:
6504 case AArch64::ADDPL_XXI:
6505 case AArch64::ADDSVL_XXI:
6506 case AArch64::ADDSPL_XXI:
6507 MaxEncoding = 31;
6508 ShiftSize = 0;
6509 if (Offset < 0) {
6510 MaxEncoding = 32;
6511 Sign = -1;
6512 Offset = -Offset;
6513 }
6514 break;
6515 default:
6516 llvm_unreachable("Unsupported opcode");
6517 }
6518
6519 // `Offset` can be in bytes or in "scalable bytes".
6520 int VScale = 1;
6521 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
6522 VScale = 16;
6523 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
6524 VScale = 2;
6525
6526 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
6527 // scratch register. If DestReg is a virtual register, use it as the
6528 // scratch register; otherwise, create a new virtual register (to be
6529 // replaced by the scavenger at the end of PEI). That case can be optimized
6530 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
6531 // register can be loaded with offset%8 and the add/sub can use an extending
6532 // instruction with LSL#3.
6533 // Currently the function handles any offsets but generates a poor sequence
6534 // of code.
6535 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
6536
6537 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
6538 Register TmpReg = DestReg;
6539 if (TmpReg == AArch64::XZR)
6540 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
6541 &AArch64::GPR64RegClass);
6542 do {
6543 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
6544 unsigned LocalShiftSize = 0;
6545 if (ThisVal > MaxEncoding) {
6546 ThisVal = ThisVal >> ShiftSize;
6547 LocalShiftSize = ShiftSize;
6548 }
6549 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
6550 "Encoding cannot handle value that big");
6551
6552 Offset -= ThisVal << LocalShiftSize;
6553 if (Offset == 0)
6554 TmpReg = DestReg;
6555 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
6556 .addReg(SrcReg)
6557 .addImm(Sign * (int)ThisVal);
6558 if (ShiftSize)
6559 MBI = MBI.addImm(
6561 MBI = MBI.setMIFlag(Flag);
6562
6563 auto Change =
6564 VScale == 1
6565 ? StackOffset::getFixed(ThisVal << LocalShiftSize)
6566 : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
6567 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
6568 CFAOffset += Change;
6569 else
6570 CFAOffset -= Change;
6571 if (EmitCFAOffset && DestReg == TmpReg) {
6572 MachineFunction &MF = *MBB.getParent();
6573 const TargetSubtargetInfo &STI = MF.getSubtarget();
6574 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
6575
6576 unsigned CFIIndex = MF.addFrameInst(
6577 createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
6578 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
6579 .addCFIIndex(CFIIndex)
6580 .setMIFlags(Flag);
6581 }
6582
6583 if (NeedsWinCFI) {
6584 int Imm = (int)(ThisVal << LocalShiftSize);
6585 if (VScale != 1 && DestReg == AArch64::SP) {
6586 if (HasWinCFI)
6587 *HasWinCFI = true;
6588 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AllocZ))
6589 .addImm(ThisVal)
6590 .setMIFlag(Flag);
6591 } else if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
6592 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
6593 assert(VScale == 1 && "Expected non-scalable operation");
6594 if (HasWinCFI)
6595 *HasWinCFI = true;
6596 if (Imm == 0)
6597 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
6598 else
6599 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
6600 .addImm(Imm)
6601 .setMIFlag(Flag);
6602 assert(Offset == 0 && "Expected remaining offset to be zero to "
6603 "emit a single SEH directive");
6604 } else if (DestReg == AArch64::SP) {
6605 assert(VScale == 1 && "Expected non-scalable operation");
6606 if (HasWinCFI)
6607 *HasWinCFI = true;
6608 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
6609 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
6610 .addImm(Imm)
6611 .setMIFlag(Flag);
6612 }
6613 }
6614
6615 SrcReg = TmpReg;
6616 } while (Offset);
6617}
6618
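// emitFrameOffset splits the requested StackOffset into up to three parts
// and emits them in order: fixed bytes (ADD/SUB), whole SVE data vectors
// (ADDVL/ADDSVL), then predicate-sized parts (ADDPL/ADDSPL). When the caller
// asks for NZCV and a scalable part is present, a trailing ADDSXri #0
// defines the flags at the end.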
6621 unsigned DestReg, unsigned SrcReg,
6623 MachineInstr::MIFlag Flag, bool SetNZCV,
6624 bool NeedsWinCFI, bool *HasWinCFI,
6625 bool EmitCFAOffset, StackOffset CFAOffset,
6626 unsigned FrameReg) {
6627 // If a function is marked as arm_locally_streaming, then the runtime value of
6628 // vscale in the prologue/epilogue is different from the runtime value of vscale
6629 // in the function's body. To avoid having to consider multiple vscales,
6630 // we can use `addsvl` to allocate any scalable stack-slots, which under
6631 // most circumstances will be only locals, not callee-save slots.
6632 const Function &F = MBB.getParent()->getFunction();
6633 bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
6634
6635 int64_t Bytes, NumPredicateVectors, NumDataVectors;
6636 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6637 Offset, Bytes, NumPredicateVectors, NumDataVectors);
6638
6639 // Insert ADDSXri for scalable offset at the end.
6640 bool NeedsFinalDefNZCV = SetNZCV && (NumPredicateVectors || NumDataVectors);
6641 if (NeedsFinalDefNZCV)
6642 SetNZCV = false;
6643
6644 // First emit non-scalable frame offsets, or a simple 'mov'.
6645 if (Bytes || (!Offset && SrcReg != DestReg)) {
6646 assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
6647 "SP increment/decrement not 8-byte aligned");
6648 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
6649 if (Bytes < 0) {
6650 Bytes = -Bytes;
6651 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
6652 }
6653 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
6654 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6655 FrameReg);
6656 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
6657 ? StackOffset::getFixed(-Bytes)
6658 : StackOffset::getFixed(Bytes);
6659 SrcReg = DestReg;
6660 FrameReg = DestReg;
6661 }
6662
6663 assert(!(NeedsWinCFI && NumPredicateVectors) &&
6664 "WinCFI can't allocate fractions of an SVE data vector");
6665
6666 if (NumDataVectors) {
6667 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
6668 UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, TII,
6669 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6670 FrameReg);
6671 CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
6672 SrcReg = DestReg;
6673 }
6674
6675 if (NumPredicateVectors) {
6676 assert(DestReg != AArch64::SP && "Unaligned access to SP");
6677 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
6678 UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, TII,
6679 Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6680 FrameReg);
6681 }
6682
6683 if (NeedsFinalDefNZCV)
6684 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDSXri), DestReg)
6685 .addReg(DestReg)
6686 .addImm(0)
6687 .addImm(0);
6688}
6689
6692 MachineBasicBlock::iterator InsertPt, int FrameIndex,
6693 LiveIntervals *LIS, VirtRegMap *VRM) const {
6694 // This is a bit of a hack. Consider this instruction:
6695 //
6696 // %0 = COPY %sp; GPR64all:%0
6697 //
6698 // We explicitly chose GPR64all for the virtual register so such a copy might
6699 // be eliminated by RegisterCoalescer. However, that may not be possible, and
6700 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
6701 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
6702 //
6703 // To prevent that, we are going to constrain the %0 register class here.
6704 if (MI.isFullCopy()) {
6705 Register DstReg = MI.getOperand(0).getReg();
6706 Register SrcReg = MI.getOperand(1).getReg();
6707 if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
6708 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
6709 return nullptr;
6710 }
6711 if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
6712 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
6713 return nullptr;
6714 }
6715 // Nothing can be folded with a copy from/to NZCV.
6716 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
6717 return nullptr;
6718 }
6719
6720 // Handle the case where a copy is being spilled or filled but the source
6721 // and destination register class don't match. For example:
6722 //
6723 // %0 = COPY %xzr; GPR64common:%0
6724 //
6725 // In this case we can still safely fold away the COPY and generate the
6726 // following spill code:
6727 //
6728 // STRXui %xzr, %stack.0
6729 //
6730 // This also eliminates spilled cross register class COPYs (e.g. between x and
6731 // d regs) of the same size. For example:
6732 //
6733 // %0 = COPY %1; GPR64:%0, FPR64:%1
6734 //
6735 // will be filled as
6736 //
6737 // LDRDui %0, fi<#0>
6738 //
6739 // instead of
6740 //
6741 // LDRXui %Temp, fi<#0>
6742 // %0 = FMOV %Temp
6743 //
6744 if (MI.isCopy() && Ops.size() == 1 &&
6745 // Make sure we're only folding the explicit COPY defs/uses.
6746 (Ops[0] == 0 || Ops[0] == 1)) {
6747 bool IsSpill = Ops[0] == 0;
6748 bool IsFill = !IsSpill;
6750 const MachineRegisterInfo &MRI = MF.getRegInfo();
6751 MachineBasicBlock &MBB = *MI.getParent();
6752 const MachineOperand &DstMO = MI.getOperand(0);
6753 const MachineOperand &SrcMO = MI.getOperand(1);
6754 Register DstReg = DstMO.getReg();
6755 Register SrcReg = SrcMO.getReg();
6756 // This is slightly expensive to compute for physical regs since
6757 // getMinimalPhysRegClass is slow.
6758 auto getRegClass = [&](unsigned Reg) {
6759 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
6760 : TRI.getMinimalPhysRegClass(Reg);
6761 };
6762
6763 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
6764 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
6765 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
6766 "Mismatched register size in non subreg COPY");
6767 if (IsSpill)
6768 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
6769 getRegClass(SrcReg), Register());
6770 else
6771 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
6772 getRegClass(DstReg), Register());
6773 return &*--InsertPt;
6774 }
6775
6776 // Handle cases like spilling def of:
6777 //
6778 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
6779 //
6780 // where the physical register source can be widened and stored to the full
6781 // virtual reg destination stack slot, in this case producing:
6782 //
6783 // STRXui %xzr, %stack.0
6784 //
6785 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
6786 TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
6787 assert(SrcMO.getSubReg() == 0 &&
6788 "Unexpected subreg on physical register");
6789 storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
6790 FrameIndex, &AArch64::GPR64RegClass, Register());
6791 return &*--InsertPt;
6792 }
6793
6794 // Handle cases like filling use of:
6795 //
6796 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
6797 //
6798 // where we can load the full virtual reg source stack slot into the subreg
6799 // destination, in this case producing:
6800 //
6801 // LDRWui %0:sub_32<def,read-undef>, %stack.0
6802 //
6803 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
6804 const TargetRegisterClass *FillRC = nullptr;
6805 switch (DstMO.getSubReg()) {
6806 default:
6807 break;
6808 case AArch64::sub_32:
6809 if (AArch64::GPR64RegClass.hasSubClassEq(getRegClass(DstReg)))
6810 FillRC = &AArch64::GPR32RegClass;
6811 break;
6812 case AArch64::ssub:
6813 FillRC = &AArch64::FPR32RegClass;
6814 break;
6815 case AArch64::dsub:
6816 FillRC = &AArch64::FPR64RegClass;
6817 break;
6818 }
6819
6820 if (FillRC) {
6821 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
6822 TRI.getRegSizeInBits(*FillRC) &&
6823 "Mismatched regclass size on folded subreg COPY");
6824 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC,
6825 Register());
6826 MachineInstr &LoadMI = *--InsertPt;
6827 MachineOperand &LoadDst = LoadMI.getOperand(0);
6828 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
6829 LoadDst.setSubReg(DstMO.getSubReg());
6830 LoadDst.setIsUndef();
6831 return &LoadMI;
6832 }
6833 }
6834 }
6835
6836 // Cannot fold.
6837 return nullptr;
6838}
6839
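// Example of the scaling logic below (illustrative scale and range): a
// scaled 64-bit store with Scale = 8 and an immediate range of [0, 4095]
// encodes a byte offset of 32760 directly as NewOffset = 4095, while an
// offset that is not a multiple of the scale is redirected to the unscaled
// variant when one exists.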
6841 StackOffset &SOffset,
6842 bool *OutUseUnscaledOp,
6843 unsigned *OutUnscaledOp,
6844 int64_t *EmittableOffset) {
6845 // Set output values in case of early exit.
6846 if (EmittableOffset)
6847 *EmittableOffset = 0;
6848 if (OutUseUnscaledOp)
6849 *OutUseUnscaledOp = false;
6850 if (OutUnscaledOp)
6851 *OutUnscaledOp = 0;
6852
6853 // Exit early for structured vector spills/fills as they can't take an
6854 // immediate offset.
6855 switch (MI.getOpcode()) {
6856 default:
6857 break;
6858 case AArch64::LD1Rv1d:
6859 case AArch64::LD1Rv2s:
6860 case AArch64::LD1Rv2d:
6861 case AArch64::LD1Rv4h:
6862 case AArch64::LD1Rv4s:
6863 case AArch64::LD1Rv8b:
6864 case AArch64::LD1Rv8h:
6865 case AArch64::LD1Rv16b:
6866 case AArch64::LD1Twov2d:
6867 case AArch64::LD1Threev2d:
6868 case AArch64::LD1Fourv2d:
6869 case AArch64::LD1Twov1d:
6870 case AArch64::LD1Threev1d:
6871 case AArch64::LD1Fourv1d:
6872 case AArch64::ST1Twov2d:
6873 case AArch64::ST1Threev2d:
6874 case AArch64::ST1Fourv2d:
6875 case AArch64::ST1Twov1d:
6876 case AArch64::ST1Threev1d:
6877 case AArch64::ST1Fourv1d:
6878 case AArch64::ST1i8:
6879 case AArch64::ST1i16:
6880 case AArch64::ST1i32:
6881 case AArch64::ST1i64:
6882 case AArch64::IRG:
6883 case AArch64::IRGstack:
6884 case AArch64::STGloop:
6885 case AArch64::STZGloop:
6886 return AArch64FrameOffsetCannotUpdate;
6887 }
6888
6889 // Get the min/max offset and the scale.
6890 TypeSize ScaleValue(0U, false), Width(0U, false);
6891 int64_t MinOff, MaxOff;
6892 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
6893 MaxOff))
6894 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
6895
6896 // Construct the complete offset.
6897 bool IsMulVL = ScaleValue.isScalable();
6898 unsigned Scale = ScaleValue.getKnownMinValue();
6899 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
6900
6901 const MachineOperand &ImmOpnd =
6902 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
6903 Offset += ImmOpnd.getImm() * Scale;
6904
6905 // If the offset doesn't match the scale, we rewrite the instruction to
6906 // use the unscaled instruction instead. Likewise, if we have a negative
6907 // offset and there is an unscaled op to use.
6908 std::optional<unsigned> UnscaledOp =
6910 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
6911 if (useUnscaledOp &&
6912 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
6913 MaxOff))
6914 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
6915
6916 Scale = ScaleValue.getKnownMinValue();
6917 assert(IsMulVL == ScaleValue.isScalable() &&
6918 "Unscaled opcode has different value for scalable");
6919
6920 int64_t Remainder = Offset % Scale;
6921 assert(!(Remainder && useUnscaledOp) &&
6922 "Cannot have remainder when using unscaled op");
6923
6924 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
6925 int64_t NewOffset = Offset / Scale;
6926 if (MinOff <= NewOffset && NewOffset <= MaxOff)
6927 Offset = Remainder;
6928 else {
6929 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
6930 Offset = Offset - (NewOffset * Scale);
6931 }
6932
6933 if (EmittableOffset)
6934 *EmittableOffset = NewOffset;
6935 if (OutUseUnscaledOp)
6936 *OutUseUnscaledOp = useUnscaledOp;
6937 if (OutUnscaledOp && UnscaledOp)
6938 *OutUnscaledOp = *UnscaledOp;
6939
6940 if (IsMulVL)
6941 SOffset = StackOffset::get(SOffset.getFixed(), Offset);
6942 else
6943 SOffset = StackOffset::get(Offset, SOffset.getScalable());
6944 return AArch64FrameOffsetCanUpdate |
6945 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
6946}
6947
6949 unsigned FrameReg, StackOffset &Offset,
6950 const AArch64InstrInfo *TII) {
6951 unsigned Opcode = MI.getOpcode();
6952 unsigned ImmIdx = FrameRegIdx + 1;
6953
6954 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
6955 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
6956 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
6957 MI.getOperand(0).getReg(), FrameReg, Offset, TII,
6958 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
6959 MI.eraseFromParent();
6960 Offset = StackOffset();
6961 return true;
6962 }
6963
6964 int64_t NewOffset;
6965 unsigned UnscaledOp;
6966 bool UseUnscaledOp;
6967 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
6968 &UnscaledOp, &NewOffset);
6969 if (Status & AArch64FrameOffsetCanUpdate) {
6970 if (Status & AArch64FrameOffsetIsLegal)
6971 // Replace the FrameIndex with FrameReg.
6972 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
6973 if (UseUnscaledOp)
6974 MI.setDesc(TII->get(UnscaledOp));
6975
6976 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
6977 return !Offset;
6978 }
6979
6980 return false;
6981}
6982
6988
6989MCInst AArch64InstrInfo::getNop() const { return MCInstBuilder(AArch64::NOP); }
6990
6991// AArch64 supports MachineCombiner.
6992bool AArch64InstrInfo::useMachineCombiner() const { return true; }
6993
6994 // True when Opc sets flags
6995static bool isCombineInstrSettingFlag(unsigned Opc) {
6996 switch (Opc) {
6997 case AArch64::ADDSWrr:
6998 case AArch64::ADDSWri:
6999 case AArch64::ADDSXrr:
7000 case AArch64::ADDSXri:
7001 case AArch64::SUBSWrr:
7002 case AArch64::SUBSXrr:
7003 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7004 case AArch64::SUBSWri:
7005 case AArch64::SUBSXri:
7006 return true;
7007 default:
7008 break;
7009 }
7010 return false;
7011}
7012
7013// 32b Opcodes that can be combined with a MUL
7014static bool isCombineInstrCandidate32(unsigned Opc) {
7015 switch (Opc) {
7016 case AArch64::ADDWrr:
7017 case AArch64::ADDWri:
7018 case AArch64::SUBWrr:
7019 case AArch64::ADDSWrr:
7020 case AArch64::ADDSWri:
7021 case AArch64::SUBSWrr:
7022 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7023 case AArch64::SUBWri:
7024 case AArch64::SUBSWri:
7025 return true;
7026 default:
7027 break;
7028 }
7029 return false;
7030}
7031
7032// 64b Opcodes that can be combined with a MUL
7033static bool isCombineInstrCandidate64(unsigned Opc) {
7034 switch (Opc) {
7035 case AArch64::ADDXrr:
7036 case AArch64::ADDXri:
7037 case AArch64::SUBXrr:
7038 case AArch64::ADDSXrr:
7039 case AArch64::ADDSXri:
7040 case AArch64::SUBSXrr:
7041 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
7042 case AArch64::SUBXri:
7043 case AArch64::SUBSXri:
7044 case AArch64::ADDv8i8:
7045 case AArch64::ADDv16i8:
7046 case AArch64::ADDv4i16:
7047 case AArch64::ADDv8i16:
7048 case AArch64::ADDv2i32:
7049 case AArch64::ADDv4i32:
7050 case AArch64::SUBv8i8:
7051 case AArch64::SUBv16i8:
7052 case AArch64::SUBv4i16:
7053 case AArch64::SUBv8i16:
7054 case AArch64::SUBv2i32:
7055 case AArch64::SUBv4i32:
7056 return true;
7057 default:
7058 break;
7059 }
7060 return false;
7061}
7062
7063// FP Opcodes that can be combined with a FMUL.
7064static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
7065 switch (Inst.getOpcode()) {
7066 default:
7067 break;
7068 case AArch64::FADDHrr:
7069 case AArch64::FADDSrr:
7070 case AArch64::FADDDrr:
7071 case AArch64::FADDv4f16:
7072 case AArch64::FADDv8f16:
7073 case AArch64::FADDv2f32:
7074 case AArch64::FADDv2f64:
7075 case AArch64::FADDv4f32:
7076 case AArch64::FSUBHrr:
7077 case AArch64::FSUBSrr:
7078 case AArch64::FSUBDrr:
7079 case AArch64::FSUBv4f16:
7080 case AArch64::FSUBv8f16:
7081 case AArch64::FSUBv2f32:
7082 case AArch64::FSUBv2f64:
7083 case AArch64::FSUBv4f32:
7085 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
7086 // the target options or if FADD/FSUB has the contract fast-math flag.
7087 return Options.AllowFPOpFusion == FPOpFusion::Fast ||
 7088 Inst.getFlag(MachineInstr::FmContract);
 7089 }
7090 return false;
7091}
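// Illustrative sketch (not part of this file; fma_shape is a made-up name):
// when compiled with -ffp-contract=fast, or when the FADD carries the
// 'contract' fast-math flag, the shape below passes this check and can later
// be fused into FMADD by the FMA patterns further down.
static inline float fma_shape(float A, float B, float C) {
  return A * B + C; // FMULSrr feeding FADDSrr, a fusion candidate
}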
7092
 7093// Opcodes that can be combined with a MUL
7094static bool isCombineInstrCandidate(unsigned Opc) {
 7095 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
7096}
7097
7098//
7099// Utility routine that checks if \param MO is defined by an
7100// \param CombineOpc instruction in the basic block \param MBB
7101static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
 7102 unsigned CombineOpc, unsigned ZeroReg = 0,
7103 bool CheckZeroReg = false) {
7104 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7105 MachineInstr *MI = nullptr;
7106
7107 if (MO.isReg() && MO.getReg().isVirtual())
7108 MI = MRI.getUniqueVRegDef(MO.getReg());
7109 // And it needs to be in the trace (otherwise, it won't have a depth).
7110 if (!MI || MI->getParent() != &MBB || MI->getOpcode() != CombineOpc)
7111 return false;
 7112 // Must only be used by the user we combine with.
7113 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
7114 return false;
7115
7116 if (CheckZeroReg) {
7117 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
7118 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
 7119 MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
7120 // The third input reg must be zero.
7121 if (MI->getOperand(3).getReg() != ZeroReg)
7122 return false;
7123 }
7124
7125 if (isCombineInstrSettingFlag(CombineOpc) &&
7126 MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
7127 return false;
7128
7129 return true;
7130}
7131
7132//
7133// Is \param MO defined by an integer multiply and can be combined?
7134static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
 7135 unsigned MulOpc, unsigned ZeroReg) {
7136 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
7137}
7138
7139//
7140// Is \param MO defined by a floating-point multiply and can be combined?
7141static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
 7142 unsigned MulOpc) {
7143 return canCombine(MBB, MO, MulOpc);
7144}
7145
7146// TODO: There are many more machine instruction opcodes to match:
7147// 1. Other data types (integer, vectors)
7148// 2. Other math / logic operations (xor, or)
7149// 3. Other forms of the same operation (intrinsics and other variants)
7150bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
7151 bool Invert) const {
7152 if (Invert)
7153 return false;
7154 switch (Inst.getOpcode()) {
7155 // == Floating-point types ==
7156 // -- Floating-point instructions --
7157 case AArch64::FADDHrr:
7158 case AArch64::FADDSrr:
7159 case AArch64::FADDDrr:
7160 case AArch64::FMULHrr:
7161 case AArch64::FMULSrr:
7162 case AArch64::FMULDrr:
7163 case AArch64::FMULX16:
7164 case AArch64::FMULX32:
7165 case AArch64::FMULX64:
7166 // -- Advanced SIMD instructions --
7167 case AArch64::FADDv4f16:
7168 case AArch64::FADDv8f16:
7169 case AArch64::FADDv2f32:
7170 case AArch64::FADDv4f32:
7171 case AArch64::FADDv2f64:
7172 case AArch64::FMULv4f16:
7173 case AArch64::FMULv8f16:
7174 case AArch64::FMULv2f32:
7175 case AArch64::FMULv4f32:
7176 case AArch64::FMULv2f64:
7177 case AArch64::FMULXv4f16:
7178 case AArch64::FMULXv8f16:
7179 case AArch64::FMULXv2f32:
7180 case AArch64::FMULXv4f32:
7181 case AArch64::FMULXv2f64:
7182 // -- SVE instructions --
7183 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
7184 // in the SVE instruction set (though there are predicated ones).
7185 case AArch64::FADD_ZZZ_H:
7186 case AArch64::FADD_ZZZ_S:
7187 case AArch64::FADD_ZZZ_D:
7188 case AArch64::FMUL_ZZZ_H:
7189 case AArch64::FMUL_ZZZ_S:
7190 case AArch64::FMUL_ZZZ_D:
7193
7194 // == Integer types ==
7195 // -- Base instructions --
7196 // Opcodes MULWrr and MULXrr don't exist because
7197 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
7198 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
 7199 // The machine-combiner does not support three-source-operand machine
 7200 // instructions, so we cannot reassociate MULs.
7201 case AArch64::ADDWrr:
7202 case AArch64::ADDXrr:
7203 case AArch64::ANDWrr:
7204 case AArch64::ANDXrr:
7205 case AArch64::ORRWrr:
7206 case AArch64::ORRXrr:
7207 case AArch64::EORWrr:
7208 case AArch64::EORXrr:
7209 case AArch64::EONWrr:
7210 case AArch64::EONXrr:
7211 // -- Advanced SIMD instructions --
7212 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
7213 // in the Advanced SIMD instruction set.
7214 case AArch64::ADDv8i8:
7215 case AArch64::ADDv16i8:
7216 case AArch64::ADDv4i16:
7217 case AArch64::ADDv8i16:
7218 case AArch64::ADDv2i32:
7219 case AArch64::ADDv4i32:
7220 case AArch64::ADDv1i64:
7221 case AArch64::ADDv2i64:
7222 case AArch64::MULv8i8:
7223 case AArch64::MULv16i8:
7224 case AArch64::MULv4i16:
7225 case AArch64::MULv8i16:
7226 case AArch64::MULv2i32:
7227 case AArch64::MULv4i32:
7228 case AArch64::ANDv8i8:
7229 case AArch64::ANDv16i8:
7230 case AArch64::ORRv8i8:
7231 case AArch64::ORRv16i8:
7232 case AArch64::EORv8i8:
7233 case AArch64::EORv16i8:
7234 // -- SVE instructions --
7235 case AArch64::ADD_ZZZ_B:
7236 case AArch64::ADD_ZZZ_H:
7237 case AArch64::ADD_ZZZ_S:
7238 case AArch64::ADD_ZZZ_D:
7239 case AArch64::MUL_ZZZ_B:
7240 case AArch64::MUL_ZZZ_H:
7241 case AArch64::MUL_ZZZ_S:
7242 case AArch64::MUL_ZZZ_D:
7243 case AArch64::AND_ZZZ:
7244 case AArch64::ORR_ZZZ:
7245 case AArch64::EOR_ZZZ:
7246 return true;
7247
7248 default:
7249 return false;
7250 }
7251}
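// Illustrative sketch (not part of this file; the helper names are made up):
// reassociating these associative/commutative opcodes shortens the critical
// path of a serial chain, e.g. for integer adds:
static inline unsigned chain4(unsigned A, unsigned B, unsigned C, unsigned D) {
  return ((A + B) + C) + D; // serial chain, dependence depth 3
}
static inline unsigned balanced4(unsigned A, unsigned B, unsigned C,
                                 unsigned D) {
  return (A + B) + (C + D); // after reassociation, depth 2
}
// For the FP and SVE FP opcodes above, reassociation is additionally gated on
// the reassoc/nsz fast-math flags of the instruction.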
7252
7253/// Find instructions that can be turned into madd.
7254static bool getMaddPatterns(MachineInstr &Root,
 7255 SmallVectorImpl<unsigned> &Patterns) {
7256 unsigned Opc = Root.getOpcode();
7257 MachineBasicBlock &MBB = *Root.getParent();
7258 bool Found = false;
7259
 7260 if (!isCombineInstrCandidate(Opc))
 7261 return false;
 7262 if (isCombineInstrSettingFlag(Opc)) {
 7263 int Cmp_NZCV =
7264 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
 7265 // When NZCV is live, bail out.
7266 if (Cmp_NZCV == -1)
7267 return false;
7268 unsigned NewOpc = convertToNonFlagSettingOpc(Root);
 7269 // When the opcode can't change, bail out.
7270 // CHECKME: do we miss any cases for opcode conversion?
7271 if (NewOpc == Opc)
7272 return false;
7273 Opc = NewOpc;
7274 }
7275
7276 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
7277 unsigned Pattern) {
7278 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
7279 Patterns.push_back(Pattern);
7280 Found = true;
7281 }
7282 };
7283
7284 auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
7285 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
7286 Patterns.push_back(Pattern);
7287 Found = true;
7288 }
7289 };
7290
7292
7293 switch (Opc) {
7294 default:
7295 break;
7296 case AArch64::ADDWrr:
7297 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7298 "ADDWrr does not have register operands");
7299 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
7300 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
7301 break;
7302 case AArch64::ADDXrr:
7303 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
7304 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
7305 break;
7306 case AArch64::SUBWrr:
7307 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
7308 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
7309 break;
7310 case AArch64::SUBXrr:
7311 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
7312 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
7313 break;
7314 case AArch64::ADDWri:
7315 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
7316 break;
7317 case AArch64::ADDXri:
7318 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
7319 break;
7320 case AArch64::SUBWri:
7321 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
7322 break;
7323 case AArch64::SUBXri:
7324 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
7325 break;
7326 case AArch64::ADDv8i8:
7327 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
7328 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
7329 break;
7330 case AArch64::ADDv16i8:
7331 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
7332 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
7333 break;
7334 case AArch64::ADDv4i16:
7335 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
7336 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
7337 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
7338 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
7339 break;
7340 case AArch64::ADDv8i16:
7341 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
7342 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
7343 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
7344 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
7345 break;
7346 case AArch64::ADDv2i32:
7347 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
7348 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
7349 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
7350 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
7351 break;
7352 case AArch64::ADDv4i32:
7353 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
7354 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
7355 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
7356 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
7357 break;
7358 case AArch64::SUBv8i8:
7359 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
7360 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
7361 break;
7362 case AArch64::SUBv16i8:
7363 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
7364 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
7365 break;
7366 case AArch64::SUBv4i16:
7367 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
7368 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
7369 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
7370 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
7371 break;
7372 case AArch64::SUBv8i16:
7373 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
7374 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
7375 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
7376 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
7377 break;
7378 case AArch64::SUBv2i32:
7379 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
7380 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
7381 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
7382 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
7383 break;
7384 case AArch64::SUBv4i32:
7385 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
7386 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
7387 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
7388 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
7389 break;
7390 }
7391 return Found;
7392}
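// Illustrative sketch (not part of this file; madd_shape is a made-up name):
// the scalar shape the MULADD/MULSUB patterns above look for is a multiply
// whose only use is an add or subtract in the same block.
static inline unsigned long long madd_shape(unsigned long long A,
                                            unsigned long long B,
                                            unsigned long long C) {
  return C + A * B; // MUL + ADDXrr  ==>  MADD Xd, Xa, Xb, Xc
}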
7393
7394bool AArch64InstrInfo::isAccumulationOpcode(unsigned Opcode) const {
7395 switch (Opcode) {
7396 default:
7397 break;
7398 case AArch64::UABALB_ZZZ_D:
7399 case AArch64::UABALB_ZZZ_H:
7400 case AArch64::UABALB_ZZZ_S:
7401 case AArch64::UABALT_ZZZ_D:
7402 case AArch64::UABALT_ZZZ_H:
7403 case AArch64::UABALT_ZZZ_S:
7404 case AArch64::SABALB_ZZZ_D:
7405 case AArch64::SABALB_ZZZ_S:
7406 case AArch64::SABALB_ZZZ_H:
7407 case AArch64::SABALT_ZZZ_D:
7408 case AArch64::SABALT_ZZZ_S:
7409 case AArch64::SABALT_ZZZ_H:
7410 case AArch64::UABALv16i8_v8i16:
7411 case AArch64::UABALv2i32_v2i64:
7412 case AArch64::UABALv4i16_v4i32:
7413 case AArch64::UABALv4i32_v2i64:
7414 case AArch64::UABALv8i16_v4i32:
7415 case AArch64::UABALv8i8_v8i16:
7416 case AArch64::UABAv16i8:
7417 case AArch64::UABAv2i32:
7418 case AArch64::UABAv4i16:
7419 case AArch64::UABAv4i32:
7420 case AArch64::UABAv8i16:
7421 case AArch64::UABAv8i8:
7422 case AArch64::SABALv16i8_v8i16:
7423 case AArch64::SABALv2i32_v2i64:
7424 case AArch64::SABALv4i16_v4i32:
7425 case AArch64::SABALv4i32_v2i64:
7426 case AArch64::SABALv8i16_v4i32:
7427 case AArch64::SABALv8i8_v8i16:
7428 case AArch64::SABAv16i8:
7429 case AArch64::SABAv2i32:
7430 case AArch64::SABAv4i16:
7431 case AArch64::SABAv4i32:
7432 case AArch64::SABAv8i16:
7433 case AArch64::SABAv8i8:
7434 return true;
7435 }
7436
7437 return false;
7438}
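// Illustrative sketch (not part of this file; sad_ref is a made-up name): a
// typical source of these accumulate opcodes is a sum-of-absolute-differences
// reduction, which vectorizes to UABA/UABAL (or SABA/SABAL) accumulation
// chains that the accumulator splitting below can break into parallel chains.
static inline unsigned sad_ref(const unsigned char *A, const unsigned char *B,
                               unsigned N) {
  unsigned Acc = 0;
  for (unsigned I = 0; I < N; ++I)
    Acc += A[I] > B[I] ? A[I] - B[I] : B[I] - A[I]; // absolute difference
  return Acc;
}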
7439
7440unsigned AArch64InstrInfo::getAccumulationStartOpcode(
7441 unsigned AccumulationOpcode) const {
7442 switch (AccumulationOpcode) {
7443 default:
7444 llvm_unreachable("Unsupported accumulation Opcode!");
7445 case AArch64::UABALB_ZZZ_D:
7446 return AArch64::UABDLB_ZZZ_D;
7447 case AArch64::UABALB_ZZZ_H:
7448 return AArch64::UABDLB_ZZZ_H;
7449 case AArch64::UABALB_ZZZ_S:
7450 return AArch64::UABDLB_ZZZ_S;
7451 case AArch64::UABALT_ZZZ_D:
7452 return AArch64::UABDLT_ZZZ_D;
7453 case AArch64::UABALT_ZZZ_H:
7454 return AArch64::UABDLT_ZZZ_H;
7455 case AArch64::UABALT_ZZZ_S:
7456 return AArch64::UABDLT_ZZZ_S;
7457 case AArch64::UABALv16i8_v8i16:
7458 return AArch64::UABDLv16i8_v8i16;
7459 case AArch64::UABALv2i32_v2i64:
7460 return AArch64::UABDLv2i32_v2i64;
7461 case AArch64::UABALv4i16_v4i32:
7462 return AArch64::UABDLv4i16_v4i32;
7463 case AArch64::UABALv4i32_v2i64:
7464 return AArch64::UABDLv4i32_v2i64;
7465 case AArch64::UABALv8i16_v4i32:
7466 return AArch64::UABDLv8i16_v4i32;
7467 case AArch64::UABALv8i8_v8i16:
7468 return AArch64::UABDLv8i8_v8i16;
7469 case AArch64::UABAv16i8:
7470 return AArch64::UABDv16i8;
7471 case AArch64::UABAv2i32:
7472 return AArch64::UABDv2i32;
7473 case AArch64::UABAv4i16:
7474 return AArch64::UABDv4i16;
7475 case AArch64::UABAv4i32:
7476 return AArch64::UABDv4i32;
7477 case AArch64::UABAv8i16:
7478 return AArch64::UABDv8i16;
7479 case AArch64::UABAv8i8:
7480 return AArch64::UABDv8i8;
7481 case AArch64::SABALB_ZZZ_D:
7482 return AArch64::SABDLB_ZZZ_D;
7483 case AArch64::SABALB_ZZZ_S:
7484 return AArch64::SABDLB_ZZZ_S;
7485 case AArch64::SABALB_ZZZ_H:
7486 return AArch64::SABDLB_ZZZ_H;
7487 case AArch64::SABALT_ZZZ_D:
7488 return AArch64::SABDLT_ZZZ_D;
7489 case AArch64::SABALT_ZZZ_S:
7490 return AArch64::SABDLT_ZZZ_S;
7491 case AArch64::SABALT_ZZZ_H:
7492 return AArch64::SABDLT_ZZZ_H;
7493 case AArch64::SABALv16i8_v8i16:
7494 return AArch64::SABDLv16i8_v8i16;
7495 case AArch64::SABALv2i32_v2i64:
7496 return AArch64::SABDLv2i32_v2i64;
7497 case AArch64::SABALv4i16_v4i32:
7498 return AArch64::SABDLv4i16_v4i32;
7499 case AArch64::SABALv4i32_v2i64:
7500 return AArch64::SABDLv4i32_v2i64;
7501 case AArch64::SABALv8i16_v4i32:
7502 return AArch64::SABDLv8i16_v4i32;
7503 case AArch64::SABALv8i8_v8i16:
7504 return AArch64::SABDLv8i8_v8i16;
7505 case AArch64::SABAv16i8:
7506 return AArch64::SABDv16i8;
7507 case AArch64::SABAv2i32:
 7508 return AArch64::SABDv2i32;
7509 case AArch64::SABAv4i16:
7510 return AArch64::SABDv4i16;
7511 case AArch64::SABAv4i32:
7512 return AArch64::SABDv4i32;
7513 case AArch64::SABAv8i16:
7514 return AArch64::SABDv8i16;
7515 case AArch64::SABAv8i8:
7516 return AArch64::SABDv8i8;
7517 }
7518}
7519
7520/// Floating-Point Support
7521
7522/// Find instructions that can be turned into madd.
7523static bool getFMAPatterns(MachineInstr &Root,
 7524 SmallVectorImpl<unsigned> &Patterns) {
7525
7526 if (!isCombineInstrCandidateFP(Root))
7527 return false;
7528
7529 MachineBasicBlock &MBB = *Root.getParent();
7530 bool Found = false;
7531
7532 auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
7533 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
7534 Patterns.push_back(Pattern);
7535 return true;
7536 }
7537 return false;
7538 };
7539
7541
7542 switch (Root.getOpcode()) {
7543 default:
7544 assert(false && "Unsupported FP instruction in combiner\n");
7545 break;
7546 case AArch64::FADDHrr:
7547 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7548 "FADDHrr does not have register operands");
7549
7550 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
7551 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
7552 break;
7553 case AArch64::FADDSrr:
7554 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7555 "FADDSrr does not have register operands");
7556
7557 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
7558 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
7559
7560 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
7561 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
7562 break;
7563 case AArch64::FADDDrr:
7564 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
7565 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
7566
7567 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
7568 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
7569 break;
7570 case AArch64::FADDv4f16:
7571 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
7572 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
7573
7574 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
7575 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
7576 break;
7577 case AArch64::FADDv8f16:
7578 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
7579 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
7580
7581 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
7582 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
7583 break;
7584 case AArch64::FADDv2f32:
7585 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
7586 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
7587
7588 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
7589 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
7590 break;
7591 case AArch64::FADDv2f64:
7592 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
7593 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
7594
7595 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
7596 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
7597 break;
7598 case AArch64::FADDv4f32:
7599 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
7600 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
7601
7602 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
7603 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
7604 break;
7605 case AArch64::FSUBHrr:
7606 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
7607 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
7608 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
7609 break;
7610 case AArch64::FSUBSrr:
7611 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
7612
7613 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
7614 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
7615
7616 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
7617 break;
7618 case AArch64::FSUBDrr:
7619 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
7620
7621 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
7622 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
7623
7624 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
7625 break;
7626 case AArch64::FSUBv4f16:
7627 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
7628 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
7629
7630 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
7631 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
7632 break;
7633 case AArch64::FSUBv8f16:
7634 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
7635 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
7636
7637 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
7638 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
7639 break;
7640 case AArch64::FSUBv2f32:
7641 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
7642 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
7643
7644 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
7645 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
7646 break;
7647 case AArch64::FSUBv2f64:
7648 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
7649 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
7650
7651 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
7652 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
7653 break;
7654 case AArch64::FSUBv4f32:
7655 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
7656 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
7657
7658 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
7659 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
7660 break;
7661 }
7662 return Found;
7663}
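// Illustrative sketch (not part of this file; the helper names are made up):
// scalar shapes behind the FMULSUB/FNMULSUB patterns above, assuming the
// 'contract' fast-math flag (e.g. -ffp-contract=fast).
static inline float fmsub_shape(float Acc, float A, float B) {
  return Acc - A * B; // FMUL feeds operand 2 of the FSUB -> FMSUB/FMLS
}
static inline float fnmsub_shape(float Acc, float A, float B) {
  return A * B - Acc; // FMUL feeds operand 1 of the FSUB -> FNMSUB
}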
7664
7665static bool getFMULPatterns(MachineInstr &Root,
 7666 SmallVectorImpl<unsigned> &Patterns) {
7667 MachineBasicBlock &MBB = *Root.getParent();
7668 bool Found = false;
7669
7670 auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
7671 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7672 MachineOperand &MO = Root.getOperand(Operand);
7673 MachineInstr *MI = nullptr;
7674 if (MO.isReg() && MO.getReg().isVirtual())
7675 MI = MRI.getUniqueVRegDef(MO.getReg());
7676 // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
7677 if (MI && MI->getOpcode() == TargetOpcode::COPY &&
7678 MI->getOperand(1).getReg().isVirtual())
7679 MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
7680 if (MI && MI->getOpcode() == Opcode) {
7681 Patterns.push_back(Pattern);
7682 return true;
7683 }
7684 return false;
7685 };
7686
 7687 typedef AArch64MachineCombinerPattern MCP;
 7688
7689 switch (Root.getOpcode()) {
7690 default:
7691 return false;
7692 case AArch64::FMULv2f32:
7693 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
7694 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
7695 break;
7696 case AArch64::FMULv2f64:
7697 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
7698 Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
7699 break;
7700 case AArch64::FMULv4f16:
7701 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
7702 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
7703 break;
7704 case AArch64::FMULv4f32:
7705 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
7706 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
7707 break;
7708 case AArch64::FMULv8f16:
7709 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
7710 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
7711 break;
7712 }
7713
7714 return Found;
7715}
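#include <arm_neon.h> // for the illustrative sketch below only
// Illustrative sketch (not part of this file; mul_by_lane is a made-up name):
// one source shape that can leave a DUP-of-lane feeding an FMUL, which the
// pattern above rewrites to the indexed FMUL form.
static inline float32x4_t mul_by_lane(float32x4_t X, float32x4_t Y) {
  return vmulq_f32(X, vdupq_laneq_f32(Y, 1)); // fmul v0.4s, v0.4s, v1.s[1]
}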
7716
7717static bool getFNEGPatterns(MachineInstr &Root,
 7718 SmallVectorImpl<unsigned> &Patterns) {
7719 unsigned Opc = Root.getOpcode();
7720 MachineBasicBlock &MBB = *Root.getParent();
7721 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7722
7723 auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
7724 MachineOperand &MO = Root.getOperand(1);
7725 MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());
7726 if (MI != nullptr && (MI->getOpcode() == Opcode) &&
7727 MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
7731 MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
7732 Patterns.push_back(Pattern);
7733 return true;
7734 }
7735 return false;
7736 };
7737
7738 switch (Opc) {
7739 default:
7740 break;
7741 case AArch64::FNEGDr:
7742 return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
7743 case AArch64::FNEGSr:
7744 return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
7745 }
7746
7747 return false;
7748}
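// Illustrative sketch (not part of this file; fnmadd_shape is a made-up
// name): the shape behind the FNMADD pattern above; both the FNEG and the
// multiply-add need the contract and nsz fast-math flags, as checked in the
// lambda.
static inline double fnmadd_shape(double A, double B, double C) {
  return -(A * B + C); // FNEG(FMADD A,B,C)  ==>  FNMADD
}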
7749
7750/// Return true when a code sequence can improve throughput. It
7751/// should be called only for instructions in loops.
7752/// \param Pattern - combiner pattern
7754 switch (Pattern) {
7755 default:
7756 break;
7862 return true;
7863 } // end switch (Pattern)
7864 return false;
7865}
7866
7867/// Find other MI combine patterns.
7868static bool getMiscPatterns(MachineInstr &Root,
 7869 SmallVectorImpl<unsigned> &Patterns) {
7870 // A - (B + C) ==> (A - B) - C or (A - C) - B
7871 unsigned Opc = Root.getOpcode();
7872 MachineBasicBlock &MBB = *Root.getParent();
7873
7874 switch (Opc) {
7875 case AArch64::SUBWrr:
7876 case AArch64::SUBSWrr:
7877 case AArch64::SUBXrr:
7878 case AArch64::SUBSXrr:
7879 // Found candidate root.
7880 break;
7881 default:
7882 return false;
7883 }
7884
 7885 if (isCombineInstrSettingFlag(Opc) &&
 7886 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
7887 -1)
7888 return false;
7889
7890 if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
7891 canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
7892 canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
7893 canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
 7894 Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP1);
 7895 Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP2);
 7896 return true;
7897 }
7898
7899 return false;
7900}
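// Illustrative sketch (not part of this file; subadd_shape is a made-up
// name): the scalar shape matched above.
static inline long long subadd_shape(long long A, long long B, long long C) {
  return A - (B + C); // may be rewritten as (A - B) - C or (A - C) - B
}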
7901
7902/// Check if the given instruction forms a gather load pattern that can be
7903/// optimized for better Memory-Level Parallelism (MLP). This function
7904/// identifies chains of NEON lane load instructions that load data from
7905/// different memory addresses into individual lanes of a 128-bit vector
7906/// register, then attempts to split the pattern into parallel loads to break
7907/// the serial dependency between instructions.
7908///
7909/// Pattern Matched:
7910/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) ->
7911/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root)
7912///
7913/// Transformed Into:
7914/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64
7915/// to combine the results, enabling better memory-level parallelism.
7916///
7917/// Supported Element Types:
7918/// - 32-bit elements (LD1i32, 4 lanes total)
7919/// - 16-bit elements (LD1i16, 8 lanes total)
7920/// - 8-bit elements (LD1i8, 16 lanes total)
7921static bool getGatherLanePattern(MachineInstr &Root,
 7922 SmallVectorImpl<unsigned> &Patterns,
7923 unsigned LoadLaneOpCode, unsigned NumLanes) {
7924 const MachineFunction *MF = Root.getMF();
7925
7926 // Early exit if optimizing for size.
7927 if (MF->getFunction().hasMinSize())
7928 return false;
7929
7930 const MachineRegisterInfo &MRI = MF->getRegInfo();
 7931 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
 7932
7933 // The root of the pattern must load into the last lane of the vector.
7934 if (Root.getOperand(2).getImm() != NumLanes - 1)
7935 return false;
7936
 7937 // Check that we have loads into all lanes except lane 0.
7938 // For each load we also want to check that:
7939 // 1. It has a single non-debug use (since we will be replacing the virtual
7940 // register)
7941 // 2. That the addressing mode only uses a single pointer operand
7942 auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
7943 auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
7944 SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
 7945 SmallVector<const MachineInstr *, 16> LoadInstrs;
 7946 while (!RemainingLanes.empty() && CurrInstr &&
7947 CurrInstr->getOpcode() == LoadLaneOpCode &&
7948 MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
7949 CurrInstr->getNumOperands() == 4) {
7950 RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
7951 LoadInstrs.push_back(CurrInstr);
7952 CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
7953 }
7954
 7955 // Check that we have found a match for lanes N-1..1.
7956 if (!RemainingLanes.empty())
7957 return false;
7958
7959 // Match the SUBREG_TO_REG sequence.
7960 if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
7961 return false;
7962
7963 // Verify that the subreg to reg loads an integer into the first lane.
7964 auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
7965 unsigned SingleLaneSizeInBits = 128 / NumLanes;
7966 if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
7967 return false;
7968
 7969 // Verify that it also has a single non-debug use.
7970 if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
7971 return false;
7972
7973 LoadInstrs.push_back(MRI.getUniqueVRegDef(Lane0LoadReg));
7974
7975 // If there is any chance of aliasing, do not apply the pattern.
7976 // Walk backward through the MBB starting from Root.
7977 // Exit early if we've encountered all load instructions or hit the search
7978 // limit.
7979 auto MBBItr = Root.getIterator();
7980 unsigned RemainingSteps = GatherOptSearchLimit;
7981 SmallPtrSet<const MachineInstr *, 16> RemainingLoadInstrs;
7982 RemainingLoadInstrs.insert(LoadInstrs.begin(), LoadInstrs.end());
7983 const MachineBasicBlock *MBB = Root.getParent();
7984
7985 for (; MBBItr != MBB->begin() && RemainingSteps > 0 &&
7986 !RemainingLoadInstrs.empty();
7987 --MBBItr, --RemainingSteps) {
7988 const MachineInstr &CurrInstr = *MBBItr;
7989
7990 // Remove this instruction from remaining loads if it's one we're tracking.
7991 RemainingLoadInstrs.erase(&CurrInstr);
7992
7993 // Check for potential aliasing with any of the load instructions to
7994 // optimize.
7995 if (CurrInstr.isLoadFoldBarrier())
7996 return false;
7997 }
7998
7999 // If we hit the search limit without finding all load instructions,
8000 // don't match the pattern.
8001 if (RemainingSteps == 0 && !RemainingLoadInstrs.empty())
8002 return false;
8003
8004 switch (NumLanes) {
8005 case 4:
8007 break;
8008 case 8:
8010 break;
8011 case 16:
8013 break;
8014 default:
8015 llvm_unreachable("Got bad number of lanes for gather pattern.");
8016 }
8017
8018 return true;
8019}
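#include <arm_neon.h> // for the illustrative sketch below only
// Illustrative sketch (not part of this file; gather4 is a made-up name): the
// kind of source that produces the matched chain for 32-bit elements -- a
// scalar load into lane 0 (SUBREG_TO_REG) followed by a serial chain of
// LD1i32 lane loads from unrelated addresses.
static inline uint32x4_t gather4(const uint32_t *P0, const uint32_t *P1,
                                 const uint32_t *P2, const uint32_t *P3) {
  uint32x4_t V = vdupq_n_u32(0);
  V = vsetq_lane_u32(*P0, V, 0); // scalar load -> lane 0
  V = vsetq_lane_u32(*P1, V, 1); // LD1i32, lane 1
  V = vsetq_lane_u32(*P2, V, 2); // LD1i32, lane 2
  V = vsetq_lane_u32(*P3, V, 3); // LD1i32, lane 3 (the Root)
  return V;
}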
8020
8021/// Search for patterns of LD instructions we can optimize.
8022static bool getLoadPatterns(MachineInstr &Root,
 8023 SmallVectorImpl<unsigned> &Patterns) {
8024
8025 // The pattern searches for loads into single lanes.
8026 switch (Root.getOpcode()) {
8027 case AArch64::LD1i32:
8028 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 4);
8029 case AArch64::LD1i16:
8030 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 8);
8031 case AArch64::LD1i8:
8032 return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 16);
8033 default:
8034 return false;
8035 }
8036}
8037
8038/// Generate optimized instruction sequence for gather load patterns to improve
8039/// Memory-Level Parallelism (MLP). This function transforms a chain of
8040/// sequential NEON lane loads into parallel vector loads that can execute
8041/// concurrently.
8042static void
8046 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8047 unsigned Pattern, unsigned NumLanes) {
8048 MachineFunction &MF = *Root.getParent()->getParent();
 8049 MachineRegisterInfo &MRI = MF.getRegInfo();
 8050 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
 8051
8052 // Gather the initial load instructions to build the pattern.
8053 SmallVector<MachineInstr *, 16> LoadToLaneInstrs;
8054 MachineInstr *CurrInstr = &Root;
8055 for (unsigned i = 0; i < NumLanes - 1; ++i) {
8056 LoadToLaneInstrs.push_back(CurrInstr);
8057 CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
8058 }
8059
8060 // Sort the load instructions according to the lane.
8061 llvm::sort(LoadToLaneInstrs,
8062 [](const MachineInstr *A, const MachineInstr *B) {
8063 return A->getOperand(2).getImm() > B->getOperand(2).getImm();
8064 });
8065
8066 MachineInstr *SubregToReg = CurrInstr;
8067 LoadToLaneInstrs.push_back(
8068 MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg()));
8069 auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs);
8070
8071 const TargetRegisterClass *FPR128RegClass =
8072 MRI.getRegClass(Root.getOperand(0).getReg());
8073
8074 // Helper lambda to create a LD1 instruction.
8075 auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr,
8076 Register SrcRegister, unsigned Lane,
8077 Register OffsetRegister,
8078 bool OffsetRegisterKillState) {
8079 auto NewRegister = MRI.createVirtualRegister(FPR128RegClass);
8080 MachineInstrBuilder LoadIndexIntoRegister =
8081 BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()),
8082 NewRegister)
8083 .addReg(SrcRegister)
8084 .addImm(Lane)
8085 .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState));
8086 InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size()));
8087 InsInstrs.push_back(LoadIndexIntoRegister);
8088 return NewRegister;
8089 };
8090
8091 // Helper to create load instruction based on the NumLanes in the NEON
8092 // register we are rewriting.
8093 auto CreateLDRInstruction = [&](unsigned NumLanes, Register DestReg,
8094 Register OffsetReg,
8095 bool KillState) -> MachineInstrBuilder {
8096 unsigned Opcode;
8097 switch (NumLanes) {
8098 case 4:
8099 Opcode = AArch64::LDRSui;
8100 break;
8101 case 8:
8102 Opcode = AArch64::LDRHui;
8103 break;
8104 case 16:
8105 Opcode = AArch64::LDRBui;
8106 break;
8107 default:
 8108 llvm_unreachable(
 8109 "Got unsupported number of lanes in machine-combiner gather pattern");
8110 }
8111 // Immediate offset load
8112 return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg)
8113 .addReg(OffsetReg)
8114 .addImm(0);
8115 };
8116
8117 // Load the remaining lanes into register 0.
8118 auto LanesToLoadToReg0 =
8119 llvm::make_range(LoadToLaneInstrsAscending.begin() + 1,
8120 LoadToLaneInstrsAscending.begin() + NumLanes / 2);
8121 Register PrevReg = SubregToReg->getOperand(0).getReg();
8122 for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) {
8123 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
8124 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8125 OffsetRegOperand.getReg(),
8126 OffsetRegOperand.isKill());
8127 DelInstrs.push_back(LoadInstr);
8128 }
8129 Register LastLoadReg0 = PrevReg;
8130
8131 // First load into register 1. Perform an integer load to zero out the upper
8132 // lanes in a single instruction.
8133 MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin();
8134 MachineInstr *OriginalSplitLoad =
8135 *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2);
8136 Register DestRegForMiddleIndex = MRI.createVirtualRegister(
8137 MRI.getRegClass(Lane0Load->getOperand(0).getReg()));
8138
8139 const MachineOperand &OriginalSplitToLoadOffsetOperand =
8140 OriginalSplitLoad->getOperand(3);
8141 MachineInstrBuilder MiddleIndexLoadInstr =
8142 CreateLDRInstruction(NumLanes, DestRegForMiddleIndex,
8143 OriginalSplitToLoadOffsetOperand.getReg(),
8144 OriginalSplitToLoadOffsetOperand.isKill());
8145
8146 InstrIdxForVirtReg.insert(
8147 std::make_pair(DestRegForMiddleIndex, InsInstrs.size()));
8148 InsInstrs.push_back(MiddleIndexLoadInstr);
8149 DelInstrs.push_back(OriginalSplitLoad);
8150
8151 // Subreg To Reg instruction for register 1.
8152 Register DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass);
8153 unsigned SubregType;
8154 switch (NumLanes) {
8155 case 4:
8156 SubregType = AArch64::ssub;
8157 break;
8158 case 8:
8159 SubregType = AArch64::hsub;
8160 break;
8161 case 16:
8162 SubregType = AArch64::bsub;
8163 break;
8164 default:
 8165 llvm_unreachable(
 8166 "Got invalid NumLanes for machine-combiner gather pattern");
8167 }
8168
8169 auto SubRegToRegInstr =
8170 BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()),
8171 DestRegForSubregToReg)
8172 .addImm(0)
8173 .addReg(DestRegForMiddleIndex, getKillRegState(true))
8174 .addImm(SubregType);
8175 InstrIdxForVirtReg.insert(
8176 std::make_pair(DestRegForSubregToReg, InsInstrs.size()));
8177 InsInstrs.push_back(SubRegToRegInstr);
8178
8179 // Load remaining lanes into register 1.
8180 auto LanesToLoadToReg1 =
8181 llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1,
8182 LoadToLaneInstrsAscending.end());
8183 PrevReg = SubRegToRegInstr->getOperand(0).getReg();
8184 for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) {
8185 const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3);
8186 PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1,
8187 OffsetRegOperand.getReg(),
8188 OffsetRegOperand.isKill());
8189
8190 // Do not add the last reg to DelInstrs - it will be removed later.
8191 if (Index == NumLanes / 2 - 2) {
8192 break;
8193 }
8194 DelInstrs.push_back(LoadInstr);
8195 }
8196 Register LastLoadReg1 = PrevReg;
8197
8198 // Create the final zip instruction to combine the results.
8199 MachineInstrBuilder ZipInstr =
8200 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64),
8201 Root.getOperand(0).getReg())
8202 .addReg(LastLoadReg0)
8203 .addReg(LastLoadReg1);
8204 InsInstrs.push_back(ZipInstr);
8205}
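#include <arm_neon.h> // for the illustrative sketch below only
// Illustrative sketch (not part of this file; gather4_split is a made-up
// name): what the rewrite above produces, expressed with intrinsics -- the
// lanes are split across two independent load chains whose low 64-bit halves
// are then combined with ZIP1 (ZIP1v2i64).
static inline uint32x4_t gather4_split(const uint32_t *P0, const uint32_t *P1,
                                       const uint32_t *P2, const uint32_t *P3) {
  uint32x4_t Lo = vsetq_lane_u32(*P0, vdupq_n_u32(0), 0); // original lane 0
  Lo = vsetq_lane_u32(*P1, Lo, 1);                        // original lane 1
  uint32x4_t Hi = vsetq_lane_u32(*P2, vdupq_n_u32(0), 0); // lane 2: LDRSui + SUBREG_TO_REG
  Hi = vsetq_lane_u32(*P3, Hi, 1);                        // original lane 3
  return vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(Lo),
                                          vreinterpretq_u64_u32(Hi)));
}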
8206
8220
8221/// Return true when there is potentially a faster code sequence for an
8222/// instruction chain ending in \p Root. All potential patterns are listed in
8223/// the \p Pattern vector. Pattern should be sorted in priority order since the
8224/// pattern evaluator stops checking as soon as it finds a faster sequence.
8225
8226bool AArch64InstrInfo::getMachineCombinerPatterns(
8227 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
8228 bool DoRegPressureReduce) const {
8229 // Integer patterns
8230 if (getMaddPatterns(Root, Patterns))
8231 return true;
8232 // Floating point patterns
8233 if (getFMULPatterns(Root, Patterns))
8234 return true;
8235 if (getFMAPatterns(Root, Patterns))
8236 return true;
8237 if (getFNEGPatterns(Root, Patterns))
8238 return true;
8239
8240 // Other patterns
8241 if (getMiscPatterns(Root, Patterns))
8242 return true;
8243
8244 // Load patterns
8245 if (getLoadPatterns(Root, Patterns))
8246 return true;
8247
8248 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
8249 DoRegPressureReduce);
8250}
8251
8252enum class FMAInstKind { Default, Indexed, Accumulator };
8253/// genFusedMultiply - Generate fused multiply instructions.
8254/// This function supports both integer and floating point instructions.
8255/// A typical example:
8256/// F|MUL I=A,B,0
8257/// F|ADD R,I,C
8258/// ==> F|MADD R,A,B,C
8259/// \param MF Containing MachineFunction
8260/// \param MRI Register information
8261/// \param TII Target information
8262/// \param Root is the F|ADD instruction
8263/// \param [out] InsInstrs is a vector of machine instructions and will
8264/// contain the generated madd instruction
8265/// \param IdxMulOpd is index of operand in Root that is the result of
8266/// the F|MUL. In the example above IdxMulOpd is 1.
8267/// \param MaddOpc the opcode of the f|madd instruction
8268/// \param RC Register class of operands
8269/// \param kind The kind of FMA instruction (addressing mode) to be generated
8270/// \param ReplacedAddend is the result register from the instruction
8271/// replacing the non-combined operand, if any.
8272static MachineInstr *
8273genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
 8274 const TargetInstrInfo *TII, MachineInstr &Root,
8275 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
8276 unsigned MaddOpc, const TargetRegisterClass *RC,
 8277 FMAInstKind kind = FMAInstKind::Default,
 8278 const Register *ReplacedAddend = nullptr) {
8279 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8280
8281 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
8282 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
8283 Register ResultReg = Root.getOperand(0).getReg();
8284 Register SrcReg0 = MUL->getOperand(1).getReg();
8285 bool Src0IsKill = MUL->getOperand(1).isKill();
8286 Register SrcReg1 = MUL->getOperand(2).getReg();
8287 bool Src1IsKill = MUL->getOperand(2).isKill();
8288
8289 Register SrcReg2;
8290 bool Src2IsKill;
8291 if (ReplacedAddend) {
 8292 // If we just generated a new addend, we must be its only use.
8293 SrcReg2 = *ReplacedAddend;
8294 Src2IsKill = true;
8295 } else {
8296 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
8297 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
8298 }
8299
8300 if (ResultReg.isVirtual())
8301 MRI.constrainRegClass(ResultReg, RC);
8302 if (SrcReg0.isVirtual())
8303 MRI.constrainRegClass(SrcReg0, RC);
8304 if (SrcReg1.isVirtual())
8305 MRI.constrainRegClass(SrcReg1, RC);
8306 if (SrcReg2.isVirtual())
8307 MRI.constrainRegClass(SrcReg2, RC);
8308
 8309 MachineInstrBuilder MIB;
 8310 if (kind == FMAInstKind::Default)
8311 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8312 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8313 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8314 .addReg(SrcReg2, getKillRegState(Src2IsKill));
8315 else if (kind == FMAInstKind::Indexed)
8316 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8317 .addReg(SrcReg2, getKillRegState(Src2IsKill))
8318 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8319 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8320 .addImm(MUL->getOperand(3).getImm());
8321 else if (kind == FMAInstKind::Accumulator)
8322 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8323 .addReg(SrcReg2, getKillRegState(Src2IsKill))
8324 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8325 .addReg(SrcReg1, getKillRegState(Src1IsKill));
8326 else
8327 assert(false && "Invalid FMA instruction kind \n");
8328 // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL)
8329 InsInstrs.push_back(MIB);
8330 return MUL;
8331}
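// Operand order of the three kinds built above (summary of the BuildMI calls):
//   Default:     MADD Rd, Rn, Rm, Ra       // multiply operands first, addend last
//   Indexed:     FMLA Vd, Vn, Vm.T[lane]   // addend first (tied to Vd), then mul operands and lane
//   Accumulator: MLA/FMLA Vd, Vn, Vm       // addend first (tied to Vd), then mul operands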
8332
8333static MachineInstr *
8335 const TargetInstrInfo *TII, MachineInstr &Root,
 8336 SmallVectorImpl<MachineInstr *> &InsInstrs) {
 8337 MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
8338
8339 unsigned Opc = 0;
8340 const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
8341 if (AArch64::FPR32RegClass.hasSubClassEq(RC))
8342 Opc = AArch64::FNMADDSrrr;
8343 else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
8344 Opc = AArch64::FNMADDDrrr;
8345 else
8346 return nullptr;
8347
8348 Register ResultReg = Root.getOperand(0).getReg();
8349 Register SrcReg0 = MAD->getOperand(1).getReg();
8350 Register SrcReg1 = MAD->getOperand(2).getReg();
8351 Register SrcReg2 = MAD->getOperand(3).getReg();
8352 bool Src0IsKill = MAD->getOperand(1).isKill();
8353 bool Src1IsKill = MAD->getOperand(2).isKill();
8354 bool Src2IsKill = MAD->getOperand(3).isKill();
8355 if (ResultReg.isVirtual())
8356 MRI.constrainRegClass(ResultReg, RC);
8357 if (SrcReg0.isVirtual())
8358 MRI.constrainRegClass(SrcReg0, RC);
8359 if (SrcReg1.isVirtual())
8360 MRI.constrainRegClass(SrcReg1, RC);
8361 if (SrcReg2.isVirtual())
8362 MRI.constrainRegClass(SrcReg2, RC);
8363
 8364 MachineInstrBuilder MIB =
 8365 BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
8366 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8367 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8368 .addReg(SrcReg2, getKillRegState(Src2IsKill));
8369 InsInstrs.push_back(MIB);
8370
8371 return MAD;
8372}
8373
8374/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
8375static MachineInstr *
8378 unsigned IdxDupOp, unsigned MulOpc,
8380 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
8381 "Invalid index of FMUL operand");
8382
8383 MachineFunction &MF = *Root.getMF();
8385
8386 MachineInstr *Dup =
8387 MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
8388
8389 if (Dup->getOpcode() == TargetOpcode::COPY)
8390 Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());
8391
8392 Register DupSrcReg = Dup->getOperand(1).getReg();
8393 MRI.clearKillFlags(DupSrcReg);
8394 MRI.constrainRegClass(DupSrcReg, RC);
8395
8396 unsigned DupSrcLane = Dup->getOperand(2).getImm();
8397
8398 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
8399 MachineOperand &MulOp = Root.getOperand(IdxMulOp);
8400
8401 Register ResultReg = Root.getOperand(0).getReg();
8402
8404 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
8405 .add(MulOp)
8406 .addReg(DupSrcReg)
8407 .addImm(DupSrcLane);
8408
8409 InsInstrs.push_back(MIB);
8410 return &Root;
8411}
8412
8413/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
8414/// instructions.
8415///
8416/// \see genFusedMultiply
8420 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8421 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
 8422 FMAInstKind::Accumulator);
 8423}
8424
8425/// genNeg - Helper to generate an intermediate negation of the second operand
8426/// of Root
8427static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
 8428 const TargetInstrInfo *TII, MachineInstr &Root,
 8429 SmallVectorImpl<MachineInstr *> &InsInstrs,
 8430 DenseMap<Register, unsigned> &InstrIdxForVirtReg,
8431 unsigned MnegOpc, const TargetRegisterClass *RC) {
8432 Register NewVR = MRI.createVirtualRegister(RC);
8434 BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
8435 .add(Root.getOperand(2));
8436 InsInstrs.push_back(MIB);
8437
8438 assert(InstrIdxForVirtReg.empty());
8439 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8440
8441 return NewVR;
8442}
8443
8444/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
8445/// instructions with an additional negation of the accumulator
8449 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8450 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8451 assert(IdxMulOpd == 1);
8452
8453 Register NewVR =
8454 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8455 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8456 FMAInstKind::Accumulator, &NewVR);
8457}
8458
8459/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
8460/// instructions.
8461///
8462/// \see genFusedMultiply
8466 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
8467 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
 8468 FMAInstKind::Indexed);
 8469}
8470
8471/// genFusedMultiplyIdxNeg - Helper to generate indexed fused multiply
8472/// accumulate instructions with an additional negation of the accumulator
8476 DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
8477 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
8478 assert(IdxMulOpd == 1);
8479
8480 Register NewVR =
8481 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
8482
8483 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
8484 FMAInstKind::Indexed, &NewVR);
8485}
8486
8487/// genMaddR - Generate madd instruction and combine mul and add using
8488/// an extra virtual register
8489/// Example - an ADD intermediate needs to be stored in a register:
8490/// MUL I=A,B,0
8491/// ADD R,I,Imm
8492/// ==> ORR V, ZR, Imm
8493/// ==> MADD R,A,B,V
8494/// \param MF Containing MachineFunction
8495/// \param MRI Register information
8496/// \param TII Target information
8497/// \param Root is the ADD instruction
8498/// \param [out] InsInstrs is a vector of machine instructions and will
8499/// contain the generated madd instruction
8500/// \param IdxMulOpd is index of operand in Root that is the result of
8501/// the MUL. In the example above IdxMulOpd is 1.
8502/// \param MaddOpc the opcode of the madd instruction
8503/// \param VR is a virtual register that holds the value of an ADD operand
8504/// (V in the example above).
8505/// \param RC Register class of operands
8506static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
 8507 const TargetInstrInfo *TII, MachineInstr &Root,
 8508 SmallVectorImpl<MachineInstr *> &InsInstrs,
 8509 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
8510 const TargetRegisterClass *RC) {
8511 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
8512
8513 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
8514 Register ResultReg = Root.getOperand(0).getReg();
8515 Register SrcReg0 = MUL->getOperand(1).getReg();
8516 bool Src0IsKill = MUL->getOperand(1).isKill();
8517 Register SrcReg1 = MUL->getOperand(2).getReg();
8518 bool Src1IsKill = MUL->getOperand(2).isKill();
8519
8520 if (ResultReg.isVirtual())
8521 MRI.constrainRegClass(ResultReg, RC);
8522 if (SrcReg0.isVirtual())
8523 MRI.constrainRegClass(SrcReg0, RC);
8524 if (SrcReg1.isVirtual())
8525 MRI.constrainRegClass(SrcReg1, RC);
 8526 if (Register(VR).isVirtual())
 8527 MRI.constrainRegClass(VR, RC);
8528
8530 BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
8531 .addReg(SrcReg0, getKillRegState(Src0IsKill))
8532 .addReg(SrcReg1, getKillRegState(Src1IsKill))
8533 .addReg(VR);
8534 // Insert the MADD
8535 InsInstrs.push_back(MIB);
8536 return MUL;
8537}
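// Illustrative sketch (not part of this file; maddri_shape is a made-up
// name): the scalar shape routed through genMaddR by the MULADDWI/MULSUBWI
// patterns -- the immediate is first materialized into a register.
static inline unsigned long long maddri_shape(unsigned long long A,
                                              unsigned long long B) {
  return A * B + 42; // ==> mov xV, #42 ; madd Xd, Xa, Xb, xV
}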
8538
8539/// Do the following transformation
8540/// A - (B + C) ==> (A - B) - C
8541/// A - (B + C) ==> (A - C) - B
8542static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
 8543 const TargetInstrInfo *TII, MachineInstr &Root,
 8544 SmallVectorImpl<MachineInstr *> &InsInstrs,
 8545 SmallVectorImpl<MachineInstr *> &DelInstrs,
 8546 unsigned IdxOpd1,
8547 DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
8548 assert(IdxOpd1 == 1 || IdxOpd1 == 2);
8549 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
8550 MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
8551
8552 Register ResultReg = Root.getOperand(0).getReg();
8553 Register RegA = Root.getOperand(1).getReg();
8554 bool RegAIsKill = Root.getOperand(1).isKill();
8555 Register RegB = AddMI->getOperand(IdxOpd1).getReg();
8556 bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
8557 Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
8558 bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
8559 Register NewVR =
8560 MRI.createVirtualRegister(MRI.getRegClass(Root.getOperand(2).getReg()));
8561
8562 unsigned Opcode = Root.getOpcode();
8563 if (Opcode == AArch64::SUBSWrr)
8564 Opcode = AArch64::SUBWrr;
8565 else if (Opcode == AArch64::SUBSXrr)
8566 Opcode = AArch64::SUBXrr;
8567 else
8568 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
8569 "Unexpected instruction opcode.");
8570
8571 uint32_t Flags = Root.mergeFlagsWith(*AddMI);
8572 Flags &= ~MachineInstr::NoSWrap;
8573 Flags &= ~MachineInstr::NoUWrap;
8574
8575 MachineInstrBuilder MIB1 =
8576 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
8577 .addReg(RegA, getKillRegState(RegAIsKill))
8578 .addReg(RegB, getKillRegState(RegBIsKill))
8579 .setMIFlags(Flags);
8580 MachineInstrBuilder MIB2 =
8581 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
8582 .addReg(NewVR, getKillRegState(true))
8583 .addReg(RegC, getKillRegState(RegCIsKill))
8584 .setMIFlags(Flags);
8585
8586 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8587 InsInstrs.push_back(MIB1);
8588 InsInstrs.push_back(MIB2);
8589 DelInstrs.push_back(AddMI);
8590 DelInstrs.push_back(&Root);
8591}
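// Worked example for the flag clearing above (i32, signed): with A = -2,
// B = INT_MAX and C = -INT_MAX, the original A - (B + C) = -2 overflows at no
// step, but the intermediate A - B of the rewritten (A - B) - C does
// overflow, so nsw/nuw merged from Root and AddMI must not be kept on the new
// SUBs.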
8592
8593unsigned AArch64InstrInfo::getReduceOpcodeForAccumulator(
8594 unsigned int AccumulatorOpCode) const {
8595 switch (AccumulatorOpCode) {
8596 case AArch64::UABALB_ZZZ_D:
8597 case AArch64::SABALB_ZZZ_D:
8598 case AArch64::UABALT_ZZZ_D:
8599 case AArch64::SABALT_ZZZ_D:
8600 return AArch64::ADD_ZZZ_D;
8601 case AArch64::UABALB_ZZZ_H:
8602 case AArch64::SABALB_ZZZ_H:
8603 case AArch64::UABALT_ZZZ_H:
8604 case AArch64::SABALT_ZZZ_H:
8605 return AArch64::ADD_ZZZ_H;
8606 case AArch64::UABALB_ZZZ_S:
8607 case AArch64::SABALB_ZZZ_S:
8608 case AArch64::UABALT_ZZZ_S:
8609 case AArch64::SABALT_ZZZ_S:
8610 return AArch64::ADD_ZZZ_S;
8611 case AArch64::UABALv16i8_v8i16:
8612 case AArch64::SABALv8i8_v8i16:
8613 case AArch64::SABAv8i16:
8614 case AArch64::UABAv8i16:
8615 return AArch64::ADDv8i16;
8616 case AArch64::SABALv2i32_v2i64:
8617 case AArch64::UABALv2i32_v2i64:
8618 case AArch64::SABALv4i32_v2i64:
8619 return AArch64::ADDv2i64;
8620 case AArch64::UABALv4i16_v4i32:
8621 case AArch64::SABALv4i16_v4i32:
8622 case AArch64::SABALv8i16_v4i32:
8623 case AArch64::SABAv4i32:
8624 case AArch64::UABAv4i32:
8625 return AArch64::ADDv4i32;
8626 case AArch64::UABALv4i32_v2i64:
8627 return AArch64::ADDv2i64;
8628 case AArch64::UABALv8i16_v4i32:
8629 return AArch64::ADDv4i32;
8630 case AArch64::UABALv8i8_v8i16:
8631 case AArch64::SABALv16i8_v8i16:
8632 return AArch64::ADDv8i16;
8633 case AArch64::UABAv16i8:
8634 case AArch64::SABAv16i8:
8635 return AArch64::ADDv16i8;
8636 case AArch64::UABAv4i16:
8637 case AArch64::SABAv4i16:
8638 return AArch64::ADDv4i16;
8639 case AArch64::UABAv2i32:
8640 case AArch64::SABAv2i32:
8641 return AArch64::ADDv2i32;
8642 case AArch64::UABAv8i8:
8643 case AArch64::SABAv8i8:
8644 return AArch64::ADDv8i8;
8645 default:
8646 llvm_unreachable("Unknown accumulator opcode");
8647 }
8648}
8649
8650/// When getMachineCombinerPatterns() finds potential patterns,
8651/// this function generates the instructions that could replace the
8652/// original code sequence
8653void AArch64InstrInfo::genAlternativeCodeSequence(
8654 MachineInstr &Root, unsigned Pattern,
8657 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
8658 MachineBasicBlock &MBB = *Root.getParent();
8659 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8660 MachineFunction &MF = *MBB.getParent();
8661 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
8662
8663 MachineInstr *MUL = nullptr;
8664 const TargetRegisterClass *RC;
8665 unsigned Opc;
8666 switch (Pattern) {
8667 default:
8668 // Reassociate instructions.
8669 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
8670 DelInstrs, InstrIdxForVirtReg);
8671 return;
8673 // A - (B + C)
8674 // ==> (A - B) - C
8675 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
8676 InstrIdxForVirtReg);
8677 return;
8679 // A - (B + C)
8680 // ==> (A - C) - B
8681 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
8682 InstrIdxForVirtReg);
8683 return;
8686 // MUL I=A,B,0
8687 // ADD R,I,C
8688 // ==> MADD R,A,B,C
8689 // --- Create(MADD);
8691 Opc = AArch64::MADDWrrr;
8692 RC = &AArch64::GPR32RegClass;
8693 } else {
8694 Opc = AArch64::MADDXrrr;
8695 RC = &AArch64::GPR64RegClass;
8696 }
8697 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8698 break;
8701 // MUL I=A,B,0
8702 // ADD R,C,I
8703 // ==> MADD R,A,B,C
8704 // --- Create(MADD);
8706 Opc = AArch64::MADDWrrr;
8707 RC = &AArch64::GPR32RegClass;
8708 } else {
8709 Opc = AArch64::MADDXrrr;
8710 RC = &AArch64::GPR64RegClass;
8711 }
8712 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8713 break;
8718 // MUL I=A,B,0
8719 // ADD/SUB R,I,Imm
8720 // ==> MOV V, Imm/-Imm
8721 // ==> MADD R,A,B,V
8722 // --- Create(MADD);
8723 const TargetRegisterClass *RC;
8724 unsigned BitSize, MovImm;
8727 MovImm = AArch64::MOVi32imm;
8728 RC = &AArch64::GPR32spRegClass;
8729 BitSize = 32;
8730 Opc = AArch64::MADDWrrr;
8731 RC = &AArch64::GPR32RegClass;
8732 } else {
8733 MovImm = AArch64::MOVi64imm;
8734 RC = &AArch64::GPR64spRegClass;
8735 BitSize = 64;
8736 Opc = AArch64::MADDXrrr;
8737 RC = &AArch64::GPR64RegClass;
8738 }
8739 Register NewVR = MRI.createVirtualRegister(RC);
8740 uint64_t Imm = Root.getOperand(2).getImm();
8741
8742 if (Root.getOperand(3).isImm()) {
8743 unsigned Val = Root.getOperand(3).getImm();
8744 Imm = Imm << Val;
8745 }
8746 bool IsSub = Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1 ||
8748 uint64_t UImm = SignExtend64(IsSub ? -Imm : Imm, BitSize);
8749 // Check that the immediate can be composed via a single instruction.
8751 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
8752 if (Insn.size() != 1)
8753 return;
8754 MachineInstrBuilder MIB1 =
8755 BuildMI(MF, MIMetadata(Root), TII->get(MovImm), NewVR)
8756 .addImm(IsSub ? -Imm : Imm);
8757 InsInstrs.push_back(MIB1);
8758 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8759 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
8760 break;
8761 }
8764 // MUL I=A,B,0
8765 // SUB R,I, C
8766 // ==> SUB V, 0, C
8767 // ==> MADD R,A,B,V // = -C + A*B
8768 // --- Create(MADD);
8769 const TargetRegisterClass *SubRC;
8770 unsigned SubOpc, ZeroReg;
8772 SubOpc = AArch64::SUBWrr;
8773 SubRC = &AArch64::GPR32spRegClass;
8774 ZeroReg = AArch64::WZR;
8775 Opc = AArch64::MADDWrrr;
8776 RC = &AArch64::GPR32RegClass;
8777 } else {
8778 SubOpc = AArch64::SUBXrr;
8779 SubRC = &AArch64::GPR64spRegClass;
8780 ZeroReg = AArch64::XZR;
8781 Opc = AArch64::MADDXrrr;
8782 RC = &AArch64::GPR64RegClass;
8783 }
8784 Register NewVR = MRI.createVirtualRegister(SubRC);
8785 // SUB NewVR, 0, C
8786 MachineInstrBuilder MIB1 =
8787 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
8788 .addReg(ZeroReg)
8789 .add(Root.getOperand(2));
8790 InsInstrs.push_back(MIB1);
8791 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8792 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
8793 break;
8794 }
8797 // MUL I=A,B,0
8798 // SUB R,C,I
8799 // ==> MSUB R,A,B,C (computes C - A*B)
8800 // --- Create(MSUB);
8802 Opc = AArch64::MSUBWrrr;
8803 RC = &AArch64::GPR32RegClass;
8804 } else {
8805 Opc = AArch64::MSUBXrrr;
8806 RC = &AArch64::GPR64RegClass;
8807 }
8808 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8809 break;
8811 Opc = AArch64::MLAv8i8;
8812 RC = &AArch64::FPR64RegClass;
8813 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8814 break;
8816 Opc = AArch64::MLAv8i8;
8817 RC = &AArch64::FPR64RegClass;
8818 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8819 break;
8821 Opc = AArch64::MLAv16i8;
8822 RC = &AArch64::FPR128RegClass;
8823 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8824 break;
8826 Opc = AArch64::MLAv16i8;
8827 RC = &AArch64::FPR128RegClass;
8828 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8829 break;
8831 Opc = AArch64::MLAv4i16;
8832 RC = &AArch64::FPR64RegClass;
8833 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8834 break;
8836 Opc = AArch64::MLAv4i16;
8837 RC = &AArch64::FPR64RegClass;
8838 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8839 break;
8841 Opc = AArch64::MLAv8i16;
8842 RC = &AArch64::FPR128RegClass;
8843 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8844 break;
8846 Opc = AArch64::MLAv8i16;
8847 RC = &AArch64::FPR128RegClass;
8848 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8849 break;
8851 Opc = AArch64::MLAv2i32;
8852 RC = &AArch64::FPR64RegClass;
8853 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8854 break;
8856 Opc = AArch64::MLAv2i32;
8857 RC = &AArch64::FPR64RegClass;
8858 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8859 break;
8861 Opc = AArch64::MLAv4i32;
8862 RC = &AArch64::FPR128RegClass;
8863 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8864 break;
8866 Opc = AArch64::MLAv4i32;
8867 RC = &AArch64::FPR128RegClass;
8868 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8869 break;
8870
8872 Opc = AArch64::MLAv8i8;
8873 RC = &AArch64::FPR64RegClass;
8874 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8875 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
8876 RC);
8877 break;
8879 Opc = AArch64::MLSv8i8;
8880 RC = &AArch64::FPR64RegClass;
8881 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8882 break;
8884 Opc = AArch64::MLAv16i8;
8885 RC = &AArch64::FPR128RegClass;
8886 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8887 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
8888 RC);
8889 break;
8891 Opc = AArch64::MLSv16i8;
8892 RC = &AArch64::FPR128RegClass;
8893 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8894 break;
8896 Opc = AArch64::MLAv4i16;
8897 RC = &AArch64::FPR64RegClass;
8898 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8899 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
8900 RC);
8901 break;
8903 Opc = AArch64::MLSv4i16;
8904 RC = &AArch64::FPR64RegClass;
8905 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8906 break;
8908 Opc = AArch64::MLAv8i16;
8909 RC = &AArch64::FPR128RegClass;
8910 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8911 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
8912 RC);
8913 break;
8915 Opc = AArch64::MLSv8i16;
8916 RC = &AArch64::FPR128RegClass;
8917 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8918 break;
8920 Opc = AArch64::MLAv2i32;
8921 RC = &AArch64::FPR64RegClass;
8922 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8923 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
8924 RC);
8925 break;
8927 Opc = AArch64::MLSv2i32;
8928 RC = &AArch64::FPR64RegClass;
8929 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8930 break;
8932 Opc = AArch64::MLAv4i32;
8933 RC = &AArch64::FPR128RegClass;
8934 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8935 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
8936 RC);
8937 break;
8939 Opc = AArch64::MLSv4i32;
8940 RC = &AArch64::FPR128RegClass;
8941 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8942 break;
8943
8945 Opc = AArch64::MLAv4i16_indexed;
8946 RC = &AArch64::FPR64RegClass;
8947 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8948 break;
8950 Opc = AArch64::MLAv4i16_indexed;
8951 RC = &AArch64::FPR64RegClass;
8952 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8953 break;
8955 Opc = AArch64::MLAv8i16_indexed;
8956 RC = &AArch64::FPR128RegClass;
8957 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8958 break;
8960 Opc = AArch64::MLAv8i16_indexed;
8961 RC = &AArch64::FPR128RegClass;
8962 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8963 break;
8965 Opc = AArch64::MLAv2i32_indexed;
8966 RC = &AArch64::FPR64RegClass;
8967 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8968 break;
8970 Opc = AArch64::MLAv2i32_indexed;
8971 RC = &AArch64::FPR64RegClass;
8972 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8973 break;
8975 Opc = AArch64::MLAv4i32_indexed;
8976 RC = &AArch64::FPR128RegClass;
8977 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8978 break;
8980 Opc = AArch64::MLAv4i32_indexed;
8981 RC = &AArch64::FPR128RegClass;
8982 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8983 break;
8984
8986 Opc = AArch64::MLAv4i16_indexed;
8987 RC = &AArch64::FPR64RegClass;
8988 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
8989 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
8990 RC);
8991 break;
8993 Opc = AArch64::MLSv4i16_indexed;
8994 RC = &AArch64::FPR64RegClass;
8995 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8996 break;
8998 Opc = AArch64::MLAv8i16_indexed;
8999 RC = &AArch64::FPR128RegClass;
9000 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9001 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
9002 RC);
9003 break;
9005 Opc = AArch64::MLSv8i16_indexed;
9006 RC = &AArch64::FPR128RegClass;
9007 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9008 break;
9010 Opc = AArch64::MLAv2i32_indexed;
9011 RC = &AArch64::FPR64RegClass;
9012 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9013 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
9014 RC);
9015 break;
9017 Opc = AArch64::MLSv2i32_indexed;
9018 RC = &AArch64::FPR64RegClass;
9019 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9020 break;
9022 Opc = AArch64::MLAv4i32_indexed;
9023 RC = &AArch64::FPR128RegClass;
9024 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
9025 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
9026 RC);
9027 break;
9029 Opc = AArch64::MLSv4i32_indexed;
9030 RC = &AArch64::FPR128RegClass;
9031 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9032 break;
9033
9034 // Floating Point Support
9036 Opc = AArch64::FMADDHrrr;
9037 RC = &AArch64::FPR16RegClass;
9038 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9039 break;
9041 Opc = AArch64::FMADDSrrr;
9042 RC = &AArch64::FPR32RegClass;
9043 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9044 break;
9046 Opc = AArch64::FMADDDrrr;
9047 RC = &AArch64::FPR64RegClass;
9048 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9049 break;
9050
9052 Opc = AArch64::FMADDHrrr;
9053 RC = &AArch64::FPR16RegClass;
9054 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9055 break;
9057 Opc = AArch64::FMADDSrrr;
9058 RC = &AArch64::FPR32RegClass;
9059 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9060 break;
9062 Opc = AArch64::FMADDDrrr;
9063 RC = &AArch64::FPR64RegClass;
9064 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9065 break;
9066
9068 Opc = AArch64::FMLAv1i32_indexed;
9069 RC = &AArch64::FPR32RegClass;
9070 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9072 break;
9074 Opc = AArch64::FMLAv1i32_indexed;
9075 RC = &AArch64::FPR32RegClass;
9076 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9078 break;
9079
9081 Opc = AArch64::FMLAv1i64_indexed;
9082 RC = &AArch64::FPR64RegClass;
9083 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9085 break;
9087 Opc = AArch64::FMLAv1i64_indexed;
9088 RC = &AArch64::FPR64RegClass;
9089 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9091 break;
9092
9094 RC = &AArch64::FPR64RegClass;
9095 Opc = AArch64::FMLAv4i16_indexed;
9096 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9098 break;
9100 RC = &AArch64::FPR64RegClass;
9101 Opc = AArch64::FMLAv4f16;
9102 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9104 break;
9106 RC = &AArch64::FPR64RegClass;
9107 Opc = AArch64::FMLAv4i16_indexed;
9108 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9110 break;
9112 RC = &AArch64::FPR64RegClass;
9113 Opc = AArch64::FMLAv4f16;
9114 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9116 break;
9117
9120 RC = &AArch64::FPR64RegClass;
9122 Opc = AArch64::FMLAv2i32_indexed;
9123 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9125 } else {
9126 Opc = AArch64::FMLAv2f32;
9127 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9129 }
9130 break;
9133 RC = &AArch64::FPR64RegClass;
9135 Opc = AArch64::FMLAv2i32_indexed;
9136 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9138 } else {
9139 Opc = AArch64::FMLAv2f32;
9140 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9142 }
9143 break;
9144
9146 RC = &AArch64::FPR128RegClass;
9147 Opc = AArch64::FMLAv8i16_indexed;
9148 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9150 break;
9152 RC = &AArch64::FPR128RegClass;
9153 Opc = AArch64::FMLAv8f16;
9154 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9156 break;
9158 RC = &AArch64::FPR128RegClass;
9159 Opc = AArch64::FMLAv8i16_indexed;
9160 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9162 break;
9164 RC = &AArch64::FPR128RegClass;
9165 Opc = AArch64::FMLAv8f16;
9166 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9168 break;
9169
9172 RC = &AArch64::FPR128RegClass;
9174 Opc = AArch64::FMLAv2i64_indexed;
9175 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9177 } else {
9178 Opc = AArch64::FMLAv2f64;
9179 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9181 }
9182 break;
9185 RC = &AArch64::FPR128RegClass;
9187 Opc = AArch64::FMLAv2i64_indexed;
9188 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9190 } else {
9191 Opc = AArch64::FMLAv2f64;
9192 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9194 }
9195 break;
9196
9199 RC = &AArch64::FPR128RegClass;
9201 Opc = AArch64::FMLAv4i32_indexed;
9202 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9204 } else {
9205 Opc = AArch64::FMLAv4f32;
9206 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9208 }
9209 break;
9210
9213 RC = &AArch64::FPR128RegClass;
9215 Opc = AArch64::FMLAv4i32_indexed;
9216 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9218 } else {
9219 Opc = AArch64::FMLAv4f32;
9220 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9222 }
9223 break;
9224
9226 Opc = AArch64::FNMSUBHrrr;
9227 RC = &AArch64::FPR16RegClass;
9228 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9229 break;
9231 Opc = AArch64::FNMSUBSrrr;
9232 RC = &AArch64::FPR32RegClass;
9233 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9234 break;
9236 Opc = AArch64::FNMSUBDrrr;
9237 RC = &AArch64::FPR64RegClass;
9238 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9239 break;
9240
9242 Opc = AArch64::FNMADDHrrr;
9243 RC = &AArch64::FPR16RegClass;
9244 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9245 break;
9247 Opc = AArch64::FNMADDSrrr;
9248 RC = &AArch64::FPR32RegClass;
9249 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9250 break;
9252 Opc = AArch64::FNMADDDrrr;
9253 RC = &AArch64::FPR64RegClass;
9254 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
9255 break;
9256
9258 Opc = AArch64::FMSUBHrrr;
9259 RC = &AArch64::FPR16RegClass;
9260 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9261 break;
9263 Opc = AArch64::FMSUBSrrr;
9264 RC = &AArch64::FPR32RegClass;
9265 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9266 break;
9268 Opc = AArch64::FMSUBDrrr;
9269 RC = &AArch64::FPR64RegClass;
9270 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
9271 break;
9272
9274 Opc = AArch64::FMLSv1i32_indexed;
9275 RC = &AArch64::FPR32RegClass;
9276 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9278 break;
9279
9281 Opc = AArch64::FMLSv1i64_indexed;
9282 RC = &AArch64::FPR64RegClass;
9283 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9285 break;
9286
9289 RC = &AArch64::FPR64RegClass;
9290 Register NewVR = MRI.createVirtualRegister(RC);
9291 MachineInstrBuilder MIB1 =
9292 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
9293 .add(Root.getOperand(2));
9294 InsInstrs.push_back(MIB1);
9295 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9297 Opc = AArch64::FMLAv4f16;
9298 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9299 FMAInstKind::Accumulator, &NewVR);
9300 } else {
9301 Opc = AArch64::FMLAv4i16_indexed;
9302 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9303 FMAInstKind::Indexed, &NewVR);
9304 }
9305 break;
9306 }
9308 RC = &AArch64::FPR64RegClass;
9309 Opc = AArch64::FMLSv4f16;
9310 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9312 break;
9314 RC = &AArch64::FPR64RegClass;
9315 Opc = AArch64::FMLSv4i16_indexed;
9316 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9318 break;
9319
9322 RC = &AArch64::FPR64RegClass;
9324 Opc = AArch64::FMLSv2i32_indexed;
9325 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9327 } else {
9328 Opc = AArch64::FMLSv2f32;
9329 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9331 }
9332 break;
9333
9336 RC = &AArch64::FPR128RegClass;
9337 Register NewVR = MRI.createVirtualRegister(RC);
9338 MachineInstrBuilder MIB1 =
9339 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
9340 .add(Root.getOperand(2));
9341 InsInstrs.push_back(MIB1);
9342 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9344 Opc = AArch64::FMLAv8f16;
9345 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9346 FMAInstKind::Accumulator, &NewVR);
9347 } else {
9348 Opc = AArch64::FMLAv8i16_indexed;
9349 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9350 FMAInstKind::Indexed, &NewVR);
9351 }
9352 break;
9353 }
9355 RC = &AArch64::FPR128RegClass;
9356 Opc = AArch64::FMLSv8f16;
9357 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9359 break;
9361 RC = &AArch64::FPR128RegClass;
9362 Opc = AArch64::FMLSv8i16_indexed;
9363 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9365 break;
9366
9369 RC = &AArch64::FPR128RegClass;
9371 Opc = AArch64::FMLSv2i64_indexed;
9372 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9374 } else {
9375 Opc = AArch64::FMLSv2f64;
9376 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9378 }
9379 break;
9380
9383 RC = &AArch64::FPR128RegClass;
9385 Opc = AArch64::FMLSv4i32_indexed;
9386 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9388 } else {
9389 Opc = AArch64::FMLSv4f32;
9390 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
9392 }
9393 break;
9396 RC = &AArch64::FPR64RegClass;
9397 Register NewVR = MRI.createVirtualRegister(RC);
9398 MachineInstrBuilder MIB1 =
9399 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
9400 .add(Root.getOperand(2));
9401 InsInstrs.push_back(MIB1);
9402 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9404 Opc = AArch64::FMLAv2i32_indexed;
9405 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9406 FMAInstKind::Indexed, &NewVR);
9407 } else {
9408 Opc = AArch64::FMLAv2f32;
9409 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9410 FMAInstKind::Accumulator, &NewVR);
9411 }
9412 break;
9413 }
9416 RC = &AArch64::FPR128RegClass;
9417 Register NewVR = MRI.createVirtualRegister(RC);
9418 MachineInstrBuilder MIB1 =
9419 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
9420 .add(Root.getOperand(2));
9421 InsInstrs.push_back(MIB1);
9422 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9424 Opc = AArch64::FMLAv4i32_indexed;
9425 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9426 FMAInstKind::Indexed, &NewVR);
9427 } else {
9428 Opc = AArch64::FMLAv4f32;
9429 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9430 FMAInstKind::Accumulator, &NewVR);
9431 }
9432 break;
9433 }
9436 RC = &AArch64::FPR128RegClass;
9437 Register NewVR = MRI.createVirtualRegister(RC);
9438 MachineInstrBuilder MIB1 =
9439 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
9440 .add(Root.getOperand(2));
9441 InsInstrs.push_back(MIB1);
9442 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
9444 Opc = AArch64::FMLAv2i64_indexed;
9445 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9446 FMAInstKind::Indexed, &NewVR);
9447 } else {
9448 Opc = AArch64::FMLAv2f64;
9449 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
9450 FMAInstKind::Accumulator, &NewVR);
9451 }
9452 break;
9453 }
9456 unsigned IdxDupOp =
9458 : 2;
9459 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
9460 &AArch64::FPR128RegClass, MRI);
9461 break;
9462 }
9465 unsigned IdxDupOp =
9467 : 2;
9468 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
9469 &AArch64::FPR128RegClass, MRI);
9470 break;
9471 }
9474 unsigned IdxDupOp =
9476 : 2;
9477 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
9478 &AArch64::FPR128_loRegClass, MRI);
9479 break;
9480 }
9483 unsigned IdxDupOp =
9485 : 2;
9486 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
9487 &AArch64::FPR128RegClass, MRI);
9488 break;
9489 }
9492 unsigned IdxDupOp =
9494 : 2;
9495 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
9496 &AArch64::FPR128_loRegClass, MRI);
9497 break;
9498 }
9500 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
9501 break;
9502 }
9504 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9505 Pattern, 4);
9506 break;
9507 }
9509 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9510 Pattern, 8);
9511 break;
9512 }
9514 generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg,
9515 Pattern, 16);
9516 break;
9517 }
9518
9519 } // end switch (Pattern)
9520 // Record MUL and ADD/SUB for deletion
9521 if (MUL)
9522 DelInstrs.push_back(MUL);
9523 DelInstrs.push_back(&Root);
9524
9525 // Set the flags on the inserted instructions to be the merged flags of the
9526 // instructions that we have combined.
9527 uint32_t Flags = Root.getFlags();
9528 if (MUL)
9529 Flags = Root.mergeFlagsWith(*MUL);
9530 for (auto *MI : InsInstrs)
9531 MI->setFlags(Flags);
9532}
9533
9534/// Replace a csinc-branch sequence with a simple conditional branch
9535///
9536/// Examples:
9537/// 1. \code
9538/// csinc w9, wzr, wzr, <condition code>
9539/// tbnz w9, #0, 0x44
9540/// \endcode
9541/// to
9542/// \code
9543/// b.<inverted condition code>
9544/// \endcode
9545///
9546/// 2. \code
9547/// csinc w9, wzr, wzr, <condition code>
9548/// tbz w9, #0, 0x44
9549/// \endcode
9550/// to
9551/// \code
9552/// b.<condition code>
9553/// \endcode
9554///
9555/// Replace a compare-and-branch sequence with a TBZ/TBNZ instruction when the
9556/// compare's constant operand is a power of 2.
9557///
9558/// Examples:
9559/// \code
9560/// and w8, w8, #0x400
9561/// cbnz w8, L1
9562/// \endcode
9563/// to
9564/// \code
9565/// tbnz w8, #10, L1
9566/// \endcode
9567///
9568/// \param MI Conditional Branch
9569/// \return True when the simple conditional branch is generated
9570///
9571bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
9572 bool IsNegativeBranch = false;
9573 bool IsTestAndBranch = false;
9574 unsigned TargetBBInMI = 0;
9575 switch (MI.getOpcode()) {
9576 default:
9577 llvm_unreachable("Unknown branch instruction?");
9578 case AArch64::Bcc:
9579 case AArch64::CBWPri:
9580 case AArch64::CBXPri:
9581 case AArch64::CBBAssertExt:
9582 case AArch64::CBHAssertExt:
9583 case AArch64::CBWPrr:
9584 case AArch64::CBXPrr:
9585 return false;
9586 case AArch64::CBZW:
9587 case AArch64::CBZX:
9588 TargetBBInMI = 1;
9589 break;
9590 case AArch64::CBNZW:
9591 case AArch64::CBNZX:
9592 TargetBBInMI = 1;
9593 IsNegativeBranch = true;
9594 break;
9595 case AArch64::TBZW:
9596 case AArch64::TBZX:
9597 TargetBBInMI = 2;
9598 IsTestAndBranch = true;
9599 break;
9600 case AArch64::TBNZW:
9601 case AArch64::TBNZX:
9602 TargetBBInMI = 2;
9603 IsNegativeBranch = true;
9604 IsTestAndBranch = true;
9605 break;
9606 }
9607 // So we increment a zero register and test for bits other
9608 // than bit 0? Conservatively bail out in case the verifier
9609 // missed this case.
9610 if (IsTestAndBranch && MI.getOperand(1).getImm())
9611 return false;
9612
9613 // Find Definition.
9614 assert(MI.getParent() && "Incomplete machine instruction\n");
9615 MachineBasicBlock *MBB = MI.getParent();
9616 MachineFunction *MF = MBB->getParent();
9617 MachineRegisterInfo *MRI = &MF->getRegInfo();
9618 Register VReg = MI.getOperand(0).getReg();
9619 if (!VReg.isVirtual())
9620 return false;
9621
9622 MachineInstr *DefMI = MRI->getVRegDef(VReg);
9623
9624 // Look through COPY instructions to find definition.
9625 while (DefMI->isCopy()) {
9626 Register CopyVReg = DefMI->getOperand(1).getReg();
9627 if (!MRI->hasOneNonDBGUse(CopyVReg))
9628 return false;
9629 if (!MRI->hasOneDef(CopyVReg))
9630 return false;
9631 DefMI = MRI->getVRegDef(CopyVReg);
9632 }
9633
9634 switch (DefMI->getOpcode()) {
9635 default:
9636 return false;
9637 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
9638 case AArch64::ANDWri:
9639 case AArch64::ANDXri: {
9640 if (IsTestAndBranch)
9641 return false;
9642 if (DefMI->getParent() != MBB)
9643 return false;
9644 if (!MRI->hasOneNonDBGUse(VReg))
9645 return false;
9646
9647 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
9648 uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
9649 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
9650 if (!isPowerOf2_64(Mask))
9651 return false;
9652
9653 MachineOperand &MO = DefMI->getOperand(1);
9654 Register NewReg = MO.getReg();
9655 if (!NewReg.isVirtual())
9656 return false;
9657
9658 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
9659
9660 MachineBasicBlock &RefToMBB = *MBB;
9661 MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
9662 DebugLoc DL = MI.getDebugLoc();
9663 unsigned Imm = Log2_64(Mask);
9664 unsigned Opc = (Imm < 32)
9665 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
9666 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
9667 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
9668 .addReg(NewReg)
9669 .addImm(Imm)
9670 .addMBB(TBB);
9671 // Register lives on into the new TBZ/TBNZ now.
9672 MO.setIsKill(false);
9673
9674 // For immediates smaller than 32, we need to use the 32-bit
9675 // variant (W) in all cases, because the 64-bit variant cannot
9676 // encode them.
9677 // Therefore, if the input register is 64-bit, we need to take its
9678 // 32-bit sub-register.
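// For example (rough sketch): "and x8, x2, #0x10" followed by "cbnz x8, L"
// can become "tbnz w2, #4, L", where w2 is the 32-bit sub-register of x2.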
9679 if (!Is32Bit && Imm < 32)
9680 NewMI->getOperand(0).setSubReg(AArch64::sub_32);
9681 MI.eraseFromParent();
9682 return true;
9683 }
9684 // Look for CSINC
9685 case AArch64::CSINCWr:
9686 case AArch64::CSINCXr: {
9687 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
9688 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
9689 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
9690 DefMI->getOperand(2).getReg() == AArch64::XZR))
9691 return false;
9692
9693 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
9694 true) != -1)
9695 return false;
9696
9697 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
9698 // Convert only when the condition code is not modified between
9699 // the CSINC and the branch. The CC may be used by other
9700 // instructions in between.
9702 return false;
9703 MachineBasicBlock &RefToMBB = *MBB;
9704 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
9705 DebugLoc DL = MI.getDebugLoc();
9706 if (IsNegativeBranch)
9707 CC = AArch64CC::getInvertedCondCode(CC);
9708 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
9709 MI.eraseFromParent();
9710 return true;
9711 }
9712 }
9713}
9714
9715std::pair<unsigned, unsigned>
9716AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
9717 const unsigned Mask = AArch64II::MO_FRAGMENT;
9718 return std::make_pair(TF & Mask, TF & ~Mask);
9719}
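// Rough illustration of the split above (flag names only, not exact bit
// values): an operand carrying MO_PAGEOFF | MO_GOT | MO_NC decomposes into the
// direct "fragment" part MO_PAGEOFF and the bitmask part MO_GOT | MO_NC,
// matching the two flag tables returned by the functions below.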
9720
9721ArrayRef<std::pair<unsigned, const char *>>
9722AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
9723 using namespace AArch64II;
9724
9725 static const std::pair<unsigned, const char *> TargetFlags[] = {
9726 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
9727 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
9728 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
9729 {MO_HI12, "aarch64-hi12"}};
9730 return ArrayRef(TargetFlags);
9731}
9732
9733ArrayRef<std::pair<unsigned, const char *>>
9734AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
9735 using namespace AArch64II;
9736
9737 static const std::pair<unsigned, const char *> TargetFlags[] = {
9738 {MO_COFFSTUB, "aarch64-coffstub"},
9739 {MO_GOT, "aarch64-got"},
9740 {MO_NC, "aarch64-nc"},
9741 {MO_S, "aarch64-s"},
9742 {MO_TLS, "aarch64-tls"},
9743 {MO_DLLIMPORT, "aarch64-dllimport"},
9744 {MO_PREL, "aarch64-prel"},
9745 {MO_TAGGED, "aarch64-tagged"},
9746 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
9747 };
9748 return ArrayRef(TargetFlags);
9749}
9750
9751ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
9752AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
9753 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9754 {{MOSuppressPair, "aarch64-suppress-pair"},
9755 {MOStridedAccess, "aarch64-strided-access"}};
9756 return ArrayRef(TargetFlags);
9757}
9758
9759/// Constants defining how certain sequences should be outlined.
9760/// This encompasses how an outlined function should be called, and what kind of
9761/// frame should be emitted for that outlined function.
9762///
9763/// \p MachineOutlinerDefault implies that the function should be called with
9764/// a save and restore of LR to the stack.
9765///
9766/// That is,
9767///
9768/// I1 Save LR OUTLINED_FUNCTION:
9769/// I2 --> BL OUTLINED_FUNCTION I1
9770/// I3 Restore LR I2
9771/// I3
9772/// RET
9773///
9774/// * Call construction overhead: 3 (save + BL + restore)
9775/// * Frame construction overhead: 1 (ret)
9776/// * Requires stack fixups? Yes
9777///
9778/// \p MachineOutlinerTailCall implies that the function is being created from
9779/// a sequence of instructions ending in a return.
9780///
9781/// That is,
9782///
9783/// I1 OUTLINED_FUNCTION:
9784/// I2 --> B OUTLINED_FUNCTION I1
9785/// RET I2
9786/// RET
9787///
9788/// * Call construction overhead: 1 (B)
9789/// * Frame construction overhead: 0 (Return included in sequence)
9790/// * Requires stack fixups? No
9791///
9792/// \p MachineOutlinerNoLRSave implies that the function should be called using
9793/// a BL instruction, but doesn't require LR to be saved and restored. This
9794/// happens when LR is known to be dead.
9795///
9796/// That is,
9797///
9798/// I1 OUTLINED_FUNCTION:
9799/// I2 --> BL OUTLINED_FUNCTION I1
9800/// I3 I2
9801/// I3
9802/// RET
9803///
9804/// * Call construction overhead: 1 (BL)
9805/// * Frame construction overhead: 1 (RET)
9806/// * Requires stack fixups? No
9807///
9808/// \p MachineOutlinerThunk implies that the function is being created from
9809/// a sequence of instructions ending in a call. The outlined function is
9810/// called with a BL instruction, and the outlined function tail-calls the
9811/// original call destination.
9812///
9813/// That is,
9814///
9815/// I1 OUTLINED_FUNCTION:
9816/// I2 --> BL OUTLINED_FUNCTION I1
9817/// BL f I2
9818/// B f
9819/// * Call construction overhead: 1 (BL)
9820/// * Frame construction overhead: 0
9821/// * Requires stack fixups? No
9822///
9823/// \p MachineOutlinerRegSave implies that the function should be called with a
9824/// save and restore of LR to an available register. This allows us to avoid
9825/// stack fixups. Note that this outlining variant is compatible with the
9826/// NoLRSave case.
9827///
9828/// That is,
9829///
9830/// I1 Save LR OUTLINED_FUNCTION:
9831/// I2 --> BL OUTLINED_FUNCTION I1
9832/// I3 Restore LR I2
9833/// I3
9834/// RET
9835///
9836/// * Call construction overhead: 3 (save + BL + restore)
9837/// * Frame construction overhead: 1 (ret)
9838/// * Requires stack fixups? No
9839enum MachineOutlinerClass {
9840 MachineOutlinerDefault, /// Emit a save, restore, call, and return.
9841 MachineOutlinerTailCall, /// Only emit a branch.
9842 MachineOutlinerNoLRSave, /// Emit a call and return.
9843 MachineOutlinerThunk, /// Emit a call and tail-call.
9844 MachineOutlinerRegSave /// Same as default, but save to a register.
9845};
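// Illustrative call sequences for the variants above (a rough sketch; the
// exact instructions and the register picked by findRegisterToSaveLRTo, here
// assumed to be x20, vary):
//   MachineOutlinerRegSave:  mov x20, lr ; bl OUTLINED_FUNCTION ; mov lr, x20
//   MachineOutlinerDefault:  str lr, [sp, #-16]! ; bl OUTLINED_FUNCTION ; ldr lr, [sp], #16
// Either way the call site costs 12 bytes, which is where the per-candidate
// call overhead used later in getOutliningCandidateInfo comes from.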
9846
9852
9853Register
9854AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
9855 MachineFunction *MF = C.getMF();
9856 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
9857 const AArch64RegisterInfo *ARI =
9858 static_cast<const AArch64RegisterInfo *>(&TRI);
9859 // Check if there is an available register across the sequence that we can
9860 // use.
9861 for (unsigned Reg : AArch64::GPR64RegClass) {
9862 if (!ARI->isReservedReg(*MF, Reg) &&
9863 Reg != AArch64::LR && // LR is not reserved, but don't use it.
9864 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
9865 Reg != AArch64::X17 && // Ditto for X17.
9866 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
9867 C.isAvailableInsideSeq(Reg, TRI))
9868 return Reg;
9869 }
9870 return Register();
9871}
9872
9873static bool
9875 const outliner::Candidate &b) {
9876 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
9877 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
9878
9879 return MFIa->getSignReturnAddressCondition() ==
9880 MFIb->getSignReturnAddressCondition();
9881}
9882
9883static bool
9885 const outliner::Candidate &b) {
9886 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
9887 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
9888
9889 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
9890}
9891
9893 const outliner::Candidate &b) {
9894 const AArch64Subtarget &SubtargetA =
9896 const AArch64Subtarget &SubtargetB =
9897 b.getMF()->getSubtarget<AArch64Subtarget>();
9898 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
9899}
9900
9901std::optional<std::unique_ptr<outliner::OutlinedFunction>>
9902AArch64InstrInfo::getOutliningCandidateInfo(
9903 const MachineModuleInfo &MMI,
9904 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
9905 unsigned MinRepeats) const {
9906 unsigned SequenceSize = 0;
9907 for (auto &MI : RepeatedSequenceLocs[0])
9908 SequenceSize += getInstSizeInBytes(MI);
9909
9910 unsigned NumBytesToCreateFrame = 0;
9911
9912 // Avoid splitting an ADRP+ADD/LDR pair across outlined functions.
9913 // These instructions are fused together by the scheduler.
9914 // Any candidate where ADRP is the last instruction should be rejected,
9915 // as that would split the pair.
9916 MachineInstr &LastMI = RepeatedSequenceLocs[0].back();
9917 MachineInstr &FirstMI = RepeatedSequenceLocs[0].front();
9918 if (LastMI.getOpcode() == AArch64::ADRP &&
9919 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_PAGE) != 0 &&
9920 (LastMI.getOperand(1).getTargetFlags() & AArch64II::MO_GOT) != 0) {
9921 return std::nullopt;
9922 }
9923
9924 // Similarly any candidate where the first instruction is ADD/LDR with a
9925 // page offset should be rejected to avoid ADRP splitting.
9926 if ((FirstMI.getOpcode() == AArch64::ADDXri ||
9927 FirstMI.getOpcode() == AArch64::LDRXui) &&
9928 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_PAGEOFF) != 0 &&
9929 (FirstMI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) != 0) {
9930 return std::nullopt;
9931 }
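// For instance, a GOT access is normally emitted as the fused pair
//   adrp x8, :got:sym
//   ldr  x8, [x8, :got_lo12:sym]
// and the two checks above reject candidates that end with the adrp or start
// at the ldr/add, which would otherwise pull the pair into different functions.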
9932
9933 // We only allow outlining for functions having exactly matching return
9934 // address signing attributes, i.e., all share the same value for the
9935 // attribute "sign-return-address" and all share the same type of key they
9936 // are signed with.
9937 // Additionally we require all functions to simultaneously either support
9938 // v8.3a features or not. Otherwise an outlined function could get signed
9939 // using dedicated v8.3 instructions and a call from a function that doesn't
9940 // support v8.3 instructions would therefore be invalid.
9941 if (std::adjacent_find(
9942 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
9943 [](const outliner::Candidate &a, const outliner::Candidate &b) {
9944 // Return true if a and b are non-equal w.r.t. return address
9945 // signing or support of v8.3a features
9946 if (outliningCandidatesSigningScopeConsensus(a, b) &&
9947 outliningCandidatesSigningKeyConsensus(a, b) &&
9948 outliningCandidatesV8_3OpsConsensus(a, b)) {
9949 return false;
9950 }
9951 return true;
9952 }) != RepeatedSequenceLocs.end()) {
9953 return std::nullopt;
9954 }
9955
9956 // Since at this point all candidates agree on their return address signing,
9957 // picking just one is fine. If the candidate functions potentially sign their
9958 // return addresses, the outlined function should do the same. Note that in
9959 // the case of "sign-return-address"="non-leaf" this is an assumption: It is
9960 // not certainly true that the outlined function will have to sign its return
9961 // address but this decision is made later, when the decision to outline
9962 // has already been made.
9963 // The same holds for the number of additional instructions we need: On
9964 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
9965 // necessary. However, at this point we don't know if the outlined function
9966 // will have a RET instruction so we assume the worst.
9967 const TargetRegisterInfo &TRI = getRegisterInfo();
9968 // Performing a tail call may require extra checks when PAuth is enabled.
9969 // If PAuth is disabled, set it to zero for uniformity.
9970 unsigned NumBytesToCheckLRInTCEpilogue = 0;
9971 const auto RASignCondition = RepeatedSequenceLocs[0]
9972 .getMF()
9973 ->getInfo<AArch64FunctionInfo>()
9974 ->getSignReturnAddressCondition();
9975 if (RASignCondition != SignReturnAddress::None) {
9976 // One PAC and one AUT instructions
9977 NumBytesToCreateFrame += 8;
9978
9979 // PAuth is enabled - set extra tail call cost, if any.
9980 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
9981 *RepeatedSequenceLocs[0].getMF());
9982 NumBytesToCheckLRInTCEpilogue =
9984 // Checking the authenticated LR value may significantly impact
9985 // SequenceSize, so account for it for more precise results.
9986 if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
9987 SequenceSize += NumBytesToCheckLRInTCEpilogue;
9988
9989 // We have to check if sp modifying instructions would get outlined.
9990 // If so, we only allow outlining if sp is unchanged overall: matching
9991 // sub and add instructions are okay to outline, while all other sp
9992 // modifications are not.
9993 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
9994 int SPValue = 0;
9995 for (auto &MI : C) {
9996 if (MI.modifiesRegister(AArch64::SP, &TRI)) {
9997 switch (MI.getOpcode()) {
9998 case AArch64::ADDXri:
9999 case AArch64::ADDWri:
10000 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10001 assert(MI.getOperand(2).isImm() &&
10002 "Expected operand to be immediate");
10003 assert(MI.getOperand(1).isReg() &&
10004 "Expected operand to be a register");
10005 // Check if the add just increments sp. If so, we search for
10006 // matching sub instructions that decrement sp. If not, the
10007 // modification is illegal
10008 if (MI.getOperand(1).getReg() == AArch64::SP)
10009 SPValue += MI.getOperand(2).getImm();
10010 else
10011 return true;
10012 break;
10013 case AArch64::SUBXri:
10014 case AArch64::SUBWri:
10015 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
10016 assert(MI.getOperand(2).isImm() &&
10017 "Expected operand to be immediate");
10018 assert(MI.getOperand(1).isReg() &&
10019 "Expected operand to be a register");
10020 // Check if the sub just decrements sp. If so, we search for
10021 // matching add instructions that increment sp. If not, the
10022 // modification is illegal
10023 if (MI.getOperand(1).getReg() == AArch64::SP)
10024 SPValue -= MI.getOperand(2).getImm();
10025 else
10026 return true;
10027 break;
10028 default:
10029 return true;
10030 }
10031 }
10032 }
10033 if (SPValue)
10034 return true;
10035 return false;
10036 };
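// For example, a candidate containing "sub sp, sp, #16 ... add sp, sp, #16"
// nets out to zero and is allowed, whereas a lone "sub sp, sp, #16" leaves
// SPValue non-zero and any other kind of write to sp hits one of the early
// "return true" paths; either way that candidate is dropped below.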
10037 // Remove candidates with illegal stack modifying instructions
10038 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
10039
10040 // If the sequence doesn't have enough candidates left, then we're done.
10041 if (RepeatedSequenceLocs.size() < MinRepeats)
10042 return std::nullopt;
10043 }
10044
10045 // Properties about candidate MBBs that hold for all of them.
10046 unsigned FlagsSetInAll = 0xF;
10047
10048 // Compute liveness information for each candidate, and set FlagsSetInAll.
10049 for (outliner::Candidate &C : RepeatedSequenceLocs)
10050 FlagsSetInAll &= C.Flags;
10051
10052 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
10053
10054 // Helper lambda which sets call information for every candidate.
10055 auto SetCandidateCallInfo =
10056 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
10057 for (outliner::Candidate &C : RepeatedSequenceLocs)
10058 C.setCallInfo(CallID, NumBytesForCall);
10059 };
10060
10061 unsigned FrameID = MachineOutlinerDefault;
10062 NumBytesToCreateFrame += 4;
10063
10064 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
10065 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
10066 });
10067
10068 // We check to see if CFI Instructions are present, and if they are
10069 // we find the number of CFI Instructions in the candidates.
10070 unsigned CFICount = 0;
10071 for (auto &I : RepeatedSequenceLocs[0]) {
10072 if (I.isCFIInstruction())
10073 CFICount++;
10074 }
10075
10076 // We compare the number of found CFI Instructions to the number of CFI
10077 // instructions in the parent function for each candidate. We must check this
10078 // since if we outline one of the CFI instructions in a function, we have to
10079 // outline them all for correctness. If we do not, the address offsets will be
10080 // incorrect between the two sections of the program.
10081 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10082 std::vector<MCCFIInstruction> CFIInstructions =
10083 C.getMF()->getFrameInstructions();
10084
10085 if (CFICount > 0 && CFICount != CFIInstructions.size())
10086 return std::nullopt;
10087 }
10088
10089 // Returns true if an instruction is safe to fix up, false otherwise.
10090 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
10091 if (MI.isCall())
10092 return true;
10093
10094 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
10095 !MI.readsRegister(AArch64::SP, &TRI))
10096 return true;
10097
10098 // Any modification of SP will break our code to save/restore LR.
10099 // FIXME: We could handle some instructions which add a constant
10100 // offset to SP, with a bit more work.
10101 if (MI.modifiesRegister(AArch64::SP, &TRI))
10102 return false;
10103
10104 // At this point, we have a stack instruction that we might need to
10105 // fix up. We'll handle it if it's a load or store.
10106 if (MI.mayLoadOrStore()) {
10107 const MachineOperand *Base; // Filled with the base operand of MI.
10108 int64_t Offset; // Filled with the offset of MI.
10109 bool OffsetIsScalable;
10110
10111 // Does it allow us to offset the base operand and is the base the
10112 // register SP?
10113 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
10114 !Base->isReg() || Base->getReg() != AArch64::SP)
10115 return false;
10116
10117 // The fix-up code below assumes byte offsets.
10118 if (OffsetIsScalable)
10119 return false;
10120
10121 // Find the minimum/maximum offset for this instruction and check
10122 // if fixing it up would be in range.
10123 int64_t MinOffset,
10124 MaxOffset; // Unscaled offsets for the instruction.
10125 // The scale to multiply the offsets by.
10126 TypeSize Scale(0U, false), DummyWidth(0U, false);
10127 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
10128
10129 Offset += 16; // Update the offset to what it would be if we outlined.
10130 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
10131 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
10132 return false;
10133
10134 // It's in range, so we can outline it.
10135 return true;
10136 }
10137
10138 // FIXME: Add handling for instructions like "add x0, sp, #8".
10139
10140 // We can't fix it up, so don't outline it.
10141 return false;
10142 };
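// Worked example for the range check above (numbers are illustrative):
// "ldr x0, [sp, #8]" uses LDRXui with Scale == 8 and MaxOffset == 4095, so the
// post-outlining offset of 8 + 16 = 24 bytes still fits and can be fixed up,
// whereas "ldrb w0, [sp, #4090]" (Scale == 1, MaxOffset == 4095) would need
// offset 4106 and is rejected.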
10143
10144 // True if it's possible to fix up each stack instruction in this sequence.
10145 // Important for frames/call variants that modify the stack.
10146 bool AllStackInstrsSafe =
10147 llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);
10148
10149 // If the last instruction in any candidate is a terminator, then we should
10150 // tail call all of the candidates.
10151 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10152 FrameID = MachineOutlinerTailCall;
10153 NumBytesToCreateFrame = 0;
10154 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
10155 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
10156 }
10157
10158 else if (LastInstrOpcode == AArch64::BL ||
10159 ((LastInstrOpcode == AArch64::BLR ||
10160 LastInstrOpcode == AArch64::BLRNoIP) &&
10161 !HasBTI)) {
10162 // FIXME: Do we need to check if the code after this uses the value of LR?
10163 FrameID = MachineOutlinerThunk;
10164 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
10165 SetCandidateCallInfo(MachineOutlinerThunk, 4);
10166 }
10167
10168 else {
10169 // We need to decide how to emit calls + frames. We can always emit the same
10170 // frame if we don't need to save to the stack. If we have to save to the
10171 // stack, then we need a different frame.
10172 unsigned NumBytesNoStackCalls = 0;
10173 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
10174
10175 // Check if we have to save LR.
10176 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10177 bool LRAvailable =
10179 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
10180 : true;
10181 // If we have a noreturn caller, then we're going to be conservative and
10182 // say that we have to save LR. If we don't have a ret at the end of the
10183 // block, then we can't reason about liveness accurately.
10184 //
10185 // FIXME: We can probably do better than always disabling this in
10186 // noreturn functions by fixing up the liveness info.
10187 bool IsNoReturn =
10188 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
10189
10190 // Is LR available? If so, we don't need a save.
10191 if (LRAvailable && !IsNoReturn) {
10192 NumBytesNoStackCalls += 4;
10193 C.setCallInfo(MachineOutlinerNoLRSave, 4);
10194 CandidatesWithoutStackFixups.push_back(C);
10195 }
10196
10197 // Is an unused register available? If so, we won't modify the stack, so
10198 // we can outline with the same frame type as those that don't save LR.
10199 else if (findRegisterToSaveLRTo(C)) {
10200 NumBytesNoStackCalls += 12;
10201 C.setCallInfo(MachineOutlinerRegSave, 12);
10202 CandidatesWithoutStackFixups.push_back(C);
10203 }
10204
10205 // Is SP used in the sequence at all? If not, we don't have to modify
10206 // the stack, so we are guaranteed to get the same frame.
10207 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
10208 NumBytesNoStackCalls += 12;
10209 C.setCallInfo(MachineOutlinerDefault, 12);
10210 CandidatesWithoutStackFixups.push_back(C);
10211 }
10212
10213 // If we outline this, we need to modify the stack. Pretend we don't
10214 // outline this by saving all of its bytes.
10215 else {
10216 NumBytesNoStackCalls += SequenceSize;
10217 }
10218 }
10219
10220 // If there are no places where we have to save LR, then note that we
10221 // don't have to update the stack. Otherwise, give every candidate the
10222 // default call type, as long as it's safe to do so.
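// Worked example for the check below: with three candidates where two can use
// MachineOutlinerNoLRSave (4 bytes each) and one needs MachineOutlinerRegSave
// (12 bytes), NumBytesNoStackCalls is 20 <= 3 * 12 = 36, so we keep only the
// cheaper candidates rather than forcing the stack-saving default everywhere.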
10223 if (!AllStackInstrsSafe ||
10224 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
10225 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
10226 FrameID = MachineOutlinerNoLRSave;
10227 if (RepeatedSequenceLocs.size() < MinRepeats)
10228 return std::nullopt;
10229 } else {
10230 SetCandidateCallInfo(MachineOutlinerDefault, 12);
10231
10232 // Bugzilla ID: 46767
10233 // TODO: Check if fixing up the stack more than once is safe so we can
10234 // outline these.
10235 //
10236 // An outline resulting in a caller that requires stack fixups at the
10237 // callsite to a callee that also requires stack fixups can happen when
10238 // there are no available registers at the candidate callsite for a
10239 // candidate that itself also has calls.
10240 //
10241 // In other words, if function_containing_sequence in the following pseudo
10242 // assembly requires that we save LR at the point of the call, but there
10243 // are no available registers, then we save using SP and as a
10244 // result the SP offsets require stack fixups by multiples of 16.
10245 //
10246 // function_containing_sequence:
10247 // ...
10248 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10249 // call OUTLINED_FUNCTION_N
10250 // restore LR from SP
10251 // ...
10252 //
10253 // OUTLINED_FUNCTION_N:
10254 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
10255 // ...
10256 // bl foo
10257 // restore LR from SP
10258 // ret
10259 //
10260 // Because the code to handle more than one stack fixup does not
10261 // currently have the proper checks for legality, these cases will assert
10262 // in the AArch64 MachineOutliner. This is because the code to do this
10263 // needs more hardening, testing, better checks that generated code is
10264 // legal, etc., and because it is only verified to handle a single pass of
10265 // stack fixup.
10266 //
10267 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
10268 // these cases until they are known to be handled. Bugzilla 46767 is
10269 // referenced in comments at the assert site.
10270 //
10271 // To avoid asserting (or generating non-legal code on noassert builds)
10272 // we remove all candidates which would need more than one stack fixup by
10273 // pruning the cases where the candidate has calls while also having no
10274 // available LR and having no available general purpose registers to copy
10275 // LR to (ie one extra stack save/restore).
10276 //
10277 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10278 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
10279 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
10280 return (llvm::any_of(C, IsCall)) &&
10281 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
10282 !findRegisterToSaveLRTo(C));
10283 });
10284 }
10285 }
10286
10287 // If we dropped all of the candidates, bail out here.
10288 if (RepeatedSequenceLocs.size() < MinRepeats)
10289 return std::nullopt;
10290 }
10291
10292 // Does every candidate's MBB contain a call? If so, then we might have a call
10293 // in the range.
10294 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
10295 // Check if the range contains a call. These require a save + restore of the
10296 // link register.
10297 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
10298 bool ModStackToSaveLR = false;
10299 if (any_of(drop_end(FirstCand),
10300 [](const MachineInstr &MI) { return MI.isCall(); }))
10301 ModStackToSaveLR = true;
10302
10303 // Handle the last instruction separately. If this is a tail call, then the
10304 // last instruction is a call. We don't want to save + restore in this case.
10305 // However, it could be possible that the last instruction is a call without
10306 // it being valid to tail call this sequence. We should consider this as
10307 // well.
10308 else if (FrameID != MachineOutlinerThunk &&
10309 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
10310 ModStackToSaveLR = true;
10311
10312 if (ModStackToSaveLR) {
10313 // We can't fix up the stack. Bail out.
10314 if (!AllStackInstrsSafe)
10315 return std::nullopt;
10316
10317 // Save + restore LR.
10318 NumBytesToCreateFrame += 8;
10319 }
10320 }
10321
10322 // If we have CFI instructions, we can only outline if the outlined section
10323 // can be a tail call
10324 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
10325 return std::nullopt;
10326
10327 return std::make_unique<outliner::OutlinedFunction>(
10328 RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID);
10329}
10330
10331void AArch64InstrInfo::mergeOutliningCandidateAttributes(
10332 Function &F, std::vector<outliner::Candidate> &Candidates) const {
10333 // If a set of candidates reaches this point, they must agree on their return
10334 // address signing. It is therefore enough to just consider the signing
10335 // behaviour of one of them.
10336 const auto &CFn = Candidates.front().getMF()->getFunction();
10337
10338 if (CFn.hasFnAttribute("ptrauth-returns"))
10339 F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
10340 if (CFn.hasFnAttribute("ptrauth-auth-traps"))
10341 F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
10342 // Since all candidates belong to the same module, just copy the
10343 // function-level attributes of an arbitrary function.
10344 if (CFn.hasFnAttribute("sign-return-address"))
10345 F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
10346 if (CFn.hasFnAttribute("sign-return-address-key"))
10347 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
10348
10349 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
10350}
10351
10352bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
10353 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10354 const Function &F = MF.getFunction();
10355
10356 // Can F be deduplicated by the linker? If it can, don't outline from it.
10357 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10358 return false;
10359
10360 // Don't outline from functions with section markings; the program could
10361 // expect that all the code is in the named section.
10362 // FIXME: Allow outlining from multiple functions with the same section
10363 // marking.
10364 if (F.hasSection())
10365 return false;
10366
10367 // Outlining from functions with redzones is unsafe since the outliner may
10368 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
10369 // outline from it.
10370 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
10371 if (!AFI || AFI->hasRedZone().value_or(true))
10372 return false;
10373
10374 // FIXME: Determine whether it is safe to outline from functions which contain
10375 // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
10376 // outlined together and ensure it is safe to outline with async unwind info,
10377 // required for saving & restoring VG around calls.
10378 if (AFI->hasStreamingModeChanges())
10379 return false;
10380
10381 // FIXME: Teach the outliner to generate/handle Windows unwind info.
10382 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
10383 return false;
10384
10385 // It's safe to outline from MF.
10386 return true;
10387}
10388
10390AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
10391 unsigned &Flags) const {
10393 "Must track liveness!");
10395 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
10396 Ranges;
10397 // According to the AArch64 Procedure Call Standard, the following are
10398 // undefined on entry/exit from a function call:
10399 //
10400 // * Registers x16, x17, (and thus w16, w17)
10401 // * Condition codes (and thus the NZCV register)
10402 //
10403 // If any of these registers are used inside or live across an outlined
10404 // function, then they may be modified later, either by the compiler or
10405 // some other tool (like the linker).
10406 //
10407 // To avoid outlining in these situations, partition each block into ranges
10408 // where these registers are dead. We will only outline from those ranges.
10409 LiveRegUnits LRU(getRegisterInfo());
10410 auto AreAllUnsafeRegsDead = [&LRU]() {
10411 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
10412 LRU.available(AArch64::NZCV);
10413 };
10414
10415 // We need to know if LR is live across an outlining boundary later on in
10416 // order to decide how we'll create the outlined call, frame, etc.
10417 //
10418 // It's pretty expensive to check this for *every candidate* within a block.
10419 // That's some potentially n^2 behaviour, since in the worst case, we'd need
10420 // to compute liveness from the end of the block for O(n) candidates within
10421 // the block.
10422 //
10423 // So, to improve the average case, let's keep track of liveness from the end
10424 // of the block to the beginning of *every outlinable range*. If we know that
10425 // LR is available in every range we could outline from, then we know that
10426 // we don't need to check liveness for any candidate within that range.
10427 bool LRAvailableEverywhere = true;
10428 // Compute liveness bottom-up.
10429 LRU.addLiveOuts(MBB);
10430 // Update flags that require info about the entire MBB.
10431 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
10432 if (MI.isCall() && !MI.isTerminator())
10434 };
10435 // Range: [RangeBegin, RangeEnd)
10436 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
10437 unsigned RangeLen;
10438 auto CreateNewRangeStartingAt =
10439 [&RangeBegin, &RangeEnd,
10440 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
10441 RangeBegin = NewBegin;
10442 RangeEnd = std::next(RangeBegin);
10443 RangeLen = 0;
10444 };
10445 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
10446 // At least one unsafe register is not dead. We do not want to outline at
10447 // this point. If the current range is long enough to outline from and does
10448 // not cross a bundle boundary, save the range [RangeBegin, RangeEnd).
10449 if (RangeLen <= 1)
10450 return;
10451 if (!RangeBegin.isEnd() && RangeBegin->isBundledWithPred())
10452 return;
10453 if (!RangeEnd.isEnd() && RangeEnd->isBundledWithPred())
10454 return;
10455 Ranges.emplace_back(RangeBegin, RangeEnd);
10456 };
10457 // Find the first point where all unsafe registers are dead.
10458 // FIND: <safe instr> <-- end of first potential range
10459 // SKIP: <unsafe def>
10460 // SKIP: ... everything between ...
10461 // SKIP: <unsafe use>
10462 auto FirstPossibleEndPt = MBB.instr_rbegin();
10463 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
10464 LRU.stepBackward(*FirstPossibleEndPt);
10465 // Update flags that impact how we outline across the entire block,
10466 // regardless of safety.
10467 UpdateWholeMBBFlags(*FirstPossibleEndPt);
10468 if (AreAllUnsafeRegsDead())
10469 break;
10470 }
10471 // If we exhausted the entire block, we have no safe ranges to outline.
10472 if (FirstPossibleEndPt == MBB.instr_rend())
10473 return Ranges;
10474 // Current range.
10475 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
10476 // FirstPossibleEndPt points to the first place where all unsafe registers
10477 // are dead (if there is any such point). Begin partitioning the MBB into
10478 // ranges.
10479 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
10480 LRU.stepBackward(MI);
10481 UpdateWholeMBBFlags(MI);
10482 if (!AreAllUnsafeRegsDead()) {
10483 SaveRangeIfNonEmpty();
10484 CreateNewRangeStartingAt(MI.getIterator());
10485 continue;
10486 }
10487 LRAvailableEverywhere &= LRU.available(AArch64::LR);
10488 RangeBegin = MI.getIterator();
10489 ++RangeLen;
10490 }
10491 // The loop above misses the last (or only) range. If we are still safe,
10492 // save that range too.
10493 if (AreAllUnsafeRegsDead())
10494 SaveRangeIfNonEmpty();
10495 if (Ranges.empty())
10496 return Ranges;
10497 // We found the ranges bottom-up, but the mapping expects them top-down.
10498 // Reverse the order.
10499 std::reverse(Ranges.begin(), Ranges.end());
10500 // If there is at least one outlinable range where LR is unavailable
10501 // somewhere, remember that.
10502 if (!LRAvailableEverywhere)
10504 return Ranges;
10505}
10506
10507outliner::InstrType
10508AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
10509 MachineBasicBlock::iterator &MIT,
10510 unsigned Flags) const {
10511 MachineInstr &MI = *MIT;
10512
10513 // Don't outline anything used for return address signing. The outlined
10514 // function will get signed later if needed.
10515 switch (MI.getOpcode()) {
10516 case AArch64::PACM:
10517 case AArch64::PACIASP:
10518 case AArch64::PACIBSP:
10519 case AArch64::PACIASPPC:
10520 case AArch64::PACIBSPPC:
10521 case AArch64::AUTIASP:
10522 case AArch64::AUTIBSP:
10523 case AArch64::AUTIASPPCi:
10524 case AArch64::AUTIASPPCr:
10525 case AArch64::AUTIBSPPCi:
10526 case AArch64::AUTIBSPPCr:
10527 case AArch64::RETAA:
10528 case AArch64::RETAB:
10529 case AArch64::RETAASPPCi:
10530 case AArch64::RETAASPPCr:
10531 case AArch64::RETABSPPCi:
10532 case AArch64::RETABSPPCr:
10533 case AArch64::EMITBKEY:
10534 case AArch64::PAUTH_PROLOGUE:
10535 case AArch64::PAUTH_EPILOGUE:
10536 return outliner::InstrType::Illegal;
10537 }
10538
10539 // We can only outline these if we will tail call the outlined function, or
10540 // fix up the CFI offsets. Currently, CFI instructions are outlined only if
10541 // in a tail call.
10542 //
10543 // FIXME: If the proper fixups for the offset are implemented, this should be
10544 // possible.
10545 if (MI.isCFIInstruction())
10546 return outliner::InstrType::Legal;
10547
10548 // Is this a terminator for a basic block?
10549 if (MI.isTerminator())
10550 // TargetInstrInfo::getOutliningType has already filtered out anything
10551 // that would break this, so we can allow it here.
10552 return outliner::InstrType::Legal;
10553
10554 // Make sure none of the operands are un-outlinable.
10555 for (const MachineOperand &MOP : MI.operands()) {
10556 // A check preventing CFI indices was here before, but only CFI
10557 // instructions should have those.
10558 assert(!MOP.isCFIIndex());
10559
10560 // If it uses LR or W30 explicitly, then don't touch it.
10561 if (MOP.isReg() && !MOP.isImplicit() &&
10562 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
10563 return outliner::InstrType::Illegal;
10564 }
10565
10566 // Special cases for instructions that can always be outlined, but will fail
10567 // the later tests, e.g. ADRPs, which are PC-relative and use LR, but can always
10568 // be outlined because they don't require a *specific* value to be in LR.
10569 if (MI.getOpcode() == AArch64::ADRP)
10570 return outliner::InstrType::Legal;
10571
10572 // If MI is a call we might be able to outline it. We don't want to outline
10573 // any calls that rely on the position of items on the stack. When we outline
10574 // something containing a call, we have to emit a save and restore of LR in
10575 // the outlined function. Currently, this always happens by saving LR to the
10576 // stack. Thus, if we outline, say, half the parameters for a function call
10577 // plus the call, then we'll break the callee's expectations for the layout
10578 // of the stack.
10579 //
10580 // FIXME: Allow calls to functions which construct a stack frame, as long
10581 // as they don't access arguments on the stack.
10582 // FIXME: Figure out some way to analyze functions defined in other modules.
10583 // We should be able to compute the memory usage based on the IR calling
10584 // convention, even if we can't see the definition.
10585 if (MI.isCall()) {
10586 // Get the function associated with the call. Look at each operand and find
10587 // the one that represents the callee and get its name.
10588 const Function *Callee = nullptr;
10589 for (const MachineOperand &MOP : MI.operands()) {
10590 if (MOP.isGlobal()) {
10591 Callee = dyn_cast<Function>(MOP.getGlobal());
10592 break;
10593 }
10594 }
10595
10596 // Never outline calls to mcount. There isn't any rule that would require
10597 // this, but the Linux kernel's "ftrace" feature depends on it.
10598 if (Callee && Callee->getName() == "\01_mcount")
10599 return outliner::InstrType::Illegal;
10600
10601 // If we don't know anything about the callee, assume it depends on the
10602 // stack layout of the caller. In that case, it's only legal to outline
10603 // as a tail-call. Explicitly list the call instructions we know about so we
10604 // don't get unexpected results with call pseudo-instructions.
10605 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
10606 if (MI.getOpcode() == AArch64::BLR ||
10607 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
10608 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
10609
10610 if (!Callee)
10611 return UnknownCallOutlineType;
10612
10613 // We have a function we have information about. Check if it's something we
10614 // can safely outline.
10615 MachineFunction *CalleeMF = MMI.getMachineFunction(*Callee);
10616
10617 // We don't know what's going on with the callee at all. Don't touch it.
10618 if (!CalleeMF)
10619 return UnknownCallOutlineType;
10620
10621 // Check if we know anything about the callee saves on the function. If we
10622 // don't, then don't touch it, since that implies that we haven't
10623 // computed anything about its stack frame yet.
10624 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
10625 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
10626 MFI.getNumObjects() > 0)
10627 return UnknownCallOutlineType;
10628
10629 // At this point, we can say that CalleeMF ought to not pass anything on the
10630 // stack. Therefore, we can outline it.
10631 return outliner::InstrType::Legal;
10632 }
10633
10634 // Don't touch the link register or W30.
10635 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
10636 MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
10637 return outliner::InstrType::Illegal;
10638
10639 // Don't outline BTI instructions, because that will prevent the outlining
10640 // site from being indirectly callable.
10641 if (hasBTISemantics(MI))
10642 return outliner::InstrType::Illegal;
10643
10644 return outliner::InstrType::Legal;
10645}
10646
10647void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
10648 for (MachineInstr &MI : MBB) {
10649 const MachineOperand *Base;
10650 TypeSize Width(0, false);
10651 int64_t Offset;
10652 bool OffsetIsScalable;
10653
10654 // Is this a load or store with an immediate offset with SP as the base?
10655 if (!MI.mayLoadOrStore() ||
10656 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
10657 &RI) ||
10658 (Base->isReg() && Base->getReg() != AArch64::SP))
10659 continue;
10660
10661 // It is, so we have to fix it up.
10662 TypeSize Scale(0U, false);
10663 int64_t Dummy1, Dummy2;
10664
10665 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
10666 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
10667 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
10668 assert(Scale != 0 && "Unexpected opcode!");
10669 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
10670
10671 // We've pushed the return address to the stack, so add 16 to the offset.
10672 // This is safe, since we already checked if it would overflow when we
10673 // checked if this instruction was legal to outline.
10674 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
10675 StackOffsetOperand.setImm(NewImm);
10676 }
10677}
10678
10679static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
10680 const AArch64InstrInfo *TII,
10681 bool ShouldSignReturnAddr) {
10682 if (!ShouldSignReturnAddr)
10683 return;
10684
10685 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
10686 .setMIFlag(MachineInstr::FrameSetup);
10687 BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(),
10688 TII->get(AArch64::PAUTH_EPILOGUE))
10689 .setMIFlag(MachineInstr::FrameDestroy);
10690}
10691
10692void AArch64InstrInfo::buildOutlinedFrame(
10693 MachineBasicBlock &MBB, MachineFunction &MF,
10694 const outliner::OutlinedFunction &OF) const {
10695
10696 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
10697
10698 if (OF.FrameConstructionID == MachineOutlinerTailCall)
10699 FI->setOutliningStyle("Tail Call");
10700 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
10701 // For thunk outlining, rewrite the last instruction from a call to a
10702 // tail-call.
10703 MachineInstr *Call = &*--MBB.instr_end();
10704 unsigned TailOpcode;
10705 if (Call->getOpcode() == AArch64::BL) {
10706 TailOpcode = AArch64::TCRETURNdi;
10707 } else {
10708 assert(Call->getOpcode() == AArch64::BLR ||
10709 Call->getOpcode() == AArch64::BLRNoIP);
10710 TailOpcode = AArch64::TCRETURNriALL;
10711 }
10712 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
10713 .add(Call->getOperand(0))
10714 .addImm(0);
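// The trailing .addImm(0) fills the TCRETURN* stack-adjustment operand; the
// thunk performs no SP adjustment before the tail call, so it stays 0.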
10715 MBB.insert(MBB.end(), TC);
10716 Call->eraseFromParent();
10717
10718 FI->setOutliningStyle("Thunk");
10719 }
10720
10721 bool IsLeafFunction = true;
10722
10723 // Is there a call in the outlined range?
10724 auto IsNonTailCall = [](const MachineInstr &MI) {
10725 return MI.isCall() && !MI.isReturn();
10726 };
10727
10728 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
10729 // Fix up the instructions in the range, since we're going to modify the
10730 // stack.
10731
10732 // Bugzilla ID: 46767
10733 // TODO: Check if fixing up twice is safe so we can outline these.
10734 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
10735 "Can only fix up stack references once");
10736 fixupPostOutline(MBB);
10737
10738 IsLeafFunction = false;
10739
10740 // LR has to be a live in so that we can save it.
10741 if (!MBB.isLiveIn(AArch64::LR))
10742 MBB.addLiveIn(AArch64::LR);
10743
10743
10744 MachineBasicBlock::iterator It = MBB.begin();
10745 MachineBasicBlock::iterator Et = MBB.end();
10746
10747 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
10748 OF.FrameConstructionID == MachineOutlinerThunk)
10749 Et = std::prev(MBB.end());
10750
10751 // Insert a save before the outlined region
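// STRXpre here encodes `str x30, [sp, #-16]!`: SP is pre-decremented by 16
// bytes (keeping SP 16-byte aligned) and LR is stored into the new slot.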
10752 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
10753 .addReg(AArch64::SP, RegState::Define)
10754 .addReg(AArch64::LR)
10755 .addReg(AArch64::SP)
10756 .addImm(-16);
10757 It = MBB.insert(It, STRXpre);
10758
10759 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
10760 CFIInstBuilder CFIBuilder(MBB, It, MachineInstr::FrameSetup);
10761
10762 // Add a CFI saying the stack was moved 16 B down.
10763 CFIBuilder.buildDefCFAOffset(16);
10764
10765 // Add a CFI saying that the LR that we want to find is now 16 B higher
10766 // than before.
10767 CFIBuilder.buildOffset(AArch64::LR, -16);
10768 }
10769
10770 // Insert a restore before the terminator for the function.
10771 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
10772 .addReg(AArch64::SP, RegState::Define)
10773 .addReg(AArch64::LR, RegState::Define)
10774 .addReg(AArch64::SP)
10775 .addImm(16);
10776 Et = MBB.insert(Et, LDRXpost);
10777 }
10778
10779 auto RASignCondition = FI->getSignReturnAddressCondition();
10780 bool ShouldSignReturnAddr = AArch64FunctionInfo::shouldSignReturnAddress(
10781 RASignCondition, !IsLeafFunction);
10782
10783 // If this is a tail call outlined function, then there's already a return.
10784 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
10785 OF.FrameConstructionID == MachineOutlinerThunk) {
10786 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
10787 return;
10788 }
10789
10790 // It's not a tail call, so we have to insert the return ourselves.
10791
10792 // LR has to be a live in so that we can return to it.
10793 if (!MBB.isLiveIn(AArch64::LR))
10794 MBB.addLiveIn(AArch64::LR);
10795
10796 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
10797 .addReg(AArch64::LR);
10798 MBB.insert(MBB.end(), ret);
10799
10800 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
10801
10802 FI->setOutliningStyle("Function");
10803
10804 // Did we have to modify the stack by saving the link register?
10805 if (OF.FrameConstructionID != MachineOutlinerDefault)
10806 return;
10807
10808 // We modified the stack.
10809 // Walk over the basic block and fix up all the stack accesses.
10810 fixupPostOutline(MBB);
10811}
10812
10813MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
10814 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
10815 MachineFunction &MF, outliner::Candidate &C) const {
10816
10817 // Are we tail calling?
10818 if (C.CallConstructionID == MachineOutlinerTailCall) {
10819 // If yes, then we can just branch to the label.
10820 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
10821 .addGlobalAddress(M.getNamedValue(MF.getName()))
10822 .addImm(0));
10823 return It;
10824 }
10825
10826 // Are we saving the link register?
10827 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
10828 C.CallConstructionID == MachineOutlinerThunk) {
10829 // No, so just insert the call.
10830 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
10831 .addGlobalAddress(M.getNamedValue(MF.getName())));
10832 return It;
10833 }
10834
10835 // We want to return the spot where we inserted the call.
10836 MachineBasicBlock::iterator CallPt;
10837
10838 // Instructions for saving and restoring LR around the call instruction we're
10839 // going to insert.
10840 MachineInstr *Save;
10841 MachineInstr *Restore;
10842 // Can we save to a register?
10843 if (C.CallConstructionID == MachineOutlinerRegSave) {
10844 // FIXME: This logic should be sunk into a target-specific interface so that
10845 // we don't have to recompute the register.
10846 Register Reg = findRegisterToSaveLRTo(C);
10847 assert(Reg && "No callee-saved register available?");
10848
10849 // LR has to be a live in so that we can save it.
10850 if (!MBB.isLiveIn(AArch64::LR))
10851 MBB.addLiveIn(AArch64::LR);
10852
10853 // Save and restore LR from Reg.
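// ORRXrs with XZR and a zero shift is the canonical `mov Xd, Xm`, so LR
// round-trips through Reg without touching the stack.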
10854 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
10855 .addReg(AArch64::XZR)
10856 .addReg(AArch64::LR)
10857 .addImm(0);
10858 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
10859 .addReg(AArch64::XZR)
10860 .addReg(Reg)
10861 .addImm(0);
10862 } else {
10863 // We have the default case. Save and restore from SP.
10864 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
10865 .addReg(AArch64::SP, RegState::Define)
10866 .addReg(AArch64::LR)
10867 .addReg(AArch64::SP)
10868 .addImm(-16);
10869 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
10870 .addReg(AArch64::SP, RegState::Define)
10871 .addReg(AArch64::LR, RegState::Define)
10872 .addReg(AArch64::SP)
10873 .addImm(16);
10874 }
10875
10876 It = MBB.insert(It, Save);
10877 It++;
10878
10879 // Insert the call.
10880 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
10881 .addGlobalAddress(M.getNamedValue(MF.getName())));
10882 CallPt = It;
10883 It++;
10884
10885 It = MBB.insert(It, Restore);
10886 return CallPt;
10887}
10888
10889bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
10890 MachineFunction &MF) const {
10891 return MF.getFunction().hasMinSize();
10892}
10893
10894void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
10895 MachineBasicBlock::iterator Iter,
10896 DebugLoc &DL,
10897 bool AllowSideEffects) const {
10898 const MachineFunction &MF = *MBB.getParent();
10899 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
10900 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
10901
10902 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
10903 BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
10904 } else if (STI.isSVEorStreamingSVEAvailable()) {
10905 BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
10906 .addImm(0)
10907 .addImm(0);
10908 } else if (STI.isNeonAvailable()) {
10909 BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
10910 .addImm(0);
10911 } else {
10912 // This is a streaming-compatible function without SVE. We don't have full
10913 // Neon (just FPRs), so we can at most use the first 64-bit sub-register.
10914 // So, since `movi v..` would be illegal, use `fmov d..` instead.
10915 assert(STI.hasNEON() && "Expected to have NEON.");
10916 Register Reg64 = TRI.getSubReg(Reg, AArch64::dsub);
10917 BuildMI(MBB, Iter, DL, get(AArch64::FMOVD0), Reg64);
10918 }
10919}
10920
10921std::optional<DestSourcePair>
10922AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
10923
10924 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
10925 // and zero immediate operands used as an alias for mov instruction.
10926 if (((MI.getOpcode() == AArch64::ORRWrs &&
10927 MI.getOperand(1).getReg() == AArch64::WZR &&
10928 MI.getOperand(3).getImm() == 0x0) ||
10929 (MI.getOpcode() == AArch64::ORRWrr &&
10930 MI.getOperand(1).getReg() == AArch64::WZR)) &&
10931 // Check that the w->w move is not a zero-extending w->x mov.
10932 (!MI.getOperand(0).getReg().isVirtual() ||
10933 MI.getOperand(0).getSubReg() == 0) &&
10934 (!MI.getOperand(0).getReg().isPhysical() ||
10935 MI.findRegisterDefOperandIdx(getXRegFromWReg(MI.getOperand(0).getReg()),
10936 /*TRI=*/nullptr) == -1))
10937 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
10938
10939 if (MI.getOpcode() == AArch64::ORRXrs &&
10940 MI.getOperand(1).getReg() == AArch64::XZR &&
10941 MI.getOperand(3).getImm() == 0x0)
10942 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
10943
10944 return std::nullopt;
10945}
10946
10947std::optional<DestSourcePair>
10948AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
10949 if ((MI.getOpcode() == AArch64::ORRWrs &&
10950 MI.getOperand(1).getReg() == AArch64::WZR &&
10951 MI.getOperand(3).getImm() == 0x0) ||
10952 (MI.getOpcode() == AArch64::ORRWrr &&
10953 MI.getOperand(1).getReg() == AArch64::WZR))
10954 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
10955 return std::nullopt;
10956}
10957
10958std::optional<RegImmPair>
10959AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
10960 int Sign = 1;
10961 int64_t Offset = 0;
10962
10963 // TODO: Handle cases where Reg is a super- or sub-register of the
10964 // destination register.
10965 const MachineOperand &Op0 = MI.getOperand(0);
10966 if (!Op0.isReg() || Reg != Op0.getReg())
10967 return std::nullopt;
10968
10969 switch (MI.getOpcode()) {
10970 default:
10971 return std::nullopt;
10972 case AArch64::SUBWri:
10973 case AArch64::SUBXri:
10974 case AArch64::SUBSWri:
10975 case AArch64::SUBSXri:
10976 Sign *= -1;
10977 [[fallthrough]];
10978 case AArch64::ADDSWri:
10979 case AArch64::ADDSXri:
10980 case AArch64::ADDWri:
10981 case AArch64::ADDXri: {
10982 // TODO: Third operand can be global address (usually some string).
10983 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
10984 !MI.getOperand(2).isImm())
10985 return std::nullopt;
10986 int Shift = MI.getOperand(3).getImm();
10987 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
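// A shift of 12 selects the `ADD/SUB #imm, lsl #12` encoding, so the
// effective addend is the immediate shifted left by 12.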
10988 Offset = Sign * (MI.getOperand(2).getImm() << Shift);
10989 }
10990 }
10991 return RegImmPair{MI.getOperand(1).getReg(), Offset};
10992}
10993
10994/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
10995/// the destination register then, if possible, describe the value in terms of
10996/// the source register.
10997static std::optional<ParamLoadedValue>
10998describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
10999 const TargetInstrInfo *TII,
11000 const TargetRegisterInfo *TRI) {
11001 auto DestSrc = TII->isCopyLikeInstr(MI);
11002 if (!DestSrc)
11003 return std::nullopt;
11004
11005 Register DestReg = DestSrc->Destination->getReg();
11006 Register SrcReg = DestSrc->Source->getReg();
11007
11008 if (!DestReg.isValid() || !SrcReg.isValid())
11009 return std::nullopt;
11010
11011 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
11012
11013 // If the described register is the destination, just return the source.
11014 if (DestReg == DescribedReg)
11015 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
11016
11017 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
11018 if (MI.getOpcode() == AArch64::ORRWrs &&
11019 TRI->isSuperRegister(DestReg, DescribedReg))
11020 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
11021
11022 // We may need to describe the lower part of a ORRXrs move.
11023 if (MI.getOpcode() == AArch64::ORRXrs &&
11024 TRI->isSubRegister(DestReg, DescribedReg)) {
11025 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
11026 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
11027 }
11028
11029 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
11030 "Unhandled ORR[XW]rs copy case");
11031
11032 return std::nullopt;
11033}
11034
11035bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
11036 // Functions cannot be split to different sections on AArch64 if they have
11037 // a red zone. This is because relaxing a cross-section branch may require
11038 // incrementing the stack pointer to spill a register, which would overwrite
11039 // the red zone.
11040 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
11041 return false;
11042
11043 return TargetInstrInfo::isFunctionSafeToSplit(MF);
11044}
11045
11046bool AArch64InstrInfo::isMBBSafeToSplitToCold(
11047 const MachineBasicBlock &MBB) const {
11048 // Asm Goto blocks can contain conditional branches to goto labels, which can
11049 // get moved out of range of the branch instruction.
11050 auto isAsmGoto = [](const MachineInstr &MI) {
11051 return MI.getOpcode() == AArch64::INLINEASM_BR;
11052 };
11053 if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
11054 return false;
11055
11056 // Because jump tables are label-relative instead of table-relative, they all
11057 // must be in the same section or relocation fixup handling will fail.
11058
11059 // Check if MBB is a jump table target
11060 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
11061 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
11062 return llvm::is_contained(JTE.MBBs, &MBB);
11063 };
11064 if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
11065 return false;
11066
11067 // Check if MBB contains a jump table lookup
11068 for (const MachineInstr &MI : MBB) {
11069 switch (MI.getOpcode()) {
11070 case TargetOpcode::G_BRJT:
11071 case AArch64::JumpTableDest32:
11072 case AArch64::JumpTableDest16:
11073 case AArch64::JumpTableDest8:
11074 return false;
11075 default:
11076 continue;
11077 }
11078 }
11079
11080 // MBB isn't a special case, so it's safe to be split to the cold section.
11081 return true;
11082}
11083
11084std::optional<ParamLoadedValue>
11085AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
11086 Register Reg) const {
11087 const MachineFunction *MF = MI.getMF();
11088 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
11089 switch (MI.getOpcode()) {
11090 case AArch64::MOVZWi:
11091 case AArch64::MOVZXi: {
11092 // MOVZWi may be used for producing zero-extended 32-bit immediates in
11093 // 64-bit parameters, so we need to consider super-registers.
11094 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
11095 return std::nullopt;
11096
11097 if (!MI.getOperand(1).isImm())
11098 return std::nullopt;
11099 int64_t Immediate = MI.getOperand(1).getImm();
11100 int Shift = MI.getOperand(2).getImm();
11101 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
11102 nullptr);
11103 }
11104 case AArch64::ORRWrs:
11105 case AArch64::ORRXrs:
11106 return describeORRLoadedValue(MI, Reg, this, TRI);
11107 }
11108
11109 return TargetInstrInfo::describeLoadedValue(MI, Reg);
11110}
11111
11112bool AArch64InstrInfo::isExtendLikelyToBeFolded(
11113 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
11114 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
11115 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
11116 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
11117
11118 // Anyexts are nops.
11119 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
11120 return true;
11121
11122 Register DefReg = ExtMI.getOperand(0).getReg();
11123 if (!MRI.hasOneNonDBGUse(DefReg))
11124 return false;
11125
11126 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
11127 // addressing mode.
11128 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
11129 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
11130}
11131
11132uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
11133 return get(Opc).TSFlags & AArch64::ElementSizeMask;
11134}
11135
11136bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
11137 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
11138}
11139
11140bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
11141 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
11142}
11143
11144unsigned int
11145AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
11146 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
11147}
11148
11149bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
11150 unsigned Scale) const {
11151 if (Offset && Scale)
11152 return false;
11153
11154 // Check Reg + Imm
11155 if (!Scale) {
11156 // 9-bit signed offset
11157 if (isInt<9>(Offset))
11158 return true;
11159
11160 // 12-bit unsigned offset
11161 unsigned Shift = Log2_64(NumBytes);
11162 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
11163 // Must be a multiple of NumBytes (NumBytes is a power of 2)
11164 (Offset >> Shift) << Shift == Offset)
11165 return true;
11166 return false;
11167 }
11168
11169 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
11170 return Scale == 1 || (Scale > 0 && Scale == NumBytes);
11171}
11172
11173unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
11174 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
11175 return AArch64::BLRNoIP;
11176 else
11177 return AArch64::BLR;
11178}
11179
11180MachineBasicBlock::iterator
11181AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
11182 Register TargetReg, bool FrameSetup) const {
11183 assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
11184
11185 MachineBasicBlock &MBB = *MBBI->getParent();
11186 MachineFunction &MF = *MBB.getParent();
11187 const AArch64InstrInfo *TII =
11188 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
11189 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
11190 DebugLoc DL = MBB.findDebugLoc(MBBI);
11191
11192 MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
11193 MachineBasicBlock *LoopTestMBB =
11194 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11195 MF.insert(MBBInsertPoint, LoopTestMBB);
11196 MachineBasicBlock *LoopBodyMBB =
11197 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11198 MF.insert(MBBInsertPoint, LoopBodyMBB);
11199 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
11200 MF.insert(MBBInsertPoint, ExitMBB);
11201 MachineInstr::MIFlag Flags =
11202 FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
11203
11204 // LoopTest:
11205 // SUB SP, SP, #ProbeSize
11206 emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
11207 AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
11208
11209 // CMP SP, TargetReg
11210 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
11211 AArch64::XZR)
11212 .addReg(AArch64::SP)
11213 .addReg(TargetReg)
11214 .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
11215 .setMIFlags(Flags);
11216
11217 // B.<Cond> LoopExit
11218 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
11219 .addImm(AArch64CC::LE)
11220 .addMBB(ExitMBB)
11221 .setMIFlags(Flags);
11222
11223 // STR XZR, [SP]
11224 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
11225 .addReg(AArch64::XZR)
11226 .addReg(AArch64::SP)
11227 .addImm(0)
11228 .setMIFlags(Flags);
11229
11230 // B loop
11231 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
11232 .addMBB(LoopTestMBB)
11233 .setMIFlags(Flags);
11234
11235 // LoopExit:
11236 // MOV SP, TargetReg
11237 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
11238 .addReg(TargetReg)
11239 .addImm(0)
11240 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
11241 .setMIFlags(Flags);
11242
11243 // LDR XZR, [SP]
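// The loop above exits before probing the final chunk, so touch the new top
// of stack with a load here to ensure the last part of the allocation is
// probed as well.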
11244 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
11245 .addReg(AArch64::XZR, RegState::Define)
11246 .addReg(AArch64::SP)
11247 .addImm(0)
11248 .setMIFlags(Flags);
11249
11250 ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
11251 ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
11252
11253 LoopTestMBB->addSuccessor(ExitMBB);
11254 LoopTestMBB->addSuccessor(LoopBodyMBB);
11255 LoopBodyMBB->addSuccessor(LoopTestMBB);
11256 MBB.addSuccessor(LoopTestMBB);
11257
11258 // Update liveins.
11259 if (MF.getRegInfo().reservedRegsFrozen())
11260 fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});
11261
11262 return ExitMBB->begin();
11263}
11264
11265namespace {
11266class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
11267 MachineFunction *MF;
11268 const TargetInstrInfo *TII;
11269 const TargetRegisterInfo *TRI;
11270 MachineRegisterInfo &MRI;
11271
11272 /// The block of the loop
11273 MachineBasicBlock *LoopBB;
11274 /// The conditional branch of the loop
11275 MachineInstr *CondBranch;
11276 /// The compare instruction for loop control
11277 MachineInstr *Comp;
11278 /// The number of the operand of the loop counter value in Comp
11279 unsigned CompCounterOprNum;
11280 /// The instruction that updates the loop counter value
11281 MachineInstr *Update;
11282 /// The number of the operand of the loop counter value in Update
11283 unsigned UpdateCounterOprNum;
11284 /// The initial value of the loop counter
11285 Register Init;
11286 /// True iff Update is a predecessor of Comp
11287 bool IsUpdatePriorComp;
11288
11289 /// The normalized condition used by createTripCountGreaterCondition()
11290 SmallVector<MachineOperand, 4> Cond;
11291
11292public:
11293 AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
11294 MachineInstr *Comp, unsigned CompCounterOprNum,
11295 MachineInstr *Update, unsigned UpdateCounterOprNum,
11296 Register Init, bool IsUpdatePriorComp,
11297 const SmallVectorImpl<MachineOperand> &Cond)
11298 : MF(Comp->getParent()->getParent()),
11299 TII(MF->getSubtarget().getInstrInfo()),
11300 TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
11301 LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
11302 CompCounterOprNum(CompCounterOprNum), Update(Update),
11303 UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
11304 IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
11305
11306 bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
11307 // Make the instructions for loop control be placed in stage 0.
11308 // The predecessors of Comp are considered by the caller.
11309 return MI == Comp;
11310 }
11311
11312 std::optional<bool> createTripCountGreaterCondition(
11313 int TC, MachineBasicBlock &MBB,
11314 SmallVectorImpl<MachineOperand> &CondParam) override {
11315 // A branch instruction will be inserted as "if (Cond) goto epilogue".
11316 // Cond is normalized for such use.
11317 // The predecessors of the branch are assumed to have already been inserted.
11318 CondParam = Cond;
11319 return {};
11320 }
11321
11322 void createRemainingIterationsGreaterCondition(
11323 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11324 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
11325
11326 void setPreheader(MachineBasicBlock *NewPreheader) override {}
11327
11328 void adjustTripCount(int TripCountAdjust) override {}
11329
11330 bool isMVEExpanderSupported() override { return true; }
11331};
11332} // namespace
11333
11334/// Clone an instruction from MI. The register of ReplaceOprNum-th operand
11335/// is replaced by ReplaceReg. The output register is newly created.
11336/// The other operands are unchanged from MI.
11337static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
11338 Register ReplaceReg, MachineBasicBlock &MBB,
11339 MachineBasicBlock::iterator InsertTo) {
11340 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
11341 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
11342 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
11343 Register Result = 0;
11344 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
11345 if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
11346 Result = MRI.createVirtualRegister(
11347 MRI.getRegClass(NewMI->getOperand(0).getReg()));
11348 NewMI->getOperand(I).setReg(Result);
11349 } else if (I == ReplaceOprNum) {
11350 MRI.constrainRegClass(ReplaceReg, TII->getRegClass(NewMI->getDesc(), I));
11351 NewMI->getOperand(I).setReg(ReplaceReg);
11352 }
11353 }
11354 MBB.insert(InsertTo, NewMI);
11355 return Result;
11356}
11357
11358void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
11359 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
11360 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
11361 // Create and accumulate conditions for next TC iterations.
11362 // Example:
11363 // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
11364 // # iteration of the kernel
11365 //
11366 // # insert the following instructions
11367 // cond = CSINCXr 0, 0, C, implicit $nzcv
11368 // counter = ADDXri counter, 1 # clone from this->Update
11369 // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
11370 // cond = CSINCXr cond, cond, C, implicit $nzcv
11371 // ... (repeat TC times)
11372 // SUBSXri cond, 0, implicit-def $nzcv
11373
11374 assert(CondBranch->getOpcode() == AArch64::Bcc);
11375 // CondCode to exit the loop
11376 AArch64CC::CondCode CC =
11377 (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
11378 if (CondBranch->getOperand(1).getMBB() == LoopBB)
11379 CC = AArch64CC::getInvertedCondCode(CC);
11380
11381 // Accumulate conditions to exit the loop
11382 Register AccCond = AArch64::XZR;
11383
11384 // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
11385 auto AccumulateCond = [&](Register CurCond,
11386 AArch64CC::CondCode CC) {
11387 Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
11388 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
11389 .addReg(NewCond, RegState::Define)
11390 .addReg(CurCond)
11391 .addReg(CurCond)
11392 .addImm(AArch64CC::getInvertedCondCode(CC));
11393 return NewCond;
11394 };
11395
11396 if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
11397 // Update and Comp for I==0 already exist in MBB
11398 // (MBB is an unrolled kernel)
11399 Register Counter;
11400 for (int I = 0; I <= TC; ++I) {
11401 Register NextCounter;
11402 if (I != 0)
11403 NextCounter =
11404 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11405
11406 AccCond = AccumulateCond(AccCond, CC);
11407
11408 if (I != TC) {
11409 if (I == 0) {
11410 if (Update != Comp && IsUpdatePriorComp) {
11411 Counter =
11412 LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11413 NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
11414 MBB.end());
11415 } else {
11416 // can use already calculated value
11417 NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
11418 }
11419 } else if (Update != Comp) {
11420 NextCounter =
11421 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11422 }
11423 }
11424 Counter = NextCounter;
11425 }
11426 } else {
11427 Register Counter;
11428 if (LastStage0Insts.empty()) {
11429 // use initial counter value (testing if the trip count is sufficient to
11430 // be executed by pipelined code)
11431 Counter = Init;
11432 if (IsUpdatePriorComp)
11433 Counter =
11434 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11435 } else {
11436 // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
11437 Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
11438 }
11439
11440 for (int I = 0; I <= TC; ++I) {
11441 Register NextCounter;
11442 NextCounter =
11443 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
11444 AccCond = AccumulateCond(AccCond, CC);
11445 if (I != TC && Update != Comp)
11446 NextCounter =
11447 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
11448 Counter = NextCounter;
11449 }
11450 }
11451
11452 // If AccCond == 0, the remainder is greater than TC.
11453 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
11454 .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
11455 .addReg(AccCond)
11456 .addImm(0)
11457 .addImm(0);
11458 Cond.clear();
11459 Cond.push_back(MachineOperand::CreateImm(AArch64CC::EQ));
11460}
11461
11462static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
11463 Register &RegMBB, Register &RegOther) {
11464 assert(Phi.getNumOperands() == 5);
11465 if (Phi.getOperand(2).getMBB() == MBB) {
11466 RegMBB = Phi.getOperand(1).getReg();
11467 RegOther = Phi.getOperand(3).getReg();
11468 } else {
11469 assert(Phi.getOperand(4).getMBB() == MBB);
11470 RegMBB = Phi.getOperand(3).getReg();
11471 RegOther = Phi.getOperand(1).getReg();
11472 }
11473}
11474
11474
11475static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
11476 if (!Reg.isVirtual())
11477 return false;
11478 const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
11479 return MRI.getVRegDef(Reg)->getParent() != BB;
11480}
11481
11482/// If Reg is an induction variable, return true and set some parameters
11483static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
11484 MachineInstr *&UpdateInst,
11485 unsigned &UpdateCounterOprNum, Register &InitReg,
11486 bool &IsUpdatePriorComp) {
11487 // Example:
11488 //
11489 // Preheader:
11490 // InitReg = ...
11491 // LoopBB:
11492 // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
11493 // Reg = COPY Reg0 ; COPY is ignored.
11494 // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
11495 // ; Reg is the value calculated in the previous
11496 // ; iteration, so IsUpdatePriorComp == false.
11497
11498 if (LoopBB->pred_size() != 2)
11499 return false;
11500 if (!Reg.isVirtual())
11501 return false;
11502 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
11503 UpdateInst = nullptr;
11504 UpdateCounterOprNum = 0;
11505 InitReg = 0;
11506 IsUpdatePriorComp = true;
11507 Register CurReg = Reg;
11508 while (true) {
11509 MachineInstr *Def = MRI.getVRegDef(CurReg);
11510 if (Def->getParent() != LoopBB)
11511 return false;
11512 if (Def->isCopy()) {
11513 // Ignore copy instructions unless they contain subregisters
11514 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
11515 return false;
11516 CurReg = Def->getOperand(1).getReg();
11517 } else if (Def->isPHI()) {
11518 if (InitReg != 0)
11519 return false;
11520 if (!UpdateInst)
11521 IsUpdatePriorComp = false;
11522 extractPhiReg(*Def, LoopBB, CurReg, InitReg);
11523 } else {
11524 if (UpdateInst)
11525 return false;
11526 switch (Def->getOpcode()) {
11527 case AArch64::ADDSXri:
11528 case AArch64::ADDSWri:
11529 case AArch64::SUBSXri:
11530 case AArch64::SUBSWri:
11531 case AArch64::ADDXri:
11532 case AArch64::ADDWri:
11533 case AArch64::SUBXri:
11534 case AArch64::SUBWri:
11535 UpdateInst = Def;
11536 UpdateCounterOprNum = 1;
11537 break;
11538 case AArch64::ADDSXrr:
11539 case AArch64::ADDSWrr:
11540 case AArch64::SUBSXrr:
11541 case AArch64::SUBSWrr:
11542 case AArch64::ADDXrr:
11543 case AArch64::ADDWrr:
11544 case AArch64::SUBXrr:
11545 case AArch64::SUBWrr:
11546 UpdateInst = Def;
11547 if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
11548 UpdateCounterOprNum = 1;
11549 else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
11550 UpdateCounterOprNum = 2;
11551 else
11552 return false;
11553 break;
11554 default:
11555 return false;
11556 }
11557 CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
11558 }
11559
11560 if (!CurReg.isVirtual())
11561 return false;
11562 if (Reg == CurReg)
11563 break;
11564 }
11565
11566 if (!UpdateInst)
11567 return false;
11568
11569 return true;
11570}
11571
11572std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
11573AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
11574 // Accept loops that meet the following conditions
11575 // * The conditional branch is BCC
11576 // * The compare instruction is ADDS/SUBS/WHILEXX
11577 // * One operand of the compare is an induction variable and the other is a
11578 // loop invariant value
11579 // * The induction variable is incremented/decremented by a single instruction
11580 // * Does not contain CALL or instructions which have unmodeled side effects
11581
11582 for (MachineInstr &MI : *LoopBB)
11583 if (MI.isCall() || MI.hasUnmodeledSideEffects())
11584 // This instruction may use NZCV, which interferes with the instruction to
11585 // be inserted for loop control.
11586 return nullptr;
11587
11588 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
11589 SmallVector<MachineOperand, 4> Cond;
11590 if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
11591 return nullptr;
11592
11593 // Infinite loops are not supported
11594 if (TBB == LoopBB && FBB == LoopBB)
11595 return nullptr;
11596
11597 // Must be conditional branch
11598 if (TBB != LoopBB && FBB == nullptr)
11599 return nullptr;
11600
11601 assert((TBB == LoopBB || FBB == LoopBB) &&
11602 "The Loop must be a single-basic-block loop");
11603
11604 MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
11605 const TargetRegisterInfo &TRI = getRegisterInfo();
11606
11607 if (CondBranch->getOpcode() != AArch64::Bcc)
11608 return nullptr;
11609
11610 // Normalization for createTripCountGreaterCondition()
11611 if (TBB == LoopBB)
11612 reverseBranchCondition(Cond);
11613
11614 MachineInstr *Comp = nullptr;
11615 unsigned CompCounterOprNum = 0;
11616 for (MachineInstr &MI : reverse(*LoopBB)) {
11617 if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
11618 // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
11619 // operands is a loop invariant value
11620
11621 switch (MI.getOpcode()) {
11622 case AArch64::SUBSXri:
11623 case AArch64::SUBSWri:
11624 case AArch64::ADDSXri:
11625 case AArch64::ADDSWri:
11626 Comp = &MI;
11627 CompCounterOprNum = 1;
11628 break;
11629 case AArch64::ADDSWrr:
11630 case AArch64::ADDSXrr:
11631 case AArch64::SUBSWrr:
11632 case AArch64::SUBSXrr:
11633 Comp = &MI;
11634 break;
11635 default:
11636 if (isWhileOpcode(MI.getOpcode())) {
11637 Comp = &MI;
11638 break;
11639 }
11640 return nullptr;
11641 }
11642
11643 if (CompCounterOprNum == 0) {
11644 if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
11645 CompCounterOprNum = 2;
11646 else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
11647 CompCounterOprNum = 1;
11648 else
11649 return nullptr;
11650 }
11651 break;
11652 }
11653 }
11654 if (!Comp)
11655 return nullptr;
11656
11657 MachineInstr *Update = nullptr;
11658 Register Init;
11659 bool IsUpdatePriorComp;
11660 unsigned UpdateCounterOprNum;
11661 if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
11662 Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
11663 return nullptr;
11664
11665 return std::make_unique<AArch64PipelinerLoopInfo>(
11666 LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
11667 Init, IsUpdatePriorComp, Cond);
11668}
11669
11670/// verifyInstruction - Perform target specific instruction verification.
11671bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI,
11672 StringRef &ErrInfo) const {
11673 // Verify that immediate offsets on load/store instructions are within range.
11674 // Stack objects with an FI operand are excluded as they can be fixed up
11675 // during PEI.
11676 TypeSize Scale(0U, false), Width(0U, false);
11677 int64_t MinOffset, MaxOffset;
11678 if (getMemOpInfo(MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) {
11679 unsigned ImmIdx = getLoadStoreImmIdx(MI.getOpcode());
11680 if (MI.getOperand(ImmIdx).isImm() && !MI.getOperand(ImmIdx - 1).isFI()) {
11681 int64_t Imm = MI.getOperand(ImmIdx).getImm();
11682 if (Imm < MinOffset || Imm > MaxOffset) {
11683 ErrInfo = "Unexpected immediate on load/store instruction";
11684 return false;
11685 }
11686 }
11687 }
11688
11689 const MCInstrDesc &MCID = MI.getDesc();
11690 for (unsigned Op = 0; Op < MCID.getNumOperands(); Op++) {
11691 const MachineOperand &MO = MI.getOperand(Op);
11692 switch (MCID.operands()[Op].OperandType) {
11693 case AArch64::OPERAND_IMPLICIT_IMM_0:
11694 if (!MO.isImm() || MO.getImm() != 0) {
11695 ErrInfo = "OPERAND_IMPLICIT_IMM_0 should be 0";
11696 return false;
11697 }
11698 break;
11699 case AArch64::OPERAND_SHIFT_MSL:
11700 if (!MO.isImm() ||
11701 AArch64_AM::getShiftType(MO.getImm()) != AArch64_AM::MSL ||
11702 (AArch64_AM::getShiftValue(MO.getImm()) != 8 &&
11703 AArch64_AM::getShiftValue(MO.getImm()) != 16)) {
11704 ErrInfo = "OPERAND_SHIFT_MSL should be msl shift of 8 or 16";
11705 return false;
11706 }
11707 break;
11708 default:
11709 break;
11710 }
11711 }
11712 return true;
11713}
11714
11715#define GET_INSTRINFO_HELPERS
11716#define GET_INSTRMAP_INFO
11717#include "AArch64GenInstrInfo.inc"
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, unsigned NumRegs)
static cl::opt< unsigned > BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of Bcc instructions (DEBUG)"))
static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned MnegOpc, const TargetRegisterClass *RC)
genNeg - Helper to generate an intermediate negation of the second operand of Root
static bool isFrameStoreOpcode(int Opcode)
static cl::opt< unsigned > GatherOptSearchLimit("aarch64-search-limit", cl::Hidden, cl::init(2048), cl::desc("Restrict range of instructions to search for the " "machine-combiner gather pattern optimization"))
static bool getMaddPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find instructions that can be turned into madd.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr)
Find a condition code used by the instruction.
static MachineInstr * genFusedMultiplyAcc(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyAcc - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genFusedMultiplyAccNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate64(unsigned Opc)
static bool isFrameLoadOpcode(int Opcode)
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg)
static bool areCFlagsAccessedBetweenInstrs(MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, const TargetRegisterInfo *TRI, const AccessKind AccessToCheck=AK_All)
True when condition flags are accessed (either by writing or reading) on the instruction trace starti...
static bool getFMAPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Floating-Point Support.
static bool isADDSRegImm(unsigned Opcode)
static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI)
static bool isANDOpcode(MachineInstr &MI)
static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment, StringRef RegScale={})
static unsigned sForm(MachineInstr &Instr)
Get opcode of S version of Instr.
static bool isCombineInstrSettingFlag(unsigned Opc)
static bool getFNEGPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB, MachineInstr *&UpdateInst, unsigned &UpdateCounterOprNum, Register &InitReg, bool &IsUpdatePriorComp)
If Reg is an induction variable, return true and set some parameters.
static const MachineInstrBuilder & AddSubReg(const MachineInstrBuilder &MIB, MCRegister Reg, unsigned SubIdx, unsigned State, const TargetRegisterInfo *TRI)
static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc)
static int findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr)
static bool isPostIndexLdStOpcode(unsigned Opcode)
Return true if the opcode is a post-index ld/st instruction, which really loads from base+0.
static unsigned getBranchDisplacementBits(unsigned Opc)
static cl::opt< unsigned > CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9), cl::desc("Restrict range of CB instructions (DEBUG)"))
static std::optional< ParamLoadedValue > describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
If the given ORR instruction is a copy, and DescribedReg overlaps with the destination register then,...
static bool getFMULPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
static void appendReadRegExpr(SmallVectorImpl< char > &Expr, unsigned RegNum)
static MachineInstr * genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, const TargetRegisterClass *RC)
genMaddR - Generate madd instruction and combine mul and add using an extra virtual register Example ...
static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, Register ReplaceReg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertTo)
Clone an instruction from MI.
static bool scaleOffset(unsigned Opc, int64_t &Offset)
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc)
unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale)
static MachineInstr * genFusedMultiplyIdx(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC)
genFusedMultiplyIdx - Helper to generate fused multiply accumulate instructions.
static MachineInstr * genIndexedMultiply(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxDupOp, unsigned MulOpc, const TargetRegisterClass *RC, MachineRegisterInfo &MRI)
Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
static bool isSUBSRegImm(unsigned Opcode)
static bool UpdateOperandRegClass(MachineInstr &Instr)
static const TargetRegisterClass * getRegClass(const MachineInstr &MI, Register Reg)
static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, int CmpValue, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > &CCUseInstrs, bool &IsInvertCC)
unsigned unscaledOffsetOpcode(unsigned Opcode)
static bool getLoadPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Search for patterns of LD instructions we can optimize.
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI)
Check if CmpInstr can be substituted by MI.
static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC)
static bool isCombineInstrCandidateFP(const MachineInstr &Inst)
static void appendLoadRegExpr(SmallVectorImpl< char > &Expr, int64_t OffsetFromDefCFA)
static void appendConstantExpr(SmallVectorImpl< char > &Expr, int64_t Constant, dwarf::LocationAtom Operation)
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI)
Return the opcode that does not set flags when possible - otherwise return the original opcode.
static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool isCombineInstrCandidate32(unsigned Opc)
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, SmallVectorImpl< MachineOperand > &Cond)
static unsigned offsetExtendOpcode(unsigned Opcode)
MachineOutlinerMBBFlags
@ LRUnavailableSomewhere
@ UnsafeRegsDead
static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register DestReg, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
static void generateGatherLanePattern(MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned Pattern, unsigned NumLanes)
Generate optimized instruction sequence for gather load patterns to improve Memory-Level Parallelism ...
static bool getMiscPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns)
Find other MI combine patterns.
static bool outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, const outliner::Candidate &b)
static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, int64_t Offset1, unsigned Opcode1, int FI2, int64_t Offset2, unsigned Opcode2)
static cl::opt< unsigned > TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"))
static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB, Register &RegMBB, Register &RegOther)
static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, unsigned Reg, const StackOffset &Offset)
static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB)
static MachineInstr * genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC, FMAInstKind kind=FMAInstKind::Default, const Register *ReplacedAddend=nullptr)
genFusedMultiply - Generate fused multiply instructions.
static bool getGatherLanePattern(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, unsigned LoadLaneOpCode, unsigned NumLanes)
Check if the given instruction forms a gather load pattern that can be optimized for better Memory-Le...
static MachineInstr * genFusedMultiplyIdxNeg(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg, unsigned IdxMulOpd, unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC)
genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate instructions with an additional...
static bool isCombineInstrCandidate(unsigned Opc)
static unsigned regOffsetOpcode(unsigned Opcode)
MachineOutlinerClass
Constants defining how certain sequences should be outlined.
@ MachineOutlinerTailCall
Emit a save, restore, call, and return.
@ MachineOutlinerRegSave
Emit a call and tail-call.
@ MachineOutlinerNoLRSave
Only emit a branch.
@ MachineOutlinerThunk
Emit a call and return.
@ MachineOutlinerDefault
static cl::opt< unsigned > BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26), cl::desc("Restrict range of B instructions (DEBUG)"))
static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB)
Check if AArch64::NZCV should be alive in successors of MBB.
static void emitFrameOffsetAdj(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, int64_t Offset, unsigned Opc, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFAOffset, StackOffset CFAOffset, unsigned FrameReg)
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize)
static cl::opt< unsigned > CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"))
static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, unsigned IdxOpd1, DenseMap< Register, unsigned > &InstrIdxForVirtReg)
Do the following transformation A - (B + C) ==> (A - B) - C A - (B + C) ==> (A - C) - B.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, unsigned *NewReg=nullptr)
static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, const AArch64InstrInfo *TII, bool ShouldSignReturnAddr)
static MachineInstr * genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl< MachineInstr * > &InsInstrs)
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, unsigned MulOpc, unsigned ZeroReg)
static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MCInstrDesc &MCID, Register SrcReg, bool IsKill, unsigned SubIdx0, unsigned SubIdx1, int FI, MachineMemOperand *MMO)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
DXIL Forward Handle Accesses
@ Default
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Machine Check Debug Module
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
PowerPC Reduce CR logical Operation
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
Definition Debug.h:72
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, unsigned CombineOpc=0)
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
SignReturnAddress getSignReturnAddressCondition() const
void setOutliningStyle(const std::string &Style)
std::optional< bool > hasRedZone() const
static bool shouldSignReturnAddress(SignReturnAddress Condition, bool IsLRSpilled)
static bool isHForm(const MachineInstr &MI)
Returns whether the instruction is in H form (16 bit operands)
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool hasBTISemantics(const MachineInstr &MI)
Returns whether the instruction can be compatible with non-zero BTYPE.
static bool isQForm(const MachineInstr &MI)
Returns whether the instruction is in Q form (128 bit operands)
static bool getMemOpInfo(unsigned Opcode, TypeSize &Scale, TypeSize &Width, int64_t &MinOffset, int64_t &MaxOffset)
Returns true if Opcode is a load or store; if so, sets Scale, Width, MinOffset and MaxOffset for it.
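This is the usual starting point when deciding whether a byte offset fits a load/store's immediate field. A minimal sketch, under the assumption that MinOffset/MaxOffset are the pre-scaled immediate bounds (i.e. in units of Scale); the helper name and the choice of AArch64::LDRXui are illustrative only:
  static bool isEncodableImmOffset(unsigned Opc, int64_t ByteOff) {
    TypeSize Scale = TypeSize::getFixed(0), Width = TypeSize::getFixed(0);
    int64_t MinOff = 0, MaxOff = 0;
    if (!AArch64InstrInfo::getMemOpInfo(Opc, Scale, Width, MinOff, MaxOff))
      return false;
    int64_t Sc = Scale.getKnownMinValue();
    // Encodable iff ByteOff is a multiple of the scale and the scaled value
    // lies within [MinOff, MaxOff].
    return Sc && ByteOff % Sc == 0 && ByteOff / Sc >= MinOff && ByteOff / Sc <= MaxOff;
  }
  bool Fits = isEncodableImmOffset(AArch64::LDRXui, 32768); // false: 32768/8 = 4096 > 4095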
static bool isTailCallReturnInst(const MachineInstr &MI)
Returns true if MI is one of the TCRETURN* instructions.
static bool isFPRCopy(const MachineInstr &MI)
Does this instruction rename an FPR without modifying bits?
MachineInstr * emitLdStWithAddr(MachineInstr &MemI, const ExtAddrMode &AM) const override
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies value from one register to an...
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
GetInstSize - Return the number of bytes of code the specified instruction may be.
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
static bool isGPRCopy(const MachineInstr &MI)
Does this instruction rename a GPR without modifying bits?
static unsigned convertToFlagSettingOpc(unsigned Opc)
Return the opcode that sets flags when possible.
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
bool canInsertSelect(const MachineBasicBlock &, ArrayRef< MachineOperand > Cond, Register, Register, Register, int &, int &, int &) const override
Register isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
static const MachineOperand & getLdStOffsetOp(const MachineInstr &MI)
Returns the immediate offset operand of a load/store.
bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, unsigned &SubIdx) const override
static std::optional< unsigned > getUnscaledLdSt(unsigned Opc)
Returns the unscaled load/store for the scaled load/store opcode, if there is a corresponding unscale...
static bool hasUnscaledLdStOffset(unsigned Opc)
Return true if it has an unscaled load/store offset.
static const MachineOperand & getLdStAmountOp(const MachineInstr &MI)
Returns the shift amount operand of a load/store.
static bool isPreLdSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load/store.
std::optional< ExtAddrMode > getAddrModeFromMemoryOp(const MachineInstr &MemI, const TargetRegisterInfo *TRI) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &MI, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const override
bool analyzeBranchPredicate(MachineBasicBlock &MBB, MachineBranchPredicate &MBP, bool AllowModify) const override
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
static bool isPairableLdStInst(const MachineInstr &MI)
Return true if the given load or store may be paired with another.
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
const AArch64RegisterInfo & getRegisterInfo() const
getRegisterInfo - TargetInstrInfo is a superset of MRegister info.
static bool isPreSt(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed store.
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
AArch64InstrInfo(const AArch64Subtarget &STI)
static bool isPairedLdSt(const MachineInstr &MI)
Returns whether the instruction is a paired load/store.
bool getMemOperandWithOffsetWidth(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, TypeSize &Width, const TargetRegisterInfo *TRI) const
If OffsetIsScalable is set to 'true', the offset is scaled by vscale.
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isStridedAccess(const MachineInstr &MI)
Return true if the given load or store is a strided memory access.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
Detect opportunities for ldp/stp formation.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
bool isThroughputPattern(unsigned Pattern) const override
Return true when a code sequence can improve throughput.
MachineOperand & getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const
Return the immediate offset of the base register in a load/store LdSt.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg, const MachineInstr &AddrI, ExtAddrMode &AM) const override
static bool isLdStPairSuppressed(const MachineInstr &MI)
Return true if pairing the given load or store is hinted to be unprofitable.
Register isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
Check for post-frame ptr elimination stack locations as well.
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
MachineBasicBlock::iterator probedStackAlloc(MachineBasicBlock::iterator MBBI, Register TargetReg, bool FrameSetup) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
optimizeCompareInstr - Convert the instruction supplying the argument to the comparison into one that...
static unsigned getLoadStoreImmIdx(unsigned Opc)
Returns the index for the immediate for a given instruction.
static bool isGPRZero(const MachineInstr &MI)
Does this instruction set its full destination register to zero?
void copyGPRRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, unsigned ZeroReg, llvm::ArrayRef< unsigned > Indices) const
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
analyzeCompare - For a comparison instruction, return the source registers in SrcReg and SrcReg2,...
CombinerObjective getCombinerObjective(unsigned Pattern) const override
static bool isFpOrNEON(Register Reg)
Returns whether the physical register is FP or NEON.
bool isAsCheapAsAMove(const MachineInstr &MI) const override
std::optional< DestSourcePair > isCopyLikeInstrImpl(const MachineInstr &MI) const override
static void suppressLdStPair(MachineInstr &MI)
Hint that pairing the given load or store is unprofitable.
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
static bool isPreLd(const MachineInstr &MI)
Returns whether the instruction is a pre-indexed load.
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef< unsigned > Indices) const
bool optimizeCondBranch(MachineInstr &MI) const override
Replace csincr-branch sequence by simple conditional branch.
static int getMemScale(unsigned Opc)
Scaling factor for (scaled or unscaled) load or store.
bool isCandidateToMergeOrPair(const MachineInstr &MI) const
Return true if this is a load/store that can be potentially paired/merged.
MCInst getNop() const override
static const MachineOperand & getLdStBaseOp(const MachineInstr &MI)
Returns the base register operand of a load/store.
bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
This is an important base class in LLVM.
Definition Constant.h:43
A debug info location.
Definition DebugLoc.h:123
bool empty() const
Definition DenseMap.h:109
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
static LocationSize precise(uint64_t Value)
This class is intended to be used as a base class for asm properties and features specific to the tar...
Definition MCAsmInfo.h:64
bool usesWindowsCFI() const
Definition MCAsmInfo.h:652
static MCCFIInstruction cfiDefCfa(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa defines a rule for computing CFA as: take address from Register and add Offset to it.
Definition MCDwarf.h:576
static MCCFIInstruction createOffset(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_offset Previous value of Register is saved at offset Offset from CFA.
Definition MCDwarf.h:618
static MCCFIInstruction cfiDefCfaOffset(MCSymbol *L, int64_t Offset, SMLoc Loc={})
.cfi_def_cfa_offset modifies a rule for computing CFA.
Definition MCDwarf.h:591
static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals, SMLoc Loc={}, StringRef Comment="")
.cfi_escape Allows the user to add arbitrary bytes to the unwind info.
Definition MCDwarf.h:688
Instances of this class represent a single low-level machine instruction.
Definition MCInst.h:188
Describe properties that are true of each instruction in the target description file.
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
ArrayRef< MCOperandInfo > operands() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isValid() const
Definition MCRegister.h:84
static constexpr unsigned NoRegister
Definition MCRegister.h:60
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1569
Set of metadata that should be preserved when using BuildMI().
bool isInlineAsmBrIndirectTarget() const
Returns true if this is the indirect dest of an INLINEASM_BR.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
reverse_instr_iterator instr_rbegin()
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
reverse_instr_iterator instr_rend()
Instructions::iterator instr_iterator
Instructions::const_iterator const_instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
void setStackID(int ObjectIdx, uint8_t ID)
bool isCalleeSavedInfoValid() const
Has the callee saved info been calculated yet?
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
unsigned getNumObjects() const
Return the number of objects.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
unsigned addFrameInst(const MCCFIInstruction &Inst)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addCFIIndex(unsigned CFIIndex) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
reverse_iterator getReverse() const
Get a reverse iterator to the same node.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
bool isCall(QueryType Type=AnyInBundle) const
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
LLVM_ABI uint32_t mergeFlagsWith(const MachineInstr &Other) const
Return the MIFlags which represent both MachineInstrs.
unsigned getNumOperands() const
Returns the total number of operands.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
LLVM_ABI bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool registerDefIsDead(Register Reg, const TargetRegisterInfo *TRI) const
Returns true if the register is dead in this machine instruction.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI bool isLoadFoldBarrier() const
Returns true if it is illegal to fold a load across this instruction.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void addRegisterDefined(Register Reg, const TargetRegisterInfo *RegInfo=nullptr)
We have determined MI defines a register.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
LLVM_ABI int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
This class contains meta information specific to a module.
LLVM_ABI MachineFunction * getMachineFunction(const Function &F) const
Returns the MachineFunction associated with IR function F if there is one, otherwise nullptr.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineBasicBlock * getMBB() const
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool tracksLiveness() const
tracksLiveness - Returns true when tracking register liveness accurately.
bool reservedRegsFrozen() const
reservedRegsFrozen - Returns true after freezeReservedRegs() was called to ensure the set of reserved...
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
MI-level patchpoint operands.
Definition StackMaps.h:77
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given patchpoint should emit.
Definition StackMaps.h:105
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static constexpr bool isVirtualRegister(unsigned Reg)
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:66
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Represents a location in source code.
Definition SMLoc.h:22
bool erase(PtrType Ptr)
Remove pointer from the set.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
bool empty() const
Definition SmallSet.h:168
bool erase(const T &V)
Definition SmallSet.h:199
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
MI-level stackmap operands.
Definition StackMaps.h:36
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given stackmap should emit.
Definition StackMaps.h:51
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
int64_t getFixed() const
Returns the fixed component of the stack.
Definition TypeSize.h:46
int64_t getScalable() const
Returns the scalable component of the stack.
Definition TypeSize.h:49
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:41
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
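A quick sketch of how these factories compose a mixed offset, e.g. 16 fixed bytes plus 2 vscale-scaled bytes for an SVE stack object:
  StackOffset Off = StackOffset::get(/*Fixed=*/16, /*Scalable=*/2);
  int64_t FixedBytes    = Off.getFixed();    // 16
  int64_t ScalableBytes = Off.getScalable(); // 2, multiplied by vscale at runtime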
MI-level Statepoint operands.
Definition StackMaps.h:159
uint32_t getNumPatchBytes() const
Return the number of patchable bytes the given statepoint should emit.
Definition StackMaps.h:208
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Object returned by analyzeLoopForPipelining.
TargetInstrInfo - Interface to description of machine instruction set.
virtual void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstIdxForVirtReg) const
When getMachineCombinerPatterns() finds patterns, this function generates the instructions that could...
virtual std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const
Produce the expression describing the MI loading a value into the physical register Reg.
virtual bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
virtual bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const
Test if the given instruction should be considered a scheduling boundary.
virtual CombinerObjective getCombinerObjective(unsigned Pattern) const
Return the objective of a combiner pattern.
virtual bool isFunctionSafeToSplit(const MachineFunction &MF) const
Return true if the function is a viable candidate for machine function splitting.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetInstrInfo * getInstrInfo() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
Value * getOperand(unsigned i) const
Definition User.h:233
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
self_iterator getIterator()
Definition ilist_node.h:123
A raw_ostream that writes to an std::string.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_S
MO_S - Indicates that the bits of the symbol operand represented by MO_G0 etc are signed.
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_PREL
MO_PREL - Indicates that the bits of the symbol operand represented by MO_G0 etc are PC relative.
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_ARM64EC_CALLMANGLE
MO_ARM64EC_CALLMANGLE - Operand refers to the Arm64EC-mangled version of a symbol,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_TAGGED
MO_TAGGED - With MO_PAGE, indicates that the page includes a memory tag in bits 56-63.
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
unsigned getCheckerSizeInBytes(AuthCheckMethod Method)
Returns the number of bytes added by checkAuthenticatedRegister.
static uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize)
decodeLogicalImmediate - Decode a logical immediate value in the form "N:immr:imms" (where the immr a...
static unsigned getShiftValue(unsigned Imm)
getShiftValue - Extract the shift value.
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
static unsigned getArithShiftValue(unsigned Imm)
getArithShiftValue - get the arithmetic shift value.
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
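A round-trip sketch for the encode/decode pair above. 0x00ff00ff00ff00ff is used because it is a valid 64-bit bitmask immediate (a repeated 16-bit element); the encoder expects an encodable value:
  uint64_t Imm = 0x00ff00ff00ff00ffULL;
  uint64_t Enc = AArch64_AM::encodeLogicalImmediate(Imm, /*regSize=*/64);
  assert(AArch64_AM::decodeLogicalImmediate(Enc, /*regSize=*/64) == Imm &&
         "N:immr:imms encoding should round-trip");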
static AArch64_AM::ShiftExtendType getExtendType(unsigned Imm)
getExtendType - Extract the extend type for operands of arithmetic ops.
static AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm)
static AArch64_AM::ShiftExtendType getShiftType(unsigned Imm)
getShiftType - Extract the shift type.
static unsigned getShifterImm(AArch64_AM::ShiftExtendType ST, unsigned Imm)
getShifterImm - Encode the shift type and amount: imm: 6-bit shift amount shifter: 000 ==> lsl 001 ==...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
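A sketch of querying the expander without building MachineInstrs, assuming the AArch64_IMM namespace and ImmInsnModel record declared in AArch64ExpandImm.h:
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(0x0000123400005678ULL, /*BitSize=*/64, Insn);
  // Insn.size() is the number of move-immediate style instructions required.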
static const uint64_t InstrFlagIsWhile
static const uint64_t InstrFlagIsPTestLike
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Renamable
Register that may be renamed.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
initializer< Ty > init(const Ty &Val)
InstrType
Represents how an instruction should be mapped by the outliner.
LLVM_ABI Instruction & back() const
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
static bool isCondBranchOpcode(int Opc)
MCCFIInstruction createDefCFA(const TargetRegisterInfo &TRI, unsigned FrameReg, unsigned Reg, const StackOffset &Offset, bool LastAdjustmentWasScalable=true)
static bool isPTrueOpcode(unsigned Opc)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
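The builder idiom used throughout the expansion code; a minimal sketch that appends "add x0, x1, #4" at iterator I (MBB, I, DL and TII are assumed to be in scope, and the register/immediate choices are illustrative):
  BuildMI(MBB, I, DL, TII->get(AArch64::ADDXri), AArch64::X0)
      .addReg(AArch64::X1)
      .addImm(4)   // 12-bit unsigned immediate
      .addImm(0);  // shift applied to the immediate (0 or 12)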
bool succeeded(LogicalResult Result)
Utility function that returns true if the provided LogicalResult corresponds to a success value.
int isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &Offset, bool *OutUseUnscaledOp=nullptr, unsigned *OutUnscaledOp=nullptr, int64_t *EmittableOffset=nullptr)
Check if the Offset is a valid frame offset for MI.
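A sketch of the usual caller-side check, under the assumption that the return value is a bitmask of the AArch64FrameOffset* status flags listed further below (MI and Bytes are assumed to be in scope):
  StackOffset Off = StackOffset::getFixed(Bytes);
  int Status = isAArch64FrameOffsetLegal(MI, Off);
  if (Status & AArch64FrameOffsetCannotUpdate) {
    // The offset cannot be folded into MI; materialize the address in a
    // scratch register instead.
  }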
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2530
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
static bool isIndirectBranchOpcode(int Opc)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
unsigned getBLRCallOpcode(const MachineFunction &MF)
Return opcode to be used for indirect calls.
@ AArch64FrameOffsetIsLegal
Offset is legal.
@ AArch64FrameOffsetCanUpdate
Offset can apply, at least partly.
@ AArch64FrameOffsetCannotUpdate
Offset cannot apply.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
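These two helpers usually appear together when a power-of-two scale is turned into a shift amount, for example:
  uint64_t Scale = 16;
  if (isPowerOf2_64(Scale)) {
    unsigned Shift = Log2_64(Scale); // 4
    (void)Shift;
  }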
static bool isSEHInstruction(const MachineInstr &MI)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
AArch64MachineCombinerPattern
@ MULSUBv8i16_OP2
@ FMULv4i16_indexed_OP1
@ FMLSv1i32_indexed_OP2
@ MULSUBv2i32_indexed_OP1
@ FMLAv2i32_indexed_OP2
@ MULADDv4i16_indexed_OP2
@ FMLAv1i64_indexed_OP1
@ MULSUBv16i8_OP1
@ FMLAv8i16_indexed_OP2
@ FMULv2i32_indexed_OP1
@ MULSUBv8i16_indexed_OP2
@ FMLAv1i64_indexed_OP2
@ MULSUBv4i16_indexed_OP2
@ FMLAv1i32_indexed_OP1
@ FMLAv2i64_indexed_OP2
@ FMLSv8i16_indexed_OP1
@ MULSUBv2i32_OP1
@ FMULv4i16_indexed_OP2
@ MULSUBv4i32_indexed_OP2
@ FMULv2i64_indexed_OP2
@ FMLAv4i32_indexed_OP1
@ MULADDv4i16_OP2
@ FMULv8i16_indexed_OP2
@ MULSUBv4i16_OP1
@ MULADDv4i32_OP2
@ MULADDv2i32_OP2
@ MULADDv16i8_OP2
@ FMLSv4i16_indexed_OP1
@ MULADDv16i8_OP1
@ FMLAv2i64_indexed_OP1
@ FMLAv1i32_indexed_OP2
@ FMLSv2i64_indexed_OP2
@ MULADDv2i32_OP1
@ MULADDv4i32_OP1
@ MULADDv2i32_indexed_OP1
@ MULSUBv16i8_OP2
@ MULADDv4i32_indexed_OP1
@ MULADDv2i32_indexed_OP2
@ FMLAv4i16_indexed_OP2
@ MULSUBv8i16_OP1
@ FMULv2i32_indexed_OP2
@ FMLSv2i32_indexed_OP2
@ FMLSv4i32_indexed_OP1
@ FMULv2i64_indexed_OP1
@ MULSUBv4i16_OP2
@ FMLSv4i16_indexed_OP2
@ FMLAv2i32_indexed_OP1
@ FMLSv2i32_indexed_OP1
@ FMLAv8i16_indexed_OP1
@ MULSUBv4i16_indexed_OP1
@ FMLSv4i32_indexed_OP2
@ MULADDv4i32_indexed_OP2
@ MULSUBv4i32_OP2
@ MULSUBv8i16_indexed_OP1
@ MULADDv8i16_OP2
@ MULSUBv2i32_indexed_OP2
@ FMULv4i32_indexed_OP2
@ FMLSv2i64_indexed_OP1
@ MULADDv4i16_OP1
@ FMLAv4i32_indexed_OP2
@ MULADDv8i16_indexed_OP1
@ FMULv4i32_indexed_OP1
@ FMLAv4i16_indexed_OP1
@ FMULv8i16_indexed_OP1
@ MULADDv8i16_OP1
@ MULSUBv4i32_indexed_OP1
@ MULSUBv4i32_OP1
@ FMLSv8i16_indexed_OP2
@ MULADDv8i16_indexed_OP2
@ MULSUBv2i32_OP2
@ FMLSv1i64_indexed_OP2
@ MULADDv4i16_indexed_OP1
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag=MachineInstr::NoFlags, bool SetNZCV=false, bool NeedsWinCFI=false, bool *HasWinCFI=nullptr, bool EmitCFAOffset=false, StackOffset InitialOffset={}, unsigned FrameReg=AArch64::SP)
emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg plus Offset.
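A minimal sketch of a call matching the signature above: materialize SP plus 64 bytes into a scratch register during frame setup (the register choice is illustrative, and MBB, MBBI, DL and TII are assumed to be in scope):
  emitFrameOffset(MBB, MBBI, DL, AArch64::X16, AArch64::SP,
                  StackOffset::getFixed(64), TII,
                  MachineInstr::FrameSetup);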
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
CombinerObjective
The combiner's goal may differ based on which pattern it is attempting to optimize.
std::optional< UsedNZCV > examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, const TargetRegisterInfo &TRI, SmallVectorImpl< MachineInstr * > *CCUseInstrs=nullptr)
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
auto instructionsWithoutDebug(IterT It, IterT End, bool SkipPseudoOp=true)
Construct a range iterator which begins at It and moves forwards until End is reached,...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
unsigned getUndefRegState(bool B)
static MCRegister getXRegFromWReg(MCRegister Reg)
unsigned getDefRegState(bool B)
MCCFIInstruction createCFAOffset(const TargetRegisterInfo &MRI, unsigned Reg, const StackOffset &OffsetFromDefCFA, std::optional< int64_t > IncomingVGOffsetFromDefCFA)
unsigned getKillRegState(bool B)
ArrayRef(const T &OneElt) -> ArrayRef< T >
static bool isUncondBranchOpcode(int Opc)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2168
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, StackOffset &Offset, const AArch64InstrInfo *TII)
rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the FP.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
static const MachineMemOperand::Flags MOSuppressPair
constexpr int64_t SignExtend64(uint64_t x)
Sign-extend the number in the bottom B bits of X to a 64-bit integer.
Definition MathExtras.h:572
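For example, an all-ones 9-bit immediate field (as used by unscaled loads/stores) decodes to -1:
  int64_t Off = SignExtend64<9>(0x1FF); // -1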
void appendLEB128(SmallVectorImpl< U > &Buffer, T Value)
Definition LEB128.h:236
bool optimizeTerminators(MachineBasicBlock *MBB, const TargetInstrInfo &TII)
std::pair< MachineOperand, DIExpression * > ParamLoadedValue
bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, const MachineInstr &UseMI, const TargetRegisterInfo *TRI)
Return true if there is an instruction /after/ DefMI and before UseMI which either reads or clobbers ...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
static const MachineMemOperand::Flags MOStridedAccess
void fullyRecomputeLiveIns(ArrayRef< MachineBasicBlock * > MBBs)
Convenience function for recomputing live-in's for a set of MBBs until the computation converges.
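Typical use after splitting or creating blocks during branch relaxation (the block names here are hypothetical):
  fullyRecomputeLiveIns({&RestoreBB, &NewDestBB, &MBB});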
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
Used to describe addressing mode similar to ExtAddrMode in CodeGenPrepare.
LLVM_ABI static const MBBSectionID ColdSectionID
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
An individual sequence of instructions to be replaced with a call to an outlined function.
MachineFunction * getMF() const
The information necessary to create an outlined function for some class of candidate.