1//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8//===----------------------------------------------------------------------===//
9//
10
11#include "AMDGPU.h"
12#include "GCNSubtarget.h"
13#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
14#include "SIMachineFunctionInfo.h"
15#include "llvm/ADT/DepthFirstIterator.h"
16#include "llvm/CodeGen/MachineFunctionPass.h"
17#include "llvm/CodeGen/MachineOperand.h"
18
19#define DEBUG_TYPE "si-fold-operands"
20using namespace llvm;
21
22namespace {
23
24struct FoldCandidate {
25 MachineInstr *UseMI;
26 union {
27 MachineOperand *OpToFold;
28 uint64_t ImmToFold;
29 int FrameIndexToFold;
30 };
31 int ShrinkOpcode;
32 unsigned UseOpNo;
33 MachineOperand::MachineOperandType Kind;
34 bool Commuted;
35
36 FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
37 bool Commuted_ = false,
38 int ShrinkOp = -1) :
39 UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
40 Kind(FoldOp->getType()),
41 Commuted(Commuted_) {
42 if (FoldOp->isImm()) {
43 ImmToFold = FoldOp->getImm();
44 } else if (FoldOp->isFI()) {
45 FrameIndexToFold = FoldOp->getIndex();
46 } else {
47 assert(FoldOp->isReg() || FoldOp->isGlobal());
48 OpToFold = FoldOp;
49 }
50 }
51
52 bool isFI() const {
53 return Kind == MachineOperand::MO_FrameIndex;
54 }
55
56 bool isImm() const {
57 return Kind == MachineOperand::MO_Immediate;
58 }
59
60 bool isReg() const {
61 return Kind == MachineOperand::MO_Register;
62 }
63
64 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
65
66 bool needsShrink() const { return ShrinkOpcode != -1; }
67};
68
69class SIFoldOperands : public MachineFunctionPass {
70public:
71 static char ID;
72 MachineRegisterInfo *MRI;
73 const SIInstrInfo *TII;
74 const SIRegisterInfo *TRI;
75 const GCNSubtarget *ST;
76 const SIMachineFunctionInfo *MFI;
77
78 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
79 const MachineOperand &OpToFold) const;
80
81 bool updateOperand(FoldCandidate &Fold) const;
82
83 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
84 MachineInstr *MI, unsigned OpNo,
85 MachineOperand *OpToFold) const;
86 bool isUseSafeToFold(const MachineInstr &MI,
87 const MachineOperand &UseMO) const;
88 bool
89 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
90 Register UseReg, uint8_t OpTy) const;
91 bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI,
92 unsigned UseOpIdx,
93 SmallVectorImpl<FoldCandidate> &FoldList) const;
94 void foldOperand(MachineOperand &OpToFold,
95 MachineInstr *UseMI,
96 int UseOpIdx,
97 SmallVectorImpl<FoldCandidate> &FoldList,
98 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
99
100 MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const;
101 bool tryConstantFoldOp(MachineInstr *MI) const;
102 bool tryFoldCndMask(MachineInstr &MI) const;
103 bool tryFoldZeroHighBits(MachineInstr &MI) const;
104 bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
105 bool tryFoldFoldableCopy(MachineInstr &MI,
106 MachineOperand *&CurrentKnownM0Val) const;
107
108 const MachineOperand *isClamp(const MachineInstr &MI) const;
109 bool tryFoldClamp(MachineInstr &MI);
110
111 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
112 bool tryFoldOMod(MachineInstr &MI);
113 bool tryFoldRegSequence(MachineInstr &MI);
114 bool tryFoldPhiAGPR(MachineInstr &MI);
115 bool tryFoldLoad(MachineInstr &MI);
116
117 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
118
119public:
120 SIFoldOperands() : MachineFunctionPass(ID) {
121 initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
122 }
123
124 bool runOnMachineFunction(MachineFunction &MF) override;
125
126 StringRef getPassName() const override { return "SI Fold Operands"; }
127
128 void getAnalysisUsage(AnalysisUsage &AU) const override {
129 AU.setPreservesCFG();
130 MachineFunctionPass::getAnalysisUsage(AU);
131 }
132};
133
134} // End anonymous namespace.
135
136INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
137 "SI Fold Operands", false, false)
138
139char SIFoldOperands::ID = 0;
140
141char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
142
143static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
144 const TargetRegisterInfo &TRI,
145 const MachineOperand &MO) {
146 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
147 if (const TargetRegisterClass *SubRC =
148 TRI.getSubRegisterClass(RC, MO.getSubReg()))
149 RC = SubRC;
150 return RC;
151}
152
153// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
154static unsigned macToMad(unsigned Opc) {
155 switch (Opc) {
156 case AMDGPU::V_MAC_F32_e64:
157 return AMDGPU::V_MAD_F32_e64;
158 case AMDGPU::V_MAC_F16_e64:
159 return AMDGPU::V_MAD_F16_e64;
160 case AMDGPU::V_FMAC_F32_e64:
161 return AMDGPU::V_FMA_F32_e64;
162 case AMDGPU::V_FMAC_F16_e64:
163 return AMDGPU::V_FMA_F16_gfx9_e64;
164 case AMDGPU::V_FMAC_F16_t16_e64:
165 return AMDGPU::V_FMA_F16_gfx9_e64;
166 case AMDGPU::V_FMAC_LEGACY_F32_e64:
167 return AMDGPU::V_FMA_LEGACY_F32_e64;
168 case AMDGPU::V_FMAC_F64_e64:
169 return AMDGPU::V_FMA_F64_e64;
170 }
171 return AMDGPU::INSTRUCTION_LIST_END;
172}
173
174// TODO: Add heuristic that the frame index might not fit in the addressing mode
175// immediate offset to avoid materializing in loops.
176bool SIFoldOperands::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
177 const MachineOperand &OpToFold) const {
178 if (!OpToFold.isFI())
179 return false;
180
181 const unsigned Opc = UseMI.getOpcode();
182 if (TII->isMUBUF(UseMI))
183 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
184 if (!TII->isFLATScratch(UseMI))
185 return false;
186
187 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
188 if (OpNo == SIdx)
189 return true;
190
191 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
192 return OpNo == VIdx && SIdx == -1;
193}
194
195FunctionPass *llvm::createSIFoldOperandsPass() {
196 return new SIFoldOperands();
197}
198
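// Apply a collected FoldCandidate to its use instruction: rewrite the use
// operand as the folded immediate, frame index, global address or register,
// shrinking the instruction to its 32-bit encoding when required.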
199bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
200 MachineInstr *MI = Fold.UseMI;
201 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
202 assert(Old.isReg());
203
204
205 const uint64_t TSFlags = MI->getDesc().TSFlags;
206 if (Fold.isImm()) {
207 if (TSFlags & SIInstrFlags::IsPacked && !(TSFlags & SIInstrFlags::IsMAI) &&
208 (!ST->hasDOTOpSelHazard() || !(TSFlags & SIInstrFlags::IsDOT)) &&
209 AMDGPU::isFoldableLiteralV216(Fold.ImmToFold,
210 ST->hasInv2PiInlineImm())) {
211 // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
212 // already set.
213 unsigned Opcode = MI->getOpcode();
214 int OpNo = MI->getOperandNo(&Old);
215 int ModIdx = -1;
216 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
217 ModIdx = AMDGPU::OpName::src0_modifiers;
218 else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
219 ModIdx = AMDGPU::OpName::src1_modifiers;
220 else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
221 ModIdx = AMDGPU::OpName::src2_modifiers;
222 assert(ModIdx != -1);
223 ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
224 MachineOperand &Mod = MI->getOperand(ModIdx);
225 unsigned Val = Mod.getImm();
226 if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) {
227 // Only apply the following transformation if that operand requires
228 // a packed immediate.
229 switch (TII->get(Opcode).operands()[OpNo].OperandType) {
230 case AMDGPU::OPERAND_REG_IMM_V2FP16:
231 case AMDGPU::OPERAND_REG_IMM_V2INT16:
232 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
233 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
234 // If upper part is all zero we do not need op_sel_hi.
235 if (!isUInt<16>(Fold.ImmToFold)) {
236 if (!(Fold.ImmToFold & 0xffff)) {
237 Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
238 Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
239 Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
240 return true;
241 }
242 Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
243 Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
244 return true;
245 }
246 break;
247 default:
248 break;
249 }
250 }
251 }
252 }
253
254 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
255 MachineBasicBlock *MBB = MI->getParent();
256 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
257 if (Liveness != MachineBasicBlock::LQR_Dead) {
258 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
259 return false;
260 }
261
262 int Op32 = Fold.ShrinkOpcode;
263 MachineOperand &Dst0 = MI->getOperand(0);
264 MachineOperand &Dst1 = MI->getOperand(1);
265 assert(Dst0.isDef() && Dst1.isDef());
266
267 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
268
269 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
270 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
271
272 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
273
274 if (HaveNonDbgCarryUse) {
275 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
276 Dst1.getReg())
277 .addReg(AMDGPU::VCC, RegState::Kill);
278 }
279
280 // Keep the old instruction around to avoid breaking iterators, but
281 // replace it with a dummy instruction to remove uses.
282 //
283 // FIXME: We should not invert how this pass looks at operands to avoid
284 // this. Should track set of foldable movs instead of looking for uses
285 // when looking at a use.
286 Dst0.setReg(NewReg0);
287 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
288 MI->removeOperand(I);
289 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
290
291 if (Fold.Commuted)
292 TII->commuteInstruction(*Inst32, false);
293 return true;
294 }
295
296 assert(!Fold.needsShrink() && "not handled");
297
298 if (Fold.isImm()) {
299 if (Old.isTied()) {
300 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
301 if (NewMFMAOpc == -1)
302 return false;
303 MI->setDesc(TII->get(NewMFMAOpc));
304 MI->untieRegOperand(0);
305 }
306 Old.ChangeToImmediate(Fold.ImmToFold);
307 return true;
308 }
309
310 if (Fold.isGlobal()) {
311 Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
312 Fold.OpToFold->getTargetFlags());
313 return true;
314 }
315
316 if (Fold.isFI()) {
317 Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
318 return true;
319 }
320
321 MachineOperand *New = Fold.OpToFold;
322 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
323 Old.setIsUndef(New->isUndef());
324 return true;
325}
326
327static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
328 const MachineInstr *MI) {
329 return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });
330}
331
332static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
333 MachineInstr *MI, unsigned OpNo,
333 MachineInstr *MI, unsigned OpNo,
334 MachineOperand *FoldOp, bool Commuted = false,
335 int ShrinkOp = -1) {
336 // Skip additional folding on the same operand.
337 for (FoldCandidate &Fold : FoldList)
338 if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
339 return;
340 LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
341 << " operand " << OpNo << "\n " << *MI);
342 FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
343}
344
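// Try to legalize and record a fold of OpToFold into operand OpNo of MI,
// switching to a mad/fmaak/fmamk form or commuting the instruction when the
// operand is not legal in place.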
345bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
346 MachineInstr *MI, unsigned OpNo,
347 MachineOperand *OpToFold) const {
348 const unsigned Opc = MI->getOpcode();
349
350 auto tryToFoldAsFMAAKorMK = [&]() {
351 if (!OpToFold->isImm())
352 return false;
353
354 const bool TryAK = OpNo == 3;
355 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
356 MI->setDesc(TII->get(NewOpc));
357
358 // We have to fold into operand which would be Imm not into OpNo.
359 bool FoldAsFMAAKorMK =
360 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
361 if (FoldAsFMAAKorMK) {
362 // Untie Src2 of fmac.
363 MI->untieRegOperand(3);
364 // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
365 if (OpNo == 1) {
366 MachineOperand &Op1 = MI->getOperand(1);
367 MachineOperand &Op2 = MI->getOperand(2);
368 Register OldReg = Op1.getReg();
369 // Operand 2 might be an inlinable constant
370 if (Op2.isImm()) {
371 Op1.ChangeToImmediate(Op2.getImm());
372 Op2.ChangeToRegister(OldReg, false);
373 } else {
374 Op1.setReg(Op2.getReg());
375 Op2.setReg(OldReg);
376 }
377 }
378 return true;
379 }
380 MI->setDesc(TII->get(Opc));
381 return false;
382 };
383
384 if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
385 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
386 unsigned NewOpc = macToMad(Opc);
387 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
388 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
389 // to fold the operand.
390 MI->setDesc(TII->get(NewOpc));
391 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
392 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
393 if (AddOpSel)
394 MI->addOperand(MachineOperand::CreateImm(0));
395 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
396 if (FoldAsMAD) {
397 MI->untieRegOperand(OpNo);
398 return true;
399 }
400 if (AddOpSel)
401 MI->removeOperand(MI->getNumExplicitOperands() - 1);
402 MI->setDesc(TII->get(Opc));
403 }
404
405 // Special case for s_fmac_f32 if we are trying to fold into Src2.
406 // By transforming into fmaak we can untie Src2 and make folding legal.
407 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
408 if (tryToFoldAsFMAAKorMK())
409 return true;
410 }
411
412 // Special case for s_setreg_b32
413 if (OpToFold->isImm()) {
414 unsigned ImmOpc = 0;
415 if (Opc == AMDGPU::S_SETREG_B32)
416 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
417 else if (Opc == AMDGPU::S_SETREG_B32_mode)
418 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
419 if (ImmOpc) {
420 MI->setDesc(TII->get(ImmOpc));
421 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
422 return true;
423 }
424 }
425
426 // If we are already folding into another operand of MI, then
427 // we can't commute the instruction, otherwise we risk making the
428 // other fold illegal.
429 if (isUseMIInFoldList(FoldList, MI))
430 return false;
431
432 unsigned CommuteOpNo = OpNo;
433
434 // Operand is not legal, so try to commute the instruction to
435 // see if this makes it possible to fold.
436 unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
437 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
438 bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);
439
440 if (CanCommute) {
441 if (CommuteIdx0 == OpNo)
442 CommuteOpNo = CommuteIdx1;
443 else if (CommuteIdx1 == OpNo)
444 CommuteOpNo = CommuteIdx0;
445 }
446
447
448 // One of the operands might be an Imm operand, and OpNo may refer to it
449 // after the call to commuteInstruction() below. Such situations are avoided
450 // here explicitly as OpNo must be a register operand to be a candidate
451 // for memory folding.
452 if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
453 !MI->getOperand(CommuteIdx1).isReg()))
454 return false;
455
456 if (!CanCommute ||
457 !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
458 return false;
459
460 if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
461 if ((Opc == AMDGPU::V_ADD_CO_U32_e64 ||
462 Opc == AMDGPU::V_SUB_CO_U32_e64 ||
463 Opc == AMDGPU::V_SUBREV_CO_U32_e64) && // FIXME
464 (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
465
466 // Verify the other operand is a VGPR, otherwise we would violate the
467 // constant bus restriction.
468 unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
469 MachineOperand &OtherOp = MI->getOperand(OtherIdx);
470 if (!OtherOp.isReg() ||
471 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
472 return false;
473
474 assert(MI->getOperand(1).isDef());
475
476 // Make sure to get the 32-bit version of the commuted opcode.
477 unsigned MaybeCommutedOpc = MI->getOpcode();
478 int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
479
480 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
481 return true;
482 }
483
484 TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
485 return false;
486 }
487
488 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true);
489 return true;
490 }
491
492 // Inlineable constant might have been folded into Imm operand of fmaak or
493 // fmamk and we are trying to fold a non-inlinable constant.
494 if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
495 !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
496 unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
497 MachineOperand &OpImm = MI->getOperand(ImmIdx);
498 if (!OpImm.isReg() &&
499 TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
500 return tryToFoldAsFMAAKorMK();
501 }
502
503 // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
504 // By changing into fmamk we can untie Src2.
505 // If folding for Src0 happens first and it is identical operand to Src1 we
506 // should avoid transforming into fmamk which requires commuting as it would
507 // cause folding into Src1 to fail later on due to wrong OpNo used.
508 if (Opc == AMDGPU::S_FMAC_F32 &&
509 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
510 if (tryToFoldAsFMAAKorMK())
511 return true;
512 }
513
514 // Check the case where we might introduce a second constant operand to a
515 // scalar instruction
516 if (TII->isSALU(MI->getOpcode())) {
517 const MCInstrDesc &InstDesc = MI->getDesc();
518 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
519
520 // Fine if the operand can be encoded as an inline constant
521 if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) {
522 // Otherwise check for another constant
523 for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
524 auto &Op = MI->getOperand(i);
525 if (OpNo != i && !Op.isReg() &&
526 !TII->isInlineConstant(Op, InstDesc.operands()[i]))
527 return false;
528 }
529 }
530 }
531
532 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
533 return true;
534}
535
536bool SIFoldOperands::isUseSafeToFold(const MachineInstr &MI,
537 const MachineOperand &UseMO) const {
538 // Operands of SDWA instructions must be registers.
539 return !TII->isSDWA(MI);
540}
541
542// Find a def of the UseReg, check if it is a reg_sequence and find initializers
543// for each subreg, tracking it to foldable inline immediate if possible.
544// Returns true on success.
545bool SIFoldOperands::getRegSeqInit(
546 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
547 Register UseReg, uint8_t OpTy) const {
548 MachineInstr *Def = MRI->getVRegDef(UseReg);
549 if (!Def || !Def->isRegSequence())
550 return false;
551
552 for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
553 MachineOperand *Sub = &Def->getOperand(I);
554 assert(Sub->isReg());
555
556 for (MachineInstr *SubDef = MRI->getVRegDef(Sub->getReg());
557 SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
558 !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
559 SubDef = MRI->getVRegDef(Sub->getReg())) {
560 MachineOperand *Op = &SubDef->getOperand(1);
561 if (Op->isImm()) {
562 if (TII->isInlineConstant(*Op, OpTy))
563 Sub = Op;
564 break;
565 }
566 if (!Op->isReg() || Op->getReg().isPhysical())
567 break;
568 Sub = Op;
569 }
570
571 Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
572 }
573
574 return true;
575}
576
577bool SIFoldOperands::tryToFoldACImm(
578 const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
579 SmallVectorImpl<FoldCandidate> &FoldList) const {
580 const MCInstrDesc &Desc = UseMI->getDesc();
581 if (UseOpIdx >= Desc.getNumOperands())
582 return false;
583
584 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
585 if ((OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
586 OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) &&
587 (OpTy < AMDGPU::OPERAND_REG_INLINE_C_FIRST ||
588 OpTy > AMDGPU::OPERAND_REG_INLINE_C_LAST))
589 return false;
590
591 if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
592 TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
593 UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
594 return true;
595 }
596
597 if (!OpToFold.isReg())
598 return false;
599
600 Register UseReg = OpToFold.getReg();
601 if (!UseReg.isVirtual())
602 return false;
603
604 if (isUseMIInFoldList(FoldList, UseMI))
605 return false;
606
607 // Maybe it is just a COPY of an immediate itself.
608 MachineInstr *Def = MRI->getVRegDef(UseReg);
609 MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
610 if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
611 MachineOperand &DefOp = Def->getOperand(1);
612 if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
613 TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
614 UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
615 return true;
616 }
617 }
618
619 SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
620 if (!getRegSeqInit(Defs, UseReg, OpTy))
621 return false;
622
623 int32_t Imm;
624 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
625 const MachineOperand *Op = Defs[I].first;
626 if (!Op->isImm())
627 return false;
628
629 auto SubImm = Op->getImm();
630 if (!I) {
631 Imm = SubImm;
632 if (!TII->isInlineConstant(*Op, OpTy) ||
633 !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
634 return false;
635
636 continue;
637 }
638 if (Imm != SubImm)
639 return false; // Can only fold splat constants
640 }
641
642 appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
643 return true;
644}
645
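// Attempt to fold OpToFold into the use at UseOpIdx of UseMI. Simple cases
// (frame indexes, foldable copies, readfirstlane of immediates) are rewritten
// immediately; the rest are queued in FoldList for updateOperand().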
646void SIFoldOperands::foldOperand(
647 MachineOperand &OpToFold,
648 MachineInstr *UseMI,
649 int UseOpIdx,
650 SmallVectorImpl<FoldCandidate> &FoldList,
651 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
652 const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
653
654 if (!isUseSafeToFold(*UseMI, UseOp))
655 return;
656
657 // FIXME: Fold operands with subregs.
658 if (UseOp.isReg() && OpToFold.isReg() &&
659 (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister))
660 return;
661
662 // Special case for REG_SEQUENCE: We can't fold literals into
663 // REG_SEQUENCE instructions, so we have to fold them into the
664 // uses of REG_SEQUENCE.
665 if (UseMI->isRegSequence()) {
666 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
667 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
668
669 for (auto &RSUse : make_early_inc_range(MRI->use_nodbg_operands(RegSeqDstReg))) {
670 MachineInstr *RSUseMI = RSUse.getParent();
671
672 if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
673 RSUseMI->getOperandNo(&RSUse), FoldList))
674 continue;
675
676 if (RSUse.getSubReg() != RegSeqDstSubReg)
677 continue;
678
679 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(&RSUse), FoldList,
680 CopiesToReplace);
681 }
682
683 return;
684 }
685
686 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
687 return;
688
689 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
690 // Verify that this is a stack access.
691 // FIXME: Should probably use stack pseudos before frame lowering.
692
693 if (TII->isMUBUF(*UseMI)) {
694 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
695 MFI->getScratchRSrcReg())
696 return;
697
698 // Ensure this is either relative to the current frame or the current
699 // wave.
700 MachineOperand &SOff =
701 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
702 if (!SOff.isImm() || SOff.getImm() != 0)
703 return;
704 }
705
706 // A frame index will resolve to a positive constant, so it should always be
707 // safe to fold the addressing mode, even pre-GFX9.
708 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
709
710 const unsigned Opc = UseMI->getOpcode();
711 if (TII->isFLATScratch(*UseMI) &&
712 AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
713 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
714 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
715 UseMI->setDesc(TII->get(NewOpc));
716 }
717
718 return;
719 }
720
721 bool FoldingImmLike =
722 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
723
724 if (FoldingImmLike && UseMI->isCopy()) {
725 Register DestReg = UseMI->getOperand(0).getReg();
726 Register SrcReg = UseMI->getOperand(1).getReg();
727 assert(SrcReg.isVirtual());
728
729 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
730
731 // Don't fold into a copy to a physical register with the same class. Doing
732 // so would interfere with the register coalescer's logic which would avoid
733 // redundant initializations.
734 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
735 return;
736
737 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
738 if (!DestReg.isPhysical()) {
739 if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
740 SmallVector<FoldCandidate, 4> CopyUses;
741 for (auto &Use : MRI->use_nodbg_operands(DestReg)) {
742 // There's no point trying to fold into an implicit operand.
743 if (Use.isImplicit())
744 continue;
745
746 CopyUses.emplace_back(Use.getParent(),
747 Use.getParent()->getOperandNo(&Use),
748 &UseMI->getOperand(1));
749 }
750
751 for (auto &F : CopyUses) {
752 foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList,
753 CopiesToReplace);
754 }
755 }
756
757 if (DestRC == &AMDGPU::AGPR_32RegClass &&
758 TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
759 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
760 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
761 CopiesToReplace.push_back(UseMI);
762 return;
763 }
764 }
765
766 // In order to fold immediates into copies, we need to change the
767 // copy to a MOV.
768
769 unsigned MovOp = TII->getMovOpcode(DestRC);
770 if (MovOp == AMDGPU::COPY)
771 return;
772
773 UseMI->setDesc(TII->get(MovOp));
774 MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
775 MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
776 while (ImpOpI != ImpOpE) {
777 MachineInstr::mop_iterator Tmp = ImpOpI;
778 ImpOpI++;
779 UseMI->removeOperand(UseMI->getOperandNo(Tmp));
780 }
781 CopiesToReplace.push_back(UseMI);
782 } else {
783 if (UseMI->isCopy() && OpToFold.isReg() &&
784 UseMI->getOperand(0).getReg().isVirtual() &&
785 !UseMI->getOperand(1).getSubReg()) {
786 LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
787 unsigned Size = TII->getOpSize(*UseMI, 1);
788 Register UseReg = OpToFold.getReg();
789 UseMI->getOperand(1).setReg(UseReg);
790 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
791 UseMI->getOperand(1).setIsKill(false);
792 CopiesToReplace.push_back(UseMI);
793 OpToFold.setIsKill(false);
794
795 // Remove kill flags as kills may now be out of order with uses.
796 MRI->clearKillFlags(OpToFold.getReg());
797
798 // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32
799 // can only accept VGPR or inline immediate. Recreate a reg_sequence with
800 // its initializers right here, so we will rematerialize immediates and
801 // avoid copies via different reg classes.
802 SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
803 if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
804 getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
805 const DebugLoc &DL = UseMI->getDebugLoc();
806 MachineBasicBlock &MBB = *UseMI->getParent();
807
808 UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
809 for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
810 UseMI->removeOperand(I);
811
812 MachineInstrBuilder B(*MBB.getParent(), UseMI);
813 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
814 SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
815 for (unsigned I = 0; I < Size / 4; ++I) {
816 MachineOperand *Def = Defs[I].first;
817 TargetInstrInfo::RegSubRegPair CopyToVGPR;
818 if (Def->isImm() &&
819 TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
820 int64_t Imm = Def->getImm();
821
822 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
823 BuildMI(MBB, UseMI, DL,
824 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
825 B.addReg(Tmp);
826 } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
827 auto Src = getRegSubRegPair(*Def);
828 Def->setIsKill(false);
829 if (!SeenAGPRs.insert(Src)) {
830 // We cannot build a reg_sequence out of the same registers, they
831 // must be copied. Better do it here before copyPhysReg() created
832 // several reads to do the AGPR->VGPR->AGPR copy.
833 CopyToVGPR = Src;
834 } else {
835 B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
836 Src.SubReg);
837 }
838 } else {
839 assert(Def->isReg());
840 Def->setIsKill(false);
841 auto Src = getRegSubRegPair(*Def);
842
843 // Direct copy from SGPR to AGPR is not possible. To avoid creation
844 // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
845 // create a copy here and track if we already have such a copy.
846 if (TRI->isSGPRReg(*MRI, Src.Reg)) {
847 CopyToVGPR = Src;
848 } else {
849 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
850 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
851 B.addReg(Tmp);
852 }
853 }
854
855 if (CopyToVGPR.Reg) {
856 Register Vgpr;
857 if (VGPRCopies.count(CopyToVGPR)) {
858 Vgpr = VGPRCopies[CopyToVGPR];
859 } else {
860 Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
861 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
862 VGPRCopies[CopyToVGPR] = Vgpr;
863 }
864 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
865 BuildMI(MBB, UseMI, DL,
866 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
867 B.addReg(Tmp);
868 }
869
870 B.addImm(Defs[I].second);
871 }
872 LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
873 return;
874 }
875
876 if (Size != 4)
877 return;
878
879 Register Reg0 = UseMI->getOperand(0).getReg();
880 Register Reg1 = UseMI->getOperand(1).getReg();
881 if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1))
882 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
883 else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
884 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
885 else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&
886 TRI->isAGPR(*MRI, Reg1))
887 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
888 return;
889 }
890
891 unsigned UseOpc = UseMI->getOpcode();
892 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
893 (UseOpc == AMDGPU::V_READLANE_B32 &&
894 (int)UseOpIdx ==
895 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
896 // %vgpr = V_MOV_B32 imm
897 // %sgpr = V_READFIRSTLANE_B32 %vgpr
898 // =>
899 // %sgpr = S_MOV_B32 imm
900 if (FoldingImmLike) {
901 if (execMayBeModifiedBeforeUse(*MRI,
902 UseMI->getOperand(UseOpIdx).getReg(),
903 *OpToFold.getParent(),
904 *UseMI))
905 return;
906
907 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
908
909 if (OpToFold.isImm())
910 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
911 else
912 UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
913 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
914 return;
915 }
916
917 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
918 if (execMayBeModifiedBeforeUse(*MRI,
919 UseMI->getOperand(UseOpIdx).getReg(),
920 *OpToFold.getParent(),
921 *UseMI))
922 return;
923
924 // %vgpr = COPY %sgpr0
925 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
926 // =>
927 // %sgpr1 = COPY %sgpr0
928 UseMI->setDesc(TII->get(AMDGPU::COPY));
929 UseMI->getOperand(1).setReg(OpToFold.getReg());
930 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
931 UseMI->getOperand(1).setIsKill(false);
932 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
933 return;
934 }
935 }
936
937 const MCInstrDesc &UseDesc = UseMI->getDesc();
938
939 // Don't fold into target independent nodes. Target independent opcodes
940 // don't have defined register classes.
941 if (UseDesc.isVariadic() || UseOp.isImplicit() ||
942 UseDesc.operands()[UseOpIdx].RegClass == -1)
943 return;
944 }
945
946 if (!FoldingImmLike) {
947 if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
948 // Don't fold if OpToFold doesn't hold an aligned register.
949 const TargetRegisterClass *RC =
950 TRI->getRegClassForReg(*MRI, OpToFold.getReg());
951 if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
952 unsigned SubReg = OpToFold.getSubReg();
953 if (const TargetRegisterClass *SubRC =
954 TRI->getSubRegisterClass(RC, SubReg))
955 RC = SubRC;
956 }
957
958 if (!RC || !TRI->isProperlyAlignedRC(*RC))
959 return;
960 }
961
962 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
963
964 // FIXME: We could try to change the instruction from 64-bit to 32-bit
965 // to enable more folding opportunities. The shrink operands pass
966 // already does this.
967 return;
968 }
969
970
971 const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
972 const TargetRegisterClass *FoldRC =
973 TRI->getRegClass(FoldDesc.operands()[0].RegClass);
974
975 // Split 64-bit constants into 32-bits for folding.
976 if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
977 Register UseReg = UseOp.getReg();
978 const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
979 if (AMDGPU::getRegBitWidth(*UseRC) != 64)
980 return;
981
982 APInt Imm(64, OpToFold.getImm());
983 if (UseOp.getSubReg() == AMDGPU::sub0) {
984 Imm = Imm.getLoBits(32);
985 } else {
986 assert(UseOp.getSubReg() == AMDGPU::sub1);
987 Imm = Imm.getHiBits(32);
988 }
989
990 MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
991 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);
992 return;
993 }
994
995 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
996}
997
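// Constant-fold a 32-bit binary opcode with both source values known.
// Returns false for opcodes that are not handled.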
998static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
999 uint32_t LHS, uint32_t RHS) {
1000 switch (Opcode) {
1001 case AMDGPU::V_AND_B32_e64:
1002 case AMDGPU::V_AND_B32_e32:
1003 case AMDGPU::S_AND_B32:
1004 Result = LHS & RHS;
1005 return true;
1006 case AMDGPU::V_OR_B32_e64:
1007 case AMDGPU::V_OR_B32_e32:
1008 case AMDGPU::S_OR_B32:
1009 Result = LHS | RHS;
1010 return true;
1011 case AMDGPU::V_XOR_B32_e64:
1012 case AMDGPU::V_XOR_B32_e32:
1013 case AMDGPU::S_XOR_B32:
1014 Result = LHS ^ RHS;
1015 return true;
1016 case AMDGPU::S_XNOR_B32:
1017 Result = ~(LHS ^ RHS);
1018 return true;
1019 case AMDGPU::S_NAND_B32:
1020 Result = ~(LHS & RHS);
1021 return true;
1022 case AMDGPU::S_NOR_B32:
1023 Result = ~(LHS | RHS);
1024 return true;
1025 case AMDGPU::S_ANDN2_B32:
1026 Result = LHS & ~RHS;
1027 return true;
1028 case AMDGPU::S_ORN2_B32:
1029 Result = LHS | ~RHS;
1030 return true;
1031 case AMDGPU::V_LSHL_B32_e64:
1032 case AMDGPU::V_LSHL_B32_e32:
1033 case AMDGPU::S_LSHL_B32:
1034 // The instruction ignores the high bits for out of bounds shifts.
1035 Result = LHS << (RHS & 31);
1036 return true;
1037 case AMDGPU::V_LSHLREV_B32_e64:
1038 case AMDGPU::V_LSHLREV_B32_e32:
1039 Result = RHS << (LHS & 31);
1040 return true;
1041 case AMDGPU::V_LSHR_B32_e64:
1042 case AMDGPU::V_LSHR_B32_e32:
1043 case AMDGPU::S_LSHR_B32:
1044 Result = LHS >> (RHS & 31);
1045 return true;
1046 case AMDGPU::V_LSHRREV_B32_e64:
1047 case AMDGPU::V_LSHRREV_B32_e32:
1048 Result = RHS >> (LHS & 31);
1049 return true;
1050 case AMDGPU::V_ASHR_I32_e64:
1051 case AMDGPU::V_ASHR_I32_e32:
1052 case AMDGPU::S_ASHR_I32:
1053 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1054 return true;
1055 case AMDGPU::V_ASHRREV_I32_e64:
1056 case AMDGPU::V_ASHRREV_I32_e32:
1057 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1058 return true;
1059 default:
1060 return false;
1061 }
1062}
1063
1064static unsigned getMovOpc(bool IsScalar) {
1065 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1066}
1067
1068static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
1069 MI.setDesc(NewDesc);
1070
1071 // Remove any leftover implicit operands from mutating the instruction. e.g.
1072 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
1073 // anymore.
1074 const MCInstrDesc &Desc = MI.getDesc();
1075 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
1076 Desc.implicit_defs().size();
1077
1078 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
1079 MI.removeOperand(I);
1080}
1081
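// If Op is a virtual register defined by a move-immediate, return the
// materialized immediate operand; otherwise return Op itself.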
1082MachineOperand *
1083SIFoldOperands::getImmOrMaterializedImm(MachineOperand &Op) const {
1084 // If this has a subregister, it obviously is a register source.
1085 if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister ||
1086 !Op.getReg().isVirtual())
1087 return &Op;
1088
1089 MachineInstr *Def = MRI->getVRegDef(Op.getReg());
1090 if (Def && Def->isMoveImmediate()) {
1091 MachineOperand &ImmSrc = Def->getOperand(1);
1092 if (ImmSrc.isImm())
1093 return &ImmSrc;
1094 }
1095
1096 return &Op;
1097}
1098
1099// Try to simplify operations with a constant that may appear after instruction
1100// selection.
1101// TODO: See if a frame index with a fixed offset can fold.
1102bool SIFoldOperands::tryConstantFoldOp(MachineInstr *MI) const {
1103 if (!MI->allImplicitDefsAreDead())
1104 return false;
1105
1106 unsigned Opc = MI->getOpcode();
1107
1108 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1109 if (Src0Idx == -1)
1110 return false;
1111 MachineOperand *Src0 = getImmOrMaterializedImm(MI->getOperand(Src0Idx));
1112
1113 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1114 Opc == AMDGPU::S_NOT_B32) &&
1115 Src0->isImm()) {
1116 MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
1117 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1118 return true;
1119 }
1120
1121 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1122 if (Src1Idx == -1)
1123 return false;
1124 MachineOperand *Src1 = getImmOrMaterializedImm(MI->getOperand(Src1Idx));
1125
1126 if (!Src0->isImm() && !Src1->isImm())
1127 return false;
1128
1129 // and k0, k1 -> v_mov_b32 (k0 & k1)
1130 // or k0, k1 -> v_mov_b32 (k0 | k1)
1131 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1132 if (Src0->isImm() && Src1->isImm()) {
1133 int32_t NewImm;
1134 if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
1135 return false;
1136
1137 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1138
1139 // Be careful to change the right operand, src0 may belong to a different
1140 // instruction.
1141 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1142 MI->removeOperand(Src1Idx);
1143 mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
1144 return true;
1145 }
1146
1147 if (!MI->isCommutable())
1148 return false;
1149
1150 if (Src0->isImm() && !Src1->isImm()) {
1151 std::swap(Src0, Src1);
1152 std::swap(Src0Idx, Src1Idx);
1153 }
1154
1155 int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
1156 if (Opc == AMDGPU::V_OR_B32_e64 ||
1157 Opc == AMDGPU::V_OR_B32_e32 ||
1158 Opc == AMDGPU::S_OR_B32) {
1159 if (Src1Val == 0) {
1160 // y = or x, 0 => y = copy x
1161 MI->removeOperand(Src1Idx);
1162 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1163 } else if (Src1Val == -1) {
1164 // y = or x, -1 => y = v_mov_b32 -1
1165 MI->removeOperand(Src1Idx);
1166 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1167 } else
1168 return false;
1169
1170 return true;
1171 }
1172
1173 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1174 Opc == AMDGPU::S_AND_B32) {
1175 if (Src1Val == 0) {
1176 // y = and x, 0 => y = v_mov_b32 0
1177 MI->removeOperand(Src0Idx);
1178 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1179 } else if (Src1Val == -1) {
1180 // y = and x, -1 => y = copy x
1181 MI->removeOperand(Src1Idx);
1182 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1183 } else
1184 return false;
1185
1186 return true;
1187 }
1188
1189 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1190 Opc == AMDGPU::S_XOR_B32) {
1191 if (Src1Val == 0) {
1192 // y = xor x, 0 => y = copy x
1193 MI->removeOperand(Src1Idx);
1194 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1195 return true;
1196 }
1197 }
1198
1199 return false;
1200}
1201
1202// Try to fold an instruction into a simpler one
1203bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
1204 unsigned Opc = MI.getOpcode();
1205 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1206 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1207 return false;
1208
1209 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1210 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1211 if (!Src1->isIdenticalTo(*Src0)) {
1212 auto *Src0Imm = getImmOrMaterializedImm(*Src0);
1213 auto *Src1Imm = getImmOrMaterializedImm(*Src1);
1214 if (!Src1Imm->isIdenticalTo(*Src0Imm))
1215 return false;
1216 }
1217
1218 int Src1ModIdx =
1219 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1220 int Src0ModIdx =
1221 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1222 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1223 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1224 return false;
1225
1226 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1227 auto &NewDesc =
1228 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1229 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1230 if (Src2Idx != -1)
1231 MI.removeOperand(Src2Idx);
1232 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1233 if (Src1ModIdx != -1)
1234 MI.removeOperand(Src1ModIdx);
1235 if (Src0ModIdx != -1)
1236 MI.removeOperand(Src0ModIdx);
1237 mutateCopyOp(MI, NewDesc);
1238 LLVM_DEBUG(dbgs() << MI);
1239 return true;
1240}
1241
1242bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
1243 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1244 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1245 return false;
1246
1247 MachineOperand *Src0 = getImmOrMaterializedImm(MI.getOperand(1));
1248 if (!Src0->isImm() || Src0->getImm() != 0xffff)
1249 return false;
1250
1251 Register Src1 = MI.getOperand(2).getReg();
1252 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1253 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1254 return false;
1255
1256 Register Dst = MI.getOperand(0).getReg();
1257 MRI->replaceRegWith(Dst, SrcDef->getOperand(0).getReg());
1258 MI.eraseFromParent();
1259 return true;
1260}
1261
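// Fold OpToFold, the source of a foldable copy or mov defining MI's result,
// into all non-debug uses of that result register.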
1262bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
1263 MachineOperand &OpToFold) const {
1264 // We need to mutate the operands of new mov instructions to add implicit
1265 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1266 // this.
1267 SmallVector<MachineInstr *, 4> CopiesToReplace;
1268 SmallVector<FoldCandidate, 4> FoldList;
1269 MachineOperand &Dst = MI.getOperand(0);
1270 bool Changed = false;
1271
1272 if (OpToFold.isImm()) {
1273 for (auto &UseMI :
1274 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1275 // Folding the immediate may reveal operations that can be constant
1276 // folded or replaced with a copy. This can happen for example after
1277 // frame indices are lowered to constants or from splitting 64-bit
1278 // constants.
1279 //
1280 // We may also encounter cases where one or both operands are
1281 // immediates materialized into a register, which would ordinarily not
1282 // be folded due to multiple uses or operand constraints.
1283 if (tryConstantFoldOp(&UseMI)) {
1284 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1285 Changed = true;
1286 }
1287 }
1288 }
1289
1290 SmallVector<MachineOperand *, 4> UsesToProcess;
1291 for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
1291 for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
1292 UsesToProcess.push_back(&Use);
1293 for (auto *U : UsesToProcess) {
1294 MachineInstr *UseMI = U->getParent();
1295 foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1296 CopiesToReplace);
1297 }
1298
1299 if (CopiesToReplace.empty() && FoldList.empty())
1300 return Changed;
1301
1302 MachineFunction *MF = MI.getParent()->getParent();
1303 // Make sure we add EXEC uses to any new v_mov instructions created.
1304 for (MachineInstr *Copy : CopiesToReplace)
1305 Copy->addImplicitDefUseOperands(*MF);
1306
1307 for (FoldCandidate &Fold : FoldList) {
1308 assert(!Fold.isReg() || Fold.OpToFold);
1309 if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
1310 Register Reg = Fold.OpToFold->getReg();
1311 MachineInstr *DefMI = Fold.OpToFold->getParent();
1312 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1313 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1314 continue;
1315 }
1316 if (updateOperand(Fold)) {
1317 // Clear kill flags.
1318 if (Fold.isReg()) {
1319 assert(Fold.OpToFold && Fold.OpToFold->isReg());
1320 // FIXME: Probably shouldn't bother trying to fold if not an
1321 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1322 // copies.
1323 MRI->clearKillFlags(Fold.OpToFold->getReg());
1324 }
1325 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1326 << static_cast<int>(Fold.UseOpNo) << " of "
1327 << *Fold.UseMI);
1328 } else if (Fold.Commuted) {
1329 // Restoring instruction's original operand order if fold has failed.
1330 TII->commuteInstruction(*Fold.UseMI, false);
1331 }
1332 }
1333 return true;
1334}
1335
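// Process a foldable copy-like instruction: erase redundant rewrites of m0,
// fold the source into users of the destination, and delete the instruction
// (and any feeding copies) once it has no remaining uses.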
1336bool SIFoldOperands::tryFoldFoldableCopy(
1337 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
1338 // Specially track simple redefs of m0 to the same value in a block, so we
1339 // can erase the later ones.
1340 if (MI.getOperand(0).getReg() == AMDGPU::M0) {
1341 MachineOperand &NewM0Val = MI.getOperand(1);
1342 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1343 MI.eraseFromParent();
1344 return true;
1345 }
1346
1347 // We aren't tracking other physical registers
1348 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
1349 ? nullptr
1350 : &NewM0Val;
1351 return false;
1352 }
1353
1354 MachineOperand &OpToFold = MI.getOperand(1);
1355 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1356
1357 // FIXME: We could also be folding things like TargetIndexes.
1358 if (!FoldingImm && !OpToFold.isReg())
1359 return false;
1360
1361 if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
1362 return false;
1363
1364 // Prevent folding operands backwards in the function. For example,
1365 // the COPY opcode must not be replaced by 1 in this example:
1366 //
1367 // %3 = COPY %vgpr0; VGPR_32:%3
1368 // ...
1369 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
1370 if (!MI.getOperand(0).getReg().isVirtual())
1371 return false;
1372
1373 bool Changed = foldInstOperand(MI, OpToFold);
1374
1375 // If we managed to fold all uses of this copy then we might as well
1376 // delete it now.
1377 // The only reason we need to follow chains of copies here is that
1378 // tryFoldRegSequence looks forward through copies before folding a
1379 // REG_SEQUENCE into its eventual users.
1380 auto *InstToErase = &MI;
1381 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1382 auto &SrcOp = InstToErase->getOperand(1);
1383 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
1384 InstToErase->eraseFromParent();
1385 Changed = true;
1386 InstToErase = nullptr;
1387 if (!SrcReg || SrcReg.isPhysical())
1388 break;
1389 InstToErase = MRI->getVRegDef(SrcReg);
1390 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
1391 break;
1392 }
1393
1394 if (InstToErase && InstToErase->isRegSequence() &&
1395 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1396 InstToErase->eraseFromParent();
1397 Changed = true;
1398 }
1399
1400 return Changed;
1401}
1402
1403// Clamp patterns are canonically selected to v_max_* instructions, so only
1404// handle them.
1405const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
1406 unsigned Op = MI.getOpcode();
1407 switch (Op) {
1408 case AMDGPU::V_MAX_F32_e64:
1409 case AMDGPU::V_MAX_F16_e64:
1410 case AMDGPU::V_MAX_F16_t16_e64:
1411 case AMDGPU::V_MAX_F16_fake16_e64:
1412 case AMDGPU::V_MAX_F64_e64:
1413 case AMDGPU::V_PK_MAX_F16: {
1414 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
1415 return nullptr;
1416
1417 // Make sure sources are identical.
1418 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1419 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1420 if (!Src0->isReg() || !Src1->isReg() ||
1421 Src0->getReg() != Src1->getReg() ||
1422 Src0->getSubReg() != Src1->getSubReg() ||
1423 Src0->getSubReg() != AMDGPU::NoSubRegister)
1424 return nullptr;
1425
1426 // Can't fold up if we have modifiers.
1427 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1428 return nullptr;
1429
1430 unsigned Src0Mods
1431 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
1432 unsigned Src1Mods
1433 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
1434
1435 // Having a 0 op_sel_hi would require swizzling the output in the source
1436 // instruction, which we can't do.
1437 unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
1438 : 0u;
1439 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
1440 return nullptr;
1441 return Src0;
1442 }
1443 default:
1444 return nullptr;
1445 }
1446}
1447
1448// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
1449bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
1450 const MachineOperand *ClampSrc = isClamp(MI);
1451 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
1452 return false;
1453
1454 MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1455
1456 // The type of clamp must be compatible.
1457 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
1458 return false;
1459
1460 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
1461 if (!DefClamp)
1462 return false;
1463
1464 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
1465
1466 // Clamp is applied after omod, so it is OK if omod is set.
1467 DefClamp->setImm(1);
1468 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1469 MI.eraseFromParent();
1470
1471 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1472 // instruction, so we might as well convert it to the more flexible VOP3-only
1473 // mad/fma form.
1474 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1475 Def->eraseFromParent();
1476
1477 return true;
1478}
1479
1480static int getOModValue(unsigned Opc, int64_t Val) {
1481 switch (Opc) {
1482 case AMDGPU::V_MUL_F64_e64: {
1483 switch (Val) {
1484 case 0x3fe0000000000000: // 0.5
1485 return SIOutMods::DIV2;
1486 case 0x4000000000000000: // 2.0
1487 return SIOutMods::MUL2;
1488 case 0x4010000000000000: // 4.0
1489 return SIOutMods::MUL4;
1490 default:
1491 return SIOutMods::NONE;
1492 }
1493 }
1494 case AMDGPU::V_MUL_F32_e64: {
1495 switch (static_cast<uint32_t>(Val)) {
1496 case 0x3f000000: // 0.5
1497 return SIOutMods::DIV2;
1498 case 0x40000000: // 2.0
1499 return SIOutMods::MUL2;
1500 case 0x40800000: // 4.0
1501 return SIOutMods::MUL4;
1502 default:
1503 return SIOutMods::NONE;
1504 }
1505 }
1506 case AMDGPU::V_MUL_F16_e64:
1507 case AMDGPU::V_MUL_F16_t16_e64:
1508 case AMDGPU::V_MUL_F16_fake16_e64: {
1509 switch (static_cast<uint16_t>(Val)) {
1510 case 0x3800: // 0.5
1511 return SIOutMods::DIV2;
1512 case 0x4000: // 2.0
1513 return SIOutMods::MUL2;
1514 case 0x4400: // 4.0
1515 return SIOutMods::MUL4;
1516 default:
1517 return SIOutMods::NONE;
1518 }
1519 }
1520 default:
1521 llvm_unreachable("invalid mul opcode");
1522 }
1523}
1524
1525// FIXME: Does this really not support denormals with f16?
1526// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
1527// handled, so will anything other than that break?
1528std::pair<const MachineOperand *, int>
1529SIFoldOperands::isOMod(const MachineInstr &MI) const {
1530 unsigned Op = MI.getOpcode();
1531 switch (Op) {
1532 case AMDGPU::V_MUL_F64_e64:
1533 case AMDGPU::V_MUL_F32_e64:
1534 case AMDGPU::V_MUL_F16_t16_e64:
1535 case AMDGPU::V_MUL_F16_fake16_e64:
1536 case AMDGPU::V_MUL_F16_e64: {
1537 // If output denormals are enabled, omod is ignored.
1538 if ((Op == AMDGPU::V_MUL_F32_e64 &&
1539 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1540 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64 ||
1541 Op == AMDGPU::V_MUL_F16_t16_e64 ||
1542 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
1543 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
1544 return std::pair(nullptr, SIOutMods::NONE);
1545
1546 const MachineOperand *RegOp = nullptr;
1547 const MachineOperand *ImmOp = nullptr;
1548 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1549 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1550 if (Src0->isImm()) {
1551 ImmOp = Src0;
1552 RegOp = Src1;
1553 } else if (Src1->isImm()) {
1554 ImmOp = Src1;
1555 RegOp = Src0;
1556 } else
1557 return std::pair(nullptr, SIOutMods::NONE);
1558
1559 int OMod = getOModValue(Op, ImmOp->getImm());
1560 if (OMod == SIOutMods::NONE ||
1561 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
1562 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
1563 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
1564 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
1565 return std::pair(nullptr, SIOutMods::NONE);
1566
1567 return std::pair(RegOp, OMod);
1568 }
1569 case AMDGPU::V_ADD_F64_e64:
1570 case AMDGPU::V_ADD_F32_e64:
1571 case AMDGPU::V_ADD_F16_e64:
1572 case AMDGPU::V_ADD_F16_t16_e64:
1573 case AMDGPU::V_ADD_F16_fake16_e64: {
1574 // If output denormals are enabled, omod is ignored.
1575 if ((Op == AMDGPU::V_ADD_F32_e64 &&
1576 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1577 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64 ||
1578 Op == AMDGPU::V_ADD_F16_t16_e64 ||
1579 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
1580 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
1581 return std::pair(nullptr, SIOutMods::NONE);
1582
1583 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
1584 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1585 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1586
1587 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
1588 Src0->getSubReg() == Src1->getSubReg() &&
1589 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
1590 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
1591 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
1592 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1593 return std::pair(Src0, SIOutMods::MUL2);
1594
1595 return std::pair(nullptr, SIOutMods::NONE);
1596 }
1597 default:
1598 return std::pair(nullptr, SIOutMods::NONE);
1599 }
1600}
1601
1602// FIXME: Does this need to check IEEE bit on function?
1603bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
1604 const MachineOperand *RegOp;
1605 int OMod;
1606 std::tie(RegOp, OMod) = isOMod(MI);
1607 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
1608 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
1609 !MRI->hasOneNonDBGUser(RegOp->getReg()))
1610 return false;
1611
1612 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
1613 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
1614 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
1615 return false;
1616
1617 // Clamp is applied after omod. If the source already has clamp set, don't
1618 // fold it.
1619 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
1620 return false;
1621
1622 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
1623
1624 DefOMod->setImm(OMod);
1625 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1626 MI.eraseFromParent();
1627
1628 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1629 // instruction, so we might as well convert it to the more flexible VOP3-only
1630 // mad/fma form.
1631 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1632 Def->eraseFromParent();
1633
1634 return true;
1635}
1636
1637// Try to fold a reg_sequence with vgpr output and agpr inputs into an
1638// instruction which can take an agpr. So far that means a store.
1639bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
1640 assert(MI.isRegSequence());
1641 auto Reg = MI.getOperand(0).getReg();
1642
1643 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
1644 !MRI->hasOneNonDBGUse(Reg))
1645 return false;
1646
1647 SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
1648 if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
1649 return false;
1650
1651 for (auto &Def : Defs) {
1652 const auto *Op = Def.first;
1653 if (!Op->isReg())
1654 return false;
1655 if (TRI->isAGPR(*MRI, Op->getReg()))
1656 continue;
1657 // Maybe this is a COPY from AREG
1658 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
1659 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
1660 return false;
1661 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
1662 return false;
1663 }
1664
1665 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
1666 MachineInstr *UseMI = Op->getParent();
1667 while (UseMI->isCopy() && !Op->getSubReg()) {
1668 Reg = UseMI->getOperand(0).getReg();
1669 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
1670 return false;
1671 Op = &*MRI->use_nodbg_begin(Reg);
1672 UseMI = Op->getParent();
1673 }
1674
1675 if (Op->getSubReg())
1676 return false;
1677
1678 unsigned OpIdx = Op - &UseMI->getOperand(0);
1679 const MCInstrDesc &InstDesc = UseMI->getDesc();
1680 const TargetRegisterClass *OpRC =
1681 TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
1682 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
1683 return false;
1684
1685 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
1686 auto Dst = MRI->createVirtualRegister(NewDstRC);
1687 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
1688 TII->get(AMDGPU::REG_SEQUENCE), Dst);
1689
1690 for (unsigned I = 0; I < Defs.size(); ++I) {
1691 MachineOperand *Def = Defs[I].first;
1692 Def->setIsKill(false);
1693 if (TRI->isAGPR(*MRI, Def->getReg())) {
1694 RS.add(*Def);
1695 } else { // This is a copy
1696 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
1697 SubDef->getOperand(1).setIsKill(false);
1698 RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
1699 }
1700 RS.addImm(Defs[I].second);
1701 }
1702
1703 Op->setReg(Dst);
1704 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
1705 Op->setReg(Reg);
1706 RS->eraseFromParent();
1707 return false;
1708 }
1709
1710 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
1711
1712 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
1713 // in which case we can erase them all later in runOnMachineFunction.
1714 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
1715 MI.eraseFromParent();
1716 return true;
1717}
1718
1719/// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success and
1720/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg.
1721static bool isAGPRCopy(const SIRegisterInfo &TRI,
1722 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
1723 Register &OutReg, unsigned &OutSubReg) {
1724 assert(Copy.isCopy());
1725
1726 const MachineOperand &CopySrc = Copy.getOperand(1);
1727 Register CopySrcReg = CopySrc.getReg();
1728 if (!CopySrcReg.isVirtual())
1729 return false;
1730
1731 // Common case: copy from AGPR directly, e.g.
1732 // %1:vgpr_32 = COPY %0:agpr_32
1733 if (TRI.isAGPR(MRI, CopySrcReg)) {
1734 OutReg = CopySrcReg;
1735 OutSubReg = CopySrc.getSubReg();
1736 return true;
1737 }
1738
1739 // Sometimes it can also involve two copies, e.g.
1740 // %1:vgpr_256 = COPY %0:agpr_256
1741 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
1742 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
1743 if (!CopySrcDef || !CopySrcDef->isCopy())
1744 return false;
1745
1746 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
1747 Register OtherCopySrcReg = OtherCopySrc.getReg();
1748 if (!OtherCopySrcReg.isVirtual() ||
1749 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
1750 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
1751 !TRI.isAGPR(MRI, OtherCopySrcReg))
1752 return false;
1753
1754 OutReg = OtherCopySrcReg;
1755 OutSubReg = CopySrc.getSubReg();
1756 return true;
1757}
1758
1759// Try to hoist an AGPR to VGPR copy across a PHI.
1760// This should allow folding of an AGPR into a consumer which may support it.
1761//
1762// Example 1: LCSSA PHI
1763// loop:
1764// %1:vreg = COPY %0:areg
1765// exit:
1766// %2:vreg = PHI %1:vreg, %loop
1767// =>
1768// loop:
1769// exit:
1770// %1:areg = PHI %0:areg, %loop
1771// %2:vreg = COPY %1:areg
1772//
1773// Example 2: PHI with multiple incoming values:
1774// entry:
1775// %1:vreg = GLOBAL_LOAD(..)
1776// loop:
1777// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
1778// %3:areg = COPY %2:vreg
1779// %4:areg = (instr using %3:areg)
1780// %5:vreg = COPY %4:areg
1781// =>
1782// entry:
1783// %1:vreg = GLOBAL_LOAD(..)
1784// %2:areg = COPY %1:vreg
1785// loop:
1786 // %3:areg = PHI %2:areg, %entry, %X:areg, %loop
1787// %4:areg = (instr using %3:areg)
1788bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) {
1789 assert(PHI.isPHI());
1790
1791 Register PhiOut = PHI.getOperand(0).getReg();
1792 if (!TRI->isVGPR(*MRI, PhiOut))
1793 return false;
1794
1795 // Iterate once over all incoming values of the PHI to check if this PHI is
1796 // eligible, and determine the exact AGPR RC we'll target.
1797 const TargetRegisterClass *ARC = nullptr;
1798 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1799 MachineOperand &MO = PHI.getOperand(K);
1800 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
1801 if (!Copy || !Copy->isCopy())
1802 continue;
1803
1804 Register AGPRSrc;
1805 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
1806 if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
1807 continue;
1808
1809 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
1810 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
1811 CopyInRC = SubRC;
1812
1813 if (ARC && !ARC->hasSubClassEq(CopyInRC))
1814 return false;
1815 ARC = CopyInRC;
1816 }
1817
1818 if (!ARC)
1819 return false;
1820
1821 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
1822
1823 // Rewrite the PHI's incoming values to ARC.
1824 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
1825 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1826 MachineOperand &MO = PHI.getOperand(K);
1827 Register Reg = MO.getReg();
1828
1829 MachineBasicBlock::iterator InsertPt;
1830 MachineBasicBlock *InsertMBB = nullptr;
1831
1832 // Look at the def of Reg, ignoring all copies.
1833 unsigned CopyOpc = AMDGPU::COPY;
1834 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
1835
1836 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
1837 // the copy was single-use, it will be removed by DCE later.
1838 if (Def->isCopy()) {
1839 Register AGPRSrc;
1840 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
1841 if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
1842 MO.setReg(AGPRSrc);
1843 MO.setSubReg(AGPRSubReg);
1844 continue;
1845 }
1846
1847 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
1848 // GFX908 directly instead of a COPY. Otherwise, SIFoldOperands may try
1849 // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
1850 // is unlikely to be profitable.
1851 //
1852 // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
1853 MachineOperand &CopyIn = Def->getOperand(1);
1854 if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
1855 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
1856 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1857 }
1858
1859 InsertMBB = Def->getParent();
1860 InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
1861 } else {
1862 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
1863 InsertPt = InsertMBB->getFirstTerminator();
1864 }
1865
1866 Register NewReg = MRI->createVirtualRegister(ARC);
1867 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
1868 TII->get(CopyOpc), NewReg)
1869 .addReg(Reg);
1870 MO.setReg(NewReg);
1871
1872 (void)MI;
1873 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
1874 }
1875
1876 // Replace the PHI's result with a new register.
1877 Register NewReg = MRI->createVirtualRegister(ARC);
1878 PHI.getOperand(0).setReg(NewReg);
1879
1880 // COPY that new register back to the original PhiOut register. This COPY will
1881 // usually be folded out later.
1882 MachineBasicBlock *MBB = PHI.getParent();
1883 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
1884 TII->get(AMDGPU::COPY), PhiOut)
1885 .addReg(NewReg);
1886
1887 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
1888 return true;
1889}
1890
1891 // Attempt to convert a VGPR load to an AGPR load.
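// (Editorial sketch, not from the source or tests; schematic MIR.) Only the
// destination register class is changed; the load opcode itself stays the same:
//   %0:vgpr_32 = GLOBAL_LOAD_DWORD %ptr, ...
//   %1:agpr_32 = COPY %0
// =>
//   %0:agpr_32 = GLOBAL_LOAD_DWORD %ptr, ...
//   %1:agpr_32 = COPY %0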
1892bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
1893 assert(MI.mayLoad());
1894 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
1895 return false;
1896
1897 MachineOperand &Def = MI.getOperand(0);
1898 if (!Def.isDef())
1899 return false;
1900
1901 Register DefReg = Def.getReg();
1902
1903 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
1904 return false;
1905
1906 SmallVector<const MachineInstr *, 8> Users;
1907 SmallVector<Register, 8> MoveRegs;
1908 for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg))
1909 Users.push_back(&I);
1910
1911 if (Users.empty())
1912 return false;
1913
1914 // Check that all uses are copies to an agpr or reg_sequences producing an agpr.
1915 while (!Users.empty()) {
1916 const MachineInstr *I = Users.pop_back_val();
1917 if (!I->isCopy() && !I->isRegSequence())
1918 return false;
1919 Register DstReg = I->getOperand(0).getReg();
1920 // Physical registers may have more than one defining instruction.
1921 if (DstReg.isPhysical())
1922 return false;
1923 if (TRI->isAGPR(*MRI, DstReg))
1924 continue;
1925 MoveRegs.push_back(DstReg);
1926 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
1927 Users.push_back(&U);
1928 }
1929
1930 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
1931 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
1932 if (!TII->isOperandLegal(MI, 0, &Def)) {
1933 MRI->setRegClass(DefReg, RC);
1934 return false;
1935 }
1936
1937 while (!MoveRegs.empty()) {
1938 Register Reg = MoveRegs.pop_back_val();
1939 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
1940 }
1941
1942 LLVM_DEBUG(dbgs() << "Folded " << MI);
1943
1944 return true;
1945}
1946
1947// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
1948// For GFX90A and later, this is pretty much always a good thing, but for GFX908
1949 // there are cases where it can create a lot more AGPR-AGPR copies, which are
1950// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
1951//
1952// This function looks at all AGPR PHIs in a basic block and collects their
1953 // operands. Then, it checks for registers that are used more than once across
1954// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
1955// having to create one VGPR temporary per use, which can get very messy if
1956// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
1957// element).
1958//
1959// Example
1960// a:
1961// %in:agpr_256 = COPY %foo:vgpr_256
1962// c:
1963// %x:agpr_32 = ..
1964// b:
1965// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
1966// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
1967// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
1968// =>
1969// a:
1970// %in:agpr_256 = COPY %foo:vgpr_256
1971// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
1972// %tmp_agpr:agpr_32 = COPY %tmp
1973// c:
1974// %x:agpr_32 = ..
1975// b:
1976// %0:areg = PHI %tmp_agpr, %a, %x, %c
1977// %1:areg = PHI %tmp_agpr, %a, %y, %c
1978// %2:areg = PHI %tmp_agpr, %a, %z, %c
1979bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
1980 // This is only really needed on GFX908 where AGPR-AGPR copies are
1981 // unreasonably difficult.
1982 if (ST->hasGFX90AInsts())
1983 return false;
1984
1985 // Look at all AGPR Phis and collect the register + subregister used.
1986 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
1987 RegToMO;
1988
1989 for (auto &MI : MBB) {
1990 if (!MI.isPHI())
1991 break;
1992
1993 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
1994 continue;
1995
1996 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
1997 MachineOperand &PhiMO = MI.getOperand(K);
1998 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
1999 }
2000 }
2001
2002 // For each (Reg, SubReg) pair that is used more than once, cache the value in
2003 // a VGPR.
2004 bool Changed = false;
2005 for (const auto &[Entry, MOs] : RegToMO) {
2006 if (MOs.size() == 1)
2007 continue;
2008
2009 const auto [Reg, SubReg] = Entry;
2010 MachineInstr *Def = MRI->getVRegDef(Reg);
2011 MachineBasicBlock *DefMBB = Def->getParent();
2012
2013 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2014 // out.
2015 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2016 Register TempVGPR =
2017 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2018 MachineInstr *VGPRCopy =
2019 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2020 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2021 .addReg(Reg, /* flags */ 0, SubReg);
2022
2023 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2024 Register TempAGPR = MRI->createVirtualRegister(ARC);
2025 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2026 TII->get(AMDGPU::COPY), TempAGPR)
2027 .addReg(TempVGPR);
2028
2029 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2030 for (MachineOperand *MO : MOs) {
2031 MO->setReg(TempAGPR);
2032 MO->setSubReg(AMDGPU::NoSubRegister);
2033 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2034 }
2035
2036 Changed = true;
2037 }
2038
2039 return Changed;
2040}
2041
2042bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
2043 if (skipFunction(MF.getFunction()))
2044 return false;
2045
2046 MRI = &MF.getRegInfo();
2047 ST = &MF.getSubtarget<GCNSubtarget>();
2048 TII = ST->getInstrInfo();
2049 TRI = &TII->getRegisterInfo();
2050 MFI = MF.getInfo<SIMachineFunctionInfo>();
2051
2052 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2053 // correctly handle signed zeros.
2054 //
2055 // FIXME: Also need to check strictfp
2056 bool IsIEEEMode = MFI->getMode().IEEE;
2057 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2058
2059 bool Changed = false;
2060 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2061 MachineOperand *CurrentKnownM0Val = nullptr;
2062 for (auto &MI : make_early_inc_range(*MBB)) {
2063 Changed |= tryFoldCndMask(MI);
2064
2065 if (tryFoldZeroHighBits(MI)) {
2066 Changed = true;
2067 continue;
2068 }
2069
2070 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2071 Changed = true;
2072 continue;
2073 }
2074
2075 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2076 Changed = true;
2077 continue;
2078 }
2079
2080 if (MI.mayLoad() && tryFoldLoad(MI)) {
2081 Changed = true;
2082 continue;
2083 }
2084
2085 if (TII->isFoldableCopy(MI)) {
2086 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2087 continue;
2088 }
2089
2090 // Saw an unknown clobber of m0, so we no longer know what it is.
2091 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2092 CurrentKnownM0Val = nullptr;
2093
2094 // TODO: Omod might be OK if there is NSZ only on the source
2095 // instruction, and not the omod multiply.
2096 if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
2097 !tryFoldOMod(MI))
2098 Changed |= tryFoldClamp(MI);
2099 }
2100
2101 Changed |= tryOptimizeAGPRPhis(*MBB);
2102 }
2103
2104 return Changed;
2105}