1//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8//===----------------------------------------------------------------------===//
9//
10
11#include "SIFoldOperands.h"
12#include "AMDGPU.h"
13#include "GCNSubtarget.h"
14#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
15#include "SIMachineFunctionInfo.h"
16#include "llvm/ADT/DepthFirstIterator.h"
17#include "llvm/CodeGen/MachineFunctionPass.h"
18
19
20#define DEBUG_TYPE "si-fold-operands"
21using namespace llvm;
22
23namespace {
24
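// A FoldCandidate records one pending fold: the use instruction and operand
// index to rewrite, the value to substitute (immediate, frame index, global
// address, or register operand), whether the use had to be commuted to make
// the fold legal, and an optional 32-bit opcode to shrink to when applying it.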
25struct FoldCandidate {
26 MachineInstr *UseMI;
27 union {
28 MachineOperand *OpToFold;
29 uint64_t ImmToFold;
30 int FrameIndexToFold;
31 };
32 int ShrinkOpcode;
33 unsigned UseOpNo;
34 MachineOperand::MachineOperandType Kind;
35 bool Commuted;
36
37 FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
38 bool Commuted_ = false,
39 int ShrinkOp = -1) :
40 UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
41 Kind(FoldOp->getType()),
42 Commuted(Commuted_) {
43 if (FoldOp->isImm()) {
44 ImmToFold = FoldOp->getImm();
45 } else if (FoldOp->isFI()) {
46 FrameIndexToFold = FoldOp->getIndex();
47 } else {
48 assert(FoldOp->isReg() || FoldOp->isGlobal());
49 OpToFold = FoldOp;
50 }
51 }
52
53 bool isFI() const {
54 return Kind == MachineOperand::MO_FrameIndex;
55 }
56
57 bool isImm() const {
58 return Kind == MachineOperand::MO_Immediate;
59 }
60
61 bool isReg() const {
62 return Kind == MachineOperand::MO_Register;
63 }
64
65 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
66
67 bool needsShrink() const { return ShrinkOpcode != -1; }
68};
69
70class SIFoldOperandsImpl {
71public:
72 MachineRegisterInfo *MRI;
73 const SIInstrInfo *TII;
74 const SIRegisterInfo *TRI;
75 const GCNSubtarget *ST;
76 const SIMachineFunctionInfo *MFI;
77
78 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
79 const MachineOperand &OpToFold) const;
80
81 // TODO: Just use TII::getVALUOp
82 unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
83 switch (Opc) {
84 case AMDGPU::S_ADD_I32: {
85 if (ST->hasAddNoCarry())
86 return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
87 return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
88 }
89 case AMDGPU::S_OR_B32:
90 return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
91 case AMDGPU::S_AND_B32:
92 return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
93 case AMDGPU::S_MUL_I32:
94 return AMDGPU::V_MUL_LO_U32_e64;
95 default:
96 return AMDGPU::INSTRUCTION_LIST_END;
97 }
98 }
99
100 bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
101 MachineInstr &MI) const;
102
103 bool updateOperand(FoldCandidate &Fold) const;
104
105 bool canUseImmWithOpSel(FoldCandidate &Fold) const;
106
107 bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;
108
109 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
110 MachineInstr *MI, unsigned OpNo,
111 MachineOperand *OpToFold) const;
112 bool isUseSafeToFold(const MachineInstr &MI,
113 const MachineOperand &UseMO) const;
114 bool
115 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
116 Register UseReg, uint8_t OpTy) const;
117 bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI,
118 unsigned UseOpIdx,
119 SmallVectorImpl<FoldCandidate> &FoldList) const;
120 void foldOperand(MachineOperand &OpToFold,
121 MachineInstr *UseMI,
122 int UseOpIdx,
123 SmallVectorImpl<FoldCandidate> &FoldList,
124 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
125
126 MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const;
127 bool tryConstantFoldOp(MachineInstr *MI) const;
128 bool tryFoldCndMask(MachineInstr &MI) const;
129 bool tryFoldZeroHighBits(MachineInstr &MI) const;
130 bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
131 bool tryFoldFoldableCopy(MachineInstr &MI,
132 MachineOperand *&CurrentKnownM0Val) const;
133
134 const MachineOperand *isClamp(const MachineInstr &MI) const;
135 bool tryFoldClamp(MachineInstr &MI);
136
137 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
138 bool tryFoldOMod(MachineInstr &MI);
139 bool tryFoldRegSequence(MachineInstr &MI);
140 bool tryFoldPhiAGPR(MachineInstr &MI);
141 bool tryFoldLoad(MachineInstr &MI);
142
143 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
144
145public:
146 SIFoldOperandsImpl() = default;
147
148 bool run(MachineFunction &MF);
149};
150
151class SIFoldOperandsLegacy : public MachineFunctionPass {
152public:
153 static char ID;
154
155 SIFoldOperandsLegacy() : MachineFunctionPass(ID) {}
156
157 bool runOnMachineFunction(MachineFunction &MF) override {
158 if (skipFunction(MF.getFunction()))
159 return false;
160 return SIFoldOperandsImpl().run(MF);
161 }
162
163 StringRef getPassName() const override { return "SI Fold Operands"; }
164
165 void getAnalysisUsage(AnalysisUsage &AU) const override {
166 AU.setPreservesCFG();
167 MachineFunctionPass::getAnalysisUsage(AU);
168 }
169};
170
171} // End anonymous namespace.
172
173INITIALIZE_PASS(SIFoldOperandsLegacy, DEBUG_TYPE, "SI Fold Operands", false,
174 false)
175
176char SIFoldOperandsLegacy::ID = 0;
177
178char &llvm::SIFoldOperandsLegacyID = SIFoldOperandsLegacy::ID;
179
180static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
181 const TargetRegisterInfo &TRI,
182 const MachineOperand &MO) {
183 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
184 if (const TargetRegisterClass *SubRC =
185 TRI.getSubRegisterClass(RC, MO.getSubReg()))
186 RC = SubRC;
187 return RC;
188}
189
190// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
191static unsigned macToMad(unsigned Opc) {
192 switch (Opc) {
193 case AMDGPU::V_MAC_F32_e64:
194 return AMDGPU::V_MAD_F32_e64;
195 case AMDGPU::V_MAC_F16_e64:
196 return AMDGPU::V_MAD_F16_e64;
197 case AMDGPU::V_FMAC_F32_e64:
198 return AMDGPU::V_FMA_F32_e64;
199 case AMDGPU::V_FMAC_F16_e64:
200 return AMDGPU::V_FMA_F16_gfx9_e64;
201 case AMDGPU::V_FMAC_F16_fake16_e64:
202 return AMDGPU::V_FMA_F16_gfx9_e64;
203 case AMDGPU::V_FMAC_LEGACY_F32_e64:
204 return AMDGPU::V_FMA_LEGACY_F32_e64;
205 case AMDGPU::V_FMAC_F64_e64:
206 return AMDGPU::V_FMA_F64_e64;
207 }
208 return AMDGPU::INSTRUCTION_LIST_END;
209}
210
211// TODO: Add heuristic that the frame index might not fit in the addressing mode
212// immediate offset to avoid materializing in loops.
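// Returns true if OpToFold is a frame index that can fold into operand OpNo
// of UseMI: a scalar or vector add/or/and whose other source is already an
// immediate and whose frame-index operand has a single non-debug use, a MUBUF
// vaddr, or a scratch FLAT saddr (or vaddr when no saddr is present).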
213bool SIFoldOperandsImpl::frameIndexMayFold(
214 const MachineInstr &UseMI, int OpNo, const MachineOperand &OpToFold) const {
215 if (!OpToFold.isFI())
216 return false;
217
218 const unsigned Opc = UseMI.getOpcode();
219 switch (Opc) {
220 case AMDGPU::S_ADD_I32:
221 case AMDGPU::S_OR_B32:
222 case AMDGPU::S_AND_B32:
223 case AMDGPU::V_ADD_U32_e32:
224 case AMDGPU::V_ADD_CO_U32_e32:
225 // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
226 // to insert the wave size shift at every point we use the index.
227 // TODO: Fix depending on visit order to fold immediates into the operand
228 return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
229 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
230 case AMDGPU::V_ADD_U32_e64:
231 case AMDGPU::V_ADD_CO_U32_e64:
232 return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
233 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
234 default:
235 break;
236 }
237
238 if (TII->isMUBUF(UseMI))
239 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
240 if (!TII->isFLATScratch(UseMI))
241 return false;
242
243 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
244 if (OpNo == SIdx)
245 return true;
246
247 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
248 return OpNo == VIdx && SIdx == -1;
249}
250
251/// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
252///
253/// => %vgpr = V_ADD_U32 x, frameindex
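///
/// Requires the scalar add's SCC def to be dead. The carry-less or VOP3
/// forms are built directly (adding a dead carry-out register when the new
/// opcode defines one); V_ADD_CO_U32_e32 clobbers VCC and is only used when
/// VCC is known to be dead at the insertion point.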
254bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
255 Register DstReg, Register SrcReg, MachineInstr &MI) const {
256 if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
257 MRI->hasOneNonDBGUse(SrcReg)) {
258 MachineInstr *Def = MRI->getVRegDef(SrcReg);
259 if (!Def || Def->getNumOperands() != 4)
260 return false;
261
262 MachineOperand *Src0 = &Def->getOperand(1);
263 MachineOperand *Src1 = &Def->getOperand(2);
264
265 // TODO: This is profitable with more operand types, and for more
266 // opcodes. But ultimately this is working around poor / nonexistent
267 // regbankselect.
268 if (!Src0->isFI() && !Src1->isFI())
269 return false;
270
271 if (Src0->isFI())
272 std::swap(Src0, Src1);
273
274 const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
275 unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
276 if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
277 !Def->getOperand(3).isDead()) // Check if scc is dead
278 return false;
279
280 MachineBasicBlock *MBB = Def->getParent();
281 const DebugLoc &DL = Def->getDebugLoc();
282 if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
283 MachineInstrBuilder Add =
284 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg);
285
286 if (Add->getDesc().getNumDefs() == 2) {
287 Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
288 Add.addDef(CarryOutReg, RegState::Dead);
289 MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
290 }
291
292 Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
293 if (AMDGPU::hasNamedOperand(NewOp, AMDGPU::OpName::clamp))
294 Add.addImm(0);
295
296 Def->eraseFromParent();
297 MI.eraseFromParent();
298 return true;
299 }
300
301 assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);
302
303 MachineBasicBlock::LivenessQueryResult Liveness =
304 MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
305 if (Liveness == MachineBasicBlock::LQR_Dead) {
306 // TODO: If src1 satisfies operand constraints, use vop3 version.
307 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg)
308 .add(*Src0)
309 .add(*Src1)
310 .setOperandDead(3) // implicit-def $vcc
311 .setMIFlags(Def->getFlags());
312 Def->eraseFromParent();
313 MI.eraseFromParent();
314 return true;
315 }
316 }
317
318 return false;
319}
320
321FunctionPass *llvm::createSIFoldOperandsLegacyPass() {
322 return new SIFoldOperandsLegacy();
323}
324
325bool SIFoldOperandsImpl::canUseImmWithOpSel(FoldCandidate &Fold) const {
326 MachineInstr *MI = Fold.UseMI;
327 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
328 const uint64_t TSFlags = MI->getDesc().TSFlags;
329
330 assert(Old.isReg() && Fold.isImm());
331
332 if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
333 (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
334 (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
335 return false;
336
337 unsigned Opcode = MI->getOpcode();
338 int OpNo = MI->getOperandNo(&Old);
339 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
340 switch (OpType) {
341 default:
342 return false;
343 case AMDGPU::OPERAND_REG_IMM_V2FP16:
344 case AMDGPU::OPERAND_REG_IMM_V2BF16:
345 case AMDGPU::OPERAND_REG_IMM_V2INT16:
346 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
347 case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
348 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
349 break;
350 }
351
352 return true;
353}
354
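// Try to fold an immediate into a packed (V2INT16/V2FP16/V2BF16) operand by
// adjusting op_sel/op_sel_hi so the value, one of its 16-bit halves, or the
// halves swapped becomes an inline constant; for V_PK_ADD_U16/V_PK_SUB_U16
// without clamp the immediate may also be negated and the opcode flipped.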
355bool SIFoldOperandsImpl::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
356 MachineInstr *MI = Fold.UseMI;
357 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
358 unsigned Opcode = MI->getOpcode();
359 int OpNo = MI->getOperandNo(&Old);
360 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
361
362 // If the literal can be inlined as-is, apply it and short-circuit the
363 // tests below. The main motivation for this is to avoid unintuitive
364 // uses of opsel.
365 if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) {
366 Old.ChangeToImmediate(Fold.ImmToFold);
367 return true;
368 }
369
370 // Refer to op_sel/op_sel_hi and check if we can change the immediate and
371 // op_sel in a way that allows an inline constant.
372 int ModIdx = -1;
373 unsigned SrcIdx = ~0;
374 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
375 ModIdx = AMDGPU::OpName::src0_modifiers;
376 SrcIdx = 0;
377 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
378 ModIdx = AMDGPU::OpName::src1_modifiers;
379 SrcIdx = 1;
380 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
381 ModIdx = AMDGPU::OpName::src2_modifiers;
382 SrcIdx = 2;
383 }
384 assert(ModIdx != -1);
385 ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
386 MachineOperand &Mod = MI->getOperand(ModIdx);
387 unsigned ModVal = Mod.getImm();
388
389 uint16_t ImmLo = static_cast<uint16_t>(
390 Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
391 uint16_t ImmHi = static_cast<uint16_t>(
392 Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
393 uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
394 unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
395
396 // Helper function that attempts to inline the given value with a newly
397 // chosen opsel pattern.
398 auto tryFoldToInline = [&](uint32_t Imm) -> bool {
399 if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
400 Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
401 Old.ChangeToImmediate(Imm);
402 return true;
403 }
404
405 // Try to shuffle the halves around and leverage opsel to get an inline
406 // constant.
407 uint16_t Lo = static_cast<uint16_t>(Imm);
408 uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
409 if (Lo == Hi) {
410 if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
411 Mod.setImm(NewModVal);
412 Old.ChangeToImmediate(Lo);
413 return true;
414 }
415
416 if (static_cast<int16_t>(Lo) < 0) {
417 int32_t SExt = static_cast<int16_t>(Lo);
418 if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
419 Mod.setImm(NewModVal);
420 Old.ChangeToImmediate(SExt);
421 return true;
422 }
423 }
424
425 // This check is only useful for integer instructions
426 if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
427 OpType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16) {
428 if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
429 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
430 Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
431 return true;
432 }
433 }
434 } else {
435 uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
436 if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
437 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
438 Old.ChangeToImmediate(Swapped);
439 return true;
440 }
441 }
442
443 return false;
444 };
445
446 if (tryFoldToInline(Imm))
447 return true;
448
449 // Replace integer addition by subtraction and vice versa if it allows
450 // folding the immediate to an inline constant.
451 //
452 // We should only ever get here for SrcIdx == 1 due to canonicalization
453 // earlier in the pipeline, but we double-check here to be safe / fully
454 // general.
455 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
456 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
457 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
458 unsigned ClampIdx =
459 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
460 bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
461
462 if (!Clamp) {
463 uint16_t NegLo = -static_cast<uint16_t>(Imm);
464 uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
465 uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
466
467 if (tryFoldToInline(NegImm)) {
468 unsigned NegOpcode =
469 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
470 MI->setDesc(TII->get(NegOpcode));
471 return true;
472 }
473 }
474 }
475
476 return false;
477}
478
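// Apply a fold that has already been accepted: immediates go through the
// op_sel path when possible, shrink candidates are rewritten to their 32-bit
// encoding (only when VCC is dead), tied MFMA uses switch to the
// early-clobber variant, and register/frame-index/global folds rewrite the
// operand in place.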
479bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
480 MachineInstr *MI = Fold.UseMI;
481 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
482 assert(Old.isReg());
483
484 if (Fold.isImm() && canUseImmWithOpSel(Fold)) {
485 if (tryFoldImmWithOpSel(Fold))
486 return true;
487
488 // We can't represent the candidate as an inline constant. Try as a literal
489 // with the original opsel, checking constant bus limitations.
490 MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold);
491 int OpNo = MI->getOperandNo(&Old);
492 if (!TII->isOperandLegal(*MI, OpNo, &New))
493 return false;
494 Old.ChangeToImmediate(Fold.ImmToFold);
495 return true;
496 }
497
498 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
499 MachineBasicBlock *MBB = MI->getParent();
500 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
501 if (Liveness != MachineBasicBlock::LQR_Dead) {
502 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
503 return false;
504 }
505
506 int Op32 = Fold.ShrinkOpcode;
507 MachineOperand &Dst0 = MI->getOperand(0);
508 MachineOperand &Dst1 = MI->getOperand(1);
509 assert(Dst0.isDef() && Dst1.isDef());
510
511 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
512
513 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
514 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
515
516 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
517
518 if (HaveNonDbgCarryUse) {
519 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
520 Dst1.getReg())
521 .addReg(AMDGPU::VCC, RegState::Kill);
522 }
523
524 // Keep the old instruction around to avoid breaking iterators, but
525 // replace it with a dummy instruction to remove uses.
526 //
527 // FIXME: We should not invert how this pass looks at operands to avoid
528 // this. Should track set of foldable movs instead of looking for uses
529 // when looking at a use.
530 Dst0.setReg(NewReg0);
531 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
532 MI->removeOperand(I);
533 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
534
535 if (Fold.Commuted)
536 TII->commuteInstruction(*Inst32, false);
537 return true;
538 }
539
540 assert(!Fold.needsShrink() && "not handled");
541
542 if (Fold.isImm()) {
543 if (Old.isTied()) {
544 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
545 if (NewMFMAOpc == -1)
546 return false;
547 MI->setDesc(TII->get(NewMFMAOpc));
548 MI->untieRegOperand(0);
549 }
550 Old.ChangeToImmediate(Fold.ImmToFold);
551 return true;
552 }
553
554 if (Fold.isGlobal()) {
555 Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
556 Fold.OpToFold->getTargetFlags());
557 return true;
558 }
559
560 if (Fold.isFI()) {
561 Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
562 return true;
563 }
564
565 MachineOperand *New = Fold.OpToFold;
566 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
567 Old.setIsUndef(New->isUndef());
568 return true;
569}
570
571static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
572 const MachineInstr *MI) {
573 return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });
574}
575
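// Queue a fold candidate unless one is already recorded for the same operand
// of the same instruction.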
576static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
577 MachineInstr *MI, unsigned OpNo,
578 MachineOperand *FoldOp, bool Commuted = false,
579 int ShrinkOp = -1) {
580 // Skip additional folding on the same operand.
581 for (FoldCandidate &Fold : FoldList)
582 if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
583 return;
584 LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
585 << " operand " << OpNo << "\n " << *MI);
586 FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
587}
588
589bool SIFoldOperandsImpl::tryAddToFoldList(
590 SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
591 MachineOperand *OpToFold) const {
592 const unsigned Opc = MI->getOpcode();
593
594 auto tryToFoldAsFMAAKorMK = [&]() {
595 if (!OpToFold->isImm())
596 return false;
597
598 const bool TryAK = OpNo == 3;
599 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
600 MI->setDesc(TII->get(NewOpc));
601
602 // We have to fold into operand which would be Imm not into OpNo.
603 bool FoldAsFMAAKorMK =
604 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
605 if (FoldAsFMAAKorMK) {
606 // Untie Src2 of fmac.
607 MI->untieRegOperand(3);
608 // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
609 if (OpNo == 1) {
610 MachineOperand &Op1 = MI->getOperand(1);
611 MachineOperand &Op2 = MI->getOperand(2);
612 Register OldReg = Op1.getReg();
613 // Operand 2 might be an inlinable constant
614 if (Op2.isImm()) {
615 Op1.ChangeToImmediate(Op2.getImm());
616 Op2.ChangeToRegister(OldReg, false);
617 } else {
618 Op1.setReg(Op2.getReg());
619 Op2.setReg(OldReg);
620 }
621 }
622 return true;
623 }
624 MI->setDesc(TII->get(Opc));
625 return false;
626 };
627
628 bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
629 if (!IsLegal && OpToFold->isImm()) {
630 FoldCandidate Fold(MI, OpNo, OpToFold);
631 IsLegal = canUseImmWithOpSel(Fold);
632 }
633
634 if (!IsLegal) {
635 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
636 unsigned NewOpc = macToMad(Opc);
637 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
638 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
639 // to fold the operand.
640 MI->setDesc(TII->get(NewOpc));
641 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
642 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
643 if (AddOpSel)
644 MI->addOperand(MachineOperand::CreateImm(0));
645 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
646 if (FoldAsMAD) {
647 MI->untieRegOperand(OpNo);
648 return true;
649 }
650 if (AddOpSel)
651 MI->removeOperand(MI->getNumExplicitOperands() - 1);
652 MI->setDesc(TII->get(Opc));
653 }
654
655 // Special case for s_fmac_f32 if we are trying to fold into Src2.
656 // By transforming into fmaak we can untie Src2 and make folding legal.
657 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
658 if (tryToFoldAsFMAAKorMK())
659 return true;
660 }
661
662 // Special case for s_setreg_b32
663 if (OpToFold->isImm()) {
664 unsigned ImmOpc = 0;
665 if (Opc == AMDGPU::S_SETREG_B32)
666 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
667 else if (Opc == AMDGPU::S_SETREG_B32_mode)
668 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
669 if (ImmOpc) {
670 MI->setDesc(TII->get(ImmOpc));
671 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
672 return true;
673 }
674 }
675
676 // If we are already folding into another operand of MI, then
677 // we can't commute the instruction, otherwise we risk making the
678 // other fold illegal.
679 if (isUseMIInFoldList(FoldList, MI))
680 return false;
681
682 // Operand is not legal, so try to commute the instruction to
683 // see if this makes it possible to fold.
684 unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
685 bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
686 if (!CanCommute)
687 return false;
688
689 // One of operands might be an Imm operand, and OpNo may refer to it after
690 // the call of commuteInstruction() below. Such situations are avoided
691 // here explicitly as OpNo must be a register operand to be a candidate
692 // for memory folding.
693 if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg())
694 return false;
695
696 if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
697 return false;
698
699 int Op32 = -1;
700 if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
701 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
702 Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
703 (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) {
704 TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
705 return false;
706 }
707
708 // Verify the other operand is a VGPR, otherwise we would violate the
709 // constant bus restriction.
710 MachineOperand &OtherOp = MI->getOperand(OpNo);
711 if (!OtherOp.isReg() ||
712 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
713 return false;
714
715 assert(MI->getOperand(1).isDef());
716
717 // Make sure to get the 32-bit version of the commuted opcode.
718 unsigned MaybeCommutedOpc = MI->getOpcode();
719 Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
720 }
721
722 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
723 return true;
724 }
725
726 // Inlineable constant might have been folded into Imm operand of fmaak or
727 // fmamk and we are trying to fold a non-inlinable constant.
728 if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
729 !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
730 unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
731 MachineOperand &OpImm = MI->getOperand(ImmIdx);
732 if (!OpImm.isReg() &&
733 TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
734 return tryToFoldAsFMAAKorMK();
735 }
736
737 // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
738 // By changing into fmamk we can untie Src2.
739 // If folding for Src0 happens first and it is identical operand to Src1 we
740 // should avoid transforming into fmamk which requires commuting as it would
741 // cause folding into Src1 to fail later on due to wrong OpNo used.
742 if (Opc == AMDGPU::S_FMAC_F32 &&
743 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
744 if (tryToFoldAsFMAAKorMK())
745 return true;
746 }
747
748 // Check the case where we might introduce a second constant operand to a
749 // scalar instruction
750 if (TII->isSALU(MI->getOpcode())) {
751 const MCInstrDesc &InstDesc = MI->getDesc();
752 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
753
754 // Fine if the operand can be encoded as an inline constant
755 if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) {
756 // Otherwise check for another constant
757 for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
758 auto &Op = MI->getOperand(i);
759 if (OpNo != i && !Op.isReg() &&
760 !TII->isInlineConstant(Op, InstDesc.operands()[i]))
761 return false;
762 }
763 }
764 }
765
766 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
767 return true;
768}
769
770bool SIFoldOperandsImpl::isUseSafeToFold(const MachineInstr &MI,
771 const MachineOperand &UseMO) const {
772 // Operands of SDWA instructions must be registers.
773 return !TII->isSDWA(MI);
774}
775
776// Find a def of the UseReg, check if it is a reg_sequence and find initializers
777// for each subreg, tracking it to foldable inline immediate if possible.
778// Returns true on success.
779bool SIFoldOperandsImpl::getRegSeqInit(
780 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
781 Register UseReg, uint8_t OpTy) const {
782 MachineInstr *Def = MRI->getVRegDef(UseReg);
783 if (!Def || !Def->isRegSequence())
784 return false;
785
786 for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
787 MachineOperand *Sub = &Def->getOperand(I);
788 assert(Sub->isReg());
789
790 for (MachineInstr *SubDef = MRI->getVRegDef(Sub->getReg());
791 SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
792 !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
793 SubDef = MRI->getVRegDef(Sub->getReg())) {
794 MachineOperand *Op = &SubDef->getOperand(1);
795 if (Op->isImm()) {
796 if (TII->isInlineConstant(*Op, OpTy))
797 Sub = Op;
798 break;
799 }
800 if (!Op->isReg() || Op->getReg().isPhysical())
801 break;
802 Sub = Op;
803 }
804
805 Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
806 }
807
808 return true;
809}
810
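// Try to fold OpToFold into operand UseOpIdx of UseMI when that operand can
// take an inline constant: directly, through a foldable copy of an inline
// immediate, or through a REG_SEQUENCE whose parts are all the same inline
// constant (a splat).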
811bool SIFoldOperandsImpl::tryToFoldACImm(
812 const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
813 SmallVectorImpl<FoldCandidate> &FoldList) const {
814 const MCInstrDesc &Desc = UseMI->getDesc();
815 if (UseOpIdx >= Desc.getNumOperands())
816 return false;
817
819 return false;
820
821 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
822 if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
823 TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
824 UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
825 return true;
826 }
827
828 if (!OpToFold.isReg())
829 return false;
830
831 Register UseReg = OpToFold.getReg();
832 if (!UseReg.isVirtual())
833 return false;
834
835 if (isUseMIInFoldList(FoldList, UseMI))
836 return false;
837
838 // Maybe it is just a COPY of an immediate itself.
839 MachineInstr *Def = MRI->getVRegDef(UseReg);
840 MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
841 if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
842 MachineOperand &DefOp = Def->getOperand(1);
843 if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
844 TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
845 UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
846 return true;
847 }
848 }
849
850 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
851 if (!getRegSeqInit(Defs, UseReg, OpTy))
852 return false;
853
854 int32_t Imm;
855 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
856 const MachineOperand *Op = Defs[I].first;
857 if (!Op->isImm())
858 return false;
859
860 auto SubImm = Op->getImm();
861 if (!I) {
862 Imm = SubImm;
863 if (!TII->isInlineConstant(*Op, OpTy) ||
864 !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
865 return false;
866
867 continue;
868 }
869 if (Imm != SubImm)
870 return false; // Can only fold splat constants
871 }
872
873 appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
874 return true;
875}
876
877void SIFoldOperandsImpl::foldOperand(
878 MachineOperand &OpToFold, MachineInstr *UseMI, int UseOpIdx,
879 SmallVectorImpl<FoldCandidate> &FoldList,
880 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
881 const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
882
883 if (!isUseSafeToFold(*UseMI, *UseOp))
884 return;
885
886 // FIXME: Fold operands with subregs.
887 if (UseOp->isReg() && OpToFold.isReg() &&
888 (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
889 return;
890
891 // Special case for REG_SEQUENCE: We can't fold literals into
892 // REG_SEQUENCE instructions, so we have to fold them into the
893 // uses of REG_SEQUENCE.
894 if (UseMI->isRegSequence()) {
895 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
896 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
897
898 // Grab the use operands first
899 SmallVector<MachineOperand *, 4> UsesToProcess;
900 for (auto &Use : MRI->use_nodbg_operands(RegSeqDstReg))
901 UsesToProcess.push_back(&Use);
902 for (auto *RSUse : UsesToProcess) {
903 MachineInstr *RSUseMI = RSUse->getParent();
904
905 if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
906 RSUseMI->getOperandNo(RSUse), FoldList))
907 continue;
908
909 if (RSUse->getSubReg() != RegSeqDstSubReg)
910 continue;
911
912 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
913 CopiesToReplace);
914 }
915 return;
916 }
917
918 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
919 return;
920
921 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
922 // Verify that this is a stack access.
923 // FIXME: Should probably use stack pseudos before frame lowering.
924
925 if (TII->isMUBUF(*UseMI)) {
926 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
927 MFI->getScratchRSrcReg())
928 return;
929
930 // Ensure this is either relative to the current frame or the current
931 // wave.
932 MachineOperand &SOff =
933 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
934 if (!SOff.isImm() || SOff.getImm() != 0)
935 return;
936 }
937
938 // A frame index will resolve to a positive constant, so it should always be
939 // safe to fold the addressing mode, even pre-GFX9.
940 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
941
942 const unsigned Opc = UseMI->getOpcode();
943 if (TII->isFLATScratch(*UseMI) &&
944 AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
945 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
946 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
947 UseMI->setDesc(TII->get(NewOpc));
948 }
949
950 return;
951 }
952
953 bool FoldingImmLike =
954 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
955
956 if (FoldingImmLike && UseMI->isCopy()) {
957 Register DestReg = UseMI->getOperand(0).getReg();
958 Register SrcReg = UseMI->getOperand(1).getReg();
959 assert(SrcReg.isVirtual());
960
961 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
962
963 // Don't fold into a copy to a physical register with the same class. Doing
964 // so would interfere with the register coalescer's logic which would avoid
965 // redundant initializations.
966 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
967 return;
968
969 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
970 if (!DestReg.isPhysical()) {
971 if (DestRC == &AMDGPU::AGPR_32RegClass &&
972 TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
973 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
974 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
975 CopiesToReplace.push_back(UseMI);
976 return;
977 }
978 }
979
980 // In order to fold immediates into copies, we need to change the
981 // copy to a MOV.
982
983 unsigned MovOp = TII->getMovOpcode(DestRC);
984 if (MovOp == AMDGPU::COPY)
985 return;
986
987 MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
988 MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
989 while (ImpOpI != ImpOpE) {
990 MachineInstr::mop_iterator Tmp = ImpOpI;
991 ImpOpI++;
992 UseMI->removeOperand(UseMI->getOperandNo(Tmp));
993 }
994 UseMI->setDesc(TII->get(MovOp));
995
996 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
997 const auto &SrcOp = UseMI->getOperand(UseOpIdx);
998 MachineOperand NewSrcOp(SrcOp);
999 MachineFunction *MF = UseMI->getParent()->getParent();
1000 UseMI->removeOperand(1);
1001 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
1002 UseMI->addOperand(NewSrcOp); // src0
1003 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
1004 UseOpIdx = 2;
1005 UseOp = &UseMI->getOperand(UseOpIdx);
1006 }
1007 CopiesToReplace.push_back(UseMI);
1008 } else {
1009 if (UseMI->isCopy() && OpToFold.isReg() &&
1010 UseMI->getOperand(0).getReg().isVirtual() &&
1011 !UseMI->getOperand(1).getSubReg()) {
1012 LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
1013 unsigned Size = TII->getOpSize(*UseMI, 1);
1014 Register UseReg = OpToFold.getReg();
1015 UseMI->getOperand(1).setReg(UseReg);
1016 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1017 UseMI->getOperand(1).setIsKill(false);
1018 CopiesToReplace.push_back(UseMI);
1019 OpToFold.setIsKill(false);
1020
1021 // Remove kill flags as kills may now be out of order with uses.
1022 MRI->clearKillFlags(OpToFold.getReg());
1023
1024 // That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
1025 // can only accept VGPR or inline immediate. Recreate a reg_sequence with
1026 // its initializers right here, so we will rematerialize immediates and
1027 // avoid copies via different reg classes.
1028 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
1029 if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
1030 getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1031 const DebugLoc &DL = UseMI->getDebugLoc();
1032 MachineBasicBlock &MBB = *UseMI->getParent();
1033
1034 UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
1035 for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
1036 UseMI->removeOperand(I);
1037
1038 MachineInstrBuilder B(*MBB.getParent(), UseMI);
1039 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
1040 SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
1041 for (unsigned I = 0; I < Size / 4; ++I) {
1042 MachineOperand *Def = Defs[I].first;
1043 TargetInstrInfo::RegSubRegPair CopyToVGPR;
1044 if (Def->isImm() &&
1045 TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1046 int64_t Imm = Def->getImm();
1047
1048 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1049 BuildMI(MBB, UseMI, DL,
1050 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
1051 B.addReg(Tmp);
1052 } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
1053 auto Src = getRegSubRegPair(*Def);
1054 Def->setIsKill(false);
1055 if (!SeenAGPRs.insert(Src)) {
1056 // We cannot build a reg_sequence out of the same registers, they
1057 // must be copied. Better do it here before copyPhysReg() created
1058 // several reads to do the AGPR->VGPR->AGPR copy.
1059 CopyToVGPR = Src;
1060 } else {
1061 B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
1062 Src.SubReg);
1063 }
1064 } else {
1065 assert(Def->isReg());
1066 Def->setIsKill(false);
1067 auto Src = getRegSubRegPair(*Def);
1068
1069 // Direct copy from SGPR to AGPR is not possible. To avoid creation
1070 // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
1071 // create a copy here and track if we already have such a copy.
1072 if (TRI->isSGPRReg(*MRI, Src.Reg)) {
1073 CopyToVGPR = Src;
1074 } else {
1075 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1076 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
1077 B.addReg(Tmp);
1078 }
1079 }
1080
1081 if (CopyToVGPR.Reg) {
1082 Register Vgpr;
1083 if (VGPRCopies.count(CopyToVGPR)) {
1084 Vgpr = VGPRCopies[CopyToVGPR];
1085 } else {
1086 Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1087 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
1088 VGPRCopies[CopyToVGPR] = Vgpr;
1089 }
1090 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1091 BuildMI(MBB, UseMI, DL,
1092 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
1093 B.addReg(Tmp);
1094 }
1095
1096 B.addImm(Defs[I].second);
1097 }
1098 LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
1099 return;
1100 }
1101
1102 if (Size != 4)
1103 return;
1104
1105 Register Reg0 = UseMI->getOperand(0).getReg();
1106 Register Reg1 = UseMI->getOperand(1).getReg();
1107 if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1))
1108 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
1109 else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
1110 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
1111 else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&
1112 TRI->isAGPR(*MRI, Reg1))
1113 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
1114 return;
1115 }
1116
1117 unsigned UseOpc = UseMI->getOpcode();
1118 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
1119 (UseOpc == AMDGPU::V_READLANE_B32 &&
1120 (int)UseOpIdx ==
1121 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
1122 // %vgpr = V_MOV_B32 imm
1123 // %sgpr = V_READFIRSTLANE_B32 %vgpr
1124 // =>
1125 // %sgpr = S_MOV_B32 imm
1126 if (FoldingImmLike) {
1127 if (execMayBeModifiedBeforeUse(*MRI,
1128 UseMI->getOperand(UseOpIdx).getReg(),
1129 *OpToFold.getParent(),
1130 *UseMI))
1131 return;
1132
1133 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
1134
1135 if (OpToFold.isImm())
1136 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
1137 else
1138 UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
1139 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1140 return;
1141 }
1142
1143 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
1144 if (execMayBeModifiedBeforeUse(*MRI,
1145 UseMI->getOperand(UseOpIdx).getReg(),
1146 *OpToFold.getParent(),
1147 *UseMI))
1148 return;
1149
1150 // %vgpr = COPY %sgpr0
1151 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
1152 // =>
1153 // %sgpr1 = COPY %sgpr0
1154 UseMI->setDesc(TII->get(AMDGPU::COPY));
1155 UseMI->getOperand(1).setReg(OpToFold.getReg());
1156 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1157 UseMI->getOperand(1).setIsKill(false);
1158 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1159 return;
1160 }
1161 }
1162
1163 const MCInstrDesc &UseDesc = UseMI->getDesc();
1164
1165 // Don't fold into target independent nodes. Target independent opcodes
1166 // don't have defined register classes.
1167 if (UseDesc.isVariadic() || UseOp->isImplicit() ||
1168 UseDesc.operands()[UseOpIdx].RegClass == -1)
1169 return;
1170 }
1171
1172 if (!FoldingImmLike) {
1173 if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
1174 // Don't fold if OpToFold doesn't hold an aligned register.
1175 const TargetRegisterClass *RC =
1176 TRI->getRegClassForReg(*MRI, OpToFold.getReg());
1177 assert(RC);
1178 if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
1179 unsigned SubReg = OpToFold.getSubReg();
1180 if (const TargetRegisterClass *SubRC =
1181 TRI->getSubRegisterClass(RC, SubReg))
1182 RC = SubRC;
1183 }
1184
1185 if (!RC || !TRI->isProperlyAlignedRC(*RC))
1186 return;
1187 }
1188
1189 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
1190
1191 // FIXME: We could try to change the instruction from 64-bit to 32-bit
1192 // to enable more folding opportunities. The shrink operands pass
1193 // already does this.
1194 return;
1195 }
1196
1197
1198 const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
1199 const TargetRegisterClass *FoldRC =
1200 TRI->getRegClass(FoldDesc.operands()[0].RegClass);
1201
1202 // Split 64-bit constants into 32-bits for folding.
1203 if (UseOp->getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
1204 Register UseReg = UseOp->getReg();
1205 const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
1206 if (AMDGPU::getRegBitWidth(*UseRC) != 64)
1207 return;
1208
1209 APInt Imm(64, OpToFold.getImm());
1210 if (UseOp->getSubReg() == AMDGPU::sub0) {
1211 Imm = Imm.getLoBits(32);
1212 } else {
1213 assert(UseOp->getSubReg() == AMDGPU::sub1);
1214 Imm = Imm.getHiBits(32);
1215 }
1216
1217 MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
1218 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);
1219 return;
1220 }
1221
1222 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
1223}
1224
1225static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
1226 uint32_t LHS, uint32_t RHS) {
1227 switch (Opcode) {
1228 case AMDGPU::V_AND_B32_e64:
1229 case AMDGPU::V_AND_B32_e32:
1230 case AMDGPU::S_AND_B32:
1231 Result = LHS & RHS;
1232 return true;
1233 case AMDGPU::V_OR_B32_e64:
1234 case AMDGPU::V_OR_B32_e32:
1235 case AMDGPU::S_OR_B32:
1236 Result = LHS | RHS;
1237 return true;
1238 case AMDGPU::V_XOR_B32_e64:
1239 case AMDGPU::V_XOR_B32_e32:
1240 case AMDGPU::S_XOR_B32:
1241 Result = LHS ^ RHS;
1242 return true;
1243 case AMDGPU::S_XNOR_B32:
1244 Result = ~(LHS ^ RHS);
1245 return true;
1246 case AMDGPU::S_NAND_B32:
1247 Result = ~(LHS & RHS);
1248 return true;
1249 case AMDGPU::S_NOR_B32:
1250 Result = ~(LHS | RHS);
1251 return true;
1252 case AMDGPU::S_ANDN2_B32:
1253 Result = LHS & ~RHS;
1254 return true;
1255 case AMDGPU::S_ORN2_B32:
1256 Result = LHS | ~RHS;
1257 return true;
1258 case AMDGPU::V_LSHL_B32_e64:
1259 case AMDGPU::V_LSHL_B32_e32:
1260 case AMDGPU::S_LSHL_B32:
1261 // The instruction ignores the high bits for out of bounds shifts.
1262 Result = LHS << (RHS & 31);
1263 return true;
1264 case AMDGPU::V_LSHLREV_B32_e64:
1265 case AMDGPU::V_LSHLREV_B32_e32:
1266 Result = RHS << (LHS & 31);
1267 return true;
1268 case AMDGPU::V_LSHR_B32_e64:
1269 case AMDGPU::V_LSHR_B32_e32:
1270 case AMDGPU::S_LSHR_B32:
1271 Result = LHS >> (RHS & 31);
1272 return true;
1273 case AMDGPU::V_LSHRREV_B32_e64:
1274 case AMDGPU::V_LSHRREV_B32_e32:
1275 Result = RHS >> (LHS & 31);
1276 return true;
1277 case AMDGPU::V_ASHR_I32_e64:
1278 case AMDGPU::V_ASHR_I32_e32:
1279 case AMDGPU::S_ASHR_I32:
1280 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1281 return true;
1282 case AMDGPU::V_ASHRREV_I32_e64:
1283 case AMDGPU::V_ASHRREV_I32_e32:
1284 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1285 return true;
1286 default:
1287 return false;
1288 }
1289}
1290
1291static unsigned getMovOpc(bool IsScalar) {
1292 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1293}
1294
1295static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
1296 MI.setDesc(NewDesc);
1297
1298 // Remove any leftover implicit operands from mutating the instruction. e.g.
1299 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
1300 // anymore.
1301 const MCInstrDesc &Desc = MI.getDesc();
1302 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
1303 Desc.implicit_defs().size();
1304
1305 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
1306 MI.removeOperand(I);
1307}
1308
1310SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const {
1311 // If this has a subregister, it obviously is a register source.
1312 if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister ||
1313 !Op.getReg().isVirtual())
1314 return &Op;
1315
1316 MachineInstr *Def = MRI->getVRegDef(Op.getReg());
1317 if (Def && Def->isMoveImmediate()) {
1318 MachineOperand &ImmSrc = Def->getOperand(1);
1319 if (ImmSrc.isImm())
1320 return &ImmSrc;
1321 }
1322
1323 return &Op;
1324}
1325
1326// Try to simplify operations with a constant that may appear after instruction
1327// selection.
1328// TODO: See if a frame index with a fixed offset can fold.
1329bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
1330 if (!MI->allImplicitDefsAreDead())
1331 return false;
1332
1333 unsigned Opc = MI->getOpcode();
1334
1335 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1336 if (Src0Idx == -1)
1337 return false;
1338 MachineOperand *Src0 = getImmOrMaterializedImm(MI->getOperand(Src0Idx));
1339
1340 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1341 Opc == AMDGPU::S_NOT_B32) &&
1342 Src0->isImm()) {
1343 MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
1344 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1345 return true;
1346 }
1347
1348 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1349 if (Src1Idx == -1)
1350 return false;
1351 MachineOperand *Src1 = getImmOrMaterializedImm(MI->getOperand(Src1Idx));
1352
1353 if (!Src0->isImm() && !Src1->isImm())
1354 return false;
1355
1356 // and k0, k1 -> v_mov_b32 (k0 & k1)
1357 // or k0, k1 -> v_mov_b32 (k0 | k1)
1358 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1359 if (Src0->isImm() && Src1->isImm()) {
1360 int32_t NewImm;
1361 if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
1362 return false;
1363
1364 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1365
1366 // Be careful to change the right operand, src0 may belong to a different
1367 // instruction.
1368 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1369 MI->removeOperand(Src1Idx);
1370 mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
1371 return true;
1372 }
1373
1374 if (!MI->isCommutable())
1375 return false;
1376
1377 if (Src0->isImm() && !Src1->isImm()) {
1378 std::swap(Src0, Src1);
1379 std::swap(Src0Idx, Src1Idx);
1380 }
1381
1382 int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
1383 if (Opc == AMDGPU::V_OR_B32_e64 ||
1384 Opc == AMDGPU::V_OR_B32_e32 ||
1385 Opc == AMDGPU::S_OR_B32) {
1386 if (Src1Val == 0) {
1387 // y = or x, 0 => y = copy x
1388 MI->removeOperand(Src1Idx);
1389 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1390 } else if (Src1Val == -1) {
1391 // y = or x, -1 => y = v_mov_b32 -1
1392 MI->removeOperand(Src1Idx);
1393 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1394 } else
1395 return false;
1396
1397 return true;
1398 }
1399
1400 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1401 Opc == AMDGPU::S_AND_B32) {
1402 if (Src1Val == 0) {
1403 // y = and x, 0 => y = v_mov_b32 0
1404 MI->removeOperand(Src0Idx);
1405 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1406 } else if (Src1Val == -1) {
1407 // y = and x, -1 => y = copy x
1408 MI->removeOperand(Src1Idx);
1409 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1410 } else
1411 return false;
1412
1413 return true;
1414 }
1415
1416 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1417 Opc == AMDGPU::S_XOR_B32) {
1418 if (Src1Val == 0) {
1419 // y = xor x, 0 => y = copy x
1420 MI->removeOperand(Src1Idx);
1421 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1422 return true;
1423 }
1424 }
1425
1426 return false;
1427}
1428
1429// Try to fold an instruction into a simpler one
1430bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
1431 unsigned Opc = MI.getOpcode();
1432 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1433 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1434 return false;
1435
1436 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1437 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1438 if (!Src1->isIdenticalTo(*Src0)) {
1439 auto *Src0Imm = getImmOrMaterializedImm(*Src0);
1440 auto *Src1Imm = getImmOrMaterializedImm(*Src1);
1441 if (!Src1Imm->isIdenticalTo(*Src0Imm))
1442 return false;
1443 }
1444
1445 int Src1ModIdx =
1446 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1447 int Src0ModIdx =
1448 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1449 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1450 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1451 return false;
1452
1453 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1454 auto &NewDesc =
1455 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1456 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1457 if (Src2Idx != -1)
1458 MI.removeOperand(Src2Idx);
1459 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1460 if (Src1ModIdx != -1)
1461 MI.removeOperand(Src1ModIdx);
1462 if (Src0ModIdx != -1)
1463 MI.removeOperand(Src0ModIdx);
1464 mutateCopyOp(MI, NewDesc);
1465 LLVM_DEBUG(dbgs() << MI);
1466 return true;
1467}
1468
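// Erase "and dst, 0xffff, src" when the instruction defining src is already
// known to write zeros to the high 16 bits of its result.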
1469bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
1470 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1471 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1472 return false;
1473
1474 MachineOperand *Src0 = getImmOrMaterializedImm(MI.getOperand(1));
1475 if (!Src0->isImm() || Src0->getImm() != 0xffff)
1476 return false;
1477
1478 Register Src1 = MI.getOperand(2).getReg();
1479 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1480 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1481 return false;
1482
1483 Register Dst = MI.getOperand(0).getReg();
1484 MRI->replaceRegWith(Dst, Src1);
1485 if (!MI.getOperand(2).isKill())
1486 MRI->clearKillFlags(Src1);
1487 MI.eraseFromParent();
1488 return true;
1489}
1490
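// Fold OpToFold into every non-debug use of MI's destination register:
// immediate folds may first constant-fold or simplify the users, then the
// accumulated fold candidates are applied (and commuted instructions restored
// if their fold ultimately fails).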
1491bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
1492 MachineOperand &OpToFold) const {
1493 // We need to mutate the operands of new mov instructions to add implicit
1494 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1495 // this.
1496 SmallVector<MachineInstr *, 4> CopiesToReplace;
1497 SmallVector<FoldCandidate, 4> FoldList;
1498 MachineOperand &Dst = MI.getOperand(0);
1499 bool Changed = false;
1500
1501 if (OpToFold.isImm()) {
1502 for (auto &UseMI :
1503 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1504 // Folding the immediate may reveal operations that can be constant
1505 // folded or replaced with a copy. This can happen for example after
1506 // frame indices are lowered to constants or from splitting 64-bit
1507 // constants.
1508 //
1509 // We may also encounter cases where one or both operands are
1510 // immediates materialized into a register, which would ordinarily not
1511 // be folded due to multiple uses or operand constraints.
1512 if (tryConstantFoldOp(&UseMI)) {
1513 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1514 Changed = true;
1515 }
1516 }
1517 }
1518
1519 SmallVector<MachineOperand *, 4> UsesToProcess;
1520 for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
1521 UsesToProcess.push_back(&Use);
1522 for (auto *U : UsesToProcess) {
1523 MachineInstr *UseMI = U->getParent();
1524 foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1525 CopiesToReplace);
1526 }
1527
1528 if (CopiesToReplace.empty() && FoldList.empty())
1529 return Changed;
1530
1531 MachineFunction *MF = MI.getParent()->getParent();
1532 // Make sure we add EXEC uses to any new v_mov instructions created.
1533 for (MachineInstr *Copy : CopiesToReplace)
1534 Copy->addImplicitDefUseOperands(*MF);
1535
1536 for (FoldCandidate &Fold : FoldList) {
1537 assert(!Fold.isReg() || Fold.OpToFold);
1538 if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
1539 Register Reg = Fold.OpToFold->getReg();
1540 MachineInstr *DefMI = Fold.OpToFold->getParent();
1541 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1542 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1543 continue;
1544 }
1545 if (updateOperand(Fold)) {
1546 // Clear kill flags.
1547 if (Fold.isReg()) {
1548 assert(Fold.OpToFold && Fold.OpToFold->isReg());
1549 // FIXME: Probably shouldn't bother trying to fold if not an
1550 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1551 // copies.
1552 MRI->clearKillFlags(Fold.OpToFold->getReg());
1553 }
1554 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1555 << static_cast<int>(Fold.UseOpNo) << " of "
1556 << *Fold.UseMI);
1557 } else if (Fold.Commuted) {
1558 // Restoring instruction's original operand order if fold has failed.
1559 TII->commuteInstruction(*Fold.UseMI, false);
1560 }
1561 }
1562 return true;
1563}
1564
1565bool SIFoldOperandsImpl::tryFoldFoldableCopy(
1566 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
1567 Register DstReg = MI.getOperand(0).getReg();
1568 // Specially track simple redefs of m0 to the same value in a block, so we
1569 // can erase the later ones.
1570 if (DstReg == AMDGPU::M0) {
1571 MachineOperand &NewM0Val = MI.getOperand(1);
1572 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1573 MI.eraseFromParent();
1574 return true;
1575 }
1576
1577 // We aren't tracking other physical registers
1578 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
1579 ? nullptr
1580 : &NewM0Val;
1581 return false;
1582 }
1583
1584 MachineOperand *OpToFoldPtr;
1585 if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
1586 // Folding when any src_modifiers are non-zero is unsupported
1587 if (TII->hasAnyModifiersSet(MI))
1588 return false;
1589 OpToFoldPtr = &MI.getOperand(2);
1590 } else
1591 OpToFoldPtr = &MI.getOperand(1);
1592 MachineOperand &OpToFold = *OpToFoldPtr;
1593 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1594
1595 // FIXME: We could also be folding things like TargetIndexes.
1596 if (!FoldingImm && !OpToFold.isReg())
1597 return false;
1598
1599 if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
1600 return false;
1601
1602 // Prevent folding operands backwards in the function. For example,
1603 // the COPY opcode must not be replaced by 1 in this example:
1604 //
1605 // %3 = COPY %vgpr0; VGPR_32:%3
1606 // ...
1607 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
1608 if (!DstReg.isVirtual())
1609 return false;
1610
1611 if (OpToFold.isReg() &&
1612 foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI))
1613 return true;
1614
1615 bool Changed = foldInstOperand(MI, OpToFold);
1616
1617 // If we managed to fold all uses of this copy then we might as well
1618 // delete it now.
1619 // The only reason we need to follow chains of copies here is that
1620 // tryFoldRegSequence looks forward through copies before folding a
1621 // REG_SEQUENCE into its eventual users.
1622 auto *InstToErase = &MI;
1623 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1624 auto &SrcOp = InstToErase->getOperand(1);
1625 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
1626 InstToErase->eraseFromParent();
1627 Changed = true;
1628 InstToErase = nullptr;
1629 if (!SrcReg || SrcReg.isPhysical())
1630 break;
1631 InstToErase = MRI->getVRegDef(SrcReg);
1632 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
1633 break;
1634 }
1635
1636 if (InstToErase && InstToErase->isRegSequence() &&
1637 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1638 InstToErase->eraseFromParent();
1639 Changed = true;
1640 }
1641
1642 return Changed;
1643}
1644
1645// Clamp patterns are canonically selected to v_max_* instructions, so only
1646// handle them.
1647const MachineOperand *
1648SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
1649 unsigned Op = MI.getOpcode();
1650 switch (Op) {
1651 case AMDGPU::V_MAX_F32_e64:
1652 case AMDGPU::V_MAX_F16_e64:
1653 case AMDGPU::V_MAX_F16_t16_e64:
1654 case AMDGPU::V_MAX_F16_fake16_e64:
1655 case AMDGPU::V_MAX_F64_e64:
1656 case AMDGPU::V_MAX_NUM_F64_e64:
1657 case AMDGPU::V_PK_MAX_F16: {
1658 if (MI.mayRaiseFPException())
1659 return nullptr;
1660
1661 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
1662 return nullptr;
1663
1664 // Make sure sources are identical.
1665 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1666 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1667 if (!Src0->isReg() || !Src1->isReg() ||
1668 Src0->getReg() != Src1->getReg() ||
1669 Src0->getSubReg() != Src1->getSubReg() ||
1670 Src0->getSubReg() != AMDGPU::NoSubRegister)
1671 return nullptr;
1672
1673 // Can't fold up if we have modifiers.
1674 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1675 return nullptr;
1676
1677 unsigned Src0Mods
1678 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
1679 unsigned Src1Mods
1680 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
1681
1682 // Having a 0 op_sel_hi would require swizzling the output in the source
1683 // instruction, which we can't do.
1684 unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
1685 : 0u;
1686 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
1687 return nullptr;
1688 return Src0;
1689 }
1690 default:
1691 return nullptr;
1692 }
1693}
1694
1695// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
1696bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
1697 const MachineOperand *ClampSrc = isClamp(MI);
1698 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
1699 return false;
1700
1701 MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1702
1703 // The type of clamp must be compatible.
1704 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
1705 return false;
1706
1707 if (Def->mayRaiseFPException())
1708 return false;
1709
1710 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
1711 if (!DefClamp)
1712 return false;
1713
1714 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
1715
1716 // Clamp is applied after omod, so it is OK if omod is set.
1717 DefClamp->setImm(1);
1718
1719 Register DefReg = Def->getOperand(0).getReg();
1720 Register MIDstReg = MI.getOperand(0).getReg();
1721 if (TRI->isSGPRReg(*MRI, DefReg)) {
1722 // Pseudo scalar instructions have an SGPR for dst and clamp is a v_max*
1723 // instruction with a VGPR dst.
1724 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
1725 MIDstReg)
1726 .addReg(DefReg);
1727 } else {
1728 MRI->replaceRegWith(MIDstReg, DefReg);
1729 }
1730 MI.eraseFromParent();
1731
1732 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1733 // instruction, so we might as well convert it to the more flexible VOP3-only
1734 // mad/fma form.
1735 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1736 Def->eraseFromParent();
1737
1738 return true;
1739}
1740
1741static int getOModValue(unsigned Opc, int64_t Val) {
1742 switch (Opc) {
1743 case AMDGPU::V_MUL_F64_e64:
1744 case AMDGPU::V_MUL_F64_pseudo_e64: {
1745 switch (Val) {
1746 case 0x3fe0000000000000: // 0.5
1747 return SIOutMods::DIV2;
1748 case 0x4000000000000000: // 2.0
1749 return SIOutMods::MUL2;
1750 case 0x4010000000000000: // 4.0
1751 return SIOutMods::MUL4;
1752 default:
1753 return SIOutMods::NONE;
1754 }
1755 }
1756 case AMDGPU::V_MUL_F32_e64: {
1757 switch (static_cast<uint32_t>(Val)) {
1758 case 0x3f000000: // 0.5
1759 return SIOutMods::DIV2;
1760 case 0x40000000: // 2.0
1761 return SIOutMods::MUL2;
1762 case 0x40800000: // 4.0
1763 return SIOutMods::MUL4;
1764 default:
1765 return SIOutMods::NONE;
1766 }
1767 }
1768 case AMDGPU::V_MUL_F16_e64:
1769 case AMDGPU::V_MUL_F16_t16_e64:
1770 case AMDGPU::V_MUL_F16_fake16_e64: {
1771 switch (static_cast<uint16_t>(Val)) {
1772 case 0x3800: // 0.5
1773 return SIOutMods::DIV2;
1774 case 0x4000: // 2.0
1775 return SIOutMods::MUL2;
1776 case 0x4400: // 4.0
1777 return SIOutMods::MUL4;
1778 default:
1779 return SIOutMods::NONE;
1780 }
1781 }
1782 default:
1783 llvm_unreachable("invalid mul opcode");
1784 }
1785}
1786
1787// FIXME: Does this really not support denormals with f16?
1788// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
1789// handled, so will anything other than that break?
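// Roughly, isOMod() recognizes the following patterns (hypothetical examples;
// the exact immediate encodings are matched in getOModValue above):
//   v_mul_* %x, 0.5  -> (%x, SIOutMods::DIV2)
//   v_mul_* %x, 2.0  -> (%x, SIOutMods::MUL2)
//   v_mul_* %x, 4.0  -> (%x, SIOutMods::MUL4)
//   v_add_* %x, %x   -> (%x, SIOutMods::MUL2)   (canonicalized fmul x, 2)
// and returns (nullptr, SIOutMods::NONE) when source modifiers, clamp, omod,
// denormal mode or FP exceptions rule the fold out.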
1790std::pair<const MachineOperand *, int>
1791SIFoldOperandsImpl::isOMod(const MachineInstr &MI) const {
1792 unsigned Op = MI.getOpcode();
1793 switch (Op) {
1794 case AMDGPU::V_MUL_F64_e64:
1795 case AMDGPU::V_MUL_F64_pseudo_e64:
1796 case AMDGPU::V_MUL_F32_e64:
1797 case AMDGPU::V_MUL_F16_t16_e64:
1798 case AMDGPU::V_MUL_F16_fake16_e64:
1799 case AMDGPU::V_MUL_F16_e64: {
1800 // If output denormals are enabled, omod is ignored.
1801 if ((Op == AMDGPU::V_MUL_F32_e64 &&
1802 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1803 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
1804 Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
1805 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
1806          MFI->getMode().FP64FP16Denormals.Output !=
1807              DenormalMode::PreserveSign) ||
1808         MI.mayRaiseFPException())
1809 return std::pair(nullptr, SIOutMods::NONE);
1810
1811 const MachineOperand *RegOp = nullptr;
1812 const MachineOperand *ImmOp = nullptr;
1813 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1814 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1815 if (Src0->isImm()) {
1816 ImmOp = Src0;
1817 RegOp = Src1;
1818 } else if (Src1->isImm()) {
1819 ImmOp = Src1;
1820 RegOp = Src0;
1821 } else
1822 return std::pair(nullptr, SIOutMods::NONE);
1823
1824 int OMod = getOModValue(Op, ImmOp->getImm());
1825 if (OMod == SIOutMods::NONE ||
1826 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
1827 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
1828 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
1829 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
1830 return std::pair(nullptr, SIOutMods::NONE);
1831
1832 return std::pair(RegOp, OMod);
1833 }
1834 case AMDGPU::V_ADD_F64_e64:
1835 case AMDGPU::V_ADD_F64_pseudo_e64:
1836 case AMDGPU::V_ADD_F32_e64:
1837 case AMDGPU::V_ADD_F16_e64:
1838 case AMDGPU::V_ADD_F16_t16_e64:
1839 case AMDGPU::V_ADD_F16_fake16_e64: {
1840 // If output denormals are enabled, omod is ignored.
1841 if ((Op == AMDGPU::V_ADD_F32_e64 &&
1842 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1843 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
1844 Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
1845 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
1846 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
1847 return std::pair(nullptr, SIOutMods::NONE);
1848
1849 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
1850 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1851 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1852
1853 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
1854 Src0->getSubReg() == Src1->getSubReg() &&
1855 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
1856 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
1857 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
1858 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1859 return std::pair(Src0, SIOutMods::MUL2);
1860
1861 return std::pair(nullptr, SIOutMods::NONE);
1862 }
1863 default:
1864 return std::pair(nullptr, SIOutMods::NONE);
1865 }
1866}
1867
1868// FIXME: Does this need to check IEEE bit on function?
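// Sketch of the rewrite performed below (hypothetical MIR, operands
// abbreviated):
//   %1:vgpr_32 = V_FMA_F32_e64 0, %a, 0, %b, 0, %c, 0, /*omod*/ 0
//   %2:vgpr_32 = V_MUL_F32_e64 0, 2.0, 0, %1, 0, /*omod*/ 0
// The multiply is erased and its scale becomes the def's output modifier:
//   %1:vgpr_32 = V_FMA_F32_e64 0, %a, 0, %b, 0, %c, 0, /*omod*/ SIOutMods::MUL2
// with uses of %2 rewritten to use %1.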
1869bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
1870 const MachineOperand *RegOp;
1871 int OMod;
1872 std::tie(RegOp, OMod) = isOMod(MI);
1873 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
1874 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
1875 !MRI->hasOneNonDBGUser(RegOp->getReg()))
1876 return false;
1877
1878 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
1879 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
1880 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
1881 return false;
1882
1883 if (Def->mayRaiseFPException())
1884 return false;
1885
1886 // Clamp is applied after omod. If the source already has clamp set, don't
1887 // fold it.
1888 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
1889 return false;
1890
1891 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
1892
1893 DefOMod->setImm(OMod);
1894 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1895 // Kill flags can be wrong if we replaced a def inside a loop with a def
1896 // outside the loop.
1897 MRI->clearKillFlags(Def->getOperand(0).getReg());
1898 MI.eraseFromParent();
1899
1900 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1901 // instruction, so we might as well convert it to the more flexible VOP3-only
1902 // mad/fma form.
1903 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1904 Def->eraseFromParent();
1905
1906 return true;
1907}
1908
1909// Try to fold a reg_sequence with vgpr output and agpr inputs into an
1910// instruction which can take an agpr. So far that means a store.
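// Illustrative example (hypothetical MIR, memory operands omitted):
//   %2:vgpr_32 = COPY %0:agpr_32
//   %3:vgpr_32 = COPY %1:agpr_32
//   %4:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
//   GLOBAL_STORE_DWORDX2 %ptr, %4, ...
// A new AGPR REG_SEQUENCE is built from the original AGPR inputs and the store
// is rewritten to consume it, provided its operand's register class is a
// vector super class that allows AGPRs:
//   %5:areg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
//   GLOBAL_STORE_DWORDX2 %ptr, %5, ...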
1911bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
1912 assert(MI.isRegSequence());
1913 auto Reg = MI.getOperand(0).getReg();
1914
1915 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
1916 !MRI->hasOneNonDBGUse(Reg))
1917 return false;
1918
1919   SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
1920   if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
1921 return false;
1922
1923 for (auto &[Op, SubIdx] : Defs) {
1924 if (!Op->isReg())
1925 return false;
1926 if (TRI->isAGPR(*MRI, Op->getReg()))
1927 continue;
1928 // Maybe this is a COPY from AREG
1929 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
1930 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
1931 return false;
1932 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
1933 return false;
1934 }
1935
1936 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
1937 MachineInstr *UseMI = Op->getParent();
1938 while (UseMI->isCopy() && !Op->getSubReg()) {
1939 Reg = UseMI->getOperand(0).getReg();
1940 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
1941 return false;
1942 Op = &*MRI->use_nodbg_begin(Reg);
1943 UseMI = Op->getParent();
1944 }
1945
1946 if (Op->getSubReg())
1947 return false;
1948
1949 unsigned OpIdx = Op - &UseMI->getOperand(0);
1950 const MCInstrDesc &InstDesc = UseMI->getDesc();
1951 const TargetRegisterClass *OpRC =
1952 TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
1953 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
1954 return false;
1955
1956 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
1957 auto Dst = MRI->createVirtualRegister(NewDstRC);
1958 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
1959 TII->get(AMDGPU::REG_SEQUENCE), Dst);
1960
1961 for (auto &[Def, SubIdx] : Defs) {
1962 Def->setIsKill(false);
1963 if (TRI->isAGPR(*MRI, Def->getReg())) {
1964 RS.add(*Def);
1965 } else { // This is a copy
1966 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
1967 SubDef->getOperand(1).setIsKill(false);
1968 RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
1969 }
1970 RS.addImm(SubIdx);
1971 }
1972
1973 Op->setReg(Dst);
1974 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
1975 Op->setReg(Reg);
1976 RS->eraseFromParent();
1977 return false;
1978 }
1979
1980 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
1981
1982 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
1983 // in which case we can erase them all later in runOnMachineFunction.
1984 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
1985 MI.eraseFromParent();
1986 return true;
1987}
1988
1989 /// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success
1990 /// and stores the AGPR register in \p OutReg and the subreg in \p OutSubReg.
1991static bool isAGPRCopy(const SIRegisterInfo &TRI,
1992 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
1993 Register &OutReg, unsigned &OutSubReg) {
1994 assert(Copy.isCopy());
1995
1996 const MachineOperand &CopySrc = Copy.getOperand(1);
1997 Register CopySrcReg = CopySrc.getReg();
1998 if (!CopySrcReg.isVirtual())
1999 return false;
2000
2001 // Common case: copy from AGPR directly, e.g.
2002 // %1:vgpr_32 = COPY %0:agpr_32
2003 if (TRI.isAGPR(MRI, CopySrcReg)) {
2004 OutReg = CopySrcReg;
2005 OutSubReg = CopySrc.getSubReg();
2006 return true;
2007 }
2008
2009 // Sometimes it can also involve two copies, e.g.
2010 // %1:vgpr_256 = COPY %0:agpr_256
2011 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
2012 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
2013 if (!CopySrcDef || !CopySrcDef->isCopy())
2014 return false;
2015
2016 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
2017 Register OtherCopySrcReg = OtherCopySrc.getReg();
2018 if (!OtherCopySrcReg.isVirtual() ||
2019 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
2020 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
2021 !TRI.isAGPR(MRI, OtherCopySrcReg))
2022 return false;
2023
2024 OutReg = OtherCopySrcReg;
2025 OutSubReg = CopySrc.getSubReg();
2026 return true;
2027}
2028
2029// Try to hoist an AGPR to VGPR copy across a PHI.
2030// This should allow folding of an AGPR into a consumer which may support it.
2031//
2032// Example 1: LCSSA PHI
2033// loop:
2034// %1:vreg = COPY %0:areg
2035// exit:
2036// %2:vreg = PHI %1:vreg, %loop
2037// =>
2038// loop:
2039// exit:
2040// %1:areg = PHI %0:areg, %loop
2041// %2:vreg = COPY %1:areg
2042//
2043// Example 2: PHI with multiple incoming values:
2044// entry:
2045// %1:vreg = GLOBAL_LOAD(..)
2046// loop:
2047// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
2048// %3:areg = COPY %2:vreg
2049// %4:areg = (instr using %3:areg)
2050// %5:vreg = COPY %4:areg
2051// =>
2052// entry:
2053// %1:vreg = GLOBAL_LOAD(..)
2054// %2:areg = COPY %1:vreg
2055// loop:
2056// %3:areg = PHI %2:areg, %entry, %X:areg,
2057// %4:areg = (instr using %3:areg)
2058bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &PHI) {
2059 assert(PHI.isPHI());
2060
2061 Register PhiOut = PHI.getOperand(0).getReg();
2062 if (!TRI->isVGPR(*MRI, PhiOut))
2063 return false;
2064
2065 // Iterate once over all incoming values of the PHI to check if this PHI is
2066 // eligible, and determine the exact AGPR RC we'll target.
2067 const TargetRegisterClass *ARC = nullptr;
2068 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2069 MachineOperand &MO = PHI.getOperand(K);
2070 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
2071 if (!Copy || !Copy->isCopy())
2072 continue;
2073
2074 Register AGPRSrc;
2075 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
2076 if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
2077 continue;
2078
2079 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
2080 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
2081 CopyInRC = SubRC;
2082
2083 if (ARC && !ARC->hasSubClassEq(CopyInRC))
2084 return false;
2085 ARC = CopyInRC;
2086 }
2087
2088 if (!ARC)
2089 return false;
2090
2091 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
2092
2093 // Rewrite the PHI's incoming values to ARC.
2094 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
2095 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2096 MachineOperand &MO = PHI.getOperand(K);
2097 Register Reg = MO.getReg();
2098
2099       MachineBasicBlock::iterator InsertPt;
2100       MachineBasicBlock *InsertMBB = nullptr;
2101
2102 // Look at the def of Reg, ignoring all copies.
2103 unsigned CopyOpc = AMDGPU::COPY;
2104 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
2105
2106 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
2107 // the copy was single-use, it will be removed by DCE later.
2108 if (Def->isCopy()) {
2109 Register AGPRSrc;
2110 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
2111 if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
2112 MO.setReg(AGPRSrc);
2113 MO.setSubReg(AGPRSubReg);
2114 continue;
2115 }
2116
2117 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
2118 // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try
2119 // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
2120 // is unlikely to be profitable.
2121 //
2122 // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
2123 MachineOperand &CopyIn = Def->getOperand(1);
2124 if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
2125 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
2126 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
2127 }
2128
2129 InsertMBB = Def->getParent();
2130 InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
2131 } else {
2132 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
2133 InsertPt = InsertMBB->getFirstTerminator();
2134 }
2135
2136 Register NewReg = MRI->createVirtualRegister(ARC);
2137 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
2138 TII->get(CopyOpc), NewReg)
2139 .addReg(Reg);
2140 MO.setReg(NewReg);
2141
2142 (void)MI;
2143 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
2144 }
2145
2146 // Replace the PHI's result with a new register.
2147 Register NewReg = MRI->createVirtualRegister(ARC);
2148 PHI.getOperand(0).setReg(NewReg);
2149
2150 // COPY that new register back to the original PhiOut register. This COPY will
2151 // usually be folded out later.
2152 MachineBasicBlock *MBB = PHI.getParent();
2153 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
2154 TII->get(AMDGPU::COPY), PhiOut)
2155 .addReg(NewReg);
2156
2157 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
2158 return true;
2159}
2160
2161// Attempt to convert VGPR load to an AGPR load.
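// Illustrative example (hypothetical MIR, memory operands omitted): when every
// transitive non-debug use of the loaded value is a COPY or REG_SEQUENCE that
// ultimately lands in AGPRs,
//   %0:vgpr_32 = GLOBAL_LOAD_DWORD %ptr, ...
//   %1:agpr_32 = COPY %0
// the register classes are switched so the load writes the AGPR directly:
//   %0:agpr_32 = GLOBAL_LOAD_DWORD %ptr, ...
//   %1:agpr_32 = COPY %0
// This is only done when the subtarget has gfx90a instructions and the AGPR
// destination is legal for the load.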
2162bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &MI) {
2163 assert(MI.mayLoad());
2164 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
2165 return false;
2166
2167 MachineOperand &Def = MI.getOperand(0);
2168 if (!Def.isDef())
2169 return false;
2170
2171 Register DefReg = Def.getReg();
2172
2173 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
2174 return false;
2175
2176   SmallVector<const MachineInstr *, 8> Users;
2177   SmallVector<Register, 8> MoveRegs;
2178 for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg))
2179 Users.push_back(&I);
2180
2181 if (Users.empty())
2182 return false;
2183
2184   // Check that every use is a copy to an agpr or a reg_sequence producing an agpr.
2185 while (!Users.empty()) {
2186 const MachineInstr *I = Users.pop_back_val();
2187 if (!I->isCopy() && !I->isRegSequence())
2188 return false;
2189 Register DstReg = I->getOperand(0).getReg();
2190     // Physical registers may have more than one defining instruction.
2191 if (DstReg.isPhysical())
2192 return false;
2193 if (TRI->isAGPR(*MRI, DstReg))
2194 continue;
2195 MoveRegs.push_back(DstReg);
2196 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2197 Users.push_back(&U);
2198 }
2199
2200 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2201 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2202 if (!TII->isOperandLegal(MI, 0, &Def)) {
2203 MRI->setRegClass(DefReg, RC);
2204 return false;
2205 }
2206
2207 while (!MoveRegs.empty()) {
2208 Register Reg = MoveRegs.pop_back_val();
2209 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2210 }
2211
2212 LLVM_DEBUG(dbgs() << "Folded " << MI);
2213
2214 return true;
2215}
2216
2217// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
2218// For GFX90A and later, this is pretty much always a good thing, but for GFX908
2219 // there are cases where it can create many more AGPR-AGPR copies, which are
2220// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
2221//
2222// This function looks at all AGPR PHIs in a basic block and collects their
2223 // operands. Then, it checks for registers that are used more than once across
2224// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
2225// having to create one VGPR temporary per use, which can get very messy if
2226// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
2227// element).
2228//
2229// Example
2230// a:
2231// %in:agpr_256 = COPY %foo:vgpr_256
2232// c:
2233// %x:agpr_32 = ..
2234// b:
2235// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
2236// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
2237// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
2238// =>
2239// a:
2240// %in:agpr_256 = COPY %foo:vgpr_256
2241// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
2242// %tmp_agpr:agpr_32 = COPY %tmp
2243// c:
2244// %x:agpr_32 = ..
2245// b:
2246// %0:areg = PHI %tmp_agpr, %a, %x, %c
2247// %1:areg = PHI %tmp_agpr, %a, %y, %c
2248// %2:areg = PHI %tmp_agpr, %a, %z, %c
2249bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2250 // This is only really needed on GFX908 where AGPR-AGPR copies are
2251 // unreasonably difficult.
2252 if (ST->hasGFX90AInsts())
2253 return false;
2254
2255 // Look at all AGPR Phis and collect the register + subregister used.
2256 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2257 RegToMO;
2258
2259 for (auto &MI : MBB) {
2260 if (!MI.isPHI())
2261 break;
2262
2263 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2264 continue;
2265
2266 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2267 MachineOperand &PhiMO = MI.getOperand(K);
2268 if (!PhiMO.getSubReg())
2269 continue;
2270 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2271 }
2272 }
2273
2274   // For all (Reg, SubReg) pairs that are used more than once, cache the value in
2275 // a VGPR.
2276 bool Changed = false;
2277 for (const auto &[Entry, MOs] : RegToMO) {
2278 if (MOs.size() == 1)
2279 continue;
2280
2281 const auto [Reg, SubReg] = Entry;
2282 MachineInstr *Def = MRI->getVRegDef(Reg);
2283 MachineBasicBlock *DefMBB = Def->getParent();
2284
2285 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2286 // out.
2287 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2288 Register TempVGPR =
2289 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2290 MachineInstr *VGPRCopy =
2291 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2292 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2293 .addReg(Reg, /* flags */ 0, SubReg);
2294
2295 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2296 Register TempAGPR = MRI->createVirtualRegister(ARC);
2297 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2298 TII->get(AMDGPU::COPY), TempAGPR)
2299 .addReg(TempVGPR);
2300
2301 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2302 for (MachineOperand *MO : MOs) {
2303 MO->setReg(TempAGPR);
2304 MO->setSubReg(AMDGPU::NoSubRegister);
2305 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2306 }
2307
2308 Changed = true;
2309 }
2310
2311 return Changed;
2312}
2313
2314bool SIFoldOperandsImpl::run(MachineFunction &MF) {
2315 MRI = &MF.getRegInfo();
2316 ST = &MF.getSubtarget<GCNSubtarget>();
2317 TII = ST->getInstrInfo();
2318 TRI = &TII->getRegisterInfo();
2319 MFI = MF.getInfo<SIMachineFunctionInfo>();
2320
2321 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2322 // correctly handle signed zeros.
2323 //
2324 // FIXME: Also need to check strictfp
2325 bool IsIEEEMode = MFI->getMode().IEEE;
2326 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2327
2328 bool Changed = false;
2329 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2330 MachineOperand *CurrentKnownM0Val = nullptr;
2331 for (auto &MI : make_early_inc_range(*MBB)) {
2332 Changed |= tryFoldCndMask(MI);
2333
2334 if (tryFoldZeroHighBits(MI)) {
2335 Changed = true;
2336 continue;
2337 }
2338
2339 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2340 Changed = true;
2341 continue;
2342 }
2343
2344 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2345 Changed = true;
2346 continue;
2347 }
2348
2349 if (MI.mayLoad() && tryFoldLoad(MI)) {
2350 Changed = true;
2351 continue;
2352 }
2353
2354 if (TII->isFoldableCopy(MI)) {
2355 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2356 continue;
2357 }
2358
2359 // Saw an unknown clobber of m0, so we no longer know what it is.
2360 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2361 CurrentKnownM0Val = nullptr;
2362
2363 // TODO: Omod might be OK if there is NSZ only on the source
2364 // instruction, and not the omod multiply.
2365 if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
2366 !tryFoldOMod(MI))
2367 Changed |= tryFoldClamp(MI);
2368 }
2369
2370 Changed |= tryOptimizeAGPRPhis(*MBB);
2371 }
2372
2373 return Changed;
2374}
2375
2376 PreservedAnalyses SIFoldOperandsPass::run(MachineFunction &MF,
2377                                           MachineFunctionAnalysisManager &MFAM) {
2378   bool Changed = SIFoldOperandsImpl().run(MF);
2379 if (!Changed) {
2380 return PreservedAnalyses::all();
2381 }
2382   PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
2383   PA.preserveSet<CFGAnalyses>();
2384 return PA;
2385}