SIFoldOperands.cpp
1//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7/// \file
8//===----------------------------------------------------------------------===//
9//
10
11#include "SIFoldOperands.h"
12#include "AMDGPU.h"
13#include "GCNSubtarget.h"
14#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
15#include "SIMachineFunctionInfo.h"
16#include "llvm/ADT/DepthFirstIterator.h"
17#include "llvm/CodeGen/MachineFunctionPass.h"
18#include "llvm/CodeGen/MachineOperand.h"
19
20#define DEBUG_TYPE "si-fold-operands"
21using namespace llvm;
22
23namespace {
24
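// A FoldCandidate records one pending fold: the use instruction and operand
// number to rewrite, and the value to fold in its place (a register, global
// address, immediate, or frame index, discriminated by Kind). ShrinkOpcode,
// when not -1, is the 32-bit encoding the user should be shrunk to.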
25struct FoldCandidate {
26 MachineInstr *UseMI;
27 union {
28 MachineOperand *OpToFold;
29 uint64_t ImmToFold;
30 int FrameIndexToFold;
31 };
32 int ShrinkOpcode;
33 unsigned UseOpNo;
34 MachineOperand::MachineOperandType Kind;
35 bool Commuted;
36
37 FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
38 bool Commuted_ = false,
39 int ShrinkOp = -1) :
40 UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
41 Kind(FoldOp->getType()),
42 Commuted(Commuted_) {
43 if (FoldOp->isImm()) {
44 ImmToFold = FoldOp->getImm();
45 } else if (FoldOp->isFI()) {
46 FrameIndexToFold = FoldOp->getIndex();
47 } else {
48 assert(FoldOp->isReg() || FoldOp->isGlobal());
49 OpToFold = FoldOp;
50 }
51 }
52
53 bool isFI() const {
54 return Kind == MachineOperand::MO_FrameIndex;
55 }
56
57 bool isImm() const {
58 return Kind == MachineOperand::MO_Immediate;
59 }
60
61 bool isReg() const {
62 return Kind == MachineOperand::MO_Register;
63 }
64
65 bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
66
67 bool needsShrink() const { return ShrinkOpcode != -1; }
68};
69
70class SIFoldOperandsImpl {
71public:
72 MachineRegisterInfo *MRI;
73 const SIInstrInfo *TII;
74 const SIRegisterInfo *TRI;
75 const GCNSubtarget *ST;
76 const SIMachineFunctionInfo *MFI;
77
78 bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
79 const MachineOperand &OpToFold) const;
80
81 // TODO: Just use TII::getVALUOp
82 unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
83 switch (Opc) {
84 case AMDGPU::S_ADD_I32: {
85 if (ST->hasAddNoCarry())
86 return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
87 return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
88 }
89 case AMDGPU::S_OR_B32:
90 return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
91 case AMDGPU::S_AND_B32:
92 return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
93 case AMDGPU::S_MUL_I32:
94 return AMDGPU::V_MUL_LO_U32_e64;
95 default:
96 return AMDGPU::INSTRUCTION_LIST_END;
97 }
98 }
99
100 bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
101 MachineInstr &MI) const;
102
103 bool updateOperand(FoldCandidate &Fold) const;
104
105 bool canUseImmWithOpSel(FoldCandidate &Fold) const;
106
107 bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;
108
109 bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
110 MachineInstr *MI, unsigned OpNo,
111 MachineOperand *OpToFold) const;
112 bool isUseSafeToFold(const MachineInstr &MI,
113 const MachineOperand &UseMO) const;
114 bool
115 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
116 Register UseReg, uint8_t OpTy) const;
117 bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI,
118 unsigned UseOpIdx,
119 SmallVectorImpl<FoldCandidate> &FoldList) const;
120 void foldOperand(MachineOperand &OpToFold,
121 MachineInstr *UseMI,
122 int UseOpIdx,
123 SmallVectorImpl<FoldCandidate> &FoldList,
124 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
125
126 MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const;
127 bool tryConstantFoldOp(MachineInstr *MI) const;
128 bool tryFoldCndMask(MachineInstr &MI) const;
129 bool tryFoldZeroHighBits(MachineInstr &MI) const;
130 bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
131 bool tryFoldFoldableCopy(MachineInstr &MI,
132 MachineOperand *&CurrentKnownM0Val) const;
133
134 const MachineOperand *isClamp(const MachineInstr &MI) const;
135 bool tryFoldClamp(MachineInstr &MI);
136
137 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
138 bool tryFoldOMod(MachineInstr &MI);
139 bool tryFoldRegSequence(MachineInstr &MI);
140 bool tryFoldPhiAGPR(MachineInstr &MI);
141 bool tryFoldLoad(MachineInstr &MI);
142
143 bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
144
145public:
146 SIFoldOperandsImpl() = default;
147
148 bool run(MachineFunction &MF);
149};
150
151class SIFoldOperandsLegacy : public MachineFunctionPass {
152public:
153 static char ID;
154
155 SIFoldOperandsLegacy() : MachineFunctionPass(ID) {}
156
157 bool runOnMachineFunction(MachineFunction &MF) override {
158 if (skipFunction(MF.getFunction()))
159 return false;
160 return SIFoldOperandsImpl().run(MF);
161 }
162
163 StringRef getPassName() const override { return "SI Fold Operands"; }
164
165 void getAnalysisUsage(AnalysisUsage &AU) const override {
166 AU.setPreservesCFG();
167 MachineFunctionPass::getAnalysisUsage(AU);
168 }
169};
170
171} // End anonymous namespace.
172
173INITIALIZE_PASS(SIFoldOperandsLegacy, DEBUG_TYPE, "SI Fold Operands", false,
174 false)
175
176char SIFoldOperandsLegacy::ID = 0;
177
178char &llvm::SIFoldOperandsLegacyID = SIFoldOperandsLegacy::ID;
179
180static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
181 const TargetRegisterInfo &TRI,
182 const MachineOperand &MO) {
183 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
184 if (const TargetRegisterClass *SubRC =
185 TRI.getSubRegisterClass(RC, MO.getSubReg()))
186 RC = SubRC;
187 return RC;
188}
189
190// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
191static unsigned macToMad(unsigned Opc) {
192 switch (Opc) {
193 case AMDGPU::V_MAC_F32_e64:
194 return AMDGPU::V_MAD_F32_e64;
195 case AMDGPU::V_MAC_F16_e64:
196 return AMDGPU::V_MAD_F16_e64;
197 case AMDGPU::V_FMAC_F32_e64:
198 return AMDGPU::V_FMA_F32_e64;
199 case AMDGPU::V_FMAC_F16_e64:
200 return AMDGPU::V_FMA_F16_gfx9_e64;
201 case AMDGPU::V_FMAC_F16_fake16_e64:
202 return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
203 case AMDGPU::V_FMAC_LEGACY_F32_e64:
204 return AMDGPU::V_FMA_LEGACY_F32_e64;
205 case AMDGPU::V_FMAC_F64_e64:
206 return AMDGPU::V_FMA_F64_e64;
207 }
208 return AMDGPU::INSTRUCTION_LIST_END;
209}
210
211// TODO: Add heuristic that the frame index might not fit in the addressing mode
212// immediate offset to avoid materializing in loops.
213bool SIFoldOperandsImpl::frameIndexMayFold(
214 const MachineInstr &UseMI, int OpNo, const MachineOperand &OpToFold) const {
215 if (!OpToFold.isFI())
216 return false;
217
218 const unsigned Opc = UseMI.getOpcode();
219 switch (Opc) {
220 case AMDGPU::S_ADD_I32:
221 case AMDGPU::S_OR_B32:
222 case AMDGPU::S_AND_B32:
223 case AMDGPU::V_ADD_U32_e32:
224 case AMDGPU::V_ADD_CO_U32_e32:
225 // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
226 // to insert the wave size shift at every point we use the index.
227 // TODO: Fix depending on visit order to fold immediates into the operand
228 return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
229 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
230 case AMDGPU::V_ADD_U32_e64:
231 case AMDGPU::V_ADD_CO_U32_e64:
232 return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
233 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
234 default:
235 break;
236 }
237
238 if (TII->isMUBUF(UseMI))
239 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
240 if (!TII->isFLATScratch(UseMI))
241 return false;
242
243 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
244 if (OpNo == SIdx)
245 return true;
246
247 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
248 return OpNo == VIdx && SIdx == -1;
249}
250
251/// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
252///
253/// => %vgpr = V_ADD_U32 x, frameindex
254bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
255 Register DstReg, Register SrcReg, MachineInstr &MI) const {
256 if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
257 MRI->hasOneNonDBGUse(SrcReg)) {
258 MachineInstr *Def = MRI->getVRegDef(SrcReg);
259 if (!Def || Def->getNumOperands() != 4)
260 return false;
261
262 MachineOperand *Src0 = &Def->getOperand(1);
263 MachineOperand *Src1 = &Def->getOperand(2);
264
265 // TODO: This is profitable with more operand types, and for more
266 // opcodes. But ultimately this is working around poor / nonexistent
267 // regbankselect.
268 if (!Src0->isFI() && !Src1->isFI())
269 return false;
270
271 if (Src0->isFI())
272 std::swap(Src0, Src1);
273
274 const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
275 unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
276 if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
277 !Def->getOperand(3).isDead()) // Check if scc is dead
278 return false;
279
280 MachineBasicBlock *MBB = Def->getParent();
281 const DebugLoc &DL = Def->getDebugLoc();
282 if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
283 MachineInstrBuilder Add =
284 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg);
285
286 if (Add->getDesc().getNumDefs() == 2) {
287 Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
288 Add.addDef(CarryOutReg, RegState::Dead);
289 MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
290 }
291
292 Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
293 if (AMDGPU::hasNamedOperand(NewOp, AMDGPU::OpName::clamp))
294 Add.addImm(0);
295
296 Def->eraseFromParent();
297 MI.eraseFromParent();
298 return true;
299 }
300
301 assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);
302
303 MachineBasicBlock::LivenessQueryResult Liveness =
304 MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
305 if (Liveness == MachineBasicBlock::LQR_Dead) {
306 // TODO: If src1 satisfies operand constraints, use vop3 version.
307 BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg)
308 .add(*Src0)
309 .add(*Src1)
310 .setOperandDead(3) // implicit-def $vcc
311 .setMIFlags(Def->getFlags());
312 Def->eraseFromParent();
313 MI.eraseFromParent();
314 return true;
315 }
316 }
317
318 return false;
319}
320
321FunctionPass *llvm::createSIFoldOperandsLegacyPass() {
322 return new SIFoldOperandsLegacy();
323}
324
325bool SIFoldOperandsImpl::canUseImmWithOpSel(FoldCandidate &Fold) const {
326 MachineInstr *MI = Fold.UseMI;
327 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
328 const uint64_t TSFlags = MI->getDesc().TSFlags;
329
330 assert(Old.isReg() && Fold.isImm());
331
332 if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
333 (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
334 (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
335 return false;
336
337 unsigned Opcode = MI->getOpcode();
338 int OpNo = MI->getOperandNo(&Old);
339 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
340 switch (OpType) {
341 default:
342 return false;
343 case AMDGPU::OPERAND_REG_IMM_V2FP16:
344 case AMDGPU::OPERAND_REG_IMM_V2BF16:
345 case AMDGPU::OPERAND_REG_IMM_V2INT16:
346 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
347 case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
348 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
349 break;
350 }
351
352 return true;
353}
354
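// Try to fold an immediate into a packed 16-bit (V2*16) operand. If the value
// is not inlinable as-is, rearrange the two halves and the op_sel/op_sel_hi
// bits, or flip V_PK_ADD_U16/V_PK_SUB_U16 with a negated immediate, so that an
// inline constant can be used instead of a literal.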
355bool SIFoldOperandsImpl::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
356 MachineInstr *MI = Fold.UseMI;
357 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
358 unsigned Opcode = MI->getOpcode();
359 int OpNo = MI->getOperandNo(&Old);
360 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
361
362 // If the literal can be inlined as-is, apply it and short-circuit the
363 // tests below. The main motivation for this is to avoid unintuitive
364 // uses of opsel.
365 if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) {
366 Old.ChangeToImmediate(Fold.ImmToFold);
367 return true;
368 }
369
370 // Refer to op_sel/op_sel_hi and check if we can change the immediate and
371 // op_sel in a way that allows an inline constant.
372 int ModIdx = -1;
373 unsigned SrcIdx = ~0;
374 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
375 ModIdx = AMDGPU::OpName::src0_modifiers;
376 SrcIdx = 0;
377 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
378 ModIdx = AMDGPU::OpName::src1_modifiers;
379 SrcIdx = 1;
380 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
381 ModIdx = AMDGPU::OpName::src2_modifiers;
382 SrcIdx = 2;
383 }
384 assert(ModIdx != -1);
385 ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
386 MachineOperand &Mod = MI->getOperand(ModIdx);
387 unsigned ModVal = Mod.getImm();
388
389 uint16_t ImmLo = static_cast<uint16_t>(
390 Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
391 uint16_t ImmHi = static_cast<uint16_t>(
392 Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
393 uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
394 unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
395
396 // Helper function that attempts to inline the given value with a newly
397 // chosen opsel pattern.
398 auto tryFoldToInline = [&](uint32_t Imm) -> bool {
399 if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
400 Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
401 Old.ChangeToImmediate(Imm);
402 return true;
403 }
404
405 // Try to shuffle the halves around and leverage opsel to get an inline
406 // constant.
407 uint16_t Lo = static_cast<uint16_t>(Imm);
408 uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
409 if (Lo == Hi) {
410 if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
411 Mod.setImm(NewModVal);
412 Old.ChangeToImmediate(Lo);
413 return true;
414 }
415
416 if (static_cast<int16_t>(Lo) < 0) {
417 int32_t SExt = static_cast<int16_t>(Lo);
418 if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
419 Mod.setImm(NewModVal);
420 Old.ChangeToImmediate(SExt);
421 return true;
422 }
423 }
424
425 // This check is only useful for integer instructions
426 if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
427 OpType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16) {
428 if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
429 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
430 Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
431 return true;
432 }
433 }
434 } else {
435 uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
436 if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
437 Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
438 Old.ChangeToImmediate(Swapped);
439 return true;
440 }
441 }
442
443 return false;
444 };
445
446 if (tryFoldToInline(Imm))
447 return true;
448
449 // Replace integer addition by subtraction and vice versa if it allows
450 // folding the immediate to an inline constant.
451 //
452 // We should only ever get here for SrcIdx == 1 due to canonicalization
453 // earlier in the pipeline, but we double-check here to be safe / fully
454 // general.
455 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
456 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
457 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
458 unsigned ClampIdx =
459 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
460 bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
461
462 if (!Clamp) {
463 uint16_t NegLo = -static_cast<uint16_t>(Imm);
464 uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
465 uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
466
467 if (tryFoldToInline(NegImm)) {
468 unsigned NegOpcode =
469 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
470 MI->setDesc(TII->get(NegOpcode));
471 return true;
472 }
473 }
474 }
475
476 return false;
477}
478
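// Apply a collected FoldCandidate to its use instruction, rewriting the
// operand to an immediate, frame index, global address, or register, and
// shrinking the user to its 32-bit encoding when the fold requires it.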
479bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
480 MachineInstr *MI = Fold.UseMI;
481 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
482 assert(Old.isReg());
483
484 if (Fold.isImm() && canUseImmWithOpSel(Fold)) {
485 if (tryFoldImmWithOpSel(Fold))
486 return true;
487
488 // We can't represent the candidate as an inline constant. Try as a literal
489 // with the original opsel, checking constant bus limitations.
490 MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold);
491 int OpNo = MI->getOperandNo(&Old);
492 if (!TII->isOperandLegal(*MI, OpNo, &New))
493 return false;
494 Old.ChangeToImmediate(Fold.ImmToFold);
495 return true;
496 }
497
498 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
499 MachineBasicBlock *MBB = MI->getParent();
500 auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
501 if (Liveness != MachineBasicBlock::LQR_Dead) {
502 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
503 return false;
504 }
505
506 int Op32 = Fold.ShrinkOpcode;
507 MachineOperand &Dst0 = MI->getOperand(0);
508 MachineOperand &Dst1 = MI->getOperand(1);
509 assert(Dst0.isDef() && Dst1.isDef());
510
511 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
512
513 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
514 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
515
516 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
517
518 if (HaveNonDbgCarryUse) {
519 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
520 Dst1.getReg())
521 .addReg(AMDGPU::VCC, RegState::Kill);
522 }
523
524 // Keep the old instruction around to avoid breaking iterators, but
525 // replace it with a dummy instruction to remove uses.
526 //
527 // FIXME: We should not invert how this pass looks at operands to avoid
528 // this. Should track set of foldable movs instead of looking for uses
529 // when looking at a use.
530 Dst0.setReg(NewReg0);
531 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
532 MI->removeOperand(I);
533 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
534
535 if (Fold.Commuted)
536 TII->commuteInstruction(*Inst32, false);
537 return true;
538 }
539
540 assert(!Fold.needsShrink() && "not handled");
541
542 if (Fold.isImm()) {
543 if (Old.isTied()) {
544 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
545 if (NewMFMAOpc == -1)
546 return false;
547 MI->setDesc(TII->get(NewMFMAOpc));
548 MI->untieRegOperand(0);
549 }
550 Old.ChangeToImmediate(Fold.ImmToFold);
551 return true;
552 }
553
554 if (Fold.isGlobal()) {
555 Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
556 Fold.OpToFold->getTargetFlags());
557 return true;
558 }
559
560 if (Fold.isFI()) {
561 Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
562 return true;
563 }
564
565 MachineOperand *New = Fold.OpToFold;
566 Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
567 Old.setIsUndef(New->isUndef());
568 return true;
569}
570
571static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
572 const MachineInstr *MI) {
573 return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });
574}
575
576static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
577 MachineInstr *MI, unsigned OpNo,
578 MachineOperand *FoldOp, bool Commuted = false,
579 int ShrinkOp = -1) {
580 // Skip additional folding on the same operand.
581 for (FoldCandidate &Fold : FoldList)
582 if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
583 return;
584 LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
585 << " operand " << OpNo << "\n " << *MI);
586 FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
587}
588
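// Try to record OpToFold as a fold into operand OpNo of MI. If the operand is
// not legal as-is, attempt opcode changes (mac->mad, s_fmac->s_fmaak/s_fmamk,
// s_setreg->s_setreg_imm32) or commuting the instruction before giving up.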
589bool SIFoldOperandsImpl::tryAddToFoldList(
590 SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
591 MachineOperand *OpToFold) const {
592 const unsigned Opc = MI->getOpcode();
593
594 auto tryToFoldAsFMAAKorMK = [&]() {
595 if (!OpToFold->isImm())
596 return false;
597
598 const bool TryAK = OpNo == 3;
599 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
600 MI->setDesc(TII->get(NewOpc));
601
602 // We have to fold into the operand that will hold the immediate, not into OpNo.
603 bool FoldAsFMAAKorMK =
604 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
605 if (FoldAsFMAAKorMK) {
606 // Untie Src2 of fmac.
607 MI->untieRegOperand(3);
608 // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
609 if (OpNo == 1) {
610 MachineOperand &Op1 = MI->getOperand(1);
611 MachineOperand &Op2 = MI->getOperand(2);
612 Register OldReg = Op1.getReg();
613 // Operand 2 might be an inlinable constant
614 if (Op2.isImm()) {
615 Op1.ChangeToImmediate(Op2.getImm());
616 Op2.ChangeToRegister(OldReg, false);
617 } else {
618 Op1.setReg(Op2.getReg());
619 Op2.setReg(OldReg);
620 }
621 }
622 return true;
623 }
624 MI->setDesc(TII->get(Opc));
625 return false;
626 };
627
628 bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
629 if (!IsLegal && OpToFold->isImm()) {
630 FoldCandidate Fold(MI, OpNo, OpToFold);
631 IsLegal = canUseImmWithOpSel(Fold);
632 }
633
634 if (!IsLegal) {
635 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
636 unsigned NewOpc = macToMad(Opc);
637 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
638 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
639 // to fold the operand.
640 MI->setDesc(TII->get(NewOpc));
641 bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
642 AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
643 if (AddOpSel)
644 MI->addOperand(MachineOperand::CreateImm(0));
645 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
646 if (FoldAsMAD) {
647 MI->untieRegOperand(OpNo);
648 return true;
649 }
650 if (AddOpSel)
651 MI->removeOperand(MI->getNumExplicitOperands() - 1);
652 MI->setDesc(TII->get(Opc));
653 }
654
655 // Special case for s_fmac_f32 if we are trying to fold into Src2.
656 // By transforming into fmaak we can untie Src2 and make folding legal.
657 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
658 if (tryToFoldAsFMAAKorMK())
659 return true;
660 }
661
662 // Special case for s_setreg_b32
663 if (OpToFold->isImm()) {
664 unsigned ImmOpc = 0;
665 if (Opc == AMDGPU::S_SETREG_B32)
666 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
667 else if (Opc == AMDGPU::S_SETREG_B32_mode)
668 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
669 if (ImmOpc) {
670 MI->setDesc(TII->get(ImmOpc));
671 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
672 return true;
673 }
674 }
675
676 // If we are already folding into another operand of MI, then
677 // we can't commute the instruction, otherwise we risk making the
678 // other fold illegal.
679 if (isUseMIInFoldList(FoldList, MI))
680 return false;
681
682 // Operand is not legal, so try to commute the instruction to
683 // see if this makes it possible to fold.
684 unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
685 bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
686 if (!CanCommute)
687 return false;
688
689 // One of operands might be an Imm operand, and OpNo may refer to it after
690 // the call of commuteInstruction() below. Such situations are avoided
691 // here explicitly as OpNo must be a register operand to be a candidate
692 // for memory folding.
693 if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg())
694 return false;
695
696 if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
697 return false;
698
699 int Op32 = -1;
700 if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
701 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
702 Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
703 (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) {
704 TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
705 return false;
706 }
707
708 // Verify the other operand is a VGPR, otherwise we would violate the
709 // constant bus restriction.
710 MachineOperand &OtherOp = MI->getOperand(OpNo);
711 if (!OtherOp.isReg() ||
712 !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
713 return false;
714
715 assert(MI->getOperand(1).isDef());
716
717 // Make sure to get the 32-bit version of the commuted opcode.
718 unsigned MaybeCommutedOpc = MI->getOpcode();
719 Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
720 }
721
722 appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
723 return true;
724 }
725
726 // Inlineable constant might have been folded into Imm operand of fmaak or
727 // fmamk and we are trying to fold a non-inlinable constant.
728 if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
729 !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
730 unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
731 MachineOperand &OpImm = MI->getOperand(ImmIdx);
732 if (!OpImm.isReg() &&
733 TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
734 return tryToFoldAsFMAAKorMK();
735 }
736
737 // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
738 // By changing into fmamk we can untie Src2.
740 // If folding for Src0 happens first and it is an identical operand to Src1 we
740 // should avoid transforming into fmamk which requires commuting as it would
741 // cause folding into Src1 to fail later on due to wrong OpNo used.
742 if (Opc == AMDGPU::S_FMAC_F32 &&
743 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
744 if (tryToFoldAsFMAAKorMK())
745 return true;
746 }
747
748 // Check the case where we might introduce a second constant operand to a
749 // scalar instruction
750 if (TII->isSALU(MI->getOpcode())) {
751 const MCInstrDesc &InstDesc = MI->getDesc();
752 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
753
754 // Fine if the operand can be encoded as an inline constant
755 if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) {
756 // Otherwise check for another constant
757 for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
758 auto &Op = MI->getOperand(i);
759 if (OpNo != i && !Op.isReg() &&
760 !TII->isInlineConstant(Op, InstDesc.operands()[i]))
761 return false;
762 }
763 }
764 }
765
766 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
767 return true;
768}
769
770bool SIFoldOperandsImpl::isUseSafeToFold(const MachineInstr &MI,
771 const MachineOperand &UseMO) const {
772 // Operands of SDWA instructions must be registers.
773 return !TII->isSDWA(MI);
774}
775
776// Find a def of the UseReg, check if it is a reg_sequence and find initializers
777// for each subreg, tracking it to foldable inline immediate if possible.
778// Returns true on success.
779bool SIFoldOperandsImpl::getRegSeqInit(
780 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
781 Register UseReg, uint8_t OpTy) const {
782 MachineInstr *Def = MRI->getVRegDef(UseReg);
783 if (!Def || !Def->isRegSequence())
784 return false;
785
786 for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
787 MachineOperand *Sub = &Def->getOperand(I);
788 assert(Sub->isReg());
789
790 for (MachineInstr *SubDef = MRI->getVRegDef(Sub->getReg());
791 SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
792 !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
793 SubDef = MRI->getVRegDef(Sub->getReg())) {
794 MachineOperand *Op = &SubDef->getOperand(1);
795 if (Op->isImm()) {
796 if (TII->isInlineConstant(*Op, OpTy))
797 Sub = Op;
798 break;
799 }
800 if (!Op->isReg() || Op->getReg().isPhysical())
801 break;
802 Sub = Op;
803 }
804
805 Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
806 }
807
808 return true;
809}
810
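// Try to fold an inline-constant value into operand UseOpIdx of UseMI: either
// a direct immediate, an immediate reached through a foldable copy, or a
// REG_SEQUENCE whose initializers are all the same inline constant.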
811bool SIFoldOperandsImpl::tryToFoldACImm(
812 const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
813 SmallVectorImpl<FoldCandidate> &FoldList) const {
814 const MCInstrDesc &Desc = UseMI->getDesc();
815 if (UseOpIdx >= Desc.getNumOperands())
816 return false;
817
818 if (!AMDGPU::isSISrcInlinableOperand(Desc, UseOpIdx))
819 return false;
820
821 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
822 if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
823 TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
824 UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
825 return true;
826 }
827
828 if (!OpToFold.isReg())
829 return false;
830
831 Register UseReg = OpToFold.getReg();
832 if (!UseReg.isVirtual())
833 return false;
834
835 if (isUseMIInFoldList(FoldList, UseMI))
836 return false;
837
838 // Maybe it is just a COPY of an immediate itself.
839 MachineInstr *Def = MRI->getVRegDef(UseReg);
840 MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
841 if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
842 MachineOperand &DefOp = Def->getOperand(1);
843 if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
844 TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
845 UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
846 return true;
847 }
848 }
849
850 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
851 if (!getRegSeqInit(Defs, UseReg, OpTy))
852 return false;
853
854 int32_t Imm;
855 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
856 const MachineOperand *Op = Defs[I].first;
857 if (!Op->isImm())
858 return false;
859
860 auto SubImm = Op->getImm();
861 if (!I) {
862 Imm = SubImm;
863 if (!TII->isInlineConstant(*Op, OpTy) ||
864 !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
865 return false;
866
867 continue;
868 }
869 if (Imm != SubImm)
870 return false; // Can only fold splat constants
871 }
872
873 appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
874 return true;
875}
876
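// Fold OpToFold into the use at UseOpIdx of UseMI, or queue the fold in
// FoldList. Handles REG_SEQUENCE users, frame indexes, copies that are turned
// into moves, AGPR reg_sequence rematerialization, and readfirstlane/readlane
// of immediates or SGPRs.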
877void SIFoldOperandsImpl::foldOperand(
878 MachineOperand &OpToFold, MachineInstr *UseMI, int UseOpIdx,
879 SmallVectorImpl<FoldCandidate> &FoldList,
880 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
881 const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
882
883 if (!isUseSafeToFold(*UseMI, *UseOp))
884 return;
885
886 // FIXME: Fold operands with subregs.
887 if (UseOp->isReg() && OpToFold.isReg() &&
888 (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
889 return;
890
891 // Special case for REG_SEQUENCE: We can't fold literals into
892 // REG_SEQUENCE instructions, so we have to fold them into the
893 // uses of REG_SEQUENCE.
894 if (UseMI->isRegSequence()) {
895 Register RegSeqDstReg = UseMI->getOperand(0).getReg();
896 unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
897
898 // Grab the use operands first
899 SmallVector<MachineOperand *, 4> UsesToProcess;
900 for (auto &Use : MRI->use_nodbg_operands(RegSeqDstReg))
901 UsesToProcess.push_back(&Use);
902 for (auto *RSUse : UsesToProcess) {
903 MachineInstr *RSUseMI = RSUse->getParent();
904
905 if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
906 RSUseMI->getOperandNo(RSUse), FoldList))
907 continue;
908
909 if (RSUse->getSubReg() != RegSeqDstSubReg)
910 continue;
911
912 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
913 CopiesToReplace);
914 }
915 return;
916 }
917
918 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
919 return;
920
921 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
922 // Verify that this is a stack access.
923 // FIXME: Should probably use stack pseudos before frame lowering.
924
925 if (TII->isMUBUF(*UseMI)) {
926 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
927 MFI->getScratchRSrcReg())
928 return;
929
930 // Ensure this is either relative to the current frame or the current
931 // wave.
932 MachineOperand &SOff =
933 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
934 if (!SOff.isImm() || SOff.getImm() != 0)
935 return;
936 }
937
938 // A frame index will resolve to a positive constant, so it should always be
939 // safe to fold the addressing mode, even pre-GFX9.
940 UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
941
942 const unsigned Opc = UseMI->getOpcode();
943 if (TII->isFLATScratch(*UseMI) &&
944 AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
945 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
946 unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
947 UseMI->setDesc(TII->get(NewOpc));
948 }
949
950 return;
951 }
952
953 bool FoldingImmLike =
954 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
955
956 if (FoldingImmLike && UseMI->isCopy()) {
957 Register DestReg = UseMI->getOperand(0).getReg();
958 Register SrcReg = UseMI->getOperand(1).getReg();
959 assert(SrcReg.isVirtual());
960
961 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
962
963 // Don't fold into a copy to a physical register with the same class. Doing
964 // so would interfere with the register coalescer's logic which would avoid
965 // redundant initializations.
966 if (DestReg.isPhysical() && SrcRC->contains(DestReg))
967 return;
968
969 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
970 if (!DestReg.isPhysical()) {
971 if (DestRC == &AMDGPU::AGPR_32RegClass &&
972 TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
973 UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
974 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
975 CopiesToReplace.push_back(UseMI);
976 return;
977 }
978 }
979
980 // In order to fold immediates into copies, we need to change the
981 // copy to a MOV.
982
983 unsigned MovOp = TII->getMovOpcode(DestRC);
984 if (MovOp == AMDGPU::COPY)
985 return;
986
987 MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
988 MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
989 while (ImpOpI != ImpOpE) {
990 MachineInstr::mop_iterator Tmp = ImpOpI;
991 ImpOpI++;
992 UseMI->removeOperand(UseMI->getOperandNo(Tmp));
993 }
994 UseMI->setDesc(TII->get(MovOp));
995
996 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
997 const auto &SrcOp = UseMI->getOperand(UseOpIdx);
998 MachineOperand NewSrcOp(SrcOp);
999 MachineFunction *MF = UseMI->getParent()->getParent();
1000 UseMI->removeOperand(1);
1001 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
1002 UseMI->addOperand(NewSrcOp); // src0
1003 UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
1004 UseOpIdx = 2;
1005 UseOp = &UseMI->getOperand(UseOpIdx);
1006 }
1007 CopiesToReplace.push_back(UseMI);
1008 } else {
1009 if (UseMI->isCopy() && OpToFold.isReg() &&
1010 UseMI->getOperand(0).getReg().isVirtual() &&
1011 !UseMI->getOperand(1).getSubReg()) {
1012 LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
1013 unsigned Size = TII->getOpSize(*UseMI, 1);
1014 Register UseReg = OpToFold.getReg();
1015 UseMI->getOperand(1).setReg(UseReg);
1016 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1017 UseMI->getOperand(1).setIsKill(false);
1018 CopiesToReplace.push_back(UseMI);
1019 OpToFold.setIsKill(false);
1020
1021 // Remove kill flags as kills may now be out of order with uses.
1022 MRI->clearKillFlags(OpToFold.getReg());
1023
1024 // It is very tricky to store a value into an AGPR: v_accvgpr_write_b32
1025 // can only accept VGPR or inline immediate. Recreate a reg_sequence with
1026 // its initializers right here, so we will rematerialize immediates and
1027 // avoid copies via different reg classes.
1028 SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
1029 if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
1030 getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1031 const DebugLoc &DL = UseMI->getDebugLoc();
1032 MachineBasicBlock &MBB = *UseMI->getParent();
1033
1034 UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
1035 for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
1036 UseMI->removeOperand(I);
1037
1038 MachineInstrBuilder B(*MBB.getParent(), UseMI);
1039 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
1040 SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
1041 for (unsigned I = 0; I < Size / 4; ++I) {
1042 MachineOperand *Def = Defs[I].first;
1043 TargetInstrInfo::RegSubRegPair CopyToVGPR;
1044 if (Def->isImm() &&
1045 TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
1046 int64_t Imm = Def->getImm();
1047
1048 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1049 BuildMI(MBB, UseMI, DL,
1050 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
1051 B.addReg(Tmp);
1052 } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
1053 auto Src = getRegSubRegPair(*Def);
1054 Def->setIsKill(false);
1055 if (!SeenAGPRs.insert(Src)) {
1056 // We cannot build a reg_sequence out of the same registers, they
1057 // must be copied. Better do it here before copyPhysReg() created
1058 // several reads to do the AGPR->VGPR->AGPR copy.
1059 CopyToVGPR = Src;
1060 } else {
1061 B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
1062 Src.SubReg);
1063 }
1064 } else {
1065 assert(Def->isReg());
1066 Def->setIsKill(false);
1067 auto Src = getRegSubRegPair(*Def);
1068
1069 // Direct copy from SGPR to AGPR is not possible. To avoid creation
1070 // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
1071 // create a copy here and track if we already have such a copy.
1072 if (TRI->isSGPRReg(*MRI, Src.Reg)) {
1073 CopyToVGPR = Src;
1074 } else {
1075 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1076 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
1077 B.addReg(Tmp);
1078 }
1079 }
1080
1081 if (CopyToVGPR.Reg) {
1082 Register Vgpr;
1083 if (VGPRCopies.count(CopyToVGPR)) {
1084 Vgpr = VGPRCopies[CopyToVGPR];
1085 } else {
1086 Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1087 BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
1088 VGPRCopies[CopyToVGPR] = Vgpr;
1089 }
1090 auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1091 BuildMI(MBB, UseMI, DL,
1092 TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
1093 B.addReg(Tmp);
1094 }
1095
1096 B.addImm(Defs[I].second);
1097 }
1098 LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
1099 }
1100
1101 return;
1102 }
1103
1104 unsigned UseOpc = UseMI->getOpcode();
1105 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
1106 (UseOpc == AMDGPU::V_READLANE_B32 &&
1107 (int)UseOpIdx ==
1108 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
1109 // %vgpr = V_MOV_B32 imm
1110 // %sgpr = V_READFIRSTLANE_B32 %vgpr
1111 // =>
1112 // %sgpr = S_MOV_B32 imm
1113 if (FoldingImmLike) {
1114 if (execMayBeModifiedBeforeUse(*MRI,
1115 UseMI->getOperand(UseOpIdx).getReg(),
1116 *OpToFold.getParent(),
1117 *UseMI))
1118 return;
1119
1120 UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
1121
1122 if (OpToFold.isImm())
1123 UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
1124 else
1125 UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
1126 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1127 return;
1128 }
1129
1130 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
1131 if (execMayBeModifiedBeforeUse(*MRI,
1132 UseMI->getOperand(UseOpIdx).getReg(),
1133 *OpToFold.getParent(),
1134 *UseMI))
1135 return;
1136
1137 // %vgpr = COPY %sgpr0
1138 // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
1139 // =>
1140 // %sgpr1 = COPY %sgpr0
1141 UseMI->setDesc(TII->get(AMDGPU::COPY));
1142 UseMI->getOperand(1).setReg(OpToFold.getReg());
1143 UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
1144 UseMI->getOperand(1).setIsKill(false);
1145 UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
1146 return;
1147 }
1148 }
1149
1150 const MCInstrDesc &UseDesc = UseMI->getDesc();
1151
1152 // Don't fold into target independent nodes. Target independent opcodes
1153 // don't have defined register classes.
1154 if (UseDesc.isVariadic() || UseOp->isImplicit() ||
1155 UseDesc.operands()[UseOpIdx].RegClass == -1)
1156 return;
1157 }
1158
1159 if (!FoldingImmLike) {
1160 if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
1161 // Don't fold if OpToFold doesn't hold an aligned register.
1162 const TargetRegisterClass *RC =
1163 TRI->getRegClassForReg(*MRI, OpToFold.getReg());
1164 assert(RC);
1165 if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
1166 unsigned SubReg = OpToFold.getSubReg();
1167 if (const TargetRegisterClass *SubRC =
1168 TRI->getSubRegisterClass(RC, SubReg))
1169 RC = SubRC;
1170 }
1171
1172 if (!RC || !TRI->isProperlyAlignedRC(*RC))
1173 return;
1174 }
1175
1176 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
1177
1178 // FIXME: We could try to change the instruction from 64-bit to 32-bit
1179 // to enable more folding opportunities. The shrink operands pass
1180 // already does this.
1181 return;
1182 }
1183
1184
1185 const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
1186 const TargetRegisterClass *FoldRC =
1187 TRI->getRegClass(FoldDesc.operands()[0].RegClass);
1188
1189 // Split 64-bit constants into 32-bits for folding.
1190 if (UseOp->getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
1191 Register UseReg = UseOp->getReg();
1192 const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
1193 if (AMDGPU::getRegBitWidth(*UseRC) != 64)
1194 return;
1195
1196 APInt Imm(64, OpToFold.getImm());
1197 if (UseOp->getSubReg() == AMDGPU::sub0) {
1198 Imm = Imm.getLoBits(32);
1199 } else {
1200 assert(UseOp->getSubReg() == AMDGPU::sub1);
1201 Imm = Imm.getHiBits(32);
1202 }
1203
1204 MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
1205 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);
1206 return;
1207 }
1208
1209 tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
1210}
1211
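// Constant-fold a 32-bit binary ALU opcode applied to two known immediates.
// Shift amounts are masked to 5 bits, matching the hardware behavior.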
1212static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
1213 uint32_t LHS, uint32_t RHS) {
1214 switch (Opcode) {
1215 case AMDGPU::V_AND_B32_e64:
1216 case AMDGPU::V_AND_B32_e32:
1217 case AMDGPU::S_AND_B32:
1218 Result = LHS & RHS;
1219 return true;
1220 case AMDGPU::V_OR_B32_e64:
1221 case AMDGPU::V_OR_B32_e32:
1222 case AMDGPU::S_OR_B32:
1223 Result = LHS | RHS;
1224 return true;
1225 case AMDGPU::V_XOR_B32_e64:
1226 case AMDGPU::V_XOR_B32_e32:
1227 case AMDGPU::S_XOR_B32:
1228 Result = LHS ^ RHS;
1229 return true;
1230 case AMDGPU::S_XNOR_B32:
1231 Result = ~(LHS ^ RHS);
1232 return true;
1233 case AMDGPU::S_NAND_B32:
1234 Result = ~(LHS & RHS);
1235 return true;
1236 case AMDGPU::S_NOR_B32:
1237 Result = ~(LHS | RHS);
1238 return true;
1239 case AMDGPU::S_ANDN2_B32:
1240 Result = LHS & ~RHS;
1241 return true;
1242 case AMDGPU::S_ORN2_B32:
1243 Result = LHS | ~RHS;
1244 return true;
1245 case AMDGPU::V_LSHL_B32_e64:
1246 case AMDGPU::V_LSHL_B32_e32:
1247 case AMDGPU::S_LSHL_B32:
1248 // The instruction ignores the high bits for out of bounds shifts.
1249 Result = LHS << (RHS & 31);
1250 return true;
1251 case AMDGPU::V_LSHLREV_B32_e64:
1252 case AMDGPU::V_LSHLREV_B32_e32:
1253 Result = RHS << (LHS & 31);
1254 return true;
1255 case AMDGPU::V_LSHR_B32_e64:
1256 case AMDGPU::V_LSHR_B32_e32:
1257 case AMDGPU::S_LSHR_B32:
1258 Result = LHS >> (RHS & 31);
1259 return true;
1260 case AMDGPU::V_LSHRREV_B32_e64:
1261 case AMDGPU::V_LSHRREV_B32_e32:
1262 Result = RHS >> (LHS & 31);
1263 return true;
1264 case AMDGPU::V_ASHR_I32_e64:
1265 case AMDGPU::V_ASHR_I32_e32:
1266 case AMDGPU::S_ASHR_I32:
1267 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1268 return true;
1269 case AMDGPU::V_ASHRREV_I32_e64:
1270 case AMDGPU::V_ASHRREV_I32_e32:
1271 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1272 return true;
1273 default:
1274 return false;
1275 }
1276}
1277
1278static unsigned getMovOpc(bool IsScalar) {
1279 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1280}
1281
1282static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
1283 MI.setDesc(NewDesc);
1284
1285 // Remove any leftover implicit operands from mutating the instruction. e.g.
1286 // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
1287 // anymore.
1288 const MCInstrDesc &Desc = MI.getDesc();
1289 unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
1290 Desc.implicit_defs().size();
1291
1292 for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
1293 MI.removeOperand(I);
1294}
1295
1296MachineOperand *
1297SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const {
1298 // If this has a subregister, it obviously is a register source.
1299 if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister ||
1300 !Op.getReg().isVirtual())
1301 return &Op;
1302
1303 MachineInstr *Def = MRI->getVRegDef(Op.getReg());
1304 if (Def && Def->isMoveImmediate()) {
1305 MachineOperand &ImmSrc = Def->getOperand(1);
1306 if (ImmSrc.isImm())
1307 return &ImmSrc;
1308 }
1309
1310 return &Op;
1311}
1312
1313// Try to simplify operations with a constant that may appear after instruction
1314// selection.
1315// TODO: See if a frame index with a fixed offset can fold.
1316bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
1317 if (!MI->allImplicitDefsAreDead())
1318 return false;
1319
1320 unsigned Opc = MI->getOpcode();
1321
1322 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1323 if (Src0Idx == -1)
1324 return false;
1325 MachineOperand *Src0 = getImmOrMaterializedImm(MI->getOperand(Src0Idx));
1326
1327 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1328 Opc == AMDGPU::S_NOT_B32) &&
1329 Src0->isImm()) {
1330 MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
1331 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
1332 return true;
1333 }
1334
1335 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1336 if (Src1Idx == -1)
1337 return false;
1338 MachineOperand *Src1 = getImmOrMaterializedImm(MI->getOperand(Src1Idx));
1339
1340 if (!Src0->isImm() && !Src1->isImm())
1341 return false;
1342
1343 // and k0, k1 -> v_mov_b32 (k0 & k1)
1344 // or k0, k1 -> v_mov_b32 (k0 | k1)
1345 // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
1346 if (Src0->isImm() && Src1->isImm()) {
1347 int32_t NewImm;
1348 if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
1349 return false;
1350
1351 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1352
1353 // Be careful to change the right operand, src0 may belong to a different
1354 // instruction.
1355 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1356 MI->removeOperand(Src1Idx);
1357 mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
1358 return true;
1359 }
1360
1361 if (!MI->isCommutable())
1362 return false;
1363
1364 if (Src0->isImm() && !Src1->isImm()) {
1365 std::swap(Src0, Src1);
1366 std::swap(Src0Idx, Src1Idx);
1367 }
1368
1369 int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
1370 if (Opc == AMDGPU::V_OR_B32_e64 ||
1371 Opc == AMDGPU::V_OR_B32_e32 ||
1372 Opc == AMDGPU::S_OR_B32) {
1373 if (Src1Val == 0) {
1374 // y = or x, 0 => y = copy x
1375 MI->removeOperand(Src1Idx);
1376 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1377 } else if (Src1Val == -1) {
1378 // y = or x, -1 => y = v_mov_b32 -1
1379 MI->removeOperand(Src1Idx);
1380 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
1381 } else
1382 return false;
1383
1384 return true;
1385 }
1386
1387 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1388 Opc == AMDGPU::S_AND_B32) {
1389 if (Src1Val == 0) {
1390 // y = and x, 0 => y = v_mov_b32 0
1391 MI->removeOperand(Src0Idx);
1392 mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
1393 } else if (Src1Val == -1) {
1394 // y = and x, -1 => y = copy x
1395 MI->removeOperand(Src1Idx);
1396 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1397 } else
1398 return false;
1399
1400 return true;
1401 }
1402
1403 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1404 Opc == AMDGPU::S_XOR_B32) {
1405 if (Src1Val == 0) {
1406 // y = xor x, 0 => y = copy x
1407 MI->removeOperand(Src1Idx);
1408 mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
1409 return true;
1410 }
1411 }
1412
1413 return false;
1414}
1415
1416// Try to fold an instruction into a simpler one
1417bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
1418 unsigned Opc = MI.getOpcode();
1419 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1420 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1421 return false;
1422
1423 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1424 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1425 if (!Src1->isIdenticalTo(*Src0)) {
1426 auto *Src0Imm = getImmOrMaterializedImm(*Src0);
1427 auto *Src1Imm = getImmOrMaterializedImm(*Src1);
1428 if (!Src1Imm->isIdenticalTo(*Src0Imm))
1429 return false;
1430 }
1431
1432 int Src1ModIdx =
1433 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1434 int Src0ModIdx =
1435 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1436 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1437 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1438 return false;
1439
1440 LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
1441 auto &NewDesc =
1442 TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
1443 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1444 if (Src2Idx != -1)
1445 MI.removeOperand(Src2Idx);
1446 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1447 if (Src1ModIdx != -1)
1448 MI.removeOperand(Src1ModIdx);
1449 if (Src0ModIdx != -1)
1450 MI.removeOperand(Src0ModIdx);
1451 mutateCopyOp(MI, NewDesc);
1452 LLVM_DEBUG(dbgs() << MI);
1453 return true;
1454}
1455
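// Remove a V_AND with 0xffff when the other source is produced by an
// instruction that already zeroes the high 16 bits of its result.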
1456bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
1457 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1458 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1459 return false;
1460
1461 MachineOperand *Src0 = getImmOrMaterializedImm(MI.getOperand(1));
1462 if (!Src0->isImm() || Src0->getImm() != 0xffff)
1463 return false;
1464
1465 Register Src1 = MI.getOperand(2).getReg();
1466 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1467 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1468 return false;
1469
1470 Register Dst = MI.getOperand(0).getReg();
1471 MRI->replaceRegWith(Dst, Src1);
1472 if (!MI.getOperand(2).isKill())
1473 MRI->clearKillFlags(Src1);
1474 MI.eraseFromParent();
1475 return true;
1476}
1477
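// Fold OpToFold into every use of MI's destination register: constant-fold
// users where possible, queue the remaining folds, then apply them while
// respecting EXEC dependencies of the defining instruction.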
1478bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
1479 MachineOperand &OpToFold) const {
1480 // We need to mutate the operands of new mov instructions to add implicit
1481 // uses of EXEC, but adding them invalidates the use_iterator, so defer
1482 // this.
1483 SmallVector<MachineInstr *, 4> CopiesToReplace;
1484 SmallVector<FoldCandidate, 4> FoldList;
1485 MachineOperand &Dst = MI.getOperand(0);
1486 bool Changed = false;
1487
1488 if (OpToFold.isImm()) {
1489 for (auto &UseMI :
1490 make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1491 // Folding the immediate may reveal operations that can be constant
1492 // folded or replaced with a copy. This can happen for example after
1493 // frame indices are lowered to constants or from splitting 64-bit
1494 // constants.
1495 //
1496 // We may also encounter cases where one or both operands are
1497 // immediates materialized into a register, which would ordinarily not
1498 // be folded due to multiple uses or operand constraints.
1499 if (tryConstantFoldOp(&UseMI)) {
1500 LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1501 Changed = true;
1502 }
1503 }
1504 }
1505
1506 SmallVector<MachineOperand *, 4> UsesToProcess;
1507 for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
1508 UsesToProcess.push_back(&Use);
1509 for (auto *U : UsesToProcess) {
1510 MachineInstr *UseMI = U->getParent();
1511 foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1512 CopiesToReplace);
1513 }
1514
1515 if (CopiesToReplace.empty() && FoldList.empty())
1516 return Changed;
1517
1518 MachineFunction *MF = MI.getParent()->getParent();
1519 // Make sure we add EXEC uses to any new v_mov instructions created.
1520 for (MachineInstr *Copy : CopiesToReplace)
1521 Copy->addImplicitDefUseOperands(*MF);
1522
1523 for (FoldCandidate &Fold : FoldList) {
1524 assert(!Fold.isReg() || Fold.OpToFold);
1525 if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
1526 Register Reg = Fold.OpToFold->getReg();
1527 MachineInstr *DefMI = Fold.OpToFold->getParent();
1528 if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1529 execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1530 continue;
1531 }
1532 if (updateOperand(Fold)) {
1533 // Clear kill flags.
1534 if (Fold.isReg()) {
1535 assert(Fold.OpToFold && Fold.OpToFold->isReg());
1536 // FIXME: Probably shouldn't bother trying to fold if not an
1537 // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1538 // copies.
1539 MRI->clearKillFlags(Fold.OpToFold->getReg());
1540 }
1541 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1542 << static_cast<int>(Fold.UseOpNo) << " of "
1543 << *Fold.UseMI);
1544 } else if (Fold.Commuted) {
1545 // Restoring instruction's original operand order if fold has failed.
1546 TII->commuteInstruction(*Fold.UseMI, false);
1547 }
1548 }
1549 return true;
1550}
1551
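// Per-instruction driver for foldable copies and moves: erases redundant
// writes of the same value to m0, folds copies of scalar frame-index adds
// into VALU adds, forwards the copied value into all uses, and deletes the
// copy chain once it becomes dead.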
1552bool SIFoldOperandsImpl::tryFoldFoldableCopy(
1553 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
1554 Register DstReg = MI.getOperand(0).getReg();
1555 // Specially track simple redefs of m0 to the same value in a block, so we
1556 // can erase the later ones.
1557 if (DstReg == AMDGPU::M0) {
1558 MachineOperand &NewM0Val = MI.getOperand(1);
1559 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1560 MI.eraseFromParent();
1561 return true;
1562 }
1563
1564 // We aren't tracking other physical registers
1565 CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
1566 ? nullptr
1567 : &NewM0Val;
1568 return false;
1569 }
1570
1571 MachineOperand *OpToFoldPtr;
1572 if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
1573 // Folding when any src_modifiers are non-zero is unsupported
1574 if (TII->hasAnyModifiersSet(MI))
1575 return false;
1576 OpToFoldPtr = &MI.getOperand(2);
1577 } else
1578 OpToFoldPtr = &MI.getOperand(1);
1579 MachineOperand &OpToFold = *OpToFoldPtr;
1580 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1581
1582 // FIXME: We could also be folding things like TargetIndexes.
1583 if (!FoldingImm && !OpToFold.isReg())
1584 return false;
1585
1586 if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
1587 return false;
1588
1589 // Prevent folding operands backwards in the function. For example,
1590 // the COPY opcode must not be replaced by 1 in this example:
1591 //
1592 // %3 = COPY %vgpr0; VGPR_32:%3
1593 // ...
1594 // %vgpr0 = V_MOV_B32_e32 1, implicit %exec
1595 if (!DstReg.isVirtual())
1596 return false;
1597
1598 if (OpToFold.isReg() &&
1599 foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI))
1600 return true;
1601
1602 bool Changed = foldInstOperand(MI, OpToFold);
1603
1604 // If we managed to fold all uses of this copy then we might as well
1605 // delete it now.
1606 // The only reason we need to follow chains of copies here is that
1607 // tryFoldRegSequence looks forward through copies before folding a
1608 // REG_SEQUENCE into its eventual users.
1609 auto *InstToErase = &MI;
1610 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1611 auto &SrcOp = InstToErase->getOperand(1);
1612 auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
1613 InstToErase->eraseFromParent();
1614 Changed = true;
1615 InstToErase = nullptr;
1616 if (!SrcReg || SrcReg.isPhysical())
1617 break;
1618 InstToErase = MRI->getVRegDef(SrcReg);
1619 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
1620 break;
1621 }
1622
1623 if (InstToErase && InstToErase->isRegSequence() &&
1624 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1625 InstToErase->eraseFromParent();
1626 Changed = true;
1627 }
1628
1629 return Changed;
1630}
1631
1632// Clamp patterns are canonically selected to v_max_* instructions, so only
1633// handle them.
1634const MachineOperand *
1635SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
1636 unsigned Op = MI.getOpcode();
1637 switch (Op) {
1638 case AMDGPU::V_MAX_F32_e64:
1639 case AMDGPU::V_MAX_F16_e64:
1640 case AMDGPU::V_MAX_F16_t16_e64:
1641 case AMDGPU::V_MAX_F16_fake16_e64:
1642 case AMDGPU::V_MAX_F64_e64:
1643 case AMDGPU::V_MAX_NUM_F64_e64:
1644 case AMDGPU::V_PK_MAX_F16: {
1645 if (MI.mayRaiseFPException())
1646 return nullptr;
1647
1648 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
1649 return nullptr;
1650
1651 // Make sure sources are identical.
1652 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1653 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1654 if (!Src0->isReg() || !Src1->isReg() ||
1655 Src0->getReg() != Src1->getReg() ||
1656 Src0->getSubReg() != Src1->getSubReg() ||
1657 Src0->getSubReg() != AMDGPU::NoSubRegister)
1658 return nullptr;
1659
1660 // Can't fold up if we have modifiers.
1661 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1662 return nullptr;
1663
1664 unsigned Src0Mods
1665 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
1666 unsigned Src1Mods
1667 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
1668
1669 // Having a 0 op_sel_hi would require swizzling the output in the source
1670 // instruction, which we can't do.
1671 unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
1672 : 0u;
1673 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
1674 return nullptr;
1675 return Src0;
1676 }
1677 default:
1678 return nullptr;
1679 }
1680}
1681
1682// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
1683bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
1684 const MachineOperand *ClampSrc = isClamp(MI);
1685 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
1686 return false;
1687
1688 MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1689
1690 // The type of clamp must be compatible.
1691 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
1692 return false;
1693
1694 if (Def->mayRaiseFPException())
1695 return false;
1696
1697 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
1698 if (!DefClamp)
1699 return false;
1700
1701 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
1702
1703 // Clamp is applied after omod, so it is OK if omod is set.
1704 DefClamp->setImm(1);
1705
1706 Register DefReg = Def->getOperand(0).getReg();
1707 Register MIDstReg = MI.getOperand(0).getReg();
1708 if (TRI->isSGPRReg(*MRI, DefReg)) {
1709 // Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
1710 // instruction with a VGPR dst.
1711 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
1712 MIDstReg)
1713 .addReg(DefReg);
1714 } else {
1715 MRI->replaceRegWith(MIDstReg, DefReg);
1716 }
1717 MI.eraseFromParent();
1718
1719 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1720 // instruction, so we might as well convert it to the more flexible VOP3-only
1721 // mad/fma form.
1722 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1723 Def->eraseFromParent();
1724
1725 return true;
1726}
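// For illustration, a simplified and hypothetical MIR sketch of the transform
// tryFoldClamp performs (register names invented, VOP3 operand lists abbreviated
// to src0_modifiers, src0, src1_modifiers, src1, clamp, omod):
//
//   %0:vgpr_32 = V_ADD_F32_e64 0, %a, 0, %b, 0, 0
//   %1:vgpr_32 = V_MAX_F32_e64 0, %0, 0, %0, 1, 0   (clamp set, identical sources)
// =>
//   %0:vgpr_32 = V_ADD_F32_e64 0, %a, 0, %b, 1, 0   (clamp folded into the def)
//
// Uses of %1 are then rewritten to use %0 and the V_MAX is erased.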
1727
1728static int getOModValue(unsigned Opc, int64_t Val) {
1729 switch (Opc) {
1730 case AMDGPU::V_MUL_F64_e64:
1731 case AMDGPU::V_MUL_F64_pseudo_e64: {
1732 switch (Val) {
1733 case 0x3fe0000000000000: // 0.5
1734 return SIOutMods::DIV2;
1735 case 0x4000000000000000: // 2.0
1736 return SIOutMods::MUL2;
1737 case 0x4010000000000000: // 4.0
1738 return SIOutMods::MUL4;
1739 default:
1740 return SIOutMods::NONE;
1741 }
1742 }
1743 case AMDGPU::V_MUL_F32_e64: {
1744 switch (static_cast<uint32_t>(Val)) {
1745 case 0x3f000000: // 0.5
1746 return SIOutMods::DIV2;
1747 case 0x40000000: // 2.0
1748 return SIOutMods::MUL2;
1749 case 0x40800000: // 4.0
1750 return SIOutMods::MUL4;
1751 default:
1752 return SIOutMods::NONE;
1753 }
1754 }
1755 case AMDGPU::V_MUL_F16_e64:
1756 case AMDGPU::V_MUL_F16_t16_e64:
1757 case AMDGPU::V_MUL_F16_fake16_e64: {
1758 switch (static_cast<uint16_t>(Val)) {
1759 case 0x3800: // 0.5
1760 return SIOutMods::DIV2;
1761 case 0x4000: // 2.0
1762 return SIOutMods::MUL2;
1763 case 0x4400: // 4.0
1764 return SIOutMods::MUL4;
1765 default:
1766 return SIOutMods::NONE;
1767 }
1768 }
1769 default:
1770 llvm_unreachable("invalid mul opcode");
1771 }
1772}
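// For illustration: with Opc == V_MUL_F32_e64, an immediate of 0x3f000000 (0.5)
// maps to SIOutMods::DIV2, 0x40000000 (2.0) to SIOutMods::MUL2, and 0x40800000
// (4.0) to SIOutMods::MUL4; any other constant yields SIOutMods::NONE and the
// caller gives up on the omod fold.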
1773
1774// FIXME: Does this really not support denormals with f16?
1775// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
1776// handled, so will anything other than that break?
1777std::pair<const MachineOperand *, int>
1778SIFoldOperandsImpl::isOMod(const MachineInstr &MI) const {
1779 unsigned Op = MI.getOpcode();
1780 switch (Op) {
1781 case AMDGPU::V_MUL_F64_e64:
1782 case AMDGPU::V_MUL_F64_pseudo_e64:
1783 case AMDGPU::V_MUL_F32_e64:
1784 case AMDGPU::V_MUL_F16_t16_e64:
1785 case AMDGPU::V_MUL_F16_fake16_e64:
1786 case AMDGPU::V_MUL_F16_e64: {
1787 // If output denormals are enabled, omod is ignored.
1788 if ((Op == AMDGPU::V_MUL_F32_e64 &&
1789 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1790 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
1791 Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
1792 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
1793 MFI->getMode().FP64FP16Denormals.Output !=
1794 DenormalMode::PreserveSign) ||
1795 MI.mayRaiseFPException())
1796 return std::pair(nullptr, SIOutMods::NONE);
1797
1798 const MachineOperand *RegOp = nullptr;
1799 const MachineOperand *ImmOp = nullptr;
1800 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1801 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1802 if (Src0->isImm()) {
1803 ImmOp = Src0;
1804 RegOp = Src1;
1805 } else if (Src1->isImm()) {
1806 ImmOp = Src1;
1807 RegOp = Src0;
1808 } else
1809 return std::pair(nullptr, SIOutMods::NONE);
1810
1811 int OMod = getOModValue(Op, ImmOp->getImm());
1812 if (OMod == SIOutMods::NONE ||
1813 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
1814 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
1815 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
1816 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
1817 return std::pair(nullptr, SIOutMods::NONE);
1818
1819 return std::pair(RegOp, OMod);
1820 }
1821 case AMDGPU::V_ADD_F64_e64:
1822 case AMDGPU::V_ADD_F64_pseudo_e64:
1823 case AMDGPU::V_ADD_F32_e64:
1824 case AMDGPU::V_ADD_F16_e64:
1825 case AMDGPU::V_ADD_F16_t16_e64:
1826 case AMDGPU::V_ADD_F16_fake16_e64: {
1827 // If output denormals are enabled, omod is ignored.
1828 if ((Op == AMDGPU::V_ADD_F32_e64 &&
1829 MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1830 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
1831 Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
1832 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
1833 MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
1834 return std::pair(nullptr, SIOutMods::NONE);
1835
1836 // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
1837 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1838 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1839
1840 if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
1841 Src0->getSubReg() == Src1->getSubReg() &&
1842 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
1843 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
1844 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
1845 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1846 return std::pair(Src0, SIOutMods::MUL2);
1847
1848 return std::pair(nullptr, SIOutMods::NONE);
1849 }
1850 default:
1851 return std::pair(nullptr, SIOutMods::NONE);
1852 }
1853}
1854
1855// FIXME: Does this need to check IEEE bit on function?
1856bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
1857 const MachineOperand *RegOp;
1858 int OMod;
1859 std::tie(RegOp, OMod) = isOMod(MI);
1860 if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
1861 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
1862 !MRI->hasOneNonDBGUser(RegOp->getReg()))
1863 return false;
1864
1865 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
1866 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
1867 if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
1868 return false;
1869
1870 if (Def->mayRaiseFPException())
1871 return false;
1872
1873 // Clamp is applied after omod. If the source already has clamp set, don't
1874 // fold it.
1875 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
1876 return false;
1877
1878 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
1879
1880 DefOMod->setImm(OMod);
1881 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1882 // Kill flags can be wrong if we replaced a def inside a loop with a def
1883 // outside the loop.
1884 MRI->clearKillFlags(Def->getOperand(0).getReg());
1885 MI.eraseFromParent();
1886
1887 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1888 // instruction, so we might as well convert it to the more flexible VOP3-only
1889 // mad/fma form.
1890 if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1891 Def->eraseFromParent();
1892
1893 return true;
1894}
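// For illustration, a simplified and hypothetical MIR sketch of the transform
// tryFoldOMod performs (register names invented, VOP3 operand lists abbreviated;
// 1073741824 is the bit pattern of 2.0f):
//
//   %0:vgpr_32 = V_FMA_F32_e64 0, %a, 0, %b, 0, %c, 0, 0
//   %1:vgpr_32 = V_MUL_F32_e64 0, %0, 0, 1073741824, 0, 0
// =>
//   %0:vgpr_32 = V_FMA_F32_e64 0, %a, 0, %b, 0, %c, 0, 1   (omod = MUL2)
//
// The multiply is erased and uses of %1 are rewritten to use %0.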
1895
1896// Try to fold a reg_sequence with vgpr output and agpr inputs into an
1897// instruction which can take an agpr. So far that means a store.
1898bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
1899 assert(MI.isRegSequence());
1900 auto Reg = MI.getOperand(0).getReg();
1901
1902 if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
1903 !MRI->hasOneNonDBGUse(Reg))
1904 return false;
1905
1907 if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
1908 return false;
1909
1910 for (auto &[Op, SubIdx] : Defs) {
1911 if (!Op->isReg())
1912 return false;
1913 if (TRI->isAGPR(*MRI, Op->getReg()))
1914 continue;
1915 // Maybe this is a COPY from AREG
1916 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
1917 if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
1918 return false;
1919 if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
1920 return false;
1921 }
1922
1923 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
1924 MachineInstr *UseMI = Op->getParent();
1925 while (UseMI->isCopy() && !Op->getSubReg()) {
1926 Reg = UseMI->getOperand(0).getReg();
1927 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
1928 return false;
1929 Op = &*MRI->use_nodbg_begin(Reg);
1930 UseMI = Op->getParent();
1931 }
1932
1933 if (Op->getSubReg())
1934 return false;
1935
1936 unsigned OpIdx = Op - &UseMI->getOperand(0);
1937 const MCInstrDesc &InstDesc = UseMI->getDesc();
1938 const TargetRegisterClass *OpRC =
1939 TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
1940 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
1941 return false;
1942
1943 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
1944 auto Dst = MRI->createVirtualRegister(NewDstRC);
1945 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
1946 TII->get(AMDGPU::REG_SEQUENCE), Dst);
1947
1948 for (auto &[Def, SubIdx] : Defs) {
1949 Def->setIsKill(false);
1950 if (TRI->isAGPR(*MRI, Def->getReg())) {
1951 RS.add(*Def);
1952 } else { // This is a copy
1953 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
1954 SubDef->getOperand(1).setIsKill(false);
1955 RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
1956 }
1957 RS.addImm(SubIdx);
1958 }
1959
1960 Op->setReg(Dst);
1961 if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
1962 Op->setReg(Reg);
1963 RS->eraseFromParent();
1964 return false;
1965 }
1966
1967 LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
1968
1969 // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
1970 // in which case we can erase them all later in runOnMachineFunction.
1971 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
1972 MI.eraseFromParent();
1973 return true;
1974}
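// For illustration, a simplified and hypothetical MIR sketch of the transform
// tryFoldRegSequence performs (register names and store opcode chosen only for
// illustration):
//
//   %4:vreg_128 = REG_SEQUENCE %0:agpr_32, %subreg.sub0, %1:agpr_32, %subreg.sub1, ...
//   GLOBAL_STORE_DWORDX4 %addr, %4, ...
// =>
//   %5:areg_128 = REG_SEQUENCE %0:agpr_32, %subreg.sub0, %1:agpr_32, %subreg.sub1, ...
//   GLOBAL_STORE_DWORDX4 %addr, %5, ...
//
// The rewrite only happens when the user's operand class is a vector superclass
// (the isVectorSuperClass check above), so isOperandLegal accepts the AGPR
// REG_SEQUENCE result.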
1975
1976/// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success and
1977/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg.
1978static bool isAGPRCopy(const SIRegisterInfo &TRI,
1979 const MachineRegisterInfo &MRI, const MachineInstr &Copy,
1980 Register &OutReg, unsigned &OutSubReg) {
1981 assert(Copy.isCopy());
1982
1983 const MachineOperand &CopySrc = Copy.getOperand(1);
1984 Register CopySrcReg = CopySrc.getReg();
1985 if (!CopySrcReg.isVirtual())
1986 return false;
1987
1988 // Common case: copy from AGPR directly, e.g.
1989 // %1:vgpr_32 = COPY %0:agpr_32
1990 if (TRI.isAGPR(MRI, CopySrcReg)) {
1991 OutReg = CopySrcReg;
1992 OutSubReg = CopySrc.getSubReg();
1993 return true;
1994 }
1995
1996 // Sometimes it can also involve two copies, e.g.
1997 // %1:vgpr_256 = COPY %0:agpr_256
1998 // %2:vgpr_32 = COPY %1:vgpr_256.sub0
1999 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
2000 if (!CopySrcDef || !CopySrcDef->isCopy())
2001 return false;
2002
2003 const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
2004 Register OtherCopySrcReg = OtherCopySrc.getReg();
2005 if (!OtherCopySrcReg.isVirtual() ||
2006 CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
2007 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
2008 !TRI.isAGPR(MRI, OtherCopySrcReg))
2009 return false;
2010
2011 OutReg = OtherCopySrcReg;
2012 OutSubReg = CopySrc.getSubReg();
2013 return true;
2014}
2015
2016// Try to hoist an AGPR to VGPR copy across a PHI.
2017// This should allow folding of an AGPR into a consumer which may support it.
2018//
2019// Example 1: LCSSA PHI
2020// loop:
2021// %1:vreg = COPY %0:areg
2022// exit:
2023// %2:vreg = PHI %1:vreg, %loop
2024// =>
2025// loop:
2026// exit:
2027// %1:areg = PHI %0:areg, %loop
2028// %2:vreg = COPY %1:areg
2029//
2030// Example 2: PHI with multiple incoming values:
2031// entry:
2032// %1:vreg = GLOBAL_LOAD(..)
2033// loop:
2034// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
2035// %3:areg = COPY %2:vreg
2036// %4:areg = (instr using %3:areg)
2037// %5:vreg = COPY %4:areg
2038// =>
2039// entry:
2040// %1:vreg = GLOBAL_LOAD(..)
2041// %2:areg = COPY %1:vreg
2042// loop:
2043// %3:areg = PHI %2:areg, %entry, %X:areg,
2044// %4:areg = (instr using %3:areg)
2045bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &PHI) {
2046 assert(PHI.isPHI());
2047
2048 Register PhiOut = PHI.getOperand(0).getReg();
2049 if (!TRI->isVGPR(*MRI, PhiOut))
2050 return false;
2051
2052 // Iterate once over all incoming values of the PHI to check if this PHI is
2053 // eligible, and determine the exact AGPR RC we'll target.
2054 const TargetRegisterClass *ARC = nullptr;
2055 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2056 MachineOperand &MO = PHI.getOperand(K);
2057 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
2058 if (!Copy || !Copy->isCopy())
2059 continue;
2060
2061 Register AGPRSrc;
2062 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
2063 if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
2064 continue;
2065
2066 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
2067 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
2068 CopyInRC = SubRC;
2069
2070 if (ARC && !ARC->hasSubClassEq(CopyInRC))
2071 return false;
2072 ARC = CopyInRC;
2073 }
2074
2075 if (!ARC)
2076 return false;
2077
2078 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
2079
2080 // Rewrite the PHI's incoming values to ARC.
2081 LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
2082 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2083 MachineOperand &MO = PHI.getOperand(K);
2084 Register Reg = MO.getReg();
2085
2086 MachineBasicBlock::iterator InsertPt;
2087 MachineBasicBlock *InsertMBB = nullptr;
2088
2089 // Look at the def of Reg, ignoring all copies.
2090 unsigned CopyOpc = AMDGPU::COPY;
2091 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
2092
2093 // Look at pre-existing COPY instructions from ARC: Steal the operand. If
2094 // the copy was single-use, it will be removed by DCE later.
2095 if (Def->isCopy()) {
2096 Register AGPRSrc;
2097 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
2098 if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
2099 MO.setReg(AGPRSrc);
2100 MO.setSubReg(AGPRSubReg);
2101 continue;
2102 }
2103
2104 // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
2105 // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try
2106 // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
2107 // is unlikely to be profitable.
2108 //
2109 // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
2110 MachineOperand &CopyIn = Def->getOperand(1);
2111 if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
2112 TRI->isSGPRReg(*MRI, CopyIn.getReg()))
2113 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
2114 }
2115
2116 InsertMBB = Def->getParent();
2117 InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
2118 } else {
2119 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
2120 InsertPt = InsertMBB->getFirstTerminator();
2121 }
2122
2123 Register NewReg = MRI->createVirtualRegister(ARC);
2124 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
2125 TII->get(CopyOpc), NewReg)
2126 .addReg(Reg);
2127 MO.setReg(NewReg);
2128
2129 (void)MI;
2130 LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
2131 }
2132
2133 // Replace the PHI's result with a new register.
2134 Register NewReg = MRI->createVirtualRegister(ARC);
2135 PHI.getOperand(0).setReg(NewReg);
2136
2137 // COPY that new register back to the original PhiOut register. This COPY will
2138 // usually be folded out later.
2139 MachineBasicBlock *MBB = PHI.getParent();
2140 BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
2141 TII->get(AMDGPU::COPY), PhiOut)
2142 .addReg(NewReg);
2143
2144 LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
2145 return true;
2146}
2147
2148// Attempt to convert VGPR load to an AGPR load.
2149bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &MI) {
2150 assert(MI.mayLoad());
2151 if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
2152 return false;
2153
2154 MachineOperand &Def = MI.getOperand(0);
2155 if (!Def.isDef())
2156 return false;
2157
2158 Register DefReg = Def.getReg();
2159
2160 if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
2161 return false;
2162
2163 SmallVector<const MachineInstr *, 8> Users;
2164 SmallVector<Register, 8> MoveRegs;
2165 for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg))
2166 Users.push_back(&I);
2167
2168 if (Users.empty())
2169 return false;
2170
2171 // Check that every use is a copy to an agpr or a reg_sequence producing an agpr.
2172 while (!Users.empty()) {
2173 const MachineInstr *I = Users.pop_back_val();
2174 if (!I->isCopy() && !I->isRegSequence())
2175 return false;
2176 Register DstReg = I->getOperand(0).getReg();
2177 // Physical registers may have more than one defining instruction.
2178 if (DstReg.isPhysical())
2179 return false;
2180 if (TRI->isAGPR(*MRI, DstReg))
2181 continue;
2182 MoveRegs.push_back(DstReg);
2183 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2184 Users.push_back(&U);
2185 }
2186
2187 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2188 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2189 if (!TII->isOperandLegal(MI, 0, &Def)) {
2190 MRI->setRegClass(DefReg, RC);
2191 return false;
2192 }
2193
2194 while (!MoveRegs.empty()) {
2195 Register Reg = MoveRegs.pop_back_val();
2196 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2197 }
2198
2199 LLVM_DEBUG(dbgs() << "Folded " << MI);
2200
2201 return true;
2202}
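// For illustration, a simplified and hypothetical MIR sketch of what tryFoldLoad
// changes (register names and load opcode chosen only for illustration):
//
//   %0:vgpr_32 = GLOBAL_LOAD_DWORD %addr, ...
//   %1:agpr_32 = COPY %0
// =>
//   %0:agpr_32 = GLOBAL_LOAD_DWORD %addr, ...
//   %1:agpr_32 = COPY %0
//
// Only register classes are rewritten (the load's result and any intermediate
// copy/reg_sequence results in MoveRegs); the copies themselves are left for
// later folding or coalescing.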
2203
2204// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
2205// For GFX90A and later, this is pretty much always a good thing, but for GFX908
2206 // there are cases where it can create a lot more AGPR-AGPR copies, which are
2207// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
2208//
2209// This function looks at all AGPR PHIs in a basic block and collects their
2210 // operands. Then, it checks for registers that are used more than once across
2211// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
2212// having to create one VGPR temporary per use, which can get very messy if
2213// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
2214// element).
2215//
2216// Example
2217// a:
2218// %in:agpr_256 = COPY %foo:vgpr_256
2219// c:
2220// %x:agpr_32 = ..
2221// b:
2222// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
2223// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
2224// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
2225// =>
2226// a:
2227// %in:agpr_256 = COPY %foo:vgpr_256
2228// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
2229// %tmp_agpr:agpr_32 = COPY %tmp
2230// c:
2231// %x:agpr_32 = ..
2232// b:
2233// %0:areg = PHI %tmp_agpr, %a, %x, %c
2234// %1:areg = PHI %tmp_agpr, %a, %y, %c
2235// %2:areg = PHI %tmp_agpr, %a, %z, %c
2236bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2237 // This is only really needed on GFX908 where AGPR-AGPR copies are
2238 // unreasonably difficult.
2239 if (ST->hasGFX90AInsts())
2240 return false;
2241
2242 // Look at all AGPR Phis and collect the register + subregister used.
2243 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2244 RegToMO;
2245
2246 for (auto &MI : MBB) {
2247 if (!MI.isPHI())
2248 break;
2249
2250 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2251 continue;
2252
2253 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2254 MachineOperand &PhiMO = MI.getOperand(K);
2255 if (!PhiMO.getSubReg())
2256 continue;
2257 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2258 }
2259 }
2260
2261 // For all (Reg, SubReg) pairs that are used more than once, cache the value in
2262 // a VGPR.
2263 bool Changed = false;
2264 for (const auto &[Entry, MOs] : RegToMO) {
2265 if (MOs.size() == 1)
2266 continue;
2267
2268 const auto [Reg, SubReg] = Entry;
2269 MachineInstr *Def = MRI->getVRegDef(Reg);
2270 MachineBasicBlock *DefMBB = Def->getParent();
2271
2272 // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2273 // out.
2274 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2275 Register TempVGPR =
2276 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2277 MachineInstr *VGPRCopy =
2278 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2279 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2280 .addReg(Reg, /* flags */ 0, SubReg);
2281
2282 // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2283 Register TempAGPR = MRI->createVirtualRegister(ARC);
2284 BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2285 TII->get(AMDGPU::COPY), TempAGPR)
2286 .addReg(TempVGPR);
2287
2288 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2289 for (MachineOperand *MO : MOs) {
2290 MO->setReg(TempAGPR);
2291 MO->setSubReg(AMDGPU::NoSubRegister);
2292 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2293 }
2294
2295 Changed = true;
2296 }
2297
2298 return Changed;
2299}
2300
2301bool SIFoldOperandsImpl::run(MachineFunction &MF) {
2302 MRI = &MF.getRegInfo();
2303 ST = &MF.getSubtarget<GCNSubtarget>();
2304 TII = ST->getInstrInfo();
2305 TRI = &TII->getRegisterInfo();
2306 MFI = MF.getInfo<SIMachineFunctionInfo>();
2307
2308 // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2309 // correctly handle signed zeros.
2310 //
2311 // FIXME: Also need to check strictfp
2312 bool IsIEEEMode = MFI->getMode().IEEE;
2313 bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2314
2315 bool Changed = false;
2316 for (MachineBasicBlock *MBB : depth_first(&MF)) {
2317 MachineOperand *CurrentKnownM0Val = nullptr;
2318 for (auto &MI : make_early_inc_range(*MBB)) {
2319 Changed |= tryFoldCndMask(MI);
2320
2321 if (tryFoldZeroHighBits(MI)) {
2322 Changed = true;
2323 continue;
2324 }
2325
2326 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2327 Changed = true;
2328 continue;
2329 }
2330
2331 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2332 Changed = true;
2333 continue;
2334 }
2335
2336 if (MI.mayLoad() && tryFoldLoad(MI)) {
2337 Changed = true;
2338 continue;
2339 }
2340
2341 if (TII->isFoldableCopy(MI)) {
2342 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2343 continue;
2344 }
2345
2346 // Saw an unknown clobber of m0, so we no longer know what it is.
2347 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2348 CurrentKnownM0Val = nullptr;
2349
2350 // TODO: Omod might be OK if there is NSZ only on the source
2351 // instruction, and not the omod multiply.
2352 if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
2353 !tryFoldOMod(MI))
2354 Changed |= tryFoldClamp(MI);
2355 }
2356
2357 Changed |= tryOptimizeAGPRPhis(*MBB);
2358 }
2359
2360 return Changed;
2361}
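// For reference, the fold can be exercised in isolation on MIR input, assuming
// the pass is registered under its DEBUG_TYPE name (as is usual for LLVM
// machine passes), e.g.:
//
//   llc -mtriple=amdgcn -run-pass=si-fold-operands -verify-machineinstrs in.mir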
2362
2363PreservedAnalyses SIFoldOperandsPass::run(MachineFunction &MF,
2364 MachineFunctionAnalysisManager &MFAM) {
2365 bool Changed = SIFoldOperandsImpl().run(MF);
2366 if (!Changed) {
2367 return PreservedAnalyses::all();
2368 }
2369 auto PA = getMachineFunctionPassPreservedAnalyses();
2370 PA.preserveSet<CFGAnalyses>();
2371 return PA;
2372}